From 5d92baa7d85c62e792beb7bb3b1fc2d0aa3cd6ae Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 8 Apr 2021 13:01:46 +0200 Subject: [PATCH 001/188] Bump decoders versions --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 51548a7d98..7313ad61c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,8 +69,8 @@ num-derive = "0.3" paste = "1.0" noop_proc_macro = "0.3.0" serde = { version = "1.0", features = ["derive"], optional = true } -dav1d-sys = { version = "0.3.2", optional = true } -aom-sys = { version = "0.2.1", optional = true } +dav1d-sys = { version = "0.3.4", optional = true } +aom-sys = { version = "0.3.0", optional = true } scan_fmt = { version = "0.2.3", optional = true, default-features = false } ivf = { version = "0.1", path = "ivf/", optional = true } v_frame = { version = "0.2.1", path = "v_frame/" } From 02106e0bc2019723545cd65f08eaaef25a9e434d Mon Sep 17 00:00:00 2001 From: Vibhoothi Date: Thu, 8 Apr 2021 13:13:15 +0100 Subject: [PATCH 002/188] CI: Update to libaom3 for both travis and GitHub Actions --- .github/workflows/rav1e.yml | 10 +++++----- .travis/install-aom.sh | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index 9f08407c3c..b07b05f605 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -126,16 +126,16 @@ jobs: matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/a/aom-dmo - AOM_VERSION: 2.0.2-dmo0~bpo10+1 + AOM_VERSION: 3.0.0-dmo0~bpo10+1 AOM_DEV_SHA256: >- - d31eee6524ea64c080312eeafc65355e378e043b1d738ff9b9bde3734a85779c + 40b273179f1d9d75202b18295b8fe3e406076e061831d66c2b469bc1a10e5bce AOM_LIB_SHA256: >- - db8a04ca0984604f410c6bd8810ee31666a3bfd3964f3109cdb8f1ae33fec664 + 1bb977ff6b7c42e4e0d5f743670008fc71e79eb5a9fdf83154fe7890379e9a17 run: | echo "$LINK/libaom-dev_${AOM_VERSION}_amd64.deb" >> DEBS - echo "$LINK/libaom2_${AOM_VERSION}_amd64.deb" >> DEBS + echo "$LINK/libaom3_${AOM_VERSION}_amd64.deb" >> DEBS echo "$AOM_DEV_SHA256 libaom-dev_${AOM_VERSION}_amd64.deb" >> CHECKSUMS - echo "$AOM_LIB_SHA256 libaom2_${AOM_VERSION}_amd64.deb" >> CHECKSUMS + echo "$AOM_LIB_SHA256 libaom3_${AOM_VERSION}_amd64.deb" >> CHECKSUMS - name: Add dav1d if: > matrix.conf == '1.51.0-tests' || matrix.conf == 'dav1d-tests' || diff --git a/.travis/install-aom.sh b/.travis/install-aom.sh index 1316d56f94..53bef70953 100755 --- a/.travis/install-aom.sh +++ b/.travis/install-aom.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -AOM_VERSION="2.0.2-dmo0~bpo10+1" +AOM_VERSION="3.0.0-dmo0~bpo10+1" PKG_URL="https://www.deb-multimedia.org/pool/main/a/aom-dmo" ARCH="arm64" @@ -10,12 +10,12 @@ cd "$DEPS_DIR" [ -f "libaom-dev_${AOM_VERSION}_${ARCH}.deb" ] && [ -f "libaom2_${AOM_VERSION}_${ARCH}.deb" ] || curl -O "${PKG_URL}/libaom-dev_${AOM_VERSION}_${ARCH}.deb" \ - -O "${PKG_URL}/libaom2_${AOM_VERSION}_${ARCH}.deb" + -O "${PKG_URL}/libaom3_${AOM_VERSION}_${ARCH}.deb" sha256sum --check --ignore-missing < Date: Wed, 14 Apr 2021 15:01:57 +0200 Subject: [PATCH 003/188] Move the channel api in a directory --- src/api/{channel.rs => channel/mod.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/api/{channel.rs => channel/mod.rs} (100%) diff --git a/src/api/channel.rs b/src/api/channel/mod.rs similarity index 100% rename from src/api/channel.rs rename to src/api/channel/mod.rs From 0f057ed00f7923e637c83eeb76c71627623afa23 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 14 Apr 2021 
15:18:13 +0200 Subject: [PATCH 004/188] Split the data structures from the channel impl --- src/api/channel/data.rs | 232 ++++++++++++++++++++++++++++++++++++++++ src/api/channel/mod.rs | 215 +------------------------------------ 2 files changed, 234 insertions(+), 213 deletions(-) create mode 100644 src/api/channel/data.rs diff --git a/src/api/channel/data.rs b/src/api/channel/data.rs new file mode 100644 index 0000000000..999b8e72b7 --- /dev/null +++ b/src/api/channel/data.rs @@ -0,0 +1,232 @@ +// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// +// This source code is subject to the terms of the BSD 2 Clause License and +// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +// was not distributed with this source code in the LICENSE file, you can +// obtain it at www.aomedia.org/license/software. If the Alliance for Open +// Media Patent License 1.0 was not distributed with this source code in the +// PATENTS file, you can obtain it at www.aomedia.org/license/patent. +#![allow(missing_docs)] + +use crate::api::color::*; +use crate::api::config::EncoderConfig; +use crate::api::context::RcData; +use crate::api::util::*; +use crate::encoder::*; +use crate::frame::*; +use crate::util::Pixel; + +use bitstream_io::*; +use crossbeam::channel::*; + +use std::io; +use std::sync::Arc; + +/// Endpoint to send previous-pass statistics data +pub struct RcDataSender { + pub(crate) sender: Sender, + pub(crate) limit: u64, + pub(crate) count: u64, +} + +impl RcDataSender { + pub(crate) fn new(limit: u64, sender: Sender) -> RcDataSender { + Self { sender, limit, count: 0 } + } + pub fn try_send( + &mut self, data: RcData, + ) -> Result<(), TrySendError> { + if self.limit <= self.count { + Err(TrySendError::Full(data)) + } else { + let r = self.sender.try_send(data); + if r.is_ok() { + self.count += 1; + } + r + } + } + pub fn send(&mut self, data: RcData) -> Result<(), SendError> { + if self.limit <= self.count { + Err(SendError(data)) + } else { + let r = self.sender.send(data); + if r.is_ok() { + self.count += 1; + } + r + } + } + pub fn len(&self) -> usize { + self.sender.len() + } + pub fn is_empty(&self) -> bool { + self.sender.is_empty() + } + + // TODO: proxy more methods +} + +/// Endpoint to receive current-pass statistics data +pub struct RcDataReceiver(pub(crate) Receiver); + +impl RcDataReceiver { + pub fn try_recv(&self) -> Result { + self.0.try_recv() + } + pub fn recv(&self) -> Result { + self.0.recv() + } + pub fn len(&self) -> usize { + self.0.len() + } + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + pub fn iter(&self) -> Iter { + self.0.iter() + } + + pub fn summary_size(&self) -> usize { + crate::rate::TWOPASS_HEADER_SZ + } +} + +pub type PassDataChannel = (RcDataSender, RcDataReceiver); + +pub type FrameInput = (Option>>, Option); + +/// Endpoint to send frames +pub struct FrameSender { + sender: Sender>, + config: Arc, + limit: u64, + count: u64, +} + +// Proxy the crossbeam Sender +// +// TODO: enforce the limit +impl FrameSender { + pub(crate) fn new( + limit: u64, sender: Sender>, config: Arc, + ) -> FrameSender { + Self { sender, config, limit, count: 0 } + } + pub fn try_send>( + &mut self, frame: F, + ) -> Result<(), TrySendError>> { + if self.limit <= self.count { + Err(TrySendError::Full(frame.into())) + } else { + let r = self.sender.try_send(frame.into()); + if r.is_ok() { + self.count += 1; + } + r + } + } + + pub fn send>( + &mut self, frame: F, + ) -> Result<(), SendError>> { + if self.limit <= self.count { 
+ Err(SendError(frame.into())) + } else { + let r = self.sender.send(frame.into()); + if r.is_ok() { + self.count += 1; + } + r + } + } + pub fn len(&self) -> usize { + self.sender.len() + } + pub fn is_empty(&self) -> bool { + self.sender.is_empty() + } + // TODO: proxy more methods +} + +// Frame factory +impl FrameSender { + /// Helper to create a new frame with the current encoder configuration + #[inline] + pub fn new_frame(&self) -> Frame { + Frame::new( + self.config.width, + self.config.height, + self.config.chroma_sampling, + ) + } +} + +/// Endpoint to receive packets +pub struct PacketReceiver { + pub(crate) receiver: Receiver>, + pub(crate) config: Arc, +} + +impl PacketReceiver { + pub fn try_recv(&self) -> Result, TryRecvError> { + self.receiver.try_recv() + } + pub fn recv(&self) -> Result, RecvError> { + self.receiver.recv() + } + pub fn len(&self) -> usize { + self.receiver.len() + } + pub fn is_empty(&self) -> bool { + self.receiver.is_empty() + } + pub fn iter(&self) -> Iter> { + self.receiver.iter() + } +} + +impl PacketReceiver { + /// Produces a sequence header matching the current encoding context. + /// + /// Its format is compatible with the AV1 Matroska and ISOBMFF specification. + /// Note that the returned header does not include any config OBUs which are + /// required for some uses. See [the specification]. + /// + /// [the specification]: + /// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-section + #[inline] + pub fn container_sequence_header(&self) -> Vec { + fn sequence_header_inner(seq: &Sequence) -> io::Result> { + let mut buf = Vec::new(); + + { + let mut bw = BitWriter::endian(&mut buf, BigEndian); + bw.write_bit(true)?; // marker + bw.write(7, 1)?; // version + bw.write(3, seq.profile)?; + bw.write(5, 31)?; // level + bw.write_bit(false)?; // tier + bw.write_bit(seq.bit_depth > 8)?; // high_bitdepth + bw.write_bit(seq.bit_depth == 12)?; // twelve_bit + bw.write_bit(seq.bit_depth == 1)?; // monochrome + bw.write_bit(seq.chroma_sampling != ChromaSampling::Cs444)?; // chroma_subsampling_x + bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs420)?; // chroma_subsampling_y + bw.write(2, 0)?; // sample_position + bw.write(3, 0)?; // reserved + bw.write_bit(false)?; // initial_presentation_delay_present + + bw.write(4, 0)?; // reserved + } + + Ok(buf) + } + + let seq = Sequence::new(&self.config); + + sequence_header_inner(&seq).unwrap() + } +} + +/// A channel modeling an encoding process +pub type VideoDataChannel = (FrameSender, PacketReceiver); diff --git a/src/api/channel/mod.rs b/src/api/channel/mod.rs index 217a128c64..3225a68078 100644 --- a/src/api/channel/mod.rs +++ b/src/api/channel/mod.rs @@ -8,232 +8,21 @@ // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
#![allow(missing_docs)] -use crate::api::color::*; use crate::api::config::*; use crate::api::context::RcData; use crate::api::internal::ContextInner; use crate::api::util::*; -use bitstream_io::*; use crossbeam::channel::*; -use crate::encoder::*; -use crate::frame::*; use crate::rate::RCState; use crate::rayon::ThreadPool; use crate::util::Pixel; -use std::io; use std::sync::Arc; -/// Endpoint to send previous-pass statistics data -pub struct RcDataSender { - sender: Sender, - limit: u64, - count: u64, -} - -impl RcDataSender { - fn new(limit: u64, sender: Sender) -> RcDataSender { - Self { sender, limit, count: 0 } - } - pub fn try_send( - &mut self, data: RcData, - ) -> Result<(), TrySendError> { - if self.limit <= self.count { - Err(TrySendError::Full(data)) - } else { - let r = self.sender.try_send(data); - if r.is_ok() { - self.count += 1; - } - r - } - } - pub fn send(&mut self, data: RcData) -> Result<(), SendError> { - if self.limit <= self.count { - Err(SendError(data)) - } else { - let r = self.sender.send(data); - if r.is_ok() { - self.count += 1; - } - r - } - } - pub fn len(&self) -> usize { - self.sender.len() - } - pub fn is_empty(&self) -> bool { - self.sender.is_empty() - } - - // TODO: proxy more methods -} - -/// Endpoint to receive current-pass statistics data -pub struct RcDataReceiver(Receiver); - -impl RcDataReceiver { - pub fn try_recv(&self) -> Result { - self.0.try_recv() - } - pub fn recv(&self) -> Result { - self.0.recv() - } - pub fn len(&self) -> usize { - self.0.len() - } - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - pub fn iter(&self) -> Iter { - self.0.iter() - } - - pub fn summary_size(&self) -> usize { - crate::rate::TWOPASS_HEADER_SZ - } -} - -pub type PassDataChannel = (RcDataSender, RcDataReceiver); - -pub type FrameInput = (Option>>, Option); - -/// Endpoint to send frames -pub struct FrameSender { - sender: Sender>, - config: Arc, - limit: u64, - count: u64, -} - -// Proxy the crossbeam Sender -// -// TODO: enforce the limit -impl FrameSender { - fn new( - limit: u64, sender: Sender>, config: Arc, - ) -> FrameSender { - Self { sender, config, limit, count: 0 } - } - pub fn try_send>( - &mut self, frame: F, - ) -> Result<(), TrySendError>> { - if self.limit <= self.count { - Err(TrySendError::Full(frame.into())) - } else { - let r = self.sender.try_send(frame.into()); - if r.is_ok() { - self.count += 1; - } - r - } - } - - pub fn send>( - &mut self, frame: F, - ) -> Result<(), SendError>> { - if self.limit <= self.count { - Err(SendError(frame.into())) - } else { - let r = self.sender.send(frame.into()); - if r.is_ok() { - self.count += 1; - } - r - } - } - pub fn len(&self) -> usize { - self.sender.len() - } - pub fn is_empty(&self) -> bool { - self.sender.is_empty() - } - // TODO: proxy more methods -} - -// Frame factory -impl FrameSender { - /// Helper to create a new frame with the current encoder configuration - #[inline] - pub fn new_frame(&self) -> Frame { - Frame::new( - self.config.width, - self.config.height, - self.config.chroma_sampling, - ) - } -} - -/// Endpoint to receive packets -pub struct PacketReceiver { - receiver: Receiver>, - config: Arc, -} - -impl PacketReceiver { - pub fn try_recv(&self) -> Result, TryRecvError> { - self.receiver.try_recv() - } - pub fn recv(&self) -> Result, RecvError> { - self.receiver.recv() - } - pub fn len(&self) -> usize { - self.receiver.len() - } - pub fn is_empty(&self) -> bool { - self.receiver.is_empty() - } - pub fn iter(&self) -> Iter> { - self.receiver.iter() - } -} - -impl 
PacketReceiver { - /// Produces a sequence header matching the current encoding context. - /// - /// Its format is compatible with the AV1 Matroska and ISOBMFF specification. - /// Note that the returned header does not include any config OBUs which are - /// required for some uses. See [the specification]. - /// - /// [the specification]: - /// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-section - #[inline] - pub fn container_sequence_header(&self) -> Vec { - fn sequence_header_inner(seq: &Sequence) -> io::Result> { - let mut buf = Vec::new(); - - { - let mut bw = BitWriter::endian(&mut buf, BigEndian); - bw.write_bit(true)?; // marker - bw.write(7, 1)?; // version - bw.write(3, seq.profile)?; - bw.write(5, 31)?; // level - bw.write_bit(false)?; // tier - bw.write_bit(seq.bit_depth > 8)?; // high_bitdepth - bw.write_bit(seq.bit_depth == 12)?; // twelve_bit - bw.write_bit(seq.bit_depth == 1)?; // monochrome - bw.write_bit(seq.chroma_sampling != ChromaSampling::Cs444)?; // chroma_subsampling_x - bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs420)?; // chroma_subsampling_y - bw.write(2, 0)?; // sample_position - bw.write(3, 0)?; // reserved - bw.write_bit(false)?; // initial_presentation_delay_present - - bw.write(4, 0)?; // reserved - } - - Ok(buf) - } - - let seq = Sequence::new(&self.config); - - sequence_header_inner(&seq).unwrap() - } -} - -/// A channel modeling an encoding process -pub type VideoDataChannel = (FrameSender, PacketReceiver); +mod data; +pub use data::*; impl Config { fn setup( From a3af49f4b12745e16b4af5c2cf3653b2647cd816 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Fri, 16 Apr 2021 15:01:39 +0900 Subject: [PATCH 005/188] Check available rows upfront in cdef_dist_wxh_8x8 Admits full unrolling and autovec for the following loop. At speed 2, this yields a 9.6% increase in encoding speed. --- src/rdo.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/rdo.rs b/src/rdo.rs index cba3e79a14..8c4f69de17 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -154,6 +154,10 @@ fn cdef_dist_wxh_8x8( let mut sum_d2_cols: [u32; 8] = [0; 8]; let mut sum_sd_cols: [u32; 8] = [0; 8]; + // Check upfront that 8 rows are available. + let _row1 = &src1[7]; + let _row2 = &src2[7]; + for j in 0..8 { let row1 = &src1[j][0..8]; let row2 = &src2[j][0..8]; From 0aa01fb38c690b454b4923aadcf1ffb31e5b42a3 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 17 Mar 2021 17:32:52 +0100 Subject: [PATCH 006/188] Proof of concept of by_gop parallel encoding It uses the internals as they are so there is lots of overhead due unneeded computation we will remove later. --- src/api/channel/by_gop.rs | 355 ++++++++++++++++++++++++++++++++++++++ src/api/channel/mod.rs | 5 +- src/bin/common.rs | 11 ++ src/bin/rav1e-ch.rs | 14 +- 4 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 src/api/channel/by_gop.rs diff --git a/src/api/channel/by_gop.rs b/src/api/channel/by_gop.rs new file mode 100644 index 0000000000..f79e30a197 --- /dev/null +++ b/src/api/channel/by_gop.rs @@ -0,0 +1,355 @@ +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved +// +// This source code is subject to the terms of the BSD 2 Clause License and +// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +// was not distributed with this source code in the LICENSE file, you can +// obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +// Media Patent License 1.0 was not distributed with this source code in the +// PATENTS file, you can obtain it at www.aomedia.org/license/patent. + +use crate::api::channel::data::*; +use crate::api::config::*; +use crate::api::util::*; +use crate::api::InterConfig; + +use crossbeam::channel::*; + +// use crate::encoder::*; +use crate::frame::*; +use crate::util::Pixel; + +use std::collections::BTreeMap; +use std::sync::Arc; + +struct SubGop { + frames: Vec>>, + end_gop: bool, +} + +/* +impl SubGop { + fn build_fi(&self) -> Vec> { + todo!() + } +} +*/ + +// Extra +struct SceneChange { + frames: u64, + pyramid_size: usize, + min_key_frame_interval: u64, + max_key_frame_interval: u64, +} + +impl SceneChange { + fn new( + pyramid_size: usize, min_key_frame_interval: u64, + max_key_frame_interval: u64, + ) -> Self { + Self { + frames: 0, + pyramid_size, + min_key_frame_interval, + max_key_frame_interval, + } + } + + // Tell where to split the lookahead + // 7 is currently hardcoded, it should be a parameter + fn split( + &mut self, lookahead: &[Arc>], + ) -> Option<(usize, bool)> { + self.frames += 1; + + let new_gop = if self.frames < self.min_key_frame_interval { + false + } else if self.frames >= self.max_key_frame_interval { + self.frames = 0; + true + } else { + false + }; + + let len = lookahead.len(); + + if len > self.pyramid_size { + Some((self.pyramid_size, new_gop)) + } else if new_gop { + Some((len - 1, true)) + } else { + None + } + } +} + +struct WorkLoad { + s_recv: Receiver>, + send: Sender>, +} + +struct WorkerPoolSend { + recv_workers: Receiver>>>, + send_reassemble: Sender<(usize, Receiver>)>, + count: usize, +} + +impl WorkerPoolSend { + fn get_worker(&mut self) -> Option>> { + self.recv_workers.recv().ok().map(|sender| { + let (s_send, s_recv) = unbounded(); + let (send, recv) = unbounded(); + + let _ = self.send_reassemble.send((self.count, recv)); + + let wl = WorkLoad { s_recv, send }; + + let _ = sender.send(Some(wl)); + + self.count += 1; + + s_send + }) + } +} + +struct WorkerPoolRecv { + recv_reassemble: Receiver<(usize, Receiver>)>, + recv_workers: Receiver>>>, +} + +// TODO: make it Drop ? 
+impl WorkerPoolRecv { + fn close(&self) { + for worker in self.recv_workers.iter() { + let _ = worker.send(None); + } + } +} + +fn workerpool( + s: &rayon::ScopeFifo, workers: usize, mut cfg: Config, +) -> (WorkerPoolSend, WorkerPoolRecv) { + let (send_workers, recv_workers) = bounded(workers); + let (send_reassemble, recv_reassemble) = unbounded(); + + // TODO: unpack send_frame in process + cfg.enc.speed_settings.no_scene_detection = true; + + for _ in 0..workers { + let (send_workload, recv_workload) = unbounded::>>(); + let send_workload2 = send_workload.clone(); + let send_back = send_workers.clone(); + + let cfg = cfg.clone(); + s.spawn_fifo(move |_| { + for wl in recv_workload.iter() { + match wl { + Some(wl) => { + let mut inner = cfg.new_inner().unwrap(); + for s in wl.s_recv.iter() { + for f in s.frames { + while !inner.needs_more_fi_lookahead() { + let r = inner.receive_packet(); + match r { + Ok(p) => { + wl.send.send(p).unwrap(); + } + Err(EncoderStatus::Encoded) => {} + _ => todo!("Error management {:?}", r), + } + } + let _ = inner.send_frame(Some(f), None); + } + } + + inner.limit = Some(inner.frame_count); + let _ = inner.send_frame(None, None); + + loop { + match inner.receive_packet() { + Ok(p) => wl.send.send(p).unwrap(), + Err(EncoderStatus::LimitReached) => break, + Err(EncoderStatus::Encoded) => {} + _ => todo!("Error management"), + } + } + + let _ = send_back.send(send_workload2.clone()); + } + None => break, + } + } + }); + let _ = send_workers.send(send_workload); + } + + ( + WorkerPoolSend { + recv_workers: recv_workers.clone(), + send_reassemble, + count: 0, + }, + WorkerPoolRecv { recv_reassemble, recv_workers }, + ) +} + +fn reassemble( + pool: WorkerPoolRecv
<P>
, s: &rayon::ScopeFifo, + send_packet: Sender>, +) { + s.spawn_fifo(move |_| { + let mut pending = BTreeMap::new(); + let mut last_idx = 0; + let mut packet_index = 0; + for (idx, recv) in pool.recv_reassemble.iter() { + pending.insert(idx, recv); + while let Some(recv) = pending.remove(&last_idx) { + for mut p in recv { + // patch up the packet_index + p.input_frameno = packet_index; + let _ = send_packet.send(p); + packet_index += 1; + } + last_idx += 1; + } + } + + while !pending.is_empty() { + if let Some(recv) = pending.remove(&last_idx) { + for mut p in recv { + // patch up the packet_index + p.input_frameno = packet_index; + let _ = send_packet.send(p); + packet_index += 1; + } + } + last_idx += 1; + } + + pool.close(); + }); +} + +impl Config { + // Group the incoming frames in Gops, emit a SubGop at time. + fn scenechange( + &self, s: &rayon::ScopeFifo, r: Receiver>, + ) -> Receiver> { + let inter_cfg = InterConfig::new(&self.enc); + let lookahead_distance = + inter_cfg.keyframe_lookahead_distance() as usize + 1; + let (send, recv) = bounded(lookahead_distance * 2); + + let mut sc = SceneChange::new( + lookahead_distance, + self.enc.min_key_frame_interval, + self.enc.max_key_frame_interval, + ); + + s.spawn_fifo(move |_| { + let mut lookahead = Vec::new(); + for f in r.iter() { + let (frame, _params) = f; + + lookahead.push(frame.unwrap()); + + // we need at least lookahead_distance frames to reason + if lookahead.len() < lookahead_distance { + continue; + } + + if let Some((split_pos, end_gop)) = sc.split(&lookahead) { + let rem = lookahead.split_off(split_pos); + let _ = send.send(SubGop { frames: lookahead, end_gop }); + + lookahead = rem; + } + } + + while let Some((split_pos, end_gop)) = sc.split(&lookahead) { + let rem = lookahead.split_off(split_pos); + let _ = send.send(SubGop { frames: lookahead, end_gop }); + + lookahead = rem; + } + + if !lookahead.is_empty() { + let _ = send.send(SubGop { frames: lookahead, end_gop: true }); + } + }); + + recv + } + + /// Encode the subgops, dispatch each Gop to an available worker + fn encode( + &self, s: &rayon::ScopeFifo, workers: usize, r: Receiver>, + send_packet: Sender>, + ) { + let (mut workers, recv) = workerpool(s, workers, self.clone()); + + s.spawn_fifo(move |_| { + let mut sg_send = workers.get_worker().unwrap(); + for sb in r.iter() { + let end_gop = sb.end_gop; + let _ = sg_send.send(sb); + + if end_gop { + sg_send = workers.get_worker().unwrap(); + } + } + }); + + reassemble(recv, s, send_packet) + } + + /// Create a single pass by_gop encoder channel + /// + /// Drop the `FrameSender` endpoint to flush the encoder. 
+ /// + /// + pub fn new_by_gop_channel( + &self, slots: usize, + ) -> Result, InvalidConfig> { + let rc = &self.rate_control; + + if rc.emit_pass_data || rc.summary.is_some() { + return Err(InvalidConfig::RateControlConfigurationMismatch); + } + + self.validate()?; + + // TODO: make it user-settable + let input_len = self.enc.rdo_lookahead_frames as usize * 4; + let frame_limit = std::i32::MAX as u64; + + let (send_frame, receive_frame) = bounded(input_len); + let (send_packet, receive_packet) = unbounded(); + + let cfg = self.clone(); + + let pool = self.new_thread_pool(); + + // TODO: move the accounting threads outside the threadpool + let run = move || { + let _ = rayon::scope_fifo(|s| { + let sg_recv = cfg.scenechange(s, receive_frame); + cfg.encode(s, slots, sg_recv, send_packet); + }); + }; + + if let Some(pool) = pool { + pool.spawn_fifo(run); + } else { + rayon::spawn_fifo(run); + } + + let channel = ( + FrameSender::new(frame_limit, send_frame, Arc::new(self.enc)), + PacketReceiver { receiver: receive_packet, config: Arc::new(self.enc) }, + ); + + Ok(channel) + } +} diff --git a/src/api/channel/mod.rs b/src/api/channel/mod.rs index 3225a68078..5822b755e4 100644 --- a/src/api/channel/mod.rs +++ b/src/api/channel/mod.rs @@ -24,8 +24,11 @@ use std::sync::Arc; mod data; pub use data::*; +mod by_gop; +pub use by_gop::*; + impl Config { - fn setup( + pub(crate) fn setup( &self, ) -> Result<(ContextInner, Option>), InvalidConfig> { self.validate()?; diff --git a/src/bin/common.rs b/src/bin/common.rs index 30a52ebef6..3b77bc9eac 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -47,6 +47,7 @@ pub struct CliOptions { pub pass1file_name: Option, pub pass2file_name: Option, pub save_config: Option, + pub slots: usize, } #[cfg(feature = "serialize")] @@ -369,6 +370,13 @@ pub fn parse_cli() -> Result { .help("Overwrite output file.") .short("y") ) + .arg( + Arg::with_name("SLOTS") + .help("Select the number of by_gop encoder-slots to allocate") + .long("slots") + .takes_value(true) + .default_value("0") + ) .subcommand(SubCommand::with_name("advanced") .setting(AppSettings::Hidden) .about("Advanced features") @@ -486,6 +494,8 @@ pub fn parse_cli() -> Result { panic!("A limit cannot be set above 1 in still picture mode"); } + let slots = matches.value_of("SLOTS").unwrap().parse().unwrap(); + Ok(CliOptions { io, enc, @@ -502,6 +512,7 @@ pub fn parse_cli() -> Result { pass1file_name: matches.value_of("FIRST_PASS").map(|s| s.to_owned()), pass2file_name: matches.value_of("SECOND_PASS").map(|s| s.to_owned()), save_config, + slots, }) } diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index 22d1bca92c..fd8c2fd4cf 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -103,7 +103,6 @@ impl Source { return false; } } - match self.input.read_frame(send_frame, &video_info) { Ok(frame) => { self.count += 1; @@ -120,7 +119,7 @@ fn do_encode( output: &mut dyn Muxer, mut source: Source, pass1file: Option, pass2file: Option, mut y4m_enc: Option>>, - metrics_enabled: MetricsEnabled, + metrics_enabled: MetricsEnabled, slots: usize, ) -> Result<(), CliError> { let ((mut send_frame, receive_packet), (send_rc, receive_rc)) = match (pass1file.is_some(), pass2file.is_some()) { @@ -143,8 +142,13 @@ fn do_encode( (channel, (Some(send_rc), None)) } (false, false) => { - let channel = - cfg.new_channel().map_err(|e| e.context("Invalid setup"))?; + let channel = if slots == 0 { + cfg.new_channel().map_err(|e| e.context("Invalid setup"))? 
+ } else { + cfg + .new_by_gop_channel(slots) + .map_err(|e| e.context("Invalid setup"))? + }; (channel, (None, None)) } }; @@ -560,6 +564,7 @@ fn run() -> Result<(), error::CliError> { pass2file, y4m_enc, cli.metrics_enabled, + cli.slots, )? } else { do_encode::>>( @@ -572,6 +577,7 @@ fn run() -> Result<(), error::CliError> { pass2file, y4m_enc, cli.metrics_enabled, + cli.slots, )? } if cli.benchmark { From b03724fce0864500c23a0998b21589c12d55d1f0 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 14 Apr 2021 17:04:23 +0200 Subject: [PATCH 007/188] Change the naming to be more user-friendly And report that parallel gop encoding is disabled for multi-pass rate control. --- src/api/channel/mod.rs | 26 ++++++++++++++++++++++++-- src/api/config/mod.rs | 10 ++++++++++ src/bin/common.rs | 24 ++++++++++++++++-------- src/bin/rav1e-ch.rs | 16 +++++----------- 4 files changed, 55 insertions(+), 21 deletions(-) diff --git a/src/api/channel/mod.rs b/src/api/channel/mod.rs index 5822b755e4..83d4f7ee0e 100644 --- a/src/api/channel/mod.rs +++ b/src/api/channel/mod.rs @@ -52,8 +52,11 @@ impl Config { if rc.emit_pass_data || rc.summary.is_some() { return Err(InvalidConfig::RateControlConfigurationMismatch); } - - let (v, _) = self.new_channel_internal()?; + let v = if self.slots > 1 { + self.new_by_gop_channel(self.slots)? + } else { + self.new_channel_internal()?.0 + }; Ok(v) } @@ -73,6 +76,13 @@ impl Config { if !rc.emit_pass_data { return Err(InvalidConfig::RateControlConfigurationMismatch); } + + if self.slots > 1 { + log::warn!( + "Parallel gop encoding does not support multi pass rate control" + ); + } + let (v, (_, r)) = self.new_channel_internal()?; Ok((v, r.unwrap())) @@ -91,6 +101,12 @@ impl Config { return Err(InvalidConfig::RateControlConfigurationMismatch); } + if self.slots > 1 { + log::warn!( + "Parallel gop encoding does not support multi pass rate control" + ); + } + let (v, (s, _)) = self.new_channel_internal()?; Ok((v, s.unwrap())) @@ -112,6 +128,12 @@ impl Config { return Err(InvalidConfig::RateControlConfigurationMismatch); } + if self.slots > 1 { + log::warn!( + "Parallel gop encoding does not support multi pass rate control" + ); + } + let (v, (s, r)) = self.new_channel_internal()?; Ok((v, (s.unwrap(), r.unwrap()))) diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index 19eb44bd75..51421a3c8a 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -128,6 +128,9 @@ pub struct Config { pub(crate) threads: usize, /// Shared thread pool pub(crate) pool: Option>, + #[cfg(feature = "unstable")] + /// Number of parallel encoding slots + pub(crate) slots: usize, } impl Config { @@ -175,6 +178,13 @@ impl Config { self.pool = Some(pool); self } + + #[cfg(feature = "unstable")] + /// Set the maximum number of GOPs to encode in parallel + pub fn with_parallel_gops(mut self, slots: usize) -> Self { + self.slots = slots; + self + } } fn check_tile_log2(n: usize) -> bool { diff --git a/src/bin/common.rs b/src/bin/common.rs index 3b77bc9eac..4ce986a5a2 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -370,13 +370,6 @@ pub fn parse_cli() -> Result { .help("Overwrite output file.") .short("y") ) - .arg( - Arg::with_name("SLOTS") - .help("Select the number of by_gop encoder-slots to allocate") - .long("slots") - .takes_value(true) - .default_value("0") - ) .subcommand(SubCommand::with_name("advanced") .setting(AppSettings::Hidden) .about("Advanced features") @@ -401,6 +394,17 @@ pub fn parse_cli() -> Result { ) ); + if cfg!(feature = "unstable") { + app = app.arg( + 
Arg::with_name("SLOTS") + .help("Maximum number of GOPs that can be encoded in parallel") + .long("parallel_gops") + .long("slots") + .takes_value(true) + .default_value("0"), + ); + } + let matches = app.clone().get_matches(); if matches.is_present("FULLHELP") { @@ -494,7 +498,11 @@ pub fn parse_cli() -> Result { panic!("A limit cannot be set above 1 in still picture mode"); } - let slots = matches.value_of("SLOTS").unwrap().parse().unwrap(); + let slots = if cfg!(feature = "unstable") { + matches.value_of("SLOTS").unwrap().parse().unwrap() + } else { + 0 + }; Ok(CliOptions { io, diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index fd8c2fd4cf..0b8237229f 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -119,7 +119,7 @@ fn do_encode( output: &mut dyn Muxer, mut source: Source, pass1file: Option, pass2file: Option, mut y4m_enc: Option>>, - metrics_enabled: MetricsEnabled, slots: usize, + metrics_enabled: MetricsEnabled, ) -> Result<(), CliError> { let ((mut send_frame, receive_packet), (send_rc, receive_rc)) = match (pass1file.is_some(), pass2file.is_some()) { @@ -142,13 +142,8 @@ fn do_encode( (channel, (Some(send_rc), None)) } (false, false) => { - let channel = if slots == 0 { - cfg.new_channel().map_err(|e| e.context("Invalid setup"))? - } else { - cfg - .new_by_gop_channel(slots) - .map_err(|e| e.context("Invalid setup"))? - }; + let channel = + cfg.new_channel().map_err(|e| e.context("Invalid setup"))?; (channel, (None, None)) } }; @@ -491,7 +486,8 @@ fn run() -> Result<(), error::CliError> { let cfg = Config::new() .with_encoder_config(cli.enc) .with_threads(cli.threads) - .with_rate_control(rc); + .with_rate_control(rc) + .with_parallel_gops(cli.slots); #[cfg(feature = "serialize")] { @@ -564,7 +560,6 @@ fn run() -> Result<(), error::CliError> { pass2file, y4m_enc, cli.metrics_enabled, - cli.slots, )? } else { do_encode::>>( @@ -577,7 +572,6 @@ fn run() -> Result<(), error::CliError> { pass2file, y4m_enc, cli.metrics_enabled, - cli.slots, )? 
} if cli.benchmark { From d7604b97524a2dd889131d3274ec79f9d09db751 Mon Sep 17 00:00:00 2001 From: Ewout ter Hoeven Date: Wed, 21 Apr 2021 10:55:02 +0200 Subject: [PATCH 008/188] CI: Update to cargo-c v0.8.0 --- .github/workflows/deploy.yml | 2 +- .github/workflows/rav1e.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 34bd02c61b..2d28dffecd 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -29,7 +29,7 @@ jobs: - name: Install cargo-c run: | - $LINK = "https://github.com/lu-zero/cargo-c/releases/download/v0.7.1" + $LINK = "https://github.com/lu-zero/cargo-c/releases/download/v0.8.0" $CARGO_C_FILE = "cargo-c-windows-msvc" curl -LO "$LINK/$CARGO_C_FILE.zip" 7z e -y "$CARGO_C_FILE.zip" -o"${env:USERPROFILE}\.cargo\bin" diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index b07b05f605..d12de8a084 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -174,7 +174,7 @@ jobs: if: matrix.conf == 'cargo-c' env: LINK: https://github.com/lu-zero/cargo-c/releases/download - CARGO_C_VERSION: 0.7.1 + CARGO_C_VERSION: 0.8.0 run: | curl -L "$LINK/v$CARGO_C_VERSION/cargo-c-linux.tar.gz" | tar xz -C $HOME/.cargo/bin @@ -449,7 +449,7 @@ jobs: - name: Install cargo-c if: matrix.conf == 'cargo-c' run: | - $LINK = "https://github.com/lu-zero/cargo-c/releases/download/v0.7.3" + $LINK = "https://github.com/lu-zero/cargo-c/releases/download/v0.8.0" $CARGO_C_FILE = "cargo-c-windows-msvc" curl -LO "$LINK/$CARGO_C_FILE.zip" 7z e -y "$CARGO_C_FILE.zip" -o"${env:USERPROFILE}\.cargo\bin" From 8eaff16491ac8e1218997a5e28417f7c6227040f Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 21 Apr 2021 14:06:50 +0200 Subject: [PATCH 009/188] Run cargo check for unstable features --- .github/workflows/rav1e.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index d12de8a084..cb00c4b337 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -54,6 +54,7 @@ jobs: - cargo-c - check-no-default - check-extra-feats + - check-unstable-feats - fuzz include: - conf: beta-build @@ -78,6 +79,8 @@ jobs: toolchain: stable - conf: check-extra-feats toolchain: stable + - conf: check-unstable-feats + toolchain: stable - conf: fuzz toolchain: stable @@ -252,6 +255,10 @@ jobs: if: matrix.toolchain == 'stable' && matrix.conf == 'check-extra-feats' run: | cargo check --features=check_asm,capi,dump_lookahead_data,serialize + - name: Check extra features + if: matrix.toolchain == 'stable' && matrix.conf == 'check-unstable-feats' + run: | + cargo check --features=unstable,channel-api - name: Run cargo-c if: matrix.conf == 'cargo-c' run: | From b91f83ff280a2da975887897aac57b427f7cd52b Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 26 Apr 2021 16:28:56 +0900 Subject: [PATCH 010/188] Reconfigure speed level 2 to be level 3 with bottom-up partitioning The average encoding time increase is approximately 5% on objective-1-fast: PSNR Y | PSNR Cb | PSNR Cr | CIEDE2000 | SSIM | MS-SSIM | PSNR-HVS Y | PSNR-HVS Cb | PSNR-HVS Cr | PSNR-HVS | VMAF | VMAF-NEG -1.4287 | 0.2338 | -0.0450 | -0.8039 | -1.3400 | -1.2139 | -1.5199 | 0.2588 | -0.0137 | -1.4722 | -1.2587 | -1.1720 Note that after this change we no longer have a speed level with both top-down partitioning and 4x4 partitions. 
--- src/api/config/speedsettings.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index be2c9a62fc..4b13c71e13 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -136,8 +136,8 @@ impl SpeedSettings { /// - 4: min block size 8x8, complex pred modes for keyframes, RDO TX decision, full SGR search. /// - 3: min block size 8x8, complex pred modes for keyframes, RDO TX decision, include near MVs, /// full SGR search. - /// - 2: min block size 4x4, complex pred modes, RDO TX decision, include near MVs, - /// full SGR search. + /// - 2: min block size 8x8, complex pred modes for keyframes, RDO TX decision, include near MVs, + /// bottom-up encoding, full SGR search. /// - 1: min block size 4x4, complex pred modes, RDO TX decision, include near MVs, /// bottom-up encoding, full SGR search. /// - 0 (slowest): min block size 4x4, complex pred modes, RDO TX decision, include near MVs, @@ -170,7 +170,7 @@ impl SpeedSettings { /// This preset is set this way because 8x8 with reduced TX set is faster but with equivalent /// or better quality compared to 16x16 (to which reduced TX set does not apply). fn partition_range_preset(speed: usize) -> PartitionRange { - if speed <= 2 { + if speed <= 1 { PartitionRange::new(BlockSize::BLOCK_4X4, BlockSize::BLOCK_64X64) } else if speed <= 8 { PartitionRange::new(BlockSize::BLOCK_8X8, BlockSize::BLOCK_64X64) @@ -208,7 +208,7 @@ impl SpeedSettings { } const fn encode_bottomup_preset(speed: usize) -> bool { - speed <= 1 + speed <= 2 } /// Set default rdo-lookahead-frames for different speed settings @@ -216,8 +216,8 @@ impl SpeedSettings { match speed { 9..=10 => 10, 6..=8 => 20, - 3..=5 => 30, - 0..=2 => 40, + 2..=5 => 30, + 0..=1 => 40, _ => 40, } } @@ -227,7 +227,7 @@ impl SpeedSettings { } fn prediction_modes_preset(speed: usize) -> PredictionModesSetting { - if speed <= 2 { + if speed <= 1 { PredictionModesSetting::ComplexAll } else if speed <= 6 { PredictionModesSetting::ComplexKeyframes From f54b23b606dd62a1439bae697423719022f175f0 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 29 Apr 2021 16:20:51 +0900 Subject: [PATCH 011/188] Create new speed level 1, drop level 4 Drop speed level 4 which requires 14.69% greater encoding time than speed 5 for very marginal gains: PSNR Y | PSNR Cb | PSNR Cr | CIEDE2000 | SSIM | MS-SSIM | PSNR-HVS Y | PSNR-HVS Cb | PSNR-HVS Cr | PSNR-HVS | VMAF | VMAF-NEG 0.0187 | -0.1219 | -0.2468 | -0.0744 | 0.0048 | -0.0100 | 0.0146 | -0.0276 | -0.1719 | 0.0107 | 0.0978 | 0.0966 Insert a new speed level with twice the encoding time of the prior speed 1 but very significant efficiency gains: PSNR Y | PSNR Cb | PSNR Cr | CIEDE2000 | SSIM | MS-SSIM | PSNR-HVS Y | PSNR-HVS Cb | PSNR-HVS Cr | PSNR-HVS | VMAF | VMAF-NEG -3.3335 | -5.9477 | -6.2786 | -4.4695 | -3.4404 | -3.3753 | -3.3563 | -5.9527 | -6.7053 | -3.4610 | -3.4325 | -3.4850 Likewise, the prior speed 0 has more than twice the encoding time of this new level and significant efficiency gains: PSNR Y | PSNR Cb | PSNR Cr | CIEDE2000 | SSIM | MS-SSIM | PSNR-HVS Y | PSNR-HVS Cb | PSNR-HVS Cr | PSNR-HVS | VMAF | VMAF-NEG -2.0265 | 0.8643 | 0.3991 | -1.2325 | -1.4498 | -1.3123 | -2.0252 | 2.0639 | 1.0175 | -1.9425 | -2.4707 | -2.2548 The new speed level 1 is equivalent to speed 0 with simple segment selection rather than a full search. 
--- src/api/config/speedsettings.rs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index 4b13c71e13..5524396d3e 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -133,15 +133,17 @@ impl SpeedSettings { /// - 7: min block size 8x8, reduced TX set. /// - 6 (default): min block size 8x8, reduced TX set, complex pred modes for keyframes. /// - 5: min block size 8x8, complex pred modes for keyframes, RDO TX decision. - /// - 4: min block size 8x8, complex pred modes for keyframes, RDO TX decision, full SGR search. - /// - 3: min block size 8x8, complex pred modes for keyframes, RDO TX decision, include near MVs, + /// - 4: min block size 8x8, complex pred modes for keyframes, RDO TX decision, include near MVs, /// full SGR search. - /// - 2: min block size 8x8, complex pred modes for keyframes, RDO TX decision, include near MVs, + /// - 3: min block size 8x8, complex pred modes for keyframes, RDO TX decision, include near MVs, /// bottom-up encoding, full SGR search. - /// - 1: min block size 4x4, complex pred modes, RDO TX decision, include near MVs, + /// - 2: min block size 4x4, complex pred modes, RDO TX decision, include near MVs, /// bottom-up encoding, full SGR search. - /// - 0 (slowest): min block size 4x4, complex pred modes, RDO TX decision, include near MVs, + /// - 1: min block size 4x4, complex pred modes, RDO TX decision, include near MVs, /// bottom-up encoding with non-square partitions everywhere, full SGR search. + /// - 0 (slowest): min block size 4x4, complex pred modes, RDO TX decision, include near MVs, + /// bottom-up encoding with non-square partitions everywhere, full SGR search, + /// full segmentation search. pub fn from_preset(speed: usize) -> Self { SpeedSettings { partition_range: Self::partition_range_preset(speed), @@ -170,7 +172,7 @@ impl SpeedSettings { /// This preset is set this way because 8x8 with reduced TX set is faster but with equivalent /// or better quality compared to 16x16 (to which reduced TX set does not apply). 
fn partition_range_preset(speed: usize) -> PartitionRange { - if speed <= 1 { + if speed <= 2 { PartitionRange::new(BlockSize::BLOCK_4X4, BlockSize::BLOCK_64X64) } else if speed <= 8 { PartitionRange::new(BlockSize::BLOCK_8X8, BlockSize::BLOCK_64X64) @@ -208,7 +210,7 @@ impl SpeedSettings { } const fn encode_bottomup_preset(speed: usize) -> bool { - speed <= 2 + speed <= 3 } /// Set default rdo-lookahead-frames for different speed settings @@ -216,8 +218,8 @@ impl SpeedSettings { match speed { 9..=10 => 10, 6..=8 => 20, - 2..=5 => 30, - 0..=1 => 40, + 3..=5 => 30, + 0..=2 => 40, _ => 40, } } @@ -227,7 +229,7 @@ impl SpeedSettings { } fn prediction_modes_preset(speed: usize) -> PredictionModesSetting { - if speed <= 1 { + if speed <= 2 { PredictionModesSetting::ComplexAll } else if speed <= 6 { PredictionModesSetting::ComplexKeyframes @@ -237,7 +239,7 @@ impl SpeedSettings { } const fn include_near_mvs_preset(speed: usize) -> bool { - speed <= 3 + speed <= 4 } const fn no_scene_detection_preset(_speed: usize) -> bool { @@ -269,7 +271,7 @@ impl SpeedSettings { } const fn non_square_partition_preset(speed: usize) -> bool { - speed == 0 + speed <= 1 } fn segmentation_preset(speed: usize) -> SegmentationLevel { From 7749085a1b9edf033a07b0161c0e92d2e803cd91 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Fri, 7 May 2021 23:39:23 +0900 Subject: [PATCH 012/188] CI: Update libaom to 3.1.0-dmo0~bpo10+1 --- .github/workflows/rav1e.yml | 6 +++--- .travis/install-aom.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index cb00c4b337..71ebceb617 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -129,11 +129,11 @@ jobs: matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/a/aom-dmo - AOM_VERSION: 3.0.0-dmo0~bpo10+1 + AOM_VERSION: 3.1.0-dmo0~bpo10+1 AOM_DEV_SHA256: >- - 40b273179f1d9d75202b18295b8fe3e406076e061831d66c2b469bc1a10e5bce + 1a78ad10714c0cd9ed2324007369c20a5d9047d98e7098f932f48edb01056f36 AOM_LIB_SHA256: >- - 1bb977ff6b7c42e4e0d5f743670008fc71e79eb5a9fdf83154fe7890379e9a17 + a2e1f0a0ab1be6b93a1582d68b869d27e88c1fb8df7fae7bd793ebc0322c76a2 run: | echo "$LINK/libaom-dev_${AOM_VERSION}_amd64.deb" >> DEBS echo "$LINK/libaom3_${AOM_VERSION}_amd64.deb" >> DEBS diff --git a/.travis/install-aom.sh b/.travis/install-aom.sh index 53bef70953..ae0a192877 100755 --- a/.travis/install-aom.sh +++ b/.travis/install-aom.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -AOM_VERSION="3.0.0-dmo0~bpo10+1" +AOM_VERSION="3.1.0-dmo0~bpo10+1" PKG_URL="https://www.deb-multimedia.org/pool/main/a/aom-dmo" ARCH="arm64" @@ -13,8 +13,8 @@ curl -O "${PKG_URL}/libaom-dev_${AOM_VERSION}_${ARCH}.deb" \ -O "${PKG_URL}/libaom3_${AOM_VERSION}_${ARCH}.deb" sha256sum --check --ignore-missing < Date: Sat, 8 May 2021 00:07:04 +0900 Subject: [PATCH 013/188] Resolve new clippy warnings --- src/bin/decoder/mod.rs | 1 + src/bin/rav1e.rs | 37 +++++++++++++++++-------------------- src/context/cdf_context.rs | 2 +- src/predict.rs | 10 ++-------- 4 files changed, 21 insertions(+), 29 deletions(-) diff --git a/src/bin/decoder/mod.rs b/src/bin/decoder/mod.rs index 8db83696ff..d9cecc64cb 100644 --- a/src/bin/decoder/mod.rs +++ b/src/bin/decoder/mod.rs @@ -26,6 +26,7 @@ pub trait Decoder: Send { #[derive(Debug)] pub enum DecodeError { + #[allow(clippy::upper_case_acronyms)] EOF, BadInput, UnknownColorspace, diff --git a/src/bin/rav1e.rs b/src/bin/rav1e.rs index c85d3b1e1e..74fd111921 100644 --- 
a/src/bin/rav1e.rs +++ b/src/bin/rav1e.rs @@ -400,26 +400,23 @@ fn run() -> Result<(), error::CliError> { Ok(d) => d, }; let video_info = y4m_dec.get_video_details(); - let y4m_enc = match cli.io.rec { - Some(rec) => Some( - y4m::encode( - video_info.width, - video_info.height, - y4m::Ratio::new( - video_info.time_base.den as usize, - video_info.time_base.num as usize, - ), - ) - .with_colorspace(y4m_dec.get_colorspace()) - .with_pixel_aspect(y4m::Ratio { - num: video_info.sample_aspect_ratio.num as usize, - den: video_info.sample_aspect_ratio.den as usize, - }) - .write_header(rec) - .unwrap(), - ), - None => None, - }; + let y4m_enc = cli.io.rec.map(|rec| { + y4m::encode( + video_info.width, + video_info.height, + y4m::Ratio::new( + video_info.time_base.den as usize, + video_info.time_base.num as usize, + ), + ) + .with_colorspace(y4m_dec.get_colorspace()) + .with_pixel_aspect(y4m::Ratio { + num: video_info.sample_aspect_ratio.num as usize, + den: video_info.sample_aspect_ratio.den as usize, + }) + .write_header(rec) + .unwrap() + }); cli.enc.width = video_info.width; cli.enc.height = video_info.height; diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 372d6d9885..9f5362d51a 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -701,8 +701,8 @@ impl<'a> ContextWriter<'a> { let fc_log = CDFContextLog::new(fc); #[allow(unused_mut)] let mut cw = ContextWriter { - fc, bc, + fc, fc_log, #[cfg(feature = "desync_finder")] fc_map: Default::default(), diff --git a/src/predict.rs b/src/predict.rs index af44494394..cdea8740a3 100644 --- a/src/predict.rs +++ b/src/predict.rs @@ -561,14 +561,8 @@ impl IntraEdgeFilterParameters { .into(), None => None, }, - above_ref_frame_types: match above_ctx { - Some(bi) => Some(bi.reference_types), - None => None, - }, - left_ref_frame_types: match left_ctx { - Some(bi) => Some(bi.reference_types), - None => None, - }, + above_ref_frame_types: above_ctx.map(|bi| bi.reference_types), + left_ref_frame_types: left_ctx.map(|bi| bi.reference_types), } } From 73146b7ff999688bbfc2653f3e94699f81d3ccb7 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 22:56:48 -0400 Subject: [PATCH 014/188] lrf: fix comment --- src/lrf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lrf.rs b/src/lrf.rs index ea91e9d88b..db4600348b 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -83,7 +83,7 @@ const SGRPROJ_ALL_SETS: &[u8] = // parameters as non-zero. The other two are distinguishable by which of the // two parameters is zero. There are an even number of each of these groups and // the non-zero parameters grow as the indices increase. This array uses the -// 1nd, 3rd, ... smallest params of each group. +// 1st, 3rd, ... smallest params of each group. const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15]; pub fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] { From 64295338636c264d476fb04f12e58542208fe680 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 22:57:40 -0400 Subject: [PATCH 015/188] rate: fix typos --- src/rate.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rate.rs b/src/rate.rs index 748aba5e69..5da59889e9 100644 --- a/src/rate.rs +++ b/src/rate.rs @@ -388,7 +388,7 @@ impl IIRBessel2 { // This does not alter the x/y state, but changes the reaction time of the // filter. 
// Altering the time constant of a reactive filter without altering internal - // state is something that has to be done carefuly, but our design operates + // state is something that has to be done carefully, but our design operates // at high enough delays and with small enough time constant changes to make // it safe. pub fn reinit(&mut self, delay: i32) { @@ -823,7 +823,7 @@ impl RCState { // Insane framerates or frame sizes mean insane bitrates. // Let's not get carried away. // We also subtract 16 bits from each temporal unit to account for the - // temporal delimeter, whose bits are not included in the frame sizes + // temporal delimiter, whose bits are not included in the frame sizes // reported to update_state(). // TODO: Support constraints imposed by levels. let bits_per_tu = clamp( @@ -1427,7 +1427,7 @@ impl RCState { self.reservoir_fullness -= bits; if show_frame { self.reservoir_fullness += self.bits_per_tu; - // TODO: Properly account for temporal delimeter bits. + // TODO: Properly account for temporal delimiter bits. } // If we're too quick filling the buffer and overflow is capped, that // rate is lost forever. From 7ce12710a60b474fc51a7d43221d4279f8a5cea4 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 22:58:23 -0400 Subject: [PATCH 016/188] deblock: fix typo --- src/deblock.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deblock.rs b/src/deblock.rs index 973870382a..e837e2b112 100644 --- a/src/deblock.rs +++ b/src/deblock.rs @@ -1355,7 +1355,7 @@ pub fn deblock_plane( >> ydec << ydec; // Clippy can go suck an egg - // vertical edge filtering leads horizonal by one full MI-sized + // vertical edge filtering leads horizontal by one full MI-sized // row (and horizontal filtering doesn't happen along the upper // edge). Unroll to avoid corner-cases. if rows > 0 { From 0e49c5b673a7b7d9dee8c45240b7a21694f7a865 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 22:58:59 -0400 Subject: [PATCH 017/188] cdef: fix typos --- src/cdef.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cdef.rs b/src/cdef.rs index d65bb78e60..de895bc594 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -386,7 +386,7 @@ pub fn cdef_analyze_superblock( // provided blocks co-locate with the output region. The TileBlocks // provide by-[super]qblock CDEF parameters. -// output: TileMut detination for filtered pixels. The output's +// output: TileMut destination for filtered pixels. The output's // rect specifies the region of the input to be processed (x and y // are relative to the input Frame's origin). Note that an // additional area of 2 pixels of padding is used for CDEF. When @@ -578,7 +578,7 @@ pub fn cdef_filter_superblock( // tb: the TileBlocks associated with the filtered region; the // provided blocks co-locate with the output region. -// output: TileMut detination for filtered pixels. The output's +// output: TileMut destination for filtered pixels. The output's // rect specifies the region of the input to be processed (x and y // are relative to the input Frame's origin). Note that an // additional area of 2 pixels of padding is used for CDEF. 
When From 5282fe8e4d373fe6ea5609fa971ff3df53078f03 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 22:59:34 -0400 Subject: [PATCH 018/188] ec: fix typos --- src/ec.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ec.rs b/src/ec.rs index be52216e91..ce9b5237d0 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -86,7 +86,7 @@ pub trait Writer { ) -> u32; /// Return current length of range-coded bitstream in integer bits fn tell(&mut self) -> u32; - /// Return currrent length of range-coded bitstream in fractional + /// Return current length of range-coded bitstream in fractional /// bits with OD_BITRES decimal precision fn tell_frac(&mut self) -> u32; /// Save current point in coding/recording to a checkpoint @@ -669,7 +669,7 @@ where } } } - /// Resturns QOD_BITRES bits for symbol v in [0, n-1] with parameter k as finite subexponential + /// Returns QOD_BITRES bits for symbol v in [0, n-1] with parameter k as finite subexponential /// n: size of interval /// k: 'parameter' /// v: value to encode From 2e11360652206ff20c65b38b8b3b09ecbf58bed9 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 22:59:53 -0400 Subject: [PATCH 019/188] bin: fix typo --- src/bin/common.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/common.rs b/src/bin/common.rs index 4ce986a5a2..759e6627ad 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -355,7 +355,7 @@ pub fn parse_cli() -> Result { ) .arg( Arg::with_name("METRICS") - .help("Calulate and display several metrics including PSNR, SSIM, CIEDE2000 etc") + .help("Calculate and display several metrics including PSNR, SSIM, CIEDE2000 etc") .long("metrics") ) .arg( From 1b65a0467129a175b487850ecff24f23c636a660 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:00:24 -0400 Subject: [PATCH 020/188] config: encoder: fix typo --- src/api/config/encoder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/config/encoder.rs b/src/api/config/encoder.rs index c0f0d494c4..c91d7f4098 100644 --- a/src/api/config/encoder.rs +++ b/src/api/config/encoder.rs @@ -105,7 +105,7 @@ pub struct EncoderConfig { /// Number of frames to read ahead for the RDO lookahead computation. pub rdo_lookahead_frames: usize, - /// Settings which affect the enconding speed vs. quality trade-off. + /// Settings which affect the encoding speed vs. quality trade-off. 
pub speed_settings: SpeedSettings, } From 99669fade5850ce51e49f74672a6c7ff7a437650 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:00:49 -0400 Subject: [PATCH 021/188] frame: mod: fix typo --- src/frame/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/frame/mod.rs b/src/frame/mod.rs index af0f11d273..2ea9c4fd8b 100644 --- a/src/frame/mod.rs +++ b/src/frame/mod.rs @@ -68,7 +68,7 @@ impl FrameAlloc for Frame { } } -/// Public Trait for calulating Padding +/// Public Trait for calculating Padding pub(crate) trait FramePad { fn pad(&mut self, w: usize, h: usize, planes: usize); } From 01898fe014c7764bbb92b0d0a5b3106ceee3fc0e Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:02:56 -0400 Subject: [PATCH 022/188] x86:itx: fix typo --- src/x86/itx.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/itx.asm b/src/x86/itx.asm index 5b373c5f7e..1caa814ed4 100644 --- a/src/x86/itx.asm +++ b/src/x86/itx.asm @@ -124,7 +124,7 @@ pw_m2751_3035x8: dw -2751*8, 3035*8 SECTION .text -; Code size reduction trickery: Intead of using rip-relative loads with +; Code size reduction trickery: Instead of using rip-relative loads with ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. From 6f9208c44a2c8dad22a4562c7e264735e4fe9123 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:03:45 -0400 Subject: [PATCH 023/188] transform: fix typos --- src/transform/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transform/mod.rs b/src/transform/mod.rs index 1e8760a0f1..ede695ec10 100644 --- a/src/transform/mod.rs +++ b/src/transform/mod.rs @@ -559,10 +559,10 @@ mod test { (TX_16X16, IDTX, 0), (TX_16X16, V_DCT, 1), (TX_16X16, H_DCT, 1), - // 32x tranforms only use DCT_DCT and IDTX + // 32x transforms only use DCT_DCT and IDTX (TX_32X32, DCT_DCT, 2), (TX_32X32, IDTX, 0), - // 64x tranforms only use DCT_DCT and IDTX + // 64x transforms only use DCT_DCT and IDTX //(TX_64X64, DCT_DCT, 0), (TX_4X8, DCT_DCT, 1), (TX_8X4, DCT_DCT, 1), From c5bc3402475bf6b61907cfc0d73574caaaee3be9 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:04:05 -0400 Subject: [PATCH 024/188] x86: quanize: fix typo --- src/asm/x86/quantize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index 27eaeb33a6..d4ba56695e 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -191,7 +191,7 @@ mod test { let mut qcoeffs = Aligned::new([0i16; 32 * 32]); let mut rcoeffs = Aligned::new([0i16; 32 * 32]); - // Generate quantized coefficients upto the eob + // Generate quantized coefficients up to the eob let between = Uniform::from(-std::i16::MAX..=std::i16::MAX); for (i, qcoeff) in qcoeffs.data.iter_mut().enumerate().take(eob) { *qcoeff = between.sample(&mut rng) From b31295235f5f317d2b7eade2fb559264171f2253 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:04:33 -0400 Subject: [PATCH 025/188] tools: draw-importances: fix typo --- tools/draw-importances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/draw-importances.py b/tools/draw-importances.py index 4b661afd39..bbf7a2594f 100755 --- a/tools/draw-importances.py +++ b/tools/draw-importances.py @@ -269,7 +269,7 @@ def blockImportance(input, 
verbose, path, figure, raw, csv): elif verbose and csv: click.secho( - "CSV Saved sucessfully: " + folder_path + "/" + csv + ".", + "CSV Saved successfully: " + folder_path + "/" + csv + ".", fg="blue", ) From c1ddbfe2e339a831090a30182ad586b4bebb9977 Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:04:56 -0400 Subject: [PATCH 026/188] doc: STRUCTURE: fix typo --- doc/STRUCTURE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/STRUCTURE.md b/doc/STRUCTURE.md index 214f3c0a9e..8ad189680d 100644 --- a/doc/STRUCTURE.md +++ b/doc/STRUCTURE.md @@ -31,7 +31,7 @@ The below table gives a brief overview of design of [`src/*`](../src/) | [bin/rav1e.rs](../src/bin/rav1e.rs) | CLI Interface for encoding from y4m files with rav1e | | [bin/stats.rs](../src/bin/stats.rs) | Functions for displaying Frame summary, progress info, metrics of the encoding process | | [bin/kv.rs](../src/bin/kv.rs) | Serialisation configuration of Key-value strings | -| [bin/errror.rs](../src/bin/error.rs) | Functions and enums to parse various errors and displaying | +| [bin/error.rs](../src/bin/error.rs) | Functions and enums to parse various errors and displaying | | [bin/muxer/*.rs](../src/bin/muxer/) | Contains IVF Muxer functions for header definition, writing frames and flushing | | [bin/decoder/*.rs](../src/bin/decoder/) | Decoder related structures and functions | | [capi.rs](../src/capi.rs) | C Compatible API for using rav1e as a library | From ef16a0a11862865b4f03213fc9f831fe8fd64d2e Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Fri, 7 May 2021 23:05:26 -0400 Subject: [PATCH 027/188] doc: RDO: fix typos --- doc/RDO.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/RDO.md b/doc/RDO.md index 103a97af7b..ededbc0be1 100644 --- a/doc/RDO.md +++ b/doc/RDO.md @@ -11,7 +11,7 @@ Distortion is a metric selected based on user options, such as MSE for PSNR opti ![](tile_group.svg) -The bitstream is composed of a hiearchy of units, and rav1e's RDO loops are structured in the same way. After computing some lookahead information, rav1e's first task is to split the frame into tiles (sometimes just one). These are processed completely independently, and usually in parallel. +The bitstream is composed of a hierarchy of units, and rav1e's RDO loops are structured in the same way. After computing some lookahead information, rav1e's first task is to split the frame into tiles (sometimes just one). These are processed completely independently, and usually in parallel. The next split is into superblocks, which are currently always 64x64 pixels in the luma plane. rav1e processes superblocks serially, one at a time. Although choices made in a superblock affect later superblocks, rav1e will never revisit a superblock once it has determined the best way to code it - it writes it and continues on. @@ -41,7 +41,7 @@ Inter mode rav1e's intra search starts with populating a list of inter modes to search. Modes such as NEWMV are always added, however modes such as NEARMV are only added if the current motion vector list is sufficiently long enough for them to be coded. Compound modes are added in a similar manner, if enabled. -Next, a rough distortion approximation based on SATD of the residual is computed. This is also computed at the partition level, however unlike intra mode, this is not an approximation. The resulting list is sorted and pruned to the best 9 entires. +Next, a rough distortion approximation based on SATD of the residual is computed. 
This is also computed at the partition level, however unlike intra mode, this is not an approximation. The resulting list is sorted and pruned to the best 9 entries. Next, each of the modes is fully encoded (with bitstream write disabled). The real distortion and bitrate are measured, and the best mode is chosen. From 056f13851cc374a61b7355553236e2bccf20338e Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 11 May 2021 19:25:56 +0200 Subject: [PATCH 028/188] Fix current clippy lints --- src/api/channel/mod.rs | 11 +++++------ src/bin/rav1e-ch.rs | 29 +++++++++++++---------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/api/channel/mod.rs b/src/api/channel/mod.rs index 83d4f7ee0e..542aa9d247 100644 --- a/src/api/channel/mod.rs +++ b/src/api/channel/mod.rs @@ -172,15 +172,13 @@ impl RcFirstPass for Sender { impl RcFirstPass for Option> { fn send_pass_data(&mut self, rc_state: &mut RCState) { - match self.as_mut() { - Some(s) => s.send_pass_data(rc_state), - None => {} + if let Some(s) = self.as_mut() { + s.send_pass_data(rc_state) } } fn send_pass_summary(&mut self, rc_state: &mut RCState) { - match self.as_mut() { - Some(s) => s.send_pass_summary(rc_state), - None => {} + if let Some(s) = self.as_mut() { + s.send_pass_summary(rc_state) } } } @@ -218,6 +216,7 @@ impl RcSecondPass for Option> { } impl Config { + #[allow(clippy::type_complexity)] fn new_channel_internal( &self, ) -> Result< diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index 0b8237229f..8707162610 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -410,22 +410,19 @@ fn run() -> Result<(), error::CliError> { Ok(d) => d, }; let video_info = y4m_dec.get_video_details(); - let y4m_enc = match cli.io.rec { - Some(rec) => Some( - y4m::encode( - video_info.width, - video_info.height, - y4m::Ratio::new( - video_info.time_base.den as usize, - video_info.time_base.num as usize, - ), - ) - .with_colorspace(y4m_dec.get_colorspace()) - .write_header(rec) - .unwrap(), - ), - None => None, - }; + let y4m_enc = cli.io.rec.map(|rec| { + y4m::encode( + video_info.width, + video_info.height, + y4m::Ratio::new( + video_info.time_base.den as usize, + video_info.time_base.num as usize, + ), + ) + .with_colorspace(y4m_dec.get_colorspace()) + .write_header(rec) + .unwrap() + }); match video_info.bit_depth { 8 | 10 | 12 => {} From 852e48523212468a1f5989848e4ba1021b2efd27 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 12 May 2021 20:42:09 +0200 Subject: [PATCH 029/188] Remove redundant early validation checks in the C-API The configuration is validated already on encoder instantiation. 
Fixes #2730 --- src/capi.rs | 46 ++++------------------------------------------ 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/src/capi.rs b/src/capi.rs index d0aeda1f82..6e988af529 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -602,40 +602,6 @@ pub unsafe extern fn rav1e_config_unref(cfg: *mut Config) { } } -fn tile_log2(blk_size: usize, target: usize) -> usize { - let mut k = 0; - while (blk_size << k) < target { - k += 1; - } - k -} - -fn check_tile_log2(n: Result) -> Result { - match n { - Ok(n) => { - if ((1 << tile_log2(1, n)) - n) == 0 || n == 0 { - Ok(n) - } else { - Err(()) - } - } - Err(e) => Err(e), - } -} - -fn check_frame_size(n: Result) -> Result { - match n { - Ok(n) => { - if n >= 16 && n < u16::max_value().into() { - Ok(n) - } else { - Err(()) - } - } - Err(e) => Err(e), - } -} - unsafe fn option_match( cfg: *mut Config, key: *const c_char, value: *const c_char, ) -> Result<(), ()> { @@ -644,8 +610,8 @@ unsafe fn option_match( let enc = &mut (*cfg).cfg.enc; match key { - "width" => enc.width = check_frame_size(value.parse().map_err(|_| ()))?, - "height" => enc.height = check_frame_size(value.parse().map_err(|_| ()))?, + "width" => enc.width = value.parse().map_err(|_| ())?, + "height" => enc.height = value.parse().map_err(|_| ())?, "speed" => { enc.speed_settings = rav1e::SpeedSettings::from_preset(value.parse().map_err(|_| ())?) @@ -654,12 +620,8 @@ unsafe fn option_match( "threads" => (*cfg).cfg.threads = value.parse().map_err(|_| ())?, "tiles" => enc.tiles = value.parse().map_err(|_| ())?, - "tile_rows" => { - enc.tile_rows = check_tile_log2(value.parse().map_err(|_| ()))? - } - "tile_cols" => { - enc.tile_cols = check_tile_log2(value.parse().map_err(|_| ()))? - } + "tile_rows" => enc.tile_rows = value.parse().map_err(|_| ())?, + "tile_cols" => enc.tile_cols = value.parse().map_err(|_| ())?, "tune" => enc.tune = value.parse().map_err(|_| ())?, "quantizer" => enc.quantizer = value.parse().map_err(|_| ())?, From fd995318b1ed457686406c38afad63601732a588 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 22 Apr 2021 15:29:56 +0300 Subject: [PATCH 030/188] x86: Fix writes past the intended area in AVX2 fguv --- src/x86/film_grain.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/film_grain.asm b/src/x86/film_grain.asm index 72a1e3c009..bfd7a22085 100644 --- a/src/x86/film_grain.asm +++ b/src/x86/film_grain.asm @@ -2072,7 +2072,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %endif sub hb, 1+%2 - jl %%end_y_v_overlap + jle %%end_y_v_overlap %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] From 0eec4d44b85e93d0125a4fec77428c50b8307658 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:00:25 +0200 Subject: [PATCH 031/188] x86: Make asm file names more consistent --- build.rs | 9 +++++---- src/x86/{film_grain.asm => film_grain_avx2.asm} | 0 src/x86/{film_grain_ssse3.asm => film_grain_sse.asm} | 0 src/x86/{ipred.asm => ipred_avx2.asm} | 0 src/x86/{ipred_ssse3.asm => ipred_sse.asm} | 0 src/x86/{itx.asm => itx_avx2.asm} | 0 src/x86/{itx_ssse3.asm => itx_sse.asm} | 0 src/x86/{loopfilter.asm => loopfilter_avx2.asm} | 0 src/x86/{loopfilter_ssse3.asm => loopfilter_sse.asm} | 0 .../{looprestoration.asm => looprestoration_avx2.asm} | 0 10 files changed, 5 insertions(+), 4 deletions(-) rename src/x86/{film_grain.asm => film_grain_avx2.asm} (100%) rename src/x86/{film_grain_ssse3.asm => film_grain_sse.asm} (100%) rename src/x86/{ipred.asm => ipred_avx2.asm} 
(100%) rename src/x86/{ipred_ssse3.asm => ipred_sse.asm} (100%) rename src/x86/{itx.asm => itx_avx2.asm} (100%) rename src/x86/{itx_ssse3.asm => itx_sse.asm} (100%) rename src/x86/{loopfilter.asm => loopfilter_avx2.asm} (100%) rename src/x86/{loopfilter_ssse3.asm => loopfilter_sse.asm} (100%) rename src/x86/{looprestoration.asm => looprestoration_avx2.asm} (100%) diff --git a/build.rs b/build.rs index 490da9165f..bf24c0775b 100644 --- a/build.rs +++ b/build.rs @@ -85,10 +85,11 @@ fn build_nasm_files() { } let asm_files = &[ - "src/x86/ipred.asm", - "src/x86/ipred_ssse3.asm", - "src/x86/itx.asm", - "src/x86/itx_ssse3.asm", + "src/x86/ipred_avx2.asm", + "src/x86/ipred_sse.asm", + "src/x86/itx_avx2.asm", + "src/x86/itx_sse.asm", + "src/x86/looprestoration_avx2.asm", "src/x86/looprestoration16_avx2.asm", "src/x86/mc_avx2.asm", "src/x86/mc16_avx2.asm", diff --git a/src/x86/film_grain.asm b/src/x86/film_grain_avx2.asm similarity index 100% rename from src/x86/film_grain.asm rename to src/x86/film_grain_avx2.asm diff --git a/src/x86/film_grain_ssse3.asm b/src/x86/film_grain_sse.asm similarity index 100% rename from src/x86/film_grain_ssse3.asm rename to src/x86/film_grain_sse.asm diff --git a/src/x86/ipred.asm b/src/x86/ipred_avx2.asm similarity index 100% rename from src/x86/ipred.asm rename to src/x86/ipred_avx2.asm diff --git a/src/x86/ipred_ssse3.asm b/src/x86/ipred_sse.asm similarity index 100% rename from src/x86/ipred_ssse3.asm rename to src/x86/ipred_sse.asm diff --git a/src/x86/itx.asm b/src/x86/itx_avx2.asm similarity index 100% rename from src/x86/itx.asm rename to src/x86/itx_avx2.asm diff --git a/src/x86/itx_ssse3.asm b/src/x86/itx_sse.asm similarity index 100% rename from src/x86/itx_ssse3.asm rename to src/x86/itx_sse.asm diff --git a/src/x86/loopfilter.asm b/src/x86/loopfilter_avx2.asm similarity index 100% rename from src/x86/loopfilter.asm rename to src/x86/loopfilter_avx2.asm diff --git a/src/x86/loopfilter_ssse3.asm b/src/x86/loopfilter_sse.asm similarity index 100% rename from src/x86/loopfilter_ssse3.asm rename to src/x86/loopfilter_sse.asm diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration_avx2.asm similarity index 100% rename from src/x86/looprestoration.asm rename to src/x86/looprestoration_avx2.asm From 3a10c86fbff034b575fcfaeac26e66686a311e4c Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Tue, 11 May 2021 12:35:39 -0400 Subject: [PATCH 032/188] Revert mc16_avx2 to a pristine state --- build.rs | 1 - src/asm/x86/mc.rs | 12 +- src/x86/mc16_avx2.asm | 2113 ----------------------------------------- 3 files changed, 5 insertions(+), 2121 deletions(-) delete mode 100644 src/x86/mc16_avx2.asm diff --git a/build.rs b/build.rs index bf24c0775b..eb986f0bfd 100644 --- a/build.rs +++ b/build.rs @@ -92,7 +92,6 @@ fn build_nasm_files() { "src/x86/looprestoration_avx2.asm", "src/x86/looprestoration16_avx2.asm", "src/x86/mc_avx2.asm", - "src/x86/mc16_avx2.asm", "src/x86/mc_avx512.asm", "src/x86/mc_sse.asm", "src/x86/me.asm", diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index 3731c8cee3..9029a5be2b 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -7,6 +7,8 @@ // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+#![allow(dead_code)] + use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::mc::FilterMode::*; @@ -355,7 +357,7 @@ decl_mc_hbd_fns!( cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], - [AVX2] + [] ); macro_rules! decl_mct_fns { @@ -462,7 +464,7 @@ decl_mct_hbd_fns!( cpu_function_lookup_table!( PREP_HBD_FNS: [[Option; 16]], default: [None; 16], - [AVX2] + [] ); extern { @@ -488,11 +490,7 @@ cpu_function_lookup_table!( [(SSSE3, Some(rav1e_avg_ssse3)), (AVX2, Some(rav1e_avg_avx2))] ); -cpu_function_lookup_table!( - AVG_HBD_FNS: [Option], - default: None, - [(AVX2, Some(rav1e_avg_16bpc_avx2))] -); +cpu_function_lookup_table!(AVG_HBD_FNS: [Option], default: None, []); #[cfg(test)] mod test { diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm deleted file mode 100644 index ee56da2ec7..0000000000 --- a/src/x86/mc16_avx2.asm +++ /dev/null @@ -1,2113 +0,0 @@ -; Copyright (c) 2017-2020, The rav1e contributors -; Copyright (c) 2020, Nathan Egge -; All rights reserved. -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. - -%include "config.asm" -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 32 - -spf_h_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 - db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 -pq_2: dq (6 - 4) -pq_3: dq (6 - 4) + 1 -pq_6: dq (6 - 4) + 4 -pq_4: dq (6 - 2) -pq_5: dq (6 - 2) + 1 -pq_8: dq (6 + 2) -pq_10: dq (6 + 4) -pd_32: dd (1 << 6 >> 1) -pd_34: dd (1 << 6 >> 1) + (1 << (6 - 4) >> 1) -pd_40: dd (1 << 6 >> 1) + (1 << (6 - 2) >> 1) -pd_2: dd (1 << (6 - 4) >> 1) -pd_512: dd (1 << (6 + 4) >> 1) -pd_8: dd (1 << (6 - 2) >> 1) -pd_128: dd (1 << (6 + 2) >> 1) -nd_524256: dd (1 << 6 >> 1) - (8192 << 6) -nd_32766: dd (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) -nd_131064: dd (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) -pd_16388: dd (1 << (6 - 4)) + 8192*2 -pd_16400: dd (1 << (6 - 2)) + 8192*2 -pd_131104: dd ((1 << (6 - 4)) + 8192*2) << 3 -pd_131200: dd ((1 << (6 - 2)) + 8192*2) << 3 -pd_524416: dd ((1 << (6 - 4)) + 8192*2) << 5 -pd_524800: dd ((1 << (6 - 2)) + 8192*2) << 5 -pw_8192: dw 8192 -pw_1: dw 1 -pw_16: dw 16 -pw_64: dw 64 - -SECTION .text - -%macro PUT_4TAP_H 6 - pshufb %1, %3 - pshufb %2, %3 - pmaddwd %1, %4 - pmaddwd %2, %4 - phaddd %1, %2 - paddd %1, %5 - psrad %1, %6 -%endm - -%macro PUT_8TAP_H 8 - movu xm%1, [srcq + %8 + 0] - movu xm%3, [srcq + %8 + 2] - vinserti128 m%1, [srcq + ssq + %8 + 0], 1 - vinserti128 m%3, [srcq + ssq + %8 + 2], 1 - movu xm%2, [srcq + %8 + 4] - movu xm%4, [srcq + %8 + 6] - vinserti128 m%2, [srcq + ssq + %8 + 4], 1 - vinserti128 m%4, [srcq + ssq + %8 + 6], 1 - pmaddwd m%1, %5 - pmaddwd m%3, %5 - pmaddwd m%2, %5 - pmaddwd m%4, %5 - phaddd m%1, m%3 - phaddd m%2, m%4 - phaddd m%1, m%2 - paddd m%1, %6 - psrad m%1, %7 -%endm - -%macro PUT_4TAP_HS1 5 - pshufb %1, %2 - pmaddwd %1, %3 - phaddd %1, %1 - paddd %1, %4 - psrad %1, %5 - packssdw %1, %1 -%endm - -%macro PUT_4TAP_HS2 6 - pshufb %1, %3 - pshufb %2, %3 - pmaddwd %1, %4 - pmaddwd %2, %4 - phaddd %1, %1 - phaddd %2, %2 - paddd %1, %5 - paddd %2, %5 - psrad %1, %6 - psrad %2, %6 - packssdw %1, %1 - packssdw %2, %2 -%endm - 
-%macro PUT_8TAP_HS 7-8 - movu xm%1, [srcq + %7 + 0] - movu xm%3, [srcq + %7 + 2] - vinserti128 m%1, [srcq + %7 + 8], 1 - vinserti128 m%3, [srcq + %7 + 10], 1 - pmaddwd m%1, %4 - pmaddwd m%3, %4 - phaddd m%1, m%3 - movu xm%2, [srcq + %7 + 4] - movu xm%3, [srcq + %7 + 6] - vinserti128 m%2, [srcq + %7 + 12], 1 - vinserti128 m%3, [srcq + %7 + 14], 1 - pmaddwd m%2, %4 - pmaddwd m%3, %4 - phaddd m%2, m%3 -%if %0 > 7 - vpbroadcastd %5, %8 -%endif - phaddd m%1, m%2 - paddd m%1, %5 - psrad m%1, %6 - packssdw m%1, m%1 -%endm - -%macro LOAD_REGS_2 3 - mov%1 xm%2, [srcq + ssq*0] - mov%1 xm%3, [srcq + ssq*1] -%ifidn %1, u - vpermq m%2, m%2, q3120 - vpermq m%3, m%3, q3120 -%endif - lea srcq, [srcq + ssq*2] -%endm - -%macro LOAD_REGS_3 4 - mov%1 xm%2, [srcq + ssq*0] - mov%1 xm%3, [srcq + ssq*1] - mov%1 xm%4, [srcq + ssq*2] -%ifidn %1, u - vpermq m%2, m%2, q3120 - vpermq m%3, m%3, q3120 - vpermq m%4, m%4, q3120 -%endif - add srcq, ss3q -%endm - -%macro LOAD_REGS 3-8 -%if %0 == 3 - LOAD_REGS_2 %1, %2, %3 -%elif %0 == 4 - LOAD_REGS_3 %1, %2, %3, %4 -%elif %0 == 5 - LOAD_REGS_2 %1, %2, %3 - LOAD_REGS_2 %1, %4, %5 -%elif %0 == 6 - LOAD_REGS_3 %1, %2, %3, %4 - LOAD_RESG_2 %1, %5, %6 -%elif %0 == 7 - LOAD_REGS_3 %1, %2, %3, %4 - LOAD_REGS_3 %1, %5, %6, %7 -%else - LOAD_REGS_3 %1, %2, %3, %4 - LOAD_REGS_2 %1, %5, %6 - LOAD_REGS_2 %1, %7, %8 -%endif -%endm - -%macro STORE_REGS 3 -%ifidn %1, u - vpermq m%2, m%2, q3120 - vpermq m%3, m%3, q3120 -%endif - mov%1 [dstq + dsq*0], xm%2 - mov%1 [dstq + dsq*1], xm%3 - lea dstq, [dstq + dsq*2] -%endm - -%macro INTERLEAVE_REGS 4-8 - punpckl%1 %2, %3 - punpckl%1 %3, %4 -%if %0 > 4 - punpckl%1 %4, %5 - punpckl%1 %5, %6 -%endif -%if %0 > 6 - punpckl%1 %6, %7 - punpckl%1 %7, %8 -%endif -%endm - -%macro MUL_ADD_R 8 - pmaddwd %3, %7 - pmaddwd %1, %5, %8 - paddd %1, %3 - mova %3, %5 - pmaddwd %4, %7 - pmaddwd %2, %6, %8 - paddd %2, %4 - mova %4, %6 -%endm - -%macro MUL_ACC_R 7 - pmaddwd %3, %5, %7 - pmaddwd %4, %6, %7 - paddd %1, %3 - paddd %2, %4 - mova %3, %5 - mova %4, %6 -%endm - -%macro RND_SHR_MIN_R 5 - paddd %1, %3 - paddd %2, %3 - psrad %1, %4 - psrad %2, %4 - packusdw %1, %1 - packusdw %2, %2 - pminuw %1, %5 - pminuw %2, %5 -%endm - -%macro RND_SHR_R 4 - paddd %1, %3 - paddd %2, %3 - psrad %1, %4 - psrad %2, %4 - packssdw %1, %1 - packssdw %2, %2 -%endm - -; int8_t subpel_filters[5][15][8] -%assign FILTER_REGULAR (0*15 << 7) | 3*15 -%assign FILTER_SMOOTH (1*15 << 7) | 4*15 -%assign FILTER_SHARP (2*15 << 7) | 3*15 - -%macro make_8tap_fn 4 ; type, op, type_h, type_v -INIT_XMM avx2 -cglobal %1_8tap_%2_16bpc - mov t0d, FILTER_%3 - mov t1d, FILTER_%4 - jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) -%endmacro - -cextern mc_subpel_filters - -%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) - -%macro filter_fn 1 - -%if WIN64 -%ifidn %1, put -DECLARE_REG_TMP 5, 4 -%else -DECLARE_REG_TMP 4, 5 -%endif -%else -DECLARE_REG_TMP 7, 8 -%endif - -make_8tap_fn %1, regular, REGULAR, REGULAR -make_8tap_fn %1, regular_smooth, REGULAR, SMOOTH -make_8tap_fn %1, regular_sharp, REGULAR, SHARP -make_8tap_fn %1, smooth, SMOOTH, SMOOTH -make_8tap_fn %1, smooth_regular, SMOOTH, REGULAR -make_8tap_fn %1, smooth_sharp, SMOOTH, SHARP -make_8tap_fn %1, sharp, SHARP, SHARP -make_8tap_fn %1, sharp_regular, SHARP, REGULAR -make_8tap_fn %1, sharp_smooth, SHARP, SMOOTH - -INIT_YMM avx2 -%ifidn %1, put -cglobal put_8tap_16bpc, 4, 10, 16, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 -%else -cglobal prep_8tap_16bpc, 3, 10, 16, dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 -%endif - 
-%ifidn %1, put - imul mxd, mxm, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) - add mxd, t0d - imul myd, mym, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) - add myd, t1d -%else - imul myd, mym, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) - add myd, t1d - imul mxd, mxm, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) - add mxd, t0d -%endif - - movsxd _wq, _wm - movifnidn hd, hm - -%ifidn %1, put - vpbroadcastw m7, bdmaxm -%else - lea dsq, [_wq*2] -%endif - - test mxd, (0x7f << 14) - jnz .%1_8tap_h_16bpc - test myd, (0x7f << 14) - jnz .%1_8tap_v_16bpc - -; ---- {put,prep}_16bpc ---- - -INIT_XMM avx2 -.%1_16bpc: ; cglobal put_16bpc, 6, 8, 8, dst, ds, src, ss, w, h - -%ifidn %1, prep -INIT_YMM avx2 - popcnt bdmaxd, bdmaxm - vpbroadcastq m8, [pq_4] - vpbroadcastw m9, [pw_8192] - cmp bdmaxd, 12 - jne .prep_bits10 - vpbroadcastq m8, [pq_2] -.prep_bits10: -INIT_XMM avx2 -%endif - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3 -%else - DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3 -%endif - - lea jrq, [.jmp_tbl] - tzcnt _wd, _wm -%ifidn %1, put - sub _wd, 1 -%else - sub _wd, 2 -%endif - movsxd _wq, [jrq + _wq*4] - add _wq, jrq - jmp _wq - -%ifidn %1, put -.w2: ; 2xN - movd m0, [srcq] - movd m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] - movd [dstq], m0 - movd [dstq + dsq], m1 - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .w2 - RET -%endif - -.w4: ; 4xN - movq m0, [srcq] - movq m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] -%ifidn %1, prep - psllw m0, m8 - psllw m1, m8 - psubw m0, m9 - psubw m1, m9 -%endif - movq [dstq], m0 - movq [dstq + dsq], m1 - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .w4 - RET - - ; XXX is unaligned input (but aligned output) a hard requirement, or is checkasm broken? -.w8: ; 8xN - movu m0, [srcq] - movu m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] -%ifidn %1, prep - psllw m0, m8 - psllw m1, m8 - psubw m0, m9 - psubw m1, m9 -%endif - mova [dstq], m0 - mova [dstq + dsq], m1 - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .w8 - RET - -INIT_YMM avx2 -.w16: ; 16xN - movu m0, [srcq] - movu m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] -%ifidn %1, prep - psllw m0, xm8 - psllw m1, xm8 - psubw m0, m9 - psubw m1, m9 -%endif - mova [dstq], m0 - mova [dstq + dsq], m1 - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .w16 - RET - -.w32: ; 32xN - movu m0, [srcq + 32*0] - movu m1, [srcq + 32*1] - movu m2, [srcq + ssq] - movu m3, [srcq + ssq + 32*1] - lea srcq, [srcq + ssq*2] -%ifidn %1, prep - psllw m0, xm8 - psllw m1, xm8 - psllw m2, xm8 - psllw m3, xm8 - psubw m0, m9 - psubw m1, m9 - psubw m2, m9 - psubw m3, m9 -%endif - mova [dstq + 32*0], m0 - mova [dstq + 32*1], m1 - mova [dstq + dsq + 32*0], m2 - mova [dstq + dsq + 32*1], m3 - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .w32 - RET - -.w64: ; 64xN - movu m0, [srcq + 32*0] - movu m1, [srcq + 32*1] - movu m2, [srcq + 32*2] - movu m3, [srcq + 32*3] - movu m4, [srcq + ssq + 32*0] - movu m5, [srcq + ssq + 32*1] - movu m6, [srcq + ssq + 32*2] - movu m7, [srcq + ssq + 32*3] - lea srcq, [srcq + ssq*2] -%ifidn %1, prep - psllw m0, xm8 - psllw m1, xm8 - psllw m2, xm8 - psllw m3, xm8 - psllw m4, xm8 - psllw m5, xm8 - psllw m6, xm8 - psllw m7, xm8 - psubw m0, m9 - psubw m1, m9 - psubw m2, m9 - psubw m3, m9 - psubw m4, m9 - psubw m5, m9 - psubw m6, m9 - psubw m7, m9 -%endif - mova [dstq + 32*0], m0 - mova [dstq + 32*1], m1 - mova [dstq + 32*2], m2 - mova [dstq + 32*3], m3 - mova [dstq + dsq + 32*0], m4 - mova [dstq + dsq + 32*1], m5 - mova [dstq + dsq + 32*2], m6 - mova [dstq + dsq + 32*3], m7 - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .w64 - RET - 
-.w128: ; 128xN - movu m0, [srcq + 32*0] - movu m1, [srcq + 32*1] - movu m2, [srcq + 32*2] - movu m3, [srcq + 32*3] - movu m4, [srcq + 32*4] - movu m5, [srcq + 32*5] - movu m6, [srcq + 32*6] - movu m7, [srcq + 32*7] - add srcq, ssq -%ifidn %1, prep - psllw m0, xm8 - psllw m1, xm8 - psllw m2, xm8 - psllw m3, xm8 - psllw m4, xm8 - psllw m5, xm8 - psllw m6, xm8 - psllw m7, xm8 - psubw m0, m9 - psubw m1, m9 - psubw m2, m9 - psubw m3, m9 - psubw m4, m9 - psubw m5, m9 - psubw m6, m9 - psubw m7, m9 -%endif - mova [dstq + 32*0], m0 - mova [dstq + 32*1], m1 - mova [dstq + 32*2], m2 - mova [dstq + 32*3], m3 - mova [dstq + 32*4], m4 - mova [dstq + 32*5], m5 - mova [dstq + 32*6], m6 - mova [dstq + 32*7], m7 - add dstq, dsq - dec hd - jg .w128 - RET - -.jmp_tbl: -%ifidn %1, put - dd .w2 - .jmp_tbl -%endif - dd .w4 - .jmp_tbl - dd .w8 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w32 - .jmp_tbl - dd .w64 - .jmp_tbl - dd .w128 - .jmp_tbl - -; ---- {put,prep}_8tap_h_16bpc ---- - -INIT_XMM avx2 -.%1_8tap_h_16bpc: ; cglobal put_8tap_h_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, bdmax -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 -%else - DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 -%endif - - cmp _wd, 4 - jle .h_use4tap - shr mxd, 7 -.h_use4tap: - and mxd, 0x7f - - test myd, (0x7f << 14) - jnz .%1_8tap_hv_16bpc - -INIT_YMM avx2 - popcnt bdmaxd, bdmaxm -%ifidn %1, put - vpbroadcastd m6, [pd_34] ; (1 << 6 >> 1) + (1 << (6 - 4) >> 1) -%else - vpbroadcastd m6, [nd_32766] ; (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) - vpbroadcastq m7, [pq_2] ; (6 - 4) -%endif - cmp bdmaxd, 12 - jne .h_bits10 -%ifidn %1, put - vpbroadcastd m6, [pd_40] ; (1 << 6 >> 1) + (1 << (6 - 2) >> 1) -%else - vpbroadcastd m6, [nd_131064] ; (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) - vpbroadcastq m7, [pq_4] ; (6 - 2) -%endif -.h_bits10: -INIT_XMM avx2 - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, w2, jr, ss3 -%else - DEFINE_ARGS dst, src, ss, _w, h, mx, w2, jr, ds, ss3 -%endif - - lea w2q, [_wq*2] - - lea jrq, [.h_jmp_tbl] - tzcnt _wd, _wm -%ifidn %1, put - sub _wd, 1 -%else - sub _wd, 2 -%endif - movsxd _wq, [jrq + _wq*4] - add _wq, jrq - jmp _wq - -%ifidn %1, put -.h_w2: - sub srcq, 2 - mova xm4, [spf_h_shuf] - vpbroadcastd m5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8 + 2] - vpmovsxbw m5, m5 - -.h_w2l: - movu m0, [srcq] - movu m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] - -%ifidn %1, put - PUT_4TAP_H m0, m1, m4, m5, m6, 6 - packusdw m0, m0 - pminuw m0, m7 -%else - PUT_4TAP_H m0, m1, m4, m5, m6, m7 - packssdw m0, m1 -%endif - - movd [dstq], m0 - pextrd [dstq + dsq], m0, 1 - lea dstq, [dstq + dsq*2] - - sub hd, 2 - jg .h_w2l - RET -%endif - -INIT_YMM avx2 -.h_w4: - sub srcq, 2 - mova m4, [spf_h_shuf] - vpbroadcastd xm5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8 + 2] - vpmovsxbw m5, xm5 - -.h_w4l: - vbroadcasti128 m0, [srcq] - vbroadcasti128 m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] - -%ifidn %1, put - PUT_4TAP_H m0, m1, m4, m5, m6, 6 - packusdw m0, m0 - pminuw m0, m7 -%else - PUT_4TAP_H m0, m1, m4, m5, m6, xm7 - packssdw m0, m0 -%endif - - vextracti128 xm1, m0, 1 - movd [dstq], xm0 - movd [dstq + 4], xm1 - pextrd [dstq + dsq], xm0, 1 - pextrd [dstq + dsq + 4], xm1, 1 - lea dstq, [dstq + dsq*2] - - sub hd, 2 - jg .h_w4l - RET - -.h_w8: - sub srcq, 6 - vpbroadcastq xm5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8] - vpmovsxbw m5, xm5 - -.h_w8l: - mov _wd, w2d - -.h_w8c: -%ifidn %1, put - PUT_8TAP_H 0, 1, 2, 3, m5, m6, 6, 4*0 - PUT_8TAP_H 1, 2, 3, 4, m5, m6, 6, 4*2 - packusdw m0, m1 - pminuw m0, 
m7 -%else - PUT_8TAP_H 0, 1, 2, 3, m5, m6, xm7, 4*0 - PUT_8TAP_H 1, 2, 3, 4, m5, m6, xm7, 4*2 - packssdw m0, m1 -%endif - add srcq, 8*2 - - mova [dstq], xm0 - vextracti128 [dstq + dsq], m0, 1 - - add dstq, 8*2 - sub _wd, 8*2 - jg .h_w8c - - sub srcq, w2q - sub dstq, w2q - lea srcq, [srcq + ssq*2] - lea dstq, [dstq + dsq*2] - sub hd, 2 - jg .h_w8l - RET - -.h_jmp_tbl: -%ifidn %1, put - dd .h_w2 - .h_jmp_tbl -%endif - dd .h_w4 - .h_jmp_tbl - dd .h_w8 - .h_jmp_tbl - dd .h_w8 - .h_jmp_tbl - dd .h_w8 - .h_jmp_tbl - dd .h_w8 - .h_jmp_tbl - dd .h_w8 - .h_jmp_tbl - -; ---- {put,prep}_8tap_v_16bpc ---- - -INIT_XMM avx2 -.%1_8tap_v_16bpc: ; cglobal put_8tap_v_16bpc, 4, 9, 0, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 -%else - DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 -%endif - - cmp hd, 4 - jle .v_use4tap - shr myd, 7 -.v_use4tap: - and myd, 0x7f - -INIT_YMM avx2 -%ifidn %1, put - vpbroadcastd m6, [pd_32] ; (1 << 6 >> 1) -%else - popcnt bdmaxd, bdmaxm - vpbroadcastd m6, [nd_32766] ; (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) - vpbroadcastq m7, [pq_2] ; (6 - 4) - cmp bdmaxd, 12 - jne .v_bits10 - vpbroadcastd m6, [nd_131064] ; (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) - vpbroadcastq m7, [pq_4] ; (6 - 2) -.v_bits10: -%endif -INIT_XMM avx2 - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, w2, my, jr, ss3 -%else - DEFINE_ARGS dst, src, ss, _w, h, w2, my, jr, ds, ss3 -%endif - - lea jrq, [.v_jmp_tbl] - lea w2q, [_wq*2] - lea ss3q, [ssq*3] - -INIT_YMM avx2 - lea myq, [jrq - .v_jmp_tbl + subpel_filters + myq*8] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] - vpmovsxbw m8, xm8 - vpmovsxbw m9, xm9 - vpmovsxbw m10, xm10 - vpmovsxbw m11, xm11 -INIT_XMM avx2 - - tzcnt _wd, _wm -%ifidn %1, put - sub _wd, 1 -%else - sub _wd, 2 -%endif - movsxd _wq, [jrq + _wq*4] - add _wq, jrq - jmp _wq - -%ifidn %1, put -.v_w2: - - cmp hd, 4 - jg .v_w28 - - sub srcq, ssq - LOAD_REGS d, 0, 1, 2 - INTERLEAVE_REGS wd, m0, m1, m2 - -.v_w2l: ; 2x2, 2x4 - - LOAD_REGS d, 3, 4 - INTERLEAVE_REGS wd, m2, m3, m4 - - MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10 - mova m2, m4 - - RND_SHR_MIN_R m5, m8, m6, 6, m7 - STORE_REGS d, 5, 8 - - sub hd, 2 - jg .v_w2l - RET - -.v_w28: - - sub srcq, ss3q - LOAD_REGS d, 0, 1, 2, 3, 4, 12, 13 - INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13 - -.v_w28l: ; 2x6, 2x8, 2x12, 2x16, 2x24, 2x32 - - sub srcq, ssq - LOAD_REGS d, 13, 14, 15 - INTERLEAVE_REGS wd, m13, m14, m15 - - MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9 - MUL_ACC_R m5, m15, m2, m3, m4, m12, m10 - MUL_ACC_R m5, m15, m4, m12, m13, m14, m11 - - RND_SHR_MIN_R m5, m15, m6, 6, m7 - STORE_REGS d, 5, 15 - - sub hd, 2 - jg .v_w28l - RET -%endif - -.v_w4: - - cmp hd, 4 - jg .v_w48 - - sub srcq, ssq - LOAD_REGS q, 0, 1, 2 - INTERLEAVE_REGS wd, m0, m1, m2 - -.v_w4l: ; 4x2 4x4 - - LOAD_REGS q, 3, 4 - INTERLEAVE_REGS wd, m2, m3, m4 - - MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10 - mova m2, m4 - -%ifidn %1, put - RND_SHR_MIN_R m5, m8, m6, 6, m7 -%else - RND_SHR_R m5, m8, m6, m7 -%endif - STORE_REGS q, 5, 8 - - sub hd, 2 - jg .v_w4l - RET - -.v_w48: - - sub srcq, ss3q - LOAD_REGS q, 0, 1, 2, 3, 4, 12, 13 - INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13 - -.v_w48l: ; 4x6, 4x8, 4x12, 4x16, 4x24, 4x32 - - sub srcq, ssq - LOAD_REGS q, 13, 14, 15 - INTERLEAVE_REGS wd, m13, m14, m15 - - MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9 - MUL_ACC_R m5, m15, m2, m3, m4, m12, m10 - MUL_ACC_R m5, m15, m4, m12, m13, m14, m11 - 
-%ifidn %1, put - RND_SHR_MIN_R m5, m15, m6, 6, m7 -%else - RND_SHR_R m5, m15, m6, m7 -%endif - STORE_REGS q, 5, 15 - - sub hd, 2 - jg .v_w48l - - RET - -INIT_YMM avx2 -.v_w8: - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, oh, h, w2, tdst, tsrc, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, oh, h, w2, tdst, tsrc, ds, ss3 -%endif - - mov ohd, hd - mov tdstq, dstq - - cmp hd, 4 - jg .v_w88 - - sub srcq, ssq - mov tsrcq, srcq - -.v_w8l: ; N = 8, 16, 32, 64, 128 - - LOAD_REGS u, 0, 1, 2 - INTERLEAVE_REGS wd, m0, m1, m2 - -.v_w8c: ; Nx2, Nx4 - - LOAD_REGS u, 3, 4 - INTERLEAVE_REGS wd, m2, m3, m4 - - MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10 - mova m2, m4 - -%ifidn %1, put - RND_SHR_MIN_R m5, m8, m6, 6, m7 -%else - RND_SHR_R m5, m8, m6, xm7 -%endif - STORE_REGS u, 5, 8 - - sub hd, 2 - jg .v_w8c - - add tdstq, 2*8 - add tsrcq, 2*8 - mov hd, ohd - mov dstq, tdstq - mov srcq, tsrcq - sub w2d, 2*8 - jg .v_w8l - - RET - -.v_w88: - - sub srcq, ss3q - mov tsrcq, srcq - -.v_w88l: ; N = 8, 16, 32, 64, 128 - - LOAD_REGS u, 0, 1, 2, 3, 4, 12, 13 - INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13 - -.v_w88c: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32 - - sub srcq, ssq - - LOAD_REGS u, 13, 14, 15 - INTERLEAVE_REGS wd, m13, m14, m15 - - MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9 - MUL_ACC_R m5, m15, m2, m3, m4, m12, m10 - MUL_ACC_R m5, m15, m4, m12, m13, m14, m11 - -%ifidn %1, put - RND_SHR_MIN_R m5, m15, m6, 6, m7 -%else - RND_SHR_R m5, m15, m6, xm7 -%endif - STORE_REGS u, 5, 15 - - sub hd, 2 - jg .v_w88c - - add tdstq, 2*8 - add tsrcq, 2*8 - mov hd, ohd - mov dstq, tdstq - mov srcq, tsrcq - sub w2d, 2*8 - jg .v_w88l - - RET - -.v_jmp_tbl: -%ifidn %1, put - dd .v_w2 - .v_jmp_tbl -%endif - dd .v_w4 - .v_jmp_tbl - dd .v_w8 - .v_jmp_tbl - dd .v_w8 - .v_jmp_tbl - dd .v_w8 - .v_jmp_tbl - dd .v_w8 - .v_jmp_tbl - dd .v_w8 - .v_jmp_tbl - -; ---- {put,prep}_8tap_hv_16bpc ---- - -INIT_XMM avx2 -.%1_8tap_hv_16bpc: ; cglobal put_8tap_hv_16bpc, 4, 9, 0, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 -%endif - - cmp hd, 4 - jle .hv_use4tap - shr myd, 7 -.hv_use4tap: - and myd, 0x7f - -INIT_YMM avx2 - popcnt bdmaxd, bdmaxm - vpbroadcastd m6, [pd_2] ; (1 << (6 - 4) >> 1) - movq xm13, [pq_2] ; 6 - 4 -%ifidn %1, put - vpbroadcastd m14, [pd_512] ; (1 << (6 + 4) >> 1) - movq xm15, [pq_10] ; 6 + 4 -%else - vpbroadcastd m14, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6) -%endif - cmp bdmaxd, 12 - jne .hv_bits10 - vpbroadcastd m6, [pd_8] ; (1 << (6 - 2) >> 1) - movq xm13, [pq_4] ; 6 - 2 -%ifidn %1, put - vpbroadcastd m14, [pd_128] ; (1 << (6 + 2) >> 1) - movq xm15, [pq_8] ; 6 + 2 -%endif -.hv_bits10: -INIT_XMM avx2 - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3 -%endif - - lea jrq, [.hv_jmp_tbl] - -INIT_YMM avx2 - lea ss3q, [jrq - .hv_jmp_tbl + subpel_filters + myq*8] - vpbroadcastw xm8, [ss3q] - vpbroadcastw xm9, [ss3q + 2] - vpbroadcastw xm10, [ss3q + 4] - vpbroadcastw xm11, [ss3q + 6] - vpmovsxbw m8, xm8 - vpmovsxbw m9, xm9 - vpmovsxbw m10, xm10 - vpmovsxbw m11, xm11 -INIT_XMM avx2 - - ; Width is need for for filters 8 and larger, see .hv_w8 - mov ss3q, _wq - - tzcnt _wd, _wm -%ifidn %1, put - sub _wd, 1 -%else - sub _wd, 2 -%endif - movsxd _wq, [jrq + _wq*4] - add _wq, jrq - jmp _wq - -%ifidn %1, put -.hv_w2: - cmp hd, 4 - jg .hv_w28 - - lea ss3q, [ssq*3] - - mova m8, [spf_h_shuf] - 
vpbroadcastd m5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] - vpmovsxbw m5, m5 - - sub srcq, 2 - sub srcq, ssq - - movu m0, [srcq] - movu m1, [srcq + ssq] - movu m2, [srcq + ssq*2] - add srcq, ss3q - - PUT_4TAP_HS2 m0, m1, m8, m5, m6, m13 - PUT_4TAP_HS1 m2, m8, m5, m6, m13 - INTERLEAVE_REGS wd, m0, m1, m2 - -.hv_w2l: - - movu m3, [srcq] - movu m4, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m3, m4, m8, m5, m6, m13 - - INTERLEAVE_REGS wd, m2, m3, m4 - - MUL_ADD_R m11, m12, m0, m1, m2, m3, m9, m10 - mova m2, m4 - - RND_SHR_MIN_R m11, m12, m14, m15, m7 - STORE_REGS d, 11, 12 - - sub hd, 2 - jg .hv_w2l - - RET - -.hv_w28: - lea ss3q, [ssq*3] - - mova m8, [spf_h_shuf] - vpbroadcastd m5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] - vpmovsxbw m5, m5 - - lea myq, [jrq - .hv_jmp_tbl + subpel_filters + myq*8] - vpbroadcastd m9, [myq] - vpbroadcastd m10, [myq + 4] - vpmovsxbw m9, m9 - vpmovsxbw m10, m10 - - sub srcq, 2 - sub srcq, ss3q - - movu m0, [srcq] - movu m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m0, m1, m8, m5, m6, m13 - - movu m4, [srcq] - movu m3, [srcq + ssq] - movu m2, [srcq + ssq*2] - add srcq, ss3q - - PUT_4TAP_HS2 m4, m3, m8, m5, m6, m13 - PUT_4TAP_HS1 m2, m8, m5, m6, m13 - - INTERLEAVE_REGS wd, m0, m1, m4, m3, m2 - punpckldq m0, m4 - punpckldq m1, m3 - - movu m3, [srcq] - movu m4, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m3, m4, m8, m5, m6, m13 - - INTERLEAVE_REGS wd, m2, m3, m4 - -.hv_w28l: - - movu m11, [srcq] - movu m12, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m11, m12, m8, m5, m6, m13 - - INTERLEAVE_REGS wd, m4, m11, m12 - punpckldq m2, m4 - punpckldq m3, m11 - - pmaddwd m11, m0, m9 - pmaddwd m4, m2, m10 - pmaddwd m12, m1, m9 - paddd m11, m4 - pmaddwd m4, m3, m10 - paddd m12, m4 - phaddd m11, m11 - phaddd m12, m12 - - RND_SHR_MIN_R m11, m12, m14, m15, m7 - STORE_REGS d, 11, 12 - - pshufd m0, m0, q2031 - pshufd m1, m1, q2031 - pshufd m11, m2, q3120 - pshufd m12, m3, q3120 - pshufd m2, m2, q2031 - pshufd m3, m3, q2031 - - mova m4, m3 - psrad m4, 16 - packssdw m4, m4 - - punpckldq m0, m11 - punpckldq m1, m12 - - sub hd, 2 - jg .hv_w28l - - RET -%endif - -INIT_YMM avx2 -.hv_w4: - cmp hd, 4 - jg .hv_w48 - - lea ss3q, [ssq*3] - - mova m8, [spf_h_shuf] - vpbroadcastd xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] - vpmovsxbw m5, xm5 - - sub srcq, 2 - sub srcq, ssq - - vbroadcasti128 m0, [srcq] - vbroadcasti128 m1, [srcq + ssq] - vbroadcasti128 m2, [srcq + ssq*2] - add srcq, ss3q - - PUT_4TAP_HS2 m0, m1, m8, m5, m6, xm13 - PUT_4TAP_HS1 m2, m8, m5, m6, xm13 - INTERLEAVE_REGS wd, m0, m1, m2 - -.hv_w4l: - - vbroadcasti128 m3, [srcq] - vbroadcasti128 m4, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m3, m4, m8, m5, m6, xm13 - - INTERLEAVE_REGS wd, m2, m3, m4 - - MUL_ADD_R m11, m12, m0, m1, m2, m3, m9, m10 - mova m2, m4 - -%ifidn %1, put - RND_SHR_MIN_R m11, m12, m14, xm15, m7 -%else - RND_SHR_R m11, m12, m14, 6 -%endif - - vextracti128 xm3, m11, 1 - vextracti128 xm4, m12, 1 - - movd [dstq], xm11 - movd [dstq + 4], xm3 - movd [dstq + dsq], xm12 - movd [dstq + dsq + 4], xm4 - lea dstq, [dstq + dsq*2] - - sub hd, 2 - jg .hv_w4l - - RET - -.hv_w48: - lea ss3q, [ssq*3] - - mova m8, [spf_h_shuf] - vpbroadcastd xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] - vpmovsxbw m5, xm5 - - lea myq, [jrq - .hv_jmp_tbl + subpel_filters + myq*8] - vpbroadcastd xm9, [myq] - vpbroadcastd xm10, [myq + 4] - vpmovsxbw m9, xm9 - vpmovsxbw m10, xm10 - - sub srcq, 2 - sub srcq, ss3q - - vbroadcasti128 m0, [srcq] - 
vbroadcasti128 m1, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m0, m1, m8, m5, m6, xm13 - - vbroadcasti128 m4, [srcq] - vbroadcasti128 m3, [srcq + ssq] - vbroadcasti128 m2, [srcq + ssq*2] - add srcq, ss3q - - PUT_4TAP_HS2 m4, m3, m8, m5, m6, xm13 - PUT_4TAP_HS1 m2, m8, m5, m6, xm13 - - INTERLEAVE_REGS wd, m0, m1, m4, m3, m2 - punpckldq m0, m4 - punpckldq m1, m3 - - vbroadcasti128 m3, [srcq] - vbroadcasti128 m4, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m3, m4, m8, m5, m6, xm13 - - INTERLEAVE_REGS wd, m2, m3, m4 - -.hv_w48l: - - vbroadcasti128 m11, [srcq] - vbroadcasti128 m12, [srcq + ssq] - lea srcq, [srcq + ssq*2] - - PUT_4TAP_HS2 m11, m12, m8, m5, m6, xm13 - - INTERLEAVE_REGS wd, m4, m11, m12 - punpckldq m2, m4 - punpckldq m3, m11 - - pmaddwd m11, m0, m9 - pmaddwd m4, m2, m10 - pmaddwd m12, m1, m9 - paddd m11, m4 - pmaddwd m4, m3, m10 - paddd m12, m4 - phaddd m11, m11 - phaddd m12, m12 - -%ifidn %1, put - RND_SHR_MIN_R m11, m12, m14, xm15, m7 -%else - RND_SHR_R m11, m12, m14, 6 -%endif - - vextracti128 xm4, m11, 1 - movd [dstq], xm11 - movd [dstq + 4], xm4 - vextracti128 xm4, m12, 1 - movd [dstq + dsq], xm12 - movd [dstq + dsq + 4], xm4 - lea dstq, [dstq + dsq*2] - - pshufd m0, m0, q2031 - pshufd m1, m1, q2031 - pshufd m11, m2, q3120 - pshufd m12, m3, q3120 - pshufd m2, m2, q2031 - pshufd m3, m3, q2031 - - mova m4, m3 - psrad m4, 16 - packssdw m4, m4 - - punpckldq m0, m11 - punpckldq m1, m12 - - sub hd, 2 - jg .hv_w48l - RET - -.hv_w8: - mov _wq, ss3q - - cmp hd, 4 - jg .hv_w88 - - lea ss3q, [ssq*3] - - vpbroadcastq xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8] - vpmovsxbw m5, xm5 - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, tsrc, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, tsrc, ds, ss3 -%endif - - sub srcq, 6 - sub srcq, ssq - - mov ohd, hd - mov tdstq, dstq - mov tsrcq, srcq - -.hv_w8l: - - PUT_8TAP_HS 0, 1, 2, m5, m6, xm13, 0*ssq - PUT_8TAP_HS 1, 2, 3, m5, m6, xm13, 1*ssq - PUT_8TAP_HS 2, 3, 4, m5, m6, xm13, 2*ssq - add srcq, ss3q - - INTERLEAVE_REGS wd, m0, m1, m2 - -.hv_w8c: ; Nx2, Nx4 - - PUT_8TAP_HS 3, 8, 11, m5, m6, xm13, 0*ssq - PUT_8TAP_HS 4, 8, 11, m5, m6, xm13, 1*ssq - lea srcq, [srcq + ssq*2] - - INTERLEAVE_REGS wd, m2, m3, m4 - - MUL_ADD_R m8, m11, m0, m1, m2, m3, m9, m10 - mova m2, m4 - -%ifidn %1, put - RND_SHR_MIN_R m8, m11, m14, xm15, m7 -%else - RND_SHR_R m8, m11, m14, 6 -%endif - - vextracti128 xm3, m8, 1 - vextracti128 xm4, m11, 1 - - movq [dstq], xm8 - movq [dstq + 8], xm3 - movq [dstq + dsq], xm11 - movq [dstq + dsq + 8], xm4 - lea dstq, [dstq + dsq*2] - - sub hd, 2 - jg .hv_w8c - - add tdstq, 2*8 - add tsrcq, 2*8 - mov hd, ohd - mov dstq, tdstq - mov srcq, tsrcq - sub _wd, 8 - jg .hv_w8l - RET - -.hv_w88: -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3 -%endif - - lea ss3q, [ssq*3] - - vpbroadcastq xm7, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8] - vpmovsxbw m7, xm7 - - sub srcq, 6 - sub srcq, ss3q - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, bdmax, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, bdmax, ds, ss3 -%endif - - mov ohd, hd - mov tdstq, dstq - - popcnt bdmaxd, bdmaxm - cmp bdmaxd, 12 - je .hv_w88_12bit - -%ifidn %1, put - DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, tsrc, ss3 -%elifidn %1, prep - DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, tsrc, ds, ss3 -%endif - - mov tsrcq, srcq - -.hv_w88l_10bit: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32: - 
- vpbroadcastd m15, [pd_2] ; (1 << (6 - 4) >> 1) - - PUT_8TAP_HS 0, 12, 13, m7, m15, 6 - 4, 0*ssq - PUT_8TAP_HS 1, 12, 13, m7, m15, 6 - 4, 1*ssq - PUT_8TAP_HS 2, 12, 13, m7, m15, 6 - 4, 2*ssq - add srcq, ss3q - - PUT_8TAP_HS 3, 12, 13, m7, m15, 6 - 4, 0*ssq - PUT_8TAP_HS 4, 12, 13, m7, m15, 6 - 4, 1*ssq - lea srcq, [srcq + ssq*2] - - PUT_8TAP_HS 5, 12, 13, m7, m15, 6 - 4, 0*ssq - PUT_8TAP_HS 6, 12, 13, m7, m15, 6 - 4, 1*ssq - lea srcq, [srcq + ssq*2] - - INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m5, m6 - -.hv_w88c_10bit: - - PUT_8TAP_HS 12, 14, 15, m7, m15, 6 - 4, 0*ssq, [pd_2] - PUT_8TAP_HS 13, 14, 15, m7, m15, 6 - 4, 1*ssq, [pd_2] - lea srcq, [srcq + ssq*2] - - INTERLEAVE_REGS wd, m6, m12, m13 - - MUL_ADD_R m14, m15, m0, m1, m2, m3, m8, m9 - MUL_ACC_R m14, m15, m2, m3, m4, m5, m10 - MUL_ACC_R m14, m15, m4, m5, m6, m12, m11 - -%ifidn %1, put - vpbroadcastd m6, [pd_512] ; (1 << (6 + 4) >> 1) - vpbroadcastw m12, tsrcm ; bdmaxm - RND_SHR_MIN_R m14, m15, m6, 6 + 4, m12 -%else - vpbroadcastd m6, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6) - RND_SHR_R m14, m15, m6, 6 -%endif - - mova m6, m13 - - vextracti128 xm12, m14, 1 - vextracti128 xm13, m15, 1 - - movq [dstq], xm14 - movq [dstq + 8], xm12 - movq [dstq + dsq], xm15 - movq [dstq + dsq + 8], xm13 - lea dstq, [dstq + dsq*2] - - sub hd, 2 - jg .hv_w88c_10bit - - add tdstq, 2*8 - add tsrcq, 2*8 - mov hd, ohd - mov dstq, tdstq - mov srcq, tsrcq - sub _wd, 8 - jg .hv_w88l_10bit - RET - -.hv_w88_12bit: - - mov tsrcq, srcq - -.hv_w88l_12bit: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32: - - vpbroadcastd m15, [pd_8] ; (1 << (6 - 2) >> 1) - - PUT_8TAP_HS 0, 12, 13, m7, m15, 6 - 2, 0*ssq - PUT_8TAP_HS 1, 12, 13, m7, m15, 6 - 2, 1*ssq - PUT_8TAP_HS 2, 12, 13, m7, m15, 6 - 2, 2*ssq - add srcq, ss3q - - PUT_8TAP_HS 3, 12, 13, m7, m15, 6 - 2, 0*ssq - PUT_8TAP_HS 4, 12, 13, m7, m15, 6 - 2, 1*ssq - lea srcq, [srcq + ssq*2] - - PUT_8TAP_HS 5, 12, 13, m7, m15, 6 - 2, 0*ssq - PUT_8TAP_HS 6, 12, 13, m7, m15, 6 - 2, 1*ssq - lea srcq, [srcq + ssq*2] - - INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m5, m6 - -.hv_w88c_12bit: - - PUT_8TAP_HS 12, 14, 15, m7, m15, 6 - 2, 0*ssq, [pd_8] - PUT_8TAP_HS 13, 14, 15, m7, m15, 6 - 2, 1*ssq, [pd_8] - lea srcq, [srcq + ssq*2] - - INTERLEAVE_REGS wd, m6, m12, m13 - - MUL_ADD_R m14, m15, m0, m1, m2, m3, m8, m9 - MUL_ACC_R m14, m15, m2, m3, m4, m5, m10 - MUL_ACC_R m14, m15, m4, m5, m6, m12, m11 - -%ifidn %1, put - vpbroadcastd m6, [pd_128] ; (1 << (6 + 2) >> 1) - vpbroadcastw m12, tsrcm ; bdmaxm - RND_SHR_MIN_R m14, m15, m6, 6 + 2, m12 -%else - vpbroadcastd m6, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6) - RND_SHR_R m14, m15, m6, 6 -%endif - - mova m6, m13 - - vextracti128 xm12, m14, 1 - vextracti128 xm13, m15, 1 - - movq [dstq], xm14 - movq [dstq + 8], xm12 - movq [dstq + dsq], xm15 - movq [dstq + dsq + 8], xm13 - lea dstq, [dstq + dsq*2] - - sub hd, 2 - jg .hv_w88c_12bit - - add tdstq, 2*8 - add tsrcq, 2*8 - mov hd, ohd - mov dstq, tdstq - mov srcq, tsrcq - sub _wd, 8 - jg .hv_w88l_12bit - RET - -.hv_jmp_tbl: -%ifidn %1, put - dd .hv_w2 - .hv_jmp_tbl -%endif - dd .hv_w4 - .hv_jmp_tbl - dd .hv_w8 - .hv_jmp_tbl - dd .hv_w8 - .hv_jmp_tbl - dd .hv_w8 - .hv_jmp_tbl - dd .hv_w8 - .hv_jmp_tbl - dd .hv_w8 - .hv_jmp_tbl -%endm - -filter_fn put -filter_fn prep - -%macro AVG 1 - mova m0, [p1q] - mova m2, [p2q] - punpckhwd m1, m0, m2 - punpcklwd m0, m2 -%ifidn %1, mask - mova xm2, [mq] - vpmovsxbw m2, xm2 - vpbroadcastw m7, [pw_64] - psubw m7, m2 - punpckhwd m3, m2, m7 - punpcklwd m2, m7 - pmaddwd m0, m2 -%else - pmaddwd m0, m3 -%endif - pmaddwd m1, m3 - paddd 
m0, m4 - paddd m1, m4 - psrad m0, xm5 - psrad m1, xm5 - packusdw m0, m1 - pminuw m0, m6 -%endm - -%macro bilin_fn 1 -%ifidn %1, avg -cglobal avg_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, bdmax, ds3, ow -%elifidn %1, w_avg -cglobal w_avg_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, wg, bdmax, ow -%else -cglobal mask_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, m, bdmax, ow -%endif - - movifnidn hd, hm - movifnidn wd, wm - -%ifidn %1, avg - vpbroadcastw m3, [pw_1] - vpbroadcastd m4, [pd_16400] ; (1 << (6 - 2)) + 8192*2 - movq xm5, [pq_5] ; (6 - 2) + 1 -%elifidn %1, w_avg - vpbroadcastw m3, wgm - vpbroadcastw m4, [pw_16] - psubw m4, m3 - punpcklwd m3, m4 - vpbroadcastd m4, [pd_131200] ; ((1 << (6 - 2)) + 8192*2) << 3 - movq xm5, [pq_8] ; (6 - 2) + 1 + 3 -%else - movifnidn mq, mmp - vpbroadcastd m4, [pd_524800] ; ((1 << (6 - 2)) + 8192*2) << 5 - movq xm5, [pq_10] ; (6 - 2) + 1 + 5 -%endif - - popcnt bdmaxd, bdmaxm - cmp bdmaxd, 10 - je .bits10 - -%ifidn %1, avg - vpbroadcastd m4, [pd_16388] ; (1 << (6 - 4)) + 8192*2 - movq xm5, [pq_3] ; (6 - 4) + 1 -%elifidn %1, w_avg - vpbroadcastd m4, [pd_131104] ; ((1 << (6 - 4)) + 8192*2) << 3 - movq xm5, [pq_6] ; (6 - 4) + 1 + 3 -%else - vpbroadcastd m4, [pd_524416] ; ((1 << (6 - 4)) + 8192*2) << 5 - movq xm5, [pq_8] ; (6 - 4) + 1 + 5 -%endif -.bits10: - - vpbroadcastw m6, bdmaxm - - lea owd, [2*wd] - -DEFINE_ARGS dst, ds, p1, p2, w, h, m, jr, ow - - lea jrq, [.jmp_tbl] - tzcnt wd, wm - sub wd, 2 - movsxd wq, [jrq + wq*4] - add wq, jrq - jmp wq - -.w4: -DEFINE_ARGS dst, ds, p1, p2, w, h, m, ds3, ow - - lea ds3q, [dsq*3] - -.w4l: - AVG %1 - - vextracti128 xm1, m0, 1 - movq [dstq], xm0 - pextrq [dstq + dsq], xm0, 1 - movq [dstq + 2*dsq], xm1 - pextrq [dstq + ds3q], xm1, 1 - - lea dstq, [dstq + 4*dsq] - add p1q, 32 - add p2q, 32 -%ifidn %1, mask - add mq, 16 -%endif - - sub hd, 4 - jg .w4l - RET - -.w8: - AVG %1 - - vextracti128 xm1, m0, 1 - mova [dstq], xm0 - mova [dstq + dsq], xm1 - - lea dstq, [dstq + dsq*2] - add p1q, 32 - add p2q, 32 -%ifidn %1, mask - add mq, 16 -%endif - - sub hd, 2 - jg .w8 - - RET - -.w16: - - mov wd, owd ; upper 32-bits of wq zerod by jmp - sub dsq, wq - -.w16l: - AVG %1 - - mova [dstq], m0 - - add dstq, 32 - add p1q, 32 - add p2q, 32 -%ifidn %1, mask - add mq, 16 -%endif - - sub wd, 32 - jg .w16l - - add dstq, dsq - mov wd, owd - dec hd - jg .w16l - - RET - -.jmp_tbl: - dd .w4 - .jmp_tbl - dd .w8 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w16 - .jmp_tbl -%endm - -bilin_fn avg -bilin_fn w_avg -bilin_fn mask - -INIT_XMM avx2 -cglobal blend_16bpc, 6, 7, 7, dst, ds, tmp, w, h, mask, jr - pxor m3, m3 - lea jrq, [.jmp_tbl] - tzcnt wd, wm - sub wd, 2 - movsxd wq, [jrq + wq*4] - add wq, jrq - jmp wq -.w4: - movq m0, [dstq] - pinsrq m0, [dstq + dsq], 1 - mova m1, [tmpq] - movq m2, [maskq] - psubb m2, m3, m2 - pmovsxbw m2, m2 - psllw m2, 9 - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - movq [dstq], m0 - pextrq [dstq + dsq], m0, 1 - add maskq, 8 - add tmpq, 16 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w4 - RET -INIT_YMM avx2 -.w8: - mova xm0, [dstq] - vinserti128 m0, [dstq + dsq], 1 - mova m1, [tmpq] - mova xm2, [maskq] - psubb xm2, xm3, xm2 - pmovsxbw m2, xm2 - psllw m2, 9 - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - mova [dstq], xm0 - vextracti128 [dstq + dsq], m0, 1 - add maskq, 16 - add tmpq, 32 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w8 - RET -.w16: - mova m0, [dstq] - mova m4, [dstq + dsq] - mova m1, [tmpq] - mova m5, [tmpq + 32] - mova xm2, [maskq] - mova xm6, [maskq + 16] - psubb xm2, 
xm3, xm2 - psubb xm6, xm3, xm6 - pmovsxbw m2, xm2 - pmovsxbw m6, xm6 - psllw m2, 9 - psllw m6, 9 - psubw m1, m0, m1 - psubw m5, m4, m5 - pmulhrsw m1, m2 - pmulhrsw m5, m6 - paddw m0, m1 - paddw m4, m5 - mova [dstq], m0 - mova [dstq + dsq], m4 - add maskq, 32 - add tmpq, 64 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w16 - RET -.w32: - mova m0, [dstq] - mova m4, [dstq + 32] - mova m1, [tmpq] - mova m5, [tmpq + 32] - mova xm2, [maskq] - mova xm6, [maskq + 16] - psubb xm2, xm3, xm2 - psubb xm6, xm3, xm6 - pmovsxbw m2, xm2 - pmovsxbw m6, xm6 - psllw m2, 9 - psllw m6, 9 - psubw m1, m0, m1 - psubw m5, m4, m5 - pmulhrsw m1, m2 - pmulhrsw m5, m6 - paddw m0, m1 - paddw m4, m5 - mova [dstq], m0 - mova [dstq + 32], m4 - add maskq, 32 - add tmpq, 64 - add dstq, dsq - dec hd - jg .w32 - RET -.jmp_tbl: - dd .w4 - .jmp_tbl - dd .w8 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w32 - .jmp_tbl - -cextern obmc_masks - -INIT_XMM avx2 -cglobal blend_v_16bpc, 5, 7, 7, dst, ds, tmp, w, h, o, jr - lea oq, [obmc_masks] - pxor m3, m3 - movsx wq, wd - add oq, wq - lea jrq, [.jmp_tbl] - tzcnt wd, wm - sub wd, 1 - movsxd wq, [jrq + wq*4] - add wq, jrq - jmp wq -.w2: - vpbroadcastw m2, [oq] - psubb m2, m3, m2 - pmovsxbw m2, m2 - psllw m2, 9 -.w2l: - movd m0, [dstq] - movd m1, [tmpq] - pinsrd m0, [dstq + dsq], 1 - pinsrd m1, [tmpq + 4], 1 - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - movd [dstq], m0 - pextrd [dstq + dsq], m0, 1 - add tmpq, 8 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w2l - RET -.w4: - vpbroadcastd m2, [oq] - psubb m2, m3, m2 - pmovsxbw m2, m2 - psllw m2, 9 -.w4l: - movq m0, [dstq] - movq m1, [tmpq] - pinsrq m0, [dstq + dsq], 1 - pinsrq m1, [tmpq + 8], 1 - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - movq [dstq], m0 - pextrq [dstq + dsq], m0, 1 - add tmpq, 16 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w4l - RET -INIT_YMM avx2 -.w8: - vpbroadcastq xm2, [oq] - psubb xm2, xm3, xm2 - pmovsxbw m2, xm2 - psllw m2, 9 -.w8l: - mova xm0, [dstq] - vinserti128 m0, [dstq + dsq], 1 - mova m1, [tmpq] - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - mova [dstq], xm0 - vextracti128 [dstq + dsq], m0, 1 - add tmpq, 32 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w8l - RET -.w16: - mova xm2, [oq] - psubb xm2, xm3, xm2 - pmovsxbw m2, xm2 - psllw m2, 9 -.w16l: - mova m0, [dstq] - mova m4, [dstq + dsq] - mova m1, [tmpq] - mova m5, [tmpq + 32] - psubw m1, m0, m1 - psubw m5, m4, m5 - pmulhrsw m1, m2 - pmulhrsw m5, m2 - paddw m0, m1 - paddw m4, m5 - mova [dstq], m0 - mova [dstq + dsq], m4 - add tmpq, 64 - lea dstq, [dstq + 2*dsq] - sub hd, 2 - jg .w16l - RET -.w32: - mova xm2, [oq] - mova xm6, [oq + 16] - psubb xm2, xm3, xm2 - psubb xm6, xm3, xm6 - pmovsxbw m2, xm2 - pmovsxbw m6, xm6 - psllw m2, 9 - psllw m6, 9 -.w32l: - mova m0, [dstq] - mova m4, [dstq + 32] - mova m1, [tmpq] - mova m5, [tmpq + 32] - psubw m1, m0, m1 - psubw m5, m4, m5 - pmulhrsw m1, m2 - pmulhrsw m5, m6 - paddw m0, m1 - paddw m4, m5 - mova [dstq], m0 - mova [dstq + 32], m4 - add tmpq, 64 - add dstq, dsq - dec hd - jg .w32l - RET -.jmp_tbl: - dd .w2 - .jmp_tbl - dd .w4 - .jmp_tbl - dd .w8 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w32 - .jmp_tbl - -INIT_XMM avx2 -cglobal blend_h_16bpc, 5, 8, 7, dst, ds, tmp, w, h, o, jr, w2 - pxor m3, m3 - lea w2d, [wd*2] - lea oq, [obmc_masks] - movsx hq, hd - add oq, hq - imul hq, 3 - shr hq, 2 - lea jrq, [.jmp_tbl] - tzcnt wd, wm - sub wd, 1 - movsxd wq, [jrq + wq*4] - add wq, jrq - jmp wq -.w2: - movd m2, [oq] - psubb m2, m3, m2 - punpcklbw m2, m2 - pmovsxbw m2, m2 - psllw m2, 9 - movd m0, [dstq] - movd m1, 
[tmpq] - pinsrd m0, [dstq + dsq], 1 - pinsrd m1, [tmpq + 4], 1 - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - movd [dstq], m0 - pextrd [dstq + dsq], m0, 1 - add tmpq, 8 - lea dstq, [dstq + 2*dsq] - add oq, 2 - sub hd, 2 - jg .w2 - RET -.w4: - movd m2, [oq] - punpcklbw m2, m2 - punpcklwd m2, m2 - psubb m2, m3, m2 - pmovsxbw m2, m2 - psllw m2, 9 - movq m0, [dstq] - movq m1, [tmpq] - pinsrq m0, [dstq + dsq], 1 - pinsrq m1, [tmpq + 8], 1 - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - movq [dstq], m0 - pextrq [dstq + dsq], m0, 1 - add tmpq, 16 - lea dstq, [dstq + 2*dsq] - add oq, 2 - sub hd, 2 - jg .w4 - RET -INIT_YMM avx2 -.w8: - movd xm2, [oq] - psubb xm2, xm3, xm2 - punpcklbw xm2, xm2 - punpcklwd xm2, xm2 - punpckldq xm2, xm2 - pmovsxbw m2, xm2 - psllw m2, 9 - mova xm0, [dstq] - vinserti128 m0, [dstq + dsq], 1 - mova m1, [tmpq] - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - mova [dstq], xm0 - vextracti128 [dstq + dsq], m0, 1 - add tmpq, 32 - lea dstq, [dstq + 2*dsq] - add oq, 2 - sub hd, 2 - jg .w8 - RET -.w16: - vpbroadcastb xm2, [oq] - vpbroadcastb xm6, [oq + 1] - psubb xm2, xm3, xm2 - psubb xm6, xm3, xm6 - pmovsxbw m2, xm2 - pmovsxbw m6, xm6 - psllw m2, 9 - psllw m6, 9 - mova m0, [dstq] - mova m1, [tmpq] - mova m4, [dstq + dsq] - mova m5, [tmpq + 32] - psubw m1, m0, m1 - psubw m5, m4, m5 - pmulhrsw m1, m2 - pmulhrsw m5, m6 - paddw m0, m1 - paddw m4, m5 - mova [dstq], m0 - mova [dstq + dsq], m4 - add tmpq, 64 - lea dstq, [dstq + 2*dsq] - add oq, 2 - sub hd, 2 - jg .w16 - RET -.w32: - mov wd, w2d - sub dsq, wq -.w32l: - vpbroadcastb xm2, [oq] - psubb xm2, xm3, xm2 - pmovsxbw m2, xm2 - psllw m2, 9 - mov wd, w2d -.w32c: - mova m0, [dstq] - mova m1, [tmpq] - psubw m1, m0, m1 - pmulhrsw m1, m2 - paddw m0, m1 - mova [dstq], m0 - add dstq, 32 - add tmpq, 32 - sub wd, 32 - jg .w32c - add dstq, dsq - inc oq - dec hd - jg .w32l - RET -.jmp_tbl: - dd .w2 - .jmp_tbl - dd .w4 - .jmp_tbl - dd .w8 - .jmp_tbl - dd .w16 - .jmp_tbl - dd .w32 - .jmp_tbl - dd .w32 - .jmp_tbl - dd .w32 - .jmp_tbl - -%endif ; ARCH_X86_64 From fad01783fc6893169e09f194a3476980762b51f4 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:01 +0200 Subject: [PATCH 033/188] x86: Add high bitdepth put_bilin AVX2 asm --- src/x86/mc16_avx2.asm | 615 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 615 insertions(+) create mode 100644 src/x86/mc16_avx2.asm diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm new file mode 100644 index 0000000000..2ee3fc8872 --- /dev/null +++ b/src/x86/mc16_avx2.asm @@ -0,0 +1,615 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +put_bilin_h_rnd: dw 8, 8, 10, 10 + +pw_2: times 2 dw 2 +pw_16: times 2 dw 16 +pw_2048: times 2 dw 2048 +pw_8192: times 2 dw 8192 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) + +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +SECTION .text + +INIT_XMM avx2 +cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + mov mxyd, r6m ; mx + lea r7, [put_avx2] +%if UNIX64 + DECLARE_REG_TMP 8 + %define org_w r8d + mov r8d, wd +%else + DECLARE_REG_TMP 7 + %define org_w wm +%endif + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +INIT_YMM avx2 +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*0+32*1], m1 + mova [dstq+dsq*1+32*0], m2 + mova [dstq+dsq*1+32*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + add srcq, ssq + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + add 
dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + movu m0, [srcq+32*4] + movu m1, [srcq+32*5] + movu m2, [srcq+32*6] + movu m3, [srcq+32*7] + add srcq, ssq + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r7m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + mov r6d, r8m ; bitdepth_max + add wq, r7 + shr r6d, 11 + vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] + jmp wq +.h_w2: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw xm0, xm4, xm1 + psrlq xm1, 16 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 4 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + movq xm1, [srcq+ssq*0+2] + movhps xm1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw xm0, xm4 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 4 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + movu xm1, [srcq+ssq*0+2] + vinserti128 m1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+ssq*0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+ssq*1] + pmullw m2, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m5, [srcq+32*0+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+32*1] + pmullw m2, m5, [srcq+32*1+2] + add srcq, ssq + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w64_loop0: + mov r6d, t0d +.h_w64_loop: + pmullw m0, m4, [srcq+r6*2-32*1] + pmullw m1, m5, [srcq+r6*2-32*1+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r6*2-32*2] + pmullw m2, m5, [srcq+r6*2-32*2+2] + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2-32*1], m0 + mova [dstq+r6*2-32*2], m1 + sub r6d, 32 + jg .h_w64_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w64_loop0 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + shl mxyd, 11 + movd xm5, mxyd + add wq, r7 + vpbroadcastw m5, xm5 + jmp wq +.v_w2: + movd xm0, [srcq+ssq*0] +.v_w2_loop: + movd xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq xm2, xm0, xm1 + movd xm0, [srcq+ssq*0] + punpckldq xm1, xm0 + psubw xm1, xm2 + pmulhrsw xm1, xm5 + paddw xm1, xm2 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm0, [srcq+ssq*0] +.v_w4_loop: + movq xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + 
punpcklqdq xm2, xm0, xm1 + movq xm0, [srcq+ssq*0] + punpcklqdq xm1, xm0 + psubw xm1, xm2 + pmulhrsw xm1, xm5 + paddw xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+ssq*0] +.v_w8_loop: + vbroadcasti128 m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m2, m0, m1, 0xf0 + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m1, m0, 0xf0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w32: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] +.v_w32_loop: + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + psubw m4, m2, m0 + pmulhrsw m4, m5 + paddw m4, m0 + movu m0, [srcq+ssq*0+32*0] + mova [dstq+dsq*0+32*0], m4 + psubw m4, m3, m1 + pmulhrsw m4, m5 + paddw m4, m1 + movu m1, [srcq+ssq*0+32*1] + mova [dstq+dsq*0+32*1], m4 + psubw m4, m0, m2 + pmulhrsw m4, m5 + paddw m4, m2 + mova [dstq+dsq*1+32*0], m4 + psubw m4, m1, m3 + pmulhrsw m4, m5 + paddw m4, m3 + mova [dstq+dsq*1+32*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w16: +.v_w64: +.v_w128: + movifnidn t0d, org_w + add t0d, t0d + mov r4, srcq + lea r6d, [hq+t0*8-256] + mov r7, dstq +.v_w16_loop0: + movu m0, [srcq+ssq*0] +.v_w16_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 + vpbroadcastd m3, [pw_2] + movd xm6, mxyd + vpbroadcastd m7, [pw_8192] + add wq, r7 + vpbroadcastw m6, xm6 + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + vpbroadcastd m7, [pw_2048] +.hv_12bpc: + jmp wq +.hv_w2: + vpbroadcastq xm1, [srcq+ssq*0] + pmullw xm0, xm4, xm1 + psrlq xm1, 16 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 +.hv_w2_loop: + movq xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm2, [srcq+ssq*0] + pmullw xm1, xm4, xm2 + psrlq xm2, 16 + pmullw xm2, xm5 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 ; 1 _ 2 _ + shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ + mova xm0, xm1 + psubw xm1, xm2 + paddw xm1, xm1 + pmulhw xm1, xm6 + paddw xm1, xm2 + pmulhrsw xm1, xm7 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + pmullw xm0, xm4, [srcq+ssq*0-8] + pmullw xm1, xm5, [srcq+ssq*0-6] + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 +.hv_w4_loop: + movq xm1, [srcq+ssq*1] + movq xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps xm1, [srcq+ssq*0] + movhps xm2, [srcq+ssq*0+2] + pmullw xm1, xm4 + pmullw xm2, xm5 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 ; 1 2 + shufpd xm2, xm0, xm1, 0x01 ; 0 1 + mova xm0, xm1 + psubw xm1, xm2 + paddw xm1, xm1 + pmulhw xm1, xm6 + paddw xm1, xm2 + pmulhrsw xm1, xm7 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xm0, xm4, [srcq+ssq*0] + pmullw xm1, xm5, [srcq+ssq*0+2] + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*1+2] + lea srcq, 
[srcq+ssq*2] + vinserti128 m1, [srcq+ssq*0], 1 + vinserti128 m2, [srcq+ssq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if UNIX64 + lea r6d, [r8*2-32] +%else + mov r6d, wm + lea r6d, [r6*2-32] +%endif + mov r4, srcq + lea r6d, [hq+r6*8] + mov r7, dstq +.hv_w16_loop0: + pmullw m0, m4, [srcq+ssq*0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+ssq*1] + pmullw m2, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + pmullw m0, m4, [srcq+ssq*0] + pmullw m2, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w16_loop0 + RET + +%endif ; ARCH_X86_64 From 765c0e6760fae2e876d60e7e79e2ef0e84ca94cb Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:10 +0200 Subject: [PATCH 034/188] x86: Add high bitdepth prep_bilin AVX2 asm --- src/x86/mc16_avx2.asm | 500 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 499 insertions(+), 1 deletion(-) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 2ee3fc8872..e3e1ebcf79 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -31,11 +31,14 @@ SECTION_RODATA put_bilin_h_rnd: dw 8, 8, 10, 10 +prep_mul: dw 16, 16, 4, 4 + +%define pw_16 prep_mul pw_2: times 2 dw 2 -pw_16: times 2 dw 16 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 +pw_32766: times 2 dw 32766 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -48,8 +51,10 @@ pw_8192: times 2 dw 8192 %endmacro %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep) BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) @@ -84,6 +89,7 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 %endmacro HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX @@ -612,4 +618,496 @@ INIT_YMM avx2 jg .hv_w16_loop0 RET +cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep_avx2] +%if UNIX64 + DECLARE_REG_TMP 7 + %define org_w r7d +%else + DECLARE_REG_TMP 6 + %define org_w r5m +%endif + mov org_w, wd + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [r6-prep_avx2+pw_8192] + add wq, r6 + shr r5d, 11 + vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + vpbroadcastq m1, 
[srcq+strideq*2] + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + pmullw m0, m4 + psubw m0, m5 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*2] + vinserti128 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] + pmullw m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmullw m0, m4, [srcq+strideq*0+32*0] + pmullw m1, m4, [srcq+strideq*0+32*1] + pmullw m2, m4, [srcq+strideq*1+32*0] + pmullw m3, m4, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .prep_w32 + RET +.prep_w64: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .prep_w64 + RET +.prep_w128: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + pmullw m0, m4, [srcq+32*4] + pmullw m1, m4, [srcq+32*5] + pmullw m2, m4, [srcq+32*6] + pmullw m3, m4, [srcq+32*7] + add tmpq, 32*8 + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r6m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + psubw m4, m5 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m4, 2 + psllw m5, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m0, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*0+2] + vinserti128 m1, [srcq+strideq*1+2], 1 + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, 
[srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w32_loop0: + mov r3d, t0d +.h_w32_loop: + pmullw m0, m4, [srcq+r3*2-32*1] + pmullw m1, m5, [srcq+r3*2-32*1+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r3*2-32*2] + pmullw m2, m5, [srcq+r3*2-32*2+2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+r3*2-32*1], m0 + mova [tmpq+r3*2-32*2], m1 + sub r3d, 32 + jg .h_w32_loop + add srcq, strideq + lea tmpq, [tmpq+t0*2] + dec hd + jg .h_w32_loop0 + RET +.v: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + movd xm5, mxyd + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + add wq, r6 + lea stride3q, [strideq*3] + psubw m4, m5 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m4, 2 + psllw m5, 2 +.v_12bpc: + jmp wq +.v_w4: + movq xm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq xm1, [srcq+strideq*1] + vpblendd m2, m0, 0x03 ; 0 2 2 2 + vpbroadcastq m0, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0xf0 ; 1 1 3 3 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m1, m2, 0x33 ; 0 1 2 3 + vpblendd m0, m2, 0x0c ; 4 2 4 4 + punpckhqdq m2, m1, m0 ; 1 2 3 4 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+strideq*0] +.v_w8_loop: + vbroadcasti128 m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m1, m0, m2, 0xf0 ; 0 1 + vbroadcasti128 m0, [srcq+strideq*0] + vpblendd m2, m0, 0xf0 ; 1 2 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+32*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.v_w32_loop0: + movu m0, [srcq+strideq*0] +.v_w32_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+r7*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+r7*1], m1 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .v_w32_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + POP r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + movd xm6, mxyd + add wq, r6 + lea stride3q, [strideq*3] + vpbroadcastw m6, xm6 + jmp wq +.hv_w4: + movu xm1, [srcq+strideq*0] +%if WIN64 + movaps [rsp+24], xmm7 +%endif + pmullw xm0, xm4, xm1 + psrldq xm1, 2 + pmullw xm1, xm5 + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vpbroadcastq m0, xm0 +.hv_w4_loop: + movu xm1, [srcq+strideq*1] + vinserti128 m1, [srcq+stride3q ], 1 + movu xm2, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vinserti128 m2, [srcq+strideq*0], 1 + punpcklqdq m7, m1, m2 + psrldq m1, 2 
+ pslldq m2, 6 + pmullw m7, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m7, m3 + paddw m1, m7 + psraw m1, 2 ; 1 2 3 4 + vpblendd m0, m1, 0x3f + vpermq m2, m0, q2103 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop +%if WIN64 + movaps xmm7, [rsp+24] +%endif + RET +.hv_w8: + pmullw xm0, xm4, [srcq+strideq*0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+strideq*1] + movu xm2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + vinserti128 m1, [srcq+strideq*0], 1 + vinserti128 m2, [srcq+strideq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.hv_w16_loop0: + pmullw m0, m4, [srcq] + pmullw m1, m5, [srcq+2] + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+r7*0], m2 + pmullw m0, m4, [srcq+strideq*0] + pmullw m2, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+r7*1], m2 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .hv_w16_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .hv_w16_loop0 +%if WIN64 + POP r7 +%endif + RET + %endif ; ARCH_X86_64 From 0f0314cbea46b9dba6c7355bbf8bb3d24b7a8dca Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:19 +0200 Subject: [PATCH 035/188] x86: Add high bitdepth put_8tap/prep_8tap AVX2 asm --- src/x86/mc16_avx2.asm | 1351 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1351 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index e3e1ebcf79..f35115ef9f 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -30,8 +30,15 @@ SECTION_RODATA +subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + put_bilin_h_rnd: dw 8, 8, 10, 10 prep_mul: dw 16, 16, 4, 4 +put_8tap_h_rnd: dd 34, 40 +prep_8tap_1d_rnd: dd 8 - (8192 << 4) +prep_8tap_2d_rnd: dd 32 - (8192 << 5) %define pw_16 prep_mul @@ -39,6 +46,8 @@ pw_2: times 2 dw 2 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 pw_32766: times 2 dw 32766 +pd_32: dd 32 +pd_512: dd 512 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -93,6 +102,9 @@ HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + SECTION .text INIT_XMM avx2 @@ -1110,4 +1122,1343 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %endif RET +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, 
type_h, type_v +cglobal %1_8tap_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 +%if WIN64 + pop r8 +%endif + jmp wq +.h_w2: + movzx mxd, mxb + sub srcq, 2 + mova xm2, [subpel_h_shuf2] + vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] + pmovsxbw xm3, xm3 +.h_w2_loop: + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm2 + pshufb xm1, xm2 + pmaddwd xm0, xm3 + pmaddwd xm1, xm3 + phaddd xm0, xm1 + paddd xm0, xm4 + psrad xm0, 6 + packusdw xm0, xm0 + pminsw xm0, xm5 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm3, [base+subpel_filters+mxq*8] + WIN64_SPILL_XMM 8 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + pshufd xm3, xm3, q2211 + vpbroadcastq m2, xm3 + vpermq m3, m3, q1111 +.h_w4_loop: + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 + pminsw xm0, xm5 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m5, r8m + shr r7d, 11 + vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + je .h_w4 + jl .h_w2 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 13 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m4 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m4 + paddd m%4, m%5 + paddd m%3, m%4 + 
paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packusdw m%1, m%2 + pminsw m%1, m5 +%endmacro + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6*2-32] + movu m1, [srcq+r6*2-24] + movu m2, [srcq+r6*2-16] + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+r6*2-32], m0 + sub r6d, 16 + jg .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastd m6, [pd_32] + vpbroadcastw m7, r8m + lea r6, [ssq*3] + sub srcq, r6 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd xm2, [srcq+ssq*0] + pinsrd xm2, [srcq+ssq*1], 1 + pinsrd xm2, [srcq+ssq*2], 2 + pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*4] + movd xm3, [srcq+ssq*0] + vpbroadcastd xm1, [srcq+ssq*1] + vpbroadcastd xm0, [srcq+ssq*2] + add srcq, r6 + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklwd xm3, xm1 ; 45 56 + punpcklwd xm1, xm2, xm4 ; 01 12 + punpckhwd xm2, xm4 ; 23 34 +.v_w2_loop: + vpbroadcastd xm4, [srcq+ssq*0] + pmaddwd xm5, xm8, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm9 ; a1 b1 + paddd xm5, xm6 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm10 ; a2 b2 + paddd xm5, xm3 + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklwd xm3, xm4 ; 67 78 + pmaddwd xm4, xm11, xm3 ; a3 b3 + paddd xm5, xm4 + psrad xm5, 6 + packusdw xm5, xm5 + pminsw xm5, xm7 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm1, [srcq+ssq*0] + vpbroadcastq m0, [srcq+ssq*1] + vpbroadcastq m2, [srcq+ssq*2] + vpbroadcastq m4, [srcq+r6 ] + lea srcq, [srcq+ssq*4] + vpbroadcastq m3, [srcq+ssq*0] + vpbroadcastq m5, [srcq+ssq*1] + vpblendd m1, m0, 0x30 + vpblendd m0, m2, 0x30 + punpcklwd m1, m0 ; 01 12 + vpbroadcastq m0, [srcq+ssq*2] + add srcq, r6 + vpblendd m2, m4, 0x30 + vpblendd m4, m3, 0x30 + punpcklwd m2, m4 ; 23 34 + vpblendd m3, m5, 0x30 + vpblendd m5, m0, 0x30 + punpcklwd m3, m5 ; 45 56 +.v_w4_loop: + vpbroadcastq m4, [srcq+ssq*0] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m6 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m4, m0, 0x30 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 6 + vextracti128 xm4, m5, 1 + packusdw xm5, xm4 + pminsw xm5, xm7 + movq [dstq+dsq*0], xm5 + movhps [dstq+dsq*1], xm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + shl wd, 5 + mov r7, srcq + mov r8, dstq + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m4, [srcq+ssq*0] + vbroadcasti128 m5, [srcq+ssq*1] + vbroadcasti128 m0, [srcq+r6 ] + vbroadcasti128 m6, [srcq+ssq*2] + lea srcq, [srcq+ssq*4] + vbroadcasti128 m1, [srcq+ssq*0] + vbroadcasti128 m2, [srcq+ssq*1] + vbroadcasti128 
m3, [srcq+ssq*2] + add srcq, r6 + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklwd m3, m6, m0 ; 23 + punpckhwd m6, m0 ; 56 +.v_w8_loop: + vbroadcasti128 m14, [srcq+ssq*0] + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + vbroadcasti128 m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + paddd m13, m6 + shufpd m6, m0, m14, 0x0d + shufpd m0, m14, m5, 0x0c + punpcklwd m5, m6, m0 ; 67 + punpckhwd m6, m0 ; 78 + pmaddwd m14, m11, m5 ; a3 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packusdw m12, m13 + pxor m13, m13 + pavgw m12, m13 + pminsw m12, m7 + vpermq m12, m12, q3120 + mova [dstq+dsq*0], xm12 + vextracti128 [dstq+dsq*1], m12, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + add r7, 16 + add r8, 16 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .v_w8_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastw m15, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + vpbroadcastd m6, [pd_512] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit + psraw m7, 2 + psllw m1, 2 +.hv_10bit: + pshufd m11, m1, q0000 + pshufd m12, m1, q1111 + pshufd m13, m1, q2222 + pshufd m14, m1, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m9, [subpel_h_shuf2] + vbroadcasti128 m1, [srcq+r6 ] ; 3 3 + movu xm3, [srcq+ssq*2] + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*4] + vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 + vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 + vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 + add srcq, r6 + pshufb m1, m9 + pshufb m3, m9 + pshufb m0, m9 + pshufb m2, m9 + pmaddwd m1, m7 + pmaddwd m3, m7 + pmaddwd m0, m7 + pmaddwd m2, m7 + phaddd m1, m3 + phaddd m0, m2 + paddd m1, m6 + paddd m0, m6 + psrad m1, 10 + psrad m0, 10 + packssdw m1, m0 ; 3 2 0 1 + vextracti128 xm0, m1, 1 ; 3 4 5 6 + pshufd xm2, xm1, q1301 ; 2 3 1 2 + pshufd xm3, xm0, q2121 ; 4 5 4 5 + punpckhwd xm1, xm2 ; 01 12 + punpcklwd xm2, xm0 ; 23 34 + punpckhwd xm3, xm0 ; 45 56 +.hv_w2_loop: + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm4, xm9 + pshufb xm5, xm9 + pmaddwd xm4, xm7 + pmaddwd xm5, xm7 + phaddd xm4, xm5 + pmaddwd xm5, xm11, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm12 ; a1 b1 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm13 ; a2 b2 + paddd xm5, xm3 + paddd xm4, xm6 + psrad xm4, 10 + packssdw xm4, xm4 + palignr xm3, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm3, xm0 ; 67 78 + pmaddwd xm4, xm14, xm3 ; a3 b3 + paddd xm5, xm6 + paddd xm5, xm4 + psrad xm5, 10 + packusdw xm5, xm5 + pminsw xm5, xm15 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + vbroadcasti128 m9, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + pshufd m8, m7, q1111 + pshufd m7, m7, q0000 + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 + vbroadcasti128 m0, [srcq+r6 ] + vinserti128 m2, 
m0, [srcq+ssq*2], 0 ; 2 3 + lea srcq, [srcq+ssq*4] + vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 + movu xm3, [srcq+ssq*1] + vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 + add srcq, r6 + pshufb m4, m1, m9 + pshufb m1, m10 + pmaddwd m4, m7 + pmaddwd m1, m8 + pshufb m5, m2, m9 + pshufb m2, m10 + pmaddwd m5, m7 + pmaddwd m2, m8 + paddd m4, m6 + paddd m1, m4 + pshufb m4, m0, m9 + pshufb m0, m10 + pmaddwd m4, m7 + pmaddwd m0, m8 + paddd m5, m6 + paddd m2, m5 + pshufb m5, m3, m9 + pshufb m3, m10 + pmaddwd m5, m7 + pmaddwd m3, m8 + paddd m4, m6 + paddd m4, m0 + paddd m5, m6 + paddd m5, m3 + vperm2i128 m0, m1, m2, 0x21 + psrld m1, 10 + psrld m2, 10 + vperm2i128 m3, m4, m5, 0x21 + pslld m4, 6 + pslld m5, 6 + pblendw m2, m4, 0xaa ; 23 34 + pslld m0, 6 + pblendw m1, m0, 0xaa ; 01 12 + psrld m3, 10 + pblendw m3, m5, 0xaa ; 45 56 + psrad m0, m5, 16 +.hv_w4_loop: + movu xm4, [srcq+ssq*0] + vinserti128 m4, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m6 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + pshufb m3, m4, m9 + pshufb m4, m10 + pmaddwd m3, m7 + pmaddwd m4, m8 + paddd m3, m6 + paddd m4, m3 + psrad m4, 10 + packssdw m0, m4 ; _ 7 6 8 + vpermq m3, m0, q1122 ; _ 6 _ 7 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m4, m5 + psrad m4, 10 + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, xm15 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + pmovsxbw xm1, [base+subpel_filters+myq*8] + shl wd, 5 + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + pxor m0, m0 + punpcklbw m0, m2 + mov r7, srcq + mov r8, dstq + lea wd, [hq+wq-256] + test dword r8m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 + psllw xm1, 2 +.hv_w8_10bit: + pshufd m11, m0, q0000 + pshufd m12, m0, q1111 + pshufd m13, m0, q2222 + pshufd m14, m0, q3333 +%if WIN64 + %define v_mul (rsp+stack_offset+40) ; r4m +%else + %define v_mul (rsp-24) ; red zone +%endif + mova [v_mul], xm1 +.hv_w8_loop0: +%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 + pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 + pmaddwd m3, m12, m2 + pmaddwd m%1, m11 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m3, m10 + paddd m%1, m3 + pmaddwd m3, m14, m%2 + paddd m%1, m3 + pmaddwd m3, m13, m2 + pshufb m%3, m9 ; a b b c c d d e + pmaddwd m2, m11 + paddd m%1, m3 + pmaddwd m3, m12, m%2 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m14 + pmaddwd m%2, m13 + paddd m2, m10 + paddd m2, m3 + paddd m%3, m2 + paddd m%2, m%3 + psrad m%1, 10 + psrad m%2, 10 + packssdw m%1, m%2 +%endmacro + movu xm4, [srcq+r6 *1+ 0] + vbroadcasti128 m8, [subpel_h_shufA] + movu xm6, [srcq+r6 *1+ 8] + vbroadcasti128 m9, [subpel_h_shufB] + movu xm0, [srcq+r6 *1+16] + vpbroadcastd m10, [pd_512] + movu xm5, [srcq+ssq*0+ 0] + vinserti128 m5, [srcq+ssq*4+ 0], 1 + movu xm1, [srcq+ssq*0+16] + vinserti128 m1, [srcq+ssq*4+16], 1 + shufpd m7, m5, m1, 0x05 + INIT_XMM avx2 + PUT_8TAP_HV_H 4, 6, 0 ; 3 + INIT_YMM avx2 + PUT_8TAP_HV_H 5, 7, 1 ; 0 4 + movu xm0, [srcq+ssq*2+ 0] + vinserti128 m0, [srcq+r6 *2+ 0], 1 + movu xm1, [srcq+ssq*2+16] + vinserti128 m1, [srcq+r6 *2+16], 1 + shufpd m7, m0, m1, 0x05 + PUT_8TAP_HV_H 0, 7, 1 ; 2 6 + movu xm6, [srcq+ssq*1+ 0] + movu xm1, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*4] + vinserti128 m6, [srcq+ssq*1+ 0], 
1 + vinserti128 m1, [srcq+ssq*1+16], 1 + add srcq, r6 + shufpd m7, m6, m1, 0x05 + PUT_8TAP_HV_H 6, 7, 1 ; 1 5 + vpermq m4, m4, q1100 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + vpermq m7, m0, q3120 + punpcklwd m3, m7, m4 ; 23 + punpckhwd m4, m5 ; 34 + punpcklwd m1, m5, m6 ; 01 + punpckhwd m5, m6 ; 45 + punpcklwd m2, m6, m7 ; 12 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vpbroadcastd m9, [v_mul+4*0] + vpbroadcastd m7, [v_mul+4*1] + vpbroadcastd m10, [v_mul+4*2] + pmaddwd m8, m9, m1 ; a0 + pmaddwd m9, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m7 ; a1 + pmaddwd m4, m7 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m8, m5 + paddd m9, m6 + movu xm5, [srcq+ssq*0] + vinserti128 m5, [srcq+ssq*1], 1 + vbroadcasti128 m7, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + movu xm6, [srcq+ssq*0+16] + vinserti128 m6, [srcq+ssq*1+16], 1 + vextracti128 [dstq], m0, 1 + pshufb m0, m5, m7 ; 01 + pshufb m5, m10 ; 23 + pmaddwd m0, m11 + pmaddwd m5, m12 + paddd m0, m5 + pshufb m5, m6, m7 ; 89 + pshufb m6, m10 ; ab + pmaddwd m5, m13 + pmaddwd m6, m14 + paddd m6, m5 + movu xm5, [srcq+ssq*0+8] + vinserti128 m5, [srcq+ssq*1+8], 1 + lea srcq, [srcq+ssq*2] + pshufb m7, m5, m7 + pshufb m5, m10 + pmaddwd m10, m13, m7 + pmaddwd m7, m11 + paddd m0, m10 + vpbroadcastd m10, [pd_512] + paddd m6, m7 + pmaddwd m7, m14, m5 + pmaddwd m5, m12 + paddd m0, m7 + paddd m5, m6 + vbroadcasti128 m6, [dstq] + paddd m8, m10 + paddd m9, m10 + paddd m0, m10 + paddd m5, m10 + vpbroadcastd m10, [v_mul+4*3] + psrad m0, 10 + psrad m5, 10 + packssdw m0, m5 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m10, m5 ; a3 + pmaddwd m10, m6 ; b3 + paddd m7, m8 + paddd m9, m10 + psrad m7, 10 + psrad m9, 10 + packusdw m7, m9 + pminsw m7, m15 + vpermq m7, m7, q3120 + mova [dstq+dsq*0], xm7 + vextracti128 [dstq+dsq*1], m7, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add r7, 16 + add r8, 16 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my +%define base r7-prep_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx2] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov r6d, r7m ; bitdepth_max + movzx wd, word [r7+wq*2+table_offset(prep,)] + vpbroadcastd m5, [r7-prep_avx2+pw_8192] + shr r6d, 11 + add wq, r7 + vpbroadcastd m4, [base+prep_mul+r6*4] + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm0, [base+subpel_filters+mxq*8] + vbroadcasti128 m3, [subpel_h_shufA] + vbroadcasti128 m4, [subpel_h_shufB] + WIN64_SPILL_XMM 8 + pshufd xm0, xm0, q2211 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw xm0, 2 +.h_w4_12bpc: + vpbroadcastq m6, xm0 + vpermq m7, m0, q1111 +.h_w4_loop: + movu xm1, [srcq+strideq*0] + vinserti128 
m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+r6 ], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + lea r6, [strideq*3] + cmp wd, 4 + je .h_w4 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m5 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m5 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + movu xm0, [srcq+strideq*0+ 0] + vinserti128 m0, [srcq+strideq*1+ 0], 1 + movu xm2, [srcq+strideq*0+16] + vinserti128 m2, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + shufpd m1, m0, m2, 0x05 + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, strideq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastd m7, [prep_8tap_1d_rnd] + lea r6, [strideq*3] + sub srcq, r6 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m0, 2 +.v_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 4 + jg .v_w8 +.v_w4: + movq xm1, [srcq+strideq*0] + vpbroadcastq m0, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m4, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpblendd m1, m0, 0x30 + vpblendd m0, m2, 0x30 + punpcklwd m1, m0 ; 01 12 + vpbroadcastq m0, [srcq+strideq*2] + add srcq, r6 + vpblendd m2, m4, 0x30 + vpblendd m4, m3, 0x30 + punpcklwd m2, m4 ; 23 34 + vpblendd m3, m5, 0x30 + vpblendd m5, m0, 0x30 + punpcklwd m3, m5 ; 45 56 +.v_w4_loop: + vpbroadcastq m4, [srcq+strideq*0] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m7 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + 
paddd m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m4, m0, 0x30 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 4 + vextracti128 xm4, m5, 1 + packssdw xm5, xm4 + mova [tmpq], xm5 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if WIN64 + push r8 +%endif + mov r8d, wd + shl wd, 5 + mov r5, srcq + mov r7, tmpq + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + vbroadcasti128 m0, [srcq+r6 ] + vbroadcasti128 m6, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m1, [srcq+strideq*0] + vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m3, [srcq+strideq*2] + add srcq, r6 + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklwd m3, m6, m0 ; 23 + punpckhwd m6, m0 ; 56 +.v_w8_loop: + vbroadcasti128 m14, [srcq+strideq*0] + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m7 + paddd m13, m7 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + vbroadcasti128 m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + paddd m13, m6 + shufpd m6, m0, m14, 0x0d + shufpd m0, m14, m5, 0x0c + punpcklwd m5, m6, m0 ; 67 + punpckhwd m6, m0 ; 78 + pmaddwd m14, m11, m5 ; a3 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + vpermq m12, m12, q3120 + mova [tmpq+r8*0], xm12 + vextracti128 [tmpq+r8*2], m12, 1 + lea tmpq, [tmpq+r8*4] + sub hd, 2 + jg .v_w8_loop + add r5, 16 + add r7, 16 + movzx hd, wb + mov srcq, r5 + mov tmpq, r7 + sub wd, 1<<8 + jg .v_w8_loop0 +%if WIN64 + pop r8 +%endif + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastd m15, [prep_8tap_2d_rnd] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m7, 4 + psraw m1, 8 + test dword r7m, 0x800 + jz .hv_w4_10bit + psraw m7, 2 +.hv_w4_10bit: + pshufd m11, m1, q0000 + pshufd m12, m1, q1111 + pshufd m13, m1, q2222 + pshufd m14, m1, q3333 +.hv_w4: + vbroadcasti128 m9, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + pshufd m8, m7, q1111 + pshufd m7, m7, q0000 + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*1], 1 ; 0 1 + vbroadcasti128 m0, [srcq+r6 ] + vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 + movu xm3, [srcq+strideq*1] + vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 + add srcq, r6 + pshufb m4, m1, m9 + pshufb m1, m10 + pmaddwd m4, m7 + pmaddwd m1, m8 + pshufb m5, m2, m9 + pshufb m2, m10 + pmaddwd m5, m7 + pmaddwd m2, m8 + paddd m4, m15 + paddd m1, m4 + pshufb m4, m0, m9 + pshufb m0, m10 + pmaddwd m4, m7 + pmaddwd m0, m8 + paddd m5, m15 + paddd m2, m5 + pshufb m5, m3, m9 + pshufb m3, m10 + pmaddwd m5, m7 + pmaddwd m3, m8 + paddd m4, m15 + paddd m4, m0 + paddd m5, m15 + paddd m5, m3 + vperm2i128 m0, m1, m2, 0x21 + psrld m1, 6 + psrld m2, 6 + vperm2i128 m3, m4, m5, 0x21 + pslld m4, 10 + pslld m5, 10 + pblendw m2, m4, 0xaa 
; 23 34 + pslld m0, 10 + pblendw m1, m0, 0xaa ; 01 12 + psrld m3, 6 + pblendw m3, m5, 0xaa ; 45 56 + psrad m0, m5, 16 +.hv_w4_loop: + movu xm4, [srcq+strideq*0] + vinserti128 m4, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m15 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + pshufb m3, m4, m9 + pshufb m4, m10 + pmaddwd m3, m7 + pmaddwd m4, m8 + paddd m3, m15 + paddd m4, m3 + psrad m4, 6 + packssdw m0, m4 ; _ 7 6 8 + vpermq m3, m0, q1122 ; _ 6 _ 7 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m4, m5 + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + pmovsxbw xm1, [base+subpel_filters+myq*8] +%if WIN64 + PUSH r8 +%endif + mov r8d, wd + shl wd, 5 + lea r6, [strideq*3] + sub srcq, 6 + sub srcq, r6 + mov r5, srcq + mov r7, tmpq + lea wd, [hq+wq-256] + pxor m0, m0 + punpcklbw m0, m2 + mova [v_mul], xm1 + psraw m0, 4 + test dword r7m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 +.hv_w8_10bit: + pshufd m11, m0, q0000 + pshufd m12, m0, q1111 + pshufd m13, m0, q2222 + pshufd m14, m0, q3333 +.hv_w8_loop0: +%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 + pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 + pmaddwd m3, m12, m2 + pmaddwd m%1, m11 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m3, m15 + paddd m%1, m3 + pmaddwd m3, m14, m%2 + paddd m%1, m3 + pmaddwd m3, m13, m2 + pshufb m%3, m9 ; a b b c c d d e + pmaddwd m2, m11 + paddd m%1, m3 + pmaddwd m3, m12, m%2 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m14 + pmaddwd m%2, m13 + paddd m2, m15 + paddd m2, m3 + paddd m2, m%3 + paddd m2, m%2 + psrad m%1, 6 + psrad m2, 6 + packssdw m%1, m2 +%endmacro + movu xm4, [srcq+r6 + 0] + vbroadcasti128 m8, [subpel_h_shufA] + movu xm6, [srcq+r6 + 8] + vbroadcasti128 m9, [subpel_h_shufB] + movu xm0, [srcq+r6 +16] + movu xm5, [srcq+strideq*0+ 0] + vinserti128 m5, [srcq+strideq*4+ 0], 1 + movu xm1, [srcq+strideq*0+16] + vinserti128 m1, [srcq+strideq*4+16], 1 + shufpd m7, m5, m1, 0x05 + INIT_XMM avx2 + PREP_8TAP_HV_H 4, 6, 0 ; 3 + INIT_YMM avx2 + PREP_8TAP_HV_H 5, 7, 1 ; 0 4 + movu xm0, [srcq+strideq*2+ 0] + vinserti128 m0, [srcq+r6 *2+ 0], 1 + movu xm1, [srcq+strideq*2+16] + vinserti128 m1, [srcq+r6 *2+16], 1 + shufpd m7, m0, m1, 0x05 + PREP_8TAP_HV_H 0, 7, 1 ; 2 6 + movu xm6, [srcq+strideq*1+ 0] + movu xm1, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*4] + vinserti128 m6, [srcq+strideq*1+ 0], 1 + vinserti128 m1, [srcq+strideq*1+16], 1 + add srcq, r6 + shufpd m7, m6, m1, 0x05 + PREP_8TAP_HV_H 6, 7, 1 ; 1 5 + vpermq m4, m4, q1100 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + vpermq m7, m0, q3120 + punpcklwd m3, m7, m4 ; 23 + punpckhwd m4, m5 ; 34 + punpcklwd m1, m5, m6 ; 01 + punpckhwd m5, m6 ; 45 + punpcklwd m2, m6, m7 ; 12 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vpbroadcastd m9, [v_mul+4*0] + vpbroadcastd m7, [v_mul+4*1] + vpbroadcastd m10, [v_mul+4*2] + pmaddwd m8, m9, m1 ; a0 + pmaddwd m9, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m7 ; a1 + pmaddwd m4, m7 ; b1 + paddd m8, m15 + paddd m9, m15 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m8, m5 + paddd m9, m6 + movu xm5, [srcq+strideq*0] + vinserti128 m5, [srcq+strideq*1], 
1 + vbroadcasti128 m7, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + movu xm6, [srcq+strideq*0+16] + vinserti128 m6, [srcq+strideq*1+16], 1 + vextracti128 [tmpq], m0, 1 + pshufb m0, m5, m7 ; 01 + pshufb m5, m10 ; 23 + pmaddwd m0, m11 + pmaddwd m5, m12 + paddd m0, m15 + paddd m0, m5 + pshufb m5, m6, m7 ; 89 + pshufb m6, m10 ; ab + pmaddwd m5, m13 + pmaddwd m6, m14 + paddd m5, m15 + paddd m6, m5 + movu xm5, [srcq+strideq*0+8] + vinserti128 m5, [srcq+strideq*1+8], 1 + lea srcq, [srcq+strideq*2] + pshufb m7, m5, m7 + pshufb m5, m10 + pmaddwd m10, m13, m7 + pmaddwd m7, m11 + paddd m0, m10 + paddd m6, m7 + pmaddwd m7, m14, m5 + pmaddwd m5, m12 + paddd m0, m7 + paddd m5, m6 + vbroadcasti128 m6, [tmpq] + vpbroadcastd m10, [v_mul+4*3] + psrad m0, 6 + psrad m5, 6 + packssdw m0, m5 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m10, m5 ; a3 + pmaddwd m10, m6 ; b3 + paddd m7, m8 + paddd m9, m10 + psrad m7, 6 + psrad m9, 6 + packssdw m7, m9 + vpermq m7, m7, q3120 + mova [tmpq+r8*0], xm7 + vextracti128 [tmpq+r8*2], m7, 1 + lea tmpq, [tmpq+r8*4] + sub hd, 2 + jg .hv_w8_loop + add r5, 16 + add r7, 16 + movzx hd, wb + mov srcq, r5 + mov tmpq, r7 + sub wd, 1<<8 + jg .hv_w8_loop0 +%if WIN64 + POP r8 +%endif + RET + %endif ; ARCH_X86_64 From be8db330989c1147426b18464ec8753c6fd84c18 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:23 +0200 Subject: [PATCH 036/188] x86: Add high bitdepth avg AVX2 asm --- src/x86/mc16_avx2.asm | 168 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index f35115ef9f..98c9d60a50 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -39,6 +39,8 @@ prep_mul: dw 16, 16, 4, 4 put_8tap_h_rnd: dd 34, 40 prep_8tap_1d_rnd: dd 8 - (8192 << 4) prep_8tap_2d_rnd: dd 32 - (8192 << 5) +bidir_rnd: dw -16400, -16400, -16388, -16388 +bidir_mul: dw 2048, 2048, 8192, 8192 %define pw_16 prep_mul @@ -49,6 +51,19 @@ pw_32766: times 2 dw 32766 pd_32: dd 32 pd_512: dd 512 +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 + %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 @@ -2461,4 +2476,157 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %endif RET +%macro BIDIR_FN 0 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + movq [dstq ], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + cmp hd, 8 + je .ret + lea dstq, [dstq+strideq*4] + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.ret: + RET +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + cmp hd, 4 + jne .w8_loop_start + RET +.w8_loop: + call 
.main + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 +.w8_loop_start: + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + dec hd + jg .w128_loop + RET +%endmacro + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx2_table + lea r6, [avg_avx2_table] + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + vpbroadcastd m4, [base+bidir_rnd+t0*4] + vpbroadcastd m5, [base+bidir_mul+t0*4] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+32*0] + paddsw m0, [tmp2q+32*0] + mova m1, [tmp1q+32*1] + paddsw m1, [tmp2q+32*1] + mova m2, [tmp1q+32*2] + paddsw m2, [tmp2q+32*2] + mova m3, [tmp1q+32*3] + paddsw m3, [tmp2q+32*3] + add tmp1q, 32*4 + add tmp2q, 32*4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + psubsw m0, m4 + psubsw m1, m4 + psubsw m2, m4 + psubsw m3, m4 + pmulhw m0, m5 + pmulhw m1, m5 + pmulhw m2, m5 + pmulhw m3, m5 + ret + %endif ; ARCH_X86_64 From e2228517ef68821f06c4e53ccbb0240c89576b8e Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:26 +0200 Subject: [PATCH 037/188] x86: Add high bitdepth w_avg AVX2 asm --- src/x86/mc16_avx2.asm | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 98c9d60a50..09c932ab03 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -50,6 +50,7 @@ pw_8192: times 2 dw 8192 pw_32766: times 2 dw 32766 pd_32: dd 32 pd_512: dd 512 +pd_65538: dd 65538 %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) @@ -63,6 +64,7 @@ pd_512: dd 512 %endmacro BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -2629,4 +2631,78 @@ ALIGN function_align pmulhw m3, m5 ret +cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 + lea r6, [w_avg_avx2_table] + tzcnt wd, wm + mov t0d, r6m ; weight + vpbroadcastw m8, r7m ; pixel_max + vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538] + movsxd wq, [r6+wq*4] + paddw m7, m8 + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + pslld m7, 7 + rorx r6d, t0d, 30 ; << 2 + test dword r7m, 0x800 + cmovz r6d, t0d + movifnidn hd, hm + movd xm6, r6d + vpbroadcastd m6, xm6 
+ BIDIR_FN +ALIGN function_align +.main: + mova m4, [tmp1q+32*0] + mova m0, [tmp2q+32*0] + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + mova m4, [tmp1q+32*1] + mova m1, [tmp2q+32*1] + pmaddwd m5, m6 + pmaddwd m0, m6 + paddd m5, m7 + paddd m0, m7 + psrad m5, 8 + psrad m0, 8 + packusdw m0, m5 + punpckhwd m5, m1, m4 + punpcklwd m1, m4 + mova m4, [tmp1q+32*2] + mova m2, [tmp2q+32*2] + pmaddwd m5, m6 + pmaddwd m1, m6 + paddd m5, m7 + paddd m1, m7 + psrad m5, 8 + psrad m1, 8 + packusdw m1, m5 + punpckhwd m5, m2, m4 + punpcklwd m2, m4 + mova m4, [tmp1q+32*3] + mova m3, [tmp2q+32*3] + add tmp1q, 32*4 + add tmp2q, 32*4 + pmaddwd m5, m6 + pmaddwd m2, m6 + paddd m5, m7 + paddd m2, m7 + psrad m5, 8 + psrad m2, 8 + packusdw m2, m5 + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + pmaddwd m5, m6 + pmaddwd m3, m6 + paddd m5, m7 + paddd m3, m7 + psrad m5, 8 + psrad m3, 8 + packusdw m3, m5 + pminsw m0, m8 + pminsw m1, m8 + pminsw m2, m8 + pminsw m3, m8 + ret + %endif ; ARCH_X86_64 From f8f4aa08d2b5ecb97f13beca0f93ce5c79f768ae Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:28 +0200 Subject: [PATCH 038/188] x86: Add high bitdepth mask AVX2 asm --- src/x86/mc16_avx2.asm | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 09c932ab03..e31bff66fc 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -45,6 +45,7 @@ bidir_mul: dw 2048, 2048, 8192, 8192 %define pw_16 prep_mul pw_2: times 2 dw 2 +pw_64: times 2 dw 64 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 pw_32766: times 2 dw 32766 @@ -65,6 +66,7 @@ pd_65538: dd 65538 BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -2705,4 +2707,47 @@ ALIGN function_align pminsw m3, m8 ret +cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx2_table + lea r7, [mask_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_64] + vpbroadcastd m9, [base+bidir_rnd+r6*4] + vpbroadcastd m10, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + BIDIR_FN +ALIGN function_align +.main: +%macro MASK 1 + pmovzxbw m5, [maskq+16*%1] + mova m%1, [tmp1q+32*%1] + mova m6, [tmp2q+32*%1] + punpckhwd m4, m%1, m6 + punpcklwd m%1, m6 + psubw m7, m8, m5 + punpckhwd m6, m5, m7 ; m, 64-m + punpcklwd m5, m7 + pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m%1, m5 + psrad m4, 5 + psrad m%1, 5 + packssdw m%1, m4 + pmaxsw m%1, m9 + psubsw m%1, m9 + pmulhw m%1, m10 +%endmacro + MASK 0 + MASK 1 + MASK 2 + MASK 3 + add maskq, 16*4 + add tmp1q, 32*4 + add tmp2q, 32*4 + ret + %endif ; ARCH_X86_64 From 0811c5e4a1741304dfde83eb03c4a5a8e04513e4 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:30 +0200 Subject: [PATCH 039/188] x86: Add high bitdepth w_mask_420 AVX2 asm --- src/x86/mc16_avx2.asm | 248 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 247 insertions(+), 1 deletion(-) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index e31bff66fc..87a38480d2 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -28,8 +28,9 @@ %if ARCH_X86_64 -SECTION_RODATA +SECTION_RODATA 32 +deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 
11, 10, 11, 12, 13 subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 @@ -48,6 +49,7 @@ pw_2: times 2 dw 2 pw_64: times 2 dw 64 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 +pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 pd_32: dd 32 pd_512: dd 512 @@ -67,6 +69,7 @@ pd_65538: dd 65538 BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -2750,4 +2753,247 @@ ALIGN function_align add tmp2q, 32*4 ret +cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd xm0, r7m ; sign + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+bidir_rnd+r6*4] + vpbroadcastd m13, [base+bidir_mul+r6*4] + movd xm14, [base+pw_2] + mov maskq, maskmp + psubw xm14, xm0 + vpbroadcastw m14, xm14 + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + phaddd m4, m5 + paddw m4, m14 + psrlw m4, 2 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + punpcklwd xm4, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + mova [maskq], xm4 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8: + vperm2i128 m6, m4, m5, 0x21 + vpblendd m4, m5, 0xf0 + paddw m4, m14 + paddw m4, m6 + psrlw m4, 2 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + mova [maskq], xm4 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16: + punpcklqdq m6, m4, m5 + punpckhqdq m4, m5 + paddw m6, m14 + paddw m4, m6 + psrlw m4, 2 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + pshufd xm4, xm4, q3120 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + mova [maskq], xm4 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w32: + paddw m4, m14 + paddw m4, m5 + psrlw m15, m4, 2 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + call .main + mova m6, [deint_shuf] + paddw m4, m14 + paddw m4, m5 + psrlw m4, 2 + packuswb m15, m4 + vpermd m4, m6, m15 + mova [dstq+strideq*2+32*0], m0 + mova 
[dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m2 + mova [dstq+stride3q +32*1], m3 + mova [maskq], m4 + sub hd, 4 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w64: + paddw m4, m14 + paddw m15, m14, m5 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [maskq], m4 ; no available registers + call .main + paddw m4, [maskq] + mova m6, [deint_shuf] + paddw m5, m15 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 ; 0 2 4 6 1 3 5 7 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + mova [maskq], m4 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 64 +.w128: + paddw m4, m14 + paddw m5, m14 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [maskq+32*0], m4 + mova [dstq+strideq], m5 + call .main + paddw m4, m14 + paddw m15, m14, m5 + mova [dstq+strideq*0+32*4], m0 + mova [dstq+strideq*0+32*5], m1 + mova [dstq+strideq*0+32*6], m2 + mova [dstq+strideq*0+32*7], m3 + mova [maskq+32*1], m4 + call .main + paddw m4, [maskq+32*0] + paddw m5, [dstq+strideq] + mova m6, [deint_shuf] + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + mova [maskq+32*0], m4 + call .main + paddw m4, [maskq+32*1] + mova m6, [deint_shuf] + paddw m5, m15 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*4], m0 + mova [dstq+strideq*1+32*5], m1 + mova [dstq+strideq*1+32*6], m2 + mova [dstq+strideq*1+32*7], m3 + mova [maskq+32*1], m4 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul + mova m%1, [tmp1q+32*%1] + mova m%2, [tmp2q+32*%1] + punpcklwd m8, m%2, m%1 + punpckhwd m9, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m7, m10, m%1 + psrlw m7, 10 ; 64-m + psubw m%2, m%3, m7 ; m + punpcklwd m%1, m7, m%2 + punpckhwd m7, m%2 + pmaddwd m%1, m8 + pmaddwd m7, m9 + psrad m%1, 5 + psrad m7, 5 + packssdw m%1, m7 + pmaxsw m%1, m%4 + psubsw m%1, m%4 + pmulhw m%1, m%5 +%endmacro + W_MASK 0, 4 + W_MASK 1, 5 + phaddw m4, m5 + W_MASK 2, 5 + W_MASK 3, 6 + phaddw m5, m6 + add tmp1q, 32*4 + add tmp2q, 32*4 + ret + %endif ; ARCH_X86_64 From c58f669c6d2294af9139ede7e5f9d6ce9b8f471f Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:31 +0200 Subject: [PATCH 040/188] x86: Add high bitdepth w_mask_422 AVX2 asm --- src/x86/mc16_avx2.asm | 135 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 87a38480d2..2b6d2dbd3d 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -70,6 +70,7 @@ BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -2996,4 +2997,138 @@ ALIGN function_align add tmp2q, 32*4 ret +cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base 
r7-w_mask_422_avx2_table + lea r7, [w_mask_422_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + vpbroadcastb m14, r7m ; sign + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+bidir_rnd+r6*4] + vpbroadcastd m13, [base+bidir_mul+r6*4] + mova m15, [base+deint_shuf] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 4 + W_MASK 1, 5 + phaddw m4, m5 + W_MASK 2, 5 + W_MASK 3, 6 + phaddw m5, m6 + add tmp1q, 32*4 + add tmp2q, 32*4 + packuswb m4, m5 + pxor m5, m5 + psubb m4, m14 + pavgb m4, m5 + vpermd m4, m15, m4 + mova [maskq], m4 + add maskq, 32 + ret + %endif ; ARCH_X86_64 From 07d98f306130148254cf8280ad52bffc90b5721f Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:35 +0200 Subject: [PATCH 041/188] x86: Add high bitdepth w_mask_444 AVX2 asm --- src/x86/mc16_avx2.asm | 120 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 2b6d2dbd3d..be778244a6 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -71,6 +71,7 @@ BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 %macro 
BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -3131,4 +3132,123 @@ ALIGN function_align add maskq, 32 ret +cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m4, [base+pw_64] + vpbroadcastd m5, [base+bidir_rnd+r6*4] + vpbroadcastd m6, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + call .main + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + call .main + mova [dstq+32*6], m0 + mova [dstq+32*7], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2, 4, 5, 6 + W_MASK 1, 3, 4, 5, 6 + packuswb m2, m3 + vpermq m2, m2, q3120 + add tmp1q, 32*2 + add tmp2q, 32*2 + mova [maskq], m2 + add maskq, 32 + ret + %endif ; ARCH_X86_64 From deb8a32bceae114c82a3c2dd79a55b98c46fd930 Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Tue, 11 May 2021 12:46:23 -0400 Subject: [PATCH 042/188] Enable HBD put_bilin and prep_bilin functions --- build.rs | 1 + src/asm/x86/mc.rs | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/build.rs b/build.rs index eb986f0bfd..bf24c0775b 100644 --- a/build.rs +++ b/build.rs @@ -92,6 +92,7 @@ fn build_nasm_files() { "src/x86/looprestoration_avx2.asm", "src/x86/looprestoration16_avx2.asm", "src/x86/mc_avx2.asm", + "src/x86/mc16_avx2.asm", "src/x86/mc_avx512.asm", "src/x86/mc_sse.asm", "src/x86/me.asm", diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index 9029a5be2b..f24128aa54 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -7,8 +7,6 @@ // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-#![allow(dead_code)] - use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::mc::FilterMode::*; @@ -351,13 +349,14 @@ decl_mc_hbd_fns!( (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp_16bpc_avx2), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular_16bpc_avx2), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth_16bpc_avx2), - (SHARP, SHARP, rav1e_put_8tap_sharp_16bpc_avx2) + (SHARP, SHARP, rav1e_put_8tap_sharp_16bpc_avx2), + (BILINEAR, BILINEAR, rav1e_put_bilin_16bpc_avx2) ); cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], - [] + [AVX2] ); macro_rules! decl_mct_fns { @@ -458,13 +457,14 @@ decl_mct_hbd_fns!( (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp_16bpc_avx2), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular_16bpc_avx2), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth_16bpc_avx2), - (SHARP, SHARP, rav1e_prep_8tap_sharp_16bpc_avx2) + (SHARP, SHARP, rav1e_prep_8tap_sharp_16bpc_avx2), + (BILINEAR, BILINEAR, rav1e_prep_bilin_16bpc_avx2) ); cpu_function_lookup_table!( PREP_HBD_FNS: [[Option; 16]], default: [None; 16], - [] + [AVX2] ); extern { @@ -490,7 +490,11 @@ cpu_function_lookup_table!( [(SSSE3, Some(rav1e_avg_ssse3)), (AVX2, Some(rav1e_avg_avx2))] ); -cpu_function_lookup_table!(AVG_HBD_FNS: [Option], default: None, []); +cpu_function_lookup_table!( + AVG_HBD_FNS: [Option], + default: None, + [(AVX2, Some(rav1e_avg_16bpc_avx2))] +); #[cfg(test)] mod test { From 7c441af3b8371ad6135f204052ef6bbc5c163af4 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:02:54 +0200 Subject: [PATCH 043/188] x86: Add improved high bitdepth wiener AVX2 asm --- src/x86/looprestoration16_avx2.asm | 1034 ++++++++++++++++------------ 1 file changed, 603 insertions(+), 431 deletions(-) diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm index 4eb1b8056c..4551c3da76 100644 --- a/src/x86/looprestoration16_avx2.asm +++ b/src/x86/looprestoration16_avx2.asm @@ -1,5 +1,5 @@ -; Copyright (c) 2017-2021, The rav1e contributors -; Copyright (c) 2021, Nathan Egge +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without @@ -30,451 +30,623 @@ SECTION_RODATA 32 -wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 -wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 -wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 -wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 -wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 -wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 -rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +wiener_hshift: dw 4, 4, 1, 1 +wiener_vshift: dw 1024, 1024, 4096, 4096 +wiener_round: dd 1049600, 1048832 -pq_3: dq (6 - 4) + 1 -pq_5: dq (6 - 2) + 1 -pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)) -pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)) +pb_m10_m9: times 2 db -10, -9 +pb_m6_m5: times 2 db -6, -5 +pb_m2_m1: times 2 db -2, -1 +pb_2_3: times 2 db 2, 3 +pb_6_7: times 2 db 6, 7 +pd_m262128 dd -262128 -pq_11: dq 12 - (6 - 4) + 1 -pq_9: dq 12 - (6 - 2) + 1 -nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8)) -nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8)) - -pb_wiener5_l: times 2 db 2, 3 -pb_wiener5_r: times 2 db -6, -5 +SECTION .text -pb_wiener7_l: times 2 db 4, 5 -pb_wiener7_m: times 2 db -4, -3 -pb_wiener7_r: times 2 db -8, -7 +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro -SECTION .text +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers INIT_YMM avx2 -cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - vbroadcasti128 m6, [wiener5_shufB] - vpbroadcastd m12, [fq + 2] - vbroadcasti128 m7, [wiener5_shufC] - vpbroadcastw m13, [fq + 6] - vbroadcasti128 m8, [wiener5_shufD] - popcnt bdmaxd, bdmaxm - vpbroadcastd m9, [pd_65540] - movq xm10, [pq_3] - cmp bdmaxd, 10 - je .bits10 - vpbroadcastd m9, [pd_262160] - movq xm10, [pq_5] -.bits10: - pxor m11, m11 - add wq, wq - add srcq, wq - add dstq, wq - neg wq - DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x -.v_loop: - mov xq, wq - test edgeb, 1 ; LR_HAVE_LEFT - jz .h_extend_left - test leftq, leftq - jz .h_loop - movd xm4, [leftq + 4] - vpblendd m4, [srcq + xq - 4], 0xfe - add leftq, 8 - jmp .h_main +cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h +%define base 
t4-wiener_hshift + mov fltq, fltmp + mov edged, r8m + movifnidn wd, wm + mov hd, r6m + mov t3d, r9m ; pixel_max + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastd m12, [fltq+ 0] ; x0 x1 + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufB] + add wd, wd + vpbroadcastd m13, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vbroadcasti128 m8, [wiener_shufC] + add lpfq, wq + vbroadcasti128 m9, [wiener_shufD] + lea t1, [rsp+wq+16] + vpbroadcastd m14, [fltq+16] ; y0 y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + vpbroadcastd m10, [base+wiener_round+t3*4] + vpbroadcastd m11, [base+wiener_vshift+t3*4] + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm1, r10d + vpbroadcastd m0, [pb_6_7] + movu m2, [pb_0to31] + vpbroadcastb m1, xm1 + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + vpbroadcastd m0, [pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + vpbroadcastd m0, [pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm3, [leftq] + vpblendd m3, [lpfq+r10-8], 0xfc + add leftq, 8 + jmp .h_main .h_extend_left: - vbroadcasti128 m5, [srcq + xq] - mova m4, [srcq + xq] - palignr m4, m5, 12 - pshufb m4, [wiener5_l_shuf] - jmp .h_main + vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + shufpd m3, m4, 0x05 + pshufb m3, [wiener_lshuf7] + jmp .h_main2 +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left .h_loop: - movu m4, [srcq + xq - 4] + movu m3, [lpfq+r10-8] .h_main: - movu m5, [srcq + xq + 4] - test edgeb, 2 ; LR_HAVE_RIGHT - jnz .h_have_right - cmp xd, -36 - jl .h_have_right - movd xm2, xd - vpbroadcastd m0, [pb_wiener5_l] - vpbroadcastd m1, [pb_wiener5_r] - vpbroadcastb m2, xm2 - movu m3, [pb_0to31] - psubb m0, m2 - psubb m1, m2 - pminub m0, m3 - pminub m1, m3 - pshufb m4, m0 - pshufb m5, m1 + mova m4, [lpfq+r10+0] +.h_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right .h_have_right: - pshufb m0, m4, m6 - pshufb m2, m4, m7 - paddw m0, m2 - pmaddwd m0, m12 - pshufb m1, m5, m6 - pshufb m3, m5, 
m7 - paddw m1, m3 - pmaddwd m1, m12 - pshufb m4, m8 - pmaddwd m4, m13 - pshufb m5, m8 - pmaddwd m5, m13 - paddd m0, m4 - paddd m1, m5 - paddd m0, m9 - paddd m1, m9 - psrad m0, xm10 - psrad m1, xm10 - packssdw m0, m1 - pmaxsw m0, m11 - mova [dstq + xq], m0 - add xq, 32 - jl .h_loop - add srcq, ssq - add dstq, 384*2 - dec hd - jg .v_loop - RET - -DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 - -INIT_YMM avx2 -cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - pxor m6, m6 - vpbroadcastd m7, [fq + 2] - vpbroadcastd m8, [fq + 6] - popcnt bdmaxd, bdmaxm - vpbroadcastd m9, [nd_1047552] - movq xm10, [pq_11] - cmp bdmaxd, 10 - je .bits10 - vpbroadcastd m9, [nd_1048320] - movq xm10, [pq_9] -.bits10: - vpbroadcastw m11, bdmaxm - add wq, wq - add midq, wq - add dstq, wq - neg wq - DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x - mov msq, 2*384 - mov t0, midq - lea t1, [t0 + msq] - lea t2, [t1 + msq] - lea t3, [t2 + msq] - lea t4, [t3 + msq] - test edgeb, 4 ; LR_HAVE_TOP - jnz .have_top - mov t0, t2 - mov t1, t2 -.have_top: - test edgeb, 8 ; LR_HAVE_BOTTOM - jnz .v_loop - cmp hd, 2 - jg .v_loop - cmp hd, 1 - jne .limit_v - mov t3, t2 -.limit_v: - mov t4, t3 + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm3, [leftq] + vpblendd m3, [lpfq+r10-8], 0xfc + add leftq, 8 + jmp .hv_main +.hv_extend_left: + movu m3, [lpfq+r10-8] + pshufb m3, [wiener_lshuf7] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-8] +.hv_main: + mova m4, [lpfq+r10+0] + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + vpbroadcastd m2, [pd_m262128] + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m5, [t3+r10] + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova m4, [t5+r10] + paddw m4, [t1+r10] + psraw m0, 1 + paddw m3, m0, [t6+r10] + mova [t0+r10], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 5 + psrad m2, 5 + packusdw m0, m2 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, dst_strideq + ret +.v: + mov r10, wq .v_loop: - mov xq, wq -.h_loop: - mova m1, [t0 + xq] - mova m2, [t1 + xq] - mova m3, [t2 + xq] - mova m4, [t3 + xq] - mova m5, [t4 + xq] - punpcklwd m0, m1, m2 
- pmaddwd m0, m7 - punpckhwd m1, m2 - pmaddwd m1, m7 - punpcklwd m2, m5, m4 - pmaddwd m2, m7 - punpckhwd m5, m4 - pmaddwd m5, m7 - paddd m0, m2 - paddd m1, m5 - punpcklwd m2, m3, m6 - pmaddwd m2, m8 - punpckhwd m3, m6 - pmaddwd m3, m8 - paddd m0, m2 - paddd m1, m3 - paddd m0, m9 - paddd m1, m9 - psrad m0, xm10 - psrad m1, xm10 - packusdw m0, m1 - pminuw m0, m11 - mova [dstq + xq], m0 - add xq, 32 - jl .h_loop - add dstq, dsq - mov t0, t1 - mov t1, t2 - mov t2, t3 - mov t3, t4 - add t4, msq - test edgeb, 8 ; LR_HAVE_BOTTOM - jnz .have_bottom - cmp hd, 3 - jg .have_bottom - mov t4, t3 -.have_bottom: - dec hd - jg .v_loop - RET - -INIT_YMM avx2 -cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - vpbroadcastd m7, [fq] - vpbroadcastd m8, [fq + 4] - vbroadcasti128 m10, [rev_w] - vbroadcasti128 m11, [wiener5_shufB] - vbroadcasti128 m12, [wiener7_shufC] - vbroadcasti128 m13, [wiener7_shufD] - vbroadcasti128 m14, [wiener7_shufE] - vbroadcasti128 m15, [rev_d] - popcnt bdmaxd, bdmaxm - vpbroadcastd m9, [pd_65540] - mov rhq, [pq_3] - cmp bdmaxd, 10 - je .bits10 - vpbroadcastd m9, [pd_262160] - mov rhq, [pq_5] -.bits10: - add wq, wq - add srcq, wq - add dstq, wq - neg wq - DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh -.v_loop: - mov xq, wq - test edgeb, 1 ; LR_HAVE_LEFT - jz .h_extend_left - test leftq, leftq - jz .h_loop - movq xm4, [leftq + 2] - vpblendw xm4, [srcq + xq - 6], 0xf8 - vinserti128 m4, [srcq + xq + 10], 1 - add leftq, 8 - jmp .h_main + mova m1, [t4+r10] + paddw m1, [t2+r10] + mova m2, [t3+r10] + mova m4, [t1+r10] + paddw m3, m4, [t6+r10] + paddw m4, [t5+r10] + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq + ret +cglobal wiener_filter5_16bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h +%define base t4-wiener_hshift + mov fltq, fltmp + mov edged, r8m + movifnidn wd, wm + mov hd, r6m + mov t3d, r9m ; pixel_max + vbroadcasti128 m5, [wiener_shufE] + vpbroadcastw m11, [fltq+ 2] ; x1 + vbroadcasti128 m6, [wiener_shufB] + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufD] + add wd, wd + vpbroadcastd m12, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + vpbroadcastw m13, [fltq+18] ; y1 + add dstq, wq + vpbroadcastd m14, [fltq+20] ; y2 y3 + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + vpbroadcastd m9, [base+wiener_round+t3*4] + vpbroadcastd m10, [base+wiener_vshift+t3*4] + movu xm15, [wiener_lshuf5] + pmullw m11, m0 + vinserti128 m15, [pb_0to31], 1 + pmullw m12, m0 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp+8*0] + call 
.hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq +.v1: + call .v + jmp .end +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_2_3] + vpbroadcastd m1, [pb_m6_m5] + vpbroadcastb m2, xm2 + psubb m0, m2 + psubb m1, m2 + movu m2, [pb_0to31] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm3, [leftq+4] + vpblendd m3, [lpfq+r10-4], 0xfe + add leftq, 8 + jmp .h_main .h_extend_left: - vbroadcasti128 m5, [srcq + xq] - mova m4, [srcq + xq] - palignr m4, m5, 10 - pshufb m4, [wiener7_l_shuf] - jmp .h_main + vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located + mova m3, [lpfq+r10] ; before the start of the buffer + palignr m3, m4, 12 + pshufb m3, m15 + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left .h_loop: - movu m4, [srcq + xq - 6] + movu m3, [lpfq+r10-4] .h_main: - movu m5, [srcq + xq + 2] - movu m6, [srcq + xq + 6] - test edgeb, 2 ; LR_HAVE_RIGHT - jnz .h_have_right - cmp xd, -38 - jl .h_have_right - movd xm3, xd - vpbroadcastd m0, [pb_wiener7_l] - vpbroadcastd m1, [pb_wiener7_m] - vpbroadcastd m2, [pb_wiener7_r] - vpbroadcastb m3, xm3 - psubb m0, m3 - psubb m1, m3 - psubb m2, m3 - movu m3, [pb_0to31] - pminub m0, m3 - pminub m1, m3 - pminub m2, m3 - pshufb m4, m0 - pshufb m5, m1 - pshufb m6, m2 - cmp xd, -9*2 - jne .hack - vpbroadcastw xm3, [srcq + xq + 16] - vinserti128 m5, xm3, 1 - jmp .h_have_right -.hack: - cmp xd, -1*2 - jne .h_have_right - vpbroadcastw xm5, [srcq + xq] + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right .h_have_right: - pshufb m6, m10 - pshufb m0, m4, m11 - pshufb m2, m5, m12 - paddw m0, m2 - pmaddwd m0, m7 - pshufb m2, m4, m13 - pshufb m4, m14 - paddw m2, m4 - pmaddwd m2, m8 - pshufb m1, m6, m11 - pshufb m5, m11 - pmaddwd m1, m7 - pmaddwd m5, m7 - pshufb m3, m6, m13 - pshufb m6, m14 - paddw m3, m6 - pmaddwd m3, m8 - paddd m0, m2 - paddd m1, m3 - pshufb m1, m15 - paddd m1, m5 - movq xm4, rhq - pxor m5, m5 - paddd m0, m9 - paddd m1, m9 - psrad m0, xm4 - psrad m1, xm4 - packssdw m0, m1 - pmaxsw m0, m5 - mova [dstq + xq], m0 - add xq, 32 - jl .h_loop - add srcq, ssq - add dstq, 384*2 - dec hd - jg .v_loop - RET - -INIT_YMM avx2 -cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - pxor m6, m6 - vpbroadcastd m7, [fq] - vpbroadcastw m8, [fq + 4] - vpbroadcastd m9, [fq + 6] - popcnt bdmaxd, bdmaxm - vpbroadcastd m10, [nd_1047552] - movq xm11, [pq_11] - cmp bdmaxd, 10 - je .bits10 - vpbroadcastd m10, [nd_1048320] - movq xm11, [pq_9] -.bits10: - vpbroadcastw m12, bdmaxm - add wq, wq - add midq, wq - add dstq, wq - neg wq - DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x - mov msq, 2*384 - mov t0, midq - mov t1, t0 - lea t2, [t1 + msq] - lea t3, [t2 + msq] - lea t4, [t3 + msq] - lea t5, [t4 + msq] - lea t6, [t5 + msq] - test edgeb, 4 ; LR_HAVE_TOP - jnz .have_top - mov t0, t3 - mov t1, t3 - mov t2, t3 -.have_top: - 
cmp hd, 3 - jg .v_loop - test edgeb, 8 ; LR_HAVE_BOTTOM - jz .no_bottom0 - cmp hd, 1 - jg .v_loop - jmp .h3 -.no_bottom0: - cmp hd, 2 - je .h2 - jns .h3 -.h1: - mov t4, t3 -.h2: - mov t5, t4 -.h3: - mov t6, t5 + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm3, [leftq+4] + vpblendd m3, [lpfq+r10-4], 0xfe + add leftq, 8 + jmp .hv_main +.hv_extend_left: + movu m3, [lpfq+r10-4] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-4] +.hv_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + mova m2, [t3+r10] + paddw m2, [t1+r10] + paddd m1, m3 + mova m4, [t2+r10] + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 + mova m4, [t4+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+r10], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, dst_strideq + ret +.v: + mov r10, wq .v_loop: - mov xq, wq -.h_loop: - mova m1, [t0 + xq] - mova m2, [t1 + xq] - mova m3, [t5 + xq] - mova m4, [t6 + xq] - punpcklwd m0, m1, m2 - pmaddwd m0, m7 - punpckhwd m1, m2 - pmaddwd m1, m7 - punpcklwd m2, m4, m3 - pmaddwd m2, m7 - punpckhwd m4, m3 - pmaddwd m4, m7 - paddd m0, m2 - paddd m1, m4 - mova m3, [t2 + xq] - mova m4, [t4 + xq] - punpcklwd m2, m3, m4 - pmaddwd m2, m8 - punpckhwd m3, m4 - pmaddwd m3, m8 - paddd m0, m2 - paddd m1, m3 - mova m3, [t3 + xq] - punpcklwd m2, m3, m6 - pmaddwd m2, m9 - punpckhwd m3, m6 - pmaddwd m3, m9 - paddd m0, m2 - paddd m1, m3 - paddd m0, m10 - paddd m1, m10 - psrad m0, xm11 - psrad m1, xm11 - packusdw m0, m1 - pminuw m0, m12 - mova [dstq + xq], m0 - add xq, 32 - jl .h_loop - add dstq, dsq - mov t0, t1 - mov t1, t2 - mov t2, t3 - mov t3, t4 - mov t4, t5 - mov t5, t6 - add t6, msq - cmp hd, 4 - jg .next_row - test edgeb, 8 ; LR_HAVE_BOTTOM - jz .no_bottom - cmp hd, 2 - jg .next_row -.no_bottom: - mov t6, t5 -.next_row: - dec hd - jg .v_loop - RET + mova m0, [t1+r10] + paddw m2, m0, [t3+r10] + mova m1, [t2+r10] + mova m4, [t4+r10] + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret %endif ; ARCH_X86_64 From b56bc6858579682f6bf655416c2e5b3f63ff2adc Mon Sep 17 00:00:00 2001 From: Henrik 
Gramner Date: Tue, 4 May 2021 14:02:57 +0200 Subject: [PATCH 044/188] x86: Add high bitdepth (10-bit) sgr AVX2 asm --- src/x86/looprestoration16_avx2.asm | 1929 ++++++++++++++++++++++++++++ src/x86/looprestoration_avx2.asm | 24 +- 2 files changed, 1942 insertions(+), 11 deletions(-) diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm index 4551c3da76..c1ebdc487d 100644 --- a/src/x86/looprestoration16_avx2.asm +++ b/src/x86/looprestoration16_avx2.asm @@ -30,6 +30,8 @@ SECTION_RODATA 32 +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 @@ -49,7 +51,18 @@ pb_m6_m5: times 2 db -6, -5 pb_m2_m1: times 2 db -2, -1 pb_2_3: times 2 db 2, 3 pb_6_7: times 2 db 6, 7 +pw_1023: times 2 dw 1023 +pd_8: dd 8 +pd_25: dd 25 +pd_4096: dd 4096 +pd_34816: dd 34816 pd_m262128 dd -262128 +pd_0xf00800a4: dd 0xf00800a4 +pd_0xf00801c7: dd 0xf00801c7 + +%define pw_256 sgr_lshuf5 + +cextern sgr_x_by_x_avx2 SECTION .text @@ -649,4 +662,1920 @@ ALIGN function_align jl .v_loop ret +cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + mov hd, r6m + add wd, wd + vpbroadcastw m7, [paramsq+8] ; w0 + add lpfq, wq + vpbroadcastd m8, [pd_8] + lea t1, [rsp+wq+20] + vpbroadcastd m9, [pd_25] + add dstq, wq + vpbroadcastd m10, [paramsq+0] ; s0 + lea t3, [rsp+wq*2+400*12+16] + vpbroadcastd m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + vpbroadcastd m12, [pw_256] + neg wq + vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + vpbroadcastd m14, [pw_1023] + psllw m7, 4 + mova xm15, [sgr_lshuf5] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + test hd, hd + jz .odd_height + call .h + add lpfq, dst_strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .h_top + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov [rsp+8*0], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 0] + movu m2, [r13+r10+16] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, 
[lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm15 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10+400*0] + paddd m1, [t1+r10+400*2] + paddd m2, [t1+r10+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+r10+400*0], m0 + mova [t1+r10+400*2], m1 + mova [t1+r10+400*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-4] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm15 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv_main +.hv_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10- 2] +.hv_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10+400*0] ; hv sum + paddd m4, [t2+r10+400*2] ; hv sumsq + paddd m5, [t2+r10+400*4] + mova [t0+r10+400*0], m0 + mova [t0+r10+400*2], m2 + mova [t0+r10+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, 
[r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + psubw m2, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10+400*0], m1 + paddw m1, m0 + mova [t1+r10+400*2], m4 + paddd m4, m2 + mova [t1+r10+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-4] +.v_loop: + mova m0, [t1+r10+400*0] + mova m2, [t1+r10+400*2] + mova m3, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + psubw m2, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+r10*1+400*2+ 0] + paddd m4, m1, [t3+r10*2+400*4+ 0] + paddd m5, m2, [t3+r10*2+400*4+32] + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + 
pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m0, [dstq+r10] + mova m3, [t4+r10*1+400*2+ 0] + mova m4, [t3+r10*2+400*4+ 0] + mova m5, [t3+r10*2+400*4+32] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 <<7) + paddd m3, m4 + psrld m2, 8 + psrld m3, 8 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + add wd, wd + mov hd, r6m + add lpfq, wq + vpbroadcastw m7, [paramsq+10] ; w1 + lea t1, [rsp+wq+12] + vpbroadcastd m8, [pd_8] + add dstq, wq + vpbroadcastd m9, [paramsq+ 4] ; s1 + lea t3, [rsp+wq*2+400*12+8] + vpbroadcastd m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + vpbroadcastd m11, [pd_34816] + neg wq + vpbroadcastd m12, [pw_256] + pxor m6, m6 + vpbroadcastd m13, [pw_1023] + psllw m7, 4 + mova xm14, [sgr_lshuf3] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 2] + movu m2, [r13+r10+18] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10+ 0] +.h_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd 
m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10+ 0] +.hv0_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10+ 0] +.hv1_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + 
paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m2 + mova [t2+r10+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2 +4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, 
[r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + mova xm0, [t4+r10*1+400*0+0] + paddw xm0, [t4+r10*1+400*0+4] + paddw xm2, xm0, [t4+r10*1+400*0+2] + mova m1, [t3+r10*2+400*0+0] + paddd m1, [t3+r10*2+400*0+8] + paddd m3, m1, [t3+r10*2+400*0+4] + psllw xm2, 2 ; a[-1] 444 + pslld m3, 2 ; b[-1] 444 + psubw xm2, xm0 ; a[-1] 343 + psubd m3, m1 ; b[-1] 343 + mova [t4+r10*1+400* 4], xm2 + mova [t3+r10*2+400* 8], m3 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a[ 0] 444 + pslld m3, 2 ; b[ 0] 444 + mova [t4+r10*1+400* 6], xm2 + mova [t3+r10*2+400*12], m3 + psubw xm2, xm0 ; a[ 0] 343 + psubd m3, m1 ; b[ 0] 343 + mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + mova m3, [t4+r10*1+400*0+0] + paddw m3, [t4+r10*1+400*0+4] + paddw m1, m3, [t4+r10*1+400*0+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*4] + paddw m3, [t4+r10*1+400*6] + mova [t4+r10*1+400*4], m2 + mova [t4+r10*1+400*6], m1 + mova m4, [t3+r10*2+400*0+0] + paddd m4, [t3+r10*2+400*0+8] + paddd m1, m4, [t3+r10*2+400*0+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400* 8+ 0] + paddd m4, [t3+r10*2+400*12+ 0] + mova [t3+r10*2+400* 8+ 0], m2 + mova [t3+r10*2+400*12+ 0], m1 + mova m5, [t3+r10*2+400*0+32] + paddd m5, [t3+r10*2+400*0+40] + paddd m1, m5, [t3+r10*2+400*0+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400* 8+32] + paddd m5, [t3+r10*2+400*12+32] + mova [t3+r10*2+400* 8+32], m2 + mova [t3+r10*2+400*12+32], m1 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t4+r10*1+400*2+0] + paddw m3, [t4+r10*1+400*2+4] + paddw m1, m3, [t4+r10*1+400*2+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*6] + paddw m3, [t4+r10*1+400*8] + mova [t4+r10*1+400*6], m1 + mova [t4+r10*1+400*8], m2 + mova m4, [t3+r10*2+400*4+0] + paddd m4, [t3+r10*2+400*4+8] + paddd m1, m4, [t3+r10*2+400*4+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400*12+ 0] + paddd m4, [t3+r10*2+400*16+ 0] + mova [t3+r10*2+400*12+ 0], m1 + mova [t3+r10*2+400*16+ 0], m2 + mova m5, [t3+r10*2+400*4+32] + paddd m5, [t3+r10*2+400*4+40] + paddd m1, m5, [t3+r10*2+400*4+36] + pslld m1, 2 + psubd m2, m1, m5 + 
paddd m5, m2, [t3+r10*2+400*12+32] + paddd m5, [t3+r10*2+400*16+32] + mova [t3+r10*2+400*12+32], m1 + mova [t3+r10*2+400*16+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + add wd, wd + mov hd, r6m + add lpfq, wq + vpbroadcastd m9, [pd_8] + lea t1, [rsp+wq+12] + vpbroadcastd m10, [pd_34816] + add dstq, wq + vpbroadcastd m11, [pw_256] + lea t3, [rsp+wq*2+400*24+8] + vpbroadcastd m12, [pd_0xf00801c7] + lea t4, [rsp+wq+400*52+8] + vpbroadcastd m15, [paramsq+8] ; w0 w1 + neg wq + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + psllw m15, 2 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+r10+400* 0] + mova m1, [t1+r10+400* 2] + mova m2, [t1+r10+400* 4] + paddw m0, m0 + mova m3, [t1+r10+400* 6] + paddd m1, m1 + mova m4, [t1+r10+400* 8] + paddd m2, m2 + mova m5, [t1+r10+400*10] + mova [t2+r10+400* 0], m0 + mova [t2+r10+400* 2], m1 + mova [t2+r10+400* 4], m2 + mova [t2+r10+400* 6], m3 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 
+ punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10- 2] +.hv0_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -36 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; h sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m5, m2 ; h sumsq5 + paddd m6, m3 + mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4? 
+ mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*2+400*0+40], m6 + paddw m8, [t1+r10+400* 0] + paddd m5, [t1+r10+400* 2] + paddd m6, [t1+r10+400* 4] + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + paddw m0, m1, [t1+r10+400* 6] + paddd m4, m2, [t1+r10+400* 8] + paddd m5, m3, [t1+r10+400*10] + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10- 2] +.hv1_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -36 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv1_have_right: + palignr m6, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m6, m3 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m7 + pmaddwd m1, m1 + punpckhwd m3, m7 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + shufpd m1, m4, m5, 0x05 + punpckhwd m5, m4, m1 + paddw m8, m4, m1 + pmaddwd m5, m5 + punpcklwd m4, m1 + pmaddwd m4, m4 + paddd m6, m3 + paddw m1, m2, [t2+r10+400* 6] + mova [t2+r10+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 8], m0 + mova [t2+r10+400*10], m6 + paddd m4, m0 ; h sumsq5 + paddd m5, m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m6, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m6 + psrlw m6, m1, 1 + pavgw m6, m7 ; (b3 + 2) >> 2 + punpcklwd m0, m6, m7 + pmaddwd m0, m0 + punpckhwd m6, m7 + pmaddwd m6, m6 + pmaxud m2, m0 + psubd m2, m0 ; p3 + pmaxud m3, m6 + psubd m3, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmulld m2, m14 ; p3 * s1 + pmulld m3, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + 
paddusw m2, m12 + paddusw m3, m12 + psrad m7, m2, 20 ; min(z3, 255) - 256 + vpgatherdd m6, [r13+m7*4], m2 + psrad m2, m3, 20 + vpgatherdd m7, [r13+m2*4], m3 + pmulld m0, m6 + packssdw m6, m7 + pmulld m7, m1 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m7, m10 + psubw m6, m11, m6 + psrld m0, 12 + psrld m7, 12 + paddw m1, m8, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m8 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + mova [t4+r10*1+400*4 +4], m6 + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm7 + vextracti128 [t3+r10*2+400*8+56], m7, 1 + vpbroadcastd m4, [pd_25] + pxor m7, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m4, m11, m4 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400* 6] + mova m4, [t1+r10+400* 8] + mova m5, [t1+r10+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + mova [t3+r10*2+400*8+ 8], m3 + mova [t3+r10*2+400*0+ 8], m4 + mova [t3+r10*2+400*0+40], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+r10+400*0], m3 + mova [t1+r10+400*2], m4 + mova [t1+r10+400*4], m5 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 
+ add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m4, [t1+r10+400* 6] + mova m5, [t1+r10+400* 8] + mova m6, [t1+r10+400*10] + paddw m1, m4, [t2+r10+400* 6] + paddd m2, m5, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 6], m4 + mova [t2+r10+400* 8], m5 + mova [t2+r10+400*10], m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m8, m1, 12 + mova [t4+r10*1+400*4+4], m2 + mova m4, [t3+r10*2+400*8+ 8] + mova m5, [t3+r10*2+400*0+ 8] + mova m6, [t3+r10*2+400*0+40] + paddw m1, m4, [t2+r10+400*0] + paddd m2, m5, [t2+r10+400*2] + paddd m3, m6, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m4 + mova [t2+r10+400*2], m5 + mova [t2+r10+400*4], m6 + vpbroadcastd m4, [pd_25] + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm8 + vextracti128 [t3+r10*2+400*8+56], m8, 1 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m4, m11, m4 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu xm0, [t4+r10*1+400*0+2] + paddw xm2, xm0, [t4+r10*1+400*0+0] + paddw xm2, [t4+r10*1+400*0+4] + movu m1, [t3+r10*2+400*0+4] + paddd m3, m1, [t3+r10*2+400*0+0] + paddd m3, [t3+r10*2+400*0+8] + paddw xm0, xm2 + paddd m1, m3 + psllw xm2, 2 + pslld m3, 2 + paddw xm0, xm2 ; a5 565 + paddd m1, m3 ; b5 565 + mova [t4+r10*1+400* 6], xm0 + mova [t3+r10*2+400*12], m1 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a3[-1] 444 + pslld m3, 2 ; b3[-1] 444 + psubw xm2, xm0 ; a3[-1] 343 + psubd m3, m1 ; b3[-1] 343 + 
mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + mova xm0, [t4+r10*1+400*4+0] + paddw xm0, [t4+r10*1+400*4+4] + paddw xm2, xm0, [t4+r10*1+400*4+2] + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m3, m1, [t3+r10*2+400*8+4] + psllw xm2, 2 ; a3[ 0] 444 + pslld m3, 2 ; b3[ 0] 444 + mova [t4+r10*1+400*10], xm2 + mova [t3+r10*2+400*20], m3 + psubw xm2, xm0 ; a3[ 0] 343 + psubd m3, m1 ; b3[ 0] 343 + mova [t4+r10*1+400*12], xm2 + mova [t3+r10*2+400*24], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu xm2, [t4+r10*1+2] + paddw xm0, xm2, [t4+r10*1+0] + paddw xm0, [t4+r10*1+4] + paddw xm2, xm0 + psllw xm0, 2 + paddw xm0, xm2 ; a5 + movu m1, [t3+r10*2+4] + paddd m4, m1, [t3+r10*2+0] + paddd m4, [t3+r10*2+8] + paddd m1, m4 + pslld m4, 2 + paddd m4, m1 ; b5 + paddw xm2, xm0, [t4+r10*1+400* 6] + mova [t4+r10*1+400* 6], xm0 + paddd m0, m4, [t3+r10*2+400*12] + mova [t3+r10*2+400*12], m4 + mova xm3, [t4+r10*1+400*2+0] + paddw xm3, [t4+r10*1+400*2+4] + paddw xm5, xm3, [t4+r10*1+400*2+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400* 8] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400* 8], xm4 + mova [t4+r10*1+400*10], xm5 + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m5, m1, [t3+r10*2+400*4+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*16] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*16], m4 + mova [t3+r10*2+400*20], m5 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, xm2 ; a5 + pmovzxwd m3, xm3 ; a3 + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 13 + psubd m0, m4 + psubd m1, m4 + paddd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + vpbroadcastd m1, [pd_4096] + paddd m4, m1 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova xm3, [t4+r10*1+400*4+0] + paddw xm3, [t4+r10*1+400*4+4] + paddw xm5, xm3, [t4+r10*1+400*4+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400*12] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400*10], xm5 + mova [t4+r10*1+400*12], xm4 + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m5, m1, [t3+r10*2+400*8+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*24] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*20], m5 + mova [t3+r10*2+400*24], m4 + pmovzxwd m4, [dstq+r10] + pmovzxwd m0, [t4+r10*1+400* 6] + pmovzxwd m3, xm3 + pmaddwd m0, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 12 + psubd m2, m4, [t3+r10*2+400*12] + paddd m4, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + vpbroadcastd m1, [pd_4096] + paddd m4, m1 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, dst_strideq + ret + %endif ; ARCH_X86_64 diff --git a/src/x86/looprestoration_avx2.asm b/src/x86/looprestoration_avx2.asm index 71e3e0d225..67ea6cc580 100644 --- 
a/src/x86/looprestoration_avx2.asm +++ b/src/x86/looprestoration_avx2.asm @@ -41,7 +41,8 @@ sgr_r_ext: times 16 db 1 ; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of ; cache but eliminates some shifts in the inner sgr loop which is overall a win -sgr_x_by_x: dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 +const sgr_x_by_x_avx2 + dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 @@ -58,17 +59,18 @@ sgr_x_by_x: dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + times 4 db -1 ; needed for 16-bit sgr +pb_m5: times 4 db -5 +pb_3: times 4 db 3 +pw_5_6: dw 5, 6 + sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 db 9, -1, 10, -1, 11, -1, 12, -1 -pb_3: times 4 db 3 -pb_m5: times 4 db -5 -pw_16: times 2 dw 16 pw_256: times 2 dw 256 pw_2056: times 2 dw 2056 pw_m16380: times 2 dw -16380 -pw_5_6: dw 5, 6 pd_25: dd 25 pd_34816: dd 34816 pd_m4096: dd -4096 @@ -729,8 +731,8 @@ ALIGN function_align cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, params, h -%define base r12-sgr_x_by_x-256*4 - lea r12, [sgr_x_by_x+256*4] +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, paramsmp mov wd, wm mov edged, r8m @@ -1189,12 +1191,12 @@ ALIGN function_align cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, params, h -%define base r14-sgr_x_by_x-256*4 +%define base r14-sgr_x_by_x_avx2-256*4 mov paramsq, paramsmp mov edged, r8m mov wd, wm mov hd, r6m - lea r14, [sgr_x_by_x+256*4] + lea r14, [sgr_x_by_x_avx2+256*4] vbroadcasti128 m8, [base+sgr_shuf+2] add lpfq, wq vbroadcasti128 m9, [base+sgr_shuf+4] @@ -1548,8 +1550,8 @@ ALIGN function_align cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, params, h -%define base r12-sgr_x_by_x-256*4 - lea r12, [sgr_x_by_x+256*4] +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, paramsmp mov wd, wm mov edged, r8m From b72f049b876bf7bf0318607a91c393821c0fe213 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:03:14 +0200 Subject: [PATCH 045/188] x86: Add high bitdepth ipred_dc AVX2 asm --- src/x86/ipred16_avx2.asm | 356 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 src/x86/ipred16_avx2.asm diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm new file mode 100644 index 0000000000..e99ae0bd9a --- /dev/null +++ b/src/x86/ipred16_avx2.asm @@ -0,0 +1,356 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +pw_512: times 2 dw 512 +pw_2048: times 2 dw 2048 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) + +JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 + +SECTION .text + +INIT_YMM avx2 + +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + movifnidn hd, hm + add tlq, 2 + movd xm4, wd + pxor xm3, xm3 + pavgw xm4, xm3 + tzcnt wd, wd + movd xm5, wd + movu m0, [tlq] + lea r5, [ipred_dc_left_16bpc_avx2_table] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + mov hd, hm + sub tlq, hq + movd xm4, hd + sub tlq, hq + pxor xm3, xm3 + pavgw xm4, xm3 + tzcnt r6d, hd + movd xm5, r6d + movu m0, [tlq] + lea r5, [ipred_dc_left_16bpc_avx2_table] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + paddw m0, [tlq+96] + paddw m0, [tlq+64] +.h32: + paddw m0, [tlq+32] +.h16: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h8: + psrldq xm1, xm0, 8 + paddw xm0, xm1 +.h4: + punpcklwd xm0, xm3 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + paddd xm0, xm4 + psrld xm0, xm5 + lea stride3q, [strideq*3] + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp wq + +cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm4, r5d + tzcnt r5d, r5d + movd xm5, r5d + lea r5, [ipred_dc_16bpc_avx2_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw xm4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq xm0, [tlq-8] + jmp wq +.w4: + movq xm1, [tlq+2] + paddw m0, m4 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrld m1, m0, 16 + paddw m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + lea r2d, [hq*2] + mov r6d, 0xAAAB6667 + shrx 
r6d, r6d, r2d + punpckhwd xm1, xm0, xm3 + punpcklwd xm0, xm3 + paddd xm0, xm1 + movd xm1, r6d + psrld xm0, 2 + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w4_end: + vpbroadcastw xm0, xm0 +.s4: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + mova xm0, [tlq-16] + jmp wq +.w8: + vextracti128 xm1, m0, 1 + paddw xm0, [tlq+2] + paddw xm0, xm4 + paddw xm0, xm1 + psrld xm1, xm0, 16 + paddw xm0, xm1 + pblendw xm0, xm3, 0xAA + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w8_end: + vpbroadcastw xm0, xm0 +.s8: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova m0, [tlq-32] + jmp wq +.w16: + paddw m0, [tlq+2] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpckhwd xm1, xm0, xm3 + punpcklwd xm0, xm3 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-32] + jmp wq +.w32: + paddw m0, [tlq+ 2] + paddw m0, [tlq+34] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm3 + punpckhwd xm0, xm3 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x6667AAAB + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w32_end: + vpbroadcastw m0, xm0 + mova m1, m0 +.s32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m0 + mova [dstq+stride3q +32*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-128] + mova m1, [tlq- 96] + paddw m0, [tlq- 64] + paddw m1, [tlq- 32] + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 2] + paddw m0, [tlq+34] + paddw m1, [tlq+66] + paddw m0, [tlq+98] + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm3 + punpckhwd xm0, xm3 + paddd xm1, xm4 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 64 + je .w64_end + mov r6d, 0x6667AAAB + shrx r6d, r6d, hd + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w64_end: + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s64: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, 
stride, tl, w, h, stride3 + mov r6d, r8m + shr r6d, 11 + lea r5, [ipred_dc_splat_16bpc_avx2_table] + tzcnt wd, wd + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] + mova m1, m0 + mova m2, m0 + mova m3, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%endif From 492a060b71c3b237e89aaf74b35d102b086e804c Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:03:18 +0200 Subject: [PATCH 046/188] x86: Add high bitdepth ipred_{h,v} AVX2 asm --- src/x86/ipred16_avx2.asm | 82 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index e99ae0bd9a..efefa155d4 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -48,6 +48,7 @@ pw_2048: times 2 dw 2048 JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 SECTION .text @@ -353,4 +354,85 @@ cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 lea stride3q, [strideq*3] jmp wq +cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+34] + movu m2, [tlq+66] + movu m3, [tlq+98] + lea r5, [ipred_dc_splat_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%macro IPRED_H 2 ; w, store_type + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + vpbroadcastw m2, [tlq-6] + vpbroadcastw m3, [tlq-8] + sub tlq, 8 + mov%2 [dstq+strideq*0], m0 + mov%2 [dstq+strideq*1], m1 + mov%2 [dstq+strideq*2], m2 + mov%2 [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +ALIGN function_align +%endmacro + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + lea r5, [ipred_h_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +INIT_XMM avx2 +.w4: + IPRED_H 4, q +.w8: + IPRED_H 8, a +INIT_YMM avx2 +.w16: + IPRED_H 16, a +.w32: + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + vpbroadcastw m2, [tlq-6] + vpbroadcastw m3, [tlq-8] + sub tlq, 8 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m2 + mova [dstq+strideq*2+32*1], m2 + mova [dstq+stride3q +32*0], m3 + mova [dstq+stride3q +32*1], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +.w64: + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + sub tlq, 4 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*0+32*2], m0 + mova [dstq+strideq*0+32*3], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m1 + mova [dstq+strideq*1+32*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + %endif From ebc3df03b2205de83135c2d5062d8a4bfa411e34 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:03:20 +0200 Subject: [PATCH 047/188] x86: Add high bitdepth ipred_paeth AVX2 asm --- src/x86/ipred16_avx2.asm | 138 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 1 deletion(-) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index efefa155d4..d7b86c38e6 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -28,7 
+28,10 @@ %if ARCH_X86_64 -SECTION_RODATA +SECTION_RODATA 32 + +ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 + db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 @@ -49,6 +52,7 @@ JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 SECTION .text @@ -435,4 +439,136 @@ INIT_YMM avx2 jg .w64 RET +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m1 + psubw m7, m3, m0 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m7, m7 + pabsw m0, m0 + pminsw m7, m0 + pcmpeqw m0, m7 + pcmpgtw m7, m%3, m7 + vpblendvb m0, m3, m%1, m0 + vpblendvb m0, m1, m0, m7 +%endmacro + +cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h +%define base r5-ipred_paeth_16bpc_avx2_table + movifnidn hd, hm + lea r5, [ipred_paeth_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r5 + jmp wq +.w4: + vpbroadcastq m2, [tlq+2] ; top + movsldup m6, [base+ipred_hv_shuf] + lea r3, [strideq*3] + psubw m4, m2, m3 + pabsw m5, m4 +.w4_loop: + sub tlq, 8 + vpbroadcastq m1, [tlq] + pshufb m1, m6 ; left + PAETH 2, 4, 5 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m2, [tlq+2] + movsldup m6, [base+ipred_hv_shuf] + lea r3, [strideq*3] + psubw m4, m2, m3 + pabsw m5, m4 +.w8_loop: + sub tlq, 4 + vpbroadcastd m1, [tlq] + pshufb m1, m6 + PAETH 2, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m2, [tlq+2] + psubw m4, m2, m3 + pabsw m5, m4 +.w16_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m2, [tlq+2] + movu m6, [tlq+34] +%if WIN64 + movaps r4m, xmm8 + movaps r6m, xmm9 +%endif + psubw m4, m2, m3 + psubw m8, m6, m3 + pabsw m5, m4 + pabsw m9, m8 +.w32_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps xmm8, r4m + movaps xmm9, r6m +%endif + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 16 + movu m2, [tlq+ 2] + movu m6, [tlq+34] + movu m10, [tlq+66] + movu m13, [tlq+98] + psubw m4, m2, m3 + psubw m8, m6, m3 + psubw m11, m10, m3 + psubw m14, m13, m3 + pabsw m5, m4 + pabsw m9, m8 + pabsw m12, m11 + pabsw m15, m14 +.w64_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + PAETH 10, 11, 12 + mova [dstq+32*2], m0 + PAETH 13, 14, 15 + mova [dstq+32*3], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + %endif From 354e84f60aabf09b419f9e6e1b0ed2be9269cff6 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:03:21 +0200 Subject: [PATCH 048/188] x86: Add high bitdepth ipred_smooth AVX2 asm --- src/x86/ipred16_avx2.asm | 526 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 526 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index d7b86c38e6..b5525edb60 100644 --- 
a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -30,11 +30,39 @@ SECTION_RODATA 32 +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + dw %1, 256-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 +pd_128: dd 128 +pd_256: dd 256 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -53,6 +81,9 @@ JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 SECTION .text @@ -571,4 +602,499 @@ ALIGN function_align jg .w64_loop RET +%macro SMOOTH 4 ; src[1-2], mul[1-2] + pmaddwd m0, m%3, m%1 + pmaddwd m1, m%4, m%2 + paddd m0, m2 + paddd m1, m2 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 +%endmacro + +cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_16bpc_avx2_table + lea r6, [ipred_smooth_v_16bpc_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m2, [base+pd_128] + lea weightsq, [base+smooth_weights+hq*8] + neg hq + vpbroadcastw m5, [tlq+hq*2] ; bottom + add wq, r6 + jmp wq +.w4: + vpbroadcastq m3, [tlq+2] + punpcklwd m3, m5 ; top, bottom + movshdup m5, [base+ipred_hv_shuf] + lea r3, [strideq*3] + punpcklqdq m4, m5, m5 + punpckhqdq m5, m5 +.w4_loop: + vbroadcasti128 m1, [weightsq+hq*4] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 3, 3, 0, 1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vbroadcasti128 m4, [tlq+2] + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + movshdup m5, [base+ipred_hv_shuf] +.w8_loop: + vpbroadcastq m1, [weightsq+hq*4] + pshufb m1, m5 + SMOOTH 3, 4, 1, 1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + movu m4, [tlq+2] + punpcklwd m3, m4, m5 + punpckhwd m4, m5 +.w16_loop: + vpbroadcastd m1, [weightsq+hq*4] + vpbroadcastd m5, [weightsq+hq*4+4] + SMOOTH 3, 4, 1, 1 + mova [dstq+strideq*0], m0 + SMOOTH 3, 4, 5, 5 + mova [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w16_loop + RET +ALIGN function_align +.w32: + WIN64_SPILL_XMM 8 + movu m4, [tlq+2] + movu m7, [tlq+34] + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 +.w32_loop: + 
vpbroadcastd m5, [weightsq+hq*4] + SMOOTH 3, 4, 5, 5 + mova [dstq+32*0], m0 + SMOOTH 6, 7, 5, 5 + mova [dstq+32*1], m0 + add dstq, strideq + inc hq + jl .w32_loop + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 12 + movu m4, [tlq+ 2] + movu m7, [tlq+34] + movu m9, [tlq+66] + movu m11, [tlq+98] + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 + punpcklwd m8, m9, m5 + punpckhwd m9, m5 + punpcklwd m10, m11, m5 + punpckhwd m11, m5 +.w64_loop: + vpbroadcastd m5, [weightsq+hq*4] + SMOOTH 3, 4, 5, 5 + mova [dstq+32*0], m0 + SMOOTH 6, 7, 5, 5 + mova [dstq+32*1], m0 + SMOOTH 8, 9, 5, 5 + mova [dstq+32*2], m0 + SMOOTH 10, 11, 5, 5 + mova [dstq+32*3], m0 + add dstq, strideq + inc hq + jl .w64_loop + RET + +cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h +%define base r6-ipred_smooth_h_16bpc_avx2_table + lea r6, [ipred_smooth_h_16bpc_avx2_table] + mov wd, wm + mov hd, hm + vpbroadcastw m3, [tlq+wq*2] ; right + tzcnt wd, wd + movsxd wq, [r6+wq*4] + vpbroadcastd m2, [base+pd_128] + add wq, r6 + jmp wq +.w4: + vbroadcasti128 m4, [base+smooth_weights+4*4] + movsldup m5, [base+ipred_hv_shuf] + sub tlq, 8 + sub tlq, hq + sub tlq, hq + lea r3, [strideq*3] +.w4_loop: + vpbroadcastq m1, [tlq+hq*2] + pshufb m1, m5 + punpcklwd m0, m1, m3 ; left, right + punpckhwd m1, m3 + SMOOTH 0, 1, 4, 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + WIN64_SPILL_XMM 7 + vbroadcasti128 m4, [base+smooth_weights+8*4+16*0] + vbroadcasti128 m5, [base+smooth_weights+8*4+16*1] + movsldup m6, [base+ipred_hv_shuf] + sub tlq, 4 + sub tlq, hq + sub tlq, hq +.w8_loop: + vpbroadcastd m1, [tlq+hq*2] + pshufb m1, m6 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + SMOOTH 0, 1, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hq, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + WIN64_SPILL_XMM 6 + mova xm4, [base+smooth_weights+16*4+16*0] + mova xm5, [base+smooth_weights+16*4+16*1] + vinserti128 m4, [base+smooth_weights+16*4+16*2], 1 + vinserti128 m5, [base+smooth_weights+16*4+16*3], 1 + sub tlq, 2 + sub tlq, hq + sub tlq, hq +.w16_loop: + vpbroadcastw m1, [tlq+hq*2] + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + SMOOTH 0, 1, 4, 5 + mova [dstq], m0 + add dstq, strideq + dec hq + jg .w16_loop + RET +ALIGN function_align +.w32: + WIN64_SPILL_XMM 10 + mova xm6, [base+smooth_weights+32*4+16*0] + mova xm7, [base+smooth_weights+32*4+16*1] + vinserti128 m6, [base+smooth_weights+32*4+16*2], 1 + vinserti128 m7, [base+smooth_weights+32*4+16*3], 1 + mova xm8, [base+smooth_weights+32*4+16*4] + mova xm9, [base+smooth_weights+32*4+16*5] + vinserti128 m8, [base+smooth_weights+32*4+16*6], 1 + vinserti128 m9, [base+smooth_weights+32*4+16*7], 1 + sub tlq, 2 + sub tlq, hq + sub tlq, hq +.w32_loop: + vpbroadcastw m5, [tlq+hq*2] + punpcklwd m4, m5, m3 + punpckhwd m5, m3 + SMOOTH 4, 5, 6, 7 + mova [dstq+32*0], m0 + SMOOTH 4, 5, 8, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hq + jg .w32_loop + RET +ALIGN function_align +.w64: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 14 + mova xm6, [base+smooth_weights+64*4+16* 0] + mova xm7, [base+smooth_weights+64*4+16* 1] + vinserti128 m6, [base+smooth_weights+64*4+16* 2], 1 + vinserti128 m7, [base+smooth_weights+64*4+16* 3], 1 + mova xm8, [base+smooth_weights+64*4+16* 4] + mova xm9, 
[base+smooth_weights+64*4+16* 5] + vinserti128 m8, [base+smooth_weights+64*4+16* 6], 1 + vinserti128 m9, [base+smooth_weights+64*4+16* 7], 1 + mova xm10, [base+smooth_weights+64*4+16* 8] + mova xm11, [base+smooth_weights+64*4+16* 9] + vinserti128 m10, [base+smooth_weights+64*4+16*10], 1 + vinserti128 m11, [base+smooth_weights+64*4+16*11], 1 + mova xm12, [base+smooth_weights+64*4+16*12] + mova xm13, [base+smooth_weights+64*4+16*13] + vinserti128 m12, [base+smooth_weights+64*4+16*14], 1 + vinserti128 m13, [base+smooth_weights+64*4+16*15], 1 + sub tlq, 2 + sub tlq, hq + sub tlq, hq +.w64_loop: + vpbroadcastw m5, [tlq+hq*2] + punpcklwd m4, m5, m3 + punpckhwd m5, m3 + SMOOTH 4, 5, 6, 7 + mova [dstq+32*0], m0 + SMOOTH 4, 5, 8, 9 + mova [dstq+32*1], m0 + SMOOTH 4, 5, 10, 11 + mova [dstq+32*2], m0 + SMOOTH 4, 5, 12, 13 + mova [dstq+32*3], m0 + add dstq, strideq + dec hq + jg .w64_loop + RET + +%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] + pmaddwd m0, m%1, m%3 + pmaddwd m1, m%2, m%4 + paddd m0, m%5 + paddd m1, m%6 + paddd m0, m5 + paddd m1, m5 + psrld m0, 9 + psrld m1, 9 + packssdw m0, m1 +%endmacro + +cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_16bpc_avx2_table + lea r6, [ipred_smooth_16bpc_avx2_table] + mov wd, wm + vpbroadcastw m4, [tlq+wq*2] ; right + tzcnt wd, wd + mov hd, hm + sub tlq, hq + sub tlq, hq + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+pd_256] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*4] + jmp wq +.w4: + WIN64_SPILL_XMM 11 + vpbroadcastw m0, [tlq] ; bottom + vpbroadcastq m6, [tlq+hq*2+2] + movsldup m7, [base+ipred_hv_shuf] + movshdup m9, [base+ipred_hv_shuf] + vbroadcasti128 m10, [base+smooth_weights+4*4] + punpcklwd m6, m0 ; top, bottom + punpcklqdq m8, m9, m9 + punpckhqdq m9, m9 + lea r3, [strideq*3] + sub tlq, 8 +.w4_loop: + vbroadcasti128 m1, [v_weightsq] + vpbroadcastq m3, [tlq+hq*2] + pshufb m3, m7 + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + pmaddwd m2, m10 + pmaddwd m3, m10 + pshufb m0, m1, m8 + pshufb m1, m9 + SMOOTH_2D_END 6, 6, 0, 1, 2, 3 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add v_weightsq, 16 + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + vpbroadcastw m0, [tlq] ; bottom + vbroadcasti128 m7, [tlq+hq*2+2] + movsldup m8, [base+ipred_hv_shuf] + movshdup m9, [base+ipred_hv_shuf] + vbroadcasti128 m10, [base+smooth_weights+8*4+16*0] + vbroadcasti128 m11, [base+smooth_weights+8*4+16*1] + punpcklwd m6, m7, m0 ; top, bottom + punpckhwd m7, m0 + sub tlq, 4 +.w8_loop: + vpbroadcastq m1, [v_weightsq] + vpbroadcastd m3, [tlq+hq*2] + pshufb m3, m8 + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + pmaddwd m2, m10 + pmaddwd m3, m11 + pshufb m1, m9 + SMOOTH_2D_END 6, 7, 1, 1, 2, 3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add v_weightsq, 8 + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 14 + vpbroadcastw m0, [tlq] ; bottom + movu m7, [tlq+hq*2+2] + mova xm8, [base+smooth_weights+16*4+16*0] + mova xm9, [base+smooth_weights+16*4+16*1] + vinserti128 m8, [base+smooth_weights+16*4+16*2], 1 + vinserti128 m9, [base+smooth_weights+16*4+16*3], 1 + punpcklwd m6, m7, m0 ; top, bottom + punpckhwd m7, m0 + sub tlq, 2 +.w16_loop: + 
vpbroadcastd m10, [v_weightsq+0] + vpbroadcastd m11, [v_weightsq+4] + vpbroadcastw m3, [tlq+hq*2-0] + vpbroadcastw m13, [tlq+hq*2-2] + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + punpcklwd m12, m13, m4 + punpckhwd m13, m4 + pmaddwd m2, m8 + pmaddwd m3, m9 + pmaddwd m12, m8 + pmaddwd m13, m9 + SMOOTH_2D_END 6, 7, 10, 10, 2, 3 + mova [dstq+strideq*0], m0 + SMOOTH_2D_END 6, 7, 11, 11, 12, 13 + mova [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add v_weightsq, 8 + sub hq, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastw m0, [tlq] ; bottom + movu m7, [tlq+hq*2+ 2] + movu m9, [tlq+hq*2+34] + mova xm10, [base+smooth_weights+32*4+16*0] + mova xm11, [base+smooth_weights+32*4+16*1] + vinserti128 m10, [base+smooth_weights+32*4+16*2], 1 + vinserti128 m11, [base+smooth_weights+32*4+16*3], 1 + mova xm12, [base+smooth_weights+32*4+16*4] + mova xm13, [base+smooth_weights+32*4+16*5] + vinserti128 m12, [base+smooth_weights+32*4+16*6], 1 + vinserti128 m13, [base+smooth_weights+32*4+16*7], 1 + punpcklwd m6, m7, m0 + punpckhwd m7, m0 + punpcklwd m8, m9, m0 + punpckhwd m9, m0 + sub tlq, 2 +.w32_loop: + vpbroadcastw m3, [tlq+hq*2] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m14, m2, m10 + pmaddwd m15, m3, m11 + pmaddwd m2, m12 + pmaddwd m3, m13 + vpbroadcastd m1, [v_weightsq] + pmaddwd m0, m6, m1 + paddd m0, m14 + paddd m0, m5 + psrld m0, 9 + pmaddwd m14, m7, m1 + paddd m14, m15 + paddd m14, m5 + psrld m14, 9 + packssdw m0, m14 + mova [dstq+32*0], m0 + SMOOTH_2D_END 8, 9, 1, 1, 2, 3 + mova [dstq+32*1], m0 + add dstq, strideq + add v_weightsq, 4 + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: +%assign stack_offset stack_offset - stack_size_padded + PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base + mov dst_baseq, dstq + mov tl_baseq, tlq + mov v_weights_baseq, v_weightsq + xor xq, xq +.w64_loop_x: + mov yq, hq + lea tlq, [tl_baseq+hq*2] + vpbroadcastw m0, [tl_baseq] ; bottom + movu m7, [tlq+xq*2+ 2] + movu m9, [tlq+xq*2+34] + mova xm10, [base+smooth_weights+64*4+16*0] + mova xm11, [base+smooth_weights+64*4+16*1] + vinserti128 m10, [base+smooth_weights+64*4+16*2], 1 + vinserti128 m11, [base+smooth_weights+64*4+16*3], 1 + mova xm12, [base+smooth_weights+64*4+16*4] + mova xm13, [base+smooth_weights+64*4+16*5] + vinserti128 m12, [base+smooth_weights+64*4+16*6], 1 + vinserti128 m13, [base+smooth_weights+64*4+16*7], 1 + punpcklwd m6, m7, m0 + punpckhwd m7, m0 + punpcklwd m8, m9, m0 + punpckhwd m9, m0 + lea tlq, [tl_baseq-2] +.w64_loop_y: + vpbroadcastd m1, [v_weightsq] + vpbroadcastw m3, [tlq+yq*2] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m14, m2, m10 + pmaddwd m15, m3, m11 + pmaddwd m2, m12 + pmaddwd m3, m13 + pmaddwd m0, m6, m1 + paddd m0, m14 + paddd m0, m5 + psrld m0, 9 + pmaddwd m14, m7, m1 + paddd m14, m15 + paddd m14, m5 + psrld m14, 9 + packssdw m0, m14 + mova [dstq+32*0], m0 + SMOOTH_2D_END 8, 9, 1, 1, 2, 3 + mova [dstq+32*1], m0 + add dstq, strideq + add v_weightsq, 4 + dec yq + jg .w64_loop_y + lea dstq, [dst_baseq+32*2] + add r6, 16*8 + mov v_weightsq, v_weights_baseq + add xq, 32 + test xb, 64 + jz .w64_loop_x + RET + %endif From d66190a23c8d323e99456375b1bd67ab15c255d2 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:03:22 +0200 Subject: [PATCH 049/188] x86: Add high bitdepth ipred_cfl_dc AVX2 asm --- src/x86/ipred16_avx2.asm | 308 +++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 308 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index b5525edb60..68698822ec 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -75,6 +75,7 @@ pd_256: dd 256 %endmacro %define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) +%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4) JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 @@ -84,6 +85,9 @@ JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 SECTION .text @@ -1097,4 +1101,308 @@ ALIGN function_align jz .w64_loop_x RET +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + add tlq, 2 + movd xm4, wd + pxor m6, m6 + vpbroadcastw m7, r7m + pavgw xm4, xm6 + tzcnt wd, wd + movd xm5, wd + movu m0, [tlq] + lea t0, [ipred_cfl_left_16bpc_avx2_table] + movsxd r6, [t0+wq*4] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + sub tlq, hq + movd xm4, hd + sub tlq, hq + pxor m6, m6 + vpbroadcastw m7, r7m + pavgw xm4, xm6 + tzcnt r6d, hd + movd xm5, r6d + movu m0, [tlq] + lea t0, [ipred_cfl_left_16bpc_avx2_table] + movsxd r6, [t0+r6*4] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table + tzcnt wd, wd + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + paddw m0, [tlq+32] +.h16: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h8: + psrldq xm1, xm0, 8 + paddw xm0, xm1 +.h4: + punpcklwd xm0, xm6 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + paddd xm0, xm4 + psrld xm0, xm5 + vpbroadcastw m0, xm0 + jmp wq + +cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd xm4, t0d + tzcnt t0d, t0d + movd xm5, t0d + lea t0, [ipred_cfl_16bpc_avx2_table] + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw xm4, 1 + pxor m6, m6 + vpbroadcastw m7, r7m + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movq xm0, [tlq-8] + jmp wq +.w4: + movq xm1, [tlq+2] + paddw m0, m4 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrld m1, m0, 16 + paddw m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + lea r2d, [hq*2] + mov r6d, 0xAAAB6667 + shrx r6d, r6d, r2d + punpckhwd xm1, xm0, xm6 + punpcklwd xm0, xm6 + paddd xm0, xm1 + movd xm1, r6d + psrld xm0, 2 + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w4_end: + vpbroadcastw m0, xm0 +.s4: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + IPRED_CFL 4 + pmaxsw m4, m6 + pminsw m4, m7 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + 
movq [dstq+strideq*2], xm5 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + mova xm0, [tlq-16] + jmp wq +.w8: + vextracti128 xm1, m0, 1 + paddw xm0, [tlq+2] + paddw xm0, xm4 + paddw xm0, xm1 + psrld xm1, xm0, 16 + paddw xm0, xm1 + pblendw xm0, xm6, 0xAA + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w8_end: + vpbroadcastw m0, xm0 +.s8: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+strideq*0], xm4 + mova [dstq+strideq*2], xm5 + vextracti128 [dstq+strideq*1], m4, 1 + vextracti128 [dstq+r6 ], m5, 1 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova m0, [tlq-32] + jmp wq +.w16: + paddw m0, [tlq+2] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpckhwd xm1, xm0, xm6 + punpcklwd xm0, xm6 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m5 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-32] + jmp wq +.w32: + paddw m0, [tlq+ 2] + paddw m0, [tlq+34] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm6 + punpckhwd xm0, xm6 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x6667AAAB + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w32_end: + vpbroadcastw m0, xm0 +.s32: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+32*0], m4 + mova [dstq+32*1], m5 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + mov r6d, r7m + shr r6d, 11 + lea t0, [ipred_cfl_splat_16bpc_avx2_table] + tzcnt wd, wd + movifnidn hd, hm + movsxd wq, [t0+wq*4] + vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] + pxor m6, m6 + vpbroadcastw m7, r7m + add wq, t0 + movifnidn acq, acmp + jmp wq + %endif From 53b6addf9beecaa55d0ea8512d2f634c6a807b31 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:03:27 +0200 Subject: [PATCH 050/188] x86: Add high bitdepth blend AVX2 asm --- src/x86/mc16_avx2.asm | 359 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index be778244a6..8e223f9ec4 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -30,6 +30,16 @@ 
SECTION_RODATA 32 +; dav1d_obmc_masks[] * -512 +obmc_masks: dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 + dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0 + dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120 + dw -4096, -3072, -2048, -1536, 0, 0, 0, 0 + dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240 + dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608 + dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024 + +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 @@ -51,6 +61,7 @@ pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 +pw_m512: times 2 dw -512 pd_32: dd 32 pd_512: dd 512 pd_65538: dd 65538 @@ -72,6 +83,9 @@ BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -3251,4 +3265,349 @@ ALIGN function_align add maskq, 32 ret +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). +; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx2_table + lea r6, [blend_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + vpbroadcastd m6, [base+pw_m512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + pmovzxbw m3, [maskq] + movq xm0, [dstq+dsq*0] + movhps xm0, [dstq+dsq*1] + vpbroadcastq m1, [dstq+dsq*2] + vpbroadcastq m2, [dstq+r6 ] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + psubw m1, m0, [tmpq] + add maskq, 16 + add tmpq, 32 + pmullw m3, m6 + pmulhrsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + movq [dstq+dsq*2], xm1 + movhps [dstq+r6 ], xm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + mova xm1, [dstq+dsq*2] + vinserti128 m1, [dstq+r6 ], 1 + psubw m2, m0, [tmpq+32*0] + psubw m3, m1, [tmpq+32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + mova [dstq+dsq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16 + RET +.w32: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova m0, [dstq+32*0] + psubw m2, m0, [tmpq+32*0] + mova m1, [dstq+32*1] + psubw m3, m1, [tmpq+32*1] + add 
maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .w32 + RET + +INIT_XMM avx2 +cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h +%define base r5-blend_v_avx2_table + lea r5, [blend_v_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + vpbroadcastd m2, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+dsq*0] + pinsrd m0, [dstq+dsq*1], 1 + movq m1, [tmpq] + add tmpq, 4*2 + psubw m1, m0, m1 + pmulhrsw m1, m2 + paddw m0, m1 + movd [dstq+dsq*0], m0 + pextrd [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + vpbroadcastq m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + psubw m1, m0, [tmpq] + add tmpq, 8*2 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +INIT_YMM avx2 +.w8: + vbroadcasti128 m2, [base+obmc_masks+8*2] +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + psubw m1, m0, [tmpq] + add tmpq, 16*2 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] +.w16_loop: + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add tmpq, 32*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + mova m6, [base+obmc_masks+32*2] + vbroadcasti128 m7, [base+obmc_masks+32*3] +.w32_loop: + mova m0, [dstq+dsq*0+32*0] + psubw m3, m0, [tmpq +32*0] + mova xm2, [dstq+dsq*0+32*1] + mova xm5, [tmpq +32*1] + mova m1, [dstq+dsq*1+32*0] + psubw m4, m1, [tmpq +32*2] + vinserti128 m2, [dstq+dsq*1+32*1], 1 + vinserti128 m5, [tmpq +32*3], 1 + add tmpq, 32*4 + psubw m5, m2, m5 + pmulhrsw m3, m6 + pmulhrsw m4, m6 + pmulhrsw m5, m7 + paddw m0, m3 + paddw m1, m4 + paddw m2, m5 + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*1+32*0], m1 + mova [dstq+dsq*0+32*1], xm2 + vextracti128 [dstq+dsq*1+32*1], m2, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+32*(%1+0)] + psubw m2, m0, [tmpq+32*(%2+0)] + mova m1, [dstq+32*(%1+1)] + psubw m3, m1, [tmpq+32*(%2+1)] +%if %3 + add tmpq, 32*%3 +%endif + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+32*(%1+0)], m0 + mova [dstq+32*(%1+1)], m1 +%endmacro + +INIT_XMM avx2 +cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_h_avx2_table + lea r5, [blend_h_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + pinsrd m0, [dstq+dsq*1], 1 + movd m2, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpcklwd m2, m2 + psubw m1, m0, m1 + pmulhrsw m1, m2 + paddw m0, m1 + movd [dstq+dsq*0], m0 + pextrd [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, 
[maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 8*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +INIT_YMM avx2 +.w8: + vbroadcasti128 m3, [blend_shuf] + shufpd m3, m3, 0x0c +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + vpbroadcastd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 16*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +.w16: + vpbroadcastw m4, [maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add tmpq, 32*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16 + RET +.w32: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 8 + BLEND_H_ROW 4, -4 + BLEND_H_ROW 6, -2 + add dstq, dsq + inc hq + jl .w128 + RET + %endif ; ARCH_X86_64 From 709474917f811704e39d32d558281e967beff0d7 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:04:20 +0200 Subject: [PATCH 051/188] Move the x86-specific warp filter ordering to asm It's only useful for 8-bit since the default ordering is more efficient for high bit-depth --- src/x86/mc_avx2.asm | 4 +- src/x86/mc_sse.asm | 106 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm index 6a1ab0570f..9fa576062e 100644 --- a/src/x86/mc_avx2.asm +++ b/src/x86/mc_avx2.asm @@ -94,7 +94,7 @@ pd_0x4000: dd 0x4000 pq_0x40000000: dq 0x40000000 cextern mc_subpel_filters -cextern mc_warp_filter +cextern mc_warp_filter2 cextern resize_filter %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) @@ -4183,7 +4183,7 @@ ALIGN function_align vpbroadcastd m14, [pw_8192] vpbroadcastd m15, [pd_32768] pxor m11, m11 - lea filterq, [mc_warp_filter] + lea filterq, [mc_warp_filter2] lea tmp1q, [ssq*3+3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm index edbd186564..a7c9d032c6 100644 --- a/src/x86/mc_sse.asm +++ b/src/x86/mc_sse.asm @@ -95,6 +95,108 @@ pd_0x3ff: times 4 dd 0x3ff pd_0x4000:times 4 dd 0x4000 pq_0x40000000: times 2 dq 0x40000000 +const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage + ; [-1, 0) + db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0 + db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0 + db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0 + db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0 + db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0 + db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0 + db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0 + db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0 + db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0 + db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0 + db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0 + db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0 + db 4, 96, -15, 
0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0 + db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0 + db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0 + db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0 + db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0 + db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0 + db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0 + db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0 + db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0 + db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0 + db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0 + db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0 + db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0 + db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0 + db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0 + db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0 + db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0 + db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0 + db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0 + db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0 + ; [0, 1) + db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0 + db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0 + db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1 + db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1 + db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1 + db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1 + db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1 + db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1 + db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2 + db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2 + db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2 + db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2 + db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2 + db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2 + db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2 + db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2 + db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2 + db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2 + db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2 + db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2 + db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2 + db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2 + db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2 + db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2 + db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2 + db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1 + db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2 + db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1 + db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1 + db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1 + db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0 + db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0 + ; [1, 2) + db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0 + db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1 
+ db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1 + db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1 + db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1 + db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2 + db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2 + db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2 + db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3 + db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3 + db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3 + db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4 + db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4 + db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4 + db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4 + db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4 + db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4 + db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4 + db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4 + db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4 + db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4 + db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4 + db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4 + db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3 + db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3 + db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3 + db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2 + db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2 + db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2 + db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1 + db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1 + db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0 + db 0, 0, 2, -1, 0, 0, 127, 0 + pw_258: times 2 dw 258 cextern mc_subpel_filters @@ -208,8 +310,6 @@ SCALED_JMP_TABLE prep_8tap_scaled_ssse3, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX -cextern mc_warp_filter - SECTION .text INIT_XMM ssse3 @@ -5938,7 +6038,7 @@ ALIGN function_align mov PIC_reg, PIC_mem %endif sub betad, tmp2d ; beta -= alpha*3 - lea filterq, [PIC_sym(mc_warp_filter)] + lea filterq, [PIC_sym(mc_warp_filter2)] %if ARCH_X86_64 mov myd, r6m %if cpuflag(ssse3) From 55dc4dd735ee06817179c375f078a5afce1ddf4c Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:04:24 +0200 Subject: [PATCH 052/188] x86: Add high bitdepth warp8x8 AVX2 asm --- src/x86/mc16_avx2.asm | 206 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 8e223f9ec4..0ca7f06cd1 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -50,6 +50,9 @@ prep_mul: dw 16, 16, 4, 4 put_8tap_h_rnd: dd 34, 40 prep_8tap_1d_rnd: dd 8 - (8192 << 4) prep_8tap_2d_rnd: dd 32 - (8192 << 5) +warp8x8t_rnd: dd 16384 - (8192 << 15) +warp8x8_shift: dd 5, 3 +warp8x8_rnd: dw 4096, 4096, 16384, 16384 bidir_rnd: dw -16400, -16400, -16388, -16388 bidir_mul: dw 2048, 2048, 8192, 8192 @@ -64,6 +67,7 @@ pw_32766: times 2 dw 32766 pw_m512: times 2 dw -512 pd_32: dd 32 pd_512: dd 512 +pd_32768: dd 32768 pd_65538: dd 65538 %macro BIDIR_JMP_TABLE 2-* @@ -143,6 +147,8 @@ HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix 
%+ _mc_subpel_filters)-8) +cextern mc_warp_filter + SECTION .text INIT_XMM avx2 @@ -2499,6 +2505,206 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %endif RET +%macro WARP_V 5 ; dst, 01, 23, 45, 67 + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm8, [filterq+myq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 ; a e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; b f + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm9, [filterq+myq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 ; c g + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+gammaq] ; my += gamma + punpcklwd m8, m0 + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; d h + punpcklwd m0, m9, m0 + punpckldq m9, m8, m0 + punpckhdq m0, m8, m0 + punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + pmaddwd m%2, m8 + pmaddwd m9, m%3 + punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m8, m%4 + pmaddwd m0, m%5 + paddd m9, m%2 + mova m%2, m%3 + paddd m0, m8 + mova m%3, m%4 + mova m%4, m%5 + paddd m%1, m0, m9 +%endmacro + +cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts + mov r6d, r7m + lea r9, [$$] + shr r6d, 11 + vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] + vpbroadcastd m14, [warp8x8t_rnd] + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main + jmp .start +.loop: + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 + lea tmpq, [tmpq+tsq*4] +.start: + paddd m7, m14 + paddd m0, m14 + psrad m7, 15 + psrad m0, 15 + packssdw m7, m0 + vpermq m7, m7, q3120 + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 + dec r4d + jg .loop +.end: + RET + +cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ + alpha, beta, filter, tmp1, delta, \ + my, gamma + mov r6d, r7m + lea filterq, [$$] + shr r6d, 11 + vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] + vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] + vpbroadcastw m15, r7m ; pixel_max + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m7, 16 + psrad m0, 16 + packusdw m7, m0 + pmulhrsw m7, m14 + pminsw m7, m15 + vpermq m7, m7, q3120 + mova [dstq+dsq*0], xm7 + vextracti128 [dstq+dsq*1], m7, 1 + dec r4d + jg .loop +.end: + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov abcdq, r5m + mov mxd, r6m +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + vpbroadcastd m12, [pd_32768] + pxor m11, m11 + add filterq, mc_warp_filter-$$ + lea tmp1q, [ssq*3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + sub betad, tmp2d ; beta -= alpha*3 + mov myd, r7m + call .h + psrld m1, m0, 16 + call .h + pblendw m1, m0, 0xaa ; 01 + psrld m2, m0, 16 + call .h + pblendw m2, m0, 0xaa ; 12 + psrld m3, m0, 16 + call .h + pblendw m3, m0, 0xaa ; 23 + psrld m4, m0, 16 + call .h + pblendw m4, m0, 0xaa ; 34 + psrld m5, m0, 16 + call .h + pblendw m5, m0, 0xaa ; 45 + psrld m6, m0, 16 + call .h + pblendw m6, m0, 0xaa ; 56 + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + add myd, 512+(64<<10) + mov r4d, 4 + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= delta*3 +.main2: + call .h + psrld m7, m6, 16 + pblendw m7, m0, 0xaa ; 67 + WARP_V 7, 1, 3, 5, 7 + call .h + psrld m10, m5, 16 + pblendw m10, m0, 0xaa ; 78 + WARP_V 0, 2, 4, 6, 10 + ret +ALIGN function_align +.h: + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + movu xm10, [srcq-6] + vinserti128 m10, [srcq+2], 1 + shr mxd, 10 ; 0 + shr tmp1d, 10 ; 4 + movq xm0, [filterq+mxq *8] + vinserti128 m0, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + movu xm8, [srcq-4] + vinserti128 m8, [srcq+4], 1 + shr tmp2d, 10 ; 1 + shr tmp1d, 10 ; 5 + movq xm9, [filterq+tmp2q*8] + vinserti128 m9, [filterq+tmp1q*8], 1 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 ; 2 + shr tmp1d, 10 ; 6 + punpcklbw m0, m11, m0 + pmaddwd m0, m10 + movu xm10, [srcq-2] + vinserti128 m10, [srcq+6], 1 + punpcklbw m9, m11, m9 + pmaddwd m9, m8 + movq xm8, [filterq+mxq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + phaddd m0, m9 ; 0 1 4 5 + movu xm9, [srcq+0] + vinserti128 m9, [srcq+8], 1 + shr tmp2d, 10 ; 3 + shr tmp1d, 10 ; 7 + punpcklbw m8, m11, m8 + pmaddwd m8, m10 + movq xm10, [filterq+tmp2q*8] + vinserti128 m10, [filterq+tmp1q*8], 1 + punpcklbw m10, m11, m10 + pmaddwd m9, m10 + add srcq, ssq + phaddd m8, m9 ; 2 3 6 7 + phaddd m0, m8 ; 0 1 2 3 4 5 6 7 + vpsllvd m0, m13 + paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword + ret + %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] From d65796f0fcc835a37e709808bab37175959e5037 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:04:27 +0200 Subject: [PATCH 053/188] x86: Add high bitdepth ipred_filter AVX2 asm --- src/x86/ipred16_avx2.asm | 389 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 389 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index 68698822ec..770e0d1129 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -58,9 +58,13 @@ smooth_weights: SMOOTH_WEIGHT_TABLE \ ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 +filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 +filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 +filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 +pd_8: dd 8 pd_128: dd 128 pd_256: dd 256 @@ -85,10 +89,13 @@ JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 
JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 +cextern filter_intra_taps + SECTION .text INIT_YMM avx2 @@ -1101,6 +1108,388 @@ ALIGN function_align jz .w64_loop_x RET +%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax +%ifnum %4 + pshufb xm%2, xm%4 +%else + pshufb xm%2, %4 +%endif + vinserti128 m%2, xm%2, 1 + pshufd m%1, m%2, q0000 + pmaddwd m%1, m2 + pshufd m%3, m%2, q1111 + pmaddwd m%3, m3 + paddd m%1, m1 + paddd m%1, m%3 + pshufd m%3, m%2, q2222 + pmaddwd m%3, m4 + paddd m%1, m%3 + pshufd m%3, m%2, q3333 + pmaddwd m%3, m5 + paddd m%1, m%3 + psrad m%1, 4 + packusdw m%1, m%1 + pminsw m%1, m%5 +%endmacro + +%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax + pshufb m%2, m%6 + vpermq m%4, m%2, q3232 + vinserti128 m%2, xm%2, 1 + pshufd m%1, m%2, q0000 + pshufd m%3, m%4, q0000 + pmaddwd m%1, m2 + pmaddwd m%3, m2 + paddd m%1, m1 + paddd m%3, m1 + pshufd m%5, m%2, q1111 + pmaddwd m%5, m3 + paddd m%1, m%5 + pshufd m%5, m%4, q1111 + pmaddwd m%5, m3 + paddd m%3, m%5 + pshufd m%5, m%2, q2222 + pmaddwd m%5, m4 + paddd m%1, m%5 + pshufd m%5, m%4, q2222 + pmaddwd m%5, m4 + paddd m%3, m%5 + pshufd m%5, m%2, q3333 + pmaddwd m%5, m5 + paddd m%1, m%5 + pshufd m%5, m%4, q3333 + pmaddwd m%5, m5 + paddd m%3, m%5 + psrad m%1, 4 + psrad m%3, 4 + packusdw m%1, m%3 + pminsw m%1, m%7 +%endmacro + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. One redundant +; block is calculated for w8 and w16, two for w32. +; w4 w8 w16 w32 +; 1 1 2 1 2 3 5 1 2 3 5 b c d f +; 2 2 3 2 4 5 7 2 4 5 7 c e f h +; 3 3 4 4 6 7 9 4 6 7 9 e g h j +; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ +; 5 8 8 i + +cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter +%assign org_stack_offset stack_offset +%define base r6-ipred_filter_16bpc_avx2_table + lea r6, [filter_intra_taps] + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + add filterq, r6 + lea r6, [ipred_filter_16bpc_avx2_table] + vbroadcasti128 m0, [tlq-6] + movsxd wq, [r6+wq*4] + vpbroadcastd m1, [base+pd_8] + pmovsxbw m2, [filterq+16*0] + pmovsxbw m3, [filterq+16*1] + pmovsxbw m4, [filterq+16*2] + pmovsxbw m5, [filterq+16*3] + add wq, r6 + mov hd, hm + jmp wq +.w4: + WIN64_SPILL_XMM 10 + mova xm8, [base+filter_shuf2] + vpbroadcastw m9, r8m ; bitdepth_max + lea r7, [6+hq*2] + sub tlq, r7 + jmp .w4_loop_start +.w4_loop: + pinsrq xm0, [tlq+hq*2], 0 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER_1BLK 6, 0, 7, 8, 9 + vextracti128 xm0, m6, 1 + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm0 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vbroadcasti128 m14, [base+filter_shuf3] + vpbroadcastw m15, r8m ; bitdepth_max + FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 + vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 + pslldq m8, m0, 4 + psrldq m7, m6, 2 + psrldq m0, m6, 10 + punpcklwd m7, m0 + vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 + vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 + vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 + lea r7, [16+hq*2] + sub tlq, r7 + jmp .w8_loop_start +.w8_loop: + vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 + vpermq m6, m9, 
q2031 + psrldq m0, m6, 2 + psrldq m6, 10 + punpcklwd m6, m0 + vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 + vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 + mova m10, m9 +.w8_loop_start: + vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 + call .main + vpblendd m10, m9, 0xCC + mova [dstq+strideq*0], xm10 + vextracti128 [dstq+strideq*1], m10, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + %assign stack_offset stack_offset - stack_size_padded + ALLOC_STACK 32, 16 + vpbroadcastw m15, r8m ; bitdepth_max + sub hd, 2 + TAIL_CALL .w16_main, 0 +.w16_main: + mova xm10, [base+filter_shuf2] + FILTER_1BLK 13, 0, 6, 10, 15 + vpermq m12, m13, q3120 + mova xm14, [base+filter_shuf3] + vinserti128 m14, [base+filter_shuf1], 1 + vpbroadcastq m0, [tlq+10] + vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ + psrldq m6, m12, 8 + vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 + punpcklwd m6, m12 + vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 + vpblendd m13, m12, 0xCC + vpermq m12, m12, q2031 ; 6___ 5___ + psrldq xm6, xm12, 2 + psrldq xm8, xm12, 12 + vpblendd xm6, xm8, 0x01 + pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ + FILTER_1BLK 11, 6, 8, 10, 15 + vpermq m11, m11, q3120 + pshufd m9, m11, q1032 + movu m8, [tlq+6] ; __43 210_ | ____ ____ + pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ + pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ + vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 + lea r7, [20+hq*2] + sub tlq, r7 + vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 + jmp .w16_loop_start +.w16_loop: + vpermq m13, m13, q3322 + vpermq m11, m9, q2020 + vpermq m9, m9, q1302 + vpermq m6, m12, q0123 + psrldq m7, 4 + vpblendd m13, m10, 0xCC + vpblendd m9, m7, 0x40 + mova m0, [rsp+8] + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 +.w16_loop_start: + mova m13, m12 + vpblendd m0, [tlq+hq*2], 0x0C + psrldq m7, m12, 8 + punpcklwd m7, m12 + vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 + vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 + vpermq m12, m10, q2031 + mova [rsp+8], m0 + psrldq m8, m11, 8 + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 10 + psrldq xm0, xm13, 2 + punpcklwd m8, m11 + punpcklwd xm7, xm6 + vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 + vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 + vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 + call .main + vpermq m8, m11, q3120 + vpblendd m6, m8, m9, 0xCC + mova [dstq+strideq*0+16], xm6 + vextracti128 [dstq+strideq*1+16], m6, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + vpermq m8, m9, q3120 + vextracti128 xm0, m8, 1 ; 4321 ____ + pshufd xm11, xm11, q1032 + vpblendd xm0, xm11, 0x02 ; 4321 0___ + psrldq xm6, xm8, 2 + psrldq xm7, xm8, 12 + pblendw xm0, xm6, 0x4 ; 4321 05__ + pblendw xm0, xm7, 0x2 ; 4321 056_ + FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 + vpermq m12, m13, q1302 + vpblendd m12, m10, 0xCC + vpblendd m9, m6, 0xCC + mova [dstq+strideq*0+ 0], xm12 + mova [dstq+strideq*0+16], xm9 + vextracti128 [dstq+strideq*1+ 0], m12, 1 + vextracti128 [dstq+strideq*1+16], m9, 1 + ret +ALIGN function_align +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK 64, 16 + vpbroadcastw m15, r8m ; bitdepth_max + sub hd, 2 + lea r3, [dstq+32] + lea r5d, [hd*2+20] + call .w16_main + mov dstq, r3 + lea tlq, [tlq+r5+32] + sub r5d, 20 + shr r5d, 1 + sub r5d, 2 + lea r4, [dstq+strideq*2-2] +DEFINE_ARGS dst, stride, tl, stride3, left, h + lea stride3q, [strideq*3] 
+ movu m8, [tlq-6] ; 4321 0___ + mova xm10, [base+filter_shuf2] + pinsrw xm0, xm8, [dstq+strideq*0-2], 2 + pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ + pinsrw xm9, [leftq+strideq*0], 5 + pinsrw xm9, [leftq+strideq*1], 4 + FILTER_1BLK 13, 0, 6, 10, 15 + vpermq m12, m13, q3120 + mova xm14, [base+filter_shuf3] + vinserti128 m14, [base+filter_shuf1], 1 + psrldq m6, m12, 8 + punpcklwd m7, m6, m12 + vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321 + vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 + vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 + vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321 + FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 + vpblendd m13, m12, 0xCC + pinsrw xm9, [leftq+strideq*2], 3 + pinsrw xm9, [leftq+stride3q ], 2 + lea leftq, [leftq+strideq*4] + pinsrw xm9, [leftq+strideq*0], 1 + pinsrw xm9, [leftq+strideq*1], 0 + movq [rsp+32], xm9 + mov r7d, 1 + pslldq m8, m9, 4 + vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ + vpermq m12, m12, q2031 ; 6___ 5___ + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 12 + vpblendd xm6, xm7, 0x01 ; ____ _56_ + pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ + FILTER_1BLK 11, 6, 7, 10, 15 + vpermq m11, m11, q3120 + pshufd m9, m11, q1032 + vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ + pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ + pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ + vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 + vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 + jmp .w32_loop_start +.w32_loop_last: + mova m0, [rsp+0] + jmp .w32_loop +.w32_loop_left: + mova m0, [rsp+0] + vpblendd m0, [rsp+32+r7*4-12], 0x0C + dec r7d + jg .w32_loop + cmp hd, 2 + je .w32_loop + pinsrw xm6, [rsp+32], 6 + pinsrw xm6, [leftq+strideq*2], 5 + pinsrw xm6, [leftq+stride3q ], 4 + lea leftq, [leftq+strideq*4] + pinsrw xm6, [leftq+strideq*0], 3 + pinsrw xm6, [leftq+strideq*1], 2 + pinsrw xm6, [leftq+strideq*2], 1 + pinsrw xm6, [leftq+stride3q ], 0 + lea leftq, [leftq+strideq*4] + movu [rsp+36], xm6 + pinsrw xm6, [leftq+strideq*0], 1 + pinsrw xm6, [leftq+strideq*1], 0 + movd [rsp+32], xm6 + mov r7d, 4 +.w32_loop: + vpermq m13, m13, q3322 + vpermq m11, m9, q2020 + vpermq m9, m9, q1302 + vpermq m6, m12, q0123 + psrldq m7, 4 + vpblendd m13, m10, 0xCC + vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 +.w32_loop_start: + mova m13, m12 + psrldq m7, m12, 8 + punpcklwd m7, m12 + vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 + vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 + vpermq m12, m10, q2031 + mova [rsp+0], m0 + psrldq m8, m11, 8 + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 10 + psrldq xm0, xm13, 2 + punpcklwd m8, m11 + punpcklwd xm7, xm6 + vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 + vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 + vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 + call .main + vpermq m8, m11, q3120 + vpblendd m6, m8, m9, 0xCC + mova [dstq+strideq*0+16], xm6 + vextracti128 [dstq+strideq*1+16], m6, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop_left + jz .w32_loop_last + vpermq m8, m9, q3120 + vextracti128 xm0, m8, 1 ; 4321 ____ + pshufd xm11, xm11, q1032 + vpblendd xm0, xm11, 0x02 ; 4321 0___ + psrldq xm6, xm8, 2 + psrldq xm7, xm8, 12 + pblendw xm0, xm6, 0x4 ; 4321 05__ + pblendw xm0, xm7, 0x2 ; 4321 056_ + FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 + vpermq m12, m13, q1302 + vpblendd m12, m10, 0xCC + vpblendd m9, m6, 0xCC + mova [dstq+strideq*0+ 0], xm12 + mova [dstq+strideq*0+16], xm9 + 
vextracti128 [dstq+strideq*1+ 0], m12, 1 + vextracti128 [dstq+strideq*1+16], m9, 1 + RET +.main: + FILTER_2BLK 9, 8, 6, 7, 0, 14, 15 + ret + %if WIN64 DECLARE_REG_TMP 5 %else From e195d572841fabf330e7b324b5ce24053cb1beb3 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:04:28 +0200 Subject: [PATCH 054/188] x86: Add high bitdepth ipred_cfl_ac_420 AVX2 asm --- src/x86/ipred16_avx2.asm | 182 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index 770e0d1129..ebc9f64a06 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -62,6 +62,7 @@ filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 +pw_2: times 2 dw 2 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 @@ -93,6 +94,7 @@ JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 cextern filter_intra_taps @@ -1794,4 +1796,184 @@ cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn acq, acmp jmp wq +cglobal ipred_cfl_ac_420_16bpc, 4, 9, 6, ac, ypx, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pw_2] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 +DEFINE_ARGS ac, ypx, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + mova xm0, [ypxq+strideq*2] + mova xm1, [ypxq+stride3q ] + vinserti128 m0, [ypxq+strideq*0], 1 + vinserti128 m1, [ypxq+strideq*1], 1 + pmaddwd m0, m2 + pmaddwd m1, m2 + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd m4, m0 + packssdw xm1, xm0 + mova [acq], xm1 + lea ypxq, [ypxq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m1, m1, q1111 + pslld xm0, 2 +.w4_hpad_loop: + mova [acq], m1 + paddd m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg +.w8: + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + pmaddwd m0, m2, [ypxq+strideq*0] + pmaddwd m1, m2, [ypxq+strideq*1] + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd m4, m0 + packssdw xm1, xm0, xm1 + mova [acq], xm1 + lea ypxq, [ypxq+strideq*2] + add acq, 16 + dec hd + jg .w8_loop + jmp .w8_hpad +.w8_wpad: + pmaddwd xm0, xm2, [ypxq+strideq*0] + pmaddwd xm3, xm2, [ypxq+strideq*1] + paddd xm0, xm3 + pshufd xm3, xm0, q3333 + packssdw xm1, xm0, xm3 + paddd xm0, xm3 + paddd xm4, xm0 + mova [acq], xm1 + lea ypxq, [ypxq+strideq*2] + add acq, 16 + dec hd + jg .w8_wpad +.w8_hpad: + test hpadd, hpadd + jz .calc_avg + vinserti128 m1, xm1, 1 + paddd m0, m0 +.w8_hpad_loop: + paddd m4, m0 + mova [acq], m1 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmaddwd m0, m2, [ypxq+strideq*0+ 0] + pmaddwd m1, m2, [ypxq+strideq*1+ 0] + pmaddwd m3, m2, [ypxq+strideq*0+32] + pmaddwd m5, m2, [ypxq+strideq*1+32] + paddd m0, m1 + paddd m3, m5 + packssdw m1, m0, m3 + paddd m0, m3 + vpermq m1, m1, q3120 + paddd m4, m0 + mova [acq], m1 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + jmp .w16_hpad +.w16_wpad: 
+DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_420_16bpc_avx2_table] + mov wpadd, wpadd + movsxd wpadq, [iptrq+wpadq*4+4] + add iptrq, wpadq + jmp iptrq +.w16_wpad_pad3: + vpbroadcastd m3, [ypxq+strideq*0+12] + vpbroadcastd m5, [ypxq+strideq*1+12] + vinserti128 m0, m3, [ypxq+strideq*0], 0 + vinserti128 m1, m5, [ypxq+strideq*1], 0 + jmp .w16_wpad_end +.w16_wpad_pad2: + mova m0, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m3, [ypxq+strideq*0+28] + vpbroadcastd m5, [ypxq+strideq*1+28] + jmp .w16_wpad_end +.w16_wpad_pad1: + mova m0, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m3, [ypxq+strideq*0+44] + vpbroadcastd m5, [ypxq+strideq*1+44] + vinserti128 m3, [ypxq+strideq*0+32], 0 + vinserti128 m5, [ypxq+strideq*1+32], 0 +.w16_wpad_end: + pmaddwd m0, m2 + pmaddwd m1, m2 + pmaddwd m3, m2 + pmaddwd m5, m2 + paddd m0, m1 + paddd m3, m5 + packssdw m1, m0, m3 + paddd m0, m3 + vpermq m1, m1, q3120 + paddd m4, m0 + mova [acq], m1 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + dec hd + jz .w16_hpad + jmp iptrq +.w16_hpad: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m1 + paddd m4, m0 + add acq, 32 + dec hpadd + jg .w16_hpad_loop +.calc_avg: + vextracti128 xm0, m4, 1 + tzcnt r1d, szd + movd xm3, szd + paddd xm0, xm4 + movd xm2, r1d + punpckhqdq xm1, xm0, xm0 + psrld xm3, 1 + paddd xm0, xm1 + pshuflw xm1, xm0, q1032 + paddd xm0, xm3 + paddd xm0, xm1 + psrld xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + %endif From 1074250d2068b6fdae64b16ae0c970a2b8a5c40b Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:04:30 +0200 Subject: [PATCH 055/188] x86: Add high bitdepth ipred_cfl_ac_422 AVX2 asm --- src/x86/ipred16_avx2.asm | 191 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index ebc9f64a06..aaac24d031 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -63,6 +63,7 @@ filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pw_2: times 2 dw 2 +pw_4: times 2 dw 4 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 @@ -95,6 +96,7 @@ JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 +JMP_TABLE ipred_cfl_ac_422_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 cextern filter_intra_taps @@ -1976,4 +1978,193 @@ DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak jg .sub_loop RET +cglobal ipred_cfl_ac_422_16bpc, 4, 9, 6, ac, ypx, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pw_4] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 +DEFINE_ARGS ac, ypx, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + mova xm0, [ypxq+strideq*0] + mova xm1, [ypxq+strideq*1] + vinserti128 m0, [ypxq+strideq*2], 1 + vinserti128 m1, [ypxq+stride3q ], 1 + pmaddwd m0, m2 + pmaddwd m1, m2 + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + mova [acq], m0 + lea ypxq, [ypxq+strideq*4] + add acq, 32 + sub hd, 4 + jg 
.w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q3333 + vextracti128 xm1, m1, 1 + pslld xm1, 2 +.w4_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg +.w8: + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + pmaddwd m0, m2, [ypxq+strideq*0] + pmaddwd m1, m2, [ypxq+strideq*1] + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + vpermq m0, m0, q3120 + mova [acq], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_loop + jmp .w8_hpad +.w8_wpad: + vpbroadcastd m0, [ypxq+strideq*0+12] + vpbroadcastd m1, [ypxq+strideq*1+12] + vinserti128 m0, [ypxq+strideq*0+ 0], 0 + vinserti128 m1, [ypxq+strideq*1+ 0], 0 + pmaddwd m0, m2 + pmaddwd m1, m2 + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + vpermq m0, m0, q3120 + mova [acq], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad +.w8_hpad: + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q3232 + paddd m1, m1 +.w8_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmaddwd m3, m2, [ypxq+strideq*0+ 0] + pmaddwd m0, m2, [ypxq+strideq*0+32] + pmaddwd m1, m2, [ypxq+strideq*1+ 0] + pmaddwd m5, m2, [ypxq+strideq*1+32] + paddd m4, m3 + packssdw m3, m0 + paddd m4, m0 + packssdw m0, m1, m5 + paddd m1, m5 + paddd m4, m1 + vpermq m3, m3, q3120 + vpermq m0, m0, q3120 + mova [acq+ 0], m3 + mova [acq+32], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + jmp .w16_hpad +.w16_wpad: +DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_422_16bpc_avx2_table] + mov wpadd, wpadd + movsxd wpadq, [iptrq+wpadq*4+4] + add iptrq, wpadq + jmp iptrq +.w16_wpad_pad3: + vpbroadcastd m0, [ypxq+strideq*0+12] + vpbroadcastd m3, [ypxq+strideq*1+12] + vinserti128 m5, m0, [ypxq+strideq*0], 0 + vinserti128 m1, m3, [ypxq+strideq*1], 0 + jmp .w16_wpad_end +.w16_wpad_pad2: + mova m5, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m0, [ypxq+strideq*0+28] + vpbroadcastd m3, [ypxq+strideq*1+28] + jmp .w16_wpad_end +.w16_wpad_pad1: + mova m5, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m0, [ypxq+strideq*0+44] + vpbroadcastd m3, [ypxq+strideq*1+44] + vinserti128 m0, [ypxq+strideq*0+32], 0 + vinserti128 m3, [ypxq+strideq*1+32], 0 +.w16_wpad_end: + pmaddwd m5, m2 + pmaddwd m1, m2 + pmaddwd m0, m2 + pmaddwd m3, m2 + paddd m4, m5 + packssdw m5, m0 + paddd m4, m0 + packssdw m0, m1, m3 + paddd m1, m3 + paddd m4, m1 + vpermq m5, m5, q3120 + vpermq m0, m0, q3120 + mova [acq+ 0], m5 + mova [acq+32], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 64 + sub hd, 2 + jz .w16_hpad + jmp iptrq +.w16_hpad: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + dec hpadd + jg .w16_hpad_loop +.calc_avg: + vextracti128 xm0, m4, 1 + tzcnt r1d, szd + movd xm2, r1d + paddd xm0, xm4 + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrld xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrld xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + %endif From a2a1a5681d65ba526b1c0fe1fc411ba4c8b0960b Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 4 May 2021 14:04:32 +0200 Subject: [PATCH 056/188] x86: Add high bitdepth pal_pred AVX2 asm --- src/x86/ipred16_avx2.asm | 106 +++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 106 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index aaac24d031..0314b7c10d 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -61,6 +61,7 @@ ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 pw_2: times 2 dw 2 pw_4: times 2 dw 4 @@ -97,6 +98,7 @@ JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 JMP_TABLE ipred_cfl_ac_422_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 +JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 cextern filter_intra_taps @@ -2167,4 +2169,108 @@ DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak jg .sub_loop RET +cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h + vbroadcasti128 m3, [palq] + lea r2, [pal_pred_16bpc_avx2_table] + tzcnt wd, wm + vbroadcasti128 m4, [pal_pred_shuf] + movifnidn hd, hm + movsxd wq, [r2+wq*4] + pshufb m3, m4 + punpckhqdq m4, m3, m3 + add wq, r2 +DEFINE_ARGS dst, stride, stride3, idx, w, h + lea stride3q, [strideq*3] + jmp wq +.w4: + mova xm2, [idxq] + add idxq, 16 + pshufb xm1, xm3, xm2 + pshufb xm2, xm4, xm2 + punpcklbw xm0, xm1, xm2 + punpckhbw xm1, xm2 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+strideq*1], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + movu m2, [idxq] ; only 16-byte alignment + add idxq, 32 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*2], m0, 1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+32], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+32], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+ 0], m0 + mova [dstq+32], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+64], m0 + mova [dstq+96], m1 + add dstq, strideq + dec hd + jg .w64 + RET + %endif From 53f424dccef3ff5cc5791de81bb5dfb889b79ee3 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:04:33 +0200 Subject: [PATCH 057/188] x86: Add high bitdepth ipred_z1 AVX2 asm --- src/x86/ipred16_avx2.asm | 761 
++++++++++++++++++++++++++++++++++++++- 1 file changed, 760 insertions(+), 1 deletion(-) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index 0314b7c10d..a38823c539 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -62,9 +62,21 @@ filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +pw_m1024: times 2 dw -1024 +z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +z_filter_k: dw 4, 4, 5, 5, 4, 4, 8, 8, 6, 6, 4, 4 + +%define pw_4 z_filter_k pw_2: times 2 dw 2 -pw_4: times 2 dw 4 +pw_3: times 2 dw 3 +pw_62: times 2 dw 62 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 @@ -92,6 +104,7 @@ JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 @@ -100,6 +113,7 @@ JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_p JMP_TABLE ipred_cfl_ac_422_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 +cextern dr_intra_derivative cextern filter_intra_taps SECTION .text @@ -1114,6 +1128,751 @@ ALIGN function_align jz .w64_loop_x RET +cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z1_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea r7, [dr_intra_derivative] + movsxd wq, [r6+wq*4] + add tlq, 2 + add wq, r6 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [r7+dxq] + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m5, [pw_62] + jmp wq +.w4: + ALLOC_STACK -64, 7 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + vpbroadcastw xm3, [tlq+14] + movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 + paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + add dxd, dxd + palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 + paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d + psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 + psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 + pxor xm4, xm4 + paddw xm2, xm0 + vpbroadcastw xm0, r8m ; pixel_max + mova [rsp+32], xm3 + movd xm3, dxd + pmaxsw xm2, xm4 + mov r3d, dxd + pavgw xm2, xm4 + vpbroadcastw m3, xm3 + pminsw xm2, xm0 + punpcklwd xm0, xm1, xm2 + punpckhwd xm1, xm2 + lea r5, [strideq*3] + pslldq m2, m3, 8 + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + paddw m3, m2 + vpblendd m4, m6, 0xf0 + paddw m6, m6 + paddw m3, 
m4 ; xpos0 xpos1 xpos2 xpos3 + vbroadcasti128 m4, [z_upsample] +.w4_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + movu xm2, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [rsp+r3*2], 1 ; 0 2 + lea r3d, [r2+dxq] + shr r2d, 6 ; base3 + vinserti128 m2, [rsp+r2*2], 1 ; 1 3 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 ; frac + psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 + psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) + pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) + paddw m3, m6 ; xpos += dx + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; w4/w8/w16 +%define base r3-z_filter_t0 + movd xm0, maxbased + lea r3, [z_filter_t0] + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + mova xm2, [r3+angleq*8] + pand m0, m1 + pcmpgtb m0, m2 + pmovmskb r5d, m0 + ret +.w4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea maxbased, [hq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastw xm3, [tlq+14] + mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 + pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + paddw xm2, xm0 + pmullw xm2, xm4 + movd [rsp+16], xm3 + cmp r5d, 3 + jne .w4_3tap + paddw xm1, xm2 + palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 + pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 + movzx r3d, word [tlq+14] + movzx r2d, word [tlq+12] + inc maxbased + paddw xm2, xm0 + sub r2d, r3d + paddw xm2, xm2 + lea r2d, [r2+r3*8+4] + shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 + mov [rsp+16], r2w +.w4_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + mov tlq, rsp + psrlw xm1, 3 + cmp hd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [tlq], xm0 +.w4_main: + movd xm3, dxd + vpbroadcastq m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd ; xpos + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 ; -max_base_x + vpblendd m3, m4, 0xcc + paddw m0, m4, m3 + vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + paddw m4, m4 + paddw m3, m1 +.w4_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [tlq+r3*2] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [tlq+r3*2], 1 ; 0 2 + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 ; 1 3 + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + vpblendd m1, m2, 0xcc + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; xpos < max_base_x + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w4_loop + lea r6, [strideq*3] +.w4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + 
movq [dstq+strideq*2], xm6 + movq [dstq+r6 ], xm6 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_end_loop +.w4_end: + RET +.w8: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 7 + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ + movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ + movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 4 + jne .w8_upsample_h8 ; awkward single-pixel edge case + vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ +.w8_upsample_h8: + paddw m2, m1 + paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + add dxd, dxd + psubw m0, m2, m0 + psraw m0, 3 + pxor m4, m4 + paddw m2, m0 + vpbroadcastw m0, r8m + movd xm3, dxd + pmaxsw m2, m4 + mov r3d, dxd + pavgw m2, m4 + vpbroadcastw m3, xm3 + pminsw m2, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vbroadcasti128 m4, [z_upsample] + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + vextracti128 [rsp+32], m0, 1 + vextracti128 [rsp+48], m1, 1 + vpblendd m3, m6, 0xf0 ; xpos0 xpos1 +.w8_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + movu xm2, [rsp+r3*2+16] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + vinserti128 m1, [rsp+r2*2], 1 + vinserti128 m2, [rsp+r2*2+16], 1 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m6 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_upsample_loop + RET +.w8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(h+7, 15) + jmp .w8_main +.w8_no_upsample: + lea maxbased, [hq+7] + test angled, 0x400 + jnz .w8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w8_main + popcnt r5d, r5d + vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m2 + cmp hd, 8 + jl .w8_filter_h4 + punpckhwd m2, m2 + vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + je .w8_filter_end ; 8x4 and 8x8 are always 3-tap + movzx r3d, word [tlq+30] + mov maxbased, 16 + mov [rsp+32], r3d + cmp r5d, 3 + jne .w8_filter_end + punpcklwd xm6, xm0, xm0 + vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g + vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq+28] + mov [rsp+34], r3w + paddw m2, m6 + sub r5d, r3d + inc maxbased + paddw m2, m2 + lea r3d, [r5+r3*8+4] + paddw m1, m2 + shr r3d, 3 + mov [rsp+32], r3w + jmp .w8_filter_end +.w8_filter_h4: + pshuflw m3, m2, q3321 + vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ +.w8_filter_end: + paddw m0, m3 + pmullw m0, m4 + mov tlq, rsp + pxor m2, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + mova [tlq], m0 +.w8_main: + movd xm3, dxd + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 + vpblendd m3, m4, 0xf0 ; xpos0 xpos1 + paddw m3, m1 +.w8_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu xm0, [tlq+r3*2] + movu xm1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + vinserti128 m0, [tlq+r5*2], 1 + vinserti128 m1, [tlq+r5*2+2], 1 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 
+ paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w8_loop +.w8_end_loop: + mova [dstq+strideq*0], xm6 + mova [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_end_loop +.w8_end: + RET +.w16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(h+15, 31) + jmp .w16_main +.w16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 7 + lea maxbased, [hq+15] + test angled, 0x400 + jnz .w16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w16_main + popcnt r5d, r5d + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + cmp r5d, 3 + jne .w16_filter_3tap + vpbroadcastd m2, [base+pw_3] + punpcklwd xm0, xm0 + vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m0, m2 + pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m0, m1 + psrlw m0, 2 + movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 8 + jl .w16_filter_5tap_h4 + punpckhwd m3, m3 + je .w16_filter_5tap_h8 + vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r3d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m4 + sub r2d, r3d + paddw m1, m3 + lea r2d, [r2+r3*8+4] + paddw m1, m2 + shr r2d, 3 + psrlw m1, 2 + mov [rsp+66], r3w + mov [rsp+64], r2w + mov tlq, rsp + mov r3d, 33 + cmp hd, 16 + cmovg maxbased, r3d + jmp .w16_filter_end2 +.w16_filter_5tap_h8: + vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_5tap_h4: + pshuflw xm4, xm3, q3332 ; 4 5 5 5 + pshuflw xm3, xm3, q3321 ; 3 4 5 5 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_3tap: + vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m4 + pmullw m3, m2 + paddw m0, m1 + cmp hd, 8 + je .w16_filter_3tap_h8 + jl .w16_filter_3tap_h4 + punpckhwd m2, m2 + vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + jmp .w16_filter_end +.w16_filter_3tap_h4: + pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ + jmp .w16_filter_end +.w16_filter_3tap_h8: + psrldq xm2, 2 + pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 +.w16_filter_end: + paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m2, m4 + psrlw m0, 3 + pxor m1, m1 + paddw m2, m3 + psrlw m2, 3 + pavgw m0, m1 + pavgw m1, m2 +.w16_filter_end2: + mov tlq, rsp + mova [tlq+ 0], m0 + mova [tlq+32], m1 +.w16_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w16_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m1, m0 + movu m0, [tlq+r5*2] + vpblendvb m2, m6, m1, m2 + movu m1, [tlq+r5*2+2] + mova 
[dstq+strideq*0], m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*1], m0 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w16_loop +.w16_end_loop: + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + RET +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 8 + lea maxbased, [hq+31] + mov r3d, 63 + cmp hd, 32 + cmova maxbased, r3d + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r3], m0 +.w32_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w32_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + jl .w32_filter_h8 + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r5d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r3+32], m0 + mov [r3+66], r5w + mov [r3+64], r2w + mov tlq, rsp + mov r3d, 65 + cmp hd, 64 + cmove maxbased, r3d + jmp .w32_main +.w32_filter_h8: + vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm3 + paddw xm0, xm1 + mov tlq, rsp + paddw xm0, xm2 + psrlw xm0, 2 + mova [r3+32], xm0 +.w32_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w32_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m7, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*1], m0 + dec hd + jz .w32_end + add dstq, strideq + cmp r5d, maxbased + jb .w32_loop +.w32_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [hq+63] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [hq+32] + psrlw m0, 
2 + mova [r3], m0 +.w64_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w64_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + mov tlq, rsp + psrlw m0, 2 + mova [r3+32], m0 +.w64_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + paddw m8, m7, m7 ; -32 * 64 + psubw m3, m0 + paddw m9, m8, m7 ; -48 * 64 +.w64_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*1], m0 + movu m0, [tlq+r3*2+64] + movu m1, [tlq+r3*2+66] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*2], m0 + movu m0, [tlq+r3*2+96] + movu m1, [tlq+r3*2+98] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m9, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*3], m0 + dec hd + jz .w64_end + add dstq, strideq + cmp r5d, maxbased + jb .w64_loop +.w64_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + mova [dstq+32*2], m6 + mova [dstq+32*3], m6 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + %macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax %ifnum %4 pshufb xm%2, xm%4 From 00a6a96afca889d9160dd9a740c0a0f7dcf78b13 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:04:34 +0200 Subject: [PATCH 058/188] x86: Add high bitdepth ipred_z3 AVX2 asm --- src/x86/ipred16_avx2.asm | 1046 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1046 insertions(+) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index a38823c539..995d693f66 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -105,6 +105,7 @@ JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 @@ -1873,6 +1874,1051 @@ ALIGN function_align .w64_end: RET +cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_16bpc_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + sub tlq, 2 + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m5, [pw_62] + mov org_wd, wd + jmp hq +.h4: + ALLOC_STACK -64, 7 + lea 
r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + pshufd xm3, xm1, q0000 + paddw xm1, xm2 + paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastw xm4, r8m ; pixel_max + add dyd, dyd + psubw xm0, xm1, xm0 + mova [rsp+ 0], xm3 + movd xm3, dyd + psraw xm0, 3 + neg dyd + paddw xm1, xm0 + pxor xm0, xm0 + lea r2d, [dyq+(16<<6)+63] ; ypos + pmaxsw xm1, xm0 + pavgw xm1, xm0 + vpbroadcastw m3, xm3 + pminsw xm1, xm4 + punpckhwd xm0, xm1, xm2 + punpcklwd xm1, xm2 + paddw m2, m3, m3 + mova [rsp+32], xm0 + punpcklwd m3, m2 + mova [rsp+16], xm1 + paddw m4, m2, m2 + paddw m2, m3 + vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 + movu xm2, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 + vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r4+dyq] + shr r4d, 6 + vinserti128 m2, [rsp+r4*2], 1 + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m2, 16 + pblendw m1, m2, 0xaa + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m4 + paddw m1, m0 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + add dstq, 8 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + pand m0, m1 + mova xm1, [r4+angleq*8] + pcmpgtb m0, m1 + pmovmskb r5d, m0 + ret +.h4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + pmullw xm2, xm0 + pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm1, xm0, xm3 + movd [rsp+12], xm0 + pmullw xm1, xm4 + cmp r5d, 3 + jne .h4_filter_3tap + pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 + vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + movzx r4d, word [tlq-14] + movzx r2d, word [tlq-12] + inc maxbased + paddw xm1, xm2 + paddw xm0, xm3 + sub r2d, r4d + paddw xm2, xm0, xm0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+14], r2w +.h4_filter_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + lea tlq, [rsp+30] + psrlw xm1, 3 + cmp wd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [rsp+16], xm0 +.h4_main: + movd xm3, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + lea r4d, [maxbaseq+3*64] + neg dyq + movd xm2, r4d + sub tlq, 8 + lea r4, [dyq+63] ; ypos + punpcklwd m1, m1 + paddw m0, m3, m3 + vpbroadcastw m2, xm2 + punpcklwd m3, m0 + paddw m4, m0, m0 + paddw m0, m3 + psubw m2, m1 + vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 + or maxbased, 63 + paddw m3, m2 +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + 
movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + vinserti128 m1, [tlq+r4*2], 1 + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + pand m2, m5, m3 + palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; ypos < max_base_y + paddw m3, m4 + paddw m1, m0 + vpblendvb m1, m6, m1, m2 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + sub wd, 4 + jz .h4_end + add dstq, 8 + cmp r4d, maxbased + jg .h4_loop +.h4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r7 ], xm6 + add dstq, 8 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +.h8: + lea r4d, [angleq+216] + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 8 + mov r4b, wb + lea r7, [strideq*3] + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e + movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d + cmp wd, 8 + je .h8_upsample_w8 + pshufhw xm3, xm2, q1000 + vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d +.h8_upsample_w8: + paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastw m4, r8m ; pixel_max + add dyd, dyd + psubw m0, m1, m0 + movd xm6, dyd + psraw m0, 3 + neg dyd + paddw m1, m0 + pxor m0, m0 + pmaxsw m1, m0 + lea r4d, [dyq+(16<<6)+63] ; ypos + pavgw m1, m0 + vpbroadcastw m6, xm6 + pminsw m1, m4 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + vextracti128 [rsp+48], m0, 1 + vextracti128 [rsp+32], m1, 1 + paddw m7, m6, m6 + mova [rsp+16], xm0 + mova [rsp+ 0], xm1 + punpcklwd m6, m7 ; ypos0 ypos1 +.h8_upsample_loop: + lea r2d, [r4+dyq] + shr r4d, 6 ; base0 + movu m1, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base1 + movu m2, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base2 + movu m3, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base3 + movu m4, [rsp+r2*2] + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 + pslld m2, 16 + pblendw m1, m2, 0xaa + psrld m2, m3, 16 + pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m4, 16 + pblendw m3, m4, 0xaa + pand m4, m5, m6 + paddw m6, m7 + psllw m4, 9 + psubw m1, m0 + pmulhrsw m1, m4 + pand m4, m5, m6 + psllw m4, 9 + psubw m3, m2 + pmulhrsw m3, m4 + paddw m6, m7 + lea r2, [dstq+strideq*4] + paddw m1, m0 + paddw m3, m2 + punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + movhps [r2 +strideq*0], xm0 + movq [r2 +strideq*1], xm0 + movhps [r2 +strideq*2], xm1 + movq [r2 +r7 ], xm1 + movhps [dstq+strideq*0], xm2 + movq [dstq+strideq*1], xm2 + movhps [dstq+strideq*2], xm3 + movq [dstq+r7 ], xm3 + add dstq, 8 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main + popcnt r5d, r5d + mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu 
m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m0 + cmp wd, 8 + jl .h8_filter_w4 + punpcklwd xm0, xm0 + vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movd [rsp+28], xm0 + paddw m1, m3 + mov r4d, 16 + pmullw m1, m4 + cmovg maxbased, r4d + cmp r5d, 3 + jne .h8_filter_3tap + punpckhwd m3, m3 + vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + movzx r4d, word [tlq-30] + movzx r2d, word [tlq-28] + inc maxbased + paddw m1, m2 + paddw m0, m3 + sub r2d, r4d + paddw m2, m0, m0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+30], r2w + jmp .h8_filter_3tap +.h8_filter_w4: + pshufhw xm1, xm0, q2100 + vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e + paddw m1, m3 + pmullw m1, m4 +.h8_filter_3tap: + pxor m0, m0 + paddw m1, m2 + lea tlq, [rsp+62] + psrlw m1, 3 + pavgw m0, m1 + mova [rsp+32], m0 +.h8_main: + movd xm4, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+7*64] + neg dyq + movd xm2, r4d + sub tlq, 16 + lea r4, [dyq+63] + paddw m6, m4, m4 + vpbroadcastw m2, xm2 + vpblendd m4, m6, 0xf0 ; ypos0 ypos1 + psubw m2, m1 + or maxbased, 63 + paddw m4, m2 +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm0, [tlq+r4*2+2] + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vinserti128 m0, [tlq+r5*2+2], 1 + vinserti128 m1, [tlq+r5*2], 1 + lea r5, [r4+dyq] + sar r4, 6 ; base2 + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + psraw m3, m4, 15 + paddw m4, m6 + paddw m0, m1 + movu xm1, [tlq+r4*2+2] + movu xm2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m7, m0, m3 + vinserti128 m1, [tlq+r5*2+2], 1 + vinserti128 m2, [tlq+r5*2], 1 + pand m3, m5, m4 + psllw m3, 9 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + lea r5, [dstq+strideq*4] + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 + vextracti128 xm3, m2, 1 + punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c5 b7 d7 b6 d6 b5 d5 b4 d4 + punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm3, m0, 1 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 + punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h8_end + add dstq, 8 + cmp r4d, maxbased + jg .h8_loop + lea r6, [strideq*5] + lea r2, [strideq+r7*2] ; stride*7 + test wd, 4 + jz .h8_end_loop + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r7 ], xm7 + movq [dstq+strideq*4], xm7 + movq [dstq+r6 ], xm7 + movq [dstq+r7*2 ], xm7 + movq [dstq+r2 ], xm7 + add dstq, 8 + sub wd, 4 + jz .h8_end +.h8_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r7 ], xm7 + mova [dstq+strideq*4], xm7 + mova [dstq+r6 ], xm7 + mova [dstq+r7*2 ], xm7 + mova [dstq+r2 ], xm7 + add dstq, 16 + sub wd, 8 + jg .h8_end_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset 
org_stack_offset + ALLOC_STACK -96, 10 + lea maxbased, [wq+15] + lea r7, [strideq*3] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pmullw m1, m7 + paddw m1, m2 + cmp wd, 8 + jg .h16_filter_w16 + mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 + pmullw xm6, xm3 + jl .h16_filter_w4 + pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 +.h16_filter_w8_5tap: + punpckhwd m0, m0 + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw xm4, xm4 + paddw m0, m0 + paddw xm6, xm4 + paddw m1, m0 +.h16_filter_w8_3tap: + paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 + pmullw xm3, xm7 + pxor m0, m0 + paddw xm3, xm6 + psrlw xm3, 3 + pavgw xm3, xm0 + mova [rsp+48], xm3 + jmp .h16_filter_end +.h16_filter_w4: + pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 + jmp .h16_filter_w8_5tap +.h16_filter_w16: + mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m6, m3 + punpcklwd xm3, xm3 + vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + mov r4d, 32 + cmp wd, 16 + cmovg maxbased, r4d + movd [rsp+28], xm3 + pmullw m4, m7 + cmp r5d, 3 + jne .h16_filter_w16_3tap + punpckhwd m0, m0 + vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movzx r4d, word [tlq-62] + movzx r2d, word [tlq-60] + or maxbased, 1 + paddw m3, m3 + sub r2d, r4d + paddw m0, m0 + lea r2d, [r2+r4*8+4] + paddw m4, m3 + shr r2d, 3 + paddw m1, m0 + mov [rsp+30], r2w +.h16_filter_w16_3tap: + pxor m0, m0 + paddw m4, m6 + psrlw m4, 3 + pavgw m4, m0 + mova [rsp+32], m4 +.h16_filter_end: + psrlw m1, 3 + lea tlq, [rsp+94] + pavgw m1, m0 + mova [rsp+64], m1 +.h16_main: + movd xm8, dyd + neg maxbaseq + vpbroadcastw m9, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m8, xm8 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm7, r4d + sub tlq, 32 + lea r4, [dyq+63] + vpbroadcastw m7, xm7 + or maxbased, 63 + psubw m7, [z_base_inc] +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu m0, [tlq+r4*2+2] + movu m2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu m1, [tlq+r5*2+2] + movu m3, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base3 + pand m6, m5, m7 + psllw m6, 9 + psubw m2, m0 + pmulhrsw m2, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m0, m2 + movu m2, [tlq+r4*2+2] + movu m4, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m9, m0, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m3, m1 + pmulhrsw m3, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m1, m3 + vpblendvb m1, m9, m1, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m2 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m2, m4 + movu m3, [tlq+r5*2+2] + movu m4, [tlq+r5*2] + vpblendvb m2, m9, m2, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m3 + pmulhrsw m4, m6 + psraw m6, m7, 15 + 
paddw m7, m8 + lea r5, [dstq+strideq*4] + paddw m3, m4 + vpblendvb m3, m9, m3, m6 + punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 + punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 + vextracti128 xm6, m3, 1 + punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 + punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 + vextracti128 xm2, m4, 1 + movhps [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + vextracti128 xm6, m1, 1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + vextracti128 xm2, m0, 1 + movhps [r5 +strideq*0], xm6 + movq [r5 +strideq*1], xm6 + movhps [r5 +strideq*2], xm2 + movq [r5 +r7 ], xm2 + lea r5, [dstq+strideq*8] + movhps [r5 +strideq*0], xm3 + movq [r5 +strideq*1], xm3 + movhps [r5 +strideq*2], xm4 + movq [r5 +r7 ], xm4 + lea r5, [r5+strideq*4] + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h16_end + add dstq, 8 + cmp r4d, maxbased + jg .h16_loop + mov hd, 4 +.h16_end_loop0: + mov r6d, wd + mov r2, dstq + test wb, 4 + jz .h16_end_loop + movq [dstq+strideq*0], xm9 + movq [dstq+strideq*1], xm9 + movq [dstq+strideq*2], xm9 + movq [dstq+r7 ], xm9 + and r6d, 120 + jz .h16_end_w4 + add dstq, 8 +.h16_end_loop: + mova [dstq+strideq*0], xm9 + mova [dstq+strideq*1], xm9 + mova [dstq+strideq*2], xm9 + mova [dstq+r7 ], xm9 + add dstq, 16 + sub r6d, 8 + jg .h16_end_loop +.h16_end_w4: + lea dstq, [r2+strideq*4] + dec hd + jg .h16_end_loop0 +.h16_end: + RET +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 9 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 ; imin(w+31, 63) + test angled, 0x400 + jnz .h32_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+128] + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r4], m0 +.h32_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h32_filter_loop + jl .h32_filter_h8 + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq-62] + movzx r2d, word [tlq-60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r4-32], m0 + mov [r4-36], r5w + mov [r4-34], r2w + lea tlq, [rsp+158] + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + jmp .h32_main +.h32_filter_h8: + mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 + vpblendd xm3, xm1, 
[tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 + lea tlq, [rsp+158] + pavgw xm2, xm3 + paddw xm0, xm1 + paddw xm0, xm2 + psrlw xm0, 2 + mova [r4-16], xm0 +.h32_main: + movd xm6, dyd + neg maxbaseq + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m6, xm6 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm4, r4d + vpbroadcastd m8, [pw_m1024] + lea r4, [dyq+63] + vpbroadcastw m4, xm4 + or maxbased, 63 + psubw m4, [z_base_inc] +.h32_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + pcmpgtw m2, m8, m4 + paddw m0, m1 + vpblendvb m0, m7, m0, m2 + movu m2, [tlq+r5*2-32] + movu m1, [tlq+r5*2-30] + add r4, dyq + sub rsp, 64 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + mova [rsp+32*0], m0 + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + mova [rsp+32*1], m1 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 64 + mova [rsp+32*0], m7 + mova [rsp+32*1], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + lea r3, [strideq*3] + lea r4, [strideq*5] + mov r8, dstq + lea r5, [strideq+r3*2] +.h32_transpose_loop0: + lea r6, [rsp+32] + lea r2, [r8+org_wq*2-16] +.h32_transpose_loop: + mova m0, [r6+64*7] + mova m1, [r6+64*6] + mova m2, [r6+64*5] + mova m3, [r6+64*4] + mova m4, [r6+64*3] + mova m5, [r6+64*2] + mova m6, [r6+64*1] + mova m7, [r6+64*0] + punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 + punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 + punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 + punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 + punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 + lea dstq, [r2+strideq*8] + sub r6, 32 + punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 + punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 + punpckhqdq m5, m7, m1 ; 8 0 + vextracti128 [r2 +strideq*0], m5, 1 + punpcklqdq m7, m1 ; 9 1 + mova [dstq+strideq*0], xm5 + punpckhqdq m1, m8, m3 ; 10 2 + vextracti128 [r2 +strideq*1], m7, 1 + punpcklqdq m8, m3 ; 11 3 + mova [dstq+strideq*1], xm7 + punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 + vextracti128 [r2 +strideq*2], m1, 1 + punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 + mova [dstq+strideq*2], xm1 + punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 + vextracti128 [r2 +r3 ], m8, 1 + punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 + mova [dstq+r3 ], xm8 + punpckhqdq m6, m3, m2 ; 12 4 + vextracti128 [r2 +strideq*4], m6, 1 + punpcklqdq m3, m2 ; 13 5 + mova [dstq+strideq*4], xm6 + punpckhqdq m2, m0, m4 ; 14 6 + vextracti128 [r2 +r4 ], m3, 1 + punpcklqdq m0, m4 ; 15 7 + mova [dstq+r4 ], xm3 + vextracti128 [r2 +r3*2 ], m2, 1 + mova [dstq+r3*2 ], xm2 + vextracti128 [r2 +r5 ], m0, 1 + mova [dstq+r5 ], xm0 + lea r2, [dstq+strideq*8] + cmp r6, rsp + jae .h32_transpose_loop + add rsp, 64*8 + sub org_wd, 8 + jg .h32_transpose_loop0 +.h32_end: + RET +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [wq+63] + test angled, 0x400 + jnz .h64_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, 
[rsp+224] + paddw m0, m1 + lea r5d, [wq+32] + psrlw m0, 2 + mova [r4], m0 +.h64_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h64_filter_loop + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + lea tlq, [rsp+254] + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + psrlw m0, 2 + mova [r4-32], m0 +.h64_main: + neg maxbaseq + movd xm4, dyd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + vpbroadcastd m7, [pw_m1024] + movd xm3, r4d + lea r4, [dyq+63] + paddw m8, m7, m7 + vpbroadcastw m3, xm3 + or maxbased, 63 + paddw m9, m8, m7 + psubw m3, [z_base_inc] +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-128] + movu m0, [tlq+r5*2-126] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + sub rsp, 128 + paddw m0, m1 + pcmpgtw m1, m9, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*0], m0 + movu m1, [tlq+r5*2-96] + movu m0, [tlq+r5*2-94] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*1], m0 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*2], m0 + movu m1, [tlq+r5*2-32] + movu m0, [tlq+r5*2-30] + psubw m1, m0 + pmulhrsw m1, m2 + add r4, dyq + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [rsp+32*3], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 128 + mova [rsp+32*0], m6 + mova [rsp+32*1], m6 + mova [rsp+32*2], m6 + mova [rsp+32*3], m6 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + mov r5, dstq + lea r4, [strideq+r2*2] +.h64_transpose_loop0: + lea r6, [rsp+112] + lea dstq, [r5+org_wq*2-32] +.h64_transpose_loop: + mova xm0, [r6+128*15] + vinserti128 m0, [r6+128* 7], 1 + mova xm1, [r6+128*14] + vinserti128 m1, [r6+128* 6], 1 + mova xm2, [r6+128*13] + vinserti128 m2, [r6+128* 5], 1 + mova xm3, [r6+128*12] + vinserti128 m3, [r6+128* 4], 1 + mova xm4, [r6+128*11] + vinserti128 m4, [r6+128* 3], 1 + mova xm5, [r6+128*10] + vinserti128 m5, [r6+128* 2], 1 + mova xm6, [r6+128* 9] + vinserti128 m6, [r6+128* 1], 1 + mova xm7, [r6+128* 8] + vinserti128 m7, [r6+128* 0], 1 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + sub r6, 16 + punpckhdq m7, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpckhqdq m5, m7, m1 + punpcklqdq m7, m1 + punpckhqdq m1, m8, m3 + punpcklqdq m8, m3 + punpckhdq m3, m0, m2 + mova [dstq+strideq*0], m5 + punpckldq m0, m2 + mova [dstq+strideq*1], m7 + punpckhdq m2, m4, m6 + mova [dstq+strideq*2], m1 + punpckldq m4, m6 + mova [dstq+r2 ], m8 + punpckhqdq m6, m3, m2 + mova [dstq+strideq*4], m6 + punpcklqdq m3, m2 + mova [dstq+r3 ], m3 + punpckhqdq m2, m0, m4 + mova [dstq+r2*2 ], m2 + punpcklqdq m0, m4 + mova [dstq+r4 ], m0 + lea dstq, [dstq+strideq*8] + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 128*16 + sub org_wd, 16 + jg 
.h64_transpose_loop0 +.h64_end: + RET + %macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax %ifnum %4 pshufb xm%2, xm%4 From fa133e0a53160e778d1d666034045317ca9d4cc9 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:04:35 +0200 Subject: [PATCH 059/188] x86: Add high bitdepth ipred_z2 AVX2 asm --- src/x86/ipred16_avx2.asm | 865 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 862 insertions(+), 3 deletions(-) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index 995d693f66..a1a7f6ba44 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -69,12 +69,24 @@ z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 pw_m1024: times 2 dw -1024 +pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4 +z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 +pb_90: times 4 db 90 +z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 -z_filter_k: dw 4, 4, 5, 5, 4, 4, 8, 8, 6, 6, 4, 4 +z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11 +z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9 +z_filter_k: dw 4, 4, 5, 5, 4, 4 + dw 8, 8, 6, 6, 4, 4 + dw 0, 0, 0, 0, 2, 2 -%define pw_4 z_filter_k +%define pw_2 (z_filter_k+32) +%define pw_4 (z_filter_k+ 0) +%define pw_16 (z2_ymul8 +20) -pw_2: times 2 dw 2 pw_3: times 2 dw 3 pw_62: times 2 dw 62 pw_512: times 2 dw 512 @@ -105,6 +117,7 @@ JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ @@ -1874,6 +1887,852 @@ ALIGN function_align .w64_end: RET +cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy +%define base r9-z_filter_t0 + lea r9, [ipred_z2_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea dxq, [dr_intra_derivative-90] + movsxd wq, [r9+wq*4] + mova m1, [tlq- 0] + movzx dyd, angleb + xor angled, 0x400 + mova m2, [tlq- 32] + mov r8, dxq + sub dxq, dyq + mova m3, [tlq- 64] + add wq, r9 + add r9, z_filter_t0-ipred_z2_16bpc_avx2_table + mova m4, [tlq- 96] + and dyd, ~1 + mova m5, [tlq-128] + and dxq, ~1 + movzx dyd, word [r8+dyq] ; angle - 90 + movzx dxd, word [dxq+270] ; 180 - angle + vpbroadcastd m11, [base+pw_62] + mova [rsp+128], m1 + mova [rsp+ 96], m2 + mova [rsp+ 64], m3 + neg dxd + mova [rsp+ 32], m4 + neg dyq + mova [rsp+ 0], m5 + jmp wq +.w4: + vbroadcasti128 m10, [base+z2_x_shuf] + vpbroadcastq m6, [base+z_base_inc+2] + lea r8d, [dxq+(65<<6)] ; xpos + mov r10d, (63-4)<<6 + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + movq xm0, [tlq+2] ; 1 2 3 4 + movq xm1, [tlq+0] ; 0 1 2 3 + pshuflw xm2, xm0, q3321 ; 2 3 4 4 + pshuflw xm3, xm1, q2100 ; 0 0 1 2 + 
vpbroadcastw xm4, r8m ; pixel_max + vbroadcasti128 m10, [base+z_upsample] + paddw xm1, xm0 + paddw xm2, xm3 + lea r8d, [r8+dxq+(1<<6)] + psubw xm2, xm1, xm2 + add dxd, dxd + psraw xm2, 3 + pxor xm3, xm3 + sub r10d, 3<<6 + paddw xm1, xm2 + paddw m6, m6 + pmaxsw xm1, xm3 + sub angled, 1075 ; angle - 53 + pavgw xm1, xm3 + lea r3d, [hq+3] + pminsw xm1, xm4 + xor angled, 0x7f ; 180 - angle + punpcklwd xm1, xm0 + movu [rsp+130], xm1 + call .filter_strength + jmp .w4_filter_left +ALIGN function_align +.filter_strength: + movd xm8, r3d + mov r3d, angled + movd xm7, angled + vpbroadcastb m8, xm8 + shr r3d, 8 ; is_sm << 1 + vpbroadcastb m7, xm7 + pcmpeqb m8, [base+z_filter_wh] + mova xm9, [r9+r3*8] + pand m0, m8, m7 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + ret +ALIGN function_align +.upsample_left: ; h4/h8 + mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 + movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 +%if STACK_ALIGNMENT < 32 + vpbroadcastw xm4, r8m ; pixel_max +%else + vpbroadcastw xm4, r9m ; r8m -> r9m due to call +%endif + cmp hd, 8 + je .upsample_left_h8 + pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 + pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 + jmp .upsample_left_end +.upsample_left_h8: + pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 + pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 +.upsample_left_end: + paddw xm1, xm0 + paddw xm2, xm3 + psubw xm2, xm1, xm2 + add dyq, dyq + psraw xm2, 3 + pxor xm3, xm3 + paddw xm1, xm2 + pmaxsw xm1, xm3 + pavgw xm1, xm3 + pminsw xm1, xm4 + punpcklwd xm2, xm0, xm1 + punpckhwd xm0, xm1 + mova [rsp+ 96+gprsize], xm2 + mova [rsp+112+gprsize], xm0 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] + psrldq xm0, xm1, 2 ; 1 2 3 4 + pshuflw xm2, xm1, q2100 ; 0 0 1 2 + pmullw xm4, xm0 + pshuflw xm3, xm0, q3321 ; 2 3 4 4 + paddw xm1, xm3 + pshuflw xm3, xm0, q3332 ; 3 4 4 4 + pmullw xm1, xm5 + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] + paddw xm2, xm3 + vpbroadcastd xm3, r6m ; max_width + pmullw xm2, xm5 + packssdw xm3, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm3, [base+pw_1to16] + pxor xm4, xm4 + psrlw xm1, 3 + pminsw xm3, xm11 ; clip to byte range since there's no variable word blend + pavgw xm1, xm4 + vpblendvb xm1, xm0, xm3 + movq [rsp+130], xm1 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm0, [base+pb_90] + psubb xm0, xm7 ; 180 - angle + pand xm0, xm8 ; reuse from previous filter_strength call + pcmpgtb xm0, xm9 + pmovmskb r3d, xm0 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + vpbroadcastd m5, r7m ; max_height + cmp r3d, 3 + je .w4_filter_left_s3 + vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pmullw m2, m0 + cmp hd, 8 + jl .w4_filter_left_h4 + movu m4, [tlq-34] + punpcklwd m1, m0, m0 + vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + je .w4_filter_left_end + vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + jmp .w4_filter_left_end +.w4_upsample_left: + call .upsample_left + mov r11, -16 + vbroadcasti128 m9, [base+z_upsample] + jmp .w4_main_upsample_left +.w4_filter_left_s3: ; can only be h16 + movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m4, [base+pw_3] + 
paddw m1, m0, m2 + punpckhwd m2, m2 + vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + punpcklwd xm3, xm0, xm0 + paddw m2, m4 + vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d + paddw m1, m4 + pavgw m2, m3 + paddw m1, m2 + psrlw m1, 2 + jmp .w4_filter_left_end2 +.w4_filter_left_h4: + pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e +.w4_filter_left_end: + paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m3 + paddw m1, m2 + pxor m2, m2 + psrlw m1, 3 + pavgw m1, m2 +.w4_filter_left_end2: + packssdw m5, m5 + psubw m5, [base+pw_16to1] + pminsw m5, m11 + vpblendvb m1, m0, m5 + mova [rsp+96], m1 +.w4_main: + vbroadcasti128 m9, [base+z2_x_shuf] + mov r11, -8 +.w4_main_upsample_left: + movd xm5, dyd + mova m4, [base+z2_y_shuf_h4] + mov r2d, r8d + movd xm0, dxd + vpbroadcastw m5, xm5 + rorx r5, dyq, 5 + lea r8d, [dyq*3] + pmullw m5, [base+z2_ymul] + rorx r9, dyq, 4 + sar dyd, 6 + vpbroadcastw m0, xm0 + sar r8d, 6 + pand m5, m11 ; frac_y + neg dyd + psllw m5, 9 + add r5d, dyd + add r8d, dyd + add r9d, dyd + paddw m7, m0, m0 + lea dyq, [rsp+dyq*2+126] + vpblendd m0, m7, 0xcc + add dyq, r11 + neg r5d + paddw m1, m0, m7 + neg r8d + vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + neg r9d + paddw m7, m7 + paddw m6, m0 +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm1, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu xm3, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + vinserti128 m3, [rsp+r3*2], 1 + pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 + pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 + pand m2, m11, m6 + punpcklqdq m0, m1, m3 + punpckhqdq m1, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w4_toponly + movu xm2, [dyq] + vinserti128 m2, [dyq+r8*2], 1 + movu xm3, [dyq+r5*2] + vinserti128 m3, [dyq+r9*2], 1 + pshufb m2, m9 + pshufb m3, m9 + punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m2, m3 + psubw m2, m1 + pmulhrsw m2, m5 + psraw m3, m6, 15 ; base_x < topleft + paddw m1, m2 + vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 + vpblendvb m0, m1, m3 +.w4_toponly: + paddw m6, m7 ; xpos += dx + lea r3, [strideq*3] + add dyq, r11 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r2d, r10d + jge .w4_loop +.w4_leftonly_loop: + movu xm1, [dyq] + vinserti128 m1, [dyq+r8*2], 1 + movu xm2, [dyq+r5*2] + vinserti128 m2, [dyq+r9*2], 1 + add dyq, r11 + pshufb m1, m9 + pshufb m2, m9 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + psubw m1, m0 + pmulhrsw m1, m5 + paddw m0, m1 + vpermd m0, m4, m0 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + RET +.w8: + mov r10d, hd + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + xor r8d, r8d + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 + mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 + pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 + pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 + vpbroadcastw xm4, r8m ; pixel_max + paddw xm1, xm0 + paddw xm2, xm3 + not r8d + psubw xm2, xm1, xm2 + add dxd, dxd + 
psraw xm2, 3 + sub angled, 53 ; angle - 53 + pxor xm3, xm3 + paddw xm2, xm1 + lea r3d, [hq+7] + pmaxsw xm2, xm3 + xor angled, 0x7f ; 180 - angle + pavgw xm2, xm3 + pminsw xm2, xm4 + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 + movu [rsp+130], xm1 + movu [rsp+146], xm2 + call .filter_strength + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] + movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x + pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x + pmullw xm4, xm0 + pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x + paddw xm1, xm3 + vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x + paddw xm2, xm3 + vpbroadcastd xm3, r6m ; max_width + pmullw xm1, xm5 + pmullw xm2, xm6 + packssdw xm3, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm3, [base+pw_1to16] + pxor xm4, xm4 + psrlw xm1, 3 + pminsw xm3, xm11 + pavgw xm1, xm4 + vpblendvb xm1, xm0, xm3 + movu [rsp+130], xm1 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 +.w8_filter_left: + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + cmp r3d, 3 + jne .w8_filter_left_s12 + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m7, [base+pw_16] + cmp hd, 16 ; flags needed for later + jmp .filter_left_s3b +.w8_upsample_left: + call .upsample_left + vbroadcasti128 m7, [base+z2_y_shuf_us] + lea r11, [rsp+118] + mov r8, -8 + jmp .w8_main_upsample_left +.w16_filter_left_s12: + xor r8d, r8d +.w8_filter_left_s12: + mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + vpbroadcastd m5, r7m ; max_height + vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pmullw m2, m0 + cmp hd, 8 + jl .w8_filter_left_h4 + movu m4, [tlq-34] + punpcklwd m1, m0, m0 + vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + je .w8_filter_left_end + vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + jmp .w8_filter_left_end +.w8_filter_left_h4: + pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e +.w8_filter_left_end: + paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m3 + paddw m1, m2 + pxor m2, m2 + psrlw m1, 3 + pavgw m1, m2 + packssdw m5, m5 + psubw m5, [base+pw_16to1] + pminsw m5, m11 + vpblendvb m1, m0, m5 + mova [rsp+96], m1 + test r8d, r8d + jz .w8_main +; upsample_main + vbroadcasti128 m10, [base+z_upsample] + vbroadcasti128 m7, [base+z2_y_shuf] + lea r5, [rsp+120] + movd xm1, dyd + vbroadcasti128 m4, [base+z_base_inc+2] + movd xm2, dxd + vpbroadcastw m1, xm1 + vpbroadcastw m2, xm2 + mov r7, dstq + paddw m4, m4 + pmullw m0, m1, [base+z2_ymul8] + paddw m5, m2, m2 + psllw xm1, 3 + vpblendd m2, m5, 0xf0 + lea r2d, [dxq+(66<<6)] ; xpos + paddw m4, m2 + pshufd m6, m0, q2020 + psraw xm0, 6 + pxor xm1, xm1 + psubw xm8, xm1, xm0 + pand m6, m11 + punpckhwd xm9, xm8, xm1 + psllw m6, 9 + punpcklwd xm8, xm1 +.w8_upsample_above_loop: + lea r3d, [r2+dxq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + movu xm2, [rsp+r2*2+16] + lea r2d, [r3+dxq] + shr r3d, 6 + vinserti128 m1, [rsp+r3*2], 1 + vinserti128 m2, [rsp+r3*2+16], 1 + pshufb m1, m10 + pshufb m2, m10 + punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 + punpckhqdq m1, m2 + pand m2, m11, m4 + psubw m1, m0 + psllw m2, 9 + pmulhrsw m1, m2 + 
paddw m0, m1 + cmp r3d, 64 + jge .w8_upsample_above_toponly + mova m1, m5 + vpgatherdq m3, [r5+xm9*2], m5 + mova m5, m1 + vpgatherdq m2, [r5+xm8*2], m1 + pshufb m3, m7 + pshufb m2, m7 + punpckldq m1, m2, m3 + punpckhdq m2, m3 + psubw m2, m1 + pmulhrsw m2, m6 + paddw m1, m2 + vpermq m1, m1, q3120 + psraw m2, m4, 15 + vpblendvb m0, m1, m2 +.w8_upsample_above_toponly: + paddw m4, m5 + sub r5, 4 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_ret + lea dstq, [dstq+strideq*2] + jmp .w8_upsample_above_loop +.w8_main: + vbroadcasti128 m7, [base+z2_y_shuf] + lea r11, [rsp+120] + mov r8, -4 +.w8_main_upsample_left: + movd xm1, dyd + vbroadcasti128 m4, [base+z_base_inc+2] + movd xm2, dxd + vpbroadcastw m1, xm1 + vpbroadcastw m2, xm2 + mov r7, dstq + pmullw m0, m1, [base+z2_ymul8] + paddw m5, m2, m2 + psllw xm1, 3 + vpblendd m2, m5, 0xf0 ; xpos0 xpos1 + lea r9d, [dxq+(65<<6)] ; xpos + paddw m4, m2 + movd [rsp+284], xm1 +.w8_loop0: + mov r2d, r9d + mova [rsp+288], m0 + mov r5, r11 + mova [rsp+320], m4 + pshufd m6, m0, q2020 + psraw xm0, 6 + pxor xm1, xm1 + psubw xm8, xm1, xm0 ; base_y + pand m6, m11 ; frac_y + punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 + psllw m6, 9 + punpcklwd xm8, xm1 ; base_y 0 1 4 5 +.w8_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2*2] + movu xm1, [rsp+r2*2+2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vinserti128 m0, [rsp+r3*2], 1 + vinserti128 m1, [rsp+r3*2+2], 1 + pand m2, m11, m4 + psubw m1, m0 + psllw m2, 9 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w8_toponly + mova m1, m5 + vpgatherdq m3, [r5+xm9*2], m5 + mova m5, m1 + vpgatherdq m2, [r5+xm8*2], m1 + pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 + pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 + punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m2, m3 + psubw m2, m1 + pmulhrsw m2, m6 + paddw m1, m2 + vpermq m1, m1, q3120 + psraw m2, m4, 15 ; base_x < topleft + vpblendvb m0, m1, m2 +.w8_toponly: + paddw m4, m5 ; xpos += dx + add r5, r8 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r2d, (63-8)<<6 + jge .w8_loop +.w8_leftonly_loop: + mova m0, m5 + vpgatherdq m4, [r5+xm9*2], m5 + mova m5, m0 + vpgatherdq m3, [r5+xm8*2], m0 + add r5, r8 + pshufb m2, m4, m7 + pshufb m1, m3, m7 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + psubw m1, m0 + pmulhrsw m1, m6 + paddw m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_leftonly_loop +.w8_end: + sub r10d, 1<<8 + jl .w8_ret + vpbroadcastd m0, [rsp+284] + add r7, 16 + paddw m0, [rsp+288] ; base_y += 8*dy + add r9d, 8<<6 + vpbroadcastd m4, [pw_512] + movzx hd, r10b + paddw m4, [rsp+320] ; base_x += 8*64 + mov dstq, r7 + jmp .w8_loop0 +.w8_ret: + RET +.w16: + movd xm0, [tlq+32] + lea r10d, [hq+(1<<8)] + movd [rsp+160], xm0 + test angled, 0x400 + jnz .w8_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + popcnt r3d, r3d + vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] + movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + punpcklwd xm2, xm1, xm1 + vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + punpckhwd m3, m0, m0 + pmullw m4, m0 + vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + paddw m1, m3 + vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d 
e f g g g + paddw m2, m3 + vpbroadcastd m3, r6m ; max_width + pmullw m1, m5 + pmullw m2, m6 + packssdw m3, m3 + paddw m1, m4 + paddw m1, m2 + psubw m3, [base+pw_1to16] + pxor m4, m4 + psrlw m1, 3 + pminsw m3, m11 + pavgw m1, m4 + vpblendvb m1, m0, m3 + movu [rsp+130], m1 +.w16_no_filter_above: + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + cmp r3d, 3 + jne .w16_filter_left_s12 + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m7, [base+pw_16] + cmp hd, 4 + jne .filter_left_s3 + movq xm0, [tlq-8] ; 0 1 2 3 + movq xm1, [tlq-6] ; 1 2 3 4 + vpbroadcastd xm5, r7m ; max_height + movq xm4, [base+pw_16to1+24] ; 4to1 + pshuflw xm2, xm0, q2100 ; 0 0 1 2 + pshuflw xm3, xm1, q3321 ; 2 3 4 4 + paddw xm1, xm0 + paddw xm1, xm2 + pshuflw xm2, xm0, q1000 ; 0 0 0 1 + paddw xm3, xm6 + packssdw xm5, xm5 + pavgw xm2, xm3 + psubw xm5, xm4 + paddw xm1, xm2 + pminsw xm5, xm11 + psrlw xm1, 2 + vpblendvb xm1, xm0, xm5 + movq [rsp+120], xm1 + jmp .w8_main +.w32: + mova m2, [tlq+32] + movd xm0, [tlq+64] + lea r10d, [hq+(3<<8)] + mova [rsp+160], m2 + movd [rsp+192], xm0 + test angled, 0x400 + jnz .w8_main + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m0, r6m ; max_width + vpbroadcastd m7, [base+pw_16] + mov r3d, 32 + packssdw m0, m0 + psubw m0, [base+pw_1to16] + pminsw m8, m0, m11 + psubw m9, m8, m7 +.w32_filter_above: + movu m0, [tlq+2] + punpcklwd xm4, xm1, xm1 + paddw m2, m6, [tlq+6] + paddw m1, m0 + vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+4] + movu m3, [tlq+r3+2] + paddw m5, m6, [tlq+r3-2] + pavgw m2, m4 + punpckhwd m4, m3, m3 + paddw m1, m2 + vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m5 + paddw m5, m3, [tlq+r3] + paddw m4, m5 + psrlw m1, 2 + paddw m2, m4 + vpblendvb m1, m0, m8 + psrlw m2, 2 + vpblendvb m2, m3, m9 + movu [rsp+130], m1 + movu [rsp+r3+130], m2 +.filter_left_s3: + cmp hd, 16 + jl .filter_left_s3_h8 ; h8 +.filter_left_s3b: + mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i + vpbroadcastd m5, r7m ; max_height + paddw m1, m0, m2 + punpckhwd m2, m2 + mov r3d, hd + vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + packssdw m5, m5 + not r3 + psubw m5, [base+pw_16to1] + paddw m2, m6 + pminsw m8, m11, m5 + je .filter_left_s3_end ; h16 + paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2 + psrlw m1, 2 + vpblendvb m3, m1, m0, m8 + mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j + psubw m8, m7 + mova [rsp+96], m3 + jnp .filter_left_s3_end ; h32 + mova m5, [tlq-96] + paddw m1, [tlq-66] + pavgw m2, [tlq-68] + paddw m1, m2 + paddw m4, m5, [tlq-94] + paddw m2, m6, [tlq-92] + psrlw m1, 2 + paddw m4, [tlq- 98] + pavgw m2, [tlq-100] + vpblendvb m3, m1, m0, m8 + mova m0, [tlq-128] + psubw m8, m7 + paddw m4, m2 + paddw m1, m0, [tlq-126] + paddw m2, m6, [tlq-124] + psrlw m4, 2 + mova [rsp+64], m3 + vpblendvb m4, m5, m8 + psubw m8, m7 + mova [rsp+32], m4 +.filter_left_s3_end: + punpcklwd xm3, xm0, xm0 + vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m4 + pavgw m2, m3 + paddw m1, m2 + psrlw m1, 2 + vpblendvb m1, m0, m8 + mova 
[rsp+r3*2+130], m1 + jmp .w8_main +.filter_left_s3_h8: + mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 + pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 + vpbroadcastd xm5, r7m ; max_height + paddw xm1, xm0, xm3 + pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 + paddw xm1, xm2 + vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 + paddw xm3, xm6 + packssdw xm5, xm5 + pavgw xm2, xm3 + psubw xm5, [base+pw_16to1+16] ; 8to1 + paddw xm1, xm2 + pminsw xm5, xm11 + psrlw xm1, 2 + vpblendvb xm1, xm0, xm5 + mova [rsp+112], xm1 + jmp .w8_main +.w64: + mova m2, [tlq+ 32] + mova m3, [tlq+ 64] + mova m4, [tlq+ 96] + movd xm0, [tlq+128] + lea r10d, [hq+(7<<8)] + mova [rsp+160], m2 + mova [rsp+192], m3 + mova [rsp+224], m4 + movd [rsp+256], xm0 + test angled, 0x400 + jnz .w8_main + vpbroadcastd m6, [base+pw_3] + movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h + paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h + movu m4, [tlq+66] + paddw m3, m6, [tlq+62] + paddw m7, m4, [tlq+64] + pavgw m3, [tlq+70] + paddw m7, [tlq+68] + paddw m2, m5 + vpbroadcastd m5, r6m ; max_width + mov r3d, 96 + packssdw m5, m5 + paddw m3, m7 + psubw m5, [base+pw_1to16] + psrlw m2, 2 + vpbroadcastd m7, [base+pw_16] + psrlw m3, 2 + pminsw m8, m11, m5 + psubw m9, m8, m7 + vpblendvb m2, m0, m9 + psubw m9, m7 + vpblendvb m3, m4, m9 + psubw m9, m7 + movu [rsp+162], m2 + movu [rsp+194], m3 + jmp .w32_filter_above + cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z3_16bpc_avx2_table] From 1d043c77d58ec54ead6d9a590a9ce00d01fa0451 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:06:04 +0200 Subject: [PATCH 060/188] x86: Add cdef_dir asm improvements --- src/x86/cdef16_avx2.asm | 63 +++++++++--------- src/x86/cdef16_sse.asm | 144 ++++++++++++++++++++++++---------------- src/x86/cdef_avx2.asm | 45 ++++++------- src/x86/cdef_rav1e.asm | 44 ++++++------ src/x86/cdef_sse.asm | 48 ++++++++------ 5 files changed, 190 insertions(+), 154 deletions(-) diff --git a/src/x86/cdef16_avx2.asm b/src/x86/cdef16_avx2.asm index 27b64cf261..f8b5c5b0f7 100644 --- a/src/x86/cdef16_avx2.asm +++ b/src/x86/cdef16_avx2.asm @@ -1,5 +1,5 @@ -; Copyright (c) 2017-2021, The rav1e contributors -; Copyright (c) 2021, Nathan Egge +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without @@ -28,38 +28,39 @@ %if ARCH_X86_64 +SECTION_RODATA + +dir_shift: times 2 dw 0x4000 + times 2 dw 0x1000 + +cextern cdef_dir_8bpc_avx2.main + SECTION .text -cextern cdef_dir_8bpc_avx2 +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro INIT_YMM avx2 -cglobal cdef_dir_16bpc, 4, 4, 3, 32 + 8*8, src, ss, var, bdmax - popcnt bdmaxd, bdmaxd - movzx bdmaxq, bdmaxw - sub bdmaxq, 8 - movq xm2, bdmaxq - DEFINE_ARGS src, ss, var, ss3 - lea ss3q, [ssq*3] - mova xm0, [srcq + ssq*0] - mova xm1, [srcq + ssq*1] - vinserti128 m0, [srcq + ssq*2], 1 - vinserti128 m1, [srcq + ss3q], 1 - psraw m0, xm2 - psraw m1, xm2 - vpackuswb m0, m1 - mova [rsp + 32 + 0*8], m0 - lea srcq, [srcq + ssq*4] - mova xm0, [srcq + ssq*0] - mova xm1, [srcq + ssq*1] - vinserti128 m0, [srcq + ssq*2], 1 - vinserti128 m1, [srcq + ss3q], 1 - psraw m0, xm2 - psraw m1, xm2 - vpackuswb m0, m1 - mova [rsp + 32 + 4*8], m0 - lea srcq, [rsp + 32] ; WIN64 shadow space - mov ssq, 8 - call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX) - RET +cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + vpbroadcastd m4, [r6+bdmaxq*4] + lea r6, [strideq*3] + mova xm0, [srcq+strideq*0] + mova xm1, [srcq+strideq*1] + mova xm2, [srcq+strideq*2] + mova xm3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+r6 ], 1 + vinserti128 m1, [srcq+strideq*2], 1 + vinserti128 m2, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*0], 1 + REPX {pmulhuw x, m4}, m0, m1, m2, m3 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %endif ; ARCH_X86_64 diff --git a/src/x86/cdef16_sse.asm b/src/x86/cdef16_sse.asm index f89c8c4088..d1d46eaa8b 100644 --- a/src/x86/cdef16_sse.asm +++ b/src/x86/cdef16_sse.asm @@ -26,68 +26,98 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%ifn ARCH_X86_64 -SECTION_RODATA 16 +SECTION_RODATA -pq_dir_shr: dq 2, 4 -%endif +dir_shift: times 4 dw 0x4000 + times 4 dw 0x1000 + +pw_128: times 4 dw 128 + +cextern cdef_dir_8bpc_ssse3.main +cextern cdef_dir_8bpc_sse4.main +cextern shufw_6543210x SECTION .text -cextern cdef_dir_8bpc_ssse3 +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro -INIT_XMM ssse3 -cglobal cdef_dir_16bpc, 2, 4, 4, 32 + 8*8, src, ss, var, bdmax - bsr bdmaxd, bdmaxm -%if ARCH_X86_64 - movzx bdmaxq, bdmaxw - sub bdmaxq, 7 - movq m4, bdmaxq -%else - push r4 - sub bdmaxd, 9 - LEA r4, pq_dir_shr - movq m4, [r4 + bdmaxd*4] - pop r4 -%endif - DEFINE_ARGS src, ss, var, ss3 - lea ss3q, [ssq*3] - mova m0, [srcq + ssq*0] - mova m1, [srcq + ssq*1] - mova m2, [srcq + ssq*2] - mova m3, [srcq + ss3q] - psraw m0, m4 - psraw m1, m4 - psraw m2, m4 - psraw m3, m4 - packuswb m0, m1 - packuswb m2, m3 - mova [rsp + 32 + 0*8], m0 - mova [rsp + 32 + 2*8], m2 - lea srcq, [srcq + ssq*4] - mova m0, [srcq + ssq*0] - mova m1, [srcq + ssq*1] - mova m2, [srcq + ssq*2] - mova m3, [srcq + ss3q] - psraw m0, m4 - psraw m1, m4 - psraw m2, m4 - psraw m3, m4 - packuswb m0, m1 - packuswb m2, m3 - mova [rsp + 32 + 4*8], m0 - mova [rsp + 32 + 6*8], m2 - lea srcq, [rsp + 32] ; WIN64 shadow space - mov ssq, 8 +%macro CDEF_DIR 0 %if ARCH_X86_64 - call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX) +cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + movddup m7, [r6+bdmaxq*8] + lea r6, [strideq*3] + mova m0, [srcq+strideq*0] + mova m1, 
[srcq+strideq*1] + mova m2, [srcq+strideq*2] + mova m3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + mova m4, [srcq+strideq*0] + mova m5, [srcq+strideq*1] + mova m6, [srcq+strideq*2] + REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhuw m7, [srcq+r6 ] + pxor m8, m8 + packuswb m9, m0, m1 + packuswb m10, m2, m3 + packuswb m11, m4, m5 + packuswb m12, m6, m7 + REPX {psadbw x, m8}, m9, m10, m11, m12 + packssdw m9, m10 + packssdw m11, m12 + packssdw m9, m11 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %else - movifnidn vard, varm - push eax ; align stack - push vard - push ssd - push srcd - call mangle(private_prefix %+ _cdef_dir_8bpc) - add esp, 0x10 +cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax + mov bdmaxd, bdmaxm + LEA r2, dir_shift + shr bdmaxd, 11 + movddup m7, [r2+bdmaxq*8] + lea r3, [strideq*3] + pmulhuw m3, m7, [srcq+strideq*0] + pmulhuw m4, m7, [srcq+strideq*1] + pmulhuw m5, m7, [srcq+strideq*2] + pmulhuw m6, m7, [srcq+r3 ] + movddup m1, [r2-dir_shift+pw_128] + lea srcq, [srcq+strideq*4] + pxor m0, m0 + packuswb m2, m3, m4 + psubw m3, m1 + psubw m4, m1 + mova [esp+0x00], m3 + mova [esp+0x10], m4 + packuswb m3, m5, m6 + psadbw m2, m0 + psadbw m3, m0 + psubw m5, m1 + psubw m6, m1 + packssdw m2, m3 + mova [esp+0x20], m5 + mova [esp+0x50], m6 + pmulhuw m4, m7, [srcq+strideq*0] + pmulhuw m5, m7, [srcq+strideq*1] + pmulhuw m6, m7, [srcq+strideq*2] + pmulhuw m7, [srcq+r3 ] + packuswb m3, m4, m5 + packuswb m1, m6, m7 + psadbw m3, m0 + psadbw m1, m0 + packssdw m3, m1 + movddup m1, [r2-dir_shift+pw_128] + LEA r2, shufw_6543210x + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %endif - RET +%endmacro + +INIT_XMM ssse3 +CDEF_DIR + +INIT_XMM sse4 +CDEF_DIR diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm index f274a1d631..f460cc187a 100644 --- a/src/x86/cdef_avx2.asm +++ b/src/x86/cdef_avx2.asm @@ -1592,34 +1592,33 @@ CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 INIT_YMM avx2 -cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3 +cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] movq xm2, [srcq+strideq*2] - movq xm3, [srcq+stride3q] + movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vpbroadcastq m4, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vpbroadcastq m7, [srcq+stride3q] - vpbroadcastd m8, [pw_128] - pxor m9, m9 - - vpblendd m0, m0, m7, 0xf0 - vpblendd m1, m1, m6, 0xf0 - vpblendd m2, m2, m5, 0xf0 - vpblendd m3, m3, m4, 0xf0 - - punpcklbw m0, m9 - punpcklbw m1, m9 - punpcklbw m2, m9 - punpcklbw m3, m9 - - psubw m0, m8 - psubw m1, m8 - psubw m2, m8 - psubw m3, m8 + vpbroadcastq m4, [srcq+stride3q ] + vpbroadcastq m5, [srcq+strideq*2] + vpblendd m0, m4, 0xf0 + vpblendd m1, m5, 0xf0 + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m5, [srcq+strideq*0] + vpblendd m2, m4, 0xf0 + vpblendd m3, m5, 0xf0 + pxor m4, m4 + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 +cglobal_label .main + vpbroadcastd m4, [pw_128] + PROLOGUE 3, 4, 15 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 ; shuffle registers to generate partial_sum_diag[0-1] together vperm2i128 m7, m0, m0, 0x01 diff --git a/src/x86/cdef_rav1e.asm b/src/x86/cdef_rav1e.asm index abdf125c21..3604378bf4 100644 --- a/src/x86/cdef_rav1e.asm +++ b/src/x86/cdef_rav1e.asm @@ -264,33 +264,33 @@ CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 INIT_YMM avx2 -cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3 +cglobal 
cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] movq xm2, [srcq+strideq*2] - movq xm3, [srcq+stride3q] + movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vpbroadcastq m4, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vpbroadcastq m7, [srcq+stride3q] - vpbroadcastd m8, [pw_128] - pxor m9, m9 - - vpblendd m0, m0, m7, 0xf0 - vpblendd m1, m1, m6, 0xf0 - vpblendd m2, m2, m5, 0xf0 - vpblendd m3, m3, m4, 0xf0 - - punpcklbw m0, m9 - punpcklbw m1, m9 - punpcklbw m2, m9 - punpcklbw m3, m9 - psubw m0, m8 - psubw m1, m8 - psubw m2, m8 - psubw m3, m8 + vpbroadcastq m4, [srcq+stride3q ] + vpbroadcastq m5, [srcq+strideq*2] + vpblendd m0, m4, 0xf0 + vpblendd m1, m5, 0xf0 + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m5, [srcq+strideq*0] + vpblendd m2, m4, 0xf0 + vpblendd m3, m5, 0xf0 + pxor m4, m4 + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 +cglobal_label .main + vpbroadcastd m4, [pw_128] + PROLOGUE 3, 4, 15 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 ; shuffle registers to generate partial_sum_diag[0-1] together vpermq m7, m0, q1032 diff --git a/src/x86/cdef_sse.asm b/src/x86/cdef_sse.asm index 4c335aba21..6560b7d6c3 100644 --- a/src/x86/cdef_sse.asm +++ b/src/x86/cdef_sse.asm @@ -42,12 +42,13 @@ div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 dw 168, 168, 140, 140, 120, 120, 105, 105 dw 420, 420, 210, 210, 140, 140, 105, 105 dw 105, 105, 105, 105, 105, 105, 105, 105 -shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 -shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 -pw_8: times 8 dw 8 -pw_128: times 8 dw 128 -pw_256: times 8 dw 256 -pw_2048: times 8 dw 2048 +const shufw_6543210x, \ + db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_8: times 8 dw 8 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 pw_0x7FFF: times 8 dw 0x7FFF pw_0x8000: times 8 dw 0x8000 tap_table: ; masks for 8-bit shift emulation @@ -758,27 +759,26 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ %macro CDEF_DIR 0 %if ARCH_X86_64 -cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3 - lea stride3q, [strideq*3] +cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var + lea r6, [strideq*3] movq m1, [srcq+strideq*0] movhps m1, [srcq+strideq*1] movq m3, [srcq+strideq*2] - movhps m3, [srcq+stride3q] + movhps m3, [srcq+r6 ] lea srcq, [srcq+strideq*4] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] - movhps m7, [srcq+stride3q] + movhps m7, [srcq+r6 ] pxor m8, m8 - psadbw m0, m1, m8 + psadbw m9, m1, m8 psadbw m2, m3, m8 psadbw m4, m5, m8 psadbw m6, m7, m8 - packssdw m0, m2 + packssdw m9, m2 packssdw m4, m6 - packssdw m0, m4 - SWAP m0, m9 + packssdw m9, m4 punpcklbw m0, m1, m8 punpckhbw m1, m8 @@ -788,7 +788,7 @@ cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3 punpckhbw m5, m8 punpcklbw m6, m7, m8 punpckhbw m7, m8 - +cglobal_label .main mova m8, [pw_128] psubw m0, m8 psubw m1, m8 @@ -1018,14 +1018,20 @@ cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3 punpckldq m4, m6 psubd m2, m0, m1 psubd m3, m0, m4 - mova [rsp+0x00], m2 ; emulate ymm in stack - mova [rsp+0x10], m3 +%if WIN64 + WIN64_RESTORE_XMM + %define tmp rsp+stack_offset+8 +%else + %define tmp rsp-40 +%endif + mova [tmp+0x00], m2 ; emulate ymm in stack + mova 
[tmp+0x10], m3 pcmpeqd m1, m0 ; compute best cost mask pcmpeqd m4, m0 packssdw m4, m1 pmovmskb eax, m4 ; get byte-idx from mask tzcnt eax, eax - mov r1d, [rsp+rax*2] ; get idx^4 complement from emulated ymm + mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm shr eax, 1 ; get direction by converting byte-idx to word-idx shr r1d, 10 mov [varq], r1d @@ -1063,19 +1069,19 @@ cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] psadbw m3, m5, m0 - psadbw m0, m7, m0 + psadbw m0, m7 packssdw m3, m0 pxor m0, m0 - packssdw m2, m3 punpcklbw m4, m5, m0 punpckhbw m5, m0 punpcklbw m6, m7, m0 punpckhbw m7, m0 +cglobal_label .main psubw m4, m1 psubw m5, m1 psubw m6, m1 psubw m7, m1 - + packssdw m2, m3 psllw m1, 3 psubw m2, m1 ; partial_sum_hv[0] pmaddwd m2, m2 From 09853b8122c2c54a5808c1fa342dfb2f9dab9d07 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 4 May 2021 14:07:33 +0200 Subject: [PATCH 061/188] x86: Add high bitdepth cdef AVX2 asm --- src/x86/cdef16_avx2.asm | 419 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 419 insertions(+) diff --git a/src/x86/cdef16_avx2.asm b/src/x86/cdef16_avx2.asm index f8b5c5b0f7..cc9690f568 100644 --- a/src/x86/cdef16_avx2.asm +++ b/src/x86/cdef16_avx2.asm @@ -30,9 +30,28 @@ SECTION_RODATA +tap_table: dw 4, 2, 3, 3, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + dir_shift: times 2 dw 0x4000 times 2 dw 0x1000 +pw_2048: times 2 dw 2048 + cextern cdef_dir_8bpc_avx2.main SECTION .text @@ -45,6 +64,406 @@ SECTION .text %endrep %endmacro +%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride + ; load p0/p1 + movsx offq, byte [dirq+kq+%1] ; off1 +%if %5 == 4 + movq xm5, [stkq+offq*2+%6*0] ; p0 + movq xm6, [stkq+offq*2+%6*2] + movhps xm5, [stkq+offq*2+%6*1] + movhps xm6, [stkq+offq*2+%6*3] + vinserti128 m5, xm6, 1 +%else + movu xm5, [stkq+offq*2+%6*0] ; p0 + vinserti128 m5, [stkq+offq*2+%6*1], 1 +%endif + neg offq ; -off1 +%if %5 == 4 + movq xm6, [stkq+offq*2+%6*0] ; p1 + movq xm9, [stkq+offq*2+%6*2] + movhps xm6, [stkq+offq*2+%6*1] + movhps xm9, [stkq+offq*2+%6*3] + vinserti128 m6, xm9, 1 +%else + movu xm6, [stkq+offq*2+%6*0] ; p1 + vinserti128 m6, [stkq+offq*2+%6*1], 1 +%endif + ; out of bounds values are set to a value that is a both a large unsigned + ; value and a negative signed value. 
+ ; use signed max and unsigned min to remove them + pmaxsw m7, m5 ; max after p0 + pminuw m8, m5 ; min after p0 + pmaxsw m7, m6 ; max after p1 + pminuw m8, m6 ; min after p1 + + ; accumulate sum[m15] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + pabsw m9, m5 + pabsw m10, m6 + psignw m11, %4, m5 + psignw m12, %4, m6 + psrlw m5, m9, %2 + psrlw m6, m10, %2 + psubusw m5, %3, m5 + psubusw m6, %3, m6 + pminuw m5, m9 ; constrain(diff_p0) + pminuw m6, m10 ; constrain(diff_p1) + pmullw m5, m11 ; constrain(diff_p0) * taps + pmullw m6, m12 ; constrain(diff_p1) * taps + paddw m15, m5 + paddw m15, m6 +%endmacro + +%macro cdef_filter_fn 3 ; w, h, stride +INIT_YMM avx2 +%if %1 != 4 || %2 != 8 +cglobal cdef_filter_%1x%2_16bpc, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, \ + stride3, dst4, edge +%else +cglobal cdef_filter_%1x%2_16bpc, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, \ + stride3, dst4, edge +%endif +%define px rsp+2*16+2*%3 + pcmpeqw m14, m14 + psllw m14, 15 ; 0x8000 + mov edged, r8m + + ; prepare pixel buffers - body/right +%if %1 == 4 + INIT_XMM avx2 +%endif +%if %2 == 8 + lea dst4q, [dstq+strideq*4] +%endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + movu m1, [dstq+strideq*0] + movu m2, [dstq+strideq*1] + movu m3, [dstq+strideq*2] + movu m4, [dstq+stride3q] + mova [px+0*%3], m1 + mova [px+1*%3], m2 + mova [px+2*%3], m3 + mova [px+3*%3], m4 +%if %2 == 8 + movu m1, [dst4q+strideq*0] + movu m2, [dst4q+strideq*1] + movu m3, [dst4q+strideq*2] + movu m4, [dst4q+stride3q] + mova [px+4*%3], m1 + mova [px+5*%3], m2 + mova [px+6*%3], m3 + mova [px+7*%3], m4 +%endif + jmp .body_done +.no_right: +%if %1 == 4 + movq xm1, [dstq+strideq*0] + movq xm2, [dstq+strideq*1] + movq xm3, [dstq+strideq*2] + movq xm4, [dstq+stride3q] + movq [px+0*%3], xm1 + movq [px+1*%3], xm2 + movq [px+2*%3], xm3 + movq [px+3*%3], xm4 +%else + mova xm1, [dstq+strideq*0] + mova xm2, [dstq+strideq*1] + mova xm3, [dstq+strideq*2] + mova xm4, [dstq+stride3q] + mova [px+0*%3], xm1 + mova [px+1*%3], xm2 + mova [px+2*%3], xm3 + mova [px+3*%3], xm4 +%endif + movd [px+0*%3+%1*2], xm14 + movd [px+1*%3+%1*2], xm14 + movd [px+2*%3+%1*2], xm14 + movd [px+3*%3+%1*2], xm14 +%if %2 == 8 + %if %1 == 4 + movq xm1, [dst4q+strideq*0] + movq xm2, [dst4q+strideq*1] + movq xm3, [dst4q+strideq*2] + movq xm4, [dst4q+stride3q] + movq [px+4*%3], xm1 + movq [px+5*%3], xm2 + movq [px+6*%3], xm3 + movq [px+7*%3], xm4 + %else + mova xm1, [dst4q+strideq*0] + mova xm2, [dst4q+strideq*1] + mova xm3, [dst4q+strideq*2] + mova xm4, [dst4q+stride3q] + mova [px+4*%3], xm1 + mova [px+5*%3], xm2 + mova [px+6*%3], xm3 + mova [px+7*%3], xm4 + %endif + movd [px+4*%3+%1*2], xm14 + movd [px+5*%3+%1*2], xm14 + movd [px+6*%3+%1*2], xm14 + movd [px+7*%3+%1*2], xm14 +%endif +.body_done: + + ; top + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + movu m1, [topq+strideq*0-%1] + movu m2, [topq+strideq*1-%1] + movu [px-2*%3-%1], m1 + movu [px-1*%3-%1], m2 + jmp .top_done +.top_no_right: + movu m1, [topq+strideq*0-%1*2] + movu m2, [topq+strideq*1-%1*2] + movu [px-2*%3-%1*2], m1 + movu [px-1*%3-%1*2], m2 + movd [px-2*%3+%1*2], xm14 + movd [px-1*%3+%1*2], xm14 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + movu m1, [topq+strideq*0] + movu m2, [topq+strideq*1] + mova [px-2*%3+0], m1 + mova [px-1*%3+0], m2 + movd [px-2*%3-4], xm14 + movd [px-1*%3-4], 
xm14 + jmp .top_done +.top_no_left_right: +%if %1 == 4 + movq xm1, [topq+strideq*0] + movq xm2, [topq+strideq*1] + movq [px-2*%3+0], xm1 + movq [px-1*%3+0], xm2 +%else + mova xm1, [topq+strideq*0] + mova xm2, [topq+strideq*1] + mova [px-2*%3+0], xm1 + mova [px-1*%3+0], xm2 +%endif + movd [px-2*%3-4], xm14 + movd [px-1*%3-4], xm14 + movd [px-2*%3+%1*2], xm14 + movd [px-1*%3+%1*2], xm14 + jmp .top_done +.no_top: + movu [px-2*%3-%1], m14 + movu [px-1*%3-%1], m14 +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + mova xm1, [leftq+ 0] +%if %2 == 8 + mova xm2, [leftq+16] +%endif + movd [px+0*%3-4], xm1 + pextrd [px+1*%3-4], xm1, 1 + pextrd [px+2*%3-4], xm1, 2 + pextrd [px+3*%3-4], xm1, 3 +%if %2 == 8 + movd [px+4*%3-4], xm2 + pextrd [px+5*%3-4], xm2, 1 + pextrd [px+6*%3-4], xm2, 2 + pextrd [px+7*%3-4], xm2, 3 +%endif + jmp .left_done +.no_left: + movd [px+0*%3-4], xm14 + movd [px+1*%3-4], xm14 + movd [px+2*%3-4], xm14 + movd [px+3*%3-4], xm14 +%if %2 == 8 + movd [px+4*%3-4], xm14 + movd [px+5*%3-4], xm14 + movd [px+6*%3-4], xm14 + movd [px+7*%3-4], xm14 +%endif +.left_done: + + ; bottom + DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge + test edgeb, 8 ; have_bottom + jz .no_bottom + lea dst8q, [dstq+%2*strideq] + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + movu m1, [dst8q-%1] + movu m2, [dst8q+strideq-%1] + movu [px+(%2+0)*%3-%1], m1 + movu [px+(%2+1)*%3-%1], m2 + jmp .bottom_done +.bottom_no_right: + movu m1, [dst8q-%1*2] + movu m2, [dst8q+strideq-%1*2] + movu [px+(%2+0)*%3-%1*2], m1 + movu [px+(%2+1)*%3-%1*2], m2 +%if %1 == 8 + movd [px+(%2-1)*%3+%1*2], xm14 ; overwritten by previous movu +%endif + movd [px+(%2+0)*%3+%1*2], xm14 + movd [px+(%2+1)*%3+%1*2], xm14 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + movu m1, [dst8q] + movu m2, [dst8q+strideq] + mova [px+(%2+0)*%3+0], m1 + mova [px+(%2+1)*%3+0], m2 + movd [px+(%2+0)*%3-4], xm14 + movd [px+(%2+1)*%3-4], xm14 + jmp .bottom_done +.bottom_no_left_right: +%if %1 == 4 + movq xm1, [dst8q] + movq xm2, [dst8q+strideq] + movq [px+(%2+0)*%3+0], xm1 + movq [px+(%2+1)*%3+0], xm2 +%else + mova xm1, [dst8q] + mova xm2, [dst8q+strideq] + mova [px+(%2+0)*%3+0], xm1 + mova [px+(%2+1)*%3+0], xm2 +%endif + movd [px+(%2+0)*%3-4], xm14 + movd [px+(%2+1)*%3-4], xm14 + movd [px+(%2+0)*%3+%1*2], xm14 + movd [px+(%2+1)*%3+%1*2], xm14 + jmp .bottom_done +.no_bottom: + movu [px+(%2+0)*%3-%1], m14 + movu [px+(%2+1)*%3-%1], m14 +.bottom_done: + + ; actual filter + INIT_YMM avx2 + DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero + %undef edged + movifnidn prid, prim + mov dampingd, r7m + lzcnt pridmpd, prid +%if UNIX64 + movd xm0, prid + movd xm1, secdmpd +%endif + lzcnt secdmpd, secdmpm + sub dampingd, 31 + xor zerod, zerod + add pridmpd, dampingd + cmovs pridmpd, zerod + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + mov [rsp+8], secdmpq ; sec_shift + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, table, dir, pri, sec, stride3 +%if UNIX64 + vpbroadcastw m0, xm0 ; pri_strength + vpbroadcastw m1, xm1 ; sec_strength +%else + vpbroadcastw m0, prim ; pri_strength + vpbroadcastw m1, secm ; sec_strength +%endif + rorx r2d, prid, 2 + cmp dword r9m, 0xfff + cmove prid, r2d + and prid, 4 + lea tableq, [tap_table] + lea priq, [tableq+priq] ; pri_taps + lea secq, [tableq+8] ; sec_taps + + ; off1/2/3[k] [6 total] from [tableq+12+(dir+0/2/6)*2+k] + mov dird, r6m + lea 
tableq, [tableq+dirq*2+12] +%if %1*%2*2/mmsize > 1 + %if %1 == 4 + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k + %else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + %endif + mov hd, %1*%2*2/mmsize +%else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k +%endif + lea stkq, [px] + pxor m13, m13 +%if %1*%2*2/mmsize > 1 +.v_loop: +%endif + mov kd, 1 +%if %1 == 4 + movq xm4, [stkq+%3*0] + movhps xm4, [stkq+%3*1] + movq xm5, [stkq+%3*2] + movhps xm5, [stkq+%3*3] + vinserti128 m4, xm5, 1 +%else + mova xm4, [stkq+%3*0] ; px + vinserti128 m4, [stkq+%3*1], 1 +%endif + pxor m15, m15 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + vpbroadcastw m2, [priq+kq*2] ; pri_taps + vpbroadcastw m3, [secq+kq*2] ; sec_taps + + ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 + ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 + ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 + + dec kq + jge .k_loop + + vpbroadcastd m12, [pw_2048] + pcmpgtw m11, m13, m15 + paddw m15, m11 + pmulhrsw m15, m12 + paddw m4, m15 + pminsw m4, m7 + pmaxsw m4, m8 +%if %1 == 4 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+stride3q], xm5 +%else + mova [dstq+strideq*0], xm4 + vextracti128 [dstq+strideq*1], m4, 1 +%endif + +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, %3*vloop_lines + dec hd + jg .v_loop +%endif + + RET +%endmacro + +cdef_filter_fn 8, 8, 32 +cdef_filter_fn 4, 4, 32 + INIT_YMM avx2 cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax lea r6, [dir_shift] From 71953de0b5b0fb87ac2c3572d6527300b4c95958 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 4 May 2021 14:08:13 +0200 Subject: [PATCH 062/188] x86: Add high bitdepth loopfilter AVX2 asm --- src/x86/loopfilter16_avx2.asm | 1188 +++++++++++++++++++++++++++++++++ 1 file changed, 1188 insertions(+) create mode 100644 src/x86/loopfilter16_avx2.asm diff --git a/src/x86/loopfilter16_avx2.asm b/src/x86/loopfilter16_avx2.asm new file mode 100644 index 0000000000..2fbce77353 --- /dev/null +++ b/src/x86/loopfilter16_avx2.asm @@ -0,0 +1,1188 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 16 dw 1 +pw_2: times 16 dw 2 +pw_3: times 16 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 16 dw 4 +pw_16: times 16 dw 16 +pw_8: times 16 dw 8 +pw_4096: times 16 dw 4096 + +pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; xmm%1 a b c d e f g h a i q y 6 E M U +; xmm%2 i j k l m n o p b j r z 7 F N V +; xmm%3 q r s t u v w x c k s 0 8 G O W +; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; xmm%6 E F G H I J K L f n v 3 B J R Z +; xmm%7 M N O P Q R S T g o w 4 C K S + +; xmm%8 U V W X Y Z + = h p x 5 D L T = +%macro TRANSPOSE8X8W 9 + ; xmm%1 a b c d e f g h a i q y b j r z + ; xmm%2 i j k l m n o p c k s 0 d l t 1 + ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 + ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; xmm%6 E F G H I J K L 8 G O W 9 H P X + ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z + ; xmm%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; xmm%1 a i q y b j r z a i q y 6 E M U + ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V + ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W + ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X + ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z + ; xmm%7 A I Q Y B J R Z g o w 4 C K S + + ; xmm%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro + +; transpose and write m3-6, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x16 0 + ; transpose 8x4 + punpcklwd m0, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckldq m6, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m3, m5 + punpckhdq m3, m5 + + ; write out + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + + vextracti128 xm6, m6, 1 + vextracti128 xm0, m0, 1 
+ vextracti128 xm4, m4, 1 + vextracti128 xm3, m3, 1 + + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movq xm3, [dstq+strideq*0-4] + movq xm4, [dstq+strideq*1-4] + movq xm5, [dstq+strideq*2-4] + movq xm6, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq xm11, [tmpq+strideq*0-4] + movq xm13, [tmpq+strideq*1-4] + movq xm14, [tmpq+strideq*2-4] + movq xm15, [tmpq+stride3q -4] + lea tmpq, [tmpq+strideq*4] + ; this overreads by 8 bytes but the buffers are padded + ; so that should be ok + vinserti128 m3, [tmpq+strideq*0-4], 1 + vinserti128 m4, [tmpq+strideq*1-4], 1 + vinserti128 m5, [tmpq+strideq*2-4], 1 + vinserti128 m6, [tmpq+stride3q -4], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m11, [tmpq+strideq*0-4], 1 + vinserti128 m13, [tmpq+strideq*1-4], 1 + vinserti128 m14, [tmpq+strideq*2-4], 1 + vinserti128 m15, [tmpq+stride3q -4], 1 + + ; transpose 4x8 + ; xm3: A-D0,A-D4 + ; xm4: A-D1,A-D5 + ; xm5: A-D2,A-D6 + ; xm6: A-D3,A-D7 + punpcklwd m7, m3, m4 + punpcklwd m3, m11, m13 + punpcklwd m4, m5, m6 + punpcklwd m5, m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: A4-5,B4-5,C4-5,D4-5 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: A6-7,B6-7,C6-7,D6-7 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m3, m5 + punpckhdq m5, m3, m5 + ; xm6: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm8: A4-7,B4-7 + ; xm5: C4-7,D4-7 + punpcklqdq m3, m6, m8 + punpckhqdq m4, m6, m8 + punpckhqdq m6, m7, m5 + punpcklqdq m5, m7, m5 + ; xm3: A0-7 + ; xm4: B0-7 + ; xm5: C0-7 + ; xm6: D0-7 +%elif %1 == 6 || %1 == 8 + movu xm3, [dstq+strideq*0-8] + movu xm4, [dstq+strideq*1-8] + movu xm5, [dstq+strideq*2-8] + movu xm6, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm11, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + vinserti128 m3, [tmpq+strideq*0-8], 1 + vinserti128 m4, [tmpq+strideq*1-8], 1 + vinserti128 m5, [tmpq+strideq*2-8], 1 + vinserti128 m6, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m11, [tmpq+strideq*0-8], 1 + vinserti128 m13, [tmpq+strideq*1-8], 1 + vinserti128 m14, [tmpq+strideq*2-8], 1 + vinserti128 m15, [tmpq+stride3q -8], 1 + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm11: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklwd m7, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m11, m13 + punpckhwd m11, m13 + punpcklwd m13, m14, m15 + punpckhwd m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: E0-1,F0-1,G0-1,H0-1 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: E2-3,F2-3,G2-3,H2-3 + ; xm6: 
A4-5,B4-5,C4-5,D4-5 + ; xm11: E4-5,F4-5,G4-5,H4-5 + ; xm13: A6-7,B6-7,C6-7,D6-7 + ; xm14: E6-7,F6-7,G6-7,H6-7 + punpckldq m15, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m3, m5 + punpckhdq m8, m3, m5 + punpckldq m3, m6, m13 + punpckhdq m6, m13 + punpckldq m10, m11, m14 + punpckhdq m11, m14 + ; xm15: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm9: E0-3,F0-3 + ; xm8: G0-3,H0-3 + ; xm3: A4-7,B4-7 + ; xm6: C4-7,D4-7 + ; xm10: E4-7,F4-7 + ; xm11: G4-7,H4-7 +%if %1 != 6 + punpcklqdq m0, m15, m3 +%endif + punpckhqdq m13, m15, m3 + punpcklqdq m3, m7, m6 + punpckhqdq m4, m7, m6 + punpcklqdq m5, m9, m10 + punpckhqdq m6, m9, m10 + punpcklqdq m14, m8, m11 +%if %1 != 6 + punpckhqdq m15, m8, m11 + mova [rsp+5*32], m0 +%endif +%else + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova xm0, [dstq+strideq*0-16] + mova xm1, [dstq+strideq*1-16] + mova xm2, [dstq+strideq*2-16] + mova xm3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova xm4, [tmpq+strideq*0-16] + mova xm5, [tmpq+strideq*1-16] + mova xm6, [tmpq+strideq*2-16] + mova xm7, [tmpq+stride3q -16] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, m0, [tmpq+strideq*0-16], 1 + vinserti128 m1, m1, [tmpq+strideq*1-16], 1 + vinserti128 m2, m2, [tmpq+strideq*2-16], 1 + vinserti128 m3, m3, [tmpq+stride3q -16], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m4, m4, [tmpq+strideq*0-16], 1 + vinserti128 m5, m5, [tmpq+strideq*1-16], 1 + vinserti128 m6, m6, [tmpq+strideq*2-16], 1 + vinserti128 m7, m7, [tmpq+stride3q -16], 1 + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + + mova [rsp+6*32], m0 + mova [rsp+7*32], m1 + mova [rsp+8*32], m2 + mova [rsp+9*32], m3 + mova [rsp+5*32], m4 + + mova xm0, [dstq+strideq*0] + mova xm1, [dstq+strideq*1] + mova xm2, [dstq+strideq*2] + mova xm3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova xm8, [tmpq+strideq*0] + mova xm9, [tmpq+strideq*1] + mova xm10, [tmpq+strideq*2] + mova xm11, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, m0, [tmpq+strideq*0], 1 + vinserti128 m1, m1, [tmpq+strideq*1], 1 + vinserti128 m2, m2, [tmpq+strideq*2], 1 + vinserti128 m3, m3, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m8, m8, [tmpq+strideq*0], 1 + vinserti128 m9, m9, [tmpq+strideq*1], 1 + vinserti128 m10, m10, [tmpq+strideq*2], 1 + vinserti128 m11, m11, [tmpq+stride3q ], 1 + + TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + + mova [rsp+10*32], m8 + mova [rsp+11*32], m9 + mova [rsp+12*32], m10 + mova [rsp+13*32], m11 + + ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 + SWAP 13, 5, 0 + SWAP 3, 6, 1, 15 + SWAP 4, 7 + SWAP 2, 14 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else + vpbroadcastq m0, [lq] ; l0, l1 + vpbroadcastq m1, [lq+l_strideq] ; l2, l3 + vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5 + vpbroadcastq m10, [lq+l_stride3q] ; l6, l7 + punpckldq m0, m1 ; l0, l2, l1, l3 [2x] + punpckldq m2, m10 ; l4, l6, l5, l7 [2x] + vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2, l4, l6 + punpckhbw m0, m2 ; l1, l3, l5, l7 +%endif + pcmpeqw m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqw m10, m2, m0 ; !L + psrlw m10, 1 + psrlw m2, m0, [lutq+128] + vpbroadcastw m1, [lutq+136] + pminuw m2, m1 + pmaxuw m2, [pw_1] ; I + psrlw m1, m0, 4 ; H + paddw m0, [pw_2] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [r11]}, m0, m1, m2 + + psubw m8, m3, m4 ; p1-p0 + psubw m9, m5, m6 ; q1-q0 + REPX {pabsw x, x}, m8, m9 + pmaxuw m8, m10 + pmaxuw m8, m9 + pcmpgtw m7, m8, m1 ; hev +%if %1 != 4 + psubw m9, m13, m4 ; p2-p0 + pabsw m9, m9 + pmaxuw m9, m8 +%if %1 != 6 +%ifidn %2, v + mova m11, [tmpq+strideq*0] ; p3 +%else + mova m11, [rsp+5*32] ; p3 +%endif + psubw m10, m11, m4 ; p3-p0 + pabsw m10, m10 + pmaxuw m9, m10 +%endif + psubw m10, m5, m14 ; q2-q0 + pabsw m10, m10 + pmaxuw m9, m10 +%if %1 != 6 + psubw m10, m5, m15 ; q3-q0 + pabsw m10, m10 + pmaxuw m9, m10 +%endif + pcmpgtw m9, [r11] ; !flat8in + + psubw m10, m13, m3 ; p2-p1 + pabsw m10, m10 +%if %1 != 6 + psubw m11, m13 ; p3-p2 + pabsw m11, m11 + pmaxuw m10, m11 + psubw m11, m14, m15 ; q3-q2 + pabsw m11, m11 + pmaxuw m10, m11 +%endif + psubw m11, m14, m6 ; q2-q1 + pabsw m11, m11 + pmaxuw m10, m11 + +%if %1 == 16 + vpbroadcastd m11, [maskq+8] + vpbroadcastd m1, [maskq+4] + por m11, m1 + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 +%else + vpbroadcastd m11, [maskq+4] + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxuw m8, m10 +%endif + pcmpgtw m8, m2 + + psubw m10, m3, m6 ; p1-q1 + psubw m11, m4, m5 ; p0-q0 + REPX {pabsw x, x}, m10, m11 + paddw m11, m11 + psrlw m10, 1 + paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else + mova m0, [rsp+7*32] + mova m1, [rsp+8*32] + mova m2, [rsp+9*32] +%endif + REPX {psubw x, m4}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxuw m1, m0 + pmaxuw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m10, [tmpq+strideq*2] +%else + mova m0, [rsp+10*32] + mova m2, [rsp+11*32] + mova m10, [rsp+12*32] +%endif + REPX {psubw x, m5}, m0, m2, m10 + REPX {pabsw x, x}, m0, m2, m10 + pmaxuw m0, m2 + pmaxuw m1, m10 + pmaxuw m1, m0 + pcmpgtw m1, [r11] ; !flat8out + por m1, m9 ; !flat8in | !flat8out + vpbroadcastd m2, [maskq+8] + pand m10, m2, m12 + pcmpeqd m10, m12 + pandn m1, m10 ; flat16 + pandn m1, m8, m1 ; flat16 & fm + + vpbroadcastd m10, [maskq+4] + por m10, m2 + pand m2, m10, m12 + pcmpeqd m2, m12 + pandn m9, m2 ; flat8in + pandn m9, m8, m9 + vpbroadcastd m2, [maskq+0] + por m2, m10 + pand m2, m12 + pcmpeqd m2, m12 + pandn m8, m2 + pandn m8, m9, m8 ; fm & !flat8 & !flat16 + pandn m9, m1, m9 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m9, m2 + pandn m9, m8, m9 ; flat8 & fm + vpbroadcastd m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 + pandn m8, m9, m8 ; fm & !flat8 +%else + vpbroadcastd m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 ; fm +%endif + + ; short filter + + vpbroadcastw m0, r7m + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; -512 or -2048 + + psubw m10, m5, m4 + paddw m11, m10, m10 + paddw m11, m10 + psubw m10, m3, m6 ; iclip_diff(p1-q1) + pminsw m10, m0 + pmaxsw m10, m2 + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m10, m0 + pmaxsw 
m10, m2 + pand m8, m10 ; f&=fm + paddw m10, m8, [pw_3] + paddw m8, [pw_4] + REPX {pminsw x, m0}, m10, m8 + psraw m10, 3 ; f2 + psraw m8, 3 ; f1 + paddw m4, m10 + psubw m5, m8 + + paddw m8, [pw_1] + psraw m8, 1 ; f=(f1+1)>>1 + pandn m8, m7, m8 ; f&=!hev + paddw m3, m8 + psubw m6, m8 + pxor m8, m8 + psubw m0, m2 ; 1023 or 4095 + REPX {pminsw x, m0}, m3, m4, m5, m6 + REPX {pmaxsw x, m8}, m3, m4, m5, m6 + +%if %1 == 16 + +; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2,7-8,10-11 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m11, [tmpq+strideq*4] ; p3 +%else + mova m0, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m11, [rsp+5*32] +%endif + + mova [rsp+ 0*32], m9 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m8, m0, 3 ; p6*8 + paddw m8, [pw_8] + paddw m10, m2, m7 ; p5+p4 + psubw m8, m0 + paddw m10, m10 ; (p5+p4)*2 + paddw m8, m11 ; p6*7+p3 + paddw m10, m13 ; (p5+p4)*2+p2 + paddw m8, m3 ; p6*7+p3+p1 + paddw m10, m4 ; (p5+p4)*2+p2+p0 + paddw m8, m5 ; p6*7+p3+p1+q0 + paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m2 + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m10 ; p5 +%else + mova [rsp+8*32], m10 +%endif + + ; sub p6*2, add p3/q1 + paddw m8, m11 + paddw m10, m0, m0 + paddw m8, m6 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m7 + por m10, m9 +%ifidn %2, v + mova [tmpq+stride3q], m10 ; p4 +%else + mova [rsp+9*32], m10 +%endif + + ; sub p6/p5, add p2/q2 + psubw m8, m0 + paddw m10, m13, m14 + psubw m8, m2 + paddw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m11 + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*4], m10 ; p3 + lea tmpq, [dstq+strideq*4] +%else + mova [rsp+5*32], m10 +%endif + + ; sub p6/p4, add p1/q3 + paddw m8, m3 + paddw m10, m0, m7 + paddw m8, m15 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m13 + por m10, m9 + mova [rsp+1*32], m10 ; don't clobber p2/m13 + + ; sub p6/p3, add p0/q4 + paddw m8, m4 + paddw m10, m0, m11 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m3 + por m10, m9 + mova [rsp+2*32], m10 ; don't clobber p1/m3 + + ; sub p6/p2, add q0/q5 + paddw m8, m5 + paddw m10, m0, m13 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m4 + por m10, m9 + mova [rsp+3*32], m10 ; don't clobber p0/m4 + + ; sub p6/p1, add q1/q6 + paddw m8, m6 + paddw m10, m0, m3 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+12*32] ; q6 +%endif + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m5 + por m10, m9 + mova [rsp+4*32], m10 ; don't clobber q0/m5 + + ; sub p5/p0, add q2/q6 + paddw m8, m14 + paddw m10, m2, m4 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m6 + por m2, m10, m9 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m8, m15 + paddw m10, m7, m5 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m14 + por m7, m10, m9 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + paddw m10, m11, m6 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m15 + por m10, m9 +%ifidn %2, v + 
mova [tmpq+mstrideq], m10 ; q3 +%else + mova [rsp+14*32], m10 +%endif + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + paddw m10, m13, m14 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 +%ifidn %2, v + pandn m9, m1, [tmpq+strideq*0] +%else + pandn m9, m1, [rsp+10*32] +%endif + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*0], m10 ; q4 +%else + mova [rsp+10*32], m10 +%endif + + ; sub p1/q3, add q6*2 + psubw m8, m3 + paddw m0, m0 + psubw m8, m15 + paddw m8, m0 + psrlw m10, m8, 4 + pand m10, m1 +%ifidn %2, v + pandn m9, m1, [tmpq+strideq*1] +%else + pandn m9, m1, [rsp+11*32] +%endif + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+11*32], m10 +%endif + + mova m9, [rsp+0*32] + mova m13, [rsp+1*32] + mova m3, [rsp+2*32] + mova m4, [rsp+3*32] + mova m5, [rsp+4*32] + SWAP 2, 6 + SWAP 7, 14 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%else + mova m15, [rsp+14*32] +%endif +%endif + +%if %1 >= 8 + ; flat8 filter +%ifidn %2, v + mova m0, [tmpq+strideq*0] ; p3 +%else + mova m0, [rsp+5*32] ; p3 +%endif + paddw m1, m0, m13 ; p3+p2 + paddw m2, m3, m4 ; p1+p0 + paddw m8, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m8, m5 ; 2*(p3+p2)+q0 + paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [pw_4096] + + paddw m8, m3, m6 + psubw m2, m1 + paddw m2, m8 + pmulhrsw m8, m2, [pw_4096] + + paddw m10, m0, m3 + paddw m11, m4, m14 + psubw m2, m10 + paddw m2, m11 + pmulhrsw m10, m2, [pw_4096] + + paddw m11, m0, m4 + paddw m1, m5, m15 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m11, m2, [pw_4096] + + paddw m2, m6 + paddw m2, m15 + paddw m1, m13, m5 + psubw m2, m1 + pmulhrsw m1, m2, [pw_4096] + + psubw m2, m3 + psubw m2, m6 + paddw m0, m15, m14 + paddw m2, m0 + pmulhrsw m2, [pw_4096] + + REPX {pand x, m9}, m7, m8, m10, m11, m1, m2 + REPX {pandn x, m9, x}, m13, m3, m4, m5, m6, m14 + por m13, m7 + por m3, m8 + por m4, m10 + por m5, m11 + por m6, m1 + por m14, m2 + +%ifidn %2, v + mova [tmpq+strideq*1], m13 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 + mova [dstq+strideq*2], m14 ; q2 +%else + mova m0, [rsp+5*32] +%if %1 == 8 + TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 + + ; write 8x16 + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm3 + movu [dstq+stride3q -8], xm4 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm5 + movu [dstq+strideq*1-8], xm6 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m0, 1 + vextracti128 [dstq+strideq*1-8], m13, 1 + vextracti128 [dstq+strideq*2-8], m3, 1 + vextracti128 [dstq+stride3q -8], m4, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m5, 1 + vextracti128 [dstq+strideq*1-8], m6, 1 + vextracti128 [dstq+strideq*2-8], m14, 1 + vextracti128 [dstq+stride3q -8], m15, 1 + lea dstq, [dstq+strideq*4] +%else + mova m0, [rsp+6*32] + mova m1, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m8, [rsp+5*32] + TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 + + mova [dstq+strideq*0-16], xm0 + mova [dstq+strideq*1-16], xm1 + mova [dstq+strideq*2-16], xm2 + mova [dstq+stride3q -16], xm7 + lea tmpq, [dstq+strideq*4] + mova [tmpq+strideq*0-16], xm8 + mova [tmpq+strideq*1-16], xm13 + mova [tmpq+strideq*2-16], xm3 + mova [tmpq+stride3q -16], xm4 + lea tmpq, [tmpq+strideq*4] + vextracti128 [tmpq+strideq*0-16], m0, 1 + vextracti128 
[tmpq+strideq*1-16], m1, 1 + vextracti128 [tmpq+strideq*2-16], m2, 1 + vextracti128 [tmpq+stride3q -16], m7, 1 + lea tmpq, [tmpq+strideq*4] + vextracti128 [tmpq+strideq*0-16], m8, 1 + vextracti128 [tmpq+strideq*1-16], m13, 1 + vextracti128 [tmpq+strideq*2-16], m3, 1 + vextracti128 [tmpq+stride3q -16], m4, 1 + + mova m0, [rsp+10*32] + mova m1, [rsp+11*32] + mova m2, [rsp+12*32] + mova m3, [rsp+13*32] + TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 + mova [dstq+strideq*0], xm5 + mova [dstq+strideq*1], xm6 + mova [dstq+strideq*2], xm14 + mova [dstq+stride3q ], xm15 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0], m5, 1 + vextracti128 [dstq+strideq*1], m6, 1 + vextracti128 [dstq+strideq*2], m14, 1 + vextracti128 [dstq+stride3q ], m15, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0], m0, 1 + vextracti128 [dstq+strideq*1], m1, 1 + vextracti128 [dstq+strideq*2], m2, 1 + vextracti128 [dstq+stride3q ], m3, 1 + lea dstq, [dstq+strideq*4] +%endif +%endif +%elif %1 == 6 + ; flat6 filter + + paddw m8, m3, m4 + paddw m8, m13 ; p2+p1+p0 + paddw m11, m13, m5 + paddw m8, m8 + paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m8, [pw_4096] + + paddw m8, m5 + paddw m11, m13, m13 + paddw m8, m6 + psubw m8, m11 + pmulhrsw m10, m8, [pw_4096] + + paddw m8, m6 + paddw m11, m13, m3 + paddw m8, m14 + psubw m8, m11 + pmulhrsw m11, m8, [pw_4096] + + psubw m8, m3 + paddw m14, m14 + psubw m8, m4 + paddw m8, m14 + pmulhrsw m8, [pw_4096] + + REPX {pand x, m9}, m2, m10, m11, m8 + REPX {pandn x, m9, x}, m3, m4, m5, m6 + por m3, m2 + por m4, m10 + por m5, m11 + por m6, m8 + +%ifidn %2, v + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x16 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x16 +%endif +%endif +%endmacro + +INIT_YMM avx2 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + FILTER 4, v + +.end: + pslld m12, 4 + add lq, 16 + add dstq, 32 + shl mask_bitsd, 4 + sub wd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz 
.no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 4 + lea lq, [lq+l_strideq*4] + shl mask_bitsd, 4 + sub hd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + FILTER 4, v + +.end: + pslld m12, 4 + add lq, 16 + add dstq, 32 + shl mask_bitsd, 4 + sub wd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 4 + lea lq, [lq+l_strideq*4] + shl mask_bitsd, 4 + sub hd, 4 + jg .loop + RET + +%endif ; ARCH_X86_64 From 877ec2a0f40da1672de4ae2f60b47734eacca435 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 4 May 2021 14:09:14 +0200 Subject: [PATCH 063/188] x86: Add high bitdepth (10-bit) itx AVX2 asm --- src/x86/itx16_avx2.asm | 6363 ++++++++++++++++++++++++++++++++++++++++ src/x86/itx_avx2.asm | 90 +- 2 files changed, 6409 insertions(+), 44 deletions(-) create mode 100644 src/x86/itx16_avx2.asm diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm new file mode 100644 index 0000000000..5df5f4619b --- /dev/null +++ b/src/x86/itx16_avx2.asm @@ -0,0 +1,6363 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482 +itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 + dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 +iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 +idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +%macro COEF_PAIR 2 +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1931 +COEF_PAIR 799, 3406 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567 +COEF_PAIR 2896, 3784 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4076, 3612 +COEF_PAIR 4091, 3973 + +%define pd_1321 (pd_1321_2482 + 4*0) +%define pd_2482 (pd_1321_2482 + 4*4) + +pd_m601: dd -601 +pd_m1189: dd -1189 +pd_m1380: dd -1380 +pd_m2106: dd -2106 +pd_m2598: dd -2598 +pd_m2751: dd -2751 +pd_m3344: dd -3344 +pd_3803: dd 3803 +pd_5793: dd 5793 +pd_6144: dd 6144 ; 2048 + 4096 +pd_10239: dd 10239 ; 2048 + 8192 - 1 +pd_10240: dd 10240 ; 2048 + 8192 +pd_11586: dd 11586 ; 5793 * 2 +pd_38912: dd 38912 ; 2048 + 4096 + 32768 + +pixel_max: times 2 dw 0x03ff ; 10bpc +clip_min: dd -0x20000 +clip_max: dd 0x1ffff + +idct64_mul_16bpc: +dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 +dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 +dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 +dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 + +cextern deint_shuf +cextern idct64_mul +cextern pw_1697x8 +cextern pw_1697x16 +cextern pw_1567_3784 +cextern pw_m1567_m3784 +cextern pw_m3784_1567 +cextern pw_2896_2896 +cextern pw_m2896_2896 +cextern pw_5 +cextern pw_2048 +cextern pw_4096 +cextern pw_8192 +cextern pw_16384 +cextern pw_2896x8 +cextern pd_2048 + +cextern idct_4x8_internal_avx2.main +cextern idct_4x16_internal_avx2.main +cextern idct_8x8_internal_avx2.main +cextern idct_8x16_internal_avx2.main +cextern idct_16x4_internal_avx2.main +cextern idct_16x8_internal_avx2.main +cextern idct_16x16_internal_avx2.main +cextern inv_txfm_add_dct_dct_8x32_avx2.main +cextern inv_txfm_add_dct_dct_8x32_avx2.main_fast +cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x64_avx2.main_part1 +cextern inv_txfm_add_dct_dct_16x64_avx2.main_part2_internal + +cextern iadst_4x4_internal_avx2.main +cextern iadst_4x8_internal_avx2.main_pass2 +cextern iadst_4x16_internal_avx2.main2 +cextern iadst_8x4_internal_avx2.main +cextern iadst_8x8_internal_avx2.main_pass2 +cextern 
iadst_8x16_internal_avx2.main +cextern iadst_8x16_internal_avx2.main_pass2_end +cextern iadst_16x4_internal_avx2.main +cextern iadst_16x8_internal_avx2.main +cextern iadst_16x8_internal_avx2.main_pass2_end +cextern iadst_16x16_internal_avx2.main +cextern iadst_16x16_internal_avx2.main_pass2_end + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro IWHT4_1D_PACKED 0 + ; m0 = in0 in2, m1 = in1 in3 + psubd m2, m0, m1 ; t2 + paddd xm0, xm1 ; t0 + vpermq m2, m2, q3322 + vpermq m0, m0, q1100 + vpermq m1, m1, q3120 + psubd m3, m0, m2 + psrad m3, 1 + psubd m3, m1 ; t1 t3 + psubd m0, m3 ; ____ out0 + paddd m2, m3 ; out3 ____ +%endmacro + +INIT_YMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c + mova xm0, [cq+16*0] + vinserti128 m0, [cq+16*2], 1 + mova xm1, [cq+16*1] + vinserti128 m1, [cq+16*3], 1 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + lea r6, [dstq+strideq*2] + psrad m0, 2 + psrad m1, 2 + IWHT4_1D_PACKED + punpckhdq m0, m3 + punpckldq m3, m2 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x33 + packssdw m0, m3 + vextracti128 xm2, m0, 1 + punpckhdq xm1, xm0, xm2 ; out2 out1 + punpckldq xm0, xm2 ; out3 out0 + movq xm2, [r6 +strideq*1] + movhps xm2, [dstq+strideq*0] + movq xm3, [r6 +strideq*0] + movhps xm3, [dstq+strideq*1] + vpbroadcastd xm5, [pixel_max] + paddsw xm0, xm2 + paddsw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movq [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm0 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %9 & 1 + vbroadcasti128 m%3, [pd_%8] +%else + vpbroadcastd m%3, [pd_%8] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %9 & 1 + vbroadcasti128 m%5, [pd_%7] +%else + vpbroadcastd m%5, [pd_%7] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%if %9 & 2 ; invert the upper half of dst1 before rounding + vbroadcasti128 m%4, [pw_2048_m2048] + psubd m%1, m%3 + psignd m%1, m%4 + paddd m%1, m%6 +%else +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_16bpc) + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
+ lea tx2q, [m(i%2_%4_internal_16bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + movd xm1, [pw_2896x8] + mov [cq], eobd ; 0 + add r6d, 2048 + sar r6d, 12 + movd xm0, r6d + packssdw xm0, xm0 + pmulhrsw xm0, xm1 + vpbroadcastw xm0, xm0 + mova xm1, xm0 + jmp m(iadst_4x4_internal_16bpc).end +%endif +%endmacro + +%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd + ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 + punpckhqdq m%3, m%2, m%1 ; t3 t2 + punpcklqdq m%2, m%1 ; t0 t1 + paddd m%1, m%2, m%3 ; out0 out1 + psubd m%2, m%3 ; out3 out2 +%endmacro + +%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd + vpbroadcastd m%5, [pw_m3784_1567] + punpckhwd m%3, m%2, m%1 + psubw m%4, m%1, m%2 + paddw m%1, m%2 + vpbroadcastd m%2, [pw_1567_3784] + punpcklqdq m%1, m%4 + vpbroadcastd m%4, [pw_2896x8] + pmaddwd m%5, m%3 + pmaddwd m%3, m%2 + pmulhrsw m%1, m%4 ; t0 t1 + paddd m%5, m%6 + paddd m%3, m%6 + psrad m%5, 12 + psrad m%3, 12 + packssdw m%3, m%5 ; t3 t2 + psubsw m%2, m%1, m%3 ; out3 out2 + paddsw m%1, m%3 ; out0 out1 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [pd_2048] + IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 + vbroadcasti128 m2, [idct4_shuf] + packssdw m0, m1 + pshufb m0, m2 + jmp tx2q +.pass2: + vextracti128 xm1, m0, 1 + WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 + packssdw xm5, xm5 ; pw_2048 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*1] + movhps xm3, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vpermd m0, m4, m0 + psrld m4, 4 + pshufb m0, m4 + jmp tx2q +.pass2: + lea rax, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal).main +.end: + vpbroadcastd xm4, [pw_2048] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm5, [pixel_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + mova m2, [cq+16*2] + mova m0, [pd_1321_2482] + vpbroadcastd m3, [pd_3803] + vbroadcasti128 m5, [cq+16*0] + vpbroadcastd m1, [pd_m3344] + pmulld m4, m0, m2 + pmulld m3, m2 + pmulld m0, m5 + vpbroadcastd m5, 
[pd_2048] + psubd xm2, [cq+16*3] + psubd m2, [cq+16*0] + pmulld m2, m1 ; t2 t3 + vpermq m4, m4, q1032 + paddd m4, m3 + psubd m0, m4 + paddd xm4, xm4 + paddd m4, m0 ; t0 t1 + vinserti128 m3, m2, xm4, 1 ; t2 t0 + paddd m0, m4, m5 + psubd xm4, xm2 + psubd m1, m0, m2 + vpermq m2, m2, q3232 ; t3 t3 + psubd m1, m4 + mova m4, [itx4_shuf] + paddd m0, m2 ; out0 out1 + paddd m1, m3 ; out2 out3 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + psrld m1, m4, 8 + vpermd m0, m1, m0 + psrld m4, 4 + pshufb m0, m4 + jmp tx2q +.pass2: + lea rax, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal).main + vpbroadcastd xm4, [pw_2048] + movq xm3, [dstq+strideq*1] + movhps xm3, [dstq+strideq*0] + lea r6, [dstq+strideq*2] + movq xm2, [r6 +strideq*1] + movhps xm2, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movq [r6 +strideq*1], xm0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m3, m0 + psrld m3, 4 + pshufb m0, m3 + jmp tx2q +.pass2: + vpbroadcastd m1, [pw_1697x8] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + pmulhrsw m1, m0 + paddsw m0, m1 + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm4, [pixel_max] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pxor m5, m5 + mova [cq+32*0], m5 + mova [cq+32*1], m5 + vextracti128 xm1, m0, 1 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm5 + pmaxsw xm1, xm5 + pminsw xm0, xm4 + pminsw xm1, xm4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 2048 + sar r6d, 12 +.end: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw xm0, xm0 +.end2: + vpbroadcastd xm3, [pixel_max] + pxor xm2, xm2 +.end_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddw xm1, xm0 + pmaxsw xm1, xm2 + pminsw xm1, xm3 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .end_loop + WRAP_XMM RET +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 + vpbroadcastd m%5, [pd_2896] + pmulld m%1, m%5 + pmulld m%3, m%5 + paddd m%1, m%8 + paddd m%5, m%1, m%3 + psubd m%1, m%3 + psrad m%5, 12 ; t0 + psrad m%1, 12 ; t1 + psubd m%3, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%5, m%4 + psubd m%4, m%5, m%4 +%endmacro + +INV_TXFM_4X8_FN dct, dct 
+INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, m3, [cq+32*3] + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + packssdw m0, m2 + packssdw m1, m3 + lea rax, [deint_shuf+128] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ; 2 3 + punpckldq m0, m2 ; 0 1 + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + call m(idct_4x8_internal).main + vpbroadcastd xm4, [pw_2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 3 2 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movhps [r6 +strideq*2], xm3 + movq [r6 +r3 ], xm3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_16bpc).main + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + jmp tx2q +.pass2: + call .pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 +.end: + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 2 3 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 6 7 + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + packssdw m0, m2 + packssdw m1, m3 + lea rax, [deint_shuf+128] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpckhdq m5, m4, m0 + punpckldq m4, m0 + vextracti128 xm2, m4, 1 ; 4 5 + vextracti128 xm3, m5, 1 ; 6 7 + pshufd xm4, xm4, q1032 ; 1 0 + pshufd xm5, xm5, q1032 ; 3 2 + jmp m(iadst_4x8_internal).main_pass2 +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m2, [cq+16*2] + vbroadcasti128 m3, [cq+16*5] + vbroadcasti128 m1, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m0, m2, 0x0c ; 0 2 + shufpd m1, m3, 0x0c ; 7 5 + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m4, [cq+16*6] + vbroadcasti128 m5, [cq+16*1] + vbroadcasti128 m3, [cq+16*3] + vpbroadcastd m7, [pd_2048] + vpbroadcastd m8, [clip_min] + vpbroadcastd m9, [clip_max] + shufpd m2, 
m4, 0x0c ; 4 6 + shufpd m3, m5, 0x0c ; 3 1 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m8}, m4, m2, m0, m1 + REPX {pminsd x, m9}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + vpblendd m4, m2, 0xcc ; t4 t7 + vpblendd m2, m5, 0xcc ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 + vpbroadcastd m5, [pd_2896] + vbroadcasti128 m6, [pw_2048_m2048] ; + + - - + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m8}, m1, m2 + REPX {pminsd x, m9}, m1, m2 + vpblendd m3, m1, m2, 0xcc + shufpd m1, m2, 0x05 + pmulld m3, m5 + pmulld m5, m1 + psignd m0, m6 ; out0 out7 + psignd m4, m6 ; out6 out1 + paddd m3, m7 + psubd m2, m3, m5 + paddd m5, m3 + psrad m2, 12 ; out4 -out5 + psrad m5, 12 ; -out3 out2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_16bpc).main + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + jmp tx2q +.pass2: + call m(iadst_4x8_internal_16bpc).pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*1] + movhps xm4, [dstq+strideq*0] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*1] + movhps xm6, [r6 +strideq*0] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm3, xm4 ; 1 0 + paddw xm2, xm5 ; 3 2 + paddw xm1, xm6 ; 5 4 + paddw xm0, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 + REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 + movhps [dstq+strideq*0], xm3 + movq [dstq+strideq*1], xm3 + movhps [dstq+strideq*2], xm2 + movq [dstq+r3 ], xm2 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + movhps [r6 +strideq*2], xm0 + movq [r6 +r3 ], xm0 + RET + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, [cq+32*3] + vpbroadcastd m5, [pd_2048] + vpbroadcastd m4, [pd_5793] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m4, [pw_4096] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m4 + pmulhrsw m0, m4 + punpckhdq m1, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + vpbroadcastq m4, [r6 +strideq*0] + vpbroadcastq m5, [r6 +strideq*1] + movq xm3, [dstq+strideq*2] + movhps xm3, [dstq+r3 ] + vpblendd m2, m4, 0x30 + vpblendd m2, m5, 0xc0 + vpbroadcastq m4, [r6 +strideq*2] + 
vpbroadcastq m5, [r6 +r3 ] + vpblendd m3, m4, 0x30 + vpblendd m3, m5, 0xc0 + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + mova m1, [cq+32*2] + mova m3, [cq+32*6] + mova m5, [cq+32*3] + mova m7, [cq+32*7] + vpbroadcastd m4, [pd_3784] + vpbroadcastd m8, [pd_1567] + vpbroadcastd m9, [pd_2048] + vpbroadcastd m6, [pd_2896] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h + pmulld m0, m6, [cq+32*0] + pmulld m2, m6, [cq+32*4] + pmulld m4, m6, [cq+32*1] + pmulld m6, [cq+32*5] + vpbroadcastd m8, [pd_6144] + paddd m0, m8 + paddd m4, m8 + paddd m8, m0, m2 + psubd m0, m2 + paddd m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea rax, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 ; 2 3 + punpckldq m0, m4 ; 0 1 + punpckldq m4, m5, m2 ; 8 9 + punpckhdq m5, m2 ; a b + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + vextracti128 xm6, m4, 1 ; c d + vextracti128 xm7, m5, 1 ; e f + call m(idct_4x16_internal).main + vpbroadcastd m9, [pw_2048] + vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 + vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 + vinserti128 m2, m4, xm5, 1 ; 8 9 b a + vinserti128 m3, m6, xm7, 1 ; c d f e + vpbroadcastd m8, [pixel_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + vpbroadcastq m5, [dstq+strideq*2] + vpbroadcastq m6, [dstq+r6 ] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movhps [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_16bpc).main + psrad m0, m4, 13 + psrad m1, m5, 13 + psrad m2, 13 + psrad m3, 13 + psrad m4, m8, 13 + psrad m5, m9, 13 + psrad m6, 13 + psrad m7, 13 + jmp tx2q +.pass2: + call .pass2_main + 
vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+r6 ] + movhps xm4, [dstq+strideq*0] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movhps [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm5 + movq [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea rax, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + punpckldq m4, m5, m2 + punpckhdq m5, m2 + vpblendd m3, m0, m1, 0x33 + vpblendd m0, m1, 0xcc + shufpd m2, m5, m4, 0x05 + shufpd m4, m5, 0x05 + vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 + vinserti128 m0, xm3, 1 ; 0 3 2 1 + vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? + vinserti128 m2, xm4, 1 ; b 8 9 a + call m(iadst_4x16_internal).main2 + vpbroadcastd m5, [pw_2896x8] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 2] + vbroadcasti128 m1, [cq+16*15] + vbroadcasti128 m5, [cq+16*13] + vbroadcasti128 m2, [cq+16* 4] + vbroadcasti128 m6, [cq+16* 6] + vbroadcasti128 m3, [cq+16*11] + vbroadcasti128 m7, [cq+16* 9] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m1, m5, 0x0c ; 15 13 + shufpd m2, m6, 0x0c ; 4 6 + shufpd m3, m7, 0x0c ; 11 9 + vbroadcasti128 m4, [cq+16* 8] + vbroadcasti128 m6, [cq+16*10] + vbroadcasti128 m5, [cq+16* 7] + vbroadcasti128 m7, [cq+16* 5] + shufpd m4, m6, 0x0c ; 8 10 + shufpd m5, m7, 0x0c ; 7 5 + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m7, [cq+16*14] + shufpd m6, m7, 0x0c ; 12 14 + vbroadcasti128 m7, [cq+16* 3] + vbroadcasti128 m8, [cq+16* 1] + shufpd m7, m8, 0x0c ; 3 1 + vpbroadcastd m11, [pd_2048] + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 + ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 + vpbroadcastd m10, 
[pd_2896] + vbroadcasti128 m9, [pw_2048_m2048] ; + + - - + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmulld x, m10}, m6, m5, m3, m4 + paddd m6, m11 + paddd m4, m11 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {psignd x, m9}, m1, m8, m3, m6 + pshufd m9, m9, q1032 + REPX {psignd x, m9}, m0, m7, m2, m5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_16bpc).main + psrad m0, m3, 13 + psrad m1, m2, 13 + psrad m2, m5, 13 + psrad m3, m4, 13 + psrad m4, m7, 13 + psrad m5, m6, 13 + psrad m6, m9, 13 + psrad m7, m8, 13 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_16bpc).pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+r6 ] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0x30 + vpblendd m4, m6, 0xc0 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm5 + movhps [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + vpbroadcastd m7, [pd_5793] + pmulld m0, m7, [cq+32*0] + pmulld m4, m7, [cq+32*1] + pmulld m1, m7, [cq+32*2] + pmulld m5, m7, [cq+32*3] + pmulld m2, m7, [cq+32*4] + pmulld m6, m7, [cq+32*5] + pmulld m3, m7, [cq+32*6] + pmulld m7, [cq+32*7] + vpbroadcastd m8, [pd_6144] + REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 + REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m7, [pw_1697x16] + vpbroadcastd m8, [pw_2048] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m4, [pixel_max] + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + lea r6, [strideq*5] + pxor m3, m3 + punpckhdq m5, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + punpckldq m6, m7, m1 ; 8 9 c d + punpckhdq m7, m1 ; a b e f + pmulhrsw m0, m8 + call .write_2x4x2 + pmulhrsw m0, m5, m8 + call .write_2x4x2 + pmulhrsw m0, m6, m8 + lea dstq, [dstq+strideq*4] + call 
.write_2x4x2 + pmulhrsw m0, m7, m8 + call .write_2x4x2 + RET +ALIGN function_align +.write_2x4x2: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + vpbroadcastq m2, [dstq+strideq*4] + vpblendd m1, m2, 0x30 + vpbroadcastq m2, [dstq+r6 ] + vpblendd m1, m2, 0xc0 + mova [cq+32*0], m3 + mova [cq+32*1], m3 + add cq, 32*2 + paddw m1, m0 + pmaxsw m1, m3 + pminsw m1, m4 + vextracti128 xm2, m1, 1 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r6 ], xm2 + lea dstq, [dstq+strideq*2] + ret + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 8x4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 +.end: + vpbroadcastd m4, [pixel_max] + pxor m3, m3 + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm2, [r6 +strideq*0] + vinserti128 m2, [r6 +strideq*1], 1 + paddw m1, m0 + paddw m2, m0 + pmaxsw m1, m3 + pmaxsw m2, m3 + pminsw m1, m4 + pminsw m2, m4 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + mova [r6 +strideq*0], xm2 + vextracti128 [r6 +strideq*1], m2, 1 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vbroadcasti128 m1, [cq+16*1] + vbroadcasti128 m0, [cq+16*5] + vbroadcasti128 m2, [cq+16*3] + vbroadcasti128 m3, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m1, m0, 0x0c ; 1 5 + shufpd m3, m2, 0x0c ; 7 3 + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m4, [cq+16*2] + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m5, [cq+16*6] + vpbroadcastd m7, [pd_2048] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m2, m5, 0x0c ; 4 6 + REPX {pmulld x, m6}, m1, m3, m0, m2 + REPX {paddd x, m7}, m1, m3, m0, m2 + REPX {psrad x, 12}, m1, m3, m0, m2 + call .main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp tx2q +.pass2: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q2031 ; out2 out3 + jmp m(iadst_8x4_internal_16bpc).end +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 + IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 + vpbroadcastd m8, [clip_min] + vpbroadcastd m9, [clip_max] + vpbroadcastd m6, [pd_2896] + punpcklqdq m4, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m4, m1 ; t5a t6a + paddd m4, m1 ; t4 t7 + REPX {pmaxsd x, m8}, m3, m4, m0, m2 + REPX {pminsd x, m9}, m3, m4, m0, m2 + pmulld m3, m6 + pshufd m1, m3, q1032 + paddd m3, m7 + psubd m5, m3, m1 + paddd m1, m3 + psrad m5, 12 + psrad m1, 12 + vpblendd m5, m4, 0x33 ; t4 t5 + punpckhqdq m4, m1 ; t7 t6 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + call .pass2_main + 
vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q3120 ; out2 out3 +.end: + vpbroadcastd m1, [pw_2048] + pmulhrsw m0, m1 + pmulhrsw m1, m2 +.end2: + mova xm2, [dstq+strideq*0] + vinserti128 m2, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm3, [r6 +strideq*0] + vinserti128 m3, [r6 +strideq*1], 1 + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [r6 +strideq*0], xm1 + vextracti128 [r6 +strideq*1], m1, 1 + RET +ALIGN function_align +.pass2_main: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + lea rax, [deint_shuf+128] + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + jmp m(iadst_8x4_internal).main +ALIGN function_align +.main: + vpbroadcastd m1, [pd_2896] + pmulld m0, m1, [cq+32*0] + pmulld m3, m1, [cq+32*3] + pmulld m2, m1, [cq+32*2] + pmulld m1, [cq+32*1] + vpbroadcastd m4, [pd_2048] + REPX {paddd x, m4}, m0, m3, m2, m1 + REPX {psrad x, 12}, m0, m3, m2, m1 + vbroadcasti128 m6, [pd_1321] + vbroadcasti128 m7, [pd_2482] + pmulld m4, m0, m6 ; 1321*in0 + pmulld m5, m3, m7 ; 2482*in3 + paddd m4, m5 ; 1321*in0 + 2482*in3 + pmulld m5, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m6 ; pd_3803 + pmulld m6, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + psubd m5, m6 ; 2482*in0 - 1321*in2 + vpbroadcastd m6, [pd_2048] + psubd m5, m3 ; t1 + pmulld m2, m0 ; t2 + pmulld m1, m0 ; -t3 + paddd m4, m7 ; t0 + paddd m5, m6 + paddd m3, m4, m5 + paddd m4, m6 + psubd m4, m1 ; out0 (unshifted) + psubd m5, m1 ; out1 (unshifted) + paddd m2, m6 ; out2 (unshifted) + paddd m3, m1 ; out3 (unshifted) + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 5, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).main + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_16bpc).pass2_main + vpermq m2, m0, q2031 + vpermq m0, m1, q2031 + jmp m(iadst_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m4, [pd_2896] + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpermq m2, [cq+32*2], q3120 + vpermq m3, [cq+32*3], q3120 + vpbroadcastd m7, [pd_2048] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {paddd x, x }, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m4, [pw_1697x8] + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m1, m4, m0 + pmulhrsw m4, m2 + paddsw m0, m1 + paddsw m2, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + packssdw m7, m7 ; pw_2048 + lea r6, [dstq+strideq*2] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova xm2, [dstq+strideq*0] + vinserti128 m2, [r6 +strideq*0], 1 + mova xm3, [dstq+strideq*1] + vinserti128 m3, [r6 +strideq*1], 1 + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 
+ paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [r6 +strideq*0], m0, 1 + vextracti128 [r6 +strideq*1], m1, 1 + RET + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 8x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 +.dconly: + add r6d, 6144 + sar r6d, 13 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 + vpbroadcastd m3, [pixel_max] + pxor m2, m2 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + paddw m1, m0 + pmaxsw m1, m2 + pminsw m1, m3 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2] + ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a + psubd m%9, m%3, m%7 ; t6 + paddd m%3, m%7 ; t2 + psubd m%7, m%1, m%5 ; t4 + paddd m%1, m%5 ; t0 + psubd m%5, m%6, m%2 ; t7 + paddd m%6, m%2 ; t3 + psubd m%2, m%8, m%4 ; t5 + paddd m%8, m%4 ; t1 + REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a + psubd m%10, m%7, m%9 ; t7 + paddd m%7, m%9 ; out6 + vpbroadcastd m%9, [pd_2896] + psubd m%4, m%8, m%6 ; t3 + paddd m%8, m%6 ; -out7 + psubd m%6, m%1, m%3 ; t2 + paddd m%1, m%3 ; out0 + psubd m%3, m%2, m%5 ; t6 + paddd m%2, m%5 ; -out1 + REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 + REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 + REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 + psubd m%5, m%6, m%4 ; (t2 - t3) * 2896 + paddd m%4, m%6 ; (t2 + t3) * 2896 + psubd m%6, m%3, m%10 ; (t6 - t7) * 2896 + paddd m%3, m%10 ; (t6 + t7) * 2896 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + call .main + call .round_shift1 + jmp tx2q +.pass2: + call .transpose_8x8_packed + call m(idct_8x8_internal).main + vpbroadcastd m12, [pw_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_8x4 + RET +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_max] + lea r6, [strideq*3] + pxor m10, m10 +.write_8x4: + mova xm8, [dstq+strideq*0] + vinserti128 m8, [dstq+strideq*1], 1 + mova xm9, [dstq+strideq*2] + vinserti128 m9, [dstq+r6 ], 1 + mova [cq+32*0], m10 + mova [cq+32*1], m10 + mova [cq+32*2], m10 + mova [cq+32*3], m10 + add cq, 32*4 + paddw m0, m8 + paddw m1, m9 + pmaxsw m0, m10 + pmaxsw m1, m10 + pminsw m0, m11 + pminsw m1, m11 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + 
lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.transpose_8x8_packed: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea rax, [deint_shuf+128] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m4, m1 + punpckldq m4, m1 + vinserti128 m1, m3, xm2, 1 + vperm2i128 m3, m2, 0x31 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret +ALIGN function_align +.main_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + vpbroadcastd m3, [pd_2896] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +ALIGN function_align +.round_shift1: + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call .main + call .main_end + jmp tx2q +.pass2: + call m(idct_8x8_internal_16bpc).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal).main_pass2 + vpbroadcastd m5, [pw_2048] + vpbroadcastd xm12, [pw_4096] + psubw m12, m5 + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +ALIGN function_align +.main: + mova m0, [cq+32*0] + mova m7, [cq+32*7] + mova m1, [cq+32*1] + mova m6, [cq+32*6] + mova m2, [cq+32*2] + mova m5, [cq+32*5] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] +.main2: + IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + psrld m8, 11 ; pd_1 + vpbroadcastd m9, [pd_6144] + ret +ALIGN function_align +.main_end: + paddd m0, m8 + psubd m1, m8, m1 + paddd m6, m8 + psubd m7, m8, m7 + REPX {psrad x, 1 }, m0, m1, m6, m7 + ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13 + ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13 + psubd m8, m9, m8 ; pd_6143 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 13}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_8x8_internal_16bpc).main + call .main_end + jmp tx2q +.pass2: + call m(idct_8x8_internal_16bpc).transpose_8x8_packed + pshufd m4, m0, q1032 
+ pshufd m5, m1, q1032 + call m(iadst_8x8_internal).main_pass2 + vpbroadcastd m12, [pw_2048] + vpbroadcastd xm5, [pw_4096] + psubw m12, m5 + vpermq m8, m3, q2031 + vpermq m9, m2, q2031 + vpermq m2, m1, q2031 + vpermq m3, m0, q2031 + pmulhrsw m0, m8, m12 + pmulhrsw m1, m9, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +ALIGN function_align +.main_end: + paddd m10, m8, m0 + psubd m0, m8, m7 + psubd m7, m8, m1 + paddd m1, m8, m6 + psrad m0, 1 + psrad m1, 1 + psrad m6, m7, 1 + psrad m7, m10, 1 + psubd m8, m9, m8 ; pd_6143 + psubd m10, m8, m5 + paddd m5, m9, m2 + psubd m2, m8, m3 + paddd m3, m9, m4 + psrad m4, m2, 13 + psrad m2, m10, 13 + psrad m3, 13 + psrad m5, 13 + ret + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m12, [pw_4096] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + punpckhqdq m1, m0, m2 ; 1 5 + punpcklqdq m0, m2 ; 0 4 + punpcklqdq m2, m3, m4 ; 2 6 + punpckhqdq m3, m4 ; 3 7 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_2x8x2_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_2x8x2_zero + RET +.write_2x8x2_start: + vpbroadcastd m7, [pixel_max] + lea r6, [strideq*5] + pxor m6, m6 +.write_2x8x2_zero: + mova [cq+32*0], m6 + mova [cq+32*1], m6 + mova [cq+32*2], m6 + mova [cq+32*3], m6 + add cq, 32*4 +.write_2x8x2: + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + mova xm5, [dstq+strideq*1] + vinserti128 m5, [dstq+r6 ], 1 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pminsw m0, m7 + pminsw m1, m7 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*4], m0, 1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*2] + ret + +%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, 35 +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct_8x16_internal).main + vpbroadcastd m12, [pw_2048] 
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + pmulhrsw m0, m4, m12 + pmulhrsw m1, m5, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + pmulhrsw m0, m6, m12 + pmulhrsw m1, m7, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +ALIGN function_align +.transpose: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 + lea rax, [deint_shuf+128] + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m3, m6 + punpckldq m3, m6 + punpckhdq m6, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + vperm2i128 m2, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vperm2i128 m3, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m7, m5, m6, 0x31 + vinserti128 m5, xm6, 1 + vperm2i128 m6, m8, m4, 0x31 + vinserti128 m4, m8, xm4, 1 + ret +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct_8x8_internal_16bpc).main_rect2 + jmp m(idct_8x8_internal_16bpc).round_shift1 +ALIGN function_align +.main_evenhalf: + paddd m1, m6, m7 ; idct8 out1 + psubd m6, m7 ; idct8 out6 + psubd m7, m0, m9 ; idct8 out7 + paddd m0, m9 ; idct8 out0 + paddd m2, m5, m4 ; idct8 out2 + psubd m5, m4 ; idct8 out5 + psubd m4, m3, m8 ; idct8 out4 + paddd m3, m8 ; idct8 out3 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_oddhalf_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_fast: ; lower half zero + vpbroadcastd m7, [pd_4076] + vpbroadcastd m8, [pd_401] + vpbroadcastd m6, [pd_m1189] + vpbroadcastd m9, [pd_3920] + vpbroadcastd m5, [pd_3612] + vpbroadcastd m10, [pd_1931] + vpbroadcastd m4, [pd_m2598] + vpbroadcastd m15, [pd_3166] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_fast2 +.main_oddhalf_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf: + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a +.main_oddhalf_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t9 + paddd m0, m4 ; t8 + psubd m4, m6, m2 ; t10 + paddd m2, m6 ; t11 + psubd m6, m1, m5 ; t13 + paddd m5, m1 ; t12 + psubd m1, m7, m3 ; t14 + paddd m7, m3 ; t15 + REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 + REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 + psubd m3, m1, m4 ; t10 + paddd m1, m4 ; t9 + psubd m4, m0, m2 ; t11a + paddd 
m0, m2 ; t8a + psubd m2, m8, m6 ; t13 + paddd m6, m8 ; t14 + psubd m8, m7, m5 ; t12a + paddd m7, m5 ; t15a + REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pmulld x, m14}, m2, m8, m3, m4 + paddd m2, m11 + paddd m8, m11 + paddd m5, m2, m3 ; t13a + psubd m2, m3 ; t10a + psubd m3, m8, m4 ; t11 + paddd m4, m8 ; t12 + REPX {psrad x, 12}, m5, m2, m3, m4 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, 35 + +cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + call m(iadst_8x8_internal_16bpc).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + call m(iadst_8x8_internal_16bpc).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + call m(iadst_8x8_internal_16bpc).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_8x16_internal_16bpc).transpose + call m(iadst_8x16_internal).main + call m(iadst_8x16_internal).main_pass2_end + vpbroadcastd m8, [pw_2048] + vpbroadcastd xm12, [pw_4096] + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + psubw m12, m8 + jmp m(idct_8x16_internal_16bpc).end +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m7, m14, [cq+32*14] + pmulld m1, m14, [cq+32* 2] + pmulld m6, m14, [cq+32*12] + pmulld m2, m14, [cq+32* 4] + pmulld m5, m14, [cq+32*10] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(iadst_8x8_internal_16bpc).main2 + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, 35 + +cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + cmp eobd, 43 + jl .fast + add cq, 32 + call m(iadst_8x16_internal_16bpc).pass1_main + call m(iflipadst_8x8_internal_16bpc).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call m(iadst_8x16_internal_16bpc).pass1_main + call m(iflipadst_8x8_internal_16bpc).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call m(iadst_8x16_internal_16bpc).pass1_main + call m(iflipadst_8x8_internal_16bpc).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_8x16_internal_16bpc).transpose + call 
m(iadst_8x16_internal).main + call m(iadst_8x16_internal).main_pass2_end + vpbroadcastd m12, [pw_2048] + vpbroadcastd xm13, [pw_4096] + mova m11, m0 + vpermq m0, m7, q2031 + mova m10, m1 + vpermq m1, m6, q2031 + mova m9, m2 + vpermq m2, m5, q2031 + mova m8, m3 + vpermq m3, m4, q2031 + vpermq m4, m8, q3120 + vpermq m5, m9, q3120 + vpermq m6, m10, q3120 + vpermq m7, m11, q3120 + psubw m12, m13 + jmp m(idct_8x16_internal_16bpc).end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 0] + pmulld m8, m15, [cq+32* 1] + pmulld m1, m15, [cq+32* 2] + pmulld m9, m15, [cq+32* 3] + pmulld m2, m15, [cq+32* 4] + pmulld m10, m15, [cq+32* 5] + pmulld m3, m15, [cq+32* 6] + pmulld m11, m15, [cq+32* 7] + pmulld m4, m15, [cq+32* 8] + pmulld m12, m15, [cq+32* 9] + pmulld m5, m15, [cq+32*10] + pmulld m13, m15, [cq+32*11] + pmulld m6, m15, [cq+32*12] + pmulld m14, m15, [cq+32*13] + pmulld m7, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [cq], m7 + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 + vpbroadcastd m8, [pw_1697x16] + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + punpckhwd m9, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m2, m3 + punpckhwd m2, m3 + vpbroadcastd m12, [pw_2048] + punpckhdq m3, m0, m5 + punpckldq m0, m5 + punpckhdq m11, m9, m2 + punpckldq m9, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m1 + punpckhdq m7, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m8, m9, m6 + punpckhqdq m9, m6 + punpcklqdq m10, m11, m7 + punpckhqdq m11, m7 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(iidentity_8x8_internal_16bpc).write_2x8x2_start + pmulhrsw m0, m12, m2 + pmulhrsw m1, m12, m3 + call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + pmulhrsw m0, m12, m8 + pmulhrsw m1, m12, m9 + lea dstq, [dstq+strideq*4] + call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + pmulhrsw m0, m12, m10 + pmulhrsw m1, m12, m11 + call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + RET + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 16x4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 4 +.dconly: + add r6d, 6144 + sar r6d, 13 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 + vpbroadcastd m4, [pixel_max] + pxor m3, m3 +.dconly_loop: + paddw m1, m0, [dstq+strideq*0] + paddw m2, m0, [dstq+strideq*1] + pmaxsw m1, m3 + pmaxsw m2, m3 + pminsw m1, m4 + pminsw m2, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, identity +INV_TXFM_16X4_FN dct, adst 
+INV_TXFM_16X4_FN dct, flipadst + +cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 4] + vbroadcasti128 m1, [cq+16* 2] + vbroadcasti128 m7, [cq+16* 6] + vbroadcasti128 m5, [cq+16*10] + vbroadcasti128 m2, [cq+16* 8] + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m3, [cq+16*14] + shufpd m0, m4, 0x0c ; 0 4 + shufpd m1, m5, 0x0c ; 2 10 + shufpd m2, m6, 0x0c ; 8 12 + shufpd m3, m7, 0x0c ; 14 6 + vpbroadcastd m7, [pd_2048] + call m(idct_8x4_internal_16bpc).main + pcmpeqd m6, m6 + psubd m0, m6 + psubd m2, m6 + psubd m3, m0, m4 ; idct8 out7 out6 + paddd m0, m4 ; idct8 out0 out1 + paddd m1, m2, m5 ; idct8 out3 out2 + psubd m2, m5 ; idct8 out4 out5 + vbroadcasti128 m10, [cq+16* 1] + vbroadcasti128 m4, [cq+16* 5] + vbroadcasti128 m11, [cq+16*15] + vbroadcasti128 m5, [cq+16*11] + shufpd m10, m4, 0x0c ; 1 5 + shufpd m11, m5, 0x0c ; 15 11 + vbroadcasti128 m5, [cq+16* 9] + vbroadcasti128 m4, [cq+16*13] + shufpd m5, m4, 0x0c ; 9 13 + vbroadcasti128 m6, [cq+16* 7] + vbroadcasti128 m4, [cq+16* 3] + shufpd m6, m4, 0x0c ; 7 3 + ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 + ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 + psubd m4, m10, m5 ; t9 -t10 + paddd m10, m5 ; t8 t11 + psubd m5, m11, m6 ; t14 -t13 + paddd m11, m6 ; t15 t12 + REPX {pmaxsd x, m8}, m4, m5, m10, m11 + REPX {pminsd x, m9}, m4, m5, m10, m11 + ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2 + vpbroadcastd m12, [pd_2896] + punpckhqdq m6, m11, m5 + punpcklqdq m11, m4 + punpckhqdq m4, m10, m4 + punpcklqdq m10, m5 + psubd m5, m11, m6 ; t12a t13 + paddd m11, m6 ; t15a t14 + psubd m6, m10, m4 ; t11a t10 + paddd m10, m4 ; t8a t9 + REPX {pmaxsd x, m8}, m5, m6 + REPX {pminsd x, m9}, m5, m6 + pmulld m5, m12 + pmulld m6, m12 + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10 + REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10 + paddd m5, m7 + psubd m4, m5, m6 + paddd m5, m6 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + psubd m7, m0, m11 ; out15 out14 + paddd m0, m11 ; out0 out1 + psubd m6, m1, m5 ; out12 out13 + paddd m1, m5 ; out3 out2 + psubd m5, m2, m4 ; out11 out10 + paddd m2, m4 ; out4 out5 + psubd m4, m3, m10 ; out8 out9 + paddd m3, m10 ; out7 out6 + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .transpose_4x16_packed + lea rax, [deint_shuf+128] + call m(idct_16x4_internal).main +.end: + vpbroadcastd m4, [pw_2048] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 +.end2: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] +.end3: + lea r6, [dstq+strideq*2] + paddw m2, [r6 +strideq*0] + paddw m3, [r6 +strideq*1] + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + REPX {pminsw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [r6 +strideq*0], m2 + mova [r6 +strideq*1], m3 + RET +ALIGN function_align +.transpose_4x16_packed: + vbroadcasti128 m8, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + REPX {pshufb x, m8}, m0, m2, m4, m6 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m2, m4, m6 + punpcklqdq m4, m6 + vperm2i128 m3, m1, m2, 0x31 + vinserti128 m1, xm2, 1 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, 
eob, tx2 + call m(iadst_4x16_internal_16bpc).main + psrad m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + paddd m4, m5, m11 + paddd m5, m6, m11 + paddd m6, m7, m11 + paddd m7, m8, m11 +.pass1_end: + REPX {pshufd x, x, q1032}, m0, m2, m4, m6 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct_16x4_internal_16bpc).transpose_4x16_packed + lea rax, [deint_shuf+128] + call m(iadst_16x4_internal).main + jmp m(idct_16x4_internal_16bpc).end +ALIGN function_align +.main: + vbroadcasti128 m6, [pd_1321] + mova m0, [cq+32*0] + mova m1, [cq+32*1] + vbroadcasti128 m7, [pd_2482] + mova m2, [cq+32*6] + mova m3, [cq+32*7] + pmulld m4, m0, m6 + pmulld m5, m1, m6 ; 1321*in0 + pmulld m9, m2, m7 + pmulld m8, m3, m7 ; 2482*in3 + paddd m4, m9 + paddd m8, m5 ; 1321*in0 + 2482*in3 + pmulld m5, m0, m7 + pmulld m9, m1, m7 ; 2482*in0 + paddd m0, m2 + paddd m1, m3 ; in0 + in3 + paddd m7, m6 ; pd_3803 + pmulld m2, m7 + pmulld m3, m7 ; 3803*in3 + psubd m5, m2 + psubd m9, m3 ; 2482*in0 - 3803*in3 + mova m2, [cq+32*4] + pmulld m10, m7, m2 + pmulld m3, m6, m2 + psubd m2, m0 + mova m0, [cq+32*5] + pmulld m7, m0 ; 3803*in2 + pmulld m6, m0 ; 1321*in2 + psubd m0, m1 ; in2 - in0 - in3 + vpbroadcastd m1, [pd_m3344] + paddd m4, m10 + paddd m7, m8 ; t0 + psubd m5, m3 + psubd m9, m6 ; t1 + vpbroadcastd m6, [pd_6144] + pmulld m2, m1 + pmulld m0, m1 ; t2 + pmulld m3, m1, [cq+32*2] + pmulld m1, [cq+32*3] ; -t3 + paddd m5, m6 + paddd m9, m6 + paddd m10, m4, m5 + paddd m4, m6 + paddd m8, m7, m6 + paddd m7, m9 + psubd m4, m3 ; out0 (unshifted) + psubd m5, m3 ; out1 (unshifted) + paddd m2, m6 ; out2 (unshifted) + paddd m3, m10 ; out3 (unshifted) + psubd m8, m1 ; out4 (unshifted) + psubd m9, m1 ; out5 (unshifted) + paddd m6, m0 ; out6 (unshifted) + paddd m7, m1 ; out7 (unshifted) + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_4x16_internal_16bpc).main + psrad m11, 11 ; pd_1 + paddd m4, m3, m11 + paddd m3, m5, m11 + paddd m5, m2, m11 + paddd m2, m6, m11 + paddd m6, m1, m11 + paddd m1, m7, m11 + paddd m7, m0, m11 + paddd m0, m8, m11 + jmp m(iadst_16x4_internal_16bpc).pass1_end +.pass2: + call m(idct_16x4_internal_16bpc).transpose_4x16_packed + lea rax, [deint_shuf+128] + call m(iadst_16x4_internal).main + vpbroadcastd m4, [pw_2048] + pmulhrsw m5, m3, m4 + pmulhrsw m6, m2, m4 + pmulhrsw m2, m1, m4 + pmulhrsw m3, m0, m4 + paddw m0, m5, [dstq+strideq*0] + paddw m1, m6, [dstq+strideq*1] + jmp m(idct_16x4_internal_16bpc).end3 + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [pd_11586] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m4, [cq+32*4], q3120 ; 8 9 + vpermq m5, [cq+32*5], q3120 ; a b + vpermq m6, [cq+32*6], q3120 ; c d + vpermq m7, [cq+32*7], q3120 ; e f + vpbroadcastd m9, [pd_6144] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct_16x4_internal_16bpc).transpose_4x16_packed + vpbroadcastd m7, [pw_1697x8] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw 
m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_16x4_internal_16bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 16x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst + +cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m14, [pd_2896] + pmulld m0, m14, [cq+32* 1] + pmulld m1, m14, [cq+32* 3] + pmulld m2, m14, [cq+32* 5] + pmulld m3, m14, [cq+32* 7] + pmulld m4, m14, [cq+32* 9] + pmulld m5, m14, [cq+32*11] + pmulld m6, m14, [cq+32*13] + pmulld m7, m14, [cq+32*15] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + lea r6, [rsp+32*4] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + psrld m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct_16x8_internal).main + vpbroadcastd m10, [pw_2048] +.end: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.transpose: + lea rax, [deint_shuf+128] +.transpose2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 +.transpose3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m4, m6 + punpckldq m4, m6 + punpckldq m6, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpcklqdq m5, m6, m3 + punpckhqdq m6, m3 + punpckhqdq m3, m2, m7 + punpcklqdq m2, m7 + punpcklqdq m7, m8, m1 + punpckhqdq m8, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m5, 0x31 + vinserti128 m0, xm5, 1 + vperm2i128 m5, m1, m6, 0x31 + vinserti128 m1, xm6, 1 + vperm2i128 m6, m2, m7, 0x31 + vinserti128 m2, xm7, 1 + vperm2i128 m7, m3, m8, 0x31 + vinserti128 m3, xm8, 1 + ret +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_max] + lea r3, [strideq*3] + pxor m8, m8 +.write_16x4_zero: + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 
4, 5, 6, 7 + add cq, 32*8 +.write_16x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + lea r6, [rsp+32*4] + call .main + vpbroadcastd m14, [pd_6144] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_6143 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp tx2q +.pass2: + call m(idct_16x8_internal_16bpc).transpose + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end + vpbroadcastd m10, [pw_2048] + pxor m11, m11 + psubw m11, m10 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + pmulhrsw m2, m10 + pmulhrsw m3, m11 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m11 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m11 + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET +ALIGN function_align +.main: + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 2] + pmulld m1, m15, [cq+32*13] + pmulld m2, m15, [cq+32* 6] + pmulld m3, m15, [cq+32* 9] + pmulld m4, m15, [cq+32*10] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32*14] + pmulld m7, m15, [cq+32* 1] + vpbroadcastd m12, [pd_2048] + vpbroadcastd m13, [clip_min] + vpbroadcastd m14, [clip_max] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + call .main_part1 + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32*15] + pmulld m2, m15, [cq+32* 4] + pmulld m3, m15, [cq+32*11] + pmulld m4, m15, [cq+32* 8] + pmulld m5, m15, [cq+32* 7] + pmulld m6, m15, [cq+32*12] + pmulld m7, m15, [cq+32* 3] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_part2: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 + psubd m8, m0, m4 ; t8a + paddd m0, m4 ; t0a + psubd m4, m1, m5 ; t9a + paddd m1, m5 ; t1a + psubd m5, m2, m6 ; t12a + paddd m2, m6 ; t4a + psubd m6, m3, m7 ; t13a + paddd m7, m3 ; t5a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 + psubd m3, m0, m2 ; t4 + paddd m0, m2 ; t0 + psubd m2, m1, m7 ; t5 + paddd m1, m7 ; t1 + psubd m7, m4, m6 ; t12a + paddd m4, m6 ; t8a + psubd m6, m8, m5 ; t13a + paddd m5, m8 ; t9a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 + vpbroadcastd m11, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11 + ITX_MULSUB_2D 7, 6, 8, 9, _, 
12, 10, 11 + pminsd m10, m14, [r6-32*4] ; t2 + pminsd m8, m14, [r6-32*3] ; t3 + psubd m9, m0, m10 ; t2a + paddd m0, m10 ; out0 + psubd m10, m1, m8 ; t3a + paddd m1, m8 ; -out15 + pmaxsd m9, m13 + pmaxsd m10, m13 + pminsd m9, m14 + pminsd m10, m14 + pmulld m9, m15 + pmulld m10, m15 + mova [r6-32*4], m1 + mova m11, [r6-32*1] ; t7a + mova m1, [r6-32*2] ; t6a + psubd m8, m3, m11 ; t7 + paddd m11, m3 ; out12 + paddd m3, m2, m1 ; -out3 + psubd m2, m1 ; t6 + pmaxsd m8, m13 + pmaxsd m2, m13 + pminsd m8, m14 + pminsd m2, m14 + pmulld m8, m15 + mova [r6-32*1], m11 + mova [r6-32*3], m2 + mova m1, [r6+32*3] ; t15 + mova m2, [r6+32*2] ; t14 + paddd m12, m7, m1 ; -out13 + psubd m7, m1 ; t15a + psubd m11, m6, m2 ; t14a + paddd m2, m6 ; out2 + pmaxsd m7, m13 + pmaxsd m11, m13 + pminsd m7, m14 + pminsd m11, m14 + pmulld m7, m15 + pmulld m11, m15 + mova [r6-32*2], m12 + pminsd m1, m14, [r6+32*0] ; t10a + pminsd m12, m14, [r6+32*1] ; t11a + psubd m6, m4, m1 ; t10 + paddd m1, m4 ; -out1 + psubd m4, m5, m12 ; t11 + paddd m5, m12 ; out14 + pmulld m12, m15, [r6-32*3] ; t6 + pmaxsd m6, m13 + pmaxsd m4, m13 + pminsd m6, m14 + pminsd m4, m14 + pmulld m6, m15 + pmulld m4, m15 + mova [r6-32*3], m5 + paddd m5, m11, m7 ; -out5 (unshifted) + psubd m11, m7 ; out10 (unshifted) + paddd m7, m9, m10 ; -out7 (unshifted) + psubd m9, m10 ; out8 (unshifted) + psubd m10, m6, m4 ; -out9 (unshifted) + paddd m6, m4 ; out6 (unshifted) + paddd m4, m12, m8 ; out4 (unshifted) + psubd m12, m8 ; -out11 (unshifted) + ret +.main_part1: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 + psubd m8, m0, m4 ; t10a + paddd m0, m4 ; t2a + psubd m4, m1, m5 ; t11a + paddd m1, m5 ; t3a + psubd m5, m2, m6 ; t14a + paddd m2, m6 ; t6a + psubd m6, m3, m7 ; t15a + paddd m7, m3 ; t7a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 + psubd m3, m0, m2 ; t6 + paddd m0, m2 ; t2 + psubd m2, m1, m7 ; t7 + paddd m1, m7 ; t3 + psubd m7, m4, m6 ; t14a + paddd m4, m6 ; t10a + psubd m6, m8, m5 ; t15a + paddd m5, m8 ; t11a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later + vpbroadcastd m11, [pd_1567] + vpbroadcastd m10, [pd_3784] + ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + lea r6, [rsp+32*4] + call m(iadst_16x8_internal_16bpc).main + vpbroadcastd m14, [pd_6144] + psrld m15, 11 + psubd m13, m14, m15 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x8_internal_16bpc).pass1_end +.pass2: + call 
m(idct_16x8_internal_16bpc).transpose + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end + vpbroadcastd m10, [pw_2048] + pxor m11, m11 + psubw m11, m10 + mova m12, m0 + pmulhrsw m0, m7, m11 + mova m7, m1 + pmulhrsw m1, m6, m10 + mova m6, m2 + pmulhrsw m2, m5, m11 + mova m5, m3 + pmulhrsw m3, m4, m10 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m5, m11 + pmulhrsw m1, m6, m10 + pmulhrsw m2, m7, m11 + pmulhrsw m3, m12, m10 + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32* 1] + pmulld m2, m15, [cq+32* 2] + pmulld m3, m15, [cq+32* 3] + pmulld m4, m15, [cq+32* 4] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32* 6] + pmulld m7, m15, [cq+32* 7] + pmulld m8, m15, [cq+32* 8] + pmulld m9, m15, [cq+32* 9] + pmulld m10, m15, [cq+32*10] + pmulld m11, m15, [cq+32*11] + pmulld m12, m15, [cq+32*12] + pmulld m13, m15, [cq+32*13] + pmulld m14, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [rsp], m7 + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + mova [rsp], m15 + vpbroadcastd m15, [pd_11586] + REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pmulld m15, [rsp] + mova [rsp], m7 + vpbroadcastd m7, [pd_6144] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x8_internal_16bpc).transpose + vpbroadcastd m10, [pw_4096] + jmp m(idct_16x8_internal_16bpc).end + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 10240 + sar r6d, 14 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + mova m10, [r6-32*4] + mova m9, [r6-32*3] + mova m8, [r6-32*2] + psubd m15, m0, m10 ; out15 + paddd m0, m10 ; out0 + psubd m10, m1, m9 ; out14 + paddd m1, m9 ; out1 + psubd m9, m2, m8 ; out13 + paddd m2, m8 ; out2 + REPX {psrad x, 2}, m0, m1, m2 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova m2, [r6-32*1] + mova m1, [r6+32*0] + mova m0, [r6+32*1] + REPX {psrad x, 2}, m9, m10, m15 + psubd m8, m3, m2 ; out12 + paddd m3, m2 ; out3 + psubd m2, m4, m1 ; out11 + paddd m4, m1 ; out4 + psubd m1, m5, m0 ; out10 + paddd m5, m0 ; out5 + REPX {psrad x, 2}, m3, m4, m5 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova m4, [r6+32*2] + mova m3, [r6+32*3] + REPX {psrad x, 2}, m1, m2, m8 + psubd m5, m6, m4 ; out9 + paddd m6, m4 ; out6 + psubd m4, m7, m3 ; out8 + paddd m7, m3 ; 
out7 + REPX {psrad x, 2}, m6, m7, m4, m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + mova [r6-32*4], m4 + mova [r6-32*3], m5 + mova [r6-32*2], m1 + mova [r6-32*1], m2 + mova [r6+32*0], m8 + mova [r6+32*1], m9 + mova [r6+32*2], m10 + mova [r6+32*3], m15 +.fast: + add r6, 32*8 + call .main + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + sub r6, 32*8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + lea rax, [pw_5+128] + mova [rsp], m15 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] +.end: + call .write_16x16 + RET +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_16bpc).write_16x4_zero +ALIGN function_align +.transpose: + test eobd, eobd + jl .transpose_fast + packssdw m8, [r6-32*4] + packssdw m9, [r6-32*3] + packssdw m10, [r6-32*2] + packssdw m11, [r6-32*1] + packssdw m12, [r6+32*0] + packssdw m13, [r6+32*1] + packssdw m14, [r6+32*2] + packssdw m15, [r6+32*3] + sub r6, 32*8 + packssdw m0, [r6-32*4] + packssdw m1, [r6-32*3] + packssdw m2, [r6-32*2] + packssdw m3, [r6-32*1] + packssdw m4, [r6+32*0] + packssdw m5, [r6+32*1] + packssdw m6, [r6+32*2] + packssdw m7, [r6+32*3] + mova [r6], m8 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m6, m7 + punpcklwd m6, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m7, m6 + punpckldq m7, m6 + punpckhdq m6, m4, m3 + punpckldq m4, m3 + punpckhqdq m3, m2, m1 + punpcklqdq m2, m1 + punpckhqdq m1, m0, m7 + punpcklqdq m0, m7 + punpcklqdq m7, m8, m6 + punpckhqdq m8, m6 + punpckhqdq m6, m5, m4 + punpcklqdq m5, m4 + mova m4, [r6] + mova [r6], m8 + punpcklwd m8, m4, m9 + punpckhwd m4, m9 + punpcklwd m9, m10, m11 + punpckhwd m10, m11 + punpckhwd m11, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m12, m13 + punpcklwd m12, m13 + punpckldq m13, m4, m10 + punpckhdq m4, m10 + punpckhdq m10, m8, m9 + punpckldq m8, m9 + punpckhdq m9, m12, m14 + punpckldq m12, m14 + punpckhdq m14, m15, m11 + punpckldq m15, m11 + punpckhqdq m11, m10, m9 + punpcklqdq m10, m9 + punpckhqdq m9, m8, m12 + punpcklqdq m8, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m4, m14 + punpcklqdq m14, m4, m14 + vperm2i128 m4, m0, 
m8, 0x31 + vinserti128 m0, xm8, 1 + vinserti128 m8, m5, xm12, 1 + vperm2i128 m12, m5, 0x13 + vperm2i128 m5, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vinserti128 m9, m6, xm13, 1 + vperm2i128 m13, m6, 0x13 + vperm2i128 m6, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vinserti128 m10, m7, xm14, 1 + vperm2i128 m14, m7, 0x13 + vperm2i128 m7, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + mova xm11, [r6] + vinserti128 m11, xm15, 1 + vinserti128 m15, [r6+16], 0 + ret +.transpose_fast: + call m(idct_16x8_internal_16bpc).transpose2 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + ret +ALIGN function_align +.main: + mova m0, [cq+64* 1] + mova m1, [cq+64* 3] + mova m2, [cq+64* 5] + mova m3, [cq+64* 7] + mova m4, [cq+64* 9] + mova m5, [cq+64*11] + mova m6, [cq+64*13] + mova m7, [cq+64*15] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + psrld m10, m11, 10 ; pd_2 + REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_min] + vpbroadcastd m14, [clip_max] + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + vpbroadcastd m8, [pd_10240] + paddd m4, m8 + paddd m6, m8 + paddd m9, m8 + paddd m11, m8 + vpbroadcastd m8, [pd_10239] + psubd m5, m8, m5 + psubd m7, m8, m7 + psubd m10, m8, m10 + psubd m12, m8, m12 + REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + psrld m4, m15, 10 ; pd_2 + paddd m0, m4 + psubd m1, m4, m1 + paddd m2, m4 + psubd m3, m4, m3 + psubd m7, m4, [r6-32*4] + paddd m6, m4, [r6-32*3] + psubd m5, m4, [r6-32*2] + paddd m4, [r6-32*1] + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + add r6, 32*8 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova [r6-32*2], m11 + mova [r6-32*1], m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 +.fast: + add r6, 32*8 + call .main + vpbroadcastd m14, [pd_10240] + vpbroadcastd m13, [pd_10239] + psrld m15, 10 ; pd_2 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 + sub r6, 32*8 + jmp tx2q +.pass2: + call m(idct_16x16_internal_16bpc).transpose + lea rax, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal).main + call m(iadst_16x16_internal).main_pass2_end + mova [rsp+32*0], m8 + mova [rsp+32*2], m12 + mova [rsp+32*3], m13 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m12 + pmulhrsw m1, m13, [rsp+32*1] + mova [rsp+32*1], m9 + pmulhrsw m2, m12 + pmulhrsw m3, m13 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, 
m12, m6 + pmulhrsw m3, m13, m7 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*0] + pmulhrsw m1, m13, [rsp+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m13, m11 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*2] + pmulhrsw m1, m13, [rsp+32*3] + pmulhrsw m2, m12, m14 + pmulhrsw m3, m13, m15 + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET +ALIGN function_align +.main: + mova m0, [cq+64* 2] + mova m1, [cq+64*13] + mova m2, [cq+64* 6] + mova m3, [cq+64* 9] + mova m4, [cq+64*10] + mova m5, [cq+64* 5] + mova m6, [cq+64*14] + mova m7, [cq+64* 1] + vpbroadcastd m12, [pd_2048] + call m(iadst_16x8_internal_16bpc).main_part1 + mova m0, [cq+64* 0] + mova m1, [cq+64*15] + mova m2, [cq+64* 4] + mova m3, [cq+64*11] + mova m4, [cq+64* 8] + mova m5, [cq+64* 7] + mova m6, [cq+64*12] + mova m7, [cq+64* 3] + jmp m(iadst_16x8_internal_16bpc).main_part2 + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_min] + vpbroadcastd m14, [clip_max] + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call m(iadst_16x16_internal_16bpc).main + sub cq, 32 + vpbroadcastd m8, [pd_10240] + paddd m11, m8 + paddd m9, m8 + paddd m6, m8 + paddd m4, m8 + vpbroadcastd m8, [pd_10239] + psubd m12, m8, m12 + psubd m10, m8, m10 + psubd m7, m8, m7 + psubd m5, m8, m5 + REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4 + mova [r6+32*0], m12 + mova [r6+32*1], m11 + mova [r6+32*2], m10 + mova [r6+32*3], m9 + psrld m9, m15, 10 ; pd_2 + psubd m3, m9, m3 + paddd m2, m9 + psubd m1, m9, m1 + paddd m0, m9 + psubd m12, m9, [r6-32*4] + paddd m11, m9, [r6-32*3] + psubd m10, m9, [r6-32*2] + paddd m9, [r6-32*1] + REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 + mova [r6-32*4], m12 + mova [r6-32*3], m11 + mova [r6-32*2], m10 + mova [r6-32*1], m9 + add r6, 32*8 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 +.fast: + add r6, 32*8 + call m(iadst_16x16_internal_16bpc).main + vpbroadcastd m14, [pd_10240] + vpbroadcastd m13, [pd_10239] + psrld m15, 10 ; pd_2 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x16_internal_16bpc).pass1_end +.pass2: + call m(idct_16x16_internal_16bpc).transpose + lea rax, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal).main + call m(iadst_16x16_internal).main_pass2_end + mova [rsp+32*3], m3 + mova [rsp+32*2], m2 + mova [rsp+32*0], m0 + mova m2, m13 + mova m3, m12 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m13, m15 + pmulhrsw m1, m12, m14 + pmulhrsw m2, m13 + pmulhrsw m3, m12 + mova m14, m8 + mova m15, m9 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m13, m11 + pmulhrsw m1, m12, m10 + pmulhrsw m2, m13, m15 + pmulhrsw m3, m12, m14 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m13, m7 + pmulhrsw m1, m12, m6 + pmulhrsw m2, m13, m5 + pmulhrsw m3, m12, m4 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m13, 
[rsp+32*3] + pmulhrsw m1, m12, [rsp+32*2] + pmulhrsw m2, m13, [rsp+32*1] + pmulhrsw m3, m12, [rsp+32*0] + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_11586] + vpbroadcastd m7, [pd_10240] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + pmulld m0, m15, [cq+r3+32*33] + pmulld m1, m15, [cq+r3+32*35] + pmulld m2, m15, [cq+r3+32*37] + pmulld m3, m15, [cq+r3+32*39] + add r6, 32*4 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 14}, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + pmulld m0, m15, [cq+64* 0] + pmulld m1, m15, [cq+64* 1] + pmulld m2, m15, [cq+64* 2] + pmulld m3, m15, [cq+64* 3] + pmulld m4, m15, [cq+64* 4] + pmulld m5, m15, [cq+64* 5] + pmulld m6, m15, [cq+64* 6] + pmulld m8, m15, [cq+64* 7] + mova [cq], m8 + pmulld m8, m15, [cq+64* 8] + pmulld m9, m15, [cq+64* 9] + pmulld m10, m15, [cq+64*10] + pmulld m11, m15, [cq+64*11] + pmulld m12, m15, [cq+64*12] + pmulld m13, m15, [cq+64*13] + pmulld m14, m15, [cq+64*14] + pmulld m15, [cq+64*15] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x16_internal_16bpc).transpose + + mova [cq+32*0], m15 + mova [cq+32*1], m0 + vpbroadcastd m15, [pw_1697x16] + + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [cq+32*1] + mova [cq+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [cq+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + mova m1, [cq+32*1] + jmp m(idct_16x16_internal_16bpc).end + +%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift + mova m%4, [r6+32*(%1-4)] + mova m%2, [r5+32*(3-%1)] + mova m%5, [r4+32*(%1-4)] + psubd m%3, m%1, m%4 ; idct16 out15 - n + paddd m%1, m%4 ; idct16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 + paddd m%1, m11 + paddd m%3, m11 + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 + packssdw m%1, m%3 ; out0 + n, out16 + n + packssdw m%2, m%4 ; out15 - n, out31 - n +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vbroadcasti128 m14, [idct32_shuf] + mov r4, cq + call .pass1_main + mova [rsp+32*0], m2 + mova [rsp+32*1], m3 + cmp eobd, 43 + jge .eob43 + pxor m4, m4 + REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 + jmp .pass1_end_fast +.eob43: + lea r6, [rsp+32*8] + mova [r6-32*4], m0 + mova [r6-32*3], m1 + call .pass1_main + mova [rsp+32*2], m2 + cmp eobd, 107 + jge .eob107 + mova m11, m3 + mova m2, m0 + mova m3, m1 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + pxor m4, m4 +.pass1_end_fast: + vpbroadcastd m10, [pw_2048] + lea rax, [deint_shuf+128] + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32).main_fast + jmp .end +.eob107: + mova [rsp+32*3], m3 + mova [r6-32*2], m0 + mova [r6-32*1], m1 + call .pass1_main + cmp eobd, 171 + jge .eob171 + pshufd m12, m2, q1032 + pshufd m13, m3, q1032 + mova m4, m0 + 
mova m5, m1 + pxor m6, m6 + REPX {mova x, m6}, m7, m14, m15 + jmp .pass1_end +.eob171: + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + call .pass1_main + pshufd m12, [r6+32*2], q1032 ; out19 out17 + pshufd m13, [r6+32*3], q1032 ; out23 out21 + mova m4, [r6+32*0] ; out16 out18 + mova m5, [r6+32*1] ; out20 out22 + pshufd m14, m2, q1032 ; out27 out25 + pshufd m15, m3, q1032 ; out31 out29 + mova m6, m0 ; out24 out26 + mova m7, m1 ; out28 out30 +.pass1_end: + mova m0, [r6-32*4] ; out0 out2 + mova m1, [r6-32*3] ; out4 out6 + mova m2, [r6-32*2] ; out8 out10 + mova m3, [r6-32*1] ; out12 out14 + lea rax, [deint_shuf+128] + mova m11, [rsp+32*3] ; out13 out15 + vpbroadcastd m10, [pw_2048] + call m(inv_txfm_add_dct_dct_8x32).main +.end: ; [rsp+0*32] = m12 + vpbroadcastd m12, [pw_2048] + mov cq, r4 + mova [rsp+32*1], m8 + mova [rsp+32*2], m9 + mova [rsp+32*3], m10 + mova [rsp+32*4], m11 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + vpermq m0, m2, q3120 + vpermq m1, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, [rsp+32*1], q3120 + vpermq m1, [rsp+32*2], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, [rsp+32*3], q3120 + vpermq m1, [rsp+32*4], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, [rsp+32*0], q3120 + vpermq m1, m13, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, m14, q3120 + vpermq m1, m15, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + add r6d, 10240 + sar r6d, 14 + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly2 +ALIGN function_align +.pass1_main: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + add cq, 32 + call m(idct_8x8_internal_16bpc).main + psrld m1, m11, 10 ; pd_2 + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + pshufb m0, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m6, m14 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + vperm2i128 m1, m0, m2, 0x31 ; 4 6 + vinserti128 m0, xm2, 1 ; 0 2 + vinserti128 m2, m3, xm4, 1 ; 1 3 + vperm2i128 m3, m4, 0x31 ; 5 7 + ret +.main_oddhalf_part1_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part1_fast: ; lower half zero + vpbroadcastd m7, [pd_4091] + vpbroadcastd m8, [pd_201] + vpbroadcastd m6, [pd_m1380] + vpbroadcastd m9, [pd_3857] + vpbroadcastd m5, [pd_3703] + vpbroadcastd m10, [pd_1751] + vpbroadcastd m4, [pd_m2751] + vpbroadcastd m15, [pd_3035] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, 
m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 + mova [r6-32*4], m0 + mova [r6-32*3], m5 + mova [r6-32*2], m4 + mova [r6-32*1], m6 + mova [r6+32*0], m3 + mova [r6+32*1], m1 + mova [r6+32*2], m8 + mova [r6+32*3], m7 + ret +.main_oddhalf_part2_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part2_fast: ; lower half zero + vpbroadcastd m7, [pd_m601] + vpbroadcastd m8, [pd_4052] + vpbroadcastd m6, [pd_3973] + vpbroadcastd m9, [pd_995] + vpbroadcastd m5, [pd_m2106] + vpbroadcastd m10, [pd_3513] + vpbroadcastd m4, [pd_3290] + vpbroadcastd m15, [pd_2440] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX 
{pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 + mova m9, [r6-32*4] ; t16a + mova m10, [r6-32*3] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova m9, [r6-32*2] ; t18a + mova m10, [r6-32*1] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r6-32*2], m9 + mova [r6-32*1], m10 + mova m9, [r6+32*0] ; t28 + mova m10, [r6+32*1] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r6+32*0], m4 + mova [r6+32*1], m1 + mova m4, [r6+32*2] ; t30 + mova m1, [r6+32*3] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r6+32*2], m0 + mova [r6+32*3], m7 + mov r4, r6 + add r6, 32*8 + mova [r6-32*4], m2 + mova [r6-32*3], m5 + mova [r6-32*2], m3 + mova [r6-32*1], m6 + mova [r6+32*0], m9 + mova [r6+32*1], m10 + mova [r6+32*2], m4 + mova [r6+32*3], m1 + mov r5, r6 + add r6, 32*8 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2 + IDCT32_END 1, 14, 8, 9, 10, 2 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 2 + IDCT32_END 3, 14, 8, 9, 10, 2 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 2 + IDCT32_END 5, 14, 8, 9, 10, 2 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 2 + IDCT32_END 7, 14, 8, 9, 10, 2 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 +.transpose: + punpckhdq m15, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m4, m6 + punpckldq m4, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpcklqdq m5, m2, m15 + punpckhqdq m2, m15 + punpckhqdq m15, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m6, m1 + punpcklqdq m6, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m7, 0x31 + vinserti128 m0, xm7, 1 + vperm2i128 m7, m3, m2, 0x31 + vinserti128 m3, xm2, 1 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m15, 0x31 + vinserti128 m1, xm15, 1 + ret + 
+cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m5, [pw_5] + vpbroadcastd m7, [pixel_max] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 + call .main_zero + add cq, 32 + lea dstq, [dstq+strideq*8] + sub eobd, 64 + jge .loop + RET +ALIGN function_align +.main_zero: + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + paddw m0, m4 + mova xm4, [dstq+strideq*1] + vinserti128 m4, [dstq+r5 ], 1 + paddw m1, m4 + mova xm4, [dstq+strideq*2] + vinserti128 m4, [dstq+r6*2 ], 1 + paddw m2, m4 + mova xm4, [dstq+r6 ] + vinserti128 m4, [dstq+r4 ], 1 + paddw m3, m4 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*4], m0, 1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+r5 ], m1, 1 + mova [dstq+strideq*2], xm2 + vextracti128 [dstq+r6*2 ], m2, 1 + mova [dstq+r6 ], xm3 + vextracti128 [dstq+r4 ], m3, 1 + ret + +cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 +.dconly: + add r6d, 10240 + sar r6d, 14 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 + vpbroadcastd m4, [pixel_max] + pxor m3, m3 +.dconly_loop: + paddw m1, m0, [dstq+32*0] + paddw m2, m0, [dstq+32*1] + pmaxsw m1, m3 + pmaxsw m2, m3 + pminsw m1, m4 + pminsw m2, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + mova m0, [cq+32* 1] + mova m1, [cq+32* 7] + mova m2, [cq+32* 9] + mova m3, [cq+32*15] + mova m4, [cq+32*17] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32*31] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1 + mova m0, [cq+32* 3] + mova m1, [cq+32* 5] + mova m2, [cq+32*11] + mova m3, [cq+32*13] + mova m4, [cq+32*19] + mova m5, [cq+32*21] + mova m6, [cq+32*27] + mova m7, [cq+32*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2 + mova m0, [cq+32* 2] + mova m1, [cq+32* 6] + mova m2, [cq+32*10] + mova m3, [cq+32*14] + mova m4, [cq+32*18] + mova m5, [cq+32*22] + mova m6, [cq+32*26] + mova m7, [cq+32*30] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+32* 0] + mova m1, [cq+32* 4] + mova m2, [cq+32* 8] + mova m3, [cq+32*12] + mova m4, [cq+32*16] + mova m5, [cq+32*20] + mova m6, [cq+32*24] + mova m7, [cq+32*28] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end + lea rax, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + 
mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + lea dstq, [r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_16bpc).write_16x4_zero + +cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m5, [pw_4096] + vpbroadcastd m7, [pixel_max] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 + add cq, 32*8 + mova m2, [cq-32*4] + packssdw m2, [cq-32*3] + mova m3, [cq-32*2] + packssdw m3, [cq-32*1] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 + call m(inv_txfm_add_identity_identity_8x32_16bpc).main + add dstq, 16 + sub eobd, 64 + jge .loop + RET + +%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 +%if %1 == 0 + pxor m6, m6 +%endif + pmulhrsw m%3, m15 + pmulhrsw m%1, m15 + paddw m%3, [dstq+%5] + paddw m%1, [r2+%6] + pmaxsw m%3, m6 + pmaxsw m%1, m6 + pminsw m%3, m7 + pminsw m%1, m7 + mova [dstq+%5], m%3 + mova [r2+%6], m%1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*16] + lea r4, [r6+32*8] + lea r5, [r6+32*16] + call .main + sub eobd, 44 + jge .eob44 + vperm2i128 m2, m0, m3, 0x31 ; 5 + vinserti128 m0, xm3, 1 ; 1 + vperm2i128 m3, m1, m4, 0x31 ; 7 + vinserti128 m1, xm4, 1 ; 3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 + jmp .fast +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly +.eob44: + mova [r4+16*0], xm0 + mova [r4+16*1], xm3 + mova [r4+16*2], xm1 + mova [r4+16*3], xm4 + vextracti128 [r4+16*4], m0, 1 + vextracti128 [r4+16*5], m3, 1 + vextracti128 [r4+16*6], m1, 1 + vextracti128 [r4+16*7], m4, 1 + call .main + sub eobd, 107 + jge .eob151 + vperm2i128 m7, m1, m4, 0x31 ; 15 + vinserti128 m5, m1, xm4, 1 ; 11 + vperm2i128 m6, m0, m3, 0x31 ; 13 + vinserti128 m4, m0, xm3, 1 ; 9 + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] +.fast: + lea rax, [pw_5+128] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.eob151: + mova [r4-16*8], xm0 + mova [r4-16*7], xm3 + mova [r4-16*6], xm1 + mova [r4-16*5], xm4 + vextracti128 [r4-16*4], m0, 1 + vextracti128 [r4-16*3], m3, 1 + vextracti128 [r4-16*2], m1, 1 + vextracti128 [r4-16*1], m4, 1 + call .main + sub eobd, 128 + jge .eob279 + vperm2i128 m10, m0, m3, 0x31 ; 21 + vinserti128 m8, m0, xm3, 1 ; 17 + vperm2i128 m11, m1, m4, 0x31 ; 23 + vinserti128 m9, m1, xm4, 1 ; 19 + 
pxor m12, m12 + REPX {mova x, m12}, m13, m14, m15 + REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 + jmp .full +.eob279: + mova [r5+16*0], xm0 + mova [r5+16*1], xm3 + mova [r5+16*2], xm1 + mova [r5+16*3], xm4 + vextracti128 [r5+16*4], m0, 1 + vextracti128 [r5+16*5], m3, 1 + vextracti128 [r5+16*6], m1, 1 + vextracti128 [r5+16*7], m4, 1 + call .main + vperm2i128 m14, m0, m3, 0x31 ; 29 + vinserti128 m12, m0, xm3, 1 ; 25 + vperm2i128 m15, m1, m4, 0x31 ; 31 + vinserti128 m13, m1, xm4, 1 ; 27 + mova m8, [r5+32*0] + mova m9, [r5+32*1] + mova m10, [r5+32*2] + mova m11, [r5+32*3] +.full: + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] + mova m4, [r4-32*4] + mova m5, [r4-32*3] + mova m6, [r4-32*2] + mova m7, [r4-32*1] + lea rax, [pw_5 + 128] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + lea r3, [rsp+32*8] + mova m8, [r3+32*0] + mova m9, [r3+32*1] + mova m10, [r3+32*2] + mova m11, [r3+32*3] + mova m12, [r3-32*4] + mova m13, [r3-32*3] + mova m14, [r3-32*2] + mova m15, [r3-32*1] +.idct16: + lea r3, [rsp+32*16] + mova m0, [r3+32*0] + mova m1, [r3+32*1] + mova m2, [r3+32*2] + mova m3, [r3+32*3] + mova m4, [r3-32*4] + mova m5, [r3-32*3] + mova m6, [r3-32*2] + mova m7, [r3-32*1] + mova [rsp], m15 + call m(idct_16x16_internal).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main: + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 3] + pmulld m2, m14, [cq+128* 5] + pmulld m3, m14, [cq+128* 7] + pmulld m4, m14, [cq+128* 9] + pmulld m5, m14, [cq+128*11] + pmulld m6, m14, [cq+128*13] + pmulld m7, m14, [cq+128*15] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 2] + pmulld m2, m14, [cq+128* 4] + pmulld m3, m14, [cq+128* 6] + pmulld m4, m14, [cq+128* 8] + pmulld m5, m14, [cq+128*10] + pmulld m6, m14, [cq+128*12] + pmulld m7, m14, [cq+128*14] + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + psrld m15, m11, 11 ; pd_1 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*2] + paddd m15, m1, m9 ; out1 + psubd m1, m9 ; out14 + mova m9, [r6-32*1] + REPX {psrad x, 1}, m0, m15, m10, m1 + packssdw m0, m15 + packssdw m1, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6+32*0] + paddd m15, m3, m9 ; out3 + psubd m3, m9 ; out12 + mova m9, [r6+32*1] + REPX {psrad x, 1}, m2, m15, m10, m3 + packssdw m2, m15 + packssdw m3, m10 + psubd m10, m4, m8 ; out11 + paddd m4, m8 ; out4 + mova m8, [r6+32*2] + paddd m15, m5, m9 ; out5 + psubd m5, m9 ; out10 + mova m9, [r6+32*3] + REPX {psrad x, 1}, m4, m10, m15, m5 + packssdw m4, m15 + packssdw m5, m10 + psubd m10, m6, m8 ; out9 + paddd m6, m8 ; out6 + paddd m15, m7, m9 ; out7 + psubd m7, m9 ; out8 + REPX {psrad x, 1}, m6, m10, m15, m7 + packssdw m6, m15 + packssdw m7, m10 + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m4, m6 + punpcklwd m4, m6 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 + pxor m5, m5 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m5 + mova [cq+r7+128*0], m5 + mova [cq+r7+128*1], m5 + mova [cq+r7+128*2], m5 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + punpcklwd m5, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m1 + punpckhwd m4, m1 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + punpcklqdq m7, m1, m4 + punpckhqdq m1, m4 + 
punpckhqdq m4, m8, m3 + punpcklqdq m8, m3 + punpckhqdq m3, m6, m5 + punpcklqdq m6, m5 + punpcklqdq m5, m0, m2 + punpckhqdq m0, m2 + mova [r6+16*0], xm5 + mova [r6+16*1], xm6 + mova [r6+16*2], xm7 + mova [r6+16*3], xm8 + vextracti128 [r6+16*4], m5, 1 + vextracti128 [r6+16*5], m6, 1 + vextracti128 [r6+16*6], m7, 1 + vextracti128 [r6+16*7], m8, 1 + sub r6, 32*4 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*2], m7 + mova [rsp+gprsize+32*3], m15 + vpbroadcastd m15, [pw_2048] + vpbroadcastd m7, [pixel_max] + IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 + IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 + IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*0] + IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*2] + mova m2, [rsp+gprsize+32*3] + IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 + ret + +cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m8, [pw_2896x8] + vpbroadcastd m9, [pw_1697x16] + vpbroadcastd m11, [pw_8192] + vpbroadcastd m7, [pixel_max] + lea r6, [strideq*5] + pxor m6, m6 + paddw m10, m11, m11 ; pw_16384 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main2: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpcklwd m4, m2, m1 + punpckhwd m2, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + call m(iidentity_8x8_internal_16bpc).write_2x8x2 + punpcklqdq m0, m3, m2 + punpckhqdq m1, m3, m2 + jmp m(iidentity_8x8_internal_16bpc).write_2x8x2 + +cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob + %undef cmp + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, 
[clip_max] + lea r6, [rsp+32*4] + call .main + cmp eobd, 36 + jge .full + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + lea rax, [pw_5+128] + mov r7, dstq + call m(idct_16x16_internal).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + jmp .end +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 +.full: + add cq, 32 + mova [r4+32*3], m0 + mova [r4+32*2], m1 + mova [r4+32*1], m2 + mova [r4+32*0], m3 + mova [r4-32*1], m4 + mova [r4-32*2], m5 + mova [r4-32*3], m6 + mova [r4-32*4], m7 + call .main + sub r4, 32*16 ; topleft 16x8 + call .transpose_16x16 + lea rax, [pw_5+128] + mov r7, dstq + call m(idct_16x16_internal).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + add r4, 32*8 ; bottomleft 16x8 + call .transpose_16x16 +.end: + lea dstq, [r7+32] + call m(idct_16x16_internal).main + call .write_16x16 + RET +ALIGN function_align +.transpose_16x16: + punpckhdq m8, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckhqdq m6, m0, m4 + punpcklqdq m0, m4 + punpckhqdq m4, m1, m5 + punpcklqdq m1, m5 + punpckhqdq m5, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + vinserti128 m8, m0, xm7, 1 + vperm2i128 m12, m0, m7, 0x31 + vinserti128 m9, m6, xm5, 1 + vperm2i128 m13, m6, m5, 0x31 + vinserti128 m10, m1, xm2, 1 + vperm2i128 m14, m1, m2, 0x31 + vinserti128 m11, m4, xm3, 1 + vperm2i128 m15, m4, m3, 0x31 + mova m0, [r4+32*3] + mova m1, [r4+32*2] + mova m2, [r4+32*1] + mova m3, [r4+32*0] + mova m4, [r4-32*1] + mova m5, [r4-32*2] + mova m6, [r4-32*3] + mova m7, [r4-32*4] + mova [rsp+gprsize], m15 + jmp m(inv_txfm_add_dct_dct_8x32_16bpc).transpose +ALIGN function_align +.main: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + pmulld m0, m14, [cq+64* 1] + pmulld m1, m14, [cq+64* 7] + pmulld m2, m14, [cq+64* 9] + pmulld m3, m14, [cq+64*15] + pmulld m4, m14, [cq+64*17] + pmulld m5, m14, [cq+64*23] + pmulld m6, m14, [cq+64*25] + pmulld m7, m14, [cq+64*31] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+64* 3] + pmulld m1, m14, [cq+64* 5] + pmulld m2, m14, [cq+64*11] + pmulld m3, m14, [cq+64*13] + pmulld m4, m14, [cq+64*19] + pmulld m5, m14, [cq+64*21] + pmulld m6, m14, [cq+64*27] + pmulld m7, m14, [cq+64*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+64* 2] + pmulld m1, m14, [cq+64* 6] + pmulld m2, m14, [cq+64*10] + pmulld m3, m14, [cq+64*14] + pmulld m4, m14, [cq+64*18] + pmulld m5, m14, [cq+64*22] + pmulld m6, m14, [cq+64*26] + pmulld m7, m14, [cq+64*30] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+64* 0] + pmulld m1, m14, [cq+64* 4] + pmulld m2, m14, [cq+64* 8] + pmulld m3, m14, [cq+64*12] + pmulld m4, m14, [cq+64*16] + pmulld m5, m14, [cq+64*20] + pmulld m6, m14, [cq+64*24] + pmulld m7, m14, [cq+64*28] + call 
m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + pxor m8, m8 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m8 + mova [cq+r7-64*1], m8 + mova [cq+r7+64*0], m8 + mova [cq+r7+64*1], m8 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m11, 11 ; pd_1 + IDCT32_END 0, 15, 8, 9, 10, 1 + IDCT32_END 1, 14, 8, 9, 10, 1 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 1 + IDCT32_END 3, 14, 8, 9, 10, 1 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 1 + IDCT32_END 5, 14, 8, 9, 10, 1 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 1 + IDCT32_END 7, 14, 8, 9, 10, 1 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 + ret +ALIGN function_align +.write_16x16: + mova m1, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + vpbroadcastd m9, [pixel_max] + lea r3, [strideq*3] + pxor m8, m8 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_16bpc).write_16x4 + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_16bpc).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_16bpc).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_16bpc).write_16x4 + +cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m8, [pw_2896x8] + vpbroadcastd m9, [pw_1697x16] + vpbroadcastd m10, [pw_2048] + vpbroadcastd m7, [pixel_max] + lea r6, [strideq*5] + pxor m6, m6 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*1] + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*2] + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*3] + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {paddsw x, x }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(inv_txfm_add_identity_identity_16x32_16bpc).main2 + +cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob + %undef cmp + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, 
[clip_max] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly +.fast: + lea r4, [rsp+32*71] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r3, [rsp+32*3] + mov r4, r6 + lea r5, [r6+32*8] + lea rax, [pw_5+128] + call .pass2_oddhalf + call .pass2_evenhalf + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + lea r3, [rsp+32*11] + call .pass2_oddhalf + call .pass2_evenhalf + lea r3, [strideq*3] + call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 7] + mova m2, [cq+128* 9] + mova m3, [cq+128*15] + mova m4, [cq+128*17] + mova m5, [cq+128*23] + mova m6, [cq+128*25] + mova m7, [cq+128*31] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128* 5] + mova m2, [cq+128*11] + mova m3, [cq+128*13] + mova m4, [cq+128*19] + mova m5, [cq+128*21] + mova m6, [cq+128*27] + mova m7, [cq+128*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128* 6] + mova m2, [cq+128*10] + mova m3, [cq+128*14] + mova m4, [cq+128*18] + mova m5, [cq+128*22] + mova m6, [cq+128*26] + mova m7, [cq+128*30] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mova m4, [cq+128*16] + mova m5, [cq+128*20] + mova m6, [cq+128*24] + mova m7, [cq+128*28] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret +ALIGN function_align +.pass2_oddhalf: + mova m0, [r3+32* 1] ; 1 + mova m1, [r3+32* 3] ; 3 + mova m2, [r3+32* 5] ; 5 + mova m3, [r3+32* 7] ; 7 + mova m4, [r3+32*17] ; 9 + mova m5, [r3+32*19] ; 11 + mova m6, [r3+32*21] ; 13 + mova m7, [r3+32*23] ; 15 + mova m8, [r3+32*33] ; 17 + mova m9, [r3+32*35] ; 19 + mova m10, [r3+32*37] ; 21 + mova m11, [r3+32*39] ; 23 + mova m12, [r3+32*49] ; 25 + mova m13, [r3+32*51] ; 27 + mova m14, [r3+32*53] ; 29 + mova m15, [r3+32*55] ; 31 + jmp m(inv_txfm_add_dct_dct_16x32).main_oddhalf +ALIGN function_align +.pass2_evenhalf: + mova m0, [r3+32* 0] ; 0 + mova m1, [r3+32* 2] ; 2 + mova m2, [r3+32* 4] ; 4 + mova m3, [r3+32* 6] ; 6 + mova m4, [r3+32*16] ; 8 + mova m5, [r3+32*18] ; 10 + mova m6, [r3+32*20] ; 12 + mova m7, 
[r3+32*22] ; 14 + mova m8, [r3+32*32] ; 16 + mova m9, [r3+32*34] ; 18 + mova m10, [r3+32*36] ; 20 + mova m11, [r3+32*38] ; 22 + mova m12, [r3+32*48] ; 24 + mova m13, [r3+32*50] ; 26 + mova m14, [r3+32*52] ; 28 + mova m15, [r3+32*54] ; 30 + mova [rsp+gprsize], m15 + jmp m(idct_16x16_internal).main + +cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob + %undef cmp + vpbroadcastd m5, [pw_8192] + vpbroadcastd m7, [pixel_max] + pxor m6, m6 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 + call .main ; 0 + cmp eobd, 36 + jl .ret + add cq, 128*8 ; 0 1 + mov r7, dstq ; 1 + add dstq, 16 + call .main + call .main2 + cmp eobd, 136 + jl .ret + add cq, 128*16-32 ; 0 1 2 + lea dstq, [r7+16*2] ; 1 2 + call .main ; 2 + call .main2 + call .main2 + cmp eobd, 300 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + add r7, 16*3 ; 1 2 3 + mov dstq, r7 ; 2 3 + call .main ; 3 + call .main2 + call .main2 + call .main2 + cmp eobd, 535 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + mov r7, dstq ; 2 3 4 + call .main ; 3 4 + call .main2 + call .main2 + cmp eobd, 755 + jl .ret + add cq, 128*16-32 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + call .main ; 2 3 4 5 + call .main2 ; 3 4 5 + cmp eobd, 911 + jl .ret + add cq, 128*8 ; 0 1 2 3 + add dstq, 16 ; 1 2 3 4 + call .main ; 2 3 4 5 +.ret: ; 3 4 5 6 + RET +ALIGN function_align +.main2: + sub cq, 128*8-32 + lea dstq, [dstq+strideq*8-16] +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + jmp m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero + +%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n + mova m%4, [r4-32*(14+%1)] ; idct32 out31-n +%else + mova m%5, [r4-32*(45-%1)] + mova m%4, [r5-32*(20+%1)] +%endif + paddsw m%6, m%5, m%4 ; idct32 out 0+n + psubsw m%5, m%4 ; idct32 out31-n + paddsw m%4, m%5, m%3 ; out31-n + psubsw m%5, m%3 ; out32+n + paddsw m%3, m%6, m%2 ; out 0+n + psubsw m%6, m%2 ; out63-n + REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + paddw m%3, [%%d0+%7 ] + paddw m%4, [%%d1+%8 ] + paddw m%5, [%%d0+%9 ] + paddw m%6, [%%d1+%10] + pxor m%2, m%2 + REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 + vpbroadcastd m%2, [pixel_max] + REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 + mova [%%d0+%7 ], m%3 + mova [%%d1+%8 ], m%4 + mova [%%d0+%9 ], m%5 + mova [%%d1+%10], m%6 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*6] + call .main + sub eobd, 44 + jl .fast + call .main + sub eobd, 107 + jl .fast + call .main + sub eobd, 128 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 64 + add r6d, 10240 + sar r6d, 14 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 +.fast: + lea r4, [rsp+32*38] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea rax, [pw_5+128] + mova m0, [rsp+32* 2] ; in0 + mova m1, [rsp+32* 6] ; in4 + mova m2, 
[rsp+32*10] ; in8 + mova m3, [rsp+32*14] ; in12 + mova m4, [rsp+32*18] ; in16 + mova m5, [rsp+32*22] ; in20 + mova m6, [rsp+32*26] ; in24 + mova m7, [rsp+32*30] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*38] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [rsp+32* 4] ; in2 + mova m1, [rsp+32* 8] ; in6 + mova m2, [rsp+32*12] ; in10 + mova m3, [rsp+32*16] ; in14 + mova m4, [rsp+32*20] ; in18 + mova m5, [rsp+32*24] ; in22 + mova m6, [rsp+32*28] ; in26 + mova m7, [rsp+32*32] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + mova m0, [rsp+32* 3] ; in1 + mova m1, [rsp+32*33] ; in31 + mova m2, [rsp+32*19] ; in17 + mova m3, [rsp+32*17] ; in15 + mova m4, [rsp+32*11] ; in9 + mova m5, [rsp+32*25] ; in23 + mova m6, [rsp+32*27] ; in25 + mova m7, [rsp+32* 9] ; in7 + lea rax, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + mova m0, [rsp+32* 7] ; in5 + mova m1, [rsp+32*29] ; in27 + mova m2, [rsp+32*23] ; in21 + mova m3, [rsp+32*13] ; in11 + mova m4, [rsp+32*15] ; in13 + mova m5, [rsp+32*21] ; in19 + mova m6, [rsp+32*31] ; in29 + mova m7, [rsp+32* 5] ; in3 + add rax, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + call .main_part2_pass2 + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + pxor m15, m15 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + psrld m15, m11, 10 ; pd_2 + mova m8, [r6-32*4] + mova m9, [r6+32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*3] + psubd m15, m7, m9 ; out8 + paddd m7, m9 ; out7 + mova m9, [r6+32*2] + REPX {psrad x, 2}, m0, m15, m10, m7 + packssdw m0, m15 + packssdw m7, m10 + psubd m10, m1, m8 ; out14 + paddd m1, m8 ; out1 + mova m8, [r6-32*2] + psubd m15, m6, m9 ; out9 + paddd m6, m9 ; out6 + mova m9, [r6+32*1] + REPX {psrad x, 2}, m1, m15, m10, m6 + packssdw m1, m15 + packssdw m6, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6-32*1] + psubd m15, m5, m9 ; out10 + paddd m5, m9 ; out5 + mova m9, [r6+32*0] + REPX {psrad x, 2}, m2, m15, m10, m5 + packssdw m2, m15 + packssdw m5, m10 + psubd m10, m3, m8 ; out12 + paddd m3, m8 ; out3 + psubd m15, m4, m9 ; out11 + paddd m4, m9 ; out4 + REPX {psrad x, 2}, m3, m15, m10, m4 + packssdw m3, m15 + packssdw m4, m10 + call 
m(idct_16x8_internal_16bpc).transpose3 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + ret +.main_part2_pass2: + vpbroadcastd m11, [pw_1567_3784] + vpbroadcastd m12, [pw_m3784_1567] + vpbroadcastd m13, [pw_2896_2896] + lea rax, [pw_5+128] + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [pw_m2896_2896] + call m(inv_txfm_add_dct_dct_16x64).main_part2_internal + vpbroadcastd m14, [pw_2048] + IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp r4, r5 + jne .main_part2_pass2_loop + ret +ALIGN function_align +.main_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_part1: ; idct64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [r5+4*0] + vpbroadcastd m8, [r5+4*1] + vpbroadcastd m6, [r5+4*2] + vpbroadcastd m9, [r5+4*3] + vpbroadcastd m5, [r5+4*4] + vpbroadcastd m10, [r5+4*5] + vpbroadcastd m4, [r5+4*6] + vpbroadcastd m15, [r5+4*7] + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + vpbroadcastd m10, [r5+4*8] + vpbroadcastd m15, [r5+4*9] + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + paddd m4, m5 ; t60 + REPX {pmaxsd x, m12}, m8, m1, m6, m2 + REPX {pminsd x, m13}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a + REPX {pmaxsd x, m12}, m0, m3, m7, m4 + REPX {pminsd x, m13}, m0, m3, m7, m4 + vpbroadcastd m10, [r5+4*10] + vpbroadcastd m15, [r5+4*11] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m12}, m5, m3, m4, m6 + REPX {pminsd x, m13}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a + REPX {pmaxsd x, m12}, m0, m7, m1, m8 + REPX {pminsd x, m13}, m0, m7, m1, m8 + add r5, 4*12 + mova [r6-32*4], m0 + mova [r6+32*3], m7 + mova [r6-32*3], m1 + mova [r6+32*2], m8 + mova [r6-32*2], m6 + mova [r6+32*1], m4 + mova [r6-32*1], m3 + mova [r6+32*0], m5 + add r6, 32*8 + ret +.main_part2: ; idct64 steps 6-9 + lea r5, [r6+32*3] + sub r6, 32*4 + vpbroadcastd m10, [pd_1567] + vpbroadcastd m15, [pd_3784] +.main_part2_loop: + mova m0, [r6-32*32] ; t32a + mova m1, [r5-32*24] ; t39a + mova m2, [r5-32*32] ; t63a + mova m3, [r6-32*24] ; t56a + mova m4, [r6-32*16] ; t40a + mova m5, [r5-32* 8] ; t47a + mova m6, [r5-32*16] ; t55a + mova m7, [r6-32* 8] ; t48a + psubd m8, m0, m1 ; t39 + paddd m0, m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + 
REPX {pmaxsd x, m12}, m8, m1, m3, m4 + REPX {pminsd x, m13}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a + REPX {pmaxsd x, m12}, m0, m2, m5, m7 + REPX {pminsd x, m13}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m12}, m6, m7, m5, m4 + REPX {pminsd x, m13}, m6, m7, m5, m4 + REPX {pmulld x, m14}, m6, m7, m5, m4 + REPX {pmaxsd x, m12}, m2, m0, m8, m1 + REPX {pminsd x, m13}, m2, m0, m8, m1 + paddd m6, m11 + paddd m5, m11 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r5-32* 8], m2 + mova [r6-32*32], m0 + mova [r6-32* 8], m8 + mova [r5-32*32], m1 + mova [r5-32*24], m3 + mova [r6-32*16], m6 + mova [r6-32*24], m7 + mova [r5-32*16], m5 + add r6, 32 + sub r5, 32 + cmp r6, r5 + jl .main_part2_loop + ret + +cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob + %undef cmp + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + lea r6, [rsp+32*6] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 64 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 +.fast: + lea r4, [rsp+32*70] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea rax, [pw_5 + 128] + mov r10, rsp + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10+32* 2] ; in0 + mova m1, [r10+32* 6] ; in4 + mova m2, [r10+32*18] ; in8 + mova m3, [r10+32*22] ; in12 + mova m4, [r10+32*34] ; in16 + mova m5, [r10+32*38] ; in20 + mova m6, [r10+32*50] ; in24 + mova m7, [r10+32*54] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*70] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10+32* 4] ; in2 + mova m1, [r10+32* 8] ; in6 + mova m2, [r10+32*20] ; in10 + mova m3, [r10+32*24] ; in14 + mova m4, [r10+32*36] ; in18 + mova m5, [r10+32*40] ; in22 + mova m6, [r10+32*52] ; in26 + mova m7, [r10+32*56] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + mova m0, [r10+32* 3] ; in1 + mova m1, [r10+32*57] ; in31 + mova m2, [r10+32*35] ; in17 + mova m3, [r10+32*25] ; in15 + mova m4, [r10+32*19] ; in9 + mova m5, [r10+32*41] ; in23 + mova m6, [r10+32*51] ; in25 + mova m7, [r10+32* 9] ; in7 + lea rax, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + mova m0, [r10+32* 7] ; in5 + mova m1, [r10+32*53] ; in27 + mova m2, [r10+32*39] ; in21 + mova m3, [r10+32*21] ; in11 + mova m4, [r10+32*23] ; 
in13 + mova m5, [r10+32*37] ; in19 + mova m6, [r10+32*55] ; in29 + mova m7, [r10+32* 5] ; in3 + add rax, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 + add r10, 32*8 + sub r4, 32*98 ; rsp+32*16 + sub dstq, r8 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 7] + pmulld m2, m14, [cq+128* 9] + pmulld m3, m14, [cq+128*15] + pmulld m4, m14, [cq+128*17] + pmulld m5, m14, [cq+128*23] + pmulld m6, m14, [cq+128*25] + pmulld m7, m14, [cq+128*31] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128* 5] + pmulld m2, m14, [cq+128*11] + pmulld m3, m14, [cq+128*13] + pmulld m4, m14, [cq+128*19] + pmulld m5, m14, [cq+128*21] + pmulld m6, m14, [cq+128*27] + pmulld m7, m14, [cq+128*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128* 6] + pmulld m2, m14, [cq+128*10] + pmulld m3, m14, [cq+128*14] + pmulld m4, m14, [cq+128*18] + pmulld m5, m14, [cq+128*22] + pmulld m6, m14, [cq+128*26] + pmulld m7, m14, [cq+128*30] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 4] + pmulld m2, m14, [cq+128* 8] + pmulld m3, m14, [cq+128*12] + pmulld m4, m14, [cq+128*16] + pmulld m5, m14, [cq+128*20] + pmulld m6, m14, [cq+128*24] + pmulld m7, m14, [cq+128*28] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_32x16_16bpc).main_end + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret + +cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .normal + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 +.dconly: + add r6d, 10240 + sar r6d, 14 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d +%if WIN64 + movaps [rsp+8], xmm6 +%endif + vpbroadcastw m0, xm0 + vpbroadcastd m6, [pixel_max] + pxor m5, m5 +.dconly_loop: + paddw m1, m0, [dstq+32*0] + paddw m2, m0, [dstq+32*1] + paddw m3, m0, [dstq+32*2] + paddw m4, m0, [dstq+32*3] + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + mova [dstq+32*2], m3 + mova [dstq+32*3], m4 + add dstq, strideq + dec r3d + jg .dconly_loop +%if WIN64 + movaps xmm6, [rsp+8] +%endif + RET +.normal: + PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, 
[rsp+32*4] + call .main + call .shift_transpose + cmp eobd, 36 + jl .fast + call .main + call .shift_transpose + jmp .pass2 +.fast: + pxor m0, m0 + mov r3d, 4 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + dec r3d + jg .fast_loop +.pass2: + lea r7, [r6-32*64] + lea r4, [r6-32*32] + lea rax, [pw_5+128] + mov r5, dstq +.pass2_loop: + mova m0, [r7-32*4] + mova m1, [r7-32*3] + mova m2, [r7-32*2] + mova m3, [r7-32*1] + mova m4, [r7+32*0] + mova m5, [r7+32*1] + mova m6, [r7+32*2] + mova m7, [r7+32*3] + add r7, 32*32 + mova m8, [r7-32*4] + mova m9, [r7-32*3] + mova m10, [r7-32*2] + mova m11, [r7-32*1] + mova m12, [r7+32*0] + mova m13, [r7+32*1] + mova m14, [r7+32*2] + mova m15, [r7+32*3] + sub r7, 32*24 + mova [rsp], m15 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16 + add r5, 32 + mov dstq, r5 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + mova m0, [cq+64* 2] + mova m1, [cq+64*14] + mova m2, [cq+64*18] + mova m3, [cq+64*30] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast + mova m0, [cq+64* 6] + mova m1, [cq+64*10] + mova m2, [cq+64*22] + mova m3, [cq+64*26] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast + mova m0, [cq+64* 4] + mova m1, [cq+64*12] + mova m2, [cq+64*20] + mova m3, [cq+64*28] + call m(idct_8x16_internal_16bpc).main_oddhalf_fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + pxor m15, m15 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m15 + mova [cq+r7-64*1], m15 + mova [cq+r7+64*0], m15 + mova [cq+r7+64*1], m15 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m15, m11, 10 ; pd_2 +.main_end2: + add cq, 32 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_16bpc).main + add r6, 32*8 + call m(idct_8x16_internal_16bpc).main_evenhalf + mova [r6+32*2], m1 + mova [r6+32*1], m2 + mova [r6+32*0], m3 + mova [r6-32*1], m4 + mova [r6-32*2], m5 + mova [r6-32*3], m6 + mova [r6-32*4], m7 + jmp .main_end_loop_start +.main_end_loop: + mova m0, [r6+32* 3] ; idct8 0 + n +.main_end_loop_start: + mova m1, [r5+32* 4] ; idct16 15 - n + mova m2, [r5-32*12] ; idct32 16 + n + mova m3, [r6-32*13] ; idct32 31 - n + mova m4, [r6-32*29] ; idct64 63 - n + mova m5, [r5-32*28] ; idct64 48 + n + mova m6, [r6-32*45] ; idct64 47 - n + mova m7, [r5-32*44] ; idct64 32 + n + paddd m8, m0, m1 ; idct16 out0 + n + psubd m0, m1 ; idct16 out15 - n + REPX {pmaxsd x, m12}, m8, m0 + REPX {pminsd x, m13}, m8, m0 + paddd m1, m8, m3 ; idct32 out0 + n + psubd m8, m3 ; idct32 out31 - n + paddd m3, m0, m2 ; idct32 out15 - n + psubd m0, m2 ; idct32 out16 + n + REPX {pmaxsd x, m12}, m1, m8, m3, m0 + REPX {pminsd x, m13}, m1, m3, m8, m0 + REPX {paddd x, m15}, m1, m3, m0, m8 + paddd m2, m1, m4 ; idct64 out0 + n (unshifted) + psubd m1, m4 ; idct64 out63 - n 
(unshifted) + paddd m4, m3, m5 ; idct64 out15 - n (unshifted) + psubd m3, m5 ; idct64 out48 + n (unshifted) + paddd m5, m0, m6 ; idct64 out16 + n (unshifted) + psubd m0, m6 ; idct64 out47 - n (unshifted) + paddd m6, m8, m7 ; idct64 out31 - n (unshifted) + psubd m8, m7 ; idct64 out32 + n (unshifted) + mova [r5-32*44], m2 + mova [r6+32* 3], m1 + mova [r6-32*45], m4 + mova [r5+32* 4], m3 + mova [r5-32*28], m5 + mova [r6-32*13], m0 + mova [r6-32*29], m6 + mova [r5-32*12], m8 + add r5, 32 + sub r6, 32 + cmp r5, r6 + jl .main_end_loop + ret +.shift_transpose: +%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift + sub r6, 32*48 + mov r5, r6 +%%loop: + mova m0, [r6-32* 4] + mova m4, [r6+32* 4] + mova m1, [r6-32* 3] + mova m5, [r6+32* 5] + mova m2, [r6-32* 2] + mova m6, [r6+32* 6] + mova m3, [r6-32* 1] + mova m7, [r6+32* 7] + REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m4, [r6+32* 0] + mova m6, [r6+32* 8] + mova m5, [r6+32* 1] + mova m7, [r6+32* 9] + REPX {psrad x, %1}, m4, m6, m5, m7 + packssdw m4, m6 + packssdw m5, m7 + mova m6, [r6+32* 2] + mova m8, [r6+32*10] + mova m7, [r6+32* 3] + mova m9, [r6+32*11] + REPX {psrad x, %1}, m6, m8, m7, m9 + packssdw m6, m8 + packssdw m7, m9 + call m(idct_16x8_internal_16bpc).transpose3 + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + add r6, 32*16 + add r5, 32*8 + cmp r5, r4 + jl %%loop + mov r6, r4 +%endmacro + IDCT64_SHIFT_TRANSPOSE 2 + ret + +cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r7, [r6-32*32] + lea r5, [r6+32*8] + lea rax, [pw_5+128] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq +.pass2_loop: + mova m0, [r7-32*99] + mova m1, [r7-32*97] + mova m2, [r7-32*95] + mova m3, [r7-32*93] + mova m4, [r7-32*67] + mova m5, [r7-32*65] + mova m6, [r7-32*63] + mova m7, [r7-32*61] + mova m8, [r7-32*35] + mova m9, [r7-32*33] + mova m10, [r7-32*31] + mova m11, [r7-32*29] + mova m12, [r7-32* 3] + mova m13, [r7-32* 1] + mova m14, [r7+32* 1] + mova m15, [r7+32* 3] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + mova m0, [r7-32*100] + mova m1, [r7-32*98] + mova m2, [r7-32*96] + mova m3, [r7-32*94] + mova m4, [r7-32*68] + mova m5, [r7-32*66] + mova m6, [r7-32*64] + mova m7, [r7-32*62] + mova m8, [r7-32*36] + mova m9, [r7-32*34] + mova m10, [r7-32*32] + mova m11, [r7-32*30] + mova m12, [r7-32* 4] + mova m13, [r7-32* 2] + mova m14, [r7+32* 0] + mova m15, [r7+32* 2] + add r7, 32*8 + mova [rsp], m15 + call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea 
r5, [idct64_mul_16bpc] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128*31] + pmulld m2, m14, [cq+128*17] + pmulld m3, m14, [cq+128*15] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 7] + pmulld m1, m14, [cq+128*25] + pmulld m2, m14, [cq+128*23] + pmulld m3, m14, [cq+128* 9] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 5] + pmulld m1, m14, [cq+128*27] + pmulld m2, m14, [cq+128*21] + pmulld m3, m14, [cq+128*11] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128*29] + pmulld m2, m14, [cq+128*19] + pmulld m3, m14, [cq+128*13] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128*14] + pmulld m2, m14, [cq+128*18] + pmulld m3, m14, [cq+128*30] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast_rect2 + pmulld m0, m14, [cq+128* 6] + pmulld m1, m14, [cq+128*10] + pmulld m2, m14, [cq+128*22] + pmulld m3, m14, [cq+128*26] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast_rect2 + pmulld m0, m14, [cq+128* 4] + pmulld m1, m14, [cq+128*12] + pmulld m2, m14, [cq+128*20] + pmulld m3, m14, [cq+128*28] + call m(idct_8x16_internal_16bpc).main_oddhalf_fast_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 8] + pmulld m2, m14, [cq+128*16] + pmulld m3, m14, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + psrld m15, m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end2 + IDCT64_SHIFT_TRANSPOSE 1 + ret + +cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r10, [r6-32*32] + lea rax, [pw_5+128] + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10-32*100] ; in0 + mova m1, [r10-32*96] ; in4 + mova m2, [r10-32*68] ; in8 + mova m3, [r10-32*64] ; in12 + mova m4, [r10-32*36] ; in16 + mova m5, [r10-32*32] ; in20 + mova m6, [r10-32* 4] ; in24 + mova m7, [r10+32* 0] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10-32*98] ; in2 + mova m1, 
[r10-32*94] ; in6 + mova m2, [r10-32*66] ; in10 + mova m3, [r10-32*62] ; in14 + mova m4, [r10-32*34] ; in18 + mova m5, [r10-32*30] ; in22 + mova m6, [r10-32* 2] ; in26 + mova m7, [r10+32* 2] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + mova m0, [r10-32*99] ; in1 + mova m1, [r10+32* 3] ; in31 + mova m2, [r10-32*35] ; in17 + mova m3, [r10-32*61] ; in15 + mova m4, [r10-32*67] ; in9 + mova m5, [r10-32*29] ; in23 + mova m6, [r10-32* 3] ; in25 + mova m7, [r10-32*93] ; in7 + lea rax, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + mova m0, [r10-32*95] ; in5 + mova m1, [r10-32* 1] ; in27 + mova m2, [r10-32*31] ; in21 + mova m3, [r10-32*65] ; in11 + mova m4, [r10-32*63] ; in13 + mova m5, [r10-32*33] ; in19 + mova m6, [r10+32* 1] ; in29 + mova m7, [r10-32*97] ; in3 + add rax, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 + add r10, 32*8 + sub dstq, r8 + sub r4, 32*44 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + mova m0, [cq+128* 1] + mova m1, [cq+128*31] + mova m2, [cq+128*17] + mova m3, [cq+128*15] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+128* 7] + mova m1, [cq+128*25] + mova m2, [cq+128*23] + mova m3, [cq+128* 9] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+128* 5] + mova m1, [cq+128*27] + mova m2, [cq+128*21] + mova m3, [cq+128*11] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128*29] + mova m2, [cq+128*19] + mova m3, [cq+128*13] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128*14] + mova m2, [cq+128*18] + mova m3, [cq+128*30] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast + mova m0, [cq+128* 6] + mova m1, [cq+128*10] + mova m2, [cq+128*22] + mova m3, [cq+128*26] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast + mova m0, [cq+128* 4] + mova m1, [cq+128*12] + mova m2, [cq+128*20] + mova m3, [cq+128*28] + call m(idct_8x16_internal_16bpc).main_oddhalf_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m2, [cq+128*16] + mova m3, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).shift_transpose + +%endif ; ARCH_X86_64 diff --git a/src/x86/itx_avx2.asm b/src/x86/itx_avx2.asm index 1caa814ed4..bd64250f57 100644 --- a/src/x86/itx_avx2.asm +++ b/src/x86/itx_avx2.asm @@ -32,7 +32,7 @@ SECTION_RODATA 16 ; Note: The order of (at least some of) those constants matter! 
-deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 %macro COEF_PAIR 2 pw_%1_%2: dw %1, %2 @@ -48,19 +48,20 @@ pw_m3803_3344: dw -3803, 3344 pw_m3803_m6688: dw -3803, -6688 pw_2896_m2896: dw 2896, -2896 -pw_5: times 2 dw 5 -pw_2048: times 2 dw 2048 -pw_4096: times 2 dw 4096 -pw_8192: times 2 dw 8192 -pw_16384: times 2 dw 16384 -pw_1697x16: times 2 dw 1697*16 -pw_1697x8: times 2 dw 1697*8 -pw_2896x8: times 2 dw 2896*8 - -pd_2048: dd 2048 - -COEF_PAIR 2896, 2896 -COEF_PAIR 1567, 3784 +const pw_5, times 2 dw 5 +const pw_2048, times 2 dw 2048 +const pw_4096, times 2 dw 4096 +const pw_8192, times 2 dw 8192 +const pw_16384, times 2 dw 16384 +const pw_1697x16, times 2 dw 1697*16 +const pw_1697x8, times 2 dw 1697*8 +const pw_2896x8, times 2 dw 2896*8 +const pd_2048, dd 2048 + +const pw_2896_2896, dw 2896, 2896 +const pw_m2896_2896, dw -2896, 2896 +const pw_1567_3784, dw 1567, 3784 +const pw_m3784_1567, dw -3784, 1567 COEF_PAIR 3784, 1567 COEF_PAIR 201, 4091 COEF_PAIR 995, 3973 @@ -77,7 +78,7 @@ COEF_PAIR 3920, 1189 COEF_PAIR 799, 4017 COEF_PAIR 3406, 2276 pw_m799_m4017: dw -799, -4017 -pw_m1567_m3784: dw -1567, -3784 +const pw_m1567_m3784, dw -1567, -3784 pw_m3406_m2276: dw -3406, -2276 pw_m401_m4076: dw -401, -4076 pw_m3166_m2598: dw -3166, -2598 @@ -106,10 +107,11 @@ pw_2440x8: COEF_X8 2440 pw_m601x8: COEF_X8 -601 pw_4052x8: COEF_X8 4052 -idct64_mul: COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 - COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 - COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 - COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 +const idct64_mul +COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 +COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 +COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 +COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 pw_201_4091x8: dw 201*8, 4091*8 pw_m601_4052x8: dw -601*8, 4052*8 @@ -476,7 +478,7 @@ cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 .end2: ITX4_END 0, 1, 2, 3 ALIGN function_align -.main: +cglobal_label .main IADST4_1D_PACKED ret @@ -708,7 +710,7 @@ cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pshufd m1, m1, q1032 jmp m(iadst_4x8_internal).end2 ALIGN function_align -.main: +cglobal_label .main WRAP_XMM IDCT8_1D_PACKED ret @@ -759,7 +761,7 @@ ALIGN function_align WRAP_XMM IADST8_1D_PACKED 1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 WRAP_XMM IADST8_1D_PACKED 2 ret @@ -945,7 +947,7 @@ cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 pshufd m3, m3, q1032 jmp m(iadst_4x16_internal).end2 ALIGN function_align -.main: +cglobal_label .main WRAP_XMM IDCT16_1D_PACKED ret @@ -1019,7 +1021,7 @@ ALIGN function_align vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 pshufd m3, m3, q1032 ; in12 in15 in13 in14 pshufd m2, m4, q1032 ; in11 in8 in9 in10 -.main2: +cglobal_label .main2 vpbroadcastd m8, [o(pd_2048)] pxor m7, m7 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 @@ -1278,7 +1280,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 WRITE_8X4 0, 1, 4, 5 RET ALIGN function_align -.main: +cglobal_label .main IADST4_1D_PACKED ret @@ -1398,7 +1400,7 @@ cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal).end2 ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -1465,7 +1467,7 @@ ALIGN function_align IADST8_1D_PACKED 1 ret ALIGN function_align -.main_pass2: 
+cglobal_label .main_pass2 IADST8_1D_PACKED 2 ret @@ -1631,7 +1633,7 @@ cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 WRITE_8X4 6, 7, 0, 1 RET ALIGN function_align -.main: +cglobal_label .main IDCT16_1D_PACKED ret @@ -1659,7 +1661,7 @@ ALIGN function_align REPX {vpermq x, x, q3120}, m4, m5, m6, m7 jmp m(idct_8x16_internal).end2 ALIGN function_align -.main: +cglobal_label .main REPX {pshufd x, x, q1032}, m7, m1, m5, m3 .main2: vpbroadcastd m10, [o(pd_2048)] @@ -1759,7 +1761,7 @@ ALIGN function_align pxor m9, m9 ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end vpbroadcastd m8, [o(pw_2896x8)] pshufb m2, m11, m12 pshufb m5, m12 @@ -1964,7 +1966,7 @@ cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 call .main jmp m(iadst_16x4_internal).end ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret @@ -2023,7 +2025,7 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m6, [o(pw_m3344_3344)] vpbroadcastd m7, [o(pw_3803_1321)] vpbroadcastd m8, [o(pw_m1321_2482)] @@ -2252,7 +2254,7 @@ cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 @@ -2287,7 +2289,7 @@ ALIGN function_align REPX {pmulhrsw x, m9}, m0, m2, m4, m6 jmp m(idct_16x8_internal).end2 ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m10, [o(pd_2048)] ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a @@ -2338,7 +2340,7 @@ ALIGN function_align packssdw m5, m11 ; -out5 ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end vpbroadcastd m8, [o(pw_2896x8)] psubsw m4, m5, m3 paddsw m3, m5 @@ -2618,7 +2620,7 @@ cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 WRITE_16X2 14, 15, 0, 1, strideq*2, r3 RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m15, [o(pd_2048)] mova [rsp+gprsize+32*1], m1 mova [rsp+gprsize+32*2], m9 @@ -2680,7 +2682,7 @@ ALIGN function_align psubw m1, m6, m1 jmp m(idct_16x16_internal).end2 ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m15, [o(pd_2048)] mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*2], m4 @@ -2811,7 +2813,7 @@ ALIGN function_align vpbroadcastd m1, [o(pw_8192)] ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to ; 16-bit here will produce the same result as using 32-bit intermediates. 
paddsw m5, m10, m11 ; -out5 @@ -3172,7 +3174,7 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob WRITE_8X4 14, 15, 4, 6 RET ALIGN function_align -.main_fast: ; bottom half is zero +cglobal_label .main_fast ; bottom half is zero call m(idct_8x16_internal).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 @@ -3187,7 +3189,7 @@ ALIGN function_align ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main2 ALIGN function_align -.main: +cglobal_label .main call m(idct_8x16_internal).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 @@ -3751,7 +3753,7 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob call .pass2_end RET ALIGN function_align -.main_oddhalf_fast: ; lower half is zero +cglobal_label .main_oddhalf_fast ; lower half is zero mova [rsp+gprsize+32*1], m7 pxor m7, m7 mova [rsp+gprsize+32*0], m7 @@ -3783,7 +3785,7 @@ ALIGN function_align vpbroadcastd m15, [o(pd_2048)] jmp .main2 ALIGN function_align -.main_oddhalf: +cglobal_label .main_oddhalf mova [rsp+gprsize+32*0], m15 mova [rsp+gprsize+32*1], m7 mova [rsp+gprsize+32*2], m8 @@ -4657,7 +4659,7 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob RET ALIGN function_align %define o_base idct64_mul - 8 -.main_part1: +cglobal_label .main_part1 ; idct64 steps 1-5: ; in1/31/17/15/ 9/23/25/ 7 -> ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a @@ -4774,7 +4776,7 @@ ALIGN function_align cmp tmp1q, tmp2q jne .main_part2_pass1_loop ret -.main_part2_internal: +cglobal_label .main_part2_internal mova m0, [tmp1q-32*12] ; t32a mova m6, [tmp2q-32*13] ; t39a mova m1, [tmp1q-32* 4] ; t40a From 64bc07fd4d3cd54cbf19a323c882f07c73c4aae8 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 4 May 2021 14:17:08 +0200 Subject: [PATCH 064/188] x86: Add high bitdepth filmgrain AVX2 asm --- src/x86/film_grain16_avx2.asm | 2092 +++++++++++++++++++++++++++++++++ 1 file changed, 2092 insertions(+) create mode 100644 src/x86/film_grain16_avx2.asm diff --git a/src/x86/film_grain16_avx2.asm b/src/x86/film_grain16_avx2.asm new file mode 100644 index 0000000000..58225b40d8 --- /dev/null +++ b/src/x86/film_grain16_avx2.asm @@ -0,0 +1,2092 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pd_0x10000: times 8 dd 0x10000 +pw_1024: times 16 dw 1024 +pw_23_22: times 8 dw 23, 22 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pd_16: dd 16 +pd_m65536: dd ~0xffff +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +pw_27_17_17_27: dw 27, 17, 17, 27 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_YMM avx2 +cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r6d, [bdmaxq+1] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r6 + vpbroadcastw xm8, [base+round+r3*2-2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastd xm9, [base+pd_m65536] + mov r3, -73*82*2 + sub bufq, r3 + lea r6, [gaussian_sequence] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw xm2, xm8 + movq 
[bufq+r3], xm2 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_avx2_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_avx2_table] + jmp r3 + +.ar1: + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 + pinsrb xm4, [pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm8, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 + vinserti128 m8, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 + pxor m9, m9 + punpcklwd xm14, xm9 + pcmpgtb m9, m8 + punpcklbw m8, m9 ; cf5-11,0-4 + vpermq m9, m8, q3333 ; cf4 + psrldq xm10, xm8, 6 ; cf8-11 + vpblendw xm9, xm10, 11111110b ; cf4,9-11 + pshufd m12, m8, q0000 ; cf[5,6], cf[0-1] + pshufd m11, m8, q1111 ; cf[7,8], cf[2-3] + pshufd xm13, xm9, q1111 ; cf[10,11] + pshufd xm10, xm9, q0000 ; cf[4,9] + sar bdmaxd, 1 + movd xm15, bdmaxd + pcmpeqd xm7, xm7 + vpbroadcastd xm15, xm15 ; max_grain + pxor xm7, xm15 ; min_grain + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] + psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] + + vextracti128 xm4, m0, 1 ; y=-2,x=[-2,+5] + punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpckhwd xm4, xm0 ; y=-2/-1 interleaved, x=[+2,+5] + punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + + pmaddwd m2, m11 + pmaddwd m0, m12 + pmaddwd xm4, xm10 + + paddd m0, m2 + vextracti128 xm2, m0, 1 + paddd xm4, xm0 + paddd xm2, xm14 + paddd xm2, xm4 + + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd xm4, xm0, q3321 + pmovsxwd xm4, xm4 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + paddd xm3, xm4 + pminsd xm3, xm15 + pmaxsd xm3, xm7 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm4, 4 + pslldq xm3, 2 + psrldq xm0, 2 + vpblendw xm0, xm3, 0010b + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + 
dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~31 + sub rsp, 64 + %define tmp rsp +%elif STACK_ALIGNMENT < 32 + mov r6, rsp + and r6, ~31 + %define tmp r6-64 +%else + %define tmp rsp+stack_offset-88 +%endif + sar bdmaxd, 1 + movd xm15, bdmaxd + pcmpeqd xm13, xm13 + vpbroadcastd xm15, xm15 ; max_grain + pxor xm13, xm15 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw m14, [base+round_vals+shiftq*2-12] + movq xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 + movd xm1, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 + pinsrb xm1, [pb_1], 3 ; cf14-16,pb_1 + movd xm2, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 + vinserti128 m1, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 + punpcklbw m0, m0 ; sign-extension + punpcklbw m1, m1 ; sign-extension + punpcklbw xm2, xm2 + REPX {psraw x, 8}, m0, m1, xm2 + + pshufd m8, m0, q0000 ; cf[0,1] | cf[7,8] + pshufd m9, m0, q1111 ; cf[2,3] | cf[9,10] + pshufd m10, m0, q2222 ; cf[4,5] | cf[11,12] + pshufd xm11, xm0, q3333 ; cf[6,13] + + pshufd m3, m1, q0000 ; cf[14,15] | cf[17,18] + pshufd m4, m1, q1111 ; cf[16],pw_1 | cf[19,20] + mova [tmp+0*32], m3 + mova [tmp+1*32], m4 + + paddw xm5, xm14, xm14 + vpblendw xm12, xm2, xm5, 00001000b + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + + palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + vextracti128 xm7, m1, 1 + punpcklwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] + + psrldq m3, m2, 2 + psrldq m4, m2, 4 + psrldq m7, m2, 6 + vpblendd m7, m14, 00001111b ; rounding constant + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + + pmaddwd m0, m8 + pmaddwd m6, m9 + pmaddwd m5, m10 + pmaddwd xm1, xm11 + pmaddwd m2, [tmp+0*32] + pmaddwd m4, [tmp+1*32] + + paddd m0, m6 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 + vextracti128 xm4, m0, 1 + paddd xm0, xm1 + paddd xm0, xm4 + + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm12 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + pminsd xm2, xm15 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + psrldq xm1, 2 + vpblendw xm1, xm2, 0100b + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%endif + RET + +INIT_XMM avx2 +cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + movq xm1, [base+rnd_next_upperbit_mask] + movq 
xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + lea r6d, [bdmaxq+1] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + vpbroadcastw xm8, [base+round+r5*2-2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] + pxor xm0, xm9 + vpbroadcastd xm9, [base+pd_m65536] + lea r6, [gaussian_sequence] + mov r7d, 38 + add bufq, 44*2 +.loop_y: + mov r5, -44 +.loop_x: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw xm2, xm8 + movq [bufq+r5*2], xm2 + add r5, 4 + jl .loop_x + add bufq, 82*2 + dec r7d + jg .loop_y + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_420_16bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_420_16bpc_avx2_table] + jmp r5 + +.ar0: + INIT_YMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + vpbroadcastw m3, [base+hmul_bits+shiftq*2-10] + sar bdmaxd, 1 + movd xm14, bdmaxd + pcmpeqw m7, m7 + vpbroadcastw m14, xm14 ; max_gain + pxor m7, m14 ; min_grain + DEFINE_ARGS buf, bufy, h + pmovsxbw xm4, xm4 + vpbroadcastw m6, [hmul_bits+4] + vpbroadcastw m4, xm4 + pxor m5, m5 + sub bufq, 2*(82*38+82-(82*3+41)) + add bufyq, 2*(3+82*3) + mov hd, 35 +.y_loop_ar0: + ; first 32 pixels + movu xm8, [bufyq] + movu xm9, [bufyq+82*2] + movu xm10, [bufyq+ 16] + movu xm11, [bufyq+82*2+16] + vinserti128 m8, [bufyq+ 32], 1 + vinserti128 m9, [bufyq+82*2+32], 1 + vinserti128 m10, [bufyq+ 48], 1 + vinserti128 m11, [bufyq+82*2+48], 1 + paddw m8, m9 + paddw m10, m11 + phaddw m8, m10 + movu xm10, [bufyq+ 64] + movu xm11, [bufyq+82*2+64] + movu xm12, [bufyq+ 80] + movu xm13, [bufyq+82*2+80] + vinserti128 m10, [bufyq+ 96], 1 + vinserti128 m11, [bufyq+82*2+96], 1 + vinserti128 m12, [bufyq+ 112], 1 + vinserti128 m13, [bufyq+82*2+112], 1 + paddw m10, m11 + paddw m12, m13 + phaddw m10, m12 + pmulhrsw m8, m6 + pmulhrsw m10, m6 + punpckhwd m9, m8, m5 + punpcklwd m8, m5 + punpckhwd m11, m10, m5 + punpcklwd m10, m5 + REPX {pmaddwd x, m4}, m8, m9, m10, m11 + REPX {psrad x, 5}, m8, m9, m10, m11 + packssdw m8, m9 + packssdw m10, m11 + REPX {pmulhrsw x, m3}, m8, m10 + paddw m8, [bufq+ 0] + paddw m10, [bufq+32] + pminsw m8, m14 + pminsw m10, m14 + pmaxsw m8, m7 + pmaxsw m10, m7 + movu [bufq+ 0], m8 + movu [bufq+32], m10 + + ; last 6 pixels + movu xm8, [bufyq+32*4] + movu xm10, [bufyq+32*4+16] + paddw xm8, [bufyq+32*4+82*2] + paddw xm10, [bufyq+32*4+82*2+16] + phaddw xm8, xm10 + pmulhrsw xm8, xm6 + punpckhwd xm9, xm8, xm5 + punpcklwd xm8, xm5 + REPX {pmaddwd x, xm4}, xm8, xm9 + REPX {psrad x, 5}, xm8, xm9 + packssdw xm8, xm9 + pmulhrsw xm8, xm3 + movu xm0, [bufq+32*2] + paddw xm8, xm0 + pminsw xm8, xm14 + pmaxsw xm8, xm7 + vpblendw xm0, xm8, xm0, 11000000b + movu [bufq+32*2], xm0 + + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar0 + 
RET + +.ar1: + INIT_XMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd + vpbroadcastw xm6, [hmul_bits+4] + vpbroadcastd xm3, xm3 + sub bufq, 2*(82*38+44-(82*3+41)) + add bufyq, 2*(79+82*3) + mov hd, 35 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -38 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left + movu xm8, [bufyq+xq*4] + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right + phaddw xm8, [bufyq+xq*4+82*2] + pshufd xm9, xm8, q3232 + paddw xm8, xm9 + pmulhrsw xm8, xm6 + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar1 + RET + + INIT_YMM avx2 +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + sar bdmaxd, 1 + movd xm6, bdmaxd + pcmpeqd xm5, xm5 + vpbroadcastd xm6, xm6 ; max_grain + pxor xm5, xm6 ; min_grain + vpbroadcastw xm7, [base+hmul_bits+4] + vpbroadcastw xm15, [base+round_vals-12+shiftq*2] + + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5] + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 + pinsrb xm0, [pb_1], 5 + pinsrw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 + movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+9], 13 + pmovsxbw m0, xm0 + + pshufd xm13, xm0, q3333 + pshufd m12, m0, q0000 + pshufd m11, m0, q1111 + pshufd m10, m0, q2222 + + DEFINE_ARGS buf, bufy, fg_data, h, x + sub bufq, 2*(82*38+44-(82*3+41)) + add bufyq, 2*(79+82*3) + mov hd, 35 +.y_loop_ar2: + mov xq, -38 + +.x_loop_ar2: + movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] + psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] + + movu xm8, [bufyq+xq*4] + paddw xm8, [bufyq+xq*4+82*2] + phaddw xm8, xm8 + + vinserti128 m4, xm0, 1 ; y=-1,x=[-2,+5] + punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpckhwd m4, m0, m4 ; y=-2/-1 interleaved, x=[+2,+5] + punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + + pmulhrsw xm1, xm8, xm7 + punpcklwd xm1, xm15 ; luma, round interleaved + vpblendd m1, m1, m4, 11110000b + + pmaddwd m2, m11 + pmaddwd m0, m12 + pmaddwd m1, m10 + paddd m2, m0 + paddd m2, m1 + vextracti128 xm0, m2, 1 + paddd xm2, xm0 + + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd xm4, xm0, q3321 + pmovsxwd xm4, xm4 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since 
we only care about one value + paddd xm3, xm4 + pminsd xm3, xm6 + pmaxsd xm3, xm5 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm0, 2 + pslldq xm3, 2 + psrldq xm4, 4 + vpblendw xm0, xm3, 00000010b + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~31 + sub rsp, 96 + %define tmp rsp +%elif STACK_ALIGNMENT < 32 + mov r6, rsp + and r6, ~31 + %define tmp r6-96 +%else + %define tmp rsp+stack_offset-120 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + sar bdmaxd, 1 + movd xm15, bdmaxd + pcmpeqd xm13, xm13 + vpbroadcastd xm15, xm15 ; max_grain + pxor xm13, xm15 ; min_grain + vpbroadcastw xm12, [base+hmul_bits+4] + + movq xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma + movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] + pmovsxbw m0, xm0 + + pshufd m11, m0, q3333 + pshufd m10, m0, q2222 + pshufd m9, m0, q1111 + pshufd m8, m0, q0000 + + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] + pinsrb xm0, [pb_1], 3 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 + pmovsxbw m0, xm0 + + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+32*2], m11 + pshufd xm11, xm0, q3232 + mova [tmp+32*0], m1 + mova [tmp+32*1], m2 + pinsrw xm11, [base+round_vals-10+shiftq*2], 3 + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + sub bufq, 2*(82*38+44-(82*3+41)) + add bufyq, 2*(79+82*3) + mov hd, 35 +.y_loop_ar3: + mov xq, -38 + +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + + movu xm7, [bufyq+xq*4] + paddw xm7, [bufyq+xq*4+82*2] + phaddw xm7, xm7 + + palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmulhrsw xm7, xm12 + punpcklwd m1, m7 + + psrldq m3, m2, 2 + psrldq m4, m2, 4 + psrldq m7, m2, 6 + vpblendd m7, m14, 00001111b ; rounding constant + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + + pmaddwd m0, m8 + pmaddwd m6, m9 + pmaddwd m5, m10 + pmaddwd m1, [tmp+32*2] + pmaddwd m2, [tmp+32*0] + pmaddwd m4, [tmp+32*1] + + paddd m0, m6 + paddd m5, m2 + paddd m4, m1 + paddd m0, m4 + paddd m0, m5 + vextracti128 xm4, m0, 1 + paddd xm0, xm4 + + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm11 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; no need to packssdw since we only care about one value + pminsd xm2, xm15 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + psrldq xm1, 2 + vpblendw xm1, xm2, 00000100b + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 
82*4 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%endif + RET + +INIT_YMM avx2 +cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+round_vals+r7*2-12] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, r9m ; bdmax + sar r9d, 11 ; is_12bpc + shlx r10d, r6d, r9d + vpbroadcastw m13, [base+min+r10*2] + lea r9d, [r9d*3] + lea r9d, [r6d*2+r9d] + vpbroadcastw m12, [base+max+r9*2] + vpbroadcastw m10, r9m + mov r9mp, r7 + pxor m2, m2 + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, byte [fg_dataq+FGData.overlap_flag] + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pminuw m0, m10, [srcq+ 0] + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m8, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m5, [scalingq+m6-3], m3 + vpgatherdd m6, [scalingq+m7-3], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+offxyq*2+32] + + ; noise = round2(scaling[src] * grain, scaling_shift) + ; the problem here is that since the grain is 10-bits, the product of + ; scaling*grain is 17+sign bits, so we need to unfortunately do some + ; of these steps in 32-bits + punpckhwd m7, m9, m11 + punpcklwd m9, m11 + pmaddwd m9, m8 + pmaddwd m7, m4 + punpckhwd m8, m3, m11 + punpcklwd m3, m11 + pmaddwd m3, m5 + pmaddwd m8, m6 + REPX {psrad x, r9m}, m9, m7, m3, m8 + packssdw m9, m7 + packssdw m3, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + + ; r8m = sbym + movq xm15, [pw_27_17_17_27] + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) + vpbroadcastd xm14, [pd_16] +.loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+32] ; previous column's 
offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + pminuw m0, m10, [srcq+ 0] + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m8, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m5, [scalingq+m6-3], m3 + vpgatherdd m6, [scalingq+m7-3], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movd xm7, [grain_lutq+left_offxyq*2] + punpcklwd xm7, xm9 + pmaddwd xm7, xm15 + paddd xm7, xm14 + psrad xm7, 5 + packssdw xm7, xm7 + vpblendd m9, m7, 00000001b + pcmpeqw m3, m3 + psraw m7, m10, 1 ; max_grain + pxor m3, m7 ; min_grain + pminsw m9, m7 + pmaxsw m9, m3 + movu m3, [grain_lutq+offxyq*2+32] + + ; noise = round2(scaling[src] * grain, scaling_shift) + punpckhwd m7, m9, m11 + punpcklwd m9, m11 + pmaddwd m9, m8 + pmaddwd m7, m4 + punpckhwd m8, m3, m11 + punpcklwd m3, m11 + pmaddwd m3, m5 + pmaddwd m8, m6 + REPX {psrad x, r9m}, m9, m7, m3, m8 + packssdw m9, m7 + packssdw m3, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.end: + RET + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + + vpbroadcastd m14, [pd_16] +.loop_x_v_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m7, 
[grain_lutq+top_offxyq*2] + punpckhwd m9, m7, m3 + punpcklwd m7, m3 + REPX {pmaddwd x, m15}, m9, m7 + REPX {paddd x, m14}, m9, m7 + REPX {psrad x, 5}, m9, m7 + packssdw m7, m9 + pcmpeqw m0, m0 + psraw m1, m10, 1 ; max_grain + pxor m0, m1 ; min_grain + pminsw m7, m1 + pmaxsw m7, m0 + movu m3, [grain_lutq+offxyq*2+32] + movu m8, [grain_lutq+top_offxyq*2+32] + punpckhwd m9, m8, m3 + punpcklwd m8, m3 + REPX {pmaddwd x, m15}, m9, m8 + REPX {paddd x, m14}, m9, m8 + REPX {psrad x, 5}, m9, m8 + packssdw m8, m9 + pminsw m8, m1 + pmaxsw m8, m0 + + ; src + pminuw m0, m10, [srcq+ 0] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + + ; scaling[src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m6, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + REPX {psrld x, 24}, m6, m4 + REPX {por x, [pd_0x10000]}, m6, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + punpckhwd m9, m7, m11 + punpcklwd m7, m11 + pmaddwd m6, m7 + pmaddwd m4, m9 + + REPX {psrad x, r9m}, m6, m4 + packssdw m6, m4 + + ; same for the other half + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m9, m1, m2 + punpcklwd m4, m1, m2 ; m4-7: src as dword + pcmpeqw m3, m3 + mova m7, m3 + vpgatherdd m5, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m9-3], m7 + REPX {psrld x, 24}, m5, m4 + REPX {por x, [pd_0x10000]}, m5, m4 + + punpckhwd m9, m8, m11 + punpcklwd m8, m11 + pmaddwd m5, m8 + pmaddwd m4, m9 + + REPX {psrad x, r9m}, m5, m4 + packssdw m5, m4 + + ; dst = clip_pixel(src, noise) + paddw m0, m6 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: + add wq, 32 + jge .end_hv + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + + movq xm15, [pw_27_17_17_27] +.loop_x_hv_overlap: + vpbroadcastd m8, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+32] + lea left_offxyq, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m0, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2] + movu m1, [grain_lutq+top_offxyq*2+32] + movd xm4, [grain_lutq+left_offxyq*2] + movd xm7, [grain_lutq+topleft_offxyq*2] 
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd xm4, xm3 + punpcklwd xm7, xm6 + REPX {pmaddwd x, xm15}, xm4, xm7 + REPX {paddd x, xm14}, xm4, xm7 + REPX {psrad x, 5}, xm4, xm7 + REPX {packssdw x, x}, xm4, xm7 + pcmpeqw m5, m5 + psraw m9, m10, 1 ; max_grain + pxor m5, m9 ; min_grain + REPX {pminsw x, xm9}, xm4, xm7 + REPX {pmaxsw x, xm5}, xm4, xm7 + vpblendd m3, m4, 00000001b + vpblendd m6, m7, 00000001b + ; followed by v interpolation (top | cur -> cur) + punpckhwd m7, m6, m3 + punpcklwd m6, m3 + punpckhwd m3, m1, m0 + punpcklwd m1, m0 + REPX {pmaddwd x, m8}, m7, m6, m3, m1 + REPX {paddd x, m14}, m7, m6, m3, m1 + REPX {psrad x, 5}, m7, m6, m3, m1 + packssdw m7, m6, m7 + packssdw m3, m1, m3 + REPX {pminsw x, m9}, m7, m3 + REPX {pmaxsw x, m5}, m7, m3 + + ; src + pminuw m0, m10, [srcq+ 0] + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + + ; scaling[src] + pcmpeqw m9, m9 + vpgatherdd m6, [scalingq+m4-3], m9 + pcmpeqw m9, m9 + vpgatherdd m4, [scalingq+m5-3], m9 + REPX {psrld x, 24}, m6, m4 + REPX {por x, [pd_0x10000]}, m6, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + punpckhwd m9, m7, m11 + punpcklwd m7, m11 + pmaddwd m9, m4 + pmaddwd m7, m6 + REPX {psrad x, r9m}, m9, m7 + packssdw m7, m9 + + ; other half + punpckhwd m5, m1, m2 + punpcklwd m4, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m6, m6 + vpgatherdd m9, [scalingq+m4-3], m6 + pcmpeqw m6, m6 + vpgatherdd m4, [scalingq+m5-3], m6 + REPX {psrld x, 24}, m9, m4 + REPX {por x, [pd_0x10000]}, m9, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + punpckhwd m6, m3, m11 + punpcklwd m3, m11 + pmaddwd m6, m4 + pmaddwd m3, m9 + REPX {psrad x, r9m}, m6, m3 + packssdw m3, m6 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + vpbroadcastd m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .loop_x_hv_overlap + +.end_hv: + RET + +cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + mov r7d, [fg_dataq+FGData.scaling_shift] + vpbroadcastw m11, [base+round_vals+r7*2-12] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, r13m ; bdmax + sar r9d, 11 ; is_12bpc + shlx r10d, r6d, r9d + vpbroadcastw m13, [base+min+r10*2] + lea r10d, [r9d*3] + mov r11d, is_idm + shlx r6d, r6d, r11d + add r10d, r6d + vpbroadcastw m12, [base+max+r10*2] + vpbroadcastw m10, r13m + pxor m2, m2 + mov r13mp, r7 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro FGUV_32x32xN_LOOP 1 ; not-csfl + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + +%if %1 + mov r7d, r11m + vpbroadcastw m0, [fg_dataq+FGData.uv_mult+r7*4] + vpbroadcastw m1, [fg_dataq+FGData.uv_luma_mult+r7*4] + punpcklwd m14, m1, m0 + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] + vpbroadcastd m9, [base+pw_4+r9*4] + pmullw m15, m9 +%else + vpbroadcastd m14, [pw_1024] + vpbroadcastd m15, [pw_23_22] +%endif + + 
movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, byte [fg_dataq+FGData.overlap_flag] + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*4] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 82 + lea offyq, [offyq+offxq+498] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] + mova m1, [srcq+strideq] ; m0-1: src as word + + ; luma_src + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*2+ 0] + mova xm8, [lumaq+lstrideq*2+16] + vinserti128 m6, [lumaq+lstrideq*2+32], 1 + vinserti128 m8, [lumaq+lstrideq*2+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m8, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m5, [scalingq+m6-3], m3 + vpgatherdd m6, [scalingq+m7-3], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+offxyq*2+82*2] + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + punpckhwd m7, m9, m11 + punpcklwd m9, m11 + pmaddwd m9, m8 + pmaddwd m7, m4 + punpckhwd m8, m3, m11 + punpcklwd m3, m11 + pmaddwd m3, m5 + pmaddwd m8, m6 + REPX {psrad x, r13m}, m9, m7, m3, m8 + packssdw m9, m7 + packssdw m3, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 + mova [dstq+strideq], m1 + + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*4] + add grain_lutq, 82*4 + sub hb, 2 + jg %%loop_y + + add wq, 16 + jge %%end + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + 
lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 82 + lea offyq, [offyq+offxq+498] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] + mova m1, [srcq+strideq] + + ; luma_src + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*2+ 0] + mova xm8, [lumaq+lstrideq*2+16] + vinserti128 m6, [lumaq+lstrideq*2+32], 1 + vinserti128 m8, [lumaq+lstrideq*2+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+offxyq*2+82*2] + movd xm5, [grain_lutq+left_offxyq*2+ 0] + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 ; {left0, left1} + punpcklwd xm7, xm9, xm3 ; {cur0, cur1} + punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1} +%if %1 + pmaddwd xm5, [pw_23_22] +%else + pmaddwd xm5, xm15 +%endif + vpbroadcastd xm8, [pd_16] + paddd xm5, xm8 + psrad xm5, 5 + packssdw xm5, xm5 + pcmpeqw xm8, xm8 + psraw xm7, xm10, 1 + pxor xm8, xm7 + pmaxsw xm5, xm8 + pminsw xm5, xm7 + vpblendw xm7, xm5, xm9, 11111110b + psrldq xm5, 2 + vpblendw xm5, xm3, 11111110b + vpblendd m9, m7, 00001111b + vpblendd m3, m5, 00001111b + + ; scaling[luma_src] + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + pcmpeqw m7, m7 + vpgatherdd m8, [scalingq+m4-3], m7 + pcmpeqw m7, m7 + vpgatherdd m4, [scalingq+m5-3], m7 + REPX {psrld x, 24}, m8, m4 + REPX {por x, [pd_0x10000]}, m8, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + punpckhwd m7, m9, m11 + punpcklwd m9, m11 + pmaddwd m9, m8 + pmaddwd m7, m4 + REPX {psrad x, r13m}, m9, m7 + packssdw m9, m7 + + ; same for the other half + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + pcmpeqw m8, m8 + mova m4, m8 + vpgatherdd m5, [scalingq+m6-3], m8 + vpgatherdd m6, [scalingq+m7-3], m4 + REPX {psrld x, 24}, m5, m6 + REPX {por x, [pd_0x10000]}, m5, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + punpckhwd m8, m3, m11 + punpcklwd m3, m11 + pmaddwd m3, m5 + pmaddwd m8, m6 + REPX {psrad x, r13m}, m3, m8 + packssdw m3, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 + mova [dstq+strideq], m1 + + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*4] + add grain_lutq, 82*4 + sub hb, 2 + jg %%loop_y_h_overlap + + add wq, 16 + jge %%end + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + 
+%%end: + RET + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*4] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 82 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq+0x10001*498+16*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; src + mova m0, [srcq] + mova m1, [srcq+strideq] + + ; luma_src + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*2+ 0] + mova xm8, [lumaq+lstrideq*2+16] + vinserti128 m6, [lumaq+lstrideq*2+32], 1 + vinserti128 m8, [lumaq+lstrideq*2+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + punpckhwd m7, m5, m9 + punpcklwd m5, m9 ; {top/cur interleaved} +%if %1 + REPX {pmaddwd x, [pw_23_22]}, m7, m5 +%else + REPX {pmaddwd x, m15}, m7, m5 +%endif + vpbroadcastd m3, [pd_16] + REPX {paddd x, m3}, m7, m5 + REPX {psrad x, 5}, m7, m5 + packssdw m9, m5, m7 + pcmpeqw m7, m7 + psraw m5, m10, 1 + pxor m7, m5 + pmaxsw m9, m7 + pminsw m9, m5 + + ; scaling[luma_src] + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + pcmpeqw m7, m7 + vpgatherdd m8, [scalingq+m4-3], m7 + pcmpeqw m7, m7 + vpgatherdd m4, [scalingq+m5-3], m7 + REPX {psrld x, 24}, m8, m4 + REPX {por x, [pd_0x10000]}, m8, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + punpckhwd m7, m9, m11 + punpcklwd m9, m11 + pmaddwd m9, m8 + pmaddwd m7, m4 + REPX {psrad x, r13m}, m9, m7 + packssdw m9, m7 + + ; same for the other half + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + pcmpeqw m8, m8 + mova m4, m8 + vpgatherdd m5, 
[scalingq+m6-3], m8 + vpgatherdd m6, [scalingq+m7-3], m4 + REPX {psrld x, 24}, m5, m6 + REPX {por x, [pd_0x10000]}, m5, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + movu m3, [grain_lutq+offxyq*2+82*2] + punpckhwd m8, m3, m11 + punpcklwd m3, m11 + pmaddwd m3, m5 + pmaddwd m8, m6 + REPX {psrad x, r13m}, m3, m8 + packssdw m3, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 + mova [dstq+strideq], m1 + + sub hb, 2 + jle %%end_y_v_overlap + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*4] + add grain_lutq, 82*4 + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 16 + jge %%end_hv + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 82 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq+0x10001*498+16*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movd xm5, [grain_lutq+left_offxyq*2] + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 + pinsrw xm5, [grain_lutq+topleft_offxyq*2], 2 ; { left0, left1, top/left } + movu m9, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+offxyq*2+82*2] + movu m8, [grain_lutq+top_offxyq*2] + punpcklwd xm7, xm9, xm3 ; { cur0, cur1 } + punpckldq xm7, xm8 ; { cur0, cur1, top0 } + punpcklwd xm5, xm7 ; { cur/left } interleaved + pmaddwd xm5, [pw_23_22] + vpbroadcastd xm0, [pd_16] + paddd xm5, xm0 + psrad xm5, 5 + packssdw xm5, xm5 + pcmpeqw xm0, xm0 + psraw xm7, xm10, 1 + pxor xm0, xm7 + pminsw xm5, xm7 + pmaxsw xm5, xm0 + pcmpeqw xm7, xm7 + psrldq xm7, 14 ; 0xffff, 0..... 
+ vpblendvb m9, m5, m7 ; line 0 + psrldq xm5, 2 + vpblendvb m3, m5, m7 ; line 1 + psrldq xm5, 2 + vpblendvb m5, m8, m5, m7 ; top line + + punpckhwd m7, m5, m9 + punpcklwd m5, m9 ; {top/cur interleaved} +%if %1 + REPX {pmaddwd x, [pw_23_22]}, m7, m5 +%else + REPX {pmaddwd x, m15}, m7, m5 +%endif + vpbroadcastd m9, [pd_16] + REPX {paddd x, m9}, m5, m7 + REPX {psrad x, 5}, m5, m7 + packssdw m9, m5, m7 + pcmpeqw m5, m5 + psraw m7, m10, 1 + pxor m5, m7 + pmaxsw m9, m5 + pminsw m9, m7 + + ; src + mova m0, [srcq] + mova m1, [srcq+strideq] + + ; luma_src + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*2+ 0] + mova xm8, [lumaq+lstrideq*2+16] + vinserti128 m6, [lumaq+lstrideq*2+32], 1 + vinserti128 m8, [lumaq+lstrideq*2+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 + +%if %1 + punpckhwd m8, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m8, m4, m5, m6 + REPX {psrad x, 6}, m8, m4, m5, m6 + packssdw m4, m8 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + ; scaling[luma_src] + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + pcmpeqw m7, m7 + vpgatherdd m8, [scalingq+m4-3], m7 + pcmpeqw m7, m7 + vpgatherdd m4, [scalingq+m5-3], m7 + REPX {psrld x, 24}, m8, m4 + REPX {por x, [pd_0x10000]}, m8, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + punpckhwd m7, m9, m11 + punpcklwd m9, m11 + pmaddwd m9, m8 + pmaddwd m7, m4 + REPX {psrad x, r13m}, m9, m7 + packssdw m9, m7 + + ; same for the other half + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + pcmpeqw m8, m8 + mova m4, m8 + vpgatherdd m5, [scalingq+m6-3], m8 + vpgatherdd m6, [scalingq+m7-3], m4 + REPX {psrld x, 24}, m5, m6 + REPX {por x, [pd_0x10000]}, m5, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + punpckhwd m8, m3, m11 + punpcklwd m3, m11 + pmaddwd m3, m5 + pmaddwd m8, m6 + REPX {psrad x, r13m}, m3, m8 + packssdw m3, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 + mova [dstq+strideq], m1 + + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*4] + add grain_lutq, 82*4 + sub hb, 2 + jg %%loop_y_h_overlap + +%%end_y_hv_overlap: + add wq, 16 + jge %%end_hv + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + jmp %%loop_x_hv_overlap + +%%end_hv: + RET +%endmacro + + FGUV_32x32xN_LOOP 1 +.csfl: + FGUV_32x32xN_LOOP 0 + +%endif ; ARCH_X86_64 From 48ff05f9143ef0dcae1149d96f8a2f7c47a3eb25 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Tue, 11 May 2021 08:02:21 -0400 Subject: [PATCH 065/188] x86: add 10/12-bpc AVX2 version of mc.emu_edge --- src/x86/mc16_avx2.asm | 187 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 0ca7f06cd1..951f9a112a 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -3816,4 +3816,191 @@ INIT_YMM avx2 jl .w128 RET +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor r12d, r12d + lea r10, [ihq-1] + cmp yq, ihq + cmovs r10, yq + test yq, yq + cmovs r10, r12 + imul r10, sstrideq + add srcq, r10 + + ; ref += iclip(x, 0, iw - 1) + lea r10, [iwq-1] + cmp xq, iwq + cmovs r10, xq + test xq, xq + cmovs r10, r12 + lea srcq, [srcq+r10*2] + + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) + lea bottomextq, [yq+bhq] + sub bottomextq, ihq + lea r3, [bhq-1] + cmovs bottomextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, r12 + cmp bottomextq, bhq + cmovns bottomextq, r3 + cmp topextq, bhq + cmovg topextq, r3 + + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + lea rightextq, [xq+bwq] + sub rightextq, iwq + lea r2, [bwq-1] + cmovs rightextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, r12 + cmp rightextq, bwq + cmovns rightextq, r2 + cmp leftextq, bwq + cmovns leftextq, r2 + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ + dst, dstride, src, sstride, bottomext, rightext + + ; center_h = bh - top_ext - bottom_ext + lea r3, [bottomextq+topextq] + sub centerhq, r3 + + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq + imul r2, dstrideq + add dstq, r2 + mov r9m, dstq + + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq + lea r3, [rightextq+leftextq] + sub centerwq, r3 + +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix +.v_loop_%3: +%if %1 + ; left extension + xor r3, r3 + vpbroadcastw m0, [srcq] +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, 16 + cmp r3, leftextq + jl .left_loop_%3 + + ; body + lea r12, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + movu m0, [srcq+r3*2] +%if %1 + movu [r12+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, 16 + cmp r3, centerwq + jl .body_loop_%3 + +%if %2 + ; right extension +%if %1 + lea r12, [r12+centerwq*2] +%else + lea r12, [dstq+centerwq*2] +%endif + xor r3, r3 + vpbroadcastw m0, [srcq+centerwq*2-2] +.right_loop_%3: + movu [r12+r3*2], m0 + add r3, 16 + cmp r3, rightextq + jl .right_loop_%3 + +%endif + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 +%endmacro + + test leftextq, leftextq + jnz .need_left_ext + test rightextq, rightextq + jnz .need_right_ext + v_loop 0, 0, 0 + jmp .body_done + +.need_left_ext: + test rightextq, rightextq + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: + ; bottom edge extension + test bottomextq, bottomextq + jz .top + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 +.bottom_x_loop: + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq 
+.bottom_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .bottom_y_loop + add r1, 16 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end + mov srcq, r9m + mov dstq, dstm + xor r1, r1 +.top_x_loop: + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, topextq +.top_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .top_y_loop + add r1, 16 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + %endif ; ARCH_X86_64 From 57f8ea4e3b087c3a76f523c35014f5846b39a52d Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Thu, 13 May 2021 01:14:34 -0400 Subject: [PATCH 066/188] Enable 10-bit inverse transform ASM --- build.rs | 1 + src/asm/shared/transform/inverse.rs | 2 - src/asm/x86/transform/inverse.rs | 143 ++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 2 deletions(-) diff --git a/build.rs b/build.rs index bf24c0775b..658b2b264c 100644 --- a/build.rs +++ b/build.rs @@ -89,6 +89,7 @@ fn build_nasm_files() { "src/x86/ipred_sse.asm", "src/x86/itx_avx2.asm", "src/x86/itx_sse.asm", + "src/x86/itx16_avx2.asm", "src/x86/looprestoration_avx2.asm", "src/x86/looprestoration16_avx2.asm", "src/x86/mc_avx2.asm", diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs index b639178c24..7641cef966 100644 --- a/src/asm/shared/transform/inverse.rs +++ b/src/asm/shared/transform/inverse.rs @@ -14,7 +14,6 @@ use crate::util::*; pub type InvTxfmFunc = unsafe extern fn(*mut u8, libc::ptrdiff_t, *mut i16, i32); -#[cfg(asm_neon)] pub type InvTxfmHBDFunc = unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32); @@ -46,7 +45,6 @@ pub fn call_inverse_func( } } -#[cfg(asm_neon)] pub fn call_inverse_hbd_func( func: InvTxfmHBDFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize, diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs index f33140af42..c4212e81f6 100644 --- a/src/asm/x86/transform/inverse.rs +++ b/src/asm/x86/transform/inverse.rs @@ -36,6 +36,21 @@ pub fn inverse_transform_add( ); } } + PixelType::U16 if bd == 10 => { + if let Some(func) = INV_TXFM_HBD_FNS[cpu.as_index()] + [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)] + { + return call_inverse_hbd_func( + func, + input, + output, + eob, + tx_size.width(), + tx_size.height(), + bd, + ); + } + } PixelType::U16 => {} }; @@ -73,6 +88,37 @@ macro_rules! decl_itx_fns { }; } +macro_rules! decl_itx_hbd_fns { + // Takes a 2d list of tx types for W and H + ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr, + $OPT_LOWER:ident, $OPT_UPPER:ident) => { + paste::item! { + // For each tx type, declare an function for the current WxH + $( + $( + extern { + // Note: type1 and type2 are flipped + fn []( + dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, + eob: i32, + ); + } + )* + )* + // Create a lookup table for the tx types declared above + const []: [Option; TX_TYPES] = { + let mut out: [Option; 16] = [None; 16]; + $( + $( + out[get_tx_type_idx($ENUM)] = Some([]); + )* + )* + out + }; + } + }; +} + macro_rules! create_wxh_tables { // Create a lookup table for each cpu feature ([$([$(($W:expr, $H:expr)),*]),*], $OPT_LOWER:ident, $OPT_UPPER:ident) => { @@ -98,6 +144,31 @@ macro_rules! create_wxh_tables { }; } +macro_rules! create_wxh_hbd_tables { + // Create a lookup table for each cpu feature + ([$([$(($W:expr, $H:expr)),*]),*], $OPT_LOWER:ident, $OPT_UPPER:ident) => { + paste::item! 
{ + const []: [[Option; TX_TYPES]; 32] = { + let mut out: [[Option; TX_TYPES]; 32] = [[None; TX_TYPES]; 32]; + // For each dimension, add an entry to the table + $( + $( + out[get_tx_size_idx(TxSize::[])] = []; + )* + )* + out + }; + } + }; + + // Loop through cpu features + ($DIMS:tt, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { + $( + create_wxh_hbd_tables!($DIMS, $OPT_LOWER, $OPT_UPPER); + )* + }; +} + macro_rules! impl_itx_fns { ($TYPES:tt, $W:expr, $H:expr, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( @@ -169,6 +240,78 @@ cpu_function_lookup_table!( [SSSE3, AVX2] ); +macro_rules! impl_itx_hbd_fns { + + ($TYPES:tt, $W:expr, $H:expr, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { + $( + decl_itx_hbd_fns!($TYPES, $W, $H, $OPT_LOWER, $OPT_UPPER); + )* + }; + + // Loop over a list of dimensions + ($TYPES_VALID:tt, [$(($W:expr, $H:expr)),*], $OPT:tt) => { + $( + impl_itx_hbd_fns!($TYPES_VALID, $W, $H, $OPT); + )* + }; + + ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt, + $TYPES84:tt, $DIMS84:tt, $OPT:tt) => { + // Make 2d list of tx types for each set of dimensions. Each set of + // dimensions uses a superset of the previous set of tx types. + impl_itx_hbd_fns!([$TYPES64], $DIMS64, $OPT); + impl_itx_hbd_fns!([$TYPES64, $TYPES32], $DIMS32, $OPT); + impl_itx_hbd_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16, $OPT); + impl_itx_hbd_fns!( + [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT + ); + + // Pool all of the dimensions together to create a table for each cpu + // feature level. + create_wxh_hbd_tables!( + [$DIMS64, $DIMS32, $DIMS16, $DIMS84], $OPT + ); + }; +} + +impl_itx_hbd_fns!( + // 64x + [(TxType::DCT_DCT, dct, dct)], + [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], + // 32x + [(TxType::IDTX, identity, identity)], + [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], + // 16x16 + [ + (TxType::DCT_ADST, dct, adst), + (TxType::ADST_DCT, adst, dct), + (TxType::DCT_FLIPADST, dct, flipadst), + (TxType::FLIPADST_DCT, flipadst, dct), + (TxType::V_DCT, dct, identity), + (TxType::H_DCT, identity, dct), + (TxType::ADST_ADST, adst, adst), + (TxType::ADST_FLIPADST, adst, flipadst), + (TxType::FLIPADST_ADST, flipadst, adst), + (TxType::FLIPADST_FLIPADST, flipadst, flipadst) + ], + [(16, 16)], + // 8x, 4x and 16x (minus 16x16) + [ + (TxType::V_ADST, adst, identity), + (TxType::H_ADST, identity, adst), + (TxType::V_FLIPADST, flipadst, identity), + (TxType::H_FLIPADST, identity, flipadst) + ], + [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)], + [(avx2, AVX2)] +); + +cpu_function_lookup_table!( + INV_TXFM_HBD_FNS: [[[Option; TX_TYPES]; 32]], + default: [[None; TX_TYPES]; 32], + [AVX2] +); + #[cfg(test)] mod test { use super::*; From c64b4c6b998bad451e90b2f577bf539de198ef22 Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Thu, 13 May 2021 10:29:16 -0400 Subject: [PATCH 067/188] Enable 10-bit intra prediction ASM --- build.rs | 1 + src/asm/aarch64/predict.rs | 4 +- src/asm/x86/predict.rs | 416 ++++++++++++++++++++++++++----------- 3 files changed, 299 insertions(+), 122 deletions(-) diff --git a/build.rs b/build.rs index 658b2b264c..9cab0eecaa 100644 --- a/build.rs +++ b/build.rs @@ -87,6 +87,7 @@ fn build_nasm_files() { let asm_files = &[ "src/x86/ipred_avx2.asm", "src/x86/ipred_sse.asm", + "src/x86/ipred16_avx2.asm", "src/x86/itx_avx2.asm", "src/x86/itx_sse.asm", "src/x86/itx16_avx2.asm", diff --git a/src/asm/aarch64/predict.rs b/src/asm/aarch64/predict.rs index 49a21e8fd6..563ad72eee 100644 --- a/src/asm/aarch64/predict.rs 
+++ b/src/asm/aarch64/predict.rs @@ -78,7 +78,7 @@ macro_rules! decl_cfl_pred_fn { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, - width: libc::c_int, height: libc::c_int, ac: *const u8, + width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, ); )* @@ -99,7 +99,7 @@ macro_rules! decl_cfl_pred_hbd_fn { $( fn $f( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, - width: libc::c_int, height: libc::c_int, ac: *const u16, + width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, bit_depth_max: libc::c_int, ); )* diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index 16500a17e4..db16a7b163 100644 --- a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -16,7 +16,7 @@ use crate::tiling::PlaneRegionMut; use crate::transform::TxSize; use crate::util::Aligned; use crate::Pixel; -use std::mem::size_of; +use v_frame::pixel::PixelType; macro_rules! decl_angular_ipred_fn { ($($f:ident),+) => { @@ -56,6 +56,36 @@ decl_angular_ipred_fn! { rav1e_ipred_paeth_ssse3 } +macro_rules! decl_angular_ipred_hbd_fn { + ($($f:ident),+) => { + extern { + $( + fn $f( + dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, + width: libc::c_int, height: libc::c_int, angle: libc::c_int, + max_width: libc::c_int, max_height: libc::c_int, + bit_depth_max: libc::c_int, + ); + )* + } + }; +} + +decl_angular_ipred_hbd_fn! { + rav1e_ipred_dc_16bpc_avx2, + rav1e_ipred_dc_128_16bpc_avx2, + rav1e_ipred_dc_left_16bpc_avx2, + rav1e_ipred_dc_top_16bpc_avx2, + rav1e_ipred_v_16bpc_avx2, + rav1e_ipred_h_16bpc_avx2, + rav1e_ipred_z1_16bpc_avx2, + rav1e_ipred_z3_16bpc_avx2, + rav1e_ipred_smooth_16bpc_avx2, + rav1e_ipred_smooth_v_16bpc_avx2, + rav1e_ipred_smooth_h_16bpc_avx2, + rav1e_ipred_paeth_16bpc_avx2 +} + // For z2 prediction, we need to provide extra parameters, dx and dy, which indicate // the distance between the predicted block's top-left pixel and the frame's edge. // It is required for the intra edge filtering process. @@ -65,6 +95,12 @@ extern { width: libc::c_int, height: libc::c_int, angle: libc::c_int, dx: libc::c_int, dy: libc::c_int, ); + + fn rav1e_ipred_z2_16bpc_avx2( + dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, + width: libc::c_int, height: libc::c_int, angle: libc::c_int, + dx: libc::c_int, dy: libc::c_int, bit_depth_max: libc::c_int, + ); } macro_rules! decl_cfl_pred_fn { @@ -73,7 +109,7 @@ macro_rules! decl_cfl_pred_fn { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, - width: libc::c_int, height: libc::c_int, ac: *const u8, + width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, ); )* @@ -92,6 +128,27 @@ decl_cfl_pred_fn! { rav1e_ipred_cfl_top_ssse3 } +macro_rules! decl_cfl_pred_hbd_fn { + ($($f:ident),+) => { + extern { + $( + fn $f( + dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, + width: libc::c_int, height: libc::c_int, ac: *const i16, + alpha: libc::c_int, bit_depth_max: libc::c_int, + ); + )* + } + }; +} + +decl_cfl_pred_hbd_fn! 
{ + rav1e_ipred_cfl_16bpc_avx2, + rav1e_ipred_cfl_128_16bpc_avx2, + rav1e_ipred_cfl_left_16bpc_avx2, + rav1e_ipred_cfl_top_16bpc_avx2 +} + #[inline(always)] pub fn dispatch_predict_intra( mode: PredictionMode, variant: PredictionVariant, @@ -106,139 +163,258 @@ pub fn dispatch_predict_intra( ); }; - if size_of::() != 1 { - return call_rust(dst); - } - unsafe { - let dst_ptr = dst.data_ptr_mut() as *mut _; - let stride = dst.plane_cfg.stride as libc::ptrdiff_t; - let edge_ptr = - edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _; + let stride = T::to_asm_stride(dst.plane_cfg.stride) as libc::ptrdiff_t; let w = tx_size.width() as libc::c_int; let h = tx_size.height() as libc::c_int; let angle = angle as libc::c_int; - if cpu >= CpuFeatureLevel::AVX2 { - match mode { - PredictionMode::DC_PRED => { - (match variant { - PredictionVariant::NONE => rav1e_ipred_dc_128_avx2, - PredictionVariant::LEFT => rav1e_ipred_dc_left_avx2, - PredictionVariant::TOP => rav1e_ipred_dc_top_avx2, - PredictionVariant::BOTH => rav1e_ipred_dc_avx2, - })(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::V_PRED if angle == 90 => { - rav1e_ipred_v_avx2(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::H_PRED if angle == 180 => { - rav1e_ipred_h_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + match T::type_enum() { + PixelType::U8 => { + let dst_ptr = dst.data_ptr_mut() as *mut _; + let edge_ptr = + edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _; + if cpu >= CpuFeatureLevel::AVX2 { + match mode { + PredictionMode::DC_PRED => { + (match variant { + PredictionVariant::NONE => rav1e_ipred_dc_128_avx2, + PredictionVariant::LEFT => rav1e_ipred_dc_left_avx2, + PredictionVariant::TOP => rav1e_ipred_dc_top_avx2, + PredictionVariant::BOTH => rav1e_ipred_dc_avx2, + })(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::V_PRED if angle == 90 => { + rav1e_ipred_v_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::H_PRED if angle == 180 => { + rav1e_ipred_h_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::V_PRED + | PredictionMode::H_PRED + | PredictionMode::D45_PRED + | PredictionMode::D135_PRED + | PredictionMode::D113_PRED + | PredictionMode::D157_PRED + | PredictionMode::D203_PRED + | PredictionMode::D67_PRED => { + let (enable_ief, ief_smooth_filter) = + if let Some(params) = ief_params { + ( + true as libc::c_int, + params.use_smooth_filter() as libc::c_int, + ) + } else { + (false as libc::c_int, false as libc::c_int) + }; + + // dav1d assembly uses the unused integer bits to hold IEF parameters + let angle_arg = + angle | (enable_ief << 10) | (ief_smooth_filter << 9); + + // From dav1d, bw and bh are the frame width and height rounded to 8px units + let (bw, bh) = ( + ((dst.plane_cfg.width + 7) >> 3) << 3, + ((dst.plane_cfg.height + 7) >> 3) << 3, + ); + // From dav1d, dx and dy are the distance from the predicted block to the frame edge + let (dx, dy) = ( + (bw as isize - dst.rect().x as isize) as libc::c_int, + (bh as isize - dst.rect().y as isize) as libc::c_int, + ); + + if angle <= 90 { + rav1e_ipred_z1_avx2( + dst_ptr, stride, edge_ptr, w, h, angle_arg, + ); + } else if angle < 180 { + rav1e_ipred_z2_avx2( + dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, + ); + } else { + rav1e_ipred_z3_avx2( + dst_ptr, stride, edge_ptr, w, h, angle_arg, + ); + } + } + PredictionMode::SMOOTH_PRED => { + rav1e_ipred_smooth_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::SMOOTH_V_PRED => { + 
rav1e_ipred_smooth_v_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, + ); + } + PredictionMode::SMOOTH_H_PRED => { + rav1e_ipred_smooth_h_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, + ); + } + PredictionMode::PAETH_PRED => { + rav1e_ipred_paeth_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::UV_CFL_PRED => { + let ac_ptr = ac.as_ptr() as *const _; + (match variant { + PredictionVariant::NONE => rav1e_ipred_cfl_128_avx2, + PredictionVariant::LEFT => rav1e_ipred_cfl_left_avx2, + PredictionVariant::TOP => rav1e_ipred_cfl_top_avx2, + PredictionVariant::BOTH => rav1e_ipred_cfl_avx2, + })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); + } + _ => call_rust(dst), + } + } else if cpu >= CpuFeatureLevel::SSSE3 { + match mode { + PredictionMode::DC_PRED => { + (match variant { + PredictionVariant::NONE => rav1e_ipred_dc_128_ssse3, + PredictionVariant::LEFT => rav1e_ipred_dc_left_ssse3, + PredictionVariant::TOP => rav1e_ipred_dc_top_ssse3, + PredictionVariant::BOTH => rav1e_ipred_dc_ssse3, + })(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::V_PRED if angle == 90 => { + rav1e_ipred_v_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::H_PRED if angle == 180 => { + rav1e_ipred_h_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::SMOOTH_PRED => { + rav1e_ipred_smooth_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::SMOOTH_V_PRED => { + rav1e_ipred_smooth_v_ssse3( + dst_ptr, stride, edge_ptr, w, h, angle, + ); + } + PredictionMode::SMOOTH_H_PRED => { + rav1e_ipred_smooth_h_ssse3( + dst_ptr, stride, edge_ptr, w, h, angle, + ); + } + PredictionMode::PAETH_PRED => { + rav1e_ipred_paeth_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + } + PredictionMode::UV_CFL_PRED => { + let ac_ptr = ac.as_ptr() as *const _; + (match variant { + PredictionVariant::NONE => rav1e_ipred_cfl_128_ssse3, + PredictionVariant::LEFT => rav1e_ipred_cfl_left_ssse3, + PredictionVariant::TOP => rav1e_ipred_cfl_top_ssse3, + PredictionVariant::BOTH => rav1e_ipred_cfl_ssse3, + })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); + } + _ => call_rust(dst), + } } - PredictionMode::V_PRED - | PredictionMode::H_PRED - | PredictionMode::D45_PRED - | PredictionMode::D135_PRED - | PredictionMode::D113_PRED - | PredictionMode::D157_PRED - | PredictionMode::D203_PRED - | PredictionMode::D67_PRED => { - let (enable_ief, ief_smooth_filter) = - if let Some(params) = ief_params { + } + PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 => { + let dst_ptr = dst.data_ptr_mut() as *mut _; + let edge_ptr = + edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _; + let bd_max = (1 << bit_depth) - 1; + match mode { + PredictionMode::DC_PRED => { + (match variant { + PredictionVariant::NONE => rav1e_ipred_dc_128_16bpc_avx2, + PredictionVariant::LEFT => rav1e_ipred_dc_left_16bpc_avx2, + PredictionVariant::TOP => rav1e_ipred_dc_top_16bpc_avx2, + PredictionVariant::BOTH => rav1e_ipred_dc_16bpc_avx2, + })( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max + ); + } + PredictionMode::V_PRED if angle == 90 => { + rav1e_ipred_v_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, + ); + } + PredictionMode::H_PRED if angle == 180 => { + rav1e_ipred_h_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, + ); + } + PredictionMode::V_PRED + | PredictionMode::H_PRED + | PredictionMode::D45_PRED + | PredictionMode::D135_PRED + | PredictionMode::D113_PRED + | PredictionMode::D157_PRED + | PredictionMode::D203_PRED + | 
PredictionMode::D67_PRED => { + let (enable_ief, ief_smooth_filter) = if let Some(params) = + ief_params + { (true as libc::c_int, params.use_smooth_filter() as libc::c_int) } else { (false as libc::c_int, false as libc::c_int) }; - // dav1d assembly uses the unused integer bits to hold IEF parameters - let angle_arg = - angle | (enable_ief << 10) | (ief_smooth_filter << 9); - - // From dav1d, bw and bh are the frame width and height rounded to 8px units - let (bw, bh) = ( - ((dst.plane_cfg.width + 7) >> 3) << 3, - ((dst.plane_cfg.height + 7) >> 3) << 3, - ); - // From dav1d, dx and dy are the distance from the predicted block to the frame edge - let (dx, dy) = ( - (bw as isize - dst.rect().x as isize) as libc::c_int, - (bh as isize - dst.rect().y as isize) as libc::c_int, - ); - - if angle <= 90 { - rav1e_ipred_z1_avx2(dst_ptr, stride, edge_ptr, w, h, angle_arg); - } else if angle < 180 { - rav1e_ipred_z2_avx2( - dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, + // dav1d assembly uses the unused integer bits to hold IEF parameters + let angle_arg = + angle | (enable_ief << 10) | (ief_smooth_filter << 9); + + // From dav1d, bw and bh are the frame width and height rounded to 8px units + let (bw, bh) = ( + ((dst.plane_cfg.width + 7) >> 3) << 3, + ((dst.plane_cfg.height + 7) >> 3) << 3, ); - } else { - rav1e_ipred_z3_avx2(dst_ptr, stride, edge_ptr, w, h, angle_arg); + // From dav1d, dx and dy are the distance from the predicted block to the frame edge + let (dx, dy) = ( + (bw as isize - dst.rect().x as isize) as libc::c_int, + (bh as isize - dst.rect().y as isize) as libc::c_int, + ); + + if angle <= 90 { + rav1e_ipred_z1_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, + ); + } else if angle < 180 { + rav1e_ipred_z2_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, bd_max, + ); + } else { + rav1e_ipred_z3_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, + ); + } } + PredictionMode::SMOOTH_PRED => { + rav1e_ipred_smooth_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, + ); + } + PredictionMode::SMOOTH_V_PRED => { + rav1e_ipred_smooth_v_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, + ); + } + PredictionMode::SMOOTH_H_PRED => { + rav1e_ipred_smooth_h_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, + ); + } + PredictionMode::PAETH_PRED => { + rav1e_ipred_paeth_16bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, + ); + } + PredictionMode::UV_CFL_PRED => { + let ac_ptr = ac.as_ptr() as *const _; + (match variant { + PredictionVariant::NONE => rav1e_ipred_cfl_128_16bpc_avx2, + PredictionVariant::LEFT => rav1e_ipred_cfl_left_16bpc_avx2, + PredictionVariant::TOP => rav1e_ipred_cfl_top_16bpc_avx2, + PredictionVariant::BOTH => rav1e_ipred_cfl_16bpc_avx2, + })( + dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle, bd_max + ); + } + _ => call_rust(dst), } - PredictionMode::SMOOTH_PRED => { - rav1e_ipred_smooth_avx2(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::SMOOTH_V_PRED => { - rav1e_ipred_smooth_v_avx2(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::SMOOTH_H_PRED => { - rav1e_ipred_smooth_h_avx2(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::PAETH_PRED => { - rav1e_ipred_paeth_avx2(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::UV_CFL_PRED => { - let ac_ptr = ac.as_ptr() as *const _; - (match variant { - PredictionVariant::NONE => rav1e_ipred_cfl_128_avx2, - PredictionVariant::LEFT => 
rav1e_ipred_cfl_left_avx2, - PredictionVariant::TOP => rav1e_ipred_cfl_top_avx2, - PredictionVariant::BOTH => rav1e_ipred_cfl_avx2, - })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); - } - _ => call_rust(dst), - } - } else if cpu >= CpuFeatureLevel::SSSE3 { - match mode { - PredictionMode::DC_PRED => { - (match variant { - PredictionVariant::NONE => rav1e_ipred_dc_128_ssse3, - PredictionVariant::LEFT => rav1e_ipred_dc_left_ssse3, - PredictionVariant::TOP => rav1e_ipred_dc_top_ssse3, - PredictionVariant::BOTH => rav1e_ipred_dc_ssse3, - })(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::V_PRED if angle == 90 => { - rav1e_ipred_v_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::H_PRED if angle == 180 => { - rav1e_ipred_h_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::SMOOTH_PRED => { - rav1e_ipred_smooth_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::SMOOTH_V_PRED => { - rav1e_ipred_smooth_v_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::SMOOTH_H_PRED => { - rav1e_ipred_smooth_h_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::PAETH_PRED => { - rav1e_ipred_paeth_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); - } - PredictionMode::UV_CFL_PRED => { - let ac_ptr = ac.as_ptr() as *const _; - (match variant { - PredictionVariant::NONE => rav1e_ipred_cfl_128_ssse3, - PredictionVariant::LEFT => rav1e_ipred_cfl_left_ssse3, - PredictionVariant::TOP => rav1e_ipred_cfl_top_ssse3, - PredictionVariant::BOTH => rav1e_ipred_cfl_ssse3, - })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); - } - _ => call_rust(dst), } - } else { - call_rust(dst); + _ => call_rust(dst), } } } From 7f2e0adf9b9d03882b071ea0c19cf8b0605fb640 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Fri, 14 May 2021 14:47:45 -0400 Subject: [PATCH 068/188] Only write frame CDEF params when intrabc disabled. 
--- src/header.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/header.rs b/src/header.rs index 8c4d19b780..c045a35afb 100644 --- a/src/header.rs +++ b/src/header.rs @@ -991,7 +991,7 @@ impl UncompressedHeader for BitWriter { fn write_frame_cdef( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { - if fi.sequence.enable_cdef { + if fi.sequence.enable_cdef && !fi.allow_intrabc { assert!(fi.cdef_damping >= 3); assert!(fi.cdef_damping <= 6); self.write(2, fi.cdef_damping - 3)?; From 4ae3983729460be82eb3392a1b87bea985d40bf9 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Sat, 15 May 2021 23:53:20 +0900 Subject: [PATCH 069/188] CI: Add rav1e-ch.exe binary to pre-release deployment --- .github/workflows/deploy.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 2d28dffecd..5594fbb7f1 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -89,6 +89,12 @@ jobs: 7z a "$ZIP_PREFIX-${{ matrix.conf }}.zip" ` "C:\usr\rav1e-windows-${{ matrix.conf }}-sdk" + - name: Build rav1e-ch (unstable) + if: matrix.conf == 'msvc' + env: + RUSTFLAGS: "-C target-feature=+avx2,+fma" + run: cargo build --release --features=unstable,channel-api --bin=rav1e-ch + - name: Upload rav1e msvc binary if: matrix.conf == 'msvc' uses: actions/upload-artifact@v2 @@ -108,6 +114,12 @@ jobs: with: path: rav1e-${{ steps.tagName.outputs.version }}-windows-${{ matrix.conf }}.zip + - name: Upload rav1e-ch msvc binary (unstable) + if: matrix.conf == 'msvc' + uses: actions/upload-artifact@v2 + with: + path: target/release/rav1e-ch.exe + linux-binaries: strategy: matrix: @@ -310,6 +322,7 @@ jobs: files: | Cargo.lock rav1e.exe + rav1e-ch.exe rav1e-linux.tar.gz rav1e-aarch64-linux.tar.gz rav1e-macos.zip @@ -327,6 +340,7 @@ jobs: files: | Cargo.lock rav1e.exe + rav1e-ch.exe rav1e-linux.tar.gz rav1e-aarch64-linux.tar.gz rav1e-macos.zip From 7b1d05ae3d48bf1216b418d9a1ceab8654c05884 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Fri, 14 May 2021 17:28:21 -0400 Subject: [PATCH 070/188] Pull check for allowed intrabc out of write_lrf(). 
--- src/context/frame_header.rs | 162 ++++++++++++++++++------------------ src/encoder.rs | 5 +- 2 files changed, 82 insertions(+), 85 deletions(-) diff --git a/src/context/frame_header.rs b/src/context/frame_header.rs index 6e8d5db40c..711f52a74c 100644 --- a/src/context/frame_header.rs +++ b/src/context/frame_header.rs @@ -162,102 +162,98 @@ impl<'a> ContextWriter<'a> { self.fc.count_lrf_switchable(w, rs, filter, pli) } - pub fn write_lrf( - &mut self, w: &mut W, fi: &FrameInvariants, - rs: &mut TileRestorationStateMut, sbo: TileSuperBlockOffset, pli: usize, + pub fn write_lrf( + &mut self, w: &mut W, rs: &mut TileRestorationStateMut, + sbo: TileSuperBlockOffset, pli: usize, ) { - if !fi.allow_intrabc { - // TODO: also disallow if lossless - let rp = &mut rs.planes[pli]; - if let Some(filter) = rp.restoration_unit(sbo, true).map(|ru| ru.filter) - { - match filter { - RestorationFilter::None => match rp.rp_cfg.lrf_type { - RESTORE_WIENER => { - let cdf = &mut self.fc.lrf_wiener_cdf; - symbol_with_update!(self, w, 0, cdf, 2); - } + let rp = &mut rs.planes[pli]; + if let Some(filter) = rp.restoration_unit(sbo, true).map(|ru| ru.filter) { + match filter { + RestorationFilter::None => match rp.rp_cfg.lrf_type { + RESTORE_WIENER => { + let cdf = &mut self.fc.lrf_wiener_cdf; + symbol_with_update!(self, w, 0, cdf, 2); + } + RESTORE_SGRPROJ => { + let cdf = &mut self.fc.lrf_sgrproj_cdf; + symbol_with_update!(self, w, 0, cdf, 2); + } + RESTORE_SWITCHABLE => { + let cdf = &mut self.fc.lrf_switchable_cdf; + symbol_with_update!(self, w, 0, cdf, 3); + } + RESTORE_NONE => {} + _ => unreachable!(), + }, + RestorationFilter::Sgrproj { set, xqd } => { + match rp.rp_cfg.lrf_type { RESTORE_SGRPROJ => { let cdf = &mut self.fc.lrf_sgrproj_cdf; - symbol_with_update!(self, w, 0, cdf, 2); + symbol_with_update!(self, w, 1, cdf, 2); } RESTORE_SWITCHABLE => { + // Does *not* write 'RESTORE_SGRPROJ' let cdf = &mut self.fc.lrf_switchable_cdf; - symbol_with_update!(self, w, 0, cdf, 3); + symbol_with_update!(self, w, 2, cdf, 3); } - RESTORE_NONE => {} _ => unreachable!(), - }, - RestorationFilter::Sgrproj { set, xqd } => { - match rp.rp_cfg.lrf_type { - RESTORE_SGRPROJ => { - let cdf = &mut self.fc.lrf_sgrproj_cdf; - symbol_with_update!(self, w, 1, cdf, 2); - } - RESTORE_SWITCHABLE => { - // Does *not* write 'RESTORE_SGRPROJ' - let cdf = &mut self.fc.lrf_switchable_cdf; - symbol_with_update!(self, w, 2, cdf, 3); - } - _ => unreachable!(), - } - w.literal(SGRPROJ_PARAMS_BITS, set as u32); - for i in 0..2 { - let s = SGRPROJ_PARAMS_S[set as usize][i]; - let min = SGRPROJ_XQD_MIN[i] as i32; - let max = SGRPROJ_XQD_MAX[i] as i32; - if s > 0 { - w.write_signed_subexp_with_ref( - xqd[i] as i32, - min, - max + 1, - SGRPROJ_PRJ_SUBEXP_K, - rp.sgrproj_ref[i] as i32, - ); - rp.sgrproj_ref[i] = xqd[i]; + } + w.literal(SGRPROJ_PARAMS_BITS, set as u32); + for i in 0..2 { + let s = SGRPROJ_PARAMS_S[set as usize][i]; + let min = SGRPROJ_XQD_MIN[i] as i32; + let max = SGRPROJ_XQD_MAX[i] as i32; + if s > 0 { + w.write_signed_subexp_with_ref( + xqd[i] as i32, + min, + max + 1, + SGRPROJ_PRJ_SUBEXP_K, + rp.sgrproj_ref[i] as i32, + ); + rp.sgrproj_ref[i] = xqd[i]; + } else { + // Nothing written, just update the reference + if i == 0 { + assert!(xqd[i] == 0); + rp.sgrproj_ref[0] = 0; } else { - // Nothing written, just update the reference - if i == 0 { - assert!(xqd[i] == 0); - rp.sgrproj_ref[0] = 0; - } else { - rp.sgrproj_ref[1] = 95; // LOL at spec. The result is always 95. - } + rp.sgrproj_ref[1] = 95; // LOL at spec. 
The result is always 95. } } } - RestorationFilter::Wiener { coeffs } => { - match rp.rp_cfg.lrf_type { - RESTORE_WIENER => { - let cdf = &mut self.fc.lrf_wiener_cdf; - symbol_with_update!(self, w, 1, cdf, 2); - } - RESTORE_SWITCHABLE => { - // Does *not* write 'RESTORE_WIENER' - let cdf = &mut self.fc.lrf_switchable_cdf; - symbol_with_update!(self, w, 1, cdf, 3); - } - _ => unreachable!(), + } + RestorationFilter::Wiener { coeffs } => { + match rp.rp_cfg.lrf_type { + RESTORE_WIENER => { + let cdf = &mut self.fc.lrf_wiener_cdf; + symbol_with_update!(self, w, 1, cdf, 2); } - for pass in 0..2 { - let first_coeff = if pli == 0 { - 0 - } else { - assert!(coeffs[pass][0] == 0); - 1 - }; - for i in first_coeff..3 { - let min = WIENER_TAPS_MIN[i] as i32; - let max = WIENER_TAPS_MAX[i] as i32; - w.write_signed_subexp_with_ref( - coeffs[pass][i] as i32, - min, - max + 1, - (i + 1) as u8, - rp.wiener_ref[pass][i] as i32, - ); - rp.wiener_ref[pass][i] = coeffs[pass][i]; - } + RESTORE_SWITCHABLE => { + // Does *not* write 'RESTORE_WIENER' + let cdf = &mut self.fc.lrf_switchable_cdf; + symbol_with_update!(self, w, 1, cdf, 3); + } + _ => unreachable!(), + } + for pass in 0..2 { + let first_coeff = if pli == 0 { + 0 + } else { + assert!(coeffs[pass][0] == 0); + 1 + }; + for i in first_coeff..3 { + let min = WIENER_TAPS_MIN[i] as i32; + let max = WIENER_TAPS_MAX[i] as i32; + w.write_signed_subexp_with_ref( + coeffs[pass][i] as i32, + min, + max + 1, + (i + 1) as u8, + rp.wiener_ref[pass][i] as i32, + ); + rp.wiener_ref[pass][i] = coeffs[pass][i]; } } } diff --git a/src/encoder.rs b/src/encoder.rs index c821f97714..08b22075af 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -3166,13 +3166,14 @@ fn check_lf_queue( } } // write LRF information - if fi.sequence.enable_restoration { + if !fi.allow_intrabc && fi.sequence.enable_restoration { + // TODO: also disallow if lossless for pli in 0..planes { if qe.lru_index[pli] != -1 && last_lru_coded[pli] < qe.lru_index[pli] { last_lru_coded[pli] = qe.lru_index[pli]; - cw.write_lrf(w, fi, &mut ts.restoration, qe.sbo, pli); + cw.write_lrf(w, &mut ts.restoration, qe.sbo, pli); } } } From 44f5033670c75132d0c5a1e3dcc18ac2329c86be Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sat, 8 May 2021 09:24:30 +0200 Subject: [PATCH 071/188] Make the SceneChange stage constructor use the EncoderConfig directly --- src/api/channel/by_gop.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/api/channel/by_gop.rs b/src/api/channel/by_gop.rs index f79e30a197..c87cc0277a 100644 --- a/src/api/channel/by_gop.rs +++ b/src/api/channel/by_gop.rs @@ -10,6 +10,7 @@ use crate::api::channel::data::*; use crate::api::config::*; use crate::api::util::*; +use crate::api::EncoderConfig; use crate::api::InterConfig; use crossbeam::channel::*; @@ -34,7 +35,7 @@ impl SubGop { } */ -// Extra +// TODO: Make the detector logic fitting the model struct SceneChange { frames: u64, pyramid_size: usize, @@ -43,20 +44,17 @@ struct SceneChange { } impl SceneChange { - fn new( - pyramid_size: usize, min_key_frame_interval: u64, - max_key_frame_interval: u64, - ) -> Self { + fn new(pyramid_size: usize, enc: &EncoderConfig) -> Self { Self { frames: 0, pyramid_size, - min_key_frame_interval, - max_key_frame_interval, + min_key_frame_interval: enc.min_key_frame_interval, + max_key_frame_interval: enc.max_key_frame_interval, } } // Tell where to split the lookahead - // 7 is currently hardcoded, it should be a parameter + // fn split( &mut self, lookahead: &[Arc>], ) -> 
Option<(usize, bool)> { @@ -241,11 +239,7 @@ impl Config { inter_cfg.keyframe_lookahead_distance() as usize + 1; let (send, recv) = bounded(lookahead_distance * 2); - let mut sc = SceneChange::new( - lookahead_distance, - self.enc.min_key_frame_interval, - self.enc.max_key_frame_interval, - ); + let mut sc = SceneChange::new(lookahead_distance, &self.enc); s.spawn_fifo(move |_| { let mut lookahead = Vec::new(); From 0fa8df4d8cd8bc8b0b862c725975017c771ae1b8 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Mon, 17 May 2021 18:39:00 +0200 Subject: [PATCH 072/188] Save the aspect ratio --- src/bin/rav1e-ch.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index 8707162610..f2fb9c09ab 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -420,6 +420,10 @@ fn run() -> Result<(), error::CliError> { ), ) .with_colorspace(y4m_dec.get_colorspace()) + .with_pixel_aspect(y4m::Ratio { + num: video_info.sample_aspect_ratio.num as usize, + den: video_info.sample_aspect_ratio.den as usize, + }) .write_header(rec) .unwrap() }); @@ -432,6 +436,7 @@ fn run() -> Result<(), error::CliError> { cli.enc.width = video_info.width; cli.enc.height = video_info.height; cli.enc.bit_depth = video_info.bit_depth; + cli.enc.sample_aspect_ratio = video_info.sample_aspect_ratio; cli.enc.chroma_sampling = video_info.chroma_sampling; cli.enc.chroma_sample_position = video_info.chroma_sample_position; From d71ac663c1be54d480b726a083d4a8d358fc6dbd Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 11 May 2021 17:44:54 +0200 Subject: [PATCH 073/188] Handle smaller than intended frame set --- src/scenechange/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 7ea001acc8..b8a5f57054 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -88,6 +88,10 @@ impl SceneChangeDetector { // Find the distance to the previous keyframe. let distance = input_frameno - previous_keyframe; + if frame_set.len() < 2 { + return false; + } + // Handle minimum and maximum key frame intervals. 
if distance < self.encoder_config.min_key_frame_interval { return false; From df8b712b068826ee7fec04bb0b5829289fd4c6fa Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sat, 8 May 2021 14:00:22 +0200 Subject: [PATCH 074/188] Wire in the current scenechange Co-Authored-by: David Michael Barr --- src/api/channel/by_gop.rs | 70 +++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/src/api/channel/by_gop.rs b/src/api/channel/by_gop.rs index c87cc0277a..d7024e90bf 100644 --- a/src/api/channel/by_gop.rs +++ b/src/api/channel/by_gop.rs @@ -16,7 +16,10 @@ use crate::api::InterConfig; use crossbeam::channel::*; // use crate::encoder::*; +use crate::config::CpuFeatureLevel; +use crate::encoder::Sequence; use crate::frame::*; +use crate::scenechange::SceneChangeDetector; use crate::util::Pixel; use std::collections::BTreeMap; @@ -37,20 +40,26 @@ impl SubGop { // TODO: Make the detector logic fitting the model struct SceneChange { - frames: u64, + frames: usize, pyramid_size: usize, - min_key_frame_interval: u64, - max_key_frame_interval: u64, + processed: u64, + last_keyframe: u64, + detector: SceneChangeDetector, } impl SceneChange { fn new(pyramid_size: usize, enc: &EncoderConfig) -> Self { - Self { - frames: 0, + let seq = Arc::new(Sequence::new(enc)); + + let detector = SceneChangeDetector::new( + *enc, + CpuFeatureLevel::default(), pyramid_size, - min_key_frame_interval: enc.min_key_frame_interval, - max_key_frame_interval: enc.max_key_frame_interval, - } + seq, + true, + ); + + Self { frames: 0, pyramid_size, processed: 0, last_keyframe: 0, detector } } // Tell where to split the lookahead @@ -58,24 +67,27 @@ impl SceneChange { fn split( &mut self, lookahead: &[Arc>], ) -> Option<(usize, bool)> { - self.frames += 1; + self.processed += 1; - let new_gop = if self.frames < self.min_key_frame_interval { - false - } else if self.frames >= self.max_key_frame_interval { - self.frames = 0; - true - } else { - false - }; + let new_gop = self.detector.analyze_next_frame( + &lookahead[self.frames..], + self.processed, + self.last_keyframe, + ); - let len = lookahead.len(); + if new_gop { + self.last_keyframe = self.processed; + } - if len > self.pyramid_size { - Some((self.pyramid_size, new_gop)) + if self.frames > self.pyramid_size { + self.frames -= self.pyramid_size + 1; + Some((self.pyramid_size + 2, new_gop)) } else if new_gop { - Some((len - 1, true)) + let frames = self.frames + 1; + self.frames = 0; + Some((frames, true)) } else { + self.frames += 1; None } } @@ -235,11 +247,11 @@ impl Config { &self, s: &rayon::ScopeFifo, r: Receiver>, ) -> Receiver> { let inter_cfg = InterConfig::new(&self.enc); - let lookahead_distance = - inter_cfg.keyframe_lookahead_distance() as usize + 1; + let pyramid_size = inter_cfg.keyframe_lookahead_distance() as usize; + let lookahead_distance = pyramid_size + 1 + 1; let (send, recv) = bounded(lookahead_distance * 2); - let mut sc = SceneChange::new(lookahead_distance, &self.enc); + let mut sc = SceneChange::new(pyramid_size, &self.enc); s.spawn_fifo(move |_| { let mut lookahead = Vec::new(); @@ -261,11 +273,13 @@ impl Config { } } - while let Some((split_pos, end_gop)) = sc.split(&lookahead) { - let rem = lookahead.split_off(split_pos); - let _ = send.send(SubGop { frames: lookahead, end_gop }); + while lookahead.len() > lookahead_distance { + if let Some((split_pos, end_gop)) = sc.split(&lookahead) { + let rem = lookahead.split_off(split_pos); + let _ = send.send(SubGop { frames: lookahead, end_gop }); - lookahead = 
rem; + lookahead = rem; + } } if !lookahead.is_empty() { From 6ee4243f075a5b6ca7f5d6ecee1946eb220ac545 Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Tue, 18 May 2021 10:50:12 -0400 Subject: [PATCH 075/188] Fix clippy warnings from Rust nightly These will be enabled by default in a future stable Rust, so fixing them now keeps us ahead of the ball. Relevant lints: - used `assert_eq!` with a literal bool - all if blocks contain the same code at the end - called `is_none()` after searching an `Iterator` with `find` --- src/api/internal.rs | 8 +++----- src/header.rs | 11 +++++------ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index bf3ddc2270..5cd09cbca7 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -433,14 +433,12 @@ impl ContextInner { let mut data_location = PathBuf::new(); if env::var_os("RAV1E_DATA_PATH").is_some() { data_location.push(&env::var_os("RAV1E_DATA_PATH").unwrap()); - fs::create_dir_all(data_location.clone()).unwrap(); - data_location } else { data_location.push(&env::current_dir().unwrap()); data_location.push(".lookahead_data"); - fs::create_dir_all(data_location.clone()).unwrap(); - data_location } + fs::create_dir_all(&data_location).unwrap(); + data_location } fn build_frame_properties( @@ -1001,7 +999,7 @@ impl ContextInner { let mut unique_indices = ArrayVec::<_, 3>::new(); for (mv_index, &rec_index) in fi.ref_frames.iter().enumerate() { - if unique_indices.iter().find(|&&(_, r)| r == rec_index).is_none() { + if !unique_indices.iter().any(|&(_, r)| r == rec_index) { unique_indices.push((mv_index, rec_index)); } } diff --git a/src/header.rs b/src/header.rs index c045a35afb..e650b16ac7 100644 --- a/src/header.rs +++ b/src/header.rs @@ -273,8 +273,8 @@ impl UncompressedHeader for BitWriter { self.write_bit(fi.sequence.reduced_still_picture_hdr)?; // reduced_still_picture_header if fi.sequence.reduced_still_picture_hdr { - assert_eq!(fi.sequence.timing_info_present, false); - assert_eq!(fi.sequence.decoder_model_info_present_flag, false); + assert!(!fi.sequence.timing_info_present); + assert!(!fi.sequence.decoder_model_info_present_flag); assert_eq!(fi.sequence.operating_points_cnt_minus_1, 0); assert_eq!(fi.sequence.operating_point_idc[0], 0); self.write(5, 31)?; // level @@ -564,11 +564,10 @@ impl UncompressedHeader for BitWriter { // Inter frame info goes here if fi.intra_only { assert!(fi.refresh_frame_flags != ALL_REF_FRAMES_MASK); - self.write(REF_FRAMES as u32, fi.refresh_frame_flags)?; } else { // TODO: This should be set once inter mode is used - self.write(REF_FRAMES as u32, fi.refresh_frame_flags)?; } + self.write(REF_FRAMES as u32, fi.refresh_frame_flags)?; }; if (!fi.intra_only || fi.refresh_frame_flags != ALL_REF_FRAMES_MASK) { @@ -1067,8 +1066,8 @@ impl UncompressedHeader for BitWriter { if segmentation.enabled { if fi.primary_ref_frame == PRIMARY_REF_NONE { - assert_eq!(segmentation.update_map, true); - assert_eq!(segmentation.update_data, true); + assert!(segmentation.update_map); + assert!(segmentation.update_data); } else { self.write_bit(segmentation.update_map)?; if segmentation.update_map { From 6810bf19c1e7d0727bd701c0609f6e3aef5b7559 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sun, 16 May 2021 17:21:45 +0200 Subject: [PATCH 076/188] Point the users to the cargo-c installation instructions Should address concerns such as #2733. 
--- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e7074aa66a..04bae6226a 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,8 @@ cargo install cargo-c cargo cinstall --release ``` +Please refer to the cargo-c [installation](https://github.com/lu-zero/cargo-c#installation) instructions. + ## Usage ### Compressing video Input videos must be in [y4m format](https://wiki.multimedia.cx/index.php/YUV4MPEG2). The monochrome color format is not supported. From e19269752954379f4aa1033010e84489e85ef73d Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Thu, 20 May 2021 19:21:55 -0400 Subject: [PATCH 077/188] Fix the draw-mvs.py script (#2739) --- tools/draw-mvs.py | 3 +++ 1 file changed, 3 insertions(+) mode change 100644 => 100755 tools/draw-mvs.py diff --git a/tools/draw-mvs.py b/tools/draw-mvs.py old mode 100644 new mode 100755 index 49d21b7a7f..2ffb89a328 --- a/tools/draw-mvs.py +++ b/tools/draw-mvs.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import struct import sys @@ -13,6 +15,7 @@ def draw_mvs(prefix): + prefix = str(prefix).rjust(10, '0') with open(prefix + "-mvs.bin", "rb") as f: contents = f.read() From 7a857391354a24f9715dbc97b531365a747853ab Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 24 May 2021 23:06:56 +0900 Subject: [PATCH 078/188] CI: Update libdav1d to 0.9.0-dmo1 --- .github/workflows/rav1e.yml | 6 +++--- .travis/install-dav1d.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index 71ebceb617..cd2c8c8eb8 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -145,11 +145,11 @@ jobs: matrix.conf == 'grcov-coveralls' || matrix.conf == 'fuzz' || matrix.conf == 'no-asm-tests' env: LINK: https://www.deb-multimedia.org/pool/main/d/dav1d-dmo - DAV1D_VERSION: 0.8.2-dmo1 + DAV1D_VERSION: 0.9.0-dmo1 DAV1D_DEV_SHA256: >- - 04d30fc34056467b91a627563c61b9a0046a2e084bb649791cd31887a6c76d8e + ce6bd5c710d287306d3b6d45fa3843b35231da37f4d18d82ff24ba088916cfae DAV1D_LIB_SHA256: >- - 0c3debb3a926e10009503e639dddcfd4082ed6e012340ca49682b738c243dedc + 54c8ff504523101b96fa994963fb24b7104221a5b011f8b525baac8260640994 run: | echo "$LINK/libdav1d-dev_${DAV1D_VERSION}_amd64.deb" >> DEBS echo "$LINK/libdav1d5_${DAV1D_VERSION}_amd64.deb" >> DEBS diff --git a/.travis/install-dav1d.sh b/.travis/install-dav1d.sh index deddb47689..6366236856 100755 --- a/.travis/install-dav1d.sh +++ b/.travis/install-dav1d.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -DAV1D_VERSION="0.8.2-dmo1" +DAV1D_VERSION="0.9.0-dmo1" PKG_URL="https://www.deb-multimedia.org/pool/main/d/dav1d-dmo" case "$ARCH" in @@ -17,10 +17,10 @@ curl -O "$PKG_URL/libdav1d-dev_${DAV1D_VERSION}_$ARCH.deb" \ -O "$PKG_URL/libdav1d5_${DAV1D_VERSION}_$ARCH.deb" sha256sum --check --ignore-missing < Date: Mon, 24 May 2021 19:45:21 +0300 Subject: [PATCH 079/188] Change scene change tests naming --- src/api/test.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/api/test.rs b/src/api/test.rs index 5cbeaa6ea1..9bb36aae56 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -1394,7 +1394,7 @@ fn output_frameno_no_scene_change_at_short_flash(flash_at: u64) { } #[test] -fn output_frameno_no_scene_change_at_max_len_flash() { +fn output_frameno_no_scene_change_at_flash_smaller_than_max_len_flash() { // Test output_frameno configurations when there's a multi-frame flash // with length equal to the max flash length @@ -1454,7 +1454,7 @@ fn 
output_frameno_no_scene_change_at_max_len_flash() { } #[test] -fn output_frameno_scene_change_past_max_len_flash() { +fn output_frameno_scene_change_before_flash_longer_than_max_flash_len() { // Test output_frameno configurations when there's a multi-frame flash // with length greater than the max flash length @@ -1521,7 +1521,7 @@ fn output_frameno_scene_change_past_max_len_flash() { } #[test] -fn output_frameno_no_scene_change_at_multiple_flashes() { +fn output_frameno_scene_change_after_multiple_flashes() { // Test output_frameno configurations when there are multiple consecutive flashes let mut ctx = setup_encoder::( From a3badce755385612b3b7bb41052bbaddf8a7debd Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Wed, 26 May 2021 08:15:26 -0400 Subject: [PATCH 080/188] Resolve an unnecessary clone of rec data (#2741) This change requires a bit of explanation, because the way Arc::make_mut works is slightly complicated. First, I found that there are a couple of places in the code where we want to clone the rec data so we can keep a copy of it prior to changes being made. This was working coincidentally, because we were cloning the Arc containing it (meaning, the inner data was not cloned, we only made a new reference to it), but a later call to Arc::make_mut caused the original data to be cloned and preserved. I changed the locations where we want to clone the data into explicit clones of the inner data, and changed the make_mut calls to get_mut to verify that this is the case (get_mut ensures that there is only one reference to an Arc and fails if that is not the case). I was then investigating the way we handle making mutable references to the rec data within TileStateMut. It turns out that there, as well, Arc::make_mut is implicitly cloning the inner data for each tile that is generated. Fortunately, because we never need that data after the tile is done, this doesn't impact the result of the encode, but it does have the effect of cloning data that we shouldn't need to clone. I attempted to change these calls to Arc::get_mut_unchecked--we will need to be able to have multiple mutable references, as many as there are tiles, but these shouldn't overlap, making it safe for our use case. (As a side note, get_mut_unchecked is an unstable API, so we can't actually use it in rav1e master yet.) This change caused one of the other calls to get_mut to panic, meaning we had some other live reference to the rec data somewhere. I traced this back to the .cloned() call on internal.rs:1141, and avoided that clone by removing the frame data from the map so we can modify it, then putting it back on the map when we're done (this is a technique we use in at least one other place in the codebase). The end result of this change specifically is a slight reduction in peak memory usage. This also unblocked the ability to use get_mut_unchecked, which if it is ever stabilized, that additional change would reduce peak memory usage by a more significant amount (about 8% with 4 tiles, with more savings for higher tile counts). 
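For reference, the difference between the two accessors discussed above can be
seen in a minimal standalone snippet (plain std, not rav1e code):

    use std::sync::Arc;

    fn main() {
        // make_mut silently clones the inner data when the Arc is shared,
        // so the other handle keeps seeing the original value.
        let mut a = Arc::new(vec![1, 2, 3]);
        let b = Arc::clone(&a);
        Arc::make_mut(&mut a).push(4);
        assert_eq!(*a, vec![1, 2, 3, 4]);
        assert_eq!(*b, vec![1, 2, 3]);

        // get_mut never clones: it returns Some only while the Arc is the
        // sole owner, and None as soon as another handle exists.
        let mut c = Arc::new(vec![1, 2, 3]);
        assert!(Arc::get_mut(&mut c).is_some());
        let d = Arc::clone(&c);
        assert!(Arc::get_mut(&mut c).is_none());
        drop(d);
    }

Relying on get_mut therefore only works once no stray clone of the rec Arc is
still alive, which is what this patch ensures.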
--- src/api/internal.rs | 5 ++--- src/capi.rs | 2 +- src/encoder.rs | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index 5cd09cbca7..07625471f8 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -1138,7 +1138,7 @@ impl ContextInner { return Err(EncoderStatus::NotReady); } let mut frame_data = - self.frame_data.get(&cur_output_frameno).cloned().unwrap(); + self.frame_data.remove(&cur_output_frameno).unwrap(); let fti = frame_data.fi.get_frame_subtype(); let qps = self.rc_state.select_qi( self, @@ -1200,13 +1200,12 @@ impl ContextInner { let planes = if frame_data.fi.sequence.chroma_sampling == Cs400 { 1 } else { 3 }; - Arc::make_mut(&mut frame_data.fs.rec).pad( + Arc::get_mut(&mut frame_data.fs.rec).unwrap().pad( frame_data.fi.width, frame_data.fi.height, planes, ); - // TODO avoid the clone by having rec Arc. let (rec, source) = if frame_data.fi.show_frame { (Some(frame_data.fs.rec.clone()), Some(frame_data.fs.input.clone())) } else { diff --git a/src/capi.rs b/src/capi.rs index 6e988af529..8a3c81c8da 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -1116,7 +1116,7 @@ fn rav1e_frame_fill_plane_internal( f: &mut Arc>, plane: c_int, data_slice: &[u8], stride: ptrdiff_t, bytewidth: c_int, ) { - let input = Arc::make_mut(f); + let input = Arc::get_mut(f).unwrap(); input.planes[plane as usize].copy_from_raw_u8( data_slice, stride as usize, diff --git a/src/encoder.rs b/src/encoder.rs index 08b22075af..564d78d7ed 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -3028,7 +3028,7 @@ fn encode_tile_group( if fi.sequence.enable_restoration { // Until the loop filters are better pipelined, we'll need to keep // around a copy of both the deblocked and cdeffed frame. - let deblocked_frame = fs.rec.clone(); + let deblocked_frame = (*fs.rec).clone(); /* TODO: Don't apply if lossless */ if fi.sequence.enable_cdef { @@ -3038,14 +3038,14 @@ fn encode_tile_group( } /* TODO: Don't apply if lossless */ fs.restoration.lrf_filter_frame( - Arc::make_mut(&mut fs.rec), + Arc::get_mut(&mut fs.rec).unwrap(), &deblocked_frame, fi, ); } else { /* TODO: Don't apply if lossless */ if fi.sequence.enable_cdef { - let deblocked_frame = fs.rec.clone(); + let deblocked_frame = (*fs.rec).clone(); let ts = &mut fs.as_tile_state_mut(); let rec = &mut ts.rec; cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec); @@ -3450,7 +3450,7 @@ pub fn encode_show_existing_frame( let map_idx = fi.frame_to_show_map_idx as usize; if let Some(ref rec) = fi.rec_buffer.frames[map_idx] { - let fs_rec = Arc::make_mut(&mut fs.rec); + let fs_rec = Arc::get_mut(&mut fs.rec).unwrap(); let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; for p in 0..planes { From 245f0d39b2d83773b7be9d7385fa48ac94895f4f Mon Sep 17 00:00:00 2001 From: Hayden Date: Fri, 28 May 2021 18:08:48 -0400 Subject: [PATCH 081/188] Support more Apple aarch64 systems --- build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.rs b/build.rs index 9cab0eecaa..35946ec028 100644 --- a/build.rs +++ b/build.rs @@ -80,7 +80,7 @@ fn build_nasm_files() { config_file.write(b" %define PIC 1\n").unwrap(); config_file.write(b" %define STACK_ALIGNMENT 16\n").unwrap(); config_file.write(b" %define HAVE_AVX512ICL 1\n").unwrap(); - if env::var("CARGO_CFG_TARGET_OS").unwrap() == "macos" { + if env::var("CARGO_CFG_TARGET_VENDOR").unwrap() == "apple" { config_file.write(b" %define PREFIX 1\n").unwrap(); } @@ -148,7 +148,7 @@ fn build_asm_files() 
{ let dest_path = Path::new(&out_dir).join("config.h"); let mut config_file = File::create(&dest_path).unwrap(); - if env::var("CARGO_CFG_TARGET_OS").unwrap() == "macos" { + if env::var("CARGO_CFG_TARGET_VENDOR").unwrap() == "apple" { config_file.write(b" #define PREFIX 1\n").unwrap(); } config_file.write(b" #define PRIVATE_PREFIX rav1e_\n").unwrap(); From 975def25863668252c81248f2109226b30a2cee6 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Mon, 31 May 2021 13:11:54 +0200 Subject: [PATCH 082/188] Move the cpu feature info line Address #2747 --- src/bin/rav1e-ch.rs | 4 ++-- src/bin/rav1e.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index f2fb9c09ab..73b01f9be2 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -315,8 +315,6 @@ fn main() -> Result<(), Box> { buffer_size: 4096, }); - info!("CPU Feature Level: {}", CpuFeatureLevel::default()); - run().map_err(|e| { error::print_error(&e); Box::new(e) as Box @@ -510,6 +508,8 @@ fn run() -> Result<(), error::CliError> { cli.enc.time_base.num as usize, ); + info!("CPU Feature Level: {}", CpuFeatureLevel::default()); + info!( "Using y4m decoder: {}x{}p @ {}/{} fps, {}, {}-bit", video_info.width, diff --git a/src/bin/rav1e.rs b/src/bin/rav1e.rs index 74fd111921..19f36de8e4 100644 --- a/src/bin/rav1e.rs +++ b/src/bin/rav1e.rs @@ -306,8 +306,6 @@ fn main() -> Result<(), Box> { buffer_size: 4096, }); - info!("CPU Feature Level: {}", CpuFeatureLevel::default()); - run().map_err(|e| { error::print_error(&e); Box::new(e) as Box @@ -494,6 +492,8 @@ fn run() -> Result<(), error::CliError> { cli.enc.time_base.num as usize, ); + info!("CPU Feature Level: {}", CpuFeatureLevel::default()); + info!( "Using y4m decoder: {}x{}p @ {}/{} fps, {}, {}-bit", video_info.width, From 1c8ad5b10244f58dfd7b7a2ebd5a3c4eff4d8678 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 3 Jun 2021 22:53:36 +0900 Subject: [PATCH 083/188] Use the x86 8-bit CDEF assembly on edges The current workaround to use 16-bit intermediates also means that we do not need to fall back to the Rust implementation on edges. 
--- src/asm/x86/cdef.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs index 7439396287..fcfd5e04bc 100644 --- a/src/asm/x86/cdef.rs +++ b/src/asm/x86/cdef.rs @@ -64,7 +64,7 @@ pub(crate) unsafe fn cdef_filter_block( }; // TODO: handle padding in the fast path - if edges != CDEF_HAVE_ALL { + if edges != CDEF_HAVE_ALL && matches!(T::type_enum(), PixelType::U16) { call_rust(dst); } else { #[cfg(feature = "check_asm")] From 685c394962298de73cf228b7e5536c7b2d199501 Mon Sep 17 00:00:00 2001 From: Vibhoothi Date: Mon, 14 Jun 2021 21:53:35 +0100 Subject: [PATCH 084/188] CI: Update libaom to 3.1.1-dmo0~bpo10+1 This commit also introduces libvmaf and libvmaf-dev as a new dependency as libaom-3.1.1 onwards it is required --- .github/workflows/rav1e.yml | 22 +++++++++++++++++++--- .travis/install-aom.sh | 20 ++++++++++++++------ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index cd2c8c8eb8..d259ceefae 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -123,17 +123,33 @@ jobs: run: | echo "$LINK/nasm_${NASM_VERSION}_amd64.deb" >> DEBS echo "$NASM_SHA256 nasm_${NASM_VERSION}_amd64.deb" >> CHECKSUMS + - name: Add libvmaf + if: > + matrix.conf == '1.51.0-tests' || matrix.conf == 'aom-tests' || + matrix.conf == 'grcov-coveralls' + env: + LINK: https://www.deb-multimedia.org/pool/main/v/vmaf-dmo + LIBVMAF_VERSION: 2.1.1-dmo0~bpo10+3 + LIBVMAF_SHA256: >- + d9c2e708399af37cac52090453aadf8ad4311c3ed40addf02c3158c7f7a705a6 + LIBVMAF_DEV_SHA256: >- + 957a21b4a4b3cea4b27ab068fc41e85776d19c69fbede949b6eecd9631aa697f + run: | + echo "$LINK/libvmaf1_${LIBVMAF_VERSION}_amd64.deb" >> DEBS + echo "$LINK/libvmaf-dev_${LIBVMAF_VERSION}_amd64.deb" >> DEBS + echo "$LIBVMAF_SHA256 libvmaf1_${LIBVMAF_VERSION}_amd64.deb" >> CHECKSUMS + echo "$LIBVMAF_DEV_SHA256 libvmaf-dev_${LIBVMAF_VERSION}_amd64.deb" >> CHECKSUMS - name: Add aom if: > matrix.conf == '1.51.0-tests' || matrix.conf == 'aom-tests' || matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/a/aom-dmo - AOM_VERSION: 3.1.0-dmo0~bpo10+1 + AOM_VERSION: 3.1.1-dmo0~bpo10+1 AOM_DEV_SHA256: >- - 1a78ad10714c0cd9ed2324007369c20a5d9047d98e7098f932f48edb01056f36 + 940a6419b794edf69c25d54765fe8eee83c8df7a566c49051d7bf1a30d8bf9d8 AOM_LIB_SHA256: >- - a2e1f0a0ab1be6b93a1582d68b869d27e88c1fb8df7fae7bd793ebc0322c76a2 + d17d988dcef38e5e00d304ee153e31201a547293112c1b8879a167e01b59851c run: | echo "$LINK/libaom-dev_${AOM_VERSION}_amd64.deb" >> DEBS echo "$LINK/libaom3_${AOM_VERSION}_amd64.deb" >> DEBS diff --git a/.travis/install-aom.sh b/.travis/install-aom.sh index ae0a192877..f4894ed87f 100755 --- a/.travis/install-aom.sh +++ b/.travis/install-aom.sh @@ -1,21 +1,29 @@ #!/bin/bash set -ex -AOM_VERSION="3.1.0-dmo0~bpo10+1" +AOM_VERSION="3.1.1-dmo0~bpo10+1" +LIBVMAF_VERSION="2.1.1-dmo0~bpo10+3" PKG_URL="https://www.deb-multimedia.org/pool/main/a/aom-dmo" +LIBVMAF_URL="https://www.deb-multimedia.org/pool/main/v/vmaf-dmo" ARCH="arm64" cd "$DEPS_DIR" -[ -f "libaom-dev_${AOM_VERSION}_${ARCH}.deb" ] && +[ -f "libvmaf-dev_${LIBVMAF_VERSION}_${ARCH}.deb" ] && [ -f "libvmaf1_${LIBVMAF_VERSION}_${ARCH}.deb" ] && [ -f "libaom-dev_${AOM_VERSION}_${ARCH}.deb" ] && [ -f "libaom2_${AOM_VERSION}_${ARCH}.deb" ] || -curl -O "${PKG_URL}/libaom-dev_${AOM_VERSION}_${ARCH}.deb" \ +curl -O "${LIBVMAF_URL}/libvmaf1_${LIBVMAF_VERSION}_${ARCH}.deb" \ + -O "${LIBVMAF_URL}/libvmaf-dev_${LIBVMAF_VERSION}_${ARCH}.deb" \ 
+ -O "${PKG_URL}/libaom-dev_${AOM_VERSION}_${ARCH}.deb" \ -O "${PKG_URL}/libaom3_${AOM_VERSION}_${ARCH}.deb" sha256sum --check --ignore-missing < Date: Wed, 23 Jun 2021 17:45:09 +0100 Subject: [PATCH 085/188] CI: Update to libaom to 3.1.1-dmo0~bpo10+2 --- .github/workflows/rav1e.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index d259ceefae..632d8c2ea0 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -145,11 +145,11 @@ jobs: matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/a/aom-dmo - AOM_VERSION: 3.1.1-dmo0~bpo10+1 + AOM_VERSION: 3.1.1-dmo0~bpo10+2 AOM_DEV_SHA256: >- - 940a6419b794edf69c25d54765fe8eee83c8df7a566c49051d7bf1a30d8bf9d8 + 881ec275a01169378e19c1779fec3fb5d4b80e1afe61d8b576a7c66419702a90 AOM_LIB_SHA256: >- - d17d988dcef38e5e00d304ee153e31201a547293112c1b8879a167e01b59851c + a2a75cda5eacbddad70c508a7113d0ba572aad29934bb31905773e9adb555413 run: | echo "$LINK/libaom-dev_${AOM_VERSION}_amd64.deb" >> DEBS echo "$LINK/libaom3_${AOM_VERSION}_amd64.deb" >> DEBS From c17ce82f8b59995b962a81edcc6f51db71da98c7 Mon Sep 17 00:00:00 2001 From: Thomas Daede Date: Wed, 7 Jul 2021 07:47:16 -0700 Subject: [PATCH 086/188] Move #daala to Libera.Chat. (#2744) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Move #daala channel to Libera.Chat. Co-authored-by: David Michael Barr Co-authored-by: Raphaël Zumer Co-authored-by: Vibhoothi Co-authored-by: Josh Holmer Co-authored-by: David Michael Barr --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 04bae6226a..af8de661ea 100644 --- a/README.md +++ b/README.md @@ -178,5 +178,4 @@ Find a full list in feature-table in [`Cargo.toml`](Cargo.toml) Please read our guide to [contributing to rav1e](CONTRIBUTING.md). ## Getting in Touch -Come chat with us on the IRC channel #daala on Freenode! If you don't have IRC set -up you can easily connect from your [web browser](http://webchat.freenode.net/?channels=%23daala). +Come chat with us on the IRC channel #daala on [Libera.Chat](https://libera.chat/)! You can also use a [web client](https://web.libera.chat/?channel=#daala) to join with a web browser. From f553646d70fba8e265d436103a73520eb7adec8c Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 8 Jul 2021 13:39:59 +0900 Subject: [PATCH 087/188] Initialise residual when less than the transform width is visible The input stride for forward transforms did not match the output stride of residual computation in this case. Extend the residual stride to the transform width and zero the non-visible portion. Fixes #2662. Fixes #2757. --- src/encoder.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/encoder.rs b/src/encoder.rs index 564d78d7ed..1ccf8c8310 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -1209,12 +1209,20 @@ pub fn encode_tx_block( residual, &ts.input_tile.planes[p].subregion(area), &rec.subregion(area), - visible_tx_w, + tx_size.width(), visible_tx_h, ); + if visible_tx_w < tx_size.width() { + for row in residual.chunks_mut(tx_size.width()).take(visible_tx_h) { + for a in &mut row[visible_tx_w..] 
{ + *a = 0; + } + } + } } - let visible_area = visible_tx_w * visible_tx_h; - for a in residual[visible_area..].iter_mut() { + let initialized_area = + if visible_tx_w == 0 { 0 } else { tx_size.width() * visible_tx_h }; + for a in residual[initialized_area..].iter_mut() { *a = 0; } From 8c064f492985ab9f6c1a5be0fa3fc72e9b83d132 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 8 Jul 2021 23:42:45 +0900 Subject: [PATCH 088/188] Fix fall-through of x86 dispatch_predict_intra for CpuFeatureLevel::RUST --- src/asm/x86/predict.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index db16a7b163..3b5bd2e70f 100644 --- a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -170,7 +170,7 @@ pub fn dispatch_predict_intra( let angle = angle as libc::c_int; match T::type_enum() { - PixelType::U8 => { + PixelType::U8 if cpu >= CpuFeatureLevel::SSSE3 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _; From acbfd1b49ce012659efd52f5e6725e7bc21e04f8 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 12 Jul 2021 16:50:09 +0900 Subject: [PATCH 089/188] fuzz: Pass check_asm feature through to rav1e Optionally enables additional verification in the encode and encode_decode fuzzers. Useful for analysis of failure cases. --- fuzz/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index b0914d8743..c108a2b05c 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -5,6 +5,9 @@ version = "0.0.1" authors = ["Automatically generated"] publish = false +[features] +check_asm = ["rav1e/check_asm"] + [package.metadata] cargo-fuzz = true From 36c5a380b5cf5299a1e006631fe3d82350dc33d7 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 12 Jul 2021 16:54:13 +0900 Subject: [PATCH 090/188] x86: Disable HBD asm without 8-bit support for 8-in-16 --- src/asm/x86/cdef.rs | 3 ++- src/asm/x86/mc.rs | 6 ++++-- src/asm/x86/predict.rs | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs index fcfd5e04bc..fd5296cb9a 100644 --- a/src/asm/x86/cdef.rs +++ b/src/asm/x86/cdef.rs @@ -220,7 +220,7 @@ pub(crate) fn cdef_find_dir( call_rust(var) } } - PixelType::U16 => { + PixelType::U16 if coeff_shift > 0 => { if let Some(func) = CDEF_DIR_HBD_FNS[cpu.as_index()] { unsafe { (func)( @@ -234,6 +234,7 @@ pub(crate) fn cdef_find_dir( call_rust(var) } } + _ => call_rust(var), }; #[cfg(feature = "check_asm")] diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index f24128aa54..41897c3742 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -186,7 +186,7 @@ pub fn prep_8tap( None => call_rust(tmp), } } - PixelType::U16 => { + PixelType::U16 if bit_depth > 8 => { match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => unsafe { (func)( @@ -203,6 +203,7 @@ pub fn prep_8tap( None => call_rust(tmp), } } + _ => call_rust(tmp), } #[cfg(feature = "check_asm")] { @@ -237,7 +238,7 @@ pub fn mc_avg( }, None => call_rust(dst), }, - PixelType::U16 => match AVG_HBD_FNS[cpu.as_index()] { + PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] { Some(func) => unsafe { (func)( dst.data_ptr_mut() as *mut _, @@ -251,6 +252,7 @@ pub fn mc_avg( }, None => call_rust(dst), }, + _ => call_rust(dst), } #[cfg(feature = "check_asm")] { diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index 3b5bd2e70f..0883de6aa1 100644 --- 
a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -309,7 +309,7 @@ pub fn dispatch_predict_intra( } } } - PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 => { + PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && bit_depth > 8 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _; From 83742836be24584df56888cedf4dd479512f7ff9 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 13 Jul 2021 16:57:47 +0900 Subject: [PATCH 091/188] arm64: Disable HBD asm without 8-bit support for 8-in-16 --- src/asm/aarch64/cdef.rs | 3 ++- src/asm/aarch64/mc.rs | 9 ++++++--- src/asm/aarch64/predict.rs | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/asm/aarch64/cdef.rs b/src/asm/aarch64/cdef.rs index 77f19c65a1..9fc5b242ef 100644 --- a/src/asm/aarch64/cdef.rs +++ b/src/asm/aarch64/cdef.rs @@ -357,7 +357,7 @@ pub(crate) fn cdef_find_dir( call_rust(var) } } - PixelType::U16 => { + PixelType::U16 if coeff_shift > 0 => { if let Some(func) = CDEF_DIR_HBD_FNS[cpu.as_index()] { unsafe { (func)( @@ -371,6 +371,7 @@ pub(crate) fn cdef_find_dir( call_rust(var) } } + _ => call_rust(var), }; #[cfg(feature = "check_asm")] diff --git a/src/asm/aarch64/mc.rs b/src/asm/aarch64/mc.rs index 7b43a10ab2..e9b50cad61 100644 --- a/src/asm/aarch64/mc.rs +++ b/src/asm/aarch64/mc.rs @@ -119,7 +119,7 @@ pub fn put_8tap( None => call_rust(dst), } } - PixelType::U16 => { + PixelType::U16 if bit_depth > 8 => { match PUT_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => unsafe { (func)( @@ -137,6 +137,7 @@ pub fn put_8tap( None => call_rust(dst), } } + _ => call_rust(dst), } #[cfg(feature = "check_asm")] { @@ -186,7 +187,7 @@ pub fn prep_8tap( None => call_rust(tmp), } } - PixelType::U16 => { + PixelType::U16 if bit_depth > 8 => { match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => unsafe { (func)( @@ -203,6 +204,7 @@ pub fn prep_8tap( None => call_rust(tmp), } } + _ => call_rust(tmp), } #[cfg(feature = "check_asm")] { @@ -237,7 +239,7 @@ pub fn mc_avg( }, None => call_rust(dst), }, - PixelType::U16 => match AVG_HBD_FNS[cpu.as_index()] { + PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] { Some(func) => unsafe { (func)( dst.data_ptr_mut() as *mut _, @@ -251,6 +253,7 @@ pub fn mc_avg( }, None => call_rust(dst), }, + _ => call_rust(dst), } #[cfg(feature = "check_asm")] { diff --git a/src/asm/aarch64/predict.rs b/src/asm/aarch64/predict.rs index 563ad72eee..9d22f8e312 100644 --- a/src/asm/aarch64/predict.rs +++ b/src/asm/aarch64/predict.rs @@ -187,7 +187,7 @@ pub fn dispatch_predict_intra( } _ => call_rust(dst), }, - PixelType::U16 => match mode { + PixelType::U16 if bit_depth > 8 => match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_16bpc_neon, @@ -237,6 +237,7 @@ pub fn dispatch_predict_intra( } _ => call_rust(dst), }, + _ => call_rust(dst), } } } From bdee3b951bfda3c02cdee69900439089a6e2ba75 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 13 Jul 2021 18:20:44 +0900 Subject: [PATCH 092/188] fuzz: Add target encode_decode_hbd Share code with encode_decode such that the corpus can be reused. Bit-depth is decoded last so that all other parameters are shared. 
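Illustrative sketch of the corpus-sharing argument (plain Rust, not the actual
fuzz harness; the field names and byte layout below are made up): because the
parameters are decoded from the raw fuzz input in a fixed order, appending the
bit-depth choice after all existing fields leaves the interpretation of the
shared prefix unchanged, so old corpus entries still decode to the same shared
parameters.

    fn decode(bytes: &[u8], with_bit_depth: bool) -> (u8, u8, Option<u8>) {
        let w = bytes.first().copied().unwrap_or(0); // shared parameter
        let h = bytes.get(1).copied().unwrap_or(0); // shared parameter
        let bd = if with_bit_depth {
            // new field, consumed only after every shared field
            Some([8, 10, 12][bytes.get(2).copied().unwrap_or(0) as usize % 3])
        } else {
            None
        };
        (w, h, bd)
    }

    fn main() {
        let corpus_entry = [5u8, 9, 1];
        let (w0, h0, _) = decode(&corpus_entry, false); // encode_decode
        let (w1, h1, _) = decode(&corpus_entry, true); // encode_decode_hbd
        assert_eq!((w0, h0), (w1, h1)); // shared parameters are identical
    }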
--- fuzz/Cargo.toml | 5 +++++ fuzz/fuzz_targets/encode_decode.rs | 2 +- fuzz/fuzz_targets/encode_decode_hbd.rs | 19 +++++++++++++++++++ src/fuzzing.rs | 14 ++++++++++---- 4 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 fuzz/fuzz_targets/encode_decode_hbd.rs diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index c108a2b05c..d82c11f432 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -28,6 +28,11 @@ name = "encode_decode" path = "fuzz_targets/encode_decode.rs" required-features = ["rav1e/decode_test_dav1d"] +[[bin]] +name = "encode_decode_hbd" +path = "fuzz_targets/encode_decode_hbd.rs" +required-features = ["rav1e/decode_test_dav1d"] + [[bin]] name = "encode" path = "fuzz_targets/encode.rs" diff --git a/fuzz/fuzz_targets/encode_decode.rs b/fuzz/fuzz_targets/encode_decode.rs index f9aabb37ba..d8041bdb5d 100644 --- a/fuzz/fuzz_targets/encode_decode.rs +++ b/fuzz/fuzz_targets/encode_decode.rs @@ -12,7 +12,7 @@ extern crate rav1e; use rav1e::fuzzing::*; -fuzz_target!(|data: DecodeTestParameters| { +fuzz_target!(|data: DecodeTestParameters| { let _ = pretty_env_logger::try_init(); fuzz_encode_decode(data) diff --git a/fuzz/fuzz_targets/encode_decode_hbd.rs b/fuzz/fuzz_targets/encode_decode_hbd.rs new file mode 100644 index 0000000000..edceecae44 --- /dev/null +++ b/fuzz/fuzz_targets/encode_decode_hbd.rs @@ -0,0 +1,19 @@ +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved +// +// This source code is subject to the terms of the BSD 2 Clause License and +// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +// was not distributed with this source code in the LICENSE file, you can +// obtain it at www.aomedia.org/license/software. If the Alliance for Open +// Media Patent License 1.0 was not distributed with this source code in the +// PATENTS file, you can obtain it at www.aomedia.org/license/patent. + +#![no_main] +#[macro_use] extern crate libfuzzer_sys; +extern crate rav1e; +use rav1e::fuzzing::*; + +fuzz_target!(|data: DecodeTestParameters| { + let _ = pretty_env_logger::try_init(); + + fuzz_encode_decode(data) +}); diff --git a/src/fuzzing.rs b/src/fuzzing.rs index ef6641c826..8189c4784c 100644 --- a/src/fuzzing.rs +++ b/src/fuzzing.rs @@ -7,6 +7,7 @@ // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+use std::marker::PhantomData;
 use std::sync::Arc;
 
 use libfuzzer_sys::arbitrary::{Arbitrary, Error, Unstructured};
@@ -295,7 +296,7 @@ pub fn fuzz_encode(arbitrary: ArbitraryEncoder) {
 }
 
 #[derive(Debug)]
-pub struct DecodeTestParameters {
+pub struct DecodeTestParameters {
   w: usize,
   h: usize,
   speed: usize,
@@ -312,9 +313,10 @@ pub struct DecodeTestParameters {
   tile_cols_log2: usize,
   tile_rows_log2: usize,
   still_picture: bool,
+  pixel: PhantomData,
 }
 
-impl Arbitrary for DecodeTestParameters {
+impl Arbitrary for DecodeTestParameters {
   fn arbitrary(u: &mut Unstructured<'_>) -> Result {
     let mut p = Self {
       w: u.int_in_range(16..=16 + 255)?,
@@ -338,7 +340,11 @@ impl Arbitrary for DecodeTestParameters {
       tile_cols_log2: u.int_in_range(0..=2)?,
       tile_rows_log2: u.int_in_range(0..=2)?,
       still_picture: bool::arbitrary(u)?,
+      pixel: PhantomData,
     };
+    if matches!(T::type_enum(), PixelType::U16) {
+      p.bit_depth = *u.choose(&[8, 10, 12])?;
+    }
     if !p.low_latency {
       p.switch_frame_interval = 0;
     }
@@ -350,10 +356,10 @@ impl Arbitrary for DecodeTestParameters {
 }
 
 #[cfg(feature = "decode_test_dav1d")]
-pub fn fuzz_encode_decode(p: DecodeTestParameters) {
+pub fn fuzz_encode_decode(p: DecodeTestParameters) {
   use crate::test_encode_decode::*;
 
-  let mut dec = get_decoder::("dav1d", p.w, p.h);
+  let mut dec = get_decoder::("dav1d", p.w, p.h);
   dec.encode_decode(
     p.w,
     p.h,

From 7970d35ebb1424358b837c6e19b0cecf3cc6f8df Mon Sep 17 00:00:00 2001
From: Zen <46526140+master-of-zen@users.noreply.github.com>
Date: Wed, 14 Jul 2021 20:33:15 +0300
Subject: [PATCH 093/188] Improve scene detection (#2710)

The current fast scene detection in rav1e is relatively slow compared to other
scene detection methods, and it is prone to reporting false scene changes or
to missing scene changes where they actually occur.

This PR reworks the fast scene detection algorithm, making it faster, better,
and more accurate.

Achieved goals:
- Faster decision making (both fewer and more efficient computations)
- More accurate scene detection, by adjusting the threshold based on previous
  frames
- Frame downscaling for faster decisions
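The adaptive-threshold point above can be pictured with a rough sketch (the
function name, margin factor, and scores below are illustrative assumptions,
not the exact logic landed in this patch): keep recent inter-frame difference
scores and only declare a scene cut when the newest score stands clearly above
both the base threshold and the recent average.

    fn is_scene_cut(recent_scores: &[f64], new_score: f64, base_threshold: f64) -> bool {
        if recent_scores.is_empty() {
            return new_score > base_threshold;
        }
        let mean = recent_scores.iter().sum::<f64>() / recent_scores.len() as f64;
        // Require a clear jump over the local average, not just over the base threshold.
        new_score > base_threshold && new_score > 2.0 * mean
    }

    fn main() {
        assert!(is_scene_cut(&[1.5, 1.8, 1.2], 9.0, 4.0)); // sharp jump -> cut
        assert!(!is_scene_cut(&[6.0, 5.5, 7.0], 6.5, 4.0)); // noisy but steady -> no cut
    }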
--- src/api/channel/by_gop.rs | 11 +- src/api/internal.rs | 3 +- src/scenechange/mod.rs | 495 ++++++++++++++++++++++---------------- 3 files changed, 296 insertions(+), 213 deletions(-) diff --git a/src/api/channel/by_gop.rs b/src/api/channel/by_gop.rs index d7024e90bf..799faf6e55 100644 --- a/src/api/channel/by_gop.rs +++ b/src/api/channel/by_gop.rs @@ -39,15 +39,15 @@ impl SubGop { */ // TODO: Make the detector logic fitting the model -struct SceneChange { +struct SceneChange { frames: usize, pyramid_size: usize, processed: u64, last_keyframe: u64, - detector: SceneChangeDetector, + detector: SceneChangeDetector, } -impl SceneChange { +impl SceneChange { fn new(pyramid_size: usize, enc: &EncoderConfig) -> Self { let seq = Arc::new(Sequence::new(enc)); @@ -56,7 +56,6 @@ impl SceneChange { CpuFeatureLevel::default(), pyramid_size, seq, - true, ); Self { frames: 0, pyramid_size, processed: 0, last_keyframe: 0, detector } @@ -64,9 +63,7 @@ impl SceneChange { // Tell where to split the lookahead // - fn split( - &mut self, lookahead: &[Arc>], - ) -> Option<(usize, bool)> { + fn split(&mut self, lookahead: &[Arc>]) -> Option<(usize, bool)> { self.processed += 1; let new_gop = self.detector.analyze_next_frame( diff --git a/src/api/internal.rs b/src/api/internal.rs index 07625471f8..4dbb474052 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -246,7 +246,7 @@ pub(crate) struct ContextInner { gop_output_frameno_start: BTreeMap, /// Maps `output_frameno` to `gop_input_frameno_start`. pub(crate) gop_input_frameno_start: BTreeMap, - keyframe_detector: SceneChangeDetector, + keyframe_detector: SceneChangeDetector, pub(crate) config: Arc, seq: Arc, pub(crate) rc_state: RCState, @@ -291,7 +291,6 @@ impl ContextInner { CpuFeatureLevel::default(), lookahead_distance, seq.clone(), - true, ), config: Arc::new(*enc), seq, diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index b8a5f57054..76d0d08f1d 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -13,35 +13,41 @@ use crate::cpu_features::CpuFeatureLevel; use crate::encoder::Sequence; use crate::frame::*; use crate::util::{CastFromPrimitive, Pixel}; +use itertools::Itertools; use rust_hawktracer::*; -use std::collections::BTreeSet; use std::sync::Arc; +use std::{cmp, u64}; /// Runs keyframe detection on frames from the lookahead queue. -pub struct SceneChangeDetector { +pub struct SceneChangeDetector { /// Minimum average difference between YUV deltas that will trigger a scene change. - threshold: u64, + threshold: usize, /// Fast scene cut detection mode, uses simple SAD instead of encoder cost estimates. fast_mode: bool, - /// Determine whether or not short scene flashes should be excluded - exclude_scene_flashes: bool, - /// Frames that cannot be marked as keyframes due to the algorithm excluding them. - /// Storing the frame numbers allows us to avoid looking back more than one frame. - excluded_frames: BTreeSet, + /// scaling factor for fast scene detection + scale_factor: usize, + // Frame buffer for scaled frames + frame_buffer: Vec>, + // Deque offset for current + lookahead_offset: usize, + // Start deque offset based on lookahead + deque_offset: usize, + // Scenechange results for adaptive threshold + score_deque: Vec<(f64, f64)>, + /// Number of pixels in scaled frame for fast mode + pixels: usize, /// The bit depth of the video. bit_depth: usize, /// The CPU feature level to be used. 
cpu_feature_level: CpuFeatureLevel, encoder_config: EncoderConfig, - lookahead_distance: usize, sequence: Arc, } -impl SceneChangeDetector { +impl SceneChangeDetector { pub fn new( encoder_config: EncoderConfig, cpu_feature_level: CpuFeatureLevel, lookahead_distance: usize, sequence: Arc, - exclude_scene_flashes: bool, ) -> Self { // This implementation is based on a Python implementation at // https://pyscenedetect.readthedocs.io/en/latest/reference/detection-methods/. @@ -54,20 +60,47 @@ impl SceneChangeDetector { // This may be adjusted later. // // This threshold is only used for the fast scenecut implementation. - const BASE_THRESHOLD: u64 = 12; + // + // Testing shown that default threshold of 12 overallocates keyframes by almost double, + // compared to other scene change implementations + const BASE_THRESHOLD: usize = 12; let bit_depth = encoder_config.bit_depth; let fast_mode = encoder_config.speed_settings.fast_scene_detection || encoder_config.low_latency; + // Scale factor for fast scene detection + let scale_factor = + if fast_mode { detect_scale_factor(&sequence) } else { 1_usize }; + + // Set lookahead offset to 5 if normal lookahead available + let lookahead_offset = if lookahead_distance >= 5 { 5 } else { 0 }; + let deque_offset = lookahead_offset; + + let score_deque = Vec::with_capacity(5 + lookahead_distance); + + // Pixel count for fast scenedetect + let pixels = if fast_mode { + (sequence.max_frame_height as usize / scale_factor) + * (sequence.max_frame_width as usize / scale_factor) + } else { + 1 + }; + + let frame_buffer = + if fast_mode { Vec::with_capacity(2) } else { Vec::new() }; + Self { - threshold: BASE_THRESHOLD * bit_depth as u64 / 8, + threshold: BASE_THRESHOLD * bit_depth / 8, fast_mode, - exclude_scene_flashes, - excluded_frames: BTreeSet::new(), + scale_factor, + frame_buffer, + lookahead_offset, + deque_offset, + score_deque, + pixels, bit_depth, cpu_feature_level, encoder_config, - lookahead_distance, sequence, } } @@ -81,10 +114,13 @@ impl SceneChangeDetector { /// /// This will gracefully handle the first frame in the video as well. #[hawktracer(analyze_next_frame)] - pub fn analyze_next_frame( + pub fn analyze_next_frame( &mut self, frame_set: &[Arc>], input_frameno: u64, previous_keyframe: u64, ) -> bool { + // Use score deque for adaptive threshold for scene cut + // Declare score_deque offset based on lookahead for scene change scores + // Find the distance to the previous keyframe. let distance = input_frameno - previous_keyframe; @@ -92,7 +128,7 @@ impl SceneChangeDetector { return false; } - // Handle minimum and maximum key frame intervals. + // Handle minimum and maximum keyframe intervals. if distance < self.encoder_config.min_key_frame_interval { return false; } @@ -104,215 +140,244 @@ impl SceneChangeDetector { return false; } - if self.exclude_scene_flashes { - self.exclude_scene_flashes(frame_set, input_frameno, previous_keyframe); - } - - self.is_key_frame( - frame_set[0].clone(), - frame_set[1].clone(), - input_frameno, - previous_keyframe, - ) - } + // Initiallization of score deque + // based on frame set length + if self.deque_offset > 0 + && frame_set.len() > self.deque_offset + 1 + && self.score_deque.is_empty() + { + self.initialize_score_deque( + frame_set, + input_frameno, + previous_keyframe, + self.deque_offset, + ); + } else if self.score_deque.is_empty() { + self.initialize_score_deque( + frame_set, + input_frameno, + previous_keyframe, + frame_set.len() - 1, + ); - /// Determines if `current_frame` should be a keyframe. 
- fn is_key_frame( - &self, previous_frame: Arc>, current_frame: Arc>, - current_frameno: u64, previous_keyframe: u64, - ) -> bool { - if self.excluded_frames.contains(¤t_frameno) { - return false; + self.deque_offset = frame_set.len() - 2; + } + // Running single frame comparison and adding it to deque + // Decrease deque offset if there is no new frames + if frame_set.len() > self.deque_offset + 1 { + self.run_comparison( + frame_set[self.deque_offset].clone(), + frame_set[self.deque_offset + 1].clone(), + input_frameno, + previous_keyframe, + ); + } else { + self.deque_offset -= 1; } - let result = self.has_scenecut( - previous_frame, - current_frame, - current_frameno, - previous_keyframe, - ); + // Adaptive scenecut check + let scenecut = self.adaptive_scenecut(); debug!( - "[SC-Detect] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - current_frameno - 1, - current_frameno, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "Scenecut" } else { "No cut" } + "[SC-Detect] Frame {}: I={:4.0} T= {:.0} {}", + input_frameno, + self.score_deque[self.deque_offset].0, + self.score_deque[self.deque_offset].1, + if scenecut { "Scenecut" } else { "No cut" } ); - result.has_scenecut - } - /// Uses lookahead to avoid coding short flashes as scenecuts. - /// Saves excluded frame numbers in `self.excluded_frames`. - fn exclude_scene_flashes( - &mut self, frame_subset: &[Arc>], frameno: u64, - previous_keyframe: u64, - ) { - let lookahead_distance = self.lookahead_distance; - - if frame_subset.len() - 1 < lookahead_distance { - // Don't add a keyframe in the last frame pyramid. - // It's effectively the same as a scene flash, - // and really wasteful for compression. - for frame in frameno..=(frameno + lookahead_distance as u64) { - self.excluded_frames.insert(frame); + if scenecut { + // Clear buffers and deque + self.frame_buffer.clear(); + debug!("[SC-score-deque]{:.0?}", self.score_deque); + self.score_deque.clear(); + } else { + // Keep score deque of 5 backward frames + // and forward frames of lenght of lookahead offset + if self.score_deque.len() > 5 + self.lookahead_offset { + self.score_deque.pop(); } - return; } - // Where A and B are scenes: AAAAAABBBAAAAAA - // If BBB is shorter than lookahead_distance, it is detected as a flash - // and not considered a scenecut. - // - // Search starting with the furthest frame, - // to enable early loop exit if we find a scene flash. - for j in (1..=lookahead_distance).rev() { - let result = self.has_scenecut( - frame_subset[0].clone(), - frame_subset[j].clone(), - frameno - 1 + j as u64, + scenecut + } + + // Initially fill score deque with frame scores + fn initialize_score_deque( + &mut self, frame_set: &[Arc>], input_frameno: u64, + previous_keyframe: u64, init_len: usize, + ) { + for x in 0..init_len { + self.run_comparison( + frame_set[x].clone(), + frame_set[x + 1].clone(), + input_frameno, previous_keyframe, ); - debug!( - "[SF-Detect-1] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - frameno - 1, - frameno - 1 + j as u64, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "No flash" } else { "Scene flash" } - ); - if !result.has_scenecut { - // Any frame in between `0` and `j` cannot be a real scenecut. - for i in 0..=j { - let frameno = frameno + i as u64 - 1; - self.excluded_frames.insert(frameno); - } - // Because all frames in this gap are already excluded, - // exit the loop early as an optimization. 
- break; - } } + } - // Where A-F are scenes: AAAAABBCCDDEEFFFFFF - // If each of BB ... EE are shorter than `lookahead_distance`, they are - // detected as flashes and not considered scenecuts. - // Instead, the first F frame becomes a scenecut. - // If the video ends before F, no frame becomes a scenecut. - for i in 1..lookahead_distance { - let result = self.has_scenecut( - frame_subset[i].clone(), - frame_subset[lookahead_distance].clone(), - frameno - 1 + lookahead_distance as u64, - previous_keyframe, - ); - debug!( - "[SF-Detect-2] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - frameno - 1 + i as u64, - frameno - 1 + lookahead_distance as u64, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "Scene flash" } else { "No flash" } - ); - if result.has_scenecut { - // If the current frame is the frame before a scenecut, it cannot also be the frame of a scenecut. - let frameno = frameno + i as u64 - 1; - self.excluded_frames.insert(frameno); + /// Runs scene change comparison beetween 2 given frames + /// Insert result to start of score deque + fn run_comparison( + &mut self, frame1: Arc>, frame2: Arc>, + input_frameno: u64, previous_keyframe: u64, + ) { + let result = if self.fast_mode { + self.fast_scenecut(frame1, frame2) + } else { + self.cost_scenecut(frame1, frame2, input_frameno, previous_keyframe) + }; + self + .score_deque + .insert(0, (result.inter_cost as f64, result.threshold as f64)); + } + + /// Compares current scene score to adapted threshold based on previous scores + /// Value of current frame is offset by lookahead, if lookahead >=5 + /// Returns true if current scene score is higher than adapted threshold + fn adaptive_scenecut(&mut self) -> bool { + let mut cloned_deque = self.score_deque.to_vec(); + cloned_deque.remove(self.deque_offset); + + let scene_score = self.score_deque[self.deque_offset].0; + let scene_threshold = self.score_deque[self.deque_offset].1; + + if scene_score >= scene_threshold as f64 { + let back_deque = self.score_deque[self.deque_offset + 1..].to_vec(); + let forward_deque = self.score_deque[..self.deque_offset].to_vec(); + let back_over_tr = + back_deque.iter().filter(|(x, y)| x > y).collect_vec(); + + let forward_over_tr = + forward_deque.iter().filter(|(x, y)| x > y).collect_vec(); + + // Check for scenecut after the flashes + // No frames over threshold forward + // and some frames over threshold backward + if !back_over_tr.is_empty() + && forward_over_tr.is_empty() + && back_deque.len() > 1 + && back_over_tr.len() > 1 + { + return true; + } + + // Check for scenecut before flash + // If distance longer than max flash length + if back_over_tr.is_empty() + && forward_over_tr.len() == 1 + && forward_deque[0].0 > forward_deque[0].1 + { + return true; + } + + if !back_over_tr.is_empty() || !forward_over_tr.is_empty() { + return false; } } + + scene_score >= scene_threshold + } + + /// The fast algorithm detects fast cuts using a raw difference + /// in pixel values between the scaled frames. 
+ #[hawktracer(fast_scenecut)] + fn fast_scenecut( + &mut self, frame1: Arc>, frame2: Arc>, + ) -> ScenecutResult { + // Downscaling both frames for comparison + // Moving scaled frames to buffer + if self.frame_buffer.is_empty() { + let frame1_scaled = frame1.planes[0].downscale(self.scale_factor); + self.frame_buffer.push(frame1_scaled); + + let frame2_scaled = frame2.planes[0].downscale(self.scale_factor); + self.frame_buffer.push(frame2_scaled); + } else { + self.frame_buffer.remove(0); + self.frame_buffer.push(frame2.planes[0].downscale(self.scale_factor)); + } + + let delta = + self.delta_in_planes(&self.frame_buffer[0], &self.frame_buffer[1]); + + ScenecutResult { + intra_cost: self.threshold as f64, + threshold: self.threshold as f64, + inter_cost: delta as f64, + } } /// Run a comparison between two frames to determine if they qualify for a scenecut. /// - /// The standard algorithm uses block intra and inter costs + /// Using block intra and inter costs /// to determine which method would be more efficient /// for coding this frame. - /// - /// The fast algorithm detects fast cuts using a raw difference - /// in pixel values between the frames. - /// It does not handle pans well, but the scene flash detection compensates for this - /// in many cases. - fn has_scenecut( + #[hawktracer(cost_scenecut)] + fn cost_scenecut( &self, frame1: Arc>, frame2: Arc>, frameno: u64, previous_keyframe: u64, ) -> ScenecutResult { - if self.fast_mode { - let len = frame2.planes[0].cfg.width * frame2.planes[0].cfg.height; - let delta = self.delta_in_planes(&frame1.planes[0], &frame2.planes[0]); - let threshold = self.threshold * len as u64; - ScenecutResult { - intra_cost: threshold as f64, - threshold: threshold as f64, - inter_cost: delta as f64, - has_scenecut: delta >= threshold, - } + let frame2_ref2 = Arc::clone(&frame2); + let (intra_cost, inter_cost) = crate::rayon::join( + move || { + let intra_costs = estimate_intra_costs( + &*frame2, + self.bit_depth, + self.cpu_feature_level, + ); + intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 + / intra_costs.len() as f64 + }, + move || { + let inter_costs = estimate_inter_costs( + frame2_ref2, + frame1, + self.bit_depth, + self.encoder_config, + self.sequence.clone(), + ); + inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 + / inter_costs.len() as f64 + }, + ); + + // Sliding scale, more likely to choose a keyframe + // as we get farther from the last keyframe. + // Based on x264 scenecut code. + // + // `THRESH_MAX` determines how likely we are + // to choose a keyframe, between 0.0-1.0. + // Higher values mean we are more likely to choose a keyframe. + // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, + // as it appeared to provide the best average compression. + // This also matches the default scenecut threshold in x264. 
+ const THRESH_MAX: f64 = 0.4; + const THRESH_MIN: f64 = THRESH_MAX * 0.25; + let distance_from_keyframe = frameno - previous_keyframe; + let min_keyint = self.encoder_config.min_key_frame_interval; + let max_keyint = self.encoder_config.max_key_frame_interval; + let bias = if distance_from_keyframe <= min_keyint / 4 { + THRESH_MIN / 4.0 + } else if distance_from_keyframe <= min_keyint { + THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 } else { - let frame2_ref2 = Arc::clone(&frame2); - let (intra_cost, inter_cost) = crate::rayon::join( - move || { - let intra_costs = estimate_intra_costs( - &*frame2, - self.bit_depth, - self.cpu_feature_level, - ); - intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 - / intra_costs.len() as f64 - }, - move || { - let inter_costs = estimate_inter_costs( - frame2_ref2, - frame1, - self.bit_depth, - self.encoder_config, - self.sequence.clone(), - ); - inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 - / inter_costs.len() as f64 - }, - ); + THRESH_MIN + + (THRESH_MAX - THRESH_MIN) + * (distance_from_keyframe - min_keyint) as f64 + / (max_keyint - min_keyint) as f64 + }; + let threshold = intra_cost * (1.0 - bias); - // Sliding scale, more likely to choose a keyframe - // as we get farther from the last keyframe. - // Based on x264 scenecut code. - // - // `THRESH_MAX` determines how likely we are - // to choose a keyframe, between 0.0-1.0. - // Higher values mean we are more likely to choose a keyframe. - // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, - // as it appeared to provide the best average compression. - // This also matches the default scenecut threshold in x264. - const THRESH_MAX: f64 = 0.4; - const THRESH_MIN: f64 = THRESH_MAX * 0.25; - let distance_from_keyframe = frameno - previous_keyframe; - let min_keyint = self.encoder_config.min_key_frame_interval; - let max_keyint = self.encoder_config.max_key_frame_interval; - let bias = if distance_from_keyframe <= min_keyint / 4 { - THRESH_MIN / 4.0 - } else if distance_from_keyframe <= min_keyint { - THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 - } else { - THRESH_MIN - + (THRESH_MAX - THRESH_MIN) - * (distance_from_keyframe - min_keyint) as f64 - / (max_keyint - min_keyint) as f64 - }; - let threshold = intra_cost * (1.0 - bias); - - ScenecutResult { - intra_cost, - threshold, - inter_cost, - has_scenecut: inter_cost > threshold, - } - } + ScenecutResult { intra_cost, inter_cost, threshold } } - fn delta_in_planes( - &self, plane1: &Plane, plane2: &Plane, - ) -> u64 { + /// Calculates delta beetween 2 planes + /// returns average for pixel + #[hawktracer(delta_in_planes)] + fn delta_in_planes(&self, plane1: &Plane, plane2: &Plane) -> f64 { let mut delta = 0; + let lines = plane1.rows_iter().zip(plane2.rows_iter()); for (l1, l2) in lines { @@ -320,21 +385,43 @@ impl SceneChangeDetector { .iter() .zip(l2.iter()) .map(|(&p1, &p2)| { - (i16::cast_from(p1) - i16::cast_from(p2)).abs() as u64 + (i16::cast_from(p1) - i16::cast_from(p2)).abs() as u32 }) - .sum::(); - delta += delta_line; + .sum::(); + delta += delta_line as u64; } - delta + delta as f64 / self.pixels as f64 } } +/// Scaling factor for frame in scene detection +fn detect_scale_factor(sequence: &Arc) -> usize { + let small_edge = + cmp::min(sequence.max_frame_height, sequence.max_frame_width) as usize; + let scale_factor = match small_edge { + 0..=240 => 1, + 241..=480 => 2, + 481..=720 => 4, + 721..=1080 => 8, + 1081..=1600 => 16, + 1601..=std::usize::MAX => 32, + _ 
=> 1, + } as usize; + debug!( + "Scene detection scale factor {}, [{},{}] -> [{},{}]", + scale_factor, + sequence.max_frame_width, + sequence.max_frame_height, + sequence.max_frame_width as usize / scale_factor, + sequence.max_frame_height as usize / scale_factor + ); + scale_factor +} + /// This struct primarily exists for returning metrics to the caller -/// for logging debug information. #[derive(Debug, Clone, Copy)] struct ScenecutResult { intra_cost: f64, inter_cost: f64, threshold: f64, - has_scenecut: bool, } From 72e20c5c5678640b2f1ca36b228afe67a5dbdf94 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 22 Jul 2021 17:49:50 +0200 Subject: [PATCH 094/188] Update semver and rustc_version --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7313ad61c1..60f7781e3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -100,7 +100,7 @@ cc = { version = "1.0", optional = true, features = ["parallel"] } # Vendored to remove the dependency on `failure`, # which takes a long time to build. vergen = { version = "3", path = "crates/vergen" } -rustc_version = "0.3" +rustc_version = "0.4" regex = { version = "1", optional = true } [build-dependencies.nasm-rs] @@ -118,7 +118,7 @@ pretty_assertions = "0.7" interpolate_name = "0.2.2" rand = "0.8" rand_chacha = "0.3" -semver = "0.11" +semver = "1.0" [target.'cfg(fuzzing)'.dependencies] arbitrary = "0.4" From 76c3447d8db1c3e254d76c066153542788de3297 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 22 Jul 2021 17:18:07 +0200 Subject: [PATCH 095/188] Add build.rs to the asm freshness hash --- build.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index 35946ec028..ce98f7af2f 100644 --- a/build.rs +++ b/build.rs @@ -40,7 +40,11 @@ fn hash_changed( let mut hasher = DefaultHasher::new(); - let paths = files.iter().map(Path::new).chain(std::iter::once(config)); + let paths = files + .iter() + .map(Path::new) + .chain(std::iter::once(config)) + .chain(std::iter::once(Path::new("build.rs"))); for path in paths { if let Ok(mut f) = std::fs::File::open(path) { From 45d2873aba809b01e5b11d2491695675b46509ce Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 21 Jul 2021 23:25:40 +0200 Subject: [PATCH 096/188] Strip the local symbols from the nasm objects They tend to confuse the debugger and are overall unnecessary. --- README.md | 1 + build.rs | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/README.md b/README.md index af8de661ea..11d7802566 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ rav1e currently requires Rust 1.51.0 or later to build. ### Dependency: NASM Some `x86_64`-specific optimizations require [NASM](https://nasm.us/) `2.14.02` or newer and are enabled by default. +`strip` will be used if available to remove the local symbols from the asm objects. The CI is testing against `nasm 2.15.05`, so bugs for other versions might happen. If you find one please open an issue! diff --git a/build.rs b/build.rs index ce98f7af2f..4fd2bcfd12 100644 --- a/build.rs +++ b/build.rs @@ -57,6 +57,10 @@ fn hash_changed( } } + let strip = env::var("STRIP").unwrap_or_else(|_| "strip".to_string()); + + hasher.write(strip.as_bytes()); + let hash = hasher.finish().to_be_bytes(); let hash_path = Path::new(&out_dir).join("asm.hash"); @@ -135,6 +139,20 @@ fn build_nasm_files() { } cc.compile("rav1easm"); + // Strip local symbols from the asm library since they + // confuse the debugger. 
+ fn strip>(obj: P) { + let strip = env::var("STRIP").unwrap_or_else(|_| "strip".to_string()); + + let mut cmd = std::process::Command::new(strip); + + cmd.arg("-x").arg(obj.as_ref()); + + let _ = cmd.output(); + } + + strip(Path::new(&out_dir).join("librav1easm.a")); + std::fs::write(hash_path, &hash[..]).unwrap(); } else { println!("cargo:rustc-link-search={}", out_dir); From 0f25619ab3dd29322603df2e070a1e7975209fca Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 22 Jul 2021 19:31:31 +0200 Subject: [PATCH 097/188] Pin system-deps More recent versions use cfg-expr 0.8 that use a rust-1.53 feature: or-patterns --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 60f7781e3c..3e1fc33a2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,6 +127,9 @@ libfuzzer-sys = "0.3" rand = "0.8" rand_chacha = "0.3" +[target.'cfg(any(decode_test, decode_test_dav1d))'.dependencies] +system-deps = "~3.1.2" + [[bin]] name = "rav1e" required-features = ["binaries"] From f6c841fe9234e9e08a801e884bec1d91a3c6f9f4 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 27 Jul 2021 13:45:08 +0900 Subject: [PATCH 098/188] CI: Update to libaom to 3.1.2-dmo1 --- .github/workflows/rav1e.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index 632d8c2ea0..b667d99433 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -129,11 +129,11 @@ jobs: matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/v/vmaf-dmo - LIBVMAF_VERSION: 2.1.1-dmo0~bpo10+3 + LIBVMAF_VERSION: 2.2.0-dmo1 LIBVMAF_SHA256: >- - d9c2e708399af37cac52090453aadf8ad4311c3ed40addf02c3158c7f7a705a6 + 88dace39e1aa0c88973a397640c457b85fd86746b4e79399e4aefdf5167aae92 LIBVMAF_DEV_SHA256: >- - 957a21b4a4b3cea4b27ab068fc41e85776d19c69fbede949b6eecd9631aa697f + bfaecbeba9fc413e69e2301d88720b2c72d35986778b30381d11eee0274ba78f run: | echo "$LINK/libvmaf1_${LIBVMAF_VERSION}_amd64.deb" >> DEBS echo "$LINK/libvmaf-dev_${LIBVMAF_VERSION}_amd64.deb" >> DEBS @@ -145,11 +145,11 @@ jobs: matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/a/aom-dmo - AOM_VERSION: 3.1.1-dmo0~bpo10+2 + AOM_VERSION: 3.1.2-dmo1 AOM_DEV_SHA256: >- - 881ec275a01169378e19c1779fec3fb5d4b80e1afe61d8b576a7c66419702a90 + 63cf8804e1a010431e06f6da02582c5b95fae546c0e47ba75b1921aa7cbd9d3a AOM_LIB_SHA256: >- - a2a75cda5eacbddad70c508a7113d0ba572aad29934bb31905773e9adb555413 + 5df58fa6f6b1f28e64dfec77959516ea714ba6fd753c2b7e85527ac892932777 run: | echo "$LINK/libaom-dev_${AOM_VERSION}_amd64.deb" >> DEBS echo "$LINK/libaom3_${AOM_VERSION}_amd64.deb" >> DEBS From 6ceef48a31e48bee9fe388351516fecd615b91bb Mon Sep 17 00:00:00 2001 From: Vibhoothi Date: Thu, 5 Aug 2021 21:00:08 +0100 Subject: [PATCH 099/188] CI: Update libdav1d to 0.9.1-dmo1 --- .github/workflows/rav1e.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index b667d99433..451daf59da 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -161,11 +161,11 @@ jobs: matrix.conf == 'grcov-coveralls' || matrix.conf == 'fuzz' || matrix.conf == 'no-asm-tests' env: LINK: https://www.deb-multimedia.org/pool/main/d/dav1d-dmo - DAV1D_VERSION: 0.9.0-dmo1 + DAV1D_VERSION: 0.9.1-dmo1 DAV1D_DEV_SHA256: >- - ce6bd5c710d287306d3b6d45fa3843b35231da37f4d18d82ff24ba088916cfae + 
df760b1124c121289f40cf25d6f4a6ee2fb1d20a988853fa33b9e947a1cd263a DAV1D_LIB_SHA256: >- - 54c8ff504523101b96fa994963fb24b7104221a5b011f8b525baac8260640994 + a6a3cf5b9d08250780b5661d40388267cd4dae42acdfc4d7b132ca19815e0301 run: | echo "$LINK/libdav1d-dev_${DAV1D_VERSION}_amd64.deb" >> DEBS echo "$LINK/libdav1d5_${DAV1D_VERSION}_amd64.deb" >> DEBS From 64e3d9fb3c943b2512328d66a48d082377ccf55a Mon Sep 17 00:00:00 2001 From: Vibhoothi Date: Thu, 5 Aug 2021 21:05:17 +0100 Subject: [PATCH 100/188] vergen: Fix clippy errors Reference: https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow --- crates/vergen/src/output/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/vergen/src/output/mod.rs b/crates/vergen/src/output/mod.rs index b5e3735c65..c4ac369b06 100755 --- a/crates/vergen/src/output/mod.rs +++ b/crates/vergen/src/output/mod.rs @@ -65,7 +65,7 @@ pub fn generate_build_info( if flags.contains(ConstantsFlags::SEMVER) { let describe = run_command(Command::new("git").args(&["describe"])); - let semver = if describe.eq_ignore_ascii_case(&"UNKNOWN") { + let semver = if describe.eq_ignore_ascii_case("UNKNOWN") { env::var("CARGO_PKG_VERSION")? } else { describe @@ -79,7 +79,7 @@ pub fn generate_build_info( let describe = run_command(Command::new("git").args(&["describe", "--tags"])); - let semver = if describe.eq_ignore_ascii_case(&"UNKNOWN") { + let semver = if describe.eq_ignore_ascii_case("UNKNOWN") { env::var("CARGO_PKG_VERSION")? } else { describe From 7a7e1a638440f2f91dfd729ae33064450589cea1 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 5 Aug 2021 21:15:03 +0100 Subject: [PATCH 101/188] context: Remove an unused attribute --- src/context/cdf_context.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 9f5362d51a..848ea3e156 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -542,7 +542,6 @@ impl fmt::Debug for CDFContext { } } -#[macro_use] macro_rules! 
symbol_with_update { ($self:ident, $w:ident, $s:expr, $cdf:expr) => { $w.symbol_with_update($s, $cdf, &mut $self.fc_log); From 59ef8847cb483a17a8d7b8bff0cfd7cb0a5e2190 Mon Sep 17 00:00:00 2001 From: Vibhoothi Date: Thu, 5 Aug 2021 21:21:25 +0100 Subject: [PATCH 102/188] lib: Silence clippy::enum-variant-names --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index ad4899b380..2294ccfc45 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,6 +47,7 @@ #![allow(clippy::missing_safety_doc)] #![allow(clippy::comparison_chain)] #![allow(clippy::upper_case_acronyms)] +#![allow(clippy::enum_variant_names)] #![warn(clippy::expl_impl_clone_on_copy)] #![warn(clippy::linkedlist)] #![warn(clippy::map_flatten)] From 45c60fc381c016a3b82c86cfbf8516b6c731a90b Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Thu, 13 May 2021 21:29:03 +0200 Subject: [PATCH 103/188] x86: itx: Add 12-bit wht --- src/x86/itx16_avx2.asm | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm index 5df5f4619b..1ef674e8f2 100644 --- a/src/x86/itx16_avx2.asm +++ b/src/x86/itx16_avx2.asm @@ -165,7 +165,7 @@ SECTION .text %endmacro INIT_YMM avx2 -cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax mova xm0, [cq+16*0] vinserti128 m0, [cq+16*2], 1 mova xm1, [cq+16*1] @@ -191,7 +191,12 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c movhps xm2, [dstq+strideq*0] movq xm3, [r6 +strideq*0] movhps xm3, [dstq+strideq*1] - vpbroadcastd xm5, [pixel_max] +%ifidn bdmaxd, bdmaxm + movd xm5, bdmaxd + vpbroadcastw xm5, xm5 +%else ; win64: load from stack + vpbroadcastw xm5, bdmaxm +%endif paddsw xm0, xm2 paddsw xm1, xm3 pmaxsw xm0, xm4 From 5847bc9c67e18e221acb7d0d238c958f9d6d5384 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Tue, 18 May 2021 00:42:07 +0200 Subject: [PATCH 104/188] x86: itx: Add 10/12-bit SSE2 WHT --- src/x86/itx16_sse.asm | 93 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 src/x86/itx16_sse.asm diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm new file mode 100644 index 0000000000..48c7674d12 --- /dev/null +++ b/src/x86/itx16_sse.asm @@ -0,0 +1,93 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2017-2021, The rav1e contributors +; Copyright © 2020, Nathan Egge +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION .text + +%macro IWHT4_1D_PACKED 0 + ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 + paddd m0, m1 ; in0 += in1 + psubd m4, m2, m3 ; tmp0 = in2 - in3 + psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 + psrad m5, 1 + psubd m2, m5, m1 ; in2 = tmp1 - in1 + psubd m5, m3 ; in1 = tmp1 - in3 + psubd m0, m5 ; in0 -= in1 + paddd m4, m2 ; in3 = tmp0 + in2 + ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 + ; m4 = out3, m5 = out1 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + IWHT4_1D_PACKED + punpckldq m1, m0, m5 + punpckhdq m3, m0, m5 + punpckldq m5, m2, m4 + punpckhdq m2, m4 + punpcklqdq m0, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m4, m3, m2 + punpckhqdq m3, m2 + mova m2, m4 + IWHT4_1D_PACKED + packssdw m0, m4 ; low: out3, high: out0 + packssdw m2, m5 ; low: out2, high: out1 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + lea r2, [dstq+strideq*2] + movq m1, [dstq+strideq*0] + movhps m1, [r2 +strideq*1] + movq m3, [r2 +strideq*0] + movhps m3, [dstq+strideq*1] + movd m5, bdmaxm + pshuflw m5, m5, q0000 ; broadcast + punpcklqdq m5, m5 ; broadcast + paddsw m0, m1 + paddsw m2, m3 + pmaxsw m0, m4 + pmaxsw m2, m4 + pminsw m0, m5 + pminsw m2, m5 + movhps [r2 +strideq*1], m0 ; write out0 + movhps [dstq+strideq*1], m2 ; write out1 + movq [r2 +strideq*0], m2 ; write out2 + movq [dstq+strideq*0], m0 ; write out3 + RET From 8e0258ec930bc62b27ba5f43a4444d12eab386ef Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 20 May 2021 16:41:26 +0200 Subject: [PATCH 105/188] x86: Add high bitdepth avg/w_avg/mask SSSE3 asm --- src/x86/mc16_sse.asm | 306 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 src/x86/mc16_sse.asm diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm new file mode 100644 index 0000000000..2dd7e97590 --- /dev/null +++ b/src/x86/mc16_sse.asm @@ -0,0 +1,306 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +pw_64: times 8 dw 64 +pw_256: times 8 dw 256 +pd_65538: times 2 dd 65538 + +bidir_rnd: times 4 dw -16400 + times 4 dw -16388 +bidir_mul: times 4 dw 2048 + times 4 dw 8192 + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 + +SECTION .text + +INIT_XMM ssse3 +%macro BIDIR_FN 0 + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.ret: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jne .w8_loop + RET +.w16_loop: + call .main + add dstq, strideq +.w16: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + dec hd + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h +%define base r6-avg_ssse3_table + LEA r6, avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + movddup m2, [base+bidir_rnd+t0*8] + movddup m3, [base+bidir_mul+t0*8] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+16*0] + paddsw m0, [tmp2q+16*0] + mova m1, [tmp1q+16*1] + paddsw m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + pmulhw m0, m3 + pmulhw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, 
h +%define base r6-w_avg_ssse3_table + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; weight + movd m6, r7m ; pixel_max + movddup m5, [base+pd_65538] + movsxd wq, [r6+wq*4] + pshufb m6, [base+pw_256] + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + paddw m5, m6 + mov r6d, t0d + shl t0d, 2 + test dword r7m, 0x800 + cmovnz r6d, t0d + movifnidn hd, hm + movd m4, r6d + pslld m5, 7 + pxor m7, m7 + pshufd m4, m4, q0000 + BIDIR_FN +ALIGN function_align +.main: + mova m2, [tmp1q+16*0] + mova m0, [tmp2q+16*0] + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + mova m2, [tmp1q+16*1] + mova m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaddwd m3, m4 + pmaddwd m0, m4 + paddd m3, m5 + paddd m0, m5 + psrad m3, 8 + psrad m0, 8 + packssdw m0, m3 + punpckhwd m3, m1, m2 + punpcklwd m1, m2 + pmaddwd m3, m4 + pmaddwd m1, m4 + paddd m3, m5 + paddd m1, m5 + psrad m3, 8 + psrad m1, 8 + packssdw m1, m3 + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 + ret + +%if ARCH_X86_64 +cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%else +cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask +%define hd dword r5m +%define m8 [base+pw_64] +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + movddup m6, [base+bidir_rnd+t0*8] + movddup m7, [base+bidir_mul+t0*8] +%if ARCH_X86_64 + mova m8, [base+pw_64] + movifnidn hd, hm +%endif + add wq, r6 + mov maskq, r6mp + BIDIR_FN +ALIGN function_align +.main: + movq m3, [maskq+8*0] + mova m0, [tmp1q+16*0] + mova m4, [tmp2q+16*0] + pxor m5, m5 + punpcklbw m3, m5 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + psubw m1, m8, m3 + punpckhwd m4, m3, m1 ; m, 64-m + punpcklwd m3, m1 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m0, m3 + movq m3, [maskq+8*1] + mova m1, [tmp1q+16*1] + mova m4, [tmp2q+16*1] + add maskq, 8*2 + add tmp1q, 16*2 + add tmp2q, 16*2 + psrad m2, 5 + psrad m0, 5 + packssdw m0, m2 + punpcklbw m3, m5 + punpckhwd m2, m1, m4 + punpcklwd m1, m4 + psubw m5, m8, m3 + punpckhwd m4, m3, m5 ; m, 64-m + punpcklwd m3, m5 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m1, m3 + psrad m2, 5 + psrad m1, 5 + packssdw m1, m2 + pmaxsw m0, m6 + pmaxsw m1, m6 + psubsw m0, m6 + psubsw m1, m6 + pmulhw m0, m7 + pmulhw m1, m7 + ret From c9e7cb11104f76edbb7974759a02c0af01c48d87 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 20 May 2021 16:41:31 +0200 Subject: [PATCH 106/188] x86: Add high bitdepth w_mask SSSE3 asm --- src/x86/mc16_sse.asm | 629 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 629 insertions(+) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 2dd7e97590..5be8a456f0 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -28,8 +28,10 @@ SECTION_RODATA +pw_2: times 8 dw 2 pw_64: times 8 dw 64 pw_256: times 8 dw 256 +pw_27615: times 8 dw 27615 pd_65538: times 2 dd 65538 bidir_rnd: times 4 dw -16400 @@ -51,9 +53,18 @@ bidir_mul: times 4 dw 2048 BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 SECTION .text +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + INIT_XMM ssse3 %macro BIDIR_FN 0 call .main @@ -304,3 +315,621 @@ ALIGN function_align pmulhw m0, m7 pmulhw 
m1, m7 ret + +cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_420_ssse3_table + LEA t0, w_mask_420_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m0, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 + %define m8 [rsp+gprsize+16*0] + %define m9 [rsp+gprsize+16*1] + %define m10 [rsp+gprsize+16*2] + %define m11 [rsp+gprsize+16*3] +%endif + movd m7, [base+pw_2] + psubw m7, m0 + pshufb m7, [base+pw_256] + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w4: + movq [dstq+strideq*0], m0 + phaddw m2, m3 + movhps [dstq+strideq*1], m0 + phaddd m2, m2 + lea dstq, [dstq+strideq*2] + paddw m2, m7 + movq [dstq+strideq*0], m1 + psrlw m2, 2 + movhps [dstq+strideq*1], m1 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w8: + mova [dstq+strideq*0], m0 + paddw m2, m3 + phaddw m2, m2 + mova [dstq+strideq*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 8 +.w16: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movq [maskq], m2 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*0+16*2], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + paddw m2, [dstq+strideq*1+16*3] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*2 +.w64: + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*2], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*1+16*4], m3 + mova [dstq+strideq*0+16*3], m1 + call .main + mova [dstq+strideq*1+16*5], m2 + mova [dstq+strideq*0+16*4], m0 + mova [dstq+strideq*1+16*6], m3 + mova [dstq+strideq*0+16*5], m1 + call .main + mova [dstq+strideq*0+16*6], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*7], m2 + mova [dstq+strideq*0+16*7], m1 + call .main + paddw m2, [dstq+strideq*1+16*1] + paddw 
m3, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*3] + paddw m3, [dstq+strideq*1+16*4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16*5] + paddw m3, [dstq+strideq*1+16*6] + mova [dstq+strideq*1+16*4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*6], m2 + mova [dstq+strideq*1+16*5], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*6] + paddw m2, [dstq+strideq*1+16*7] + mova [dstq+strideq*1+16*6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*4 +.w128: + mova [dstq+strideq*1+16* 1], m2 + mova [dstq+strideq*0+16* 0], m0 + mova [dstq+strideq*1+16* 2], m3 + mova [dstq+strideq*0+16* 1], m1 + call .main + mova [dstq+strideq*1+16* 3], m2 + mova [dstq+strideq*0+16* 2], m0 + mova [dstq+strideq*1+16* 4], m3 + mova [dstq+strideq*0+16* 3], m1 + call .main + mova [dstq+strideq*1+16* 5], m2 + mova [dstq+strideq*0+16* 4], m0 + mova [dstq+strideq*1+16* 6], m3 + mova [dstq+strideq*0+16* 5], m1 + call .main + mova [dstq+strideq*1+16* 7], m2 + mova [dstq+strideq*0+16* 6], m0 + mova [dstq+strideq*1+16* 8], m3 + mova [dstq+strideq*0+16* 7], m1 + call .main + mova [dstq+strideq*1+16* 9], m2 + mova [dstq+strideq*0+16* 8], m0 + mova [dstq+strideq*1+16*10], m3 + mova [dstq+strideq*0+16* 9], m1 + call .main + mova [dstq+strideq*1+16*11], m2 + mova [dstq+strideq*0+16*10], m0 + mova [dstq+strideq*1+16*12], m3 + mova [dstq+strideq*0+16*11], m1 + call .main + mova [dstq+strideq*1+16*13], m2 + mova [dstq+strideq*0+16*12], m0 + mova [dstq+strideq*1+16*14], m3 + mova [dstq+strideq*0+16*13], m1 + call .main + mova [dstq+strideq*0+16*14], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*15], m2 + mova [dstq+strideq*0+16*15], m1 + call .main + paddw m2, [dstq+strideq*1+16* 1] + paddw m3, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 2], m2 + mova [dstq+strideq*1+16* 1], m1 + call .main + paddw m2, [dstq+strideq*1+16* 3] + paddw m3, [dstq+strideq*1+16* 4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16* 5] + paddw m3, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 6], m2 + mova [dstq+strideq*1+16* 5], m1 + call .main + paddw m2, [dstq+strideq*1+16* 7] + paddw m3, [dstq+strideq*1+16* 8] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + call .main + paddw m2, [dstq+strideq*1+16* 9] + paddw m3, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16* 8], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*10], m2 + mova [dstq+strideq*1+16* 9], m1 + call .main + paddw m2, [dstq+strideq*1+16*11] + paddw m3, [dstq+strideq*1+16*12] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16*10], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + 
mova [dstq+strideq*1+16*11], m1 + packuswb m3, m2 + mova [maskq+16*2], m3 + call .main + paddw m2, [dstq+strideq*1+16*13] + paddw m3, [dstq+strideq*1+16*14] + mova [dstq+strideq*1+16*12], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*14], m2 + mova [dstq+strideq*1+16*13], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*14] + paddw m2, [dstq+strideq*1+16*15] + mova [dstq+strideq*1+16*14], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*15], m1 + packuswb m3, m2 + mova [maskq+16*3], m3 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2 ; dst/tmp_offset, mask + mova m%1, [tmp1q+16*%1] + mova m%2, [tmp2q+16*%1] + punpcklwd m4, m%2, m%1 + punpckhwd m5, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m6, m8, m%1 + psrlw m6, 10 ; 64-m + psubw m%2, m9, m6 ; m + punpcklwd m%1, m6, m%2 + punpckhwd m6, m%2 + pmaddwd m%1, m4 + pmaddwd m6, m5 + psrad m%1, 5 + psrad m6, 5 + packssdw m%1, m6 + pmaxsw m%1, m10 + psubsw m%1, m10 + pmulhw m%1, m11 +%endmacro + W_MASK 0, 2 + W_MASK 1, 3 + add tmp1q, 16*2 + add tmp2q, 16*2 + ret + +cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_422_ssse3_table + LEA t0, w_mask_422_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m7, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 +%endif + pxor m0, m0 + add wq, t0 + pshufb m7, m0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN 
function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + phaddw m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + packuswb m2, m2 + pxor m3, m3 + psubb m2, m7 + pavgb m2, m3 + movq [maskq], m2 + add maskq, 8 + ret + +cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_444_ssse3_table + LEA t0, w_mask_444_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m7, [base+bidir_mul+r6*8] + ALLOC_STACK -16*3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + %define m11 m7 +%endif + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + packuswb m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + mova [maskq], m2 + add maskq, 16 + ret From be3f400c72adb09aa7e2381e934931620e1644eb Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 20 May 2021 16:41:33 +0200 Subject: [PATCH 107/188] x86: Add high bitdepth blend/blend_v/blend_h SSSE3 asm --- src/x86/mc16_sse.asm | 404 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 404 insertions(+) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 5be8a456f0..685890f816 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -28,10 +28,22 @@ SECTION_RODATA +; dav1d_obmc_masks[] << 9 +obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 + dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 + dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 + dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 + dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 + dw 9728, 8704, 8192, 7168, 6656, 
6144, 5632, 4608 + dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 + +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 + pw_2: times 8 dw 2 pw_64: times 8 dw 64 pw_256: times 8 dw 256 pw_27615: times 8 dw 27615 +pw_m512: times 8 dw -512 pd_65538: times 2 dd 65538 bidir_rnd: times 4 dw -16400 @@ -56,6 +68,9 @@ BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 SECTION .text @@ -933,3 +948,392 @@ ALIGN function_align mova [maskq], m2 add maskq, 16 ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). +; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + mova m7, [base+pw_m512] + add wq, r6 + lea stride3q, [strideq*3] + pxor m6, m6 + jmp wq +.w4: + mova m5, [maskq] + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + movq m1, [dstq+strideq*2] + movhps m1, [dstq+stride3q ] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + mova m5, [maskq] + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m5, [maskq] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m5, [maskq+16*0] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova m5, [maskq+16*1] + mova m0, [dstq+16*2] + mova m1, [dstq+16*3] + psubw m2, m0, [tmpq+16*2] + psubw m3, m1, [tmpq+16*3] + add maskq, 32 + add tmpq, 64 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + add dstq, strideq + dec hd + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h +%define base r5-blend_v_ssse3_table + LEA r5, 
blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + movd m4, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+strideq*0] + movd m2, [tmpq+4*0] + movd m1, [dstq+strideq*1] + movd m3, [tmpq+4*1] + add tmpq, 4*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m4, [base+obmc_masks+8*2] +.w8_loop: + mova m0, [dstq+strideq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+strideq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] + movq m5, [base+obmc_masks+16*3] +.w16_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+8], m6 +%endif + mova m4, [base+obmc_masks+16*4] + mova m5, [base+obmc_masks+16*5] + mova m6, [base+obmc_masks+16*6] +.w32_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + mova m2, [dstq+16*2] + paddw m1, m3 + mova m3, [tmpq+16*2] + add tmpq, 16*4 + psubw m3, m2 + pmulhrsw m3, m6 + paddw m2, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps m6, [rsp+8] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+16*(%1+0)] + mova m2, [tmpq+16*(%2+0)] + mova m1, [dstq+16*(%1+1)] + mova m3, [tmpq+16*(%2+1)] +%if %3 + add tmpq, 16*%3 +%endif + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*(%1+0)], m0 + mova [dstq+16*(%1+1)], m1 +%endmacro + +cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_ssse3_table + LEA r6, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+blend_shuf] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + movd m3, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpckldq m0, m2 + punpcklwd m3, m3 + psubw m1, m0 + pmulhrsw m1, m3 + paddw m0, m1 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [base+blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + movddup m5, 
[base+blend_shuf+8] +%if WIN64 + movaps [rsp+ 8], m6 + movaps [rsp+24], m7 +%endif +.w8_loop: + movd m7, [maskq+hq*2] + mova m0, [dstq+dsq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+dsq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + pshufb m6, m7, m4 + psubw m2, m0 + pshufb m7, m5 + psubw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m3, m7 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop +%if WIN64 + movaps m6, [rsp+ 8] + movaps m7, [rsp+24] +%endif + RET +.w16: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w16 + RET +.w32: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + BLEND_H_ROW 6, 6, 8 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + BLEND_H_ROW 6, 6, 16 + BLEND_H_ROW 8, -8 + BLEND_H_ROW 10, -6 + BLEND_H_ROW 12, -4 + BLEND_H_ROW 14, -2 + add dstq, dsq + inc hq + jl .w128 + RET From e42d92111d02a733f8033b936521b1b808aae7c4 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 20 May 2021 16:41:35 +0200 Subject: [PATCH 108/188] x86: Add high bitdepth put_bilin/prep_bilin SSSE3 asm --- src/x86/mc16_sse.asm | 1014 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1012 insertions(+), 2 deletions(-) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 685890f816..30d053b1f5 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -40,16 +40,23 @@ obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pw_2: times 8 dw 2 +pw_16: times 4 dw 16 +prep_mul: times 4 dw 16 + times 8 dw 4 pw_64: times 8 dw 64 pw_256: times 8 dw 256 +pw_2048: times 4 dw 2048 +bidir_mul: times 4 dw 2048 +pw_8192: times 8 dw 8192 pw_27615: times 8 dw 27615 +pw_32766: times 8 dw 32766 pw_m512: times 8 dw -512 pd_65538: times 2 dd 65538 +put_bilin_h_rnd: times 4 dw 8 + times 4 dw 10 bidir_rnd: times 4 dw -16400 times 4 dw -16388 -bidir_mul: times 4 dw 2048 - times 4 dw 8192 %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) @@ -72,8 +79,32 @@ BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) +%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) + +BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + SECTION .text +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + %if UNIX64 DECLARE_REG_TMP 7 %else @@ -81,6 +112,985 @@ DECLARE_REG_TMP 5 %endif INIT_XMM ssse3 +cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy +%define base t0-put_ssse3 + mov mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn wd, wm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + add wq, t0 + movifnidn hd, hm + jmp wq +.put_w2: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov 
[dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + add srcq, 16*8 + add dstq, 16*8 +.put_w128_loop: + movu m0, [srcq-16*8] + movu m1, [srcq-16*7] + movu m2, [srcq-16*6] + movu m3, [srcq-16*5] + mova [dstq-16*8], m0 + mova [dstq-16*7], m1 + mova [dstq-16*6], m2 + mova [dstq-16*5], m3 + movu m0, [srcq-16*4] + movu m1, [srcq-16*3] + movu m2, [srcq-16*2] + movu m3, [srcq-16*1] + mova [dstq-16*4], m0 + mova [dstq-16*3], m1 + mova [dstq-16*2], m2 + mova [dstq-16*1], m3 + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w128_loop + RET +.h: + movd m5, mxyd + mov mxyd, r7m ; my + mova m4, [base+pw_16] + pshufb m5, [base+pw_256] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + mov r6d, r8m ; bitdepth_max + shr r6d, 11 + movddup m3, [base+put_bilin_h_rnd+r6*8] + movifnidn hd, hm + sub wd, 8 + jg .h_w16 + je .h_w8 + jp .h_w4 +.h_w2: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw m0, m4, m1 + psrlq m1, 16 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movd [dstq+dsq*0], m0 + punpckhqdq m0, m0 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + movq m1, [srcq+ssq*0+2] + movhps m1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + 
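; Scalar sketch of the horizontal-only rounding (assuming dav1d's usual
; intermediate_bits = 14 - bitdepth two-stage rounding):
;   f = src[x]*(16-mx) + src[x+1]*mx
;   10 bpc: dst = (f + 8) >> 4
;   12 bpc: dst = (((f + 2) >> 2) + 2) >> 2  ==  (f + 10) >> 4   (exact)
; which is why put_bilin_h_rnd holds 8 and 10 and a single shift by 4 serves
; both bitdepths.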
pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2+16*0], m0 + mova [dstq+r6*2+16*1], m1 + add r6, 16 + jl .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16_loop0 + RET +.v: + shl mxyd, 11 + movd m5, mxyd + pshufb m5, [base+pw_256] + movifnidn hd, hm + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movd m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq m0, [srcq+ssq*0] +.v_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movq m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + mov r7, srcq + lea r6d, [wq+hq-256] + mov r4, dstq +%else + mov r6, srcq +%endif +.v_w8_loop0: + movu m0, [srcq+ssq*0] +.v_w8_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +%if ARCH_X86_64 + add r7, 16 + add r4, 16 + movzx hd, r6b + mov srcq, r7 + mov dstq, r4 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .v_w8_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 8 + shl mxyd, 11 + mova m3, [base+pw_2] + movd m6, mxyd + mova m7, [base+pw_8192] + pshufb m6, [base+pw_256] + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + mova m7, [base+pw_2048] +.hv_12bpc: + movifnidn hd, hm + cmp wd, 4 + jg .hv_w8 + je .hv_w4 +.hv_w2: + movddup m0, [srcq+ssq*0] + pshufhw m1, m0, q0321 + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w2_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m2, [srcq+ssq*0] + pmullw m1, m4, m2 + psrlq m2, 16 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 _ 2 _ + shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + movddup m0, [srcq+ssq*0] + movddup m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + movq m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + movhps m2, [srcq+ssq*0+2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + 
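; The my << 11 scaling used by .v and .hv, spelled out:
;   pmulhrsw(b - a, my << 11)  ==  ((b - a)*my + 8) >> 4            (exact)
; so the vertical-only case is dst = a + that. Here in .hv the doubled
; difference goes through pmulhw instead:
;   pmulhw(2*(b - a), my << 11)  ==  ((b - a)*my) >> 4              (truncating)
; and the final pmulhrsw by 2048 (10 bpc) or 8192 (12 bpc) is the
; (x + rnd) >> intermediate_bits step back to pixel precision.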
paddw m1, m2 + pmulhrsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + lea r6d, [wq+hq-256] + mov r4, srcq + mov r7, dstq +%else + mov r6, srcq +%endif +.hv_w8_loop0: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w8_loop: + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m2, m5 + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .hv_w8_loop0 +%if WIN64 + pop r7 +%endif + RET + +cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 +%define base r6-prep_ssse3 + movifnidn mxyd, r5m ; mx + LEA r6, prep_ssse3 + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [base+prep_ssse3_table+wq*2] + mov r5d, r7m ; bitdepth_max + mova m5, [base+pw_8192] + add wq, r6 + shr r5d, 11 + movddup m4, [base+prep_mul+r5*8] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + dec hd + jg .prep_w32 + RET +.prep_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, strideq + REPX {pmullw x, m4}, m0, 
m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + dec hd + jg .prep_w64 + RET +.prep_w128: + movu m0, [srcq+16* 0] + movu m1, [srcq+16* 1] + movu m2, [srcq+16* 2] + movu m3, [srcq+16* 3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16* 4] + movu m1, [srcq+16* 5] + movu m2, [srcq+16* 6] + movu m3, [srcq+16* 7] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + movu m0, [srcq+16* 8] + movu m1, [srcq+16* 9] + movu m2, [srcq+16*10] + movu m3, [srcq+16*11] + add tmpq, 16*16 + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*8], m0 + mova [tmpq-16*7], m1 + mova [tmpq-16*6], m2 + mova [tmpq-16*5], m3 + movu m0, [srcq+16*12] + movu m1, [srcq+16*13] + movu m2, [srcq+16*14] + movu m3, [srcq+16*15] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*4], m0 + mova [tmpq-16*3], m1 + mova [tmpq-16*2], m2 + mova [tmpq-16*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd m4, mxyd + mov mxyd, r6m ; my + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m3, 2 + psllw m4, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + sub wd, 8 + je .h_w8 + jg .h_w16 +.h_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*0+2] + movhps m1, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + add r6, 16 + jl .h_w16_loop + add srcq, strideq + dec hd + jg .h_w16_loop0 + RET +.v: + movd m4, mxyd + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 + psllw m4, 2 +.v_12bpc: + cmp wd, 8 + je .v_w8 + jg .v_w16 +.v_w4: + movq m0, [srcq+strideq*0] +.v_w4_loop: + movq m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklqdq m1, m0, m2 ; 0 1 + movq m0, [srcq+strideq*0] + punpcklqdq m2, m0 ; 1 2 + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu m0, [srcq+strideq*0] +.v_w8_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + 
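; prep output convention, for reference: unfiltered pixels are stored as
;   tmp = (px << intermediate_bits) - 8192        (prep_mul / pw_8192 above)
; and the filtered paths fold that bias plus the rounding into pw_32766, e.g.
; this 12 bpc vertical case:
;   (f - 32766) >> 2  ==  ((f + 2) >> 2) - 8192,  f = a*(16-my) + b*my
; while at 10 bpc the coefficients are pre-shifted by 2 so the same >> 2 only
; removes that scaling.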
pmullw m2, m3 + mova [tmpq+16*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.v_w16_loop0: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+wq*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+wq*2], m1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .v_w16_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + shl mxyd, 11 + movd m6, mxyd + pshufb m6, [base+pw_256] + cmp wd, 8 + je .hv_w8 + jg .hv_w16 +.hv_w4: + movddup m0, [srcq+strideq*0] + movddup m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + movhps m1, [srcq+strideq*0] + movhps m2, [srcq+strideq*0+2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+16*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.hv_w16_loop0: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+wq*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .hv_w16_loop0 +%if WIN64 + pop r7 +%endif + RET + %macro BIDIR_FN 0 call .main jmp wq From 37909d574148e111e7539bc62669356475dd2cdc 
Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 20 May 2021 16:41:36 +0200 Subject: [PATCH 109/188] x86: Add high bitdepth put_8tap SSSE3 asm --- src/x86/mc16_sse.asm | 856 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 856 insertions(+) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 30d053b1f5..0ff321ca2b 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -38,6 +38,9 @@ obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 pw_2: times 8 dw 2 pw_16: times 4 dw 16 @@ -51,12 +54,14 @@ pw_8192: times 8 dw 8192 pw_27615: times 8 dw 27615 pw_32766: times 8 dw 32766 pw_m512: times 8 dw -512 +pd_512: times 4 dd 512 pd_65538: times 2 dd 65538 put_bilin_h_rnd: times 4 dw 8 times 4 dw 10 bidir_rnd: times 4 dw -16400 times 4 dw -16388 +put_8tap_h_rnd: dd 34, 34, 40, 40 %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) @@ -95,6 +100,9 @@ BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + SECTION .text %macro REPX 2-* @@ -1091,6 +1099,854 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 %endif RET +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v +cglobal %1_8tap_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2, 6 +%elif WIN64 +DECLARE_REG_TMP 4, 5, 8 +%else +DECLARE_REG_TMP 7, 8, 8 +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r1b +%define myd r1 +%define myq r1 +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%else +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%endif +%define base t2-put_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, put_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + movifnidn ssq, ssmp + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + movifnidn dstq, dstmp + movifnidn dsq, dsmp + add wq, t2 +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + 
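; put_8tap_h_rnd collapses the two-stage rounding of the horizontal-only case
; into one biased shift (f = 8-tap filter sum):
;   10 bpc: (((f + 2) >> 2) + 8) >> 4  ==  (f + 34) >> 6
;   12 bpc: (((f + 8) >> 4) + 2) >> 2  ==  (f + 40) >> 6
; both folds are exact, so one psrad by 6 covers either bitdepth.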
jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + cmp wd, 4 + jg .h_w8 + movzx mxd, mxb + lea srcq, [srcq-2] + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + je .h_w4 +.h_w2: + mova m2, [base+spel_h_shuf2] + pshufd m3, m3, q2121 +.h_w2_loop: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m2 + pshufb m1, m2 + pmaddwd m0, m3 + pmaddwd m1, m3 + phaddd m0, m1 + paddd m0, m4 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movd [dstq+dsq*0], m0 + pshuflw m0, m0, q3232 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + WIN64_SPILL_XMM 8 + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] + pshufd m2, m3, q1111 + pshufd m3, m3, q2222 +.h_w4_loop: + movu m1, [srcq] + add srcq, ssq + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movq [dstq], m0 + add dstq, dsq + dec hd + jg .h_w4_loop + RET +.h_w8: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 +%endif + shr mxd, 16 + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] +%if UNIX64 + mov wd, wd +%endif + lea srcq, [srcq+wq*2] + punpcklbw m3, m3 + lea dstq, [dstq+wq*2] + psraw m3, 8 + neg wq +%if ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6*2- 6] + movu m1, [srcq+r6*2+ 2] + pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 + pshufb m0, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m8 ; abcd0 + pmaddwd m0, m9 ; abcd1 + pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 + pshufb m1, m7 ; 6 7 7 8 8 9 9 a + paddd m2, m4 + paddd m0, m2 + pmaddwd m2, m10, m3 ; abcd2 + pmaddwd m3, m8 ; efgh0 + paddd m0, m2 + pmaddwd m2, m11, m1 ; abcd3 + pmaddwd m1, m9 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6*2+10] + paddd m3, m4 + paddd m1, m3 + pshufb m3, m2, m6 ; 8 9 9 a a b b c + pshufb m2, m7 ; a b b c c d d e + pmaddwd m3, m10 ; efgh2 + pmaddwd m2, m11 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + mova [dstq+r6*2], m0 + add r6, 8 + jl .h_w8_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if WIN64 + WIN64_SPILL_XMM 15 +%endif + movd m7, r8m + movifnidn dstq, dstmp + movifnidn dsq, dsmp + punpcklbw m3, m3 + pshufb m7, [base+pw_256] + psraw m3, 8 ; sign-extend +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + cmp wd, 2 + jne .v_w4 +.v_w2: + movd m1, 
[srcq+ssq*0] + movd m4, [srcq+ssq*1] + movd m2, [srcq+ssq*2] + add srcq, r6 + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m6, [srcq+ssq*2] + add srcq, r6 + movd m0, [srcq+ssq*0] + punpckldq m1, m4 ; 0 1 + punpckldq m4, m2 ; 1 2 + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m6 ; 4 5 + punpckldq m6, m0 ; 5 6 + punpcklwd m1, m4 ; 01 12 + punpcklwd m2, m5 ; 23 34 + punpcklwd m3, m6 ; 45 56 + pxor m6, m6 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + punpckldq m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m6 + pavgw m5, m6 + pminsw m5, m7 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcmp, srcq +%endif + lea wd, [wq+hq-(1<<16)] +%else + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] +%endif +.v_w4_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, r6 + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, r6 + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_w4_loop_start +.v_w4_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_w4_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m3 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + pxor m2, m2 + pmaxsw m1, m2 + pavgw m1, m2 + pminsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*29] + mov dstq, [esp+4*30] + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + sub wd, 1<<16 +%else +.v_w4_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packssdw m12, m13 + pxor m13, m13 + pmaxsw m12, m13 + pavgw m12, m13 + pminsw m12, m7 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .v_w4_loop0 + RET +.hv: +%if 
STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if ARCH_X86_32 + movd m4, r8m + mova m6, [base+pd_512] + pshufb m4, [base+pw_256] +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + movd m15, r8m + pshufb m15, [base+pw_256] +%endif + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + je .hv_w4 + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov dstq, dstmp + mov dsq, dsmp + mova m5, [base+spel_h_shuf2] + ALLOC_STACK -16*8 +%else + mova m6, [base+pd_512] + mova m9, [base+spel_h_shuf2] +%endif + pshuflw m0, m0, q2121 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_w2_10bpc + psraw m7, 2 + psllw m3, 2 +.hv_w2_10bpc: + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 +%if ARCH_X86_32 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m9, m5 + mova m11, m0 + mova m12, m1 + mova m13, m2 + mova m14, m3 + mova m15, m4 +%else + pshufd m11, m3, q0000 + pshufd m12, m3, q1111 + pshufd m13, m3, q2222 + pshufd m14, m3, q3333 +%endif + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m1, [srcq+ssq*2] + add srcq, r6 + movu m4, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m2, m3, m1, m4 +%else + REPX {pshufb x, m9}, m2, m3, m1, m4 +%endif + REPX {pmaddwd x, m7}, m2, m3, m1, m4 + phaddd m2, m3 ; 0 1 + phaddd m1, m4 ; 2 3 + movu m3, [srcq+ssq*1] + movu m4, [srcq+ssq*2] + add srcq, r6 + movu m0, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m3, m4, m0 +%else + REPX {pshufb x, m9}, m3, m4, m0 +%endif + REPX {pmaddwd x, m7}, m3, m4, m0 + phaddd m3, m4 ; 4 5 + phaddd m0, m0 ; 6 6 + REPX {paddd x, m6}, m2, m1, m3, m0 + REPX {psrad x, 10}, m2, m1, m3, m0 + packssdw m2, m1 ; 0 1 2 3 + packssdw m3, m0 ; 4 5 6 _ + palignr m4, m3, m2, 4 ; 1 2 3 4 + pshufd m5, m3, q0321 ; 5 6 _ _ + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + punpcklwd m3, m5 ; 45 56 +.hv_w2_loop: + movu m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m5, [srcq+ssq*0] + pshufb m4, m9 + pshufb m5, m9 + pmaddwd m4, m7 + pmaddwd m5, m7 + phaddd m4, m5 + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + paddd m4, m6 + psrad m4, 10 ; 7 8 + packssdw m0, m4 + pshufd m3, m0, q2103 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m5, m6 + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + pxor m4, m4 + pminsw m5, m15 + pmaxsw m5, m4 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w8: + shr mxd, 16 +.hv_w4: + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + mov dstq, dstmp + mov dsq, dsmp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + ALLOC_STACK -16*15 + mova m8, m0 + mova m9, m1 + mova m14, m6 +%else + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m3, 8 + test dword r8m, 0x800 + jz .hv_w4_10bpc + psraw m0, 2 + psllw m3, 2 +.hv_w4_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 +%if ARCH_X86_32 + %define tmp esp+16*8 + shl wd, 14 +%if 
STACK_ALIGNMENT < 16 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcmp, srcq +%endif + mova [tmp+16*5], m4 + lea wd, [wq+hq-(1<<16)] + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-104 ; red zone +%endif + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + mova [tmp+16*5], m15 +%endif + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] + pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 + pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 + pmaddwd m%3, m10 + pmaddwd m%1, m11 + paddd m%3, %5 + paddd m%1, m%3 + pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + pmaddwd m%3, m12 + pmaddwd m%2, m13 + paddd m%1, m%3 + paddd m%1, m%2 + psrad m%1, %4 +%endmacro +.hv_w4_loop0: +%if ARCH_X86_64 + mova m14, [pd_512] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + movu m6, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 4, 1, 0, 10 + PUT_8TAP_HV_H 5, 2, 0, 10 + PUT_8TAP_HV_H 6, 3, 0, 10 + movu m7, [srcq+ssq*0+0] + movu m2, [srcq+ssq*0+8] + movu m1, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + PUT_8TAP_HV_H 7, 2, 0, 10 + PUT_8TAP_HV_H 1, 3, 0, 10 + movu m2, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 2, 3, 0, 10 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 10 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_w4_loop_start +.hv_w4_loop: + mova m1, [tmp+16*6] + mova m2, m15 +.hv_w4_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*6], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 9 + psrad m2, 9 + packssdw m1, m2 + pxor m7, m7 + pmaxsw m1, m7 + pavgw m7, m1 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*61] + mov dstq, [esp+4*62] + add srcq, 8 + add dstq, 8 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + movzx hd, ww + sub wd, 1<<16 +%else +.hv_w4_loop: + mova m15, [tmp+16*1] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + mova m7, [tmp+16*2] + mova m1, 
m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 9 + psrad m15, 9 + packssdw m14, m15 + pxor m7, m7 + pmaxsw m14, m7 + pavgw m7, m14 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .hv_w4_loop0 + RET +%undef tmp + %macro BIDIR_FN 0 call .main jmp wq From 4ccf021b746fa3245177c8299a50b41ea5831adc Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 20 May 2021 16:41:38 +0200 Subject: [PATCH 110/188] x86: Add high bitdepth prep_8tap SSSE3 asm --- src/x86/mc16_sse.asm | 579 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 579 insertions(+) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 0ff321ca2b..f01de8c514 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -62,6 +62,8 @@ put_bilin_h_rnd: times 4 dw 8 bidir_rnd: times 4 dw -16400 times 4 dw -16388 put_8tap_h_rnd: dd 34, 34, 40, 40 +prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) +prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) @@ -1947,6 +1949,583 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my RET %undef tmp +%if ARCH_X86_32 +DECLARE_REG_TMP 2, 1, 6, 4 +%elif WIN64 +DECLARE_REG_TMP 6, 4, 7, 4 +%else +DECLARE_REG_TMP 6, 7, 7, 8 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r2b +%define myd r2 +%define myq r2 +%else +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%endif +%define base t2-prep_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, prep_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + test mxd, 0xf00 + jnz .h + movifnidn hd, hm + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov myd, r7m ; bitdepth_max + movzx wd, word [base+prep_ssse3_table+wq*2] + mova m5, [base+pw_8192] + shr myd, 11 + add wq, t2 + movddup m4, [base+prep_mul+myq*8] + movifnidn ssq, ssmp + movifnidn tmpq, tmpmp + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movifnidn hd, r4m + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + jne .h_w8 + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + mova m3, [base+spel_h_shufA] + mova m4, [base+spel_h_shufB] + movifnidn tmpq, tmpmp + 
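; prep_8tap_1d_rnd = 8 - (8192 << 4) folds the prep bias into the rounding
; constant: (f + 8 - (8192 << 4)) >> 4  ==  ((f + 8) >> 4) - 8192. The 10 bpc
; path pre-shifts the coefficients by 2 (psllw above), so the same shift yields
; ((f + 2) >> 2) - 8192 there; prep_8tap_2d_rnd plays the same role for the
; two-pass case, with the bias split across the two passes.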
sub srcq, 2 + WIN64_SPILL_XMM 8 + punpcklbw m0, m0 + psraw m0, 8 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw m0, 2 +.h_w4_12bpc: + pshufd m6, m0, q1111 + pshufd m7, m0, q2222 +.h_w4_loop: + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + WIN64_SPILL_XMM 11 + shr mxd, 16 + movq m2, [base+subpel_filters+mxq*8] + mova m4, [base+spel_h_shufA] + mova m6, [base+spel_h_shufB] + movifnidn tmpq, r0mp + add wd, wd + punpcklbw m2, m2 + add srcq, wq + psraw m2, 8 + add tmpq, wq + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*3 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 + pshufd m10, m2, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6- 6] + movu m1, [srcq+r6+ 2] + pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 + pshufb m0, m6 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m7 ; abcd0 + pmaddwd m0, m8 ; abcd1 + pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 + pshufb m1, m6 ; 6 7 7 8 8 9 9 a + paddd m2, m5 + paddd m0, m2 + pmaddwd m2, m9, m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m2 + pmaddwd m2, m10, m1 ; abcd3 + pmaddwd m1, m8 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6+10] + paddd m3, m5 + paddd m1, m3 + pshufb m3, m2, m4 ; a b b c c d d e + pshufb m2, m6 ; 8 9 9 a a b b c + pmaddwd m3, m9 ; efgh2 + pmaddwd m2, m10 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + WIN64_SPILL_XMM 15 + movddup m7, [base+prep_8tap_1d_rnd] + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 +.v_12bpc: +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_64 + mov r7, tmpq +%elif STACK_ALIGNMENT < 16 + mov [esp+4*29], tmpq +%endif + lea wd, [wq+hq-(1<<8)] +.v_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m5, [srcq+ssq*0] + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_loop_start +.v_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; 
b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m7 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m7 + paddd m2, m3 + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*29] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*29], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.v_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m7 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m7 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + movq [tmpq+r6*0], m12 + movhps [tmpq+r6*2], m12 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .v_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + movzx t3d, mxb + shr mxd, 16 + cmp wd, 4 + cmove mxd, t3d + movifnidn hd, r4m + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov ssq, r2mp + mov tmpq, r0mp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + mova m4, [base+prep_8tap_2d_rnd] + ALLOC_STACK -16*14 + mova m8, m0 + mova m9, m1 + mova m14, m4 +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m0, 4 + psraw m3, 8 + test dword r7m, 0x800 + jz .hv_10bpc + psraw m0, 2 +.hv_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_32 + %define tmp esp+16*8 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], tmpq +%endif + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-88 ; red zone +%endif + mov r7, tmpq + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + lea wd, [wq+hq-(1<<8)] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +.hv_loop0: +%if ARCH_X86_64 + mova m14, [prep_8tap_2d_rnd] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m6, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 4, 1, 0, 6 + PUT_8TAP_HV_H 5, 2, 0, 6 + PUT_8TAP_HV_H 6, 3, 0, 6 + movu m7, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m1, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 7, 2, 
0, 6 + PUT_8TAP_HV_H 1, 3, 0, 6 + movu m2, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 2, 3, 0, 6 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 6 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_loop_start +.hv_loop: + mova m1, [tmp+16*5] + mova m2, m15 +.hv_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*5], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m14 + paddd m2, m14 + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 6 + psrad m2, 6 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*61] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*61], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.hv_loop: + mova m15, [tmp+16*1] + mova m7, [prep_8tap_2d_rnd] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + paddd m14, m7 + paddd m15, m7 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 6 + psrad m15, 6 + packssdw m14, m15 + movq [tmpq+r6*0], m14 + movhps [tmpq+r6*2], m14 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .hv_loop0 + RET +%undef tmp + %macro BIDIR_FN 0 call .main jmp wq From 5377fbd32c6634292f400a698c2c7753de1ccc6f Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Thu, 3 Jun 2021 11:54:03 -0400 Subject: [PATCH 111/188] Do avx2/hbd scaling*grain multiplication in 16bit instead of 32bit --- src/x86/film_grain16_avx2.asm | 163 ++++++++++------------------------ 1 file changed, 47 insertions(+), 116 deletions(-) diff --git a/src/x86/film_grain16_avx2.asm b/src/x86/film_grain16_avx2.asm index 58225b40d8..6f4a4aa5fb 100644 --- a/src/x86/film_grain16_avx2.asm +++ b/src/x86/film_grain16_avx2.asm @@ -29,7 +29,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_0x10000: times 8 dd 0x10000 pw_1024: times 16 dw 1024 pw_23_22: times 8 dw 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 @@ -844,7 +843,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra mov r7d, [fg_dataq+FGData.scaling_shift] lea r8, [pb_mask] %define base r8-pb_mask - vpbroadcastw m11, [base+round_vals+r7*2-12] + vpbroadcastw m11, [base+mul_bits+r7*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, r9m ; bdmax sar r9d, 11 ; is_12bpc @@ -854,7 +853,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra lea r9d, [r6d*2+r9d] vpbroadcastw m12, [base+max+r9*2] vpbroadcastw m10, r9m - mov r9mp, r7 pxor m2, m2 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ @@ -921,27 +919,17 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra vpgatherdd m5, [scalingq+m6-3], m3 vpgatherdd m6, [scalingq+m7-3], m9 REPX {psrld x, 24}, m8, m4, m5, m6 - REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] movu m3, [grain_lutq+offxyq*2+32] ; noise = round2(scaling[src] * grain, scaling_shift) - ; the problem here is that since the grain is 10-bits, the product of - ; scaling*grain is 17+sign bits, so we need to unfortunately do some - ; of these steps in 32-bits - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r9m}, m9, m7, m3, m8 - packssdw m9, m7 - packssdw m3, m8 + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1014,7 +1002,8 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra vpgatherdd m5, [scalingq+m6-3], m3 vpgatherdd m6, [scalingq+m7-3], m9 REPX {psrld x, 24}, m8, m4, m5, m6 - REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] @@ -1033,17 +1022,9 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra movu m3, [grain_lutq+offxyq*2+32] ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r9m}, m9, m7, m3, m8 - packssdw m9, m7 - packssdw m3, m8 + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1167,16 +1148,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra vpgatherdd m6, [scalingq+m4-3], m3 vpgatherdd m4, [scalingq+m5-3], m9 REPX {psrld x, 24}, m6, m4 - REPX {por x, [pd_0x10000]}, m6, m4 + packssdw m6, m4 ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m9, m7, m11 - punpcklwd m7, m11 - pmaddwd m6, m7 - pmaddwd m4, m9 - - 
REPX {psrad x, r9m}, m6, m4 - packssdw m6, m4 + pmullw m6, m11 + pmulhrsw m6, m7 ; same for the other half pminuw m1, m10, [srcq+32] ; m0-1: src as word @@ -1187,16 +1163,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra vpgatherdd m5, [scalingq+m4-3], m3 vpgatherdd m4, [scalingq+m9-3], m7 REPX {psrld x, 24}, m5, m4 - REPX {por x, [pd_0x10000]}, m5, m4 - - punpckhwd m9, m8, m11 - punpcklwd m8, m11 - pmaddwd m5, m8 - pmaddwd m4, m9 - - REPX {psrad x, r9m}, m5, m4 packssdw m5, m4 + pmullw m5, m11 + pmulhrsw m5, m8 + ; dst = clip_pixel(src, noise) paddw m0, m6 paddw m1, m5 @@ -1313,15 +1284,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra pcmpeqw m9, m9 vpgatherdd m4, [scalingq+m5-3], m9 REPX {psrld x, 24}, m6, m4 - REPX {por x, [pd_0x10000]}, m6, m4 + packssdw m6, m4 ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m9, m7, m11 - punpcklwd m7, m11 - pmaddwd m9, m4 - pmaddwd m7, m6 - REPX {psrad x, r9m}, m9, m7 - packssdw m7, m9 + pmullw m6, m11 + pmulhrsw m7, m6 ; other half punpckhwd m5, m1, m2 @@ -1333,15 +1300,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra pcmpeqw m6, m6 vpgatherdd m4, [scalingq+m5-3], m6 REPX {psrld x, 24}, m9, m4 - REPX {por x, [pd_0x10000]}, m9, m4 + packssdw m9, m4 ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m6, m3, m11 - punpcklwd m3, m11 - pmaddwd m6, m4 - pmaddwd m3, m9 - REPX {psrad x, r9m}, m6, m3 - packssdw m3, m6 + pmullw m9, m11 + pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m7 @@ -1378,7 +1341,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %define base r8-pb_mask lea r8, [pb_mask] mov r7d, [fg_dataq+FGData.scaling_shift] - vpbroadcastw m11, [base+round_vals+r7*2-12] + vpbroadcastw m11, [base+mul_bits+r7*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, r13m ; bdmax sar r9d, 11 ; is_12bpc @@ -1391,7 +1354,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpbroadcastw m12, [base+max+r10*2] vpbroadcastw m10, r13m pxor m2, m2 - mov r13mp, r7 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl @@ -1510,24 +1472,17 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpgatherdd m5, [scalingq+m6-3], m3 vpgatherdd m6, [scalingq+m7-3], m9 REPX {psrld x, 24}, m8, m4, m5, m6 - REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] movu m3, [grain_lutq+offxyq*2+82*2] ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m9, m7, m3, m8 - packssdw m9, m7 - packssdw m3, m8 + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1655,15 +1610,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pcmpeqw m7, m7 vpgatherdd m4, [scalingq+m5-3], m7 REPX {psrld x, 24}, m8, m4 - REPX {por x, [pd_0x10000]}, m8, m4 + packssdw m8, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - REPX {psrad x, r13m}, m9, m7 - packssdw m9, m7 + pmullw m8, m11 + pmulhrsw m9, m8 ; same for the other half punpckhwd m7, m6, m2 @@ -1673,15 +1624,11 @@ cglobal 
fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpgatherdd m5, [scalingq+m6-3], m8 vpgatherdd m6, [scalingq+m7-3], m4 REPX {psrld x, 24}, m5, m6 - REPX {por x, [pd_0x10000]}, m5, m6 + packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m3, m8 - packssdw m3, m8 + pmullw m5, m11 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1841,15 +1788,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pcmpeqw m7, m7 vpgatherdd m4, [scalingq+m5-3], m7 REPX {psrld x, 24}, m8, m4 - REPX {por x, [pd_0x10000]}, m8, m4 + packssdw m8, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - REPX {psrad x, r13m}, m9, m7 - packssdw m9, m7 + pmullw m8, m11 + pmulhrsw m9, m8 ; same for the other half punpckhwd m7, m6, m2 @@ -1859,16 +1802,12 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpgatherdd m5, [scalingq+m6-3], m8 vpgatherdd m6, [scalingq+m7-3], m4 REPX {psrld x, 24}, m5, m6 - REPX {por x, [pd_0x10000]}, m5, m6 + packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) movu m3, [grain_lutq+offxyq*2+82*2] - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m3, m8 - packssdw m3, m8 + pmullw m5, m11 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -2025,15 +1964,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pcmpeqw m7, m7 vpgatherdd m4, [scalingq+m5-3], m7 REPX {psrld x, 24}, m8, m4 - REPX {por x, [pd_0x10000]}, m8, m4 + packssdw m8, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - REPX {psrad x, r13m}, m9, m7 - packssdw m9, m7 + pmullw m8, m11 + pmulhrsw m9, m8 ; same for the other half punpckhwd m7, m6, m2 @@ -2043,15 +1978,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpgatherdd m5, [scalingq+m6-3], m8 vpgatherdd m6, [scalingq+m7-3], m4 REPX {psrld x, 24}, m5, m6 - REPX {por x, [pd_0x10000]}, m5, m6 + packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m3, m8 - packssdw m3, m8 + pmullw m5, m11 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 From f9e9559aadb6f5ce62af7cb216ae699178ffefaf Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Wed, 19 May 2021 12:16:09 +0200 Subject: [PATCH 112/188] x86: Add high bitdepth wiener filter SSSE3 asm --- src/x86/looprestoration16_avx2.asm | 2 +- src/x86/looprestoration16_sse.asm | 1125 ++++++++++++++++++++++++++++ 2 files changed, 1126 insertions(+), 1 deletion(-) create mode 100644 src/x86/looprestoration16_sse.asm diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm index c1ebdc487d..207784e2b3 100644 --- a/src/x86/looprestoration16_avx2.asm +++ b/src/x86/looprestoration16_avx2.asm @@ -56,7 +56,7 @@ pd_8: dd 8 pd_25: dd 25 pd_4096: dd 4096 pd_34816: dd 34816 -pd_m262128 dd -262128 +pd_m262128: dd -262128 pd_0xf00800a4: dd 0xf00800a4 pd_0xf00801c7: dd 0xf00801c7 diff --git a/src/x86/looprestoration16_sse.asm b/src/x86/looprestoration16_sse.asm new file mode 100644 index 0000000000..0da068b11b --- /dev/null +++ 
b/src/x86/looprestoration16_sse.asm @@ -0,0 +1,1125 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +pb_m10_m9: times 8 db -10, -9 +pb_m6_m5: times 8 db -6, -5 +pb_m2_m1: times 8 db -2, -1 +pb_2_3: times 8 db 2, 3 +pb_6_7: times 8 db 6, 7 +pd_m262128: times 4 dd -262128 + +wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 +wiener_round: dd 1049600, 1048832 + +SECTION .text + +INIT_XMM ssse3 +%if ARCH_X86_32 +DECLARE_REG_TMP 4, 6 + %if STACK_ALIGNMENT < 16 + %assign stack_size 13*16+384*12 + %else + %assign stack_size 11*16+384*12 + %endif +cglobal wiener_filter7_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ + lpf, lpf_stride, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+16*10+0] + %define lpf_stridem dword [esp+calloff+16*10+4] + %define wm dword [esp+calloff+16*10+8] + %define hd dword [esp+calloff+16*10+12] + %define edgeb byte [esp+calloff+16*10+16] + %else + %define hd dword r6m + %define edgeb byte r8m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t5m dword [esp+calloff+4*6] + %define t6m dword [esp+calloff+4*7] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define t5 t5m + %define t6 t6m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define 
m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+24] + mov lpf_stridem, lpf_strideq + mov wm, wd + mov r4, [rstk+stack_offset+28] + mov hd, r4 + mov r4, [rstk+stack_offset+36] + mov [esp+16*11], r4 ; edge + %endif +%else +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers +cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, fltmp + mov edged, r8m + mov hd, r6m + mov t3d, r9m ; pixel_max + movq m13, [fltq] + movq m15, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+32] + mov t1, [rstk+stack_offset+40] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + mov PICmem, t0 + %else + LEA t0, wiener_shifts + mov fltq, r7m + movq m1, [fltq] + movq m3, [fltq+16] + mov t1, r9m ; pixel_max + mov PICmem, t0 + %endif +%endif + mova m6, [base+wiener_shufA] + mova m7, [base+wiener_shufB] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + pshufd m12, m13, q0000 ; x0 x1 + pshufd m13, m13, q1111 ; x2 x3 + pshufd m14, m15, q0000 ; y0 y1 + pshufd m15, m15, q1111 ; y2 y3 + mova m8, [wiener_shufC] + mova m9, [wiener_shufD] + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + shr t3d, 11 + %define base t4-wiener_shifts + movd m10, [base+wiener_round+t3*4] + movq m11, [base+wiener_shifts+t3*8] + pshufd m10, m10, q0000 + pshufd m0, m11, q0000 + pshufd m11, m11, q1111 + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] + %define base +%else + add wd, wd + mova m4, [base+wiener_shufC] + mova m5, [base+wiener_shufD] + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pshufd m2, m3, q0000 + pshufd m3, m3, q1111 + mova m8, m4 + mova m9, m5 + mova m14, m2 + mova m15, m3 + shr t1, 11 + add lpfq, wq + movd m4, [base+wiener_round+t1*4] + movq m5, [base+wiener_shifts+t1*8] + %if STACK_ALIGNMENT < 16 + lea t1, [esp+16*12+wq+16] + %else + lea t1, [esp+16*10+wq+16] + %endif + add dstq, wq + neg wq + pshufd m4, m4, q0000 + pshufd m2, m5, q0000 + pshufd m5, m5, q1111 + mov wm, wq + pmullw m0, m2 + pmullw m1, m2 + mova m10, m4 + mova m11, m5 + mova m12, m0 + mova m13, m1 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + add lpfq, lpf_stridem +%endif + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top +%if ARCH_X86_64 + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + mov lpf_stridem, lpf_strideq + add r7, lpf_strideq + mov lpfm, r7 ; below +%else + mov t4m, t1 + mov t0, lpf_stridem + lea t1, [lpfq+t0*4] + mov lpfq, dstq + add t1, t0 + mov lpfm, t1 ; below + mov t1, t4m + mov t0, PICmem + add t1, 384*2 +%endif + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, lpfm + call .hv_bottom + add lpfq, lpf_stridem + call .hv_bottom +.v1: + call .v + RET +.no_top: +%if ARCH_X86_64 + 
lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov lpf_stridem, lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov lpfm, r7 + call .h +%else + mov t1m, t1 + mov t0, lpf_stridem + lea t1, [lpfq+t0*4] + mov lpfq, dstq + lea t1, [t1+t0*2] + mov lpfm, t1 + mov t0, PICmem + mov t1, t1m + call .h +%endif + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +%if ARCH_X86_32 + mov wq, wm +%endif +.v2: + call .v +%if ARCH_X86_32 + mov wq, wm +%endif + jmp .v1 +.extend_right: +%assign stack_offset_tmp stack_offset +%assign stack_offset stack_offset+8 +%assign calloff 8 + pxor m0, m0 + movd m1, wd + mova m2, [base+pb_0to15] + pshufb m1, m0 + mova m0, [base+pb_6_7] + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + mova m0, [base+pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + mova m0, [base+pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: +%if ARCH_X86_64 + mov wq, r5 +%else + mov wq, wm +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, [base+wiener_lshuf7] ; before the start of the buffer + jmp .h_main +.h_top: +%if ARCH_X86_64 + mov wq, r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-8] +.h_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, [base+pd_m262128] ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop +%if ARCH_X86_32 + mov wq, wm +%endif + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq +%if ARCH_X86_64 + mov wq, r5 +%else + mov t0m, t0 + mov t1m, t1 + mov t0, PICmem +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, [base+wiener_lshuf7] + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + mov wq, r5 +%else + mov t0m, t0 + mov t1m, t1 + mov t0, PICmem +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-8] +.hv_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: +%if ARCH_X86_32 + mov t1, t4m +%endif + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, [base+pd_m262128] + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 +%if ARCH_X86_64 + mova m2, [t4+wq] + paddw m2, 
[t2+wq] + mova m5, [t3+wq] +%else + mov t0, t0m + mova m2, [t1+wq] + mov t1, t2m + paddw m2, [t1+wq] + mov t1, t3m + mova m5, [t1+wq] + mov t1, t5m +%endif + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 +%if ARCH_X86_64 + mova m4, [t5+wq] + paddw m4, [t1+wq] + psraw m0, 1 + paddw m3, m0, [t6+wq] +%else + mova m4, [t1+wq] + mov t1, t1m + paddw m4, [t1+wq] + psraw m0, 1 + mov t1, t6m + paddw m3, m0, [t1+wq] +%endif + mova [t0+wq], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 +%if ARCH_X86_64 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + jge .hv_end + mov t0, PICmem + jmp .hv_loop +.hv_end: + mov r5, t5m + mov t1, t4m + mov t6m, r5 + mov t5m, t1 + mov r5, t3m + mov t1, t2m + mov t4m, r5 + mov t3m, t1 + mov r5, t1m + mov t1, t0 + mov t2m, r5 + mov t0, t6m + mov wq, wm +%endif + add dstq, dst_strideq + ret +.v: +%if ARCH_X86_64 + mov wq, r5 +.v_loop: + mova m1, [t4+wq] + paddw m1, [t2+wq] + mova m2, [t3+wq] + mova m4, [t1+wq] + paddw m3, m4, [t6+wq] + paddw m4, [t5+wq] +%else + mov t1m, t1 +.v_loop: + mov t1, t4m + mova m1, [t1+wq] + mov t1, t2m + paddw m1, [t1+wq] + mov t1, t3m + mova m2, [t1+wq] + mov t1, t1m + mova m4, [t1+wq] + mov t1, t6m + paddw m3, m4, [t1+wq] + mov t1, t5m + paddw m4, [t1+wq] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .v_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t1, t5m + mov r5, t4m + mov t6m, t1 + mov t5m, r5 + mov t1, t3m + mov r5, t2m + mov t4m, t1 + mov t3m, r5 + mov t1, t1m + mov t2m, t1 +%endif + add dstq, dst_strideq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign stack_size 12*16+384*8 + %else + %assign stack_size 11*16+384*8 + %endif +cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ + lpf, lpf_stride, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+4*6] + %define lpf_stridem dword [esp+calloff+4*7] + %define wm dword [esp+calloff+16*10+0] + %define hd dword [esp+calloff+16*10+4] + %define edgeb byte [esp+calloff+16*10+8] + %else + %define hd dword r6m + %define edgeb byte r8m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+24] + mov lpf_stridem, lpf_strideq + mov wm, wd + mov r4, [rstk+stack_offset+28] + mov 
hd, r4 + mov r4, [rstk+stack_offset+36] + mov [esp+16*10+8], r4 ; edge + %endif +%else +cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, fltmp + mov edged, r8m + mov hd, r6m + mov t3d, r9m ; pixel_max + movq m12, [fltq] + movq m14, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+32] + mov t1, [rstk+stack_offset+40] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + mov PICmem, t0 + %else + LEA t0, wiener_shifts + mov fltq, r7m + movq m1, [fltq] + movq m3, [fltq+16] + mov t1, r9m ; pixel_max + mov PICmem, t0 + %endif +%endif + mova m5, [base+wiener_shufE] + mova m6, [base+wiener_shufB] + mova m7, [base+wiener_shufD] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + punpcklwd m11, m12, m12 + pshufd m11, m11, q1111 ; x1 + pshufd m12, m12, q1111 ; x2 x3 + punpcklwd m13, m14, m14 + pshufd m13, m13, q1111 ; y1 + pshufd m14, m14, q1111 ; y2 y3 + shr t3d, 11 + mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + %define base t4-wiener_shifts + movd m9, [base+wiener_round+t3*4] + movq m10, [base+wiener_shifts+t3*8] + pshufd m9, m9, q0000 + pshufd m0, m10, q0000 + pshufd m10, m10, q1111 + mova m15, [wiener_lshuf5] + pmullw m11, m0 + pmullw m12, m0 + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] + %define base +%else + add wd, wd + punpcklwd m0, m1, m1 + pshufd m0, m0, q1111 ; x1 + pshufd m1, m1, q1111 ; x2 x3 + punpcklwd m2, m3, m3 + pshufd m2, m2, q1111 ; y1 + pshufd m3, m3, q1111 ; y2 y3 + mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) + mova m13, m2 + mova m14, m3 + mova m8, m4 + shr t1, 11 + add lpfq, wq + movd m2, [base+wiener_round+t1*4] + movq m3, [base+wiener_shifts+t1*8] + %if STACK_ALIGNMENT < 16 + lea t1, [esp+16*11+wq+16] + %else + lea t1, [esp+16*10+wq+16] + %endif + add dstq, wq + neg wq + pshufd m2, m2, q0000 + pshufd m4, m3, q0000 + pshufd m3, m3, q1111 + mov wm, wq + pmullw m0, m4 + pmullw m1, m4 + mova m4, [base+wiener_lshuf5] + mova m9, m2 + mova m10, m3 + mova m11, m0 + mova m12, m1 + mova m15, m4 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + add lpfq, lpf_stridem +%endif + mov t4, t1 + add t1, 384*2 + call .h_top +%if ARCH_X86_64 + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + mov lpf_stridem, lpf_strideq + add r7, lpf_strideq + mov lpfm, r7 ; below +%else + mov t3m, t1 + mov t0, lpf_stridem + lea t1, [lpfq+t0*4] + mov lpfq, dstq + add t1, t0 + mov lpfm, t1 ; below + mov t1, t3m + add t1, 384*2 +%endif + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, lpfm + call .hv_bottom + add lpfq, lpf_stridem + call .hv_bottom +.end: + RET +.no_top: +%if ARCH_X86_64 + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov lpf_stridem, lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov lpfm, r7 + call .h +%else + mov t1m, t1 + mov t0, lpf_stridem + lea t1, [lpfq+t0*4] + mov lpfq, dstq + lea t1, [t1+t0*2] + mov lpfm, t1 + mov t1, t1m + call .h +%endif + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 
384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t3m + mov r5, t2m + mov t1, t1m + mov t4m, t0 + mov t3m, r5 + mov t2m, t1 + mov wq, wm +%endif + add dstq, dst_strideq +.v1: + call .v + jmp .end +.extend_right: +%assign stack_offset_tmp stack_offset +%assign stack_offset stack_offset+8 +%assign calloff 8 +%if ARCH_X86_32 + mov t0, PICmem +%endif + pxor m1, m1 + movd m2, wd + mova m0, [base+pb_2_3] + pshufb m2, m1 + mova m1, [base+pb_m6_m5] + psubb m0, m2 + psubb m1, m2 + mova m2, [base+pb_0to15] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: +%if ARCH_X86_64 + mov wq, r5 +%else + mov wq, wm +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, m15 ; before the start of the buffer + jmp .h_main +.h_top: +%if ARCH_X86_64 + mov wq, r5 +%else + mov wq, wm +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-4] +.h_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop +%if ARCH_X86_32 + mov wq, wm +%endif + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq +%if ARCH_X86_64 + mov wq, r5 +%else + mov t0m, t0 + mov t1m, t1 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + mov wq, r5 +%else + mov t0m, t0 + mov t1m, t1 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-4] +.hv_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: +%if ARCH_X86_32 + mov t1, t1m + mov t0, t3m +%endif + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 +%if ARCH_X86_64 + mova m2, [t3+wq] + paddw m2, [t1+wq] + paddd m1, m3 + mova m4, [t2+wq] +%else + mova m2, [t0+wq] + mov t0, t2m + paddw m2, [t1+wq] + mov t1, t4m + paddd m1, m3 + mova m4, [t0+wq] + mov t0, t0m +%endif + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 +%if ARCH_X86_64 + mova m4, [t4+wq] +%else + mova m4, [t1+wq] +%endif + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+wq], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova 
[dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 +%else + mov r5, t3m + mov t1, t2m + mov t4m, r5 + mov t3m, t1 + mov r5, t1m + mov t1, t0 + mov t2m, r5 + mov t0, t4m + mov wq, wm +%endif + add dstq, dst_strideq + ret +.v: +%if ARCH_X86_64 + mov wq, r5 +.v_loop: + mova m0, [t1+wq] + paddw m2, m0, [t3+wq] + mova m1, [t2+wq] + mova m4, [t4+wq] +%else + mov t1m, t1 +.v_loop: + mov t0, t3m + mova m0, [t1+wq] + mov t1, t2m + paddw m2, m0, [t0+wq] + mov t0, t4m + mova m1, [t1+wq] + mova m4, [t0+wq] +%endif + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 +%if ARCH_X86_64 + jl .v_loop +%else + jge .v_end + mov t1, t1m + jmp .v_loop +.v_end: +%endif + ret From 65ef757f4e7a6ae1fc4a89290488f3f76cea5b1b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 9 Jun 2021 09:43:51 -0400 Subject: [PATCH 113/188] mc: add HBD/SSSE3 mc.emu_edge optimizations --- src/x86/mc16_sse.asm | 360 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 360 insertions(+) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index f01de8c514..6b1869a97d 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -3782,3 +3782,363 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask inc hq jl .w128 RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + lea reg_src, [reg_src+reg_tmp*2] +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + 
; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix + %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, mmsize/2 + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3*2] + %else + mov r1, srcm + movu m0, [r1+r3*2] + %endif +%if %1 + movu [reg_tmp+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, mmsize/2 + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + lea reg_tmp, [reg_tmp+centerwq*2] +%else + lea reg_tmp, [dstq+centerwq*2] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq*2-2] + %else + mov r3, srcm + movd m0, [r3+centerwq*2-2] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3*2], m0 + add r3, mmsize/2 + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define 
reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1*2] + lea r3, [dstq+r1*2] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1*2] +%else + mov r3, reg_blkm + mova m0, [r3+r1*2] +%endif + lea r3, [dstq+r1*2] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp From c830bd9d16b46dd680294b96a327df56f897ce92 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 1 Jun 2021 07:15:32 -0400 Subject: [PATCH 114/188] Add 10/12-bit deblock SSSE3 implementation Currently 64-bit only. --- src/x86/loopfilter16_sse.asm | 1135 ++++++++++++++++++++++++++++++++++ 1 file changed, 1135 insertions(+) create mode 100644 src/x86/loopfilter16_sse.asm diff --git a/src/x86/loopfilter16_sse.asm b/src/x86/loopfilter16_sse.asm new file mode 100644 index 0000000000..ff8cb722dc --- /dev/null +++ b/src/x86/loopfilter16_sse.asm @@ -0,0 +1,1135 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_3: times 8 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 8 dw 4 +pw_16: times 8 dw 16 +pw_8: times 8 dw 8 +pw_4096: times 8 dw 4096 + +pb_mask: dd 1, 1, 2, 2 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro SPLATD 2 + movd %1, %2 + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 + movd %1, %2 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; xmm%1 a b c d e f g h a i q y 6 E M U +; xmm%2 i j k l m n o p b j r z 7 F N V +; xmm%3 q r s t u v w x c k s 0 8 G O W +; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; xmm%6 E F G H I J K L f n v 3 B J R Z +; xmm%7 M N O P Q R S T g o w 4 C K S + +; xmm%8 U V W X Y Z + = h p x 5 D L T = +%macro TRANSPOSE8X8W 9 + ; xmm%1 a b c d e f g h a i q y b j r z + ; xmm%2 i j k l m n o p c k s 0 d l t 1 + ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 + ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; xmm%6 E F G H I J K L 8 G O W 9 H P X + ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z + ; xmm%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; xmm%1 a i q y b j r z a i q y 6 E M U + ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V + ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W + ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X + ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z + ; xmm%7 A I Q Y B J R Z g o w 4 C K S + + ; xmm%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro + +; transpose and write m3-6, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x8 0 + ; transpose 8x4 + punpcklwd m0, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckldq m6, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m3, m5 + punpckhdq m3, m5 + + ; write out + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, 
[dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movq xm3, [dstq+strideq*0-4] + movq xm4, [dstq+strideq*1-4] + movq xm5, [dstq+strideq*2-4] + movq xm6, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq xm11, [tmpq+strideq*0-4] + movq xm13, [tmpq+strideq*1-4] + movq xm14, [tmpq+strideq*2-4] + movq xm15, [tmpq+stride3q -4] + + ; transpose 4x8 + ; xm3: A-D0,A-D4 + ; xm4: A-D1,A-D5 + ; xm5: A-D2,A-D6 + ; xm6: A-D3,A-D7 + punpcklwd m7, m3, m4 + punpcklwd m3, m11, m13 + punpcklwd m4, m5, m6 + punpcklwd m5, m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: A4-5,B4-5,C4-5,D4-5 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: A6-7,B6-7,C6-7,D6-7 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m3, m5 + punpckhdq m3, m5 + SWAP 3, 5 + ; xm6: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm8: A4-7,B4-7 + ; xm5: C4-7,D4-7 + punpcklqdq m3, m6, m8 + punpckhqdq m4, m6, m8 + punpckhqdq m6, m7, m5 + punpcklqdq m7, m5 + SWAP 7, 5 + ; xm3: A0-7 + ; xm4: B0-7 + ; xm5: C0-7 + ; xm6: D0-7 +%elif %1 == 6 || %1 == 8 + movu xm3, [dstq+strideq*0-8] + movu xm4, [dstq+strideq*1-8] + movu xm5, [dstq+strideq*2-8] + movu xm6, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm11, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm11: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklwd m7, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m11, m13 + punpckhwd m11, m13 + punpcklwd m13, m14, m15 + punpckhwd m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: E0-1,F0-1,G0-1,H0-1 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: E2-3,F2-3,G2-3,H2-3 + ; xm6: A4-5,B4-5,C4-5,D4-5 + ; xm11: E4-5,F4-5,G4-5,H4-5 + ; xm13: A6-7,B6-7,C6-7,D6-7 + ; xm14: E6-7,F6-7,G6-7,H6-7 + punpckldq m15, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m3, m5 + punpckhdq m8, m3, m5 + punpckldq m3, m6, m13 + punpckhdq m6, m13 + punpckldq m10, m11, m14 + punpckhdq m11, m14 + ; xm15: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm9: E0-3,F0-3 + ; xm8: G0-3,H0-3 + ; xm3: A4-7,B4-7 + ; xm6: C4-7,D4-7 + ; xm10: E4-7,F4-7 + ; xm11: G4-7,H4-7 +%if %1 != 6 + punpcklqdq m0, m15, m3 +%endif + punpckhqdq m13, m15, m3 + punpcklqdq m3, m7, m6 + punpckhqdq m4, m7, m6 + punpcklqdq m5, m9, m10 + punpckhqdq m6, m9, m10 + punpcklqdq m14, m8, m11 +%if %1 != 6 + punpckhqdq m15, m8, m11 + mova [rsp+5*32], m0 +%endif +%else + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova xm0, [dstq+strideq*0-16] + mova xm1, [dstq+strideq*1-16] + mova xm2, [dstq+strideq*2-16] + mova xm3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova xm4, [tmpq+strideq*0-16] + mova xm5, [tmpq+strideq*1-16] + mova xm6, [tmpq+strideq*2-16] + mova xm7, [tmpq+stride3q -16] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + + mova [rsp+6*32], m0 + mova [rsp+7*32], m1 + mova [rsp+8*32], m2 + mova [rsp+9*32], m3 + mova [rsp+5*32], m4 + + mova xm0, [dstq+strideq*0] + mova xm1, [dstq+strideq*1] + mova xm2, [dstq+strideq*2] + mova xm3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova xm8, [tmpq+strideq*0] + mova xm9, [tmpq+strideq*1] + mova xm10, [tmpq+strideq*2] + mova xm11, [tmpq+stride3q ] + + TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + + mova [rsp+10*32], m8 + mova [rsp+11*32], m9 + mova [rsp+12*32], m10 + mova [rsp+13*32], m11 + + ; 
5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 + SWAP 13, 5, 0 + SWAP 3, 6, 1, 15 + SWAP 4, 7 + SWAP 2, 14 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v +%if cpuflag(sse4) + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else + movq m1, [lq] + movq m0, [lq+l_strideq] + pxor m2, m2 + REPX {punpcklbw x, m2}, m1, m0 +%endif +%else + movq m0, [lq] ; l0, l1 + movq m1, [lq+l_strideq] ; l2, l3 + punpckldq m0, m1 ; l0, l2, l1, l3 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2 + punpckhbw m0, m2 ; l1, l3 +%endif + pcmpeqw m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqw m10, m2, m0 ; !L + psrlw m10, 1 + psrlw m2, m0, [lutq+128] + SPLATW m1, [lutq+136] + pminsw m2, m1 + pmaxsw m2, [pw_1] ; I + psrlw m1, m0, 4 ; H + paddw m0, [pw_2] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [r11]}, m0, m1, m2 + + psubw m8, m3, m4 ; p1-p0 + psubw m9, m5, m6 ; q1-q0 + REPX {pabsw x, x}, m8, m9 + pmaxsw m8, m10 + pmaxsw m8, m9 + pcmpgtw m7, m8, m1 ; hev +%if %1 != 4 + psubw m9, m13, m4 ; p2-p0 + pabsw m9, m9 + pmaxsw m9, m8 +%if %1 != 6 +%ifidn %2, v + mova m11, [tmpq+strideq*0] ; p3 +%else + mova m11, [rsp+5*32] ; p3 +%endif + psubw m10, m11, m4 ; p3-p0 + pabsw m10, m10 + pmaxsw m9, m10 +%endif + psubw m10, m5, m14 ; q2-q0 + pabsw m10, m10 + pmaxsw m9, m10 +%if %1 != 6 + psubw m10, m5, m15 ; q3-q0 + pabsw m10, m10 + pmaxsw m9, m10 +%endif + pcmpgtw m9, [r11] ; !flat8in + + psubw m10, m13, m3 ; p2-p1 + pabsw m10, m10 +%if %1 != 6 + psubw m11, m13 ; p3-p2 + pabsw m11, m11 + pmaxsw m10, m11 + psubw m11, m14, m15 ; q3-q2 + pabsw m11, m11 + pmaxsw m10, m11 +%endif + psubw m11, m14, m6 ; q2-q1 + pabsw m11, m11 + pmaxsw m10, m11 + +%if %1 == 16 + SPLATD m11, [maskq+8] + SPLATD m1, [maskq+4] + por m11, m1 + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 +%else + SPLATD m11, [maskq+4] + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxsw m8, m10 +%endif + pcmpgtw m8, m2 + + psubw m10, m3, m6 ; p1-q1 + psubw m11, m4, m5 ; p0-q0 + REPX {pabsw x, x}, m10, m11 + paddw m11, m11 + psrlw m10, 1 + paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else + mova m0, [rsp+7*32] + mova m1, [rsp+8*32] + mova m2, [rsp+9*32] +%endif + REPX {psubw x, m4}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxsw m1, m0 + pmaxsw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m10, [tmpq+strideq*2] +%else + mova m0, [rsp+10*32] + mova m2, [rsp+11*32] + mova m10, [rsp+12*32] +%endif + REPX {psubw x, m5}, m0, m2, m10 + REPX {pabsw x, x}, m0, m2, m10 + pmaxsw m0, m2 + pmaxsw m1, m10 + pmaxsw m1, m0 + pcmpgtw m1, [r11] ; !flat8out + por m1, m9 ; !flat8in | !flat8out + SPLATD m2, [maskq+8] + pand m10, m2, m12 + pcmpeqd m10, m12 + pandn m1, m10 ; flat16 + pandn m10, m8, m1 ; flat16 & fm + SWAP 1, 10 + + SPLATD m10, [maskq+4] + por m10, m2 + pand m2, m10, m12 + pcmpeqd m2, m12 + pandn m9, m2 ; flat8in + pandn m2, m8, m9 + SWAP 2, 9 + SPLATD m2, [maskq+0] + por m2, m10 + pand m2, m12 + pcmpeqd m2, m12 + pandn m8, m2 + pandn m0, m9, m8 ; fm & !flat8 & !flat16 + SWAP 0, 8 + pandn m0, m1, m9 ; flat8 & !flat16 + SWAP 0, 9 +%elif %1 != 4 + SPLATD m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m9, m2 + pandn m2, m8, m9 ; flat8 & fm + SWAP 
2, 9 + SPLATD m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 + pandn m0, m9, m8 ; fm & !flat8 + SWAP 0, 8 +%else + SPLATD m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 ; fm +%endif + + ; short filter + + SPLATW m0, r7m + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; -512 or -2048 + + psubw m10, m5, m4 + paddw m11, m10, m10 + paddw m11, m10 + psubw m10, m3, m6 ; iclip_diff(p1-q1) + pminsw m10, m0 + pmaxsw m10, m2 + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m10, m0 + pmaxsw m10, m2 + pand m8, m10 ; f&=fm + paddw m10, m8, [pw_3] + paddw m8, [pw_4] + REPX {pminsw x, m0}, m10, m8 + psraw m10, 3 ; f2 + psraw m8, 3 ; f1 + paddw m4, m10 + psubw m5, m8 + + paddw m8, [pw_1] + psraw m8, 1 ; f=(f1+1)>>1 + pandn m7, m8 ; f&=!hev + SWAP 7, 8 + paddw m3, m8 + psubw m6, m8 + pxor m8, m8 + psubw m0, m2 ; 1023 or 4095 + REPX {pminsw x, m0}, m3, m4, m5, m6 + REPX {pmaxsw x, m8}, m3, m4, m5, m6 + +%if %1 == 16 + +; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2,7-8,10-11 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m11, [tmpq+strideq*4] ; p3 +%else + mova m0, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m11, [rsp+5*32] +%endif + + mova [rsp+ 0*32], m9 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m8, m0, 3 ; p6*8 + paddw m8, [pw_8] + paddw m10, m2, m7 ; p5+p4 + psubw m8, m0 + paddw m10, m10 ; (p5+p4)*2 + paddw m8, m11 ; p6*7+p3 + paddw m10, m13 ; (p5+p4)*2+p2 + paddw m8, m3 ; p6*7+p3+p1 + paddw m10, m4 ; (p5+p4)*2+p2+p0 + paddw m8, m5 ; p6*7+p3+p1+q0 + paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m2 + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m10 ; p5 +%else + mova [rsp+8*32], m10 +%endif + + ; sub p6*2, add p3/q1 + paddw m8, m11 + paddw m10, m0, m0 + paddw m8, m6 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m7 + por m10, m9 +%ifidn %2, v + mova [tmpq+stride3q], m10 ; p4 +%else + mova [rsp+9*32], m10 +%endif + + ; sub p6/p5, add p2/q2 + psubw m8, m0 + paddw m10, m13, m14 + psubw m8, m2 + paddw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m11 + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*4], m10 ; p3 + lea tmpq, [dstq+strideq*4] +%else + mova [rsp+5*32], m10 +%endif + + ; sub p6/p4, add p1/q3 + paddw m8, m3 + paddw m10, m0, m7 + paddw m8, m15 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m13 + por m10, m9 + mova [rsp+1*32], m10 ; don't clobber p2/m13 + + ; sub p6/p3, add p0/q4 + paddw m8, m4 + paddw m10, m0, m11 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m3 + por m10, m9 + mova [rsp+2*32], m10 ; don't clobber p1/m3 + + ; sub p6/p2, add q0/q5 + paddw m8, m5 + paddw m10, m0, m13 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m4 + por m10, m9 + mova [rsp+3*32], m10 ; don't clobber p0/m4 + + ; sub p6/p1, add q1/q6 + paddw m8, m6 + paddw m10, m0, m3 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+12*32] ; q6 +%endif + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m5 + por m10, m9 + mova [rsp+4*32], m10 ; don't clobber q0/m5 + + ; sub p5/p0, add q2/q6 + 
paddw m8, m14 + paddw m10, m2, m4 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m6 + por m2, m10, m9 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m8, m15 + paddw m10, m7, m5 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m14 + por m7, m10, m9 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + paddw m10, m11, m6 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 + pandn m9, m1, m15 + por m10, m9 +%ifidn %2, v + mova [tmpq+mstrideq], m10 ; q3 +%else + mova [rsp+14*32], m10 +%endif + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + paddw m10, m13, m14 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + pand m10, m1 +%ifidn %2, v + pandn m9, m1, [tmpq+strideq*0] +%else + pandn m9, m1, [rsp+10*32] +%endif + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*0], m10 ; q4 +%else + mova [rsp+10*32], m10 +%endif + + ; sub p1/q3, add q6*2 + psubw m8, m3 + paddw m0, m0 + psubw m8, m15 + paddw m8, m0 + psrlw m10, m8, 4 + pand m10, m1 +%ifidn %2, v + pandn m9, m1, [tmpq+strideq*1] +%else + pandn m9, m1, [rsp+11*32] +%endif + por m10, m9 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+11*32], m10 +%endif + + mova m9, [rsp+0*32] + mova m13, [rsp+1*32] + mova m3, [rsp+2*32] + mova m4, [rsp+3*32] + mova m5, [rsp+4*32] + SWAP 2, 6 + SWAP 7, 14 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%else + mova m15, [rsp+14*32] +%endif +%endif + +%if %1 >= 8 + ; flat8 filter +%ifidn %2, v + mova m0, [tmpq+strideq*0] ; p3 +%else + mova m0, [rsp+5*32] ; p3 +%endif + paddw m1, m0, m13 ; p3+p2 + paddw m2, m3, m4 ; p1+p0 + paddw m8, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m8, m5 ; 2*(p3+p2)+q0 + paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [pw_4096] + + paddw m8, m3, m6 + psubw m2, m1 + paddw m2, m8 + pmulhrsw m8, m2, [pw_4096] + + paddw m10, m0, m3 + paddw m11, m4, m14 + psubw m2, m10 + paddw m2, m11 + pmulhrsw m10, m2, [pw_4096] + + paddw m11, m0, m4 + paddw m1, m5, m15 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m11, m2, [pw_4096] + + paddw m2, m6 + paddw m2, m15 + paddw m1, m13, m5 + psubw m2, m1 + pmulhrsw m1, m2, [pw_4096] + + psubw m2, m3 + psubw m2, m6 + paddw m0, m15, m14 + paddw m2, m0 + pmulhrsw m2, [pw_4096] + + REPX {pand x, m9}, m7, m8, m10, m11, m1, m2 +%if avx_enabled + REPX {pandn x, m9}, m13, m3, m4, m5, m6, m14 +%else + pcmpeqw m0, m0 + pxor m0, m9 + REPX {pand x, m0}, m13, m3, m4, m5, m6, m14 +%endif + por m13, m7 + por m3, m8 + por m4, m10 + por m5, m11 + por m6, m1 + por m14, m2 + +%ifidn %2, v + mova [tmpq+strideq*1], m13 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 + mova [dstq+strideq*2], m14 ; q2 +%else + mova m0, [rsp+5*32] +%if %1 == 8 + TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 + + ; write 8x8 + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm3 + movu [dstq+stride3q -8], xm4 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm5 + movu [dstq+strideq*1-8], xm6 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] +%else + mova m0, [rsp+6*32] + mova m1, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m8, [rsp+5*32] + TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 + + mova [dstq+strideq*0-16], xm0 + mova [dstq+strideq*1-16], xm1 + mova 
[dstq+strideq*2-16], xm2 + mova [dstq+stride3q -16], xm7 + lea tmpq, [dstq+strideq*4] + mova [tmpq+strideq*0-16], xm8 + mova [tmpq+strideq*1-16], xm13 + mova [tmpq+strideq*2-16], xm3 + mova [tmpq+stride3q -16], xm4 + + mova m0, [rsp+10*32] + mova m1, [rsp+11*32] + mova m2, [rsp+12*32] + mova m3, [rsp+13*32] + TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 + mova [dstq+strideq*0], xm5 + mova [dstq+strideq*1], xm6 + mova [dstq+strideq*2], xm14 + mova [dstq+stride3q ], xm15 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] +%endif +%endif +%elif %1 == 6 + ; flat6 filter + + paddw m8, m3, m4 + paddw m8, m13 ; p2+p1+p0 + paddw m11, m13, m5 + paddw m8, m8 + paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m8, [pw_4096] + + paddw m8, m5 + paddw m11, m13, m13 + paddw m8, m6 + psubw m8, m11 + pmulhrsw m10, m8, [pw_4096] + + paddw m8, m6 + paddw m11, m13, m3 + paddw m8, m14 + psubw m8, m11 + pmulhrsw m11, m8, [pw_4096] + + psubw m8, m3 + paddw m14, m14 + psubw m8, m4 + paddw m8, m14 + pmulhrsw m8, [pw_4096] + + REPX {pand x, m9}, m2, m10, m11, m8 +%if avx_enabled + REPX {pandn x, m9, x}, m3, m4, m5, m6 +%else + pcmpeqw m0, m0 + pxor m0, m9 + REPX {pand x, m0}, m3, m4, m5, m6 +%endif + por m3, m2 + por m4, m10 + por m5, m11 + por m6, m8 + +%ifidn %2, v + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x8 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x8 +%endif +%endif +%endmacro + +INIT_XMM ssse3 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0x3 + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + FILTER 4, v + +.end: + pslld m12, 2 + add lq, 8 + add dstq, 16 + shl mask_bitsd, 2 + sub wd, 2 + jg .loop + RET + +INIT_XMM ssse3 +cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0x3 + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 + jg .loop + RET + +INIT_XMM ssse3 +cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + mov r6d, r7m + sar r6d, 7 + and 
r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0x3 + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + FILTER 4, v + +.end: + pslld m12, 2 + add lq, 8 + add dstq, 16 + shl mask_bitsd, 2 + sub wd, 2 + jg .loop + RET + +INIT_XMM ssse3 +cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0x3 + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 + jg .loop + RET + +%endif ; ARCH_X86_64 From c8ed4c91d86525cc6ec2a74ac4ccf0b8bda364f1 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 4 Jun 2021 07:11:13 -0400 Subject: [PATCH 115/188] Add SSSE3 HBD filmgrain assembly optimizations --- src/x86/film_grain16_sse.asm | 2192 ++++++++++++++++++++++++++++++++++ 1 file changed, 2192 insertions(+) create mode 100644 src/x86/film_grain16_sse.asm diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm new file mode 100644 index 0000000000..45389c1ec8 --- /dev/null +++ b/src/x86/film_grain16_sse.asm @@ -0,0 +1,2192 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
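+; Overview (summary comment; a rough guide to the routines below, not an
+; exhaustive description): generate_grain_y_16bpc and
+; generate_grain_uv_420_16bpc build the luma and chroma grain templates from
+; the gaussian_sequence table and optionally run an AR(1..3) filter over the
+; buffer, while fgy_32x32xn_16bpc and fguv_32x32xn_i420_16bpc scale that
+; grain through the per-pixel scaling LUT and blend it into the frame, with
+; optional overlap blending between adjacent blocks.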
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 +pd_16: times 4 dd 16 +pw_1: times 8 dw 1 +pw_8192: times 8 dw 8192 +pw_23_22: times 4 dw 23, 22 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +pw_27_17_17_27: dw 27, 17, 17, 27 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 8 +%define %%tmp %8 +%endif +%rep (%6/2) +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %6 == 8 +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4*%7] +%else + pinsrw %1, [%3+%4*%7], %%idx + 0 +%endif + pinsrw %1, [%3+%5*%7], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +%macro SPLATD 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + + +INIT_XMM ssse3 +cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r6d, [bdmaxq+1] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r6 + SPLATW m8, [base+round+r3*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] + mov r3, -73*82*2 + sub bufq, r3 + lea r6, [gaussian_sequence] +.loop: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m6, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m6, 30 + por m2, m6 + psllq m6, m2, 15 + por m2, m6 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 + vpgatherdw m3, m2, r6, r5, r7, 4, 2 + paddw m3, m3 ; otherwise bpc=12 w/ 
grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m8 + movq [bufq+r3], m3 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] + jmp r3 + +.ar1: +%if WIN64 + DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 + lea bufq, [r0-2*(82*73-(82*3+79))] +%elif ARCH_X86_64 + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 + sub bufq, 2*(82*73-(82*3+79)) +%else + ; FIXME shift goes into r1 (x86-32 code) + .. +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] +%if WIN64 + DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 +%elif ARCH_X86_64 + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 +%else + ; x86-32 code + .. +%endif +%if cpuflag(sse4) + pmovsxbw m4, m4 +%else + pxor m3, m3 + pcmpgtb m3, m4 + punpcklbw m4, m3 +%endif + pinsrw m4, [pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + SPLATW m12, [base+round_vals-12+shiftq*2] + movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 + pxor m9, m9 + punpcklwd m12, m9 + pcmpgtb m9, m6 + punpckhbw m10, m6, m9 + punpcklbw m6, m9 + pshufd m9, m6, q3333 + pshufd m8, m6, q2222 + pshufd m7, m6, q1111 + pshufd m6, m6, q0000 + pshufd m11, m10, q1111 + pshufd m10, m10, q0000 + sar bdmaxd, 1 + SPLATW m13, bdmaxd ; max_grain + pcmpeqw m14, m14 +%if !cpuflag(sse4) + pcmpeqw m15, m15 + psrldq m15, 14 + pslldq m15, 2 + pxor m15, m14 +%endif + pxor m14, m13 ; min_grain + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m2, m0, 2 + psrldq m3, m0, 4 + psrldq m4, m0, 6 + psrldq m5, m0, 8 + punpcklwd m0, m2 + punpcklwd m3, m4 + punpcklwd m5, m1 + psrldq m2, m1, 2 + psrldq m4, m1, 4 + punpcklwd m2, m4 + psrldq m4, m1, 6 + psrldq m1, 8 + punpcklwd m4, m1 + pmaddwd m0, m6 + pmaddwd m3, m7 + pmaddwd m5, m8 + pmaddwd m2, m9 + pmaddwd m4, m10 + paddd m0, m3 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 ; accumulated top 2 rows + paddd m0, m12 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m4, m1, q3321 + pxor m2, m2 + pcmpgtw m2, m4 + punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd m2, m1, m11 + paddd m2, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + paddd m2, m4 + packssdw m2, m2 + 
pminsw m2, m13 + pmaxsw m2, m14 + psrldq m4, 4 + pslldq m2, 2 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000010b +%else + pand m1, m15 + pandn m3, m15, m2 + por m1, m3 +%endif + ; overwrite previous pixel, this should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 64 + %define tmp rsp +%else + %define tmp rsp+stack_offset-72 +%endif + sar bdmaxd, 1 + SPLATW m15, bdmaxd ; max_grain + pcmpeqw m14, m14 +%if !cpuflag(sse4) + pcmpeqw m12, m12 + psrldq m12, 14 + pslldq m12, 4 + pxor m12, m14 +%endif + pxor m14, m15 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + + ; build cf0-1 until 18-19 in m5-12 and r0/1 + pxor m1, m1 + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + +%if cpuflag(sse4) + pshufd m12, m2, q3333 +%else + pshufd m13, m2, q3333 + mova [tmp+48], m13 +%endif + pshufd m11, m2, q2222 + pshufd m10, m2, q1111 + pshufd m9, m2, q0000 + pshufd m8, m0, q3333 + pshufd m7, m0, q2222 + pshufd m6, m0, q1111 + pshufd m5, m0, q0000 + + ; build cf20,round in r2 + ; build cf21-23,round*2 in m13 + pxor m1, m1 + movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pcmpgtb m1, m0 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+ 0], m1 + mova [tmp+16], m2 + psrldq m13, m0, 10 + pinsrw m13, [base+round_vals+shiftq*2-10], 3 + pinsrw m0, [base+round_vals+shiftq*2-12], 5 + pshufd m3, m0, q2222 + mova [tmp+32], m3 + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m2 + paddd m0, m3 + ; m0 = top line first 6 multiplied by cf, m1 = top line last entry + + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] + punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + + pmaddwd m1, m8 + pmaddwd m4, m9 + pmaddwd m3, m10 + pmaddwd m2, m11 + paddd m1, m4 + paddd m3, m2 + paddd m0, m1 + paddd m0, m3 + ; m0 = top 2 lines multiplied by cf + + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, [pw_1] + +%if cpuflag(sse4) + pmaddwd m1, m12 +%else + pmaddwd m1, [tmp+48] +%endif + pmaddwd m3, [tmp+ 0] + pmaddwd m4, [tmp+16] + pmaddwd m2, [tmp+32] + paddd m1, m3 + paddd m4, m2 + paddd m0, m1 + paddd m0, m4 + ; m0 = top 3 lines multiplied by cf plus rounding for downshift + + 
movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m15 + pmaxsw m2, m14 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m12 + pandn m3, m12, m2 + por m1, m3 +%endif + ; overwrite a couple of pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%endif + RET + +INIT_XMM ssse3 +cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + lea r6d, [bdmaxq+1] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + SPLATW m8, [base+round+r5*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] + SPLATW m9, [base+pw_seed_xor+uvq*4] + pxor m0, m9 + lea r6, [gaussian_sequence] + mov r7d, 38 + add bufq, 44*2 +.loop_y: + mov r5, -44 +.loop_x: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m6, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m6, 30 + por m2, m6 + psllq m6, m2, 15 + por m2, m6 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 + vpgatherdw m3, m2, r6, r9, r10, 4, 2 + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m8 + movq [bufq+r5*2], m3 + add r5, 4 + jl .loop_x + add bufq, 82*2 + dec r7d + jg .loop_y + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_420_16bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_420_16bpc_ssse3_table] + jmp r5 + +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + SPLATW m3, [base+hmul_bits+shiftq*2-10] + sar bdmaxd, 1 + SPLATW m14, bdmaxd ; max_gain + pcmpeqw m7, m7 + pxor m7, m14 ; min_grain + DEFINE_ARGS buf, bufy, h, x + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 + SPLATW m6, [hmul_bits+4] + SPLATW m4, m4 + pxor m5, m5 +%if !cpuflag(sse4) + pcmpeqw m12, m12 + pslldq m12, 12 +%endif + sub bufq, 2*(82*38+82-(82*3+41)) + add bufyq, 2*(3+82*3) + mov hd, 35 +.y_loop_ar0: + ; first 32 pixels + xor xd, xd +.x_loop_ar0: + movu m8, [bufyq+xq*4] + movu m9, [bufyq+xq*4+82*2] + movu m10, [bufyq+xq*4 +16] + movu m11, [bufyq+xq*4+82*2+16] + paddw m8, m9 + paddw m10, m11 + phaddw m8, m10 + pmulhrsw m8, m6 + punpckhwd m9, m8, m5 + punpcklwd m8, m5 + REPX {pmaddwd x, m4}, m8, m9 + REPX {psrad x, 5}, m8, m9 + packssdw m8, m9 + pmulhrsw m8, m3 + movu m0, [bufq+xq*2] + paddw m8, m0 + pminsw m8, m14 + pmaxsw m8, m7 + cmp xd, 32 + je .end + movu [bufq+xq*2], m8 + add xd, 8 + jmp .x_loop_ar0 + + ; last 6 pixels +.end: +%if cpuflag(sse4) + pblendw m8, m0, 11000000b +%else + pand m0, m12 + pandn m9, m12, m8 + por m8, m0, m9 +%endif + movu [bufq+xq*2], m8 + + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar0 + RET + +.ar1: + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x + imul uvd, 
28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] +%if WIN64 + DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 + lea bufq, [r0-2*(82*38+44-(82*3+41))] +%elif ARCH_X86_64 + DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 + sub bufq, 2*(82*38+44-(82*3+41)) +%else + ; x86-32 code - move shift into r1 [ecx] + .. +%endif + mov shiftd, [r2+FGData.ar_coeff_shift] + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 ; cf0-4 in words + pshuflw m4, m4, q2100 + psrldq m4, 2 ; cf0-3,4 in words + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pxor m6, m6 + punpcklwd m3, m6 + SPLATW m6, [hmul_bits+4] + SPLATD m3, m3 + add bufyq, 2*(79+82*3) + mov hd, 35 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -38 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left + movu m8, [bufyq+xq*4] + movu m9, [bufyq+xq*4+82*2] + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + phaddw m8, m9 + pshufd m9, m8, q3232 + paddw m8, m9 + pmulhrsw m8, m6 + punpcklwd m0, m2 + punpcklwd m1, m8 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar1 + RET + +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + sar bdmaxd, 1 + SPLATW m13, bdmaxd ; max_grain + pcmpeqw m14, m14 +%if !cpuflag(sse4) + pcmpeqw m15, m15 + psrldq m15, 14 + pslldq m15, 2 + pxor m15, m14 +%endif + pxor m14, m13 ; min_grain +%if cpuflag(sse4) + SPLATW m15, [hmul_bits+4] +%endif + + ; coef values + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pinsrw m2, [base+round_vals-12+shiftq*2], 5 + + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m8, m0, q2222 + pshufd m9, m0, q3333 + pshufd m10, m2, q0000 + pshufd m11, m2, q1111 + pshufd m12, m2, q2222 + + DEFINE_ARGS buf, bufy, fg_data, h, x + sub bufq, 2*(82*38+44-(82*3+41)) + add bufyq, 2*(79+82*3) + mov hd, 35 +.y_loop_ar2: + mov xq, -38 + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m4, m0, 2 ; y=-2,x=[-1,+5] + psrldq m1, m0, 4 ; y=-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-2,x=[+1,+5] + psrldq m2, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] + pmaddwd m0, m6 + pmaddwd m1, m7 + pmaddwd m2, m8 + paddd m0, m1 + paddd m0, m2 + psrldq m3, m5, 2 ; y=-1,x=[-1,+5] + psrldq m1, m5, 4 ; y=-1,x=[-0,+5] + psrldq m4, m5, 6 ; y=-1,x=[+1,+5] + psrldq m2, m5, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + punpcklwd m4, m2 + pmaddwd m3, m9 + pmaddwd m4, m10 + paddd m3, m4 + paddd m0, m3 + + ; luma component & rounding + movu m1, [bufyq+xq*4] + movu m2, [bufyq+xq*4+82*2] + phaddw m1, m2 + pshufd m2, m1, q3232 + paddw m1, m2 +%if cpuflag(sse4) + pmulhrsw m1, m15 +%else + pmulhrsw m1, [pw_8192] +%endif + punpcklwd m1, 
[pw_1] + pmaddwd m1, m12 + paddd m0, m1 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m2, m1, q3321 + pxor m3, m3 + pcmpgtw m3, m2 + punpcklwd m2, m3 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd m3, m1, m11 + paddd m3, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd m3, m2 + packssdw m3, m3 + pminsw m3, m13 + pmaxsw m3, m14 + psrldq m1, 2 + pslldq m3, 2 + psrldq m2, 4 +%if cpuflag(sse4) + pblendw m1, m3, 00000010b +%else + pand m1, m15 + pandn m4, m15, m3 + por m1, m4 +%endif + ; overwrite previous pixel, should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 96 + %define tmp rsp +%else + %define tmp rsp+stack_offset-120 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + SPLATW m12, [base+round_vals-12+shiftq*2] + pxor m13, m13 + pcmpgtw m13, m12 + punpcklwd m12, m13 + sar bdmaxd, 1 + SPLATW m14, bdmaxd ; max_grain + pcmpeqw m15, m15 +%if !cpuflag(sse4) + pcmpeqw m11, m11 + psrldq m11, 14 + pslldq m11, 4 + pxor m11, m15 +%endif + pxor m15, m14 ; min_grain +%if cpuflag(sse4) + SPLATW m11, [base+hmul_bits+4] +%endif + + ; cf from y=-3,x=-3 until y=-3,x=-2 + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m8, m0, q2222 + pshufd m9, m0, q3333 + pshufd m10, m2, q0000 + pshufd m13, m2, q1111 + mova [tmp+16*0], m6 + mova [tmp+16*1], m7 + mova [tmp+16*2], m8 + mova [tmp+16*3], m9 + mova [tmp+16*4], m10 + mova [tmp+16*5], m13 + pshufd m6, m2, q2222 + pshufd m7, m2, q3333 + + ; cf from y=-1,x=-1 to y=0,x=-1 + luma component + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 ; luma + punpcklbw m0, m1 + pshufd m10, m0, q3232 + psrldq m13, m0, 10 + ; y=0,x=[-3 to -1] + "1.0" for current pixel + pinsrw m13, [base+round_vals-10+shiftq*2], 3 + ; y=-1,x=[-1 to +2] + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + ; y=-1,x=+3 + luma + punpcklwd m10, m2 + pshufd m10, m10, q0000 + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + sub bufq, 2*(82*38+44-(82*3+41)) + add bufyq, 2*(79+82*3) + mov hd, 35 +.y_loop_ar3: + mov xq, -38 + +.x_loop_ar3: + ; first line + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, [tmp+0*16] + pmaddwd m2, [tmp+1*16] + pmaddwd m3, [tmp+2*16] + paddd m0, m2 + paddd m0, m3 ; first 6 x of top y + + ; second line [m0/1 are busy] + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] + punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, [tmp+3*16] + pmaddwd m4, [tmp+4*16] + 
pmaddwd m3, [tmp+5*16] + pmaddwd m5, m6 + paddd m1, m4 + paddd m3, m5 + paddd m0, m1 + paddd m0, m3 ; top 2 lines + + ; third line [m0 is busy] & luma + round + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] + movu m5, [bufyq+xq*4] + movu m4, [bufyq+xq*4+82*2] + phaddw m5, m4 + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] + pshufd m4, m5, q3232 + paddw m5, m4 +%if cpuflag(sse4) + pmulhrsw m5, m11 +%else + pmulhrsw m5, [pw_8192] +%endif + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, m5 + pmaddwd m1, m7 + pmaddwd m3, m8 + pmaddwd m4, m9 + pmaddwd m2, m10 + paddd m1, m3 + paddd m4, m2 + paddd m0, m12 ; += round + paddd m1, m4 + paddd m0, m1 + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m14 + pmaxsw m2, m15 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m11 + pandn m3, m11, m2 + por m1, m3 +%endif + ; overwrite previous pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*4 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%endif + RET + +INIT_XMM ssse3 +cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + SPLATW m11, [base+mul_bits+r7*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, r9m ; bdmax + sar r9d, 11 ; is_12bpc + inc r9d + mov r10d, r6d + imul r10d, r9d + dec r9d + SPLATW m13, [base+min+r10*2] + lea r9d, [r9d*3] + lea r9d, [r6d*2+r9d] + SPLATW m12, [base+max+r9*2] + SPLATW m10, r9m + + pcmpeqw m9, m9 + psraw m7, m10, 1 ; max_grain + pxor m9, m7 ; min_grain +%if !cpuflag(sse4) + pcmpeqw m6, m6 + pslldq m6, 4 +%endif + SPLATD m14, [pd_16] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, byte [fg_dataq+FGData.overlap_flag] + jnz .vertical_overlap + mov dword sbym, 0 + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstq, srcq + +.loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offyd, seed + mov offxd, seed + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak + +.loop_x_odd: + movzx hd, word hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 + REPX {psrlw x, 8}, 
m2, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y + + add wq, 16 + jge .end + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] + btc dword hm, 16 + jc .next_blk + add offxyd, 16 + cmp dword r8m, 0 + je .loop_x_odd + SPLATD m15, [pw_27_17_17_27] + add r12d, 16 ; top_offxy += 16 + jmp .loop_x_odd_v_overlap + +.next_blk: + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + + ; r8m = sbym + movq m15, [pw_27_17_17_27] + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offyd, seed + mov offxd, seed + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy + + movzx hd, word hm + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movd m5, [grain_lutq+left_offxyq*2] + punpcklwd m5, m4 + pmaddwd m5, m15 + paddd m5, m14 + psrad m5, 5 + packssdw m5, m5 +%if cpuflag(sse4) + pblendw m4, m5, 00000011b +%else + pand m4, m6 + pandn m0, m6, m5 + por m4, m0 +%endif + pminsw m4, m7 + pmaxsw m4, m9 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 + REPX {psrlw x, 8}, m2, m3 + + ; noise = round2(scaling[src] * grain, scaling_shift) + movu m5, [grain_lutq+offxyq*2+16] + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + + add wq, 16 + jge .end + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] + or dword hm, 0x10000 + add offxyd, 16 + + ; r8m = sbym + cmp dword r8m, 0 + je .loop_x_odd + SPLATD m15, [pw_27_17_17_27] + add r12d, 16 ; top_offxy += 16 + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstq, srcq + +.loop_x_v_overlap: + SPLATD m15, [pw_27_17_17_27] + + ; we assume from the block 
above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + mov seed, r7d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + mov offyd, seed + mov offxd, seed + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + +.loop_x_odd_v_overlap: + movzx hd, word hm + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m2, [grain_lutq+top_offxyq*2] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + REPX {pmaddwd x, m15}, m4, m2 + REPX {paddd x, m14}, m4, m2 + REPX {psrad x, 5}, m4, m2 + packssdw m2, m4 + pminsw m2, m7 + pmaxsw m2, m9 + movu m4, [grain_lutq+offxyq*2+16] + movu m3, [grain_lutq+top_offxyq*2+16] + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + REPX {pmaddwd x, m15}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m3, m5 + pminsw m3, m7 + pmaxsw m3, m9 + + ; src + pand m0, m10, [srcq+ 0] ; m0-1: src as word + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) + vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m4, m2 + vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + SPLATD m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: + add wq, 16 + jge .end_hv + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] + btc dword hm, 16 + jc .next_blk_v + SPLATD m15, [pw_27_17_17_27] + add offxyd, 16 + add top_offxyd, 16 + jmp .loop_x_odd_v_overlap + +.next_blk_v: + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + + movq m15, [pw_27_17_17_27] +.loop_x_hv_overlap: + SPLATD m8, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + mov seed, r7d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | 
top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + movzx hd, word hm + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + movd m4, [grain_lutq+left_offxyq*2] + movd m2, [grain_lutq+topleft_offxyq*2] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd m4, m3 + punpcklwd m2, m5 + REPX {pmaddwd x, m15}, m4, m2 + REPX {paddd x, m14}, m4, m2 + REPX {psrad x, 5}, m4, m2 + REPX {packssdw x, x}, m4, m2 + REPX {pminsw x, m7}, m4, m2 + REPX {pmaxsw x, m9}, m4, m2 +%if cpuflag(sse4) + pblendw m3, m4, 00000011b + pblendw m5, m2, 00000011b +%else + pand m3, m6 + pand m5, m6 + pandn m0, m6, m4 + pandn m1, m6, m2 + por m3, m0 + por m5, m1 +%endif + ; followed by v interpolation (top | cur -> cur) + movu m0, [grain_lutq+offxyq*2+16] + movu m1, [grain_lutq+top_offxyq*2+16] + punpcklwd m2, m5, m3 + punpckhwd m5, m3 + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + REPX {pmaddwd x, m8}, m2, m5, m3, m1 + REPX {paddd x, m14}, m2, m5, m3, m1 + REPX {psrad x, 5}, m2, m5, m3, m1 + packssdw m2, m5 + packssdw m3, m1 + REPX {pminsw x, m7}, m2, m3 + REPX {pmaxsw x, m9}, m2, m3 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) + vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m2, m4 + vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + SPLATD m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + or dword hm, 0x10000 + add wq, 16 + jge .end_hv + SPLATD m15, [pw_27_17_17_27] + add offxyd, 16 + add top_offxyd, 16 + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] + jmp .loop_x_odd_v_overlap + +.end_hv: + RET + +cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + mov r7d, [fg_dataq+FGData.scaling_shift] + SPLATW m11, [base+mul_bits+r7*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, r13m ; bdmax + sar r9d, 11 ; is_12bpc + inc r9d + mov r10d, r6d + imul r10d, r9d + dec r9d + SPLATW m13, [base+min+r10*2] + lea r10d, [r9d*3] + mov r11d, is_idm + inc r11d + imul r6d, r11d + add r10d, r6d + SPLATW m12, [base+max+r10*2] + SPLATW m10, r13m +%if cpuflag(sse4) + pxor m2, m2 +%define mzero m2 +%else +%define mzero m7 +%endif + mov r13mp, strideq + + pcmpeqw m8, m8 + psraw m9, m10, 1 + pxor m8, m9 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + pslldq m2, 2 +%endif + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro FGUV_32x32xN_LOOP 1 ; not-csfl + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + +%if %1 + mov r7d, r11m + SPLATW m0, 
[fg_dataq+FGData.uv_mult+r7*4] + SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r7*4] + punpcklwd m14, m1, m0 + SPLATW m15, [fg_dataq+FGData.uv_offset+r7*4] + SPLATD m7, [base+pw_4+r9*4] + pmullw m15, m7 +%else + SPLATD m14, [pd_16] + SPLATD m15, [pw_23_22] +%endif + + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, byte [fg_dataq+FGData.overlap_flag] + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*4] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + mov offyd, seed + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 82 + lea offyq, [offyq+offxq+498] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] + mova m1, [srcq+16] ; m0-1: src as word + + ; luma_src +%if !cpuflag(sse4) + pxor mzero, mzero +%endif + mova m4, [lumaq+lstrideq*0+ 0] + mova m6, [lumaq+lstrideq*0+32] + phaddw m4, [lumaq+lstrideq*0+16] + phaddw m6, [lumaq+lstrideq*0+48] + pavgw m4, mzero + pavgw m6, mzero + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; scaling[luma_src] + vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 + REPX {psrlw x, 8}, m3, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m6, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m3, m5 + pmulhrsw m4, m3 + pmulhrsw m6, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + add srcq, r13mp + add dstq, r13mp + lea lumaq, [lumaq+lstrideq*2] + add grain_lutq, 82*2 + dec hb + jg %%loop_y + + add wq, 16 + jge %%end + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offxd, seed + mov offyd, seed + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul 
offyd, 82 + lea offyq, [offyq+offxq+498] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src +%if !cpuflag(sse4) + pxor mzero, mzero +%endif + mova m4, [lumaq+lstrideq*0+ 0] + mova m6, [lumaq+lstrideq*0+32] + phaddw m4, [lumaq+lstrideq*0+16] + phaddw m6, [lumaq+lstrideq*0+48] + pavgw m4, mzero + pavgw m6, mzero + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] + movd m5, [grain_lutq+left_offxyq*2+ 0] + punpcklwd m5, m7 ; {left0, cur0} +%if %1 + pmaddwd m5, [pw_23_22] + paddd m5, [pd_16] +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 +%if cpuflag(sse4) + pblendw m5, m7, 11111110b +%else + pand m7, m2 + pandn m3, m2, m5 + por m5, m7, m3 +%endif + movu m3, [grain_lutq+offxyq*2+16] + + ; scaling[luma_src] + vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 + vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 + REPX {psrlw x, 8}, m7, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m4 + pmulhrsw m5, m7 + pmulhrsw m3, m4 + + ; dst = clip_pixel(src, noise) + paddw m0, m5 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + add srcq, r13mp + add dstq, r13mp + lea lumaq, [lumaq+lstrideq*2] + add grain_lutq, 82*2 + dec hb + jg %%loop_y_h_overlap + + add wq, 16 + jge %%end + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%end: + RET + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*4] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + mov seed, r7d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov offyd, seed + mov offxd, seed + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and 
offxd, 0xf000f + imul offyd, 82 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq+0x10001*498+16*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} +%if %1 + REPX {pmaddwd x, [pw_23_22]}, m7, m5 + REPX {paddd x, [pd_16]}, m7, m5 +%else + REPX {pmaddwd x, m15}, m7, m5 + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2+16] + movu m5, [grain_lutq+top_offxyq*2+16] + punpckhwd m7, m5, m4 + punpcklwd m5, m4 ; {top/cur interleaved} +%if %1 + REPX {pmaddwd x, [pw_23_22]}, m7, m5 + REPX {paddd x, [pd_16]}, m7, m5 +%else + REPX {pmaddwd x, m15}, m7, m5 + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m4, m5, m7 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src +%if !cpuflag(sse4) + pxor mzero, mzero +%endif + mova m5, [lumaq+lstrideq*0+ 0] + mova m6, [lumaq+lstrideq*0+32] + phaddw m5, [lumaq+lstrideq*0+16] + phaddw m6, [lumaq+lstrideq*0+48] + pavgw m5, mzero + pavgw m6, mzero + +%if %1 + punpckhwd m7, m5, m0 + punpcklwd m5, m0 + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + punpckhwd m7, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 +%if !cpuflag(sse4) + pxor mzero, mzero +%endif + REPX {paddw x, m15}, m5, m6 + REPX {pmaxsw x, mzero}, m5, m6 + REPX {pminsw x, m10}, m5, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m5, m6 +%endif + + ; scaling[luma_src] + vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 + REPX {psrlw x, 8}, m7, m5 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m5 + pmulhrsw m3, m7 + pmulhrsw m4, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + dec hb + jle %%end_y_v_overlap + add srcq, r13mp + add dstq, r13mp + lea lumaq, [lumaq+lstrideq*2] + add grain_lutq, 82*2 + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 16 + jge %%end_hv + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + mov seed, r7d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and 
offxd, 0xf000f + imul offyd, 82 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq+0x10001*498+16*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movd m5, [grain_lutq+left_offxyq*2] + pinsrw m5, [grain_lutq+topleft_offxyq*2], 1 ; { left, top/left } + movu m3, [grain_lutq+offxyq*2] + movu m4, [grain_lutq+top_offxyq*2] + punpcklwd m7, m3, m4 ; { cur0, top0 } + punpcklwd m5, m7 ; { cur/left } interleaved +%if %1 + pmaddwd m5, [pw_23_22] + paddd m5, [pd_16] +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 +%if cpuflag(sse4) + pblendw m3, m5, 00000001b + psrldq m5, 2 + pblendw m5, m4, 11111110b +%else + pand m3, m2 + pandn m7, m2, m5 + por m3, m7 + psrldq m5, 2 + pand m4, m2 + pandn m7, m2, m5 + por m5, m4, m7 +%endif + + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} +%if %1 + REPX {pmaddwd x, [pw_23_22]}, m7, m5 + REPX {paddd x, [pd_16]}, m5, m7 +%else + REPX {pmaddwd x, m15}, m7, m5 + REPX {paddd x, m14}, m5, m7 +%endif + REPX {psrad x, 5}, m5, m7 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; right half + movu m4, [grain_lutq+offxyq*2+16] + movu m0, [grain_lutq+top_offxyq*2+16] + punpckhwd m1, m0, m4 + punpcklwd m0, m4 ; {top/cur interleaved} +%if %1 + REPX {pmaddwd x, [pw_23_22]}, m1, m0 + REPX {paddd x, [pd_16]}, m1, m0 +%else + REPX {pmaddwd x, m15}, m1, m0 + REPX {paddd x, m14}, m1, m0 +%endif + REPX {psrad x, 5}, m1, m0 + packssdw m4, m0, m1 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src +%if !cpuflag(sse4) + pxor mzero, mzero +%endif + mova m6, [lumaq+lstrideq*0+ 0] + mova m5, [lumaq+lstrideq*0+32] + phaddw m6, [lumaq+lstrideq*0+16] + phaddw m5, [lumaq+lstrideq*0+48] + pavgw m6, mzero + pavgw m5, mzero + +%if %1 + punpckhwd m7, m6, m0 + punpcklwd m6, m0 + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + punpckhwd m7, m5, m1 + punpcklwd m5, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 +%if !cpuflag(sse4) + pxor mzero, mzero +%endif + REPX {paddw x, m15}, m6, m5 + REPX {pmaxsw x, mzero}, m6, m5 + REPX {pminsw x, m10}, m6, m5 ; clip_pixel() +%else + REPX {pand x, m10}, m6, m5 +%endif + + ; scaling[luma_src] + vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 + vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 + REPX {psrlw x, 8}, m7, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m6 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + add srcq, r13mp + add dstq, r13mp + lea lumaq, [lumaq+lstrideq*2] + add grain_lutq, 82*2 + dec hb + jg %%loop_y_h_overlap + +%%end_y_hv_overlap: + add wq, 16 + jge %%end_hv + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*4] + jmp %%loop_x_hv_overlap + +%%end_hv: + RET +%endmacro + + FGUV_32x32xN_LOOP 1 +.csfl: + FGUV_32x32xN_LOOP 0 + +%endif ; ARCH_X86_64 From 94ecfe77d61e7a28a721e33f9734dca9b3e75c6e Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 15 Jun 2021 21:41:56 +0200 
Subject: [PATCH 116/188] x86inc: Support memory operands in src1 in 3-operand instructions Particularly in code that makes heavy use of macros it's possible to end up with 3-operand instructions with a memory operand in src1. In the case of SSE this works fine due to automatic move insertions, but in AVX that fails since memory operands are only allowed in src2. The main purpose of this feature is to minimize the amount of code changes required to facilitate conversion of existing SSE code to AVX. --- src/x86/loopfilter16_sse.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/loopfilter16_sse.asm b/src/x86/loopfilter16_sse.asm index ff8cb722dc..82549b5b35 100644 --- a/src/x86/loopfilter16_sse.asm +++ b/src/x86/loopfilter16_sse.asm @@ -824,7 +824,7 @@ SECTION .text REPX {pand x, m9}, m7, m8, m10, m11, m1, m2 %if avx_enabled - REPX {pandn x, m9}, m13, m3, m4, m5, m6, m14 + REPX {pandn x, m9, x}, m13, m3, m4, m5, m6, m14 %else pcmpeqw m0, m0 pxor m0, m9 From 477d6c096cf6330e95ac70ab156bdf8a7ddbde92 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 11 Jun 2021 16:44:59 +0200 Subject: [PATCH 117/188] x86: Add high bitdepth warp8x8 SSSE3 asm --- src/x86/mc16_sse.asm | 394 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 394 insertions(+) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 6b1869a97d..312d95cfed 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -65,6 +65,12 @@ put_8tap_h_rnd: dd 34, 34, 40, 40 prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) +warp8x8_shift: dd 11, 13 +warp8x8_rnd1: dd 1024, 1024, 4096, 4096 +warp8x8_rnd2: times 4 dw 4096 + times 4 dw 16384 +warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) + %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table @@ -105,6 +111,8 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) +cextern mc_warp_filter + SECTION .text %macro REPX 2-* @@ -2526,6 +2534,392 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my RET %undef tmp +%if ARCH_X86_64 +; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that +; by allocating 16 bytes more stack space so that stack offsets match up. 
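+; (clarifying note, assuming x86inc defines WIN64 as 1 on Windows x64 and 0
+;  elsewhere: 16*(13+WIN64) below reserves one extra 16-byte slot on WIN64,
+;  224 vs. 208 bytes, so the [rsp+...] scratch offsets used by the shared
+;  warp_affine_8x8_16bpc .main code line up in both functions)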
+cglobal warp_affine_8x8t_16bpc, 4, 13, 9, 16*(13+WIN64), dst, ds, src, ss, \ + delta, mx, tmp, \ + alpha, beta, filter, \ + my, gamma, cnt +%else +cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%define m8 [esp+16*13] +%define m9 [esp+16*14] +%define cntd dword [esp+4*63] +%define dstq tmpq +%define dsq 0 +%if STACK_ALIGNMENT < 16 +%define dstm [esp+4*65] +%define dsm [esp+4*66] +%else +%define dstm r0m +%define dsm r1m +%endif +%endif +%define base filterq-$$ + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8t_rnd] +%else + movddup m1, [base+warp8x8t_rnd] + mov r1, r1m + add r1, r1 + mova m8, m1 + mov r1m, r1 ; ds *= 2 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*4] +%else + add dstq, dsm + mov dstm, dstq +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*0], m1 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*2], m1 + dec cntd + jg .loop + RET + +%if ARCH_X86_64 +cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +%else +cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%endif + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8_rnd2+t0*8] + movd m9, r7m ; pixel_max + pshufb m9, [base+pw_256] +%else + movddup m1, [base+warp8x8_rnd2+t0*8] + movd m2, r7m ; pixel_max + pshufb m2, [base+pw_256] + mova m8, m1 + mova m9, m2 +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*2] +%else + add dstq, dsm + mov dstm, dstq +%endif + call .main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*0], m1 + call .main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*1], m1 + dec cntd + jg .loop + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov deltaq, r5m + mov mxd, r6m +%endif + movd m0, [base+warp8x8_shift+t0*4] + movddup m7, [base+warp8x8_rnd1+t0*8] + add filterq, mc_warp_filter-$$ +%if ARCH_X86_64 + movsx alphad, word [deltaq+2*0] + movsx betad, word [deltaq+2*1] + movsx gammad, word [deltaq+2*2] + movsx deltad, word [deltaq+2*3] + lea tmpq, [ssq*3] + add mxd, 512+(64<<10) + sub srcq, tmpq ; src -= ss*3 + imul tmpd, alphad, -7 + mov myd, r7m + add betad, tmpd ; beta -= alpha*7 + imul tmpd, gammad, -7 + add myd, 512+(64<<10) + mov cntd, 4 + add deltad, tmpd ; delta -= gamma*7 +%else +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset - gprsize +%endif + mov r3d, r5m ; abcd +%if STACK_ALIGNMENT < 16 + mov r0, r1m ; dst + mov r1, r2m ; ds + mov [esp+gprsize+4*65], r0 + mov [esp+gprsize+4*66], r1 +%endif + movsx alphad, word [r3+2*0] + movsx r2d, word [r3+2*1] + movsx gammad, word [r3+2*2] + movsx r3d, word [r3+2*3] + imul r5d, alphad, -7 + add r2d, r5d ; beta -= alpha*7 + imul r5d, gammad, -7 + mov [esp+gprsize+4*60], r2d + add r3d, r5d ; delta -= gamma*7 + mov [esp+gprsize+4*61], r3d + mov r3d, r4m ; ss + mov srcq, r3m + mov mxd, r6m + mov myd, r7m + mov dword [esp+gprsize+4*63], 4 ; cnt + mov [esp+gprsize+4*62], r3 + lea r3, [r3*3] + add mxd, 512+(64<<10) + add myd, 512+(64<<10) + sub srcq, r3 ; src -= ss*3 +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset + gprsize +%endif +%endif + mova [rsp+gprsize], m0 + pxor m6, m6 + call .h + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 01 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 1], m1 + mova [rsp+gprsize+16* 4], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 12 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 7], m1 + mova [rsp+gprsize+16*10], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 2], m1 + mova [rsp+gprsize+16* 5], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 34 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 8], m1 + mova [rsp+gprsize+16*11], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 45 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 3], m1 + mova [rsp+gprsize+16* 6], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 56 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 9], m1 + mova [rsp+gprsize+16*12], m5 + mova m5, m0 +.main2: + call .h +%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m4, [filterq+myq*8] ; a + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m2, [filterq+tmpq*8] ; b + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m3, [filterq+myq*8] ; c + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m1, [filterq+tmpq*8] ; d + lea tmpd, [myq+gammaq] + shr myd, 10 + punpcklwd m4, m2 + punpcklwd m3, m1 + punpckldq m2, m4, m3 + punpckhdq m4, m3 + punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + pmaddwd m1, [rsp+gprsize+16*%1] + punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + mova m2, [rsp+gprsize+16*%2] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%1], m2 + paddd m1, m3 + punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + mova m2, [rsp+gprsize+16*%3] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%2], m2 + paddd m1, m3 + punpcklwd m3, m5, m0 ; 67 + punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m2, m3 + mova [rsp+gprsize+16*%3], m3 + paddd m1, m2 + movq m4, [filterq+myq*8] ; e + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] ; f + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m2, [filterq+myq*8] ; g +%if ARCH_X86_64 + lea myd, [tmpq+deltaq] ; my += delta +%else + mov myd, [esp+gprsize+4*61] + add 
myd, tmpd +%endif + shr tmpd, 10 + punpcklwd m4, m3 + movq m3, [filterq+tmpq*8] ; h + punpcklwd m2, m3 + punpckldq m3, m4, m2 + punpckhdq m4, m2 + punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 + pmaddwd m2, [rsp+gprsize+16*%4] + punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 + mova m3, [rsp+gprsize+16*%5] + pmaddwd m6, m3 + mova [rsp+gprsize+16*%4], m3 + pxor m3, m3 + paddd m2, m6 + punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 + mova m6, [rsp+gprsize+16*%6] + pmaddwd m3, m6 + mova [rsp+gprsize+16*%5], m6 + punpckhwd m5, m0 + pxor m6, m6 + paddd m2, m3 + punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 + pmaddwd m3, m5 + mova [rsp+gprsize+16*%6], m5 + mova m5, m0 + paddd m2, m3 +%endmacro + WARP_V 1, 2, 3, 4, 5, 6 + ret +.main3: + call .h + WARP_V 7, 8, 9, 10, 11, 12 + ret +ALIGN function_align +.h: + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + punpcklbw m0, m6, m3 + movu m3, [srcq-6] + pmaddwd m0, m3 ; 0 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m2, m6, m3 + movu m3, [srcq-4] + pmaddwd m2, m3 ; 1 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m0, m2 ; 0 1 + punpcklbw m2, m6, m3 + movu m3, [srcq-2] + pmaddwd m2, m3 ; 2 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m1, m6, m3 + movu m3, [srcq+0] + pmaddwd m1, m3 ; 3 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m2, m1 ; 2 3 + punpcklbw m1, m6, m3 + movu m3, [srcq+2] + pmaddwd m1, m3 ; 4 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + phaddd m0, m2 ; 0 1 2 3 + punpcklbw m2, m6, m3 + movu m3, [srcq+4] + pmaddwd m2, m3 ; 5 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m1, m2 ; 4 5 + punpcklbw m2, m6, m3 + movu m3, [srcq+6] + pmaddwd m2, m3 ; 6 +%if ARCH_X86_64 + lea mxd, [tmpq+betaq] ; mx += beta +%else + mov mxd, [esp+gprsize*2+4*60] + add mxd, tmpd +%endif + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m4, m6, m3 + movu m3, [srcq+8] +%if ARCH_X86_64 + add srcq, ssq +%else + add srcq, [esp+gprsize*2+4*62] +%endif + pmaddwd m3, m4 ; 7 + phaddd m2, m3 ; 6 7 + phaddd m1, m2 ; 4 5 6 7 + paddd m0, m7 + paddd m1, m7 + psrad m0, [rsp+gprsize*2] + psrad m1, [rsp+gprsize*2] + packssdw m0, m1 + ret + %macro BIDIR_FN 0 call .main jmp wq From 680defa2d255bf2d5d984de999f94d6648ad0c2a Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Fri, 18 Jun 2021 02:00:35 +0200 Subject: [PATCH 118/188] x86: Add bpc suffix to itx functions --- src/asm/x86/transform/inverse.rs | 4 +- src/x86/itx16_avx2.asm | 160 ++--- src/x86/itx_avx2.asm | 442 ++++++------ src/x86/itx_sse.asm | 1114 +++++++++++++++--------------- 4 files changed, 860 insertions(+), 860 deletions(-) diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs index c4212e81f6..b5736b6fa7 100644 --- a/src/asm/x86/transform/inverse.rs +++ b/src/asm/x86/transform/inverse.rs @@ -67,7 +67,7 @@ macro_rules! decl_itx_fns { $( extern { // Note: type1 and type2 are flipped - fn []( + fn []( dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32 ); @@ -79,7 +79,7 @@ macro_rules! 
decl_itx_fns { let mut out: [Option; 16] = [None; 16]; $( $( - out[get_tx_type_idx($ENUM)] = Some([]); + out[get_tx_type_idx($ENUM)] = Some([]); )* )* out diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm index 1ef674e8f2..99c231b45b 100644 --- a/src/x86/itx16_avx2.asm +++ b/src/x86/itx16_avx2.asm @@ -105,32 +105,32 @@ cextern pw_16384 cextern pw_2896x8 cextern pd_2048 -cextern idct_4x8_internal_avx2.main -cextern idct_4x16_internal_avx2.main -cextern idct_8x8_internal_avx2.main -cextern idct_8x16_internal_avx2.main -cextern idct_16x4_internal_avx2.main -cextern idct_16x8_internal_avx2.main -cextern idct_16x16_internal_avx2.main -cextern inv_txfm_add_dct_dct_8x32_avx2.main -cextern inv_txfm_add_dct_dct_8x32_avx2.main_fast -cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf -cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf_fast -cextern inv_txfm_add_dct_dct_16x64_avx2.main_part1 -cextern inv_txfm_add_dct_dct_16x64_avx2.main_part2_internal +cextern idct_4x8_internal_8bpc_avx2.main +cextern idct_4x16_internal_8bpc_avx2.main +cextern idct_8x8_internal_8bpc_avx2.main +cextern idct_8x16_internal_8bpc_avx2.main +cextern idct_16x4_internal_8bpc_avx2.main +cextern idct_16x8_internal_8bpc_avx2.main +cextern idct_16x16_internal_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal -cextern iadst_4x4_internal_avx2.main -cextern iadst_4x8_internal_avx2.main_pass2 -cextern iadst_4x16_internal_avx2.main2 -cextern iadst_8x4_internal_avx2.main -cextern iadst_8x8_internal_avx2.main_pass2 -cextern iadst_8x16_internal_avx2.main -cextern iadst_8x16_internal_avx2.main_pass2_end -cextern iadst_16x4_internal_avx2.main -cextern iadst_16x8_internal_avx2.main -cextern iadst_16x8_internal_avx2.main_pass2_end -cextern iadst_16x16_internal_avx2.main -cextern iadst_16x16_internal_avx2.main_pass2_end +cextern iadst_4x4_internal_8bpc_avx2.main +cextern iadst_4x8_internal_8bpc_avx2.main_pass2 +cextern iadst_4x16_internal_8bpc_avx2.main2 +cextern iadst_8x4_internal_8bpc_avx2.main +cextern iadst_8x8_internal_8bpc_avx2.main_pass2 +cextern iadst_8x16_internal_8bpc_avx2.main +cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x4_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x16_internal_8bpc_avx2.main +cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end SECTION .text @@ -384,7 +384,7 @@ cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 .pass2: lea rax, [deint_shuf+128] vextracti128 xm1, m0, 1 - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main .end: vpbroadcastd xm4, [pw_2048] movq xm2, [dstq+strideq*0] @@ -457,7 +457,7 @@ cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 .pass2: lea rax, [deint_shuf+128] vextracti128 xm1, m0, 1 - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main vpbroadcastd xm4, [pw_2048] movq xm3, [dstq+strideq*1] movhps xm3, [dstq+strideq*0] @@ -607,7 +607,7 @@ cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 punpckldq m0, m2 ; 0 1 vextracti128 xm2, m0, 1 ; 4 5 vextracti128 xm3, m1, 1 ; 6 7 - call m(idct_4x8_internal).main + call m(idct_4x8_internal_8bpc).main vpbroadcastd 
xm4, [pw_2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 lea r3, [strideq*3] @@ -697,7 +697,7 @@ ALIGN function_align vextracti128 xm3, m5, 1 ; 6 7 pshufd xm4, xm4, q1032 ; 1 0 pshufd xm5, xm5, q1032 ; 3 2 - jmp m(iadst_4x8_internal).main_pass2 + jmp m(iadst_4x8_internal_8bpc).main_pass2 ALIGN function_align .main: vbroadcasti128 m0, [cq+16*0] @@ -934,7 +934,7 @@ cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 vextracti128 xm3, m1, 1 ; 6 7 vextracti128 xm6, m4, 1 ; c d vextracti128 xm7, m5, 1 ; e f - call m(idct_4x16_internal).main + call m(idct_4x16_internal_8bpc).main vpbroadcastd m9, [pw_2048] vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 @@ -1054,7 +1054,7 @@ ALIGN function_align vinserti128 m0, xm3, 1 ; 0 3 2 1 vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? vinserti128 m2, xm4, 1 ; b 8 9 a - call m(iadst_4x16_internal).main2 + call m(iadst_4x16_internal_8bpc).main2 vpbroadcastd m5, [pw_2896x8] paddsw m1, m2, m4 psubsw m2, m4 @@ -1434,7 +1434,7 @@ ALIGN function_align vinserti128 m0, xm2, 1 pshufb m0, m4 pshufb m1, m4 - jmp m(iadst_8x4_internal).main + jmp m(iadst_8x4_internal_8bpc).main ALIGN function_align .main: vpbroadcastd m1, [pd_2896] @@ -1636,7 +1636,7 @@ cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp tx2q .pass2: call .transpose_8x8_packed - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main vpbroadcastd m12, [pw_2048] vpermq m0, m0, q3120 vpermq m1, m1, q2031 @@ -1754,7 +1754,7 @@ cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 call m(idct_8x8_internal_16bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 + call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m5, [pw_2048] vpbroadcastd xm12, [pw_4096] psubw m12, m5 @@ -1814,7 +1814,7 @@ cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 call m(idct_8x8_internal_16bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 + call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m12, [pw_2048] vpbroadcastd xm5, [pw_4096] psubw m12, m5 @@ -1971,7 +1971,7 @@ cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 jmp tx2q .pass2: call .transpose - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main vpbroadcastd m12, [pw_2048] REPX {vpermq x, x, q3120}, m0, m2, m4, m6 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 @@ -2167,8 +2167,8 @@ cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(idct_8x16_internal_16bpc).transpose - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [pw_2048] vpbroadcastd xm12, [pw_4096] REPX {vpermq x, x, q2031}, m0, m1, m2, m3 @@ -2232,8 +2232,8 @@ cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(idct_8x16_internal_16bpc).transpose - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m12, [pw_2048] vpbroadcastd xm13, [pw_4096] mova m11, m0 @@ -2458,7 +2458,7 @@ cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 .pass2: call .transpose_4x16_packed lea rax, [deint_shuf+128] - call m(idct_16x4_internal).main + call m(idct_16x4_internal_8bpc).main .end: vpbroadcastd m4, 
[pw_2048] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 @@ -2517,7 +2517,7 @@ cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 .pass2: call m(idct_16x4_internal_16bpc).transpose_4x16_packed lea rax, [deint_shuf+128] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main jmp m(idct_16x4_internal_16bpc).end ALIGN function_align .main: @@ -2596,7 +2596,7 @@ cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 .pass2: call m(idct_16x4_internal_16bpc).transpose_4x16_packed lea rax, [deint_shuf+128] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [pw_2048] pmulhrsw m5, m3, m4 pmulhrsw m6, m2, m4 @@ -2712,7 +2712,7 @@ cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp tx2q .pass2: call .transpose - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [pw_2048] .end: pmulhrsw m0, m10 @@ -2827,8 +2827,8 @@ cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(idct_16x8_internal_16bpc).transpose - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] pxor m11, m11 psubw m11, m10 @@ -3039,8 +3039,8 @@ cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iadst_16x8_internal_16bpc).pass1_end .pass2: call m(idct_16x8_internal_16bpc).transpose - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] pxor m11, m11 psubw m11, m10 @@ -3216,7 +3216,7 @@ cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 call .transpose lea rax, [pw_5+128] mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] .end: call .write_16x16 @@ -3450,8 +3450,8 @@ cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 call m(idct_16x16_internal_16bpc).transpose lea rax, [pw_5+128] mova [rsp], m15 - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*0], m8 mova [rsp+32*2], m12 mova [rsp+32*3], m13 @@ -3582,8 +3582,8 @@ cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx call m(idct_16x16_internal_16bpc).transpose lea rax, [pw_5+128] mova [rsp], m15 - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*3], m3 mova [rsp+32*2], m2 mova [rsp+32*0], m0 @@ -3740,7 +3740,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob vpbroadcastd m10, [pw_2048] lea rax, [deint_shuf+128] REPX {mova x, m4}, m5, m6, m7 - call m(inv_txfm_add_dct_dct_8x32).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .end .eob107: mova [rsp+32*3], m3 @@ -3778,7 +3778,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob lea rax, [deint_shuf+128] mova m11, [rsp+32*3] ; out13 out15 vpbroadcastd m10, [pw_2048] - call m(inv_txfm_add_dct_dct_8x32).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main .end: ; [rsp+0*32] = m12 vpbroadcastd m12, [pw_2048] mov cq, r4 @@ -4294,7 +4294,7 @@ cglobal 
inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob RET ALIGN function_align .pass2: - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main REPX {pmulhrsw x, m11}, m0, m1, m2, m3 call m(idct_16x8_internal_16bpc).write_16x4_start pmulhrsw m0, m11, m4 @@ -4404,7 +4404,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob mova m3, [r4+32*3] .fast: lea rax, [pw_5+128] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp .idct16 @@ -4456,7 +4456,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob mova m6, [r4-32*2] mova m7, [r4-32*1] lea rax, [pw_5 + 128] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea r3, [rsp+32*8] mova m8, [r3+32*0] mova m9, [r3+32*1] @@ -4477,7 +4477,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob mova m6, [r3-32*2] mova m7, [r3-32*1] mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq @@ -4711,7 +4711,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] lea rax, [pw_5+128] mov r7, dstq - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] @@ -4750,7 +4750,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob call .transpose_16x16 lea rax, [pw_5+128] mov r7, dstq - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] @@ -4764,7 +4764,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob call .transpose_16x16 .end: lea dstq, [r7+32] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .write_16x16 RET ALIGN function_align @@ -5124,7 +5124,7 @@ ALIGN function_align mova m13, [r3+32*51] ; 27 mova m14, [r3+32*53] ; 29 mova m15, [r3+32*55] ; 31 - jmp m(inv_txfm_add_dct_dct_16x32).main_oddhalf + jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf ALIGN function_align .pass2_evenhalf: mova m0, [r3+32* 0] ; 0 @@ -5144,7 +5144,7 @@ ALIGN function_align mova m14, [r3+32*52] ; 28 mova m15, [r3+32*54] ; 30 mova [rsp+gprsize], m15 - jmp m(idct_16x16_internal).main + jmp m(idct_16x16_internal_8bpc).main cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob %undef cmp @@ -5300,7 +5300,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*38] mova [r4-32*4], m0 @@ -5330,7 +5330,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob mova m7, [rsp+32*32] ; in30 lea r5, [r4+32*16] add r4, 32*8 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [rsp+32* 3] ; in1 mova m1, [rsp+32*33] ; in31 mova m2, [rsp+32*19] ; in17 @@ -5342,7 +5342,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob lea rax, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [rsp+32* 7] ; in5 mova m1, 
[rsp+32*29] ; in27 mova m2, [rsp+32*23] ; in21 @@ -5354,7 +5354,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob add rax, 8 add r4, 32*8 sub r5, 32*8 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 @@ -5449,7 +5449,7 @@ ALIGN function_align lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [pw_m2896_2896] - call m(inv_txfm_add_dct_dct_16x64).main_part2_internal + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal vpbroadcastd m14, [pw_2048] IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 @@ -5648,7 +5648,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*70] mova [r4-32*4], m0 @@ -5678,7 +5678,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob mova m7, [r10+32*56] ; in30 lea r5, [r4+32*16] add r4, 32*8 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10+32* 3] ; in1 mova m1, [r10+32*57] ; in31 mova m2, [r10+32*35] ; in17 @@ -5690,7 +5690,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob lea rax, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10+32* 7] ; in5 mova m1, [r10+32*53] ; in27 mova m2, [r10+32*39] ; in21 @@ -5702,7 +5702,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob add rax, 8 add r4, 32*8 sub r5, 32*8 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 add r10, 32*8 sub r4, 32*98 ; rsp+32*16 @@ -5877,7 +5877,7 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob mova m15, [r7+32*3] sub r7, 32*24 mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16 add r5, 32 @@ -6109,7 +6109,7 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob mova m13, [r7-32* 1] mova m14, [r7+32* 1] mova m15, [r7+32* 3] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf mova m0, [r7-32*100] mova m1, [r7-32*98] mova m2, [r7-32*96] @@ -6128,7 +6128,7 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob mova m15, [r7+32* 2] add r7, 32*8 mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] @@ -6248,7 +6248,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [r4-32*4], m0 mova [r4-32*3], m1 @@ -6277,7 +6277,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob mova m7, [r10+32* 2] ; in30 lea r5, [r4+32*16] add r4, 32*8 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova 
m0, [r10-32*99] ; in1 mova m1, [r10+32* 3] ; in31 mova m2, [r10-32*35] ; in17 @@ -6289,7 +6289,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob lea rax, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10-32*95] ; in5 mova m1, [r10-32* 1] ; in27 mova m2, [r10-32*31] ; in21 @@ -6301,7 +6301,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob add rax, 8 add r4, 32*8 sub r5, 32*8 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 add r10, 32*8 sub dstq, r8 diff --git a/src/x86/itx_avx2.asm b/src/x86/itx_avx2.asm index bd64250f57..02f0718b31 100644 --- a/src/x86/itx_avx2.asm +++ b/src/x86/itx_avx2.asm @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; @@ -344,7 +344,7 @@ SECTION .text %endmacro INIT_XMM avx2 -cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c mova m0, [cq+16*0] mova m1, [cq+16*1] pxor m2, m2 @@ -362,12 +362,12 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c ITX4_END 3, 0, 2, 1, 0 %macro INV_TXFM_FN 3 ; type1, type2, size -cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2 - %define %%p1 m(i%1_%3_internal) +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) lea rax, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
- lea tx2q, [m(i%2_%3_internal).pass2] + lea tx2q, [m(i%2_%3_internal_8bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 @@ -388,7 +388,7 @@ ALIGN function_align mov [cq], eobd ; 0 pmulhrsw m0, m1 mova m1, m0 - jmp m(iadst_4x4_internal).end2 + jmp m(iadst_4x4_internal_8bpc).end2 %endif %endmacro @@ -438,7 +438,7 @@ INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity -cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] IDCT4_1D_PACKED @@ -460,7 +460,7 @@ INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity -cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call .main @@ -487,17 +487,17 @@ INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity -cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 jmp tx2q .pass2: - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 mova [cq+16*0], m2 @@ -510,7 +510,7 @@ INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity -cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] vpbroadcastd m3, [o(pw_1697x8)] @@ -529,7 +529,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 - jmp m(iadst_4x4_internal).end + jmp m(iadst_4x4_internal_8bpc).end %macro WRITE_4X8 2 ; coefs[1-2] movd xm4, [dstq+strideq*0] @@ -568,7 +568,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 - jmp m(iadst_4x8_internal).end3 + jmp m(iadst_4x8_internal_8bpc).end3 %endif %endmacro @@ -687,7 +687,7 @@ INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity -cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] @@ -708,7 +708,7 @@ cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pshufd m1, m1, q1032 - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main WRAP_XMM IDCT8_1D_PACKED @@ -719,13 +719,13 @@ INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity -cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 @@ -770,13 +770,13 @@ INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity -cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, 
c, eob, tx2 +cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main punpcklwd m3, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m3 @@ -787,7 +787,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 - call m(iadst_4x8_internal).main_pass2 + call m(iadst_4x8_internal_8bpc).main_pass2 vpbroadcastd m5, [o(pw_2048)] vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 @@ -795,14 +795,14 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 psubw m4, m5 pshufd m0, m3, q1032 pshufd m1, m2, q1032 - jmp m(iadst_4x8_internal).end + jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity -cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m2, [cq+32*0], q3120 vpermq m0, [cq+32*1], q3120 vpbroadcastd m3, [o(pw_2896x8)] @@ -820,7 +820,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16 @@ -837,7 +837,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 mova m1, m0 mova m2, m0 mova m3, m0 - jmp m(iadst_4x16_internal).end3 + jmp m(iadst_4x16_internal_8bpc).end3 %endif %endmacro @@ -915,12 +915,12 @@ INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity -cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] - call m(idct_16x4_internal).main + call m(idct_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 @@ -945,7 +945,7 @@ cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 vinserti128 m3, xm7, 1 pshufd m1, m1, q1032 pshufd m3, m3, q1032 - jmp m(iadst_4x16_internal).end2 + jmp m(iadst_4x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main WRAP_XMM IDCT16_1D_PACKED @@ -956,12 +956,12 @@ INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity -cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 @@ -1085,12 +1085,12 @@ INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity -cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m1, m0 punpckhwd m1, m0 @@ -1103,7 +1103,7 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 punpckldq m0, m4 jmp tx2q .pass2: - call 
m(iadst_4x16_internal).main + call m(iadst_4x16_internal_8bpc).main vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 @@ -1120,14 +1120,14 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m2, m2, q2031 vpermq m3, m4, q1302 psubw m5, m7, m6 - jmp m(iadst_4x16_internal).end + jmp m(iadst_4x16_internal_8bpc).end INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity -cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m3, [cq+32*0] mova m2, [cq+32*1] mova m4, [cq+32*2] @@ -1171,7 +1171,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 paddsw m1, m6 paddsw m2, m7 paddsw m3, m8 - jmp m(iadst_4x16_internal).end2 + jmp m(iadst_4x16_internal_8bpc).end2 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] @@ -1209,7 +1209,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 - jmp m(iadst_8x4_internal).end3 + jmp m(iadst_8x4_internal_8bpc).end3 %endif %endmacro @@ -1218,13 +1218,13 @@ INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity -cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm3, [o(pw_2896x8)] pmulhrsw xm0, xm3, [cq+16*0] pmulhrsw xm1, xm3, [cq+16*1] pmulhrsw xm2, xm3, [cq+16*2] pmulhrsw xm3, [cq+16*3] - call m(idct_4x8_internal).main + call m(idct_4x8_internal_8bpc).main vbroadcasti128 m4, [o(deint_shuf)] vinserti128 m3, m1, xm3, 1 vinserti128 m1, m0, xm2, 1 @@ -1237,14 +1237,14 @@ cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 - jmp m(iadst_8x4_internal).end2 + jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity -cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] @@ -1252,7 +1252,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 - call m(iadst_4x8_internal).main_pass1 + call m(iadst_4x8_internal_8bpc).main_pass1 vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 punpckhwd m2, m0, m1 @@ -1289,7 +1289,7 @@ INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity -cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] @@ -1297,7 +1297,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 - call m(iadst_4x8_internal).main_pass1 + call m(iadst_4x8_internal_8bpc).main_pass1 vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 punpckhwd m1, m3, m2 @@ -1308,18 +1308,18 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 punpcklwd m0, m3 jmp tx2q .pass2: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main mova m2, m1 vpermq m1, m0, q2031 vpermq m0, m2, q2031 - jmp 
m(iadst_8x4_internal).end2 + jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity -cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm0, [cq+16*1] vinserti128 m2, [cq+16*2], 1 @@ -1340,7 +1340,7 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8 @@ -1372,7 +1372,7 @@ INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity -cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m2, [cq+32*2], q3120 ; 4 5 @@ -1398,7 +1398,7 @@ cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 - jmp m(iadst_8x8_internal).end2 + jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main IDCT8_1D_PACKED @@ -1409,7 +1409,7 @@ INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity -cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 @@ -1476,12 +1476,12 @@ INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity -cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 - call m(iadst_8x8_internal).main_pass1 + call m(iadst_8x8_internal_8bpc).main_pass1 vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m3, m2 punpcklwd m3, m2 @@ -1505,7 +1505,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 + call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m4, [o(pw_2048)] vpbroadcastd xm5, [o(pw_4096)] psubw m4, m5 ; lower half = -2048, upper half = 2048 @@ -1515,14 +1515,14 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m2, m1, q2031 pmulhrsw m1, m0, m4 pmulhrsw m0, m5, m4 - jmp m(iadst_8x8_internal).end3 + jmp m(iadst_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity -cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*1] vinserti128 m3, [cq+16*4], 1 @@ -1542,7 +1542,7 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16 @@ -1558,7 +1558,7 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8).end2 + jmp 
m(inv_txfm_add_dct_dct_8x8_8bpc).end2 %endif %endmacro @@ -1580,9 +1580,9 @@ INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity -cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [o(pw_16384)] .pass1_end: vperm2i128 m9, m3, m7, 0x31 @@ -1642,14 +1642,14 @@ INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity -cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end vpbroadcastd m10, [o(pw_16384)] pslld m9, m10, 17 psubw m10, m9 ; 16384, -16384 - jmp m(idct_8x16_internal).pass1_end + jmp m(idct_8x16_internal_8bpc).pass1_end ALIGN function_align .pass2: call .main @@ -1659,7 +1659,7 @@ ALIGN function_align psubw m8, m9 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 - jmp m(idct_8x16_internal).end2 + jmp m(idct_8x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main REPX {pshufd x, x, q1032}, m7, m1, m5, m3 @@ -1783,10 +1783,10 @@ INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity -cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end vpbroadcastd m9, [o(pw_16384)] pslld m10, m9, 17 psubw m10, m9 ; -16384, 16384 @@ -1802,10 +1802,10 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 punpckhwd m4, m0 punpcklwd m0, m3, m1 punpckhwd m3, m1 - jmp m(idct_8x16_internal).pass1_end2 + jmp m(idct_8x16_internal_8bpc).pass1_end2 .pass2: - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [o(pw_2048)] vpbroadcastd xm9, [o(pw_4096)] psubw m8, m9 @@ -1825,7 +1825,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw m5, m6, m8 pmulhrsw m6, m7, m8 pmulhrsw m7, m9, m8 - jmp m(idct_8x16_internal).end3 + jmp m(idct_8x16_internal_8bpc).end3 INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst @@ -1842,7 +1842,7 @@ INV_TXFM_8X16_FN identity, identity paddsw m%1, m%2 %endmacro -cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*2] add cq, 16*8 @@ -1883,7 +1883,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 vpbroadcastd m8, [o(pw_1697x16)] REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 - jmp m(idct_8x16_internal).end + jmp m(idct_8x16_internal_8bpc).end %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] pmovzxbw m%3, [dstq+%5] @@ -1941,7 +1941,7 @@ INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity -cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal 
idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] @@ -1950,7 +1950,7 @@ cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova xm5, [cq+16*5] mova xm6, [cq+16*6] mova xm7, [cq+16*7] - call m(idct_4x16_internal).main + call m(idct_4x16_internal_8bpc).main vinserti128 m6, m2, xm6, 1 vinserti128 m2, m0, xm4, 1 vinserti128 m0, m1, xm5, 1 @@ -1961,10 +1961,10 @@ cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 punpckhwd m4, m0, m1 punpcklwd m0, m1 mova m1, m6 - jmp m(iadst_16x4_internal).pass1_end + jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: call .main - jmp m(iadst_16x4_internal).end + jmp m(iadst_16x4_internal_8bpc).end ALIGN function_align cglobal_label .main vpbroadcastd m6, [o(pd_2048)] @@ -1976,13 +1976,13 @@ INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity -cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 - call m(iadst_4x16_internal).main2 - call m(iadst_4x16_internal).main_pass1_end + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end punpcklwd m4, m3, m1 punpcklwd m5, m2, m0 punpckhwd m0, m1 @@ -2080,13 +2080,13 @@ INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity -cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 - call m(iadst_4x16_internal).main2 - call m(iadst_4x16_internal).main_pass1_end + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end punpckhwd m4, m3, m2 punpckhwd m5, m1, m0 punpcklwd m0, m2 @@ -2097,10 +2097,10 @@ cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 vinserti128 m0, m4, xm5, 1 vperm2i128 m4, m4, m5, 0x31 psubw m1, m7, m6 - jmp m(iadst_16x4_internal).pass1_end + jmp m(iadst_16x4_internal_8bpc).pass1_end ALIGN function_align .pass2: - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [o(pw_2048)] REPX {pmulhrsw x, m4}, m3, m2, m1, m0 pxor m4, m4 @@ -2118,7 +2118,7 @@ INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity -cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm4, [cq+16*1] vinserti128 m2, [cq+16*4], 1 @@ -2161,7 +2161,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(iadst_16x4_internal).end + jmp m(iadst_16x4_internal_8bpc).end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8 @@ -2172,7 +2172,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2195,9 +2195,9 @@ INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity -cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 3120 
- call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main vpbroadcastd m10, [o(pw_16384)] punpckhwd m8, m0, m2 punpcklwd m0, m2 @@ -2265,10 +2265,10 @@ INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity -cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 - call m(iadst_8x16_internal).main2 - call m(iadst_8x16_internal).main_pass1_end + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end psubw m11, m9, m10 punpcklwd m8, m0, m2 punpckhwd m0, m2 @@ -2279,7 +2279,7 @@ cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 punpckhwd m6, m5, m7 punpcklwd m5, m7 REPX {pmulhrsw x, m11}, m8, m1, m4, m6 - jmp m(idct_16x8_internal).pass1_end + jmp m(idct_16x8_internal_8bpc).pass1_end ALIGN function_align .pass2: call .main @@ -2287,7 +2287,7 @@ ALIGN function_align pxor m8, m8 psubw m8, m9 REPX {pmulhrsw x, m9}, m0, m2, m4, m6 - jmp m(idct_16x8_internal).end2 + jmp m(idct_16x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main vpbroadcastd m10, [o(pd_2048)] @@ -2358,10 +2358,10 @@ INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity -cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 - call m(iadst_8x16_internal).main2 - call m(iadst_8x16_internal).main_pass1_end + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end psubw m9, m10 punpcklwd m8, m6, m4 punpckhwd m6, m4 @@ -2399,8 +2399,8 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 vperm2i128 m7, m8, 0x31 jmp tx2q .pass2: - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end pxor m8, m8 psubw m8, m9 pmulhrsw m10, m7, m8 @@ -2414,14 +2414,14 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 lea r3, [strideq*3] WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 WRITE_16X2 1, 2, 0, 1, strideq*2, r3 - jmp m(idct_16x8_internal).end3 + jmp m(idct_16x8_internal_8bpc).end3 INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity -cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 mova xm7, [cq+16*0] mova xm2, [cq+16*1] add cq, 16*8 @@ -2471,7 +2471,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_4096)] - jmp m(idct_16x8_internal).end + jmp m(idct_16x8_internal_8bpc).end %define o_base pw_5 + 128 @@ -2483,7 +2483,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2513,7 +2513,7 @@ INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity -cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main .pass1_end: @@ -2658,7 +2658,7 @@ INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst -cglobal 
iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main call .main_pass1_end @@ -2671,7 +2671,7 @@ cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 mova [rsp+16*1], xm8 pxor m8, m8 psubw m1, m8, m1 - jmp m(idct_16x16_internal).pass1_end2 + jmp m(idct_16x16_internal_8bpc).pass1_end2 ALIGN function_align .pass2: call .main @@ -2680,7 +2680,7 @@ ALIGN function_align mova [rsp+32*0], m6 pxor m6, m6 psubw m1, m6, m1 - jmp m(idct_16x16_internal).end2 + jmp m(idct_16x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main vpbroadcastd m15, [o(pd_2048)] @@ -2833,10 +2833,10 @@ INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst -cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass1_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass1_end pmulhrsw m6, m1 pmulhrsw m2, m1, m8 mova [rsp+32*2], m6 @@ -2869,10 +2869,10 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 vperm2i128 m13, m1, m0, 0x31 vinserti128 m1, m8, [rsp+32*2], 1 vperm2i128 m8, m8, [rsp+32*2], 0x31 - jmp m(idct_16x16_internal).pass1_end3 + jmp m(idct_16x16_internal_8bpc).pass1_end3 .pass2: - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end pmulhrsw m0, m1 pmulhrsw m8, m1 mova [rsp+32*0], m0 @@ -2900,7 +2900,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 lea dstq, [dstq+strideq*4] WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 - jmp m(idct_16x16_internal).end3 + jmp m(idct_16x16_internal_8bpc).end3 %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 pmulhrsw m%2, m%3, m%1 @@ -2911,7 +2911,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity -cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 vpbroadcastd m7, [o(pw_1697x16)] mova xm0, [cq+16* 0] vinserti128 m0, [cq+16*16], 1 @@ -2953,7 +2953,7 @@ cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 pmulhrsw m7, m0 psraw m7, 1 pavgw m7, m0 - jmp m(idct_16x16_internal).pass1_end3 + jmp m(idct_16x16_internal_8bpc).pass1_end3 ALIGN function_align .pass2: vpbroadcastd m15, [o(pw_1697x16)] @@ -2967,7 +2967,7 @@ ALIGN function_align pmulhrsw m15, m1 paddsw m1, m1 paddsw m15, m1 - jmp m(idct_16x16_internal).end + jmp m(idct_16x16_internal_8bpc).end %define o_base deint_shuf + 128 @@ -3028,7 +3028,7 @@ ALIGN function_align pmulhrsw m%2, m%3 %endmacro -cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jz .dconly @@ -3037,7 +3037,7 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob cmp eobd, 106 jle .fast LOAD_8ROWS cq+32*1, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vperm2i128 m11, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 @@ -3077,7 +3077,7 @@ 
cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob mova [rsp+32*2], m2 .fast: LOAD_8ROWS cq+32*0, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vperm2i128 m8, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 @@ -3135,7 +3135,7 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_8x8).end2 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] @@ -3175,7 +3175,7 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob RET ALIGN function_align cglobal_label .main_fast ; bottom half is zero - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] @@ -3190,7 +3190,7 @@ cglobal_label .main_fast ; bottom half is zero jmp .main2 ALIGN function_align cglobal_label .main - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] @@ -3291,7 +3291,7 @@ cglobal_label .main shufpd m%1, m%2, 0x0c %endmacro -cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -3341,7 +3341,7 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob jg .full pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(inv_txfm_add_dct_dct_8x32).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .pass2 .full: LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 @@ -3356,7 +3356,7 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 - call m(inv_txfm_add_dct_dct_8x32).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main .pass2: vpbroadcastd m12, [o(pw_8192)] REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 @@ -3397,7 +3397,7 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m10, 0x31 vinserti128 m3, xm10, 1 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 lea r2, [strideq*3] @@ -3442,7 +3442,7 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m12, 0x31 vinserti128 m3, xm12, 1 - call m(idct_16x8_internal).main2 + call m(idct_16x8_internal_8bpc).main2 vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 add r0, 16 @@ -3455,7 +3455,7 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob WRITE_16X2 6, 7, 0, 1, strideq*2, r2 RET -cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob vpbroadcastd m9, [pw_5] lea r4, [strideq*3] sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) @@ -3525,7 +3525,7 @@ ALIGN function_align punpcklqdq m6, m8 ret -cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob add cq, 16*8 vpbroadcastd m9, [pw_4096] lea r4, [strideq*3] @@ -3550,7 +3550,7 @@ cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob vinserti128 m7, [cq+16*7], 1 pxor m8, m8 REPX {mova 
[cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 @@ -3627,7 +3627,7 @@ cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob vextracti128 [r2+%7], m%3, 1 %endmacro -cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jz .dconly @@ -3635,7 +3635,7 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob base, tmp3 %undef cmp LOAD_16ROWS cq, 64, 1 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] lea tmp3q, [tmp1q+32*16] @@ -3682,7 +3682,7 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 16 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .full: mova [tmp1q-32*4], m1 mova [tmp1q-32*3], m3 @@ -3693,7 +3693,7 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob mova [tmp1q+32*2], m13 mova [tmp1q+32*3], m15 LOAD_16ROWS cq+32, 64, 1 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main lea r2, [tmp3q+32*8] mova m1, [rsp+32*1] mova [rsp+32*0], m6 @@ -3746,7 +3746,7 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob .idct16: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq @@ -3991,7 +3991,7 @@ ALIGN function_align vinserti128 m%1, xm%4, 1 %endmacro -cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4001,7 +4001,7 @@ cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 16 - jmp m(inv_txfm_add_dct_dct_32x8).dconly + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 vpbroadcastd m15, [o(pw_2896x8)] @@ -4023,7 +4023,7 @@ cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob pmulhrsw m15, [cq+32*31] lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_16ROWS cq+32*0, 32*2, 1, 0 pxor m15, m15 mov r3d, 8 @@ -4035,14 +4035,14 @@ cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob add cq, 32*4 dec r3d jg .zero_loop - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .pass1_end lea r2, [strideq*3] mov r3, dstq .pass2: vpbroadcastd m7, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main mova [rsp+32*2], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m2, m3, m0 @@ -4090,7 +4090,7 @@ ALIGN function_align IDCT32_PASS1_END 1, 9, 6, 7 ret -cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob %undef cmp lea rax, [o_base] vpbroadcastd m9, [o(pw_2896x8)] @@ -4124,7 +4124,7 @@ cglobal 
inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob vinserti128 m7, [cq+64*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 @@ -4159,7 +4159,7 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob jg .zero_loop RET -cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob %undef cmp lea rax, [o_base] vpbroadcastd m9, [o(pw_2896x8)] @@ -4192,7 +4192,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob vinserti128 m7, [cq+32*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 @@ -4223,7 +4223,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob jge .zero_loop RET -cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4232,7 +4232,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_32x8).dconly + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 @@ -4248,7 +4248,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob test tmp4d, tmp4d jl .fast LOAD_8ROWS_H cq+64*17, 64*2 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2 pxor m0, m0 REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ @@ -4256,7 +4256,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob mova [rsp], m15 jmp .idct16 .fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 @@ -4264,10 +4264,10 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob LOAD_8ROWS cq+64*0, 64*2 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_32x16).pass1_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round lea tmp3q, [tmp1q+32*32] mova m15, [rsp] mova [tmp3q-32*4], m0 @@ -4296,7 +4296,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 @@ -4313,7 +4313,7 @@ cglobal 
inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 @@ -4341,21 +4341,21 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob test tmp4d, tmp4d jl .fast2 LOAD_8ROWS_H tmp3q-32*4, 32 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf sub tmp3q, 32*8 LOAD_8ROWS_H tmp3q-32*4, 32 sub tmp3q, 32*16 jmp .pass2_loop_end .fast2: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast sub tmp3q, 32*24 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 .pass2_loop_end: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_16x32).pass2_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end lea tmp3q, [tmp1q-32*32] cmp tmp2q, tmp3q jb .ret @@ -4367,7 +4367,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob .ret: RET -cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob %undef cmp vpbroadcastd m9, [pw_8192] sub eobd, 136 ; if (eob < 136) @@ -4393,7 +4393,7 @@ cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob mova xm7, [cq+64* 7] vinserti128 m6, [cq+64*14], 1 vinserti128 m7, [cq+64*15], 1 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 @@ -4487,7 +4487,7 @@ cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob %endif %endmacro -cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4496,7 +4496,7 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 %undef cmp @@ -4506,12 +4506,12 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob mov r7d, eobd .pass1_loop: LOAD_16ROWS cq, 64 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round mova m15, [rsp+32*0] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m2 @@ -4559,7 +4559,7 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob .fast: mova [rsp], m8 lea tmp1q, [rsp+32*7] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -4601,7 +4601,7 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast add r2, 32*24 vpbroadcastd m15, 
[o(pd_2048)] add tmp1q, 32*16 @@ -4629,7 +4629,7 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob vinserti128 m6, [r3+32*0+16], 1 .fast3: add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -4654,8 +4654,8 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob vinserti128 m5, [r3+32*1+ 0], 1 vinserti128 m6, [r3+32*2+16], 1 .fast4: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 RET ALIGN function_align %define o_base idct64_mul - 8 @@ -4833,7 +4833,7 @@ cglobal_label .main_part2_internal jne .main_part2_pass2_loop ret -cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4876,7 +4876,7 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 lea tmp1q, [rsp+32*7] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -4900,7 +4900,7 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -4915,7 +4915,7 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob pxor m8, m8 REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -4929,8 +4929,8 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob mova m7, [cq+32* 3] pxor m8, m8 REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*36 lea r2, [strideq*3] mov tmp2d, 4 @@ -4971,8 +4971,8 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main mova [rsp+32*0], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 @@ -4997,7 +4997,7 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob jg .pass2_loop RET -cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -5007,7 +5007,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 64 - jmp m(inv_txfm_add_dct_dct_32x8).dconly + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*7] @@ -5021,7 +5021,7 @@ cglobal 
inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob test r10b, r10b jnz .fast LOAD_8ROWS_H cq+64*17, 64*2, 2 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2, 1 mova [rsp], m15 pxor m15, m15 @@ -5029,7 +5029,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob 24, 25, 26, 27, 28, 29, 30, 31 jmp .idct16 .fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 @@ -5037,10 +5037,10 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob LOAD_8ROWS cq+64*0, 64*2, 1 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_32x16).pass1_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end vpbroadcastd m7, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round lea r3, [tmp1q+32*48] mova m15, [rsp] mova [r3-32*4], m0 @@ -5069,7 +5069,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 @@ -5086,7 +5086,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 @@ -5119,7 +5119,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob .fast2: mova [rsp], m8 lea tmp1q, [rsp+32*39] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5153,7 +5153,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob .fast3: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -5171,7 +5171,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob mova m6, [r8+32*0] .fast4: add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -5188,8 +5188,8 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob mova m5, [r8-32*3] mova m6, [r8+32*2] .fast5: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 add r10d, 0x80000000 jc .ret lea r2, [rsp+32*7] @@ -5200,7 +5200,7 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob .ret: RET -cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -5210,7 +5210,7 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob mov [cq], eobd 
pmulhrsw xm0, xm1 mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_64x16).dconly + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 @@ -5222,7 +5222,7 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5246,7 +5246,7 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -5262,7 +5262,7 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] add rax, 8 add tmp1q, 32*8 @@ -5277,11 +5277,11 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob pmulhrsw m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave add cq, 32 add tmp4d, 0x80000000 jnc .pass1_loop @@ -5296,21 +5296,21 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob test tmp4d, 0x40000000 jnz .fast LOAD_8ROWS_H tmp2q-32*4, 32 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea tmp3q, [tmp2q-32*8] LOAD_8ROWS_H tmp3q-32*4, 32 mova [rsp], m15 jmp .idct16 .fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: lea tmp3q, [tmp1q-32*8] LOAD_8ROWS tmp3q-32*4, 32 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_16x32).pass2_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end add tmp1q, 32*16 sub dstq, r3 lea r2, [r2+r3+16] @@ -5340,7 +5340,7 @@ ALIGN function_align vinserti128 m6, [tmp2q+32*2], 1 vinserti128 m7, [tmp2q+32*3], 1 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova xm8, [tmp1q-32*4+16] mova xm9, [tmp1q-32*3+16] vinserti128 m8, [tmp2q-32*4+16], 1 @@ -5368,7 +5368,7 @@ ALIGN function_align pmulhrsw m0, m8, m10 pmulhrsw m1, m9, m10 REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 @@ -5382,7 +5382,7 @@ ALIGN function_align jg .loop ret -cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob +cglobal 
inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -5391,7 +5391,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 64 - jmp m(inv_txfm_add_dct_dct_64x16).dconly + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*71] @@ -5402,7 +5402,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5426,7 +5426,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -5441,7 +5441,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -5455,11 +5455,11 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob mova m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave add cq, 32 add r10d, 0x80000000 jnc .pass1_loop @@ -5482,7 +5482,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob mova m6, [r3+32*0] mova m7, [r3+32*2] .fast: - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5516,7 +5516,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add r2, 32*8 add r3, 32*8 @@ -5536,7 +5536,7 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob mova m6, [r3+32*0] ; 25 .fast3: add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -5553,8 +5553,8 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob mova m5, [r3-32*3] ; 19 mova m6, [r3+32*2] ; 29 .fast4: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 sub tmp1q, 32*28 sub dstq, r8 lea dstq, [dstq+strideq*4+16] diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index 3ebd3cc17c..89ad56bda9 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and 
dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; @@ -240,8 +240,8 @@ SECTION .text %endmacro %macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack -cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 - %define %%p1 m(i%1_%3_internal) +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) %if ARCH_X86_32 LEA r5, $$ %endif @@ -250,12 +250,12 @@ cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 test eobd, eobd jz %%end %endif - lea tx2q, [o(m(i%2_%3_internal).pass2)] + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] call %%p1 RET %%end: %else - lea tx2q, [o(m(i%2_%3_internal).pass2)] + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 @@ -277,7 +277,7 @@ ALIGN function_align mov [coeffq], eobd ;0 pmulhrsw m0, m1 mova m1, m0 - TAIL_CALL m(iadst_4x4_internal).end2 + TAIL_CALL m(iadst_4x4_internal_8bpc).end2 %endif %endmacro @@ -288,7 +288,7 @@ INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity -cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] ;high: in1 ;low: in0 mova m1, [coeffq+16*1] ;high: in3 ;low in2 @@ -315,7 +315,7 @@ INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity -cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call .main @@ -367,10 +367,10 @@ INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity -cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 ;high: in3 ;low :in2 @@ -378,7 +378,7 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .pass2: - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 @@ -393,7 +393,7 @@ INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity -cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] mova m3, [o(pw_1697x8)] @@ -413,7 +413,7 @@ cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 - jmp m(iadst_4x4_internal).end + jmp m(iadst_4x4_internal_8bpc).end %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ;low: in1 high: in3 @@ -429,7 +429,7 @@ cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 paddw m2, m1 ;low: out3 %endmacro -cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] pxor m2, m2 @@ -561,7 +561,7 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff mova m1, m0 mova m2, m0 mova m3, m0 - TAIL_CALL m(iadst_4x8_internal).end3 + TAIL_CALL m(iadst_4x8_internal_8bpc).end3 %endif %endmacro @@ -570,7 +570,7 @@ INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN 
dct, flipadst INV_TXFM_4X8_FN dct, identity -cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -578,15 +578,15 @@ cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m3, [coeffq+16*3] .pass1: - call m(idct_8x4_internal).main - jmp m(iadst_4x8_internal).pass1_end + call m(idct_8x4_internal_8bpc).main + jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: call .main shufps m1, m1, q1032 shufps m3, m3, q1032 mova m4, [o(pw_2048)] - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align .main: @@ -599,7 +599,7 @@ INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity -cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -607,7 +607,7 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m3, [coeffq+16*3] .pass1: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main .pass1_end: INV_4X8 @@ -690,7 +690,7 @@ INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity -cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -698,7 +698,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m3, [coeffq+16*3] .pass1: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main punpcklwd m4, m3, m2 punpckhwd m3, m2 @@ -713,7 +713,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal_8bpc).main mova m4, m0 mova m5, m1 @@ -724,14 +724,14 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [o(pw_2048)] pxor m4, m4 psubw m4, m5 - jmp m(iadst_4x8_internal).end + jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity -cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -748,11 +748,11 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(iadst_4x8_internal).pass1_end + jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: mova m4, [o(pw_4096)] - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 %macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] @@ -797,7 +797,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, m0 mova m2, m0 mova m3, m0 - TAIL_CALL m(iadst_8x4_internal).end2 + TAIL_CALL m(iadst_8x4_internal_8bpc).end2 %endif %endmacro @@ -806,14 +806,14 @@ INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity -cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] 
pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] - call m(idct_4x8_internal).main + call m(idct_4x8_internal_8bpc).main mova m4, [o(deint_shuf1)] mova m5, [o(deint_shuf2)] @@ -833,7 +833,7 @@ cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: call .main - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end ALIGN function_align .main: @@ -846,7 +846,7 @@ INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity -cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -855,7 +855,7 @@ cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 shufps m0, m0, q1032 shufps m1, m1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal_8bpc).main punpckhwd m4, m0, m1 punpcklwd m0, m1 @@ -964,7 +964,7 @@ INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity -cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -973,7 +973,7 @@ cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 shufps m0, m0, q1032 shufps m1, m1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal_8bpc).main punpckhwd m5, m3, m2 punpcklwd m3, m2 @@ -994,21 +994,21 @@ cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .pass2: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main mova m4, m0 mova m5, m1 mova m0, m3 mova m1, m2 mova m2, m5 mova m3, m4 - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity -cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -1043,7 +1043,7 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8, 8, 16*4 @@ -1060,7 +1060,7 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m0, m2 .end: mov r3d, 2 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] .loop: WRITE_8X4 0, 0, 0, 0, 1, 2, 3 lea dstq, [dstq+strideq*2] @@ -1110,7 +1110,7 @@ INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity -cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: @@ -1161,7 +1161,7 @@ cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main @@ -1222,7 +1222,7 @@ INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity -cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_8x8_internal_8bpc, 0, 0, 0, 
dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: @@ -1238,11 +1238,11 @@ cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pxor m6, m6 psubw m6, m7 mova m7, m6 - jmp m(idct_8x8_internal).pass1_end2 + jmp m(idct_8x8_internal_8bpc).pass1_end2 ALIGN function_align .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main @@ -1255,7 +1255,7 @@ ALIGN function_align pxor m6, m6 psubw m6, m7 mova m7, m6 - jmp m(idct_8x8_internal).end2 + jmp m(idct_8x8_internal_8bpc).end2 ALIGN function_align .main: @@ -1361,12 +1361,12 @@ INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity -cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: - call m(iadst_8x8_internal).main - call m(iadst_8x8_internal).main_pass1_end + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass1_end .pass1_end: mova m7, [o(pw_m16384)] @@ -1388,15 +1388,15 @@ cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m0, [rsp+gprsize+16*0] REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: - call m(iadst_8x8_internal).main - call m(iadst_8x8_internal).main_pass2_end + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass2_end .end: mova m7, [o(pw_2048)] @@ -1415,21 +1415,21 @@ ALIGN function_align pmulhrsw m0, [rsp+gprsize+16*0] mova m3, m5 mova [rsp+gprsize+16*0], m7 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity -cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .end: pmulhrsw m7, [o(pw_4096)] @@ -1438,7 +1438,7 @@ ALIGN function_align REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+gprsize+16*2], m5 mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 %macro INV_TXFM_4X16_FN 2 ; type1, type2 @@ -1469,8 +1469,8 @@ INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity -cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(idct_4x8_internal).pass1)] +cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] .pass1: mova m0, [coeffq+16*1] @@ -1478,7 +1478,7 @@ cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m2, [coeffq+16*5] mova m3, [coeffq+16*7] push tx2q - lea tx2q, [o(m(idct_4x16_internal).pass1_2)] + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] jmp r3 .pass1_2: @@ -1490,7 +1490,7 @@ cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, [coeffq+16*2] mova m2, [coeffq+16*4] mova m3, [coeffq+16*6] - lea tx2q, [o(m(idct_4x16_internal).pass1_end)] + lea tx2q, 
[o(m(idct_4x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: @@ -1507,7 +1507,7 @@ cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .pass2: - call m(idct_16x4_internal).main + call m(idct_16x4_internal_8bpc).main .end: mova m7, [o(pw_2048)] @@ -1538,13 +1538,13 @@ INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity -cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iadst_4x8_internal).pass1)] - jmp m(idct_4x16_internal).pass1 +cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 .pass2: - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass2_end + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 punpckhqdq m4, m5 ;low: out8 high: out10 @@ -1606,13 +1606,13 @@ INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity -cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iflipadst_4x8_internal).pass1)] - jmp m(idct_4x16_internal).pass1 +cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 .pass2: - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass2_end + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end punpckhqdq m6, m5, m4 ;low: out5 high: out7 punpcklqdq m4, m5 ;low: -out8 high: -out10 @@ -1628,7 +1628,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 punpckhqdq m2, m3 ;low: out1 high: out3 mova m7, [o(pw_m2048)] - jmp m(iadst_4x16_internal).end1 + jmp m(iadst_4x16_internal_8bpc).end1 INV_TXFM_4X16_FN identity, dct @@ -1646,7 +1646,7 @@ INV_TXFM_4X16_FN identity, identity paddsw m%1, m%2 %endmacro -cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*1] mova m6, [o(pw_1697x8)] mova m1, [coeffq+16*3] @@ -1672,7 +1672,7 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pcmpeqw m3, m7 pandn m2, m4 pandn m3, m5 - jmp m(iadst_4x8_internal).pass1_end + jmp m(iadst_4x8_internal_8bpc).pass1_end .pass1_2: mova [coeffq+16*1], m0 mova [coeffq+16*3], m1 @@ -1704,7 +1704,7 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*4], m4 - jmp m(iadst_4x16_internal).end2 + jmp m(iadst_4x16_internal_8bpc).end2 %macro INV_TXFM_16X4_FN 2 ; type1, type2 @@ -1715,7 +1715,7 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 movd m2, [o(pw_16384)] mov [coeffq], eobd mov r2d, 2 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] .dconly: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative @@ -1806,7 +1806,7 @@ INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity -cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main @@ -1850,7 +1850,7 @@ cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .pass2: - lea tx2q, 
[o(m(idct_8x4_internal).pass2)] + lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] .pass2_end: mova [coeffq+16*4], m4 @@ -1911,7 +1911,7 @@ INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity -cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main call .main_pass1_end @@ -1939,11 +1939,11 @@ cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 REPX {pmulhrsw x, m2}, m7, m3, m6 pmulhrsw m2, [coeffq+16*7] mova [coeffq+16*6], m7 - jmp m(idct_16x4_internal).pass1_end3 + jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iadst_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end + lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end ALIGN function_align .main: @@ -2095,10 +2095,10 @@ INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity -cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass1_end + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass1_end punpcklwd m6, m7, m0 ;packed out11, out15 punpckhwd m0, m7 ;packed -out0, -out4 @@ -2114,11 +2114,11 @@ cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 punpckhwd m1, m2 ;packed -out2, -out6 mova m7, [o(pw_m16384)] - jmp m(iadst_16x4_internal).pass1_end + jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iflipadst_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end + lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end INV_TXFM_16X4_FN identity, dct @@ -2126,7 +2126,7 @@ INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity -cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, [coeffq+16*6] mova m0, [coeffq+16*5] mova m2, [coeffq+16*7] @@ -2176,11 +2176,11 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 punpcklwd m4, m6 ;packed out8, out12 punpckhwd m6, m5, m7 ;packed out11, out15 punpcklwd m5, m7 ;packed out10, out14 - jmp m(idct_16x4_internal).pass1_end3 + jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iidentity_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end + lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end %macro SAVE_8ROWS 2 ;src, stride @@ -2209,8 +2209,8 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m0, m1 pmulhrsw m0, m2 mov r3d, 4 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET %endif @@ -2221,13 +2221,13 @@ INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity -cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(idct_8x8_internal).pass1)] +cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] .pass1: LOAD_8ROWS coeffq+16*1, 32, 1 mov [rsp+gprsize+16*11], tx2q - lea tx2q, 
[o(m(idct_8x16_internal).pass1_end)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: @@ -2237,7 +2237,7 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp r3 .pass2: - lea tx2q, [o(m(idct_8x16_internal).end)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] .pass2_pre: mova [coeffq+16*2 ], m1 @@ -2253,7 +2253,7 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [coeffq+16*13] .pass2_main: - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*2 ] @@ -2264,18 +2264,18 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*11] mova m7, [coeffq+16*15] - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, dstq lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -2287,12 +2287,12 @@ INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity -cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iadst_8x8_internal).pass1)] - jmp m(idct_8x16_internal).pass1 +cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 .pass2: - lea tx2q, [o(m(iadst_8x16_internal).end)] + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] .pass2_pre: mova [rsp+gprsize+16*7], m0 @@ -2318,19 +2318,19 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end mov r3, dstq lea dstq, [dstq+strideq*8] - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN flipadst, dct @@ -2338,12 +2338,12 @@ INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity -cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iflipadst_8x8_internal).pass1)] - jmp m(idct_8x16_internal).pass1 +cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 .pass2: - lea tx2q, [o(m(iflipadst_8x16_internal).end)] + lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] lea r3, [dstq+strideq*8] .pass2_pre: @@ -2370,16 +2370,16 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end - jmp m(iflipadst_8x8_internal).end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + jmp m(iflipadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, 
[o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iflipadst_8x8_internal).end + jmp m(iflipadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN identity, dct @@ -2387,22 +2387,22 @@ INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity -cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 mov r3, tx2q - lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)] + lea tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)] mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32, 1 mov tx2q, r3 mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iidentity_8x16_internal).end1)] + lea tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)] .end: mova [rsp+gprsize+16*0], m7 @@ -2420,11 +2420,11 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*0], m5 mova [rsp+gprsize+16*1], m6 mova [rsp+gprsize+16*2], m7 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end @@ -2438,8 +2438,8 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 4 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif @@ -2450,34 +2450,34 @@ INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity -cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 32, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*1, 32, 1 call .main mov r3, tx2q - lea tx2q, [o(m(idct_16x8_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 - jmp m(idct_8x8_internal).pass1_end + jmp m(idct_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(idct_16x8_internal).end)] + lea tx2q, [o(m(idct_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(idct_8x8_internal).pass2_main + jmp m(idct_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(idct_8x8_internal).pass2_main + jmp m(idct_8x8_internal_8bpc).pass2_main ALIGN function_align @@ -2567,7 +2567,7 @@ INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity -cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] @@ -2597,26 +2597,26 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 call .main call .main_pass1_end mov r3, tx2q - lea tx2q, 
[o(m(iadst_16x8_internal).pass1_end)] - jmp m(iadst_8x8_internal).pass1_end + lea tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)] + jmp m(iadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 - jmp m(iadst_8x8_internal).pass1_end + jmp m(iadst_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iadst_16x8_internal).end)] + lea tx2q, [o(m(iadst_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iadst_8x8_internal).pass2_main + jmp m(iadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iadst_8x8_internal).pass2_main + jmp m(iadst_8x8_internal_8bpc).pass2_main ALIGN function_align .main: @@ -2847,7 +2847,7 @@ INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity -cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] @@ -2874,34 +2874,34 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m6, m7, [coeffq+16*12] pmulhrsw m7, [coeffq+16*13] - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)] - jmp m(iflipadst_8x8_internal).pass1_end + lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 - jmp m(iflipadst_8x8_internal).pass1_end + jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iflipadst_16x8_internal).end)] + lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iflipadst_8x8_internal).pass2_main + jmp m(iflipadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iflipadst_8x8_internal).pass2_main + jmp m(iflipadst_8x8_internal_8bpc).pass2_main INV_TXFM_16X8_FN identity, dct @@ -2909,14 +2909,14 @@ INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity -cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*16 mova m4, [coeffq-16*7] mova m5, [coeffq-16*5] mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mov r3, tx2q - lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)] + lea tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)] .pass1: mova m0, [o(pw_2896x8)] @@ -2955,7 +2955,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m6, m3 mova m3, [rsp+gprsize+16*0] paddsw m0, m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: mova [coeffq+16*1], m4 @@ -2974,15 +2974,15 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp .pass1 .pass2: - lea tx2q, [o(m(iidentity_16x8_internal).end)] + lea tx2q, [o(m(iidentity_16x8_internal_8bpc).end)] lea r3, [dstq+8] 
- jmp m(iidentity_8x8_internal).end + jmp m(iidentity_8x8_internal_8bpc).end .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iidentity_8x8_internal).end + jmp m(iidentity_8x8_internal_8bpc).end %macro INV_TXFM_16X16_FN 2 ; type1, type2 @@ -2993,8 +2993,8 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif @@ -3005,35 +3005,35 @@ INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity -cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*3, 64 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, tx2q - lea tx2q, [o(m(idct_16x16_internal).pass1_end)] + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)] mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal).pass1_end1)] + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)] mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x16_internal).pass1_end2)] + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)] mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 @@ -3041,19 +3041,19 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(idct_16x16_internal).end)] - jmp m(idct_8x16_internal).pass2_pre + lea tx2q, [o(m(idct_16x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal).end1)] + lea tx2q, [o(m(idct_16x16_internal_8bpc).end1)] mov dstq, r3 lea r3, [dstq+8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -3070,8 +3070,8 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [coeffq+16*5 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*13] - lea tx2q, [o(m(idct_8x16_internal).end)] - jmp m(idct_8x16_internal).pass2_main + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_main %macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 @@ -3132,33 +3132,33 @@ INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst -cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, 
tx2 ITX_16X16_ADST_LOAD_ODD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iadst_16x16_internal).pass1_end)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)] mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)] mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end - lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)] mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 @@ -3166,19 +3166,19 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(iadst_16x16_internal).end)] - jmp m(iadst_8x16_internal).pass2_pre + lea tx2q, [o(m(iadst_16x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal).end1)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).end1)] mov dstq, r3 lea r3, [dstq+8] - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -3199,45 +3199,45 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*8], m5 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 - lea tx2q, [o(m(iadst_8x16_internal).end)] - jmp m(iadst_8x16_internal).pass2_main + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_main INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst -cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)] mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)] mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*17, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call 
m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)] mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 @@ -3245,19 +3245,19 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(iflipadst_16x16_internal).end)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iflipadst_8x16_internal).pass2_pre + jmp m(iflipadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).end1)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] - jmp m(iflipadst_8x8_internal).end + jmp m(iflipadst_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -3278,16 +3278,16 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).end2)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)] mov dstq, r3 - jmp m(iflipadst_8x16_internal).pass2_main + jmp m(iflipadst_8x16_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] - jmp m(iflipadst_8x8_internal).end + jmp m(iflipadst_8x8_internal_8bpc).end %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 @@ -3299,10 +3299,10 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity -cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*17 mov r3, tx2q - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)] .pass1: mova m6, [o(pw_1697x16)] @@ -3318,18 +3318,18 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 IDTX16B 5, 7, 6 mova m7, [coeffq+32*7] IDTX16B 7, 6, 6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq, 32 sub coeffq, 16 - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)] jmp .pass1 .pass1_end1: SAVE_8ROWS coeffq, 32 sub coeffq, 15*16 - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)] jmp .pass1 .pass1_end2: @@ -3340,7 +3340,7 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: lea r3, [dstq+8] - lea tx2q, [o(m(iidentity_16x16_internal).end1)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)] .end: mova [rsp+gprsize+16*0], m7 @@ -3359,11 +3359,11 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 pmulhrsw m4, m5 mova [rsp+gprsize+16*0], m6 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, 
[o(m(iidentity_16x16_internal).end2)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)] lea dstq, [dstq+strideq*2] jmp .end @@ -3373,24 +3373,24 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 32*8 LOAD_8ROWS coeffq, 32 - lea tx2q, [o(m(iidentity_16x16_internal).end3)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)] mov dstq, r3 jmp .end .end3: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end -cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_8x32_internal) + call m(idct_8x32_internal_8bpc) RET .dconly: @@ -3405,24 +3405,24 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 pshuflw m0, m0, q0000 punpcklwd m0, m0 mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET -cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp cmp eobd, 106 jle .fast LOAD_8ROWS coeffq+16*3, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1: mova [rsp+gprsize+16*9 ], m0 ;in24 @@ -3434,10 +3434,10 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*27], m5 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 LOAD_8ROWS coeffq+16*2, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_1: mova [rsp+gprsize+16*7 ], m0 ;in16 @@ -3451,10 +3451,10 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .fast: LOAD_8ROWS coeffq+16*1, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: mova [rsp+gprsize+16*5 ], m0 ;in8 @@ -3466,10 +3466,10 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: mova [rsp+gprsize+16*11], m2 ;in2 @@ -3487,7 +3487,7 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 mova m0, [rsp+gprsize+16*11] mova m1, [rsp+gprsize+16*12] @@ -3495,7 +3495,7 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, 
coeff, eob, tx2 mova m3, [rsp+gprsize+16*14] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -3507,20 +3507,20 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [rsp+gprsize+16*8 ] ;in20 mova m6, [rsp+gprsize+16*9 ] ;in24 mova m7, [rsp+gprsize+16*10] ;in28 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call .main .pass2: - lea r3, [o(m(idct_8x32_internal).end6)] + lea r3, [o(m(idct_8x32_internal_8bpc).end6)] .end: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_8x32_internal).end2)] + lea tx2q, [o(m(idct_8x32_internal_8bpc).end2)] .end1: pxor m7, m7 @@ -3532,29 +3532,29 @@ cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .end2: - lea tx2q, [o(m(idct_8x32_internal).end3)] - jmp m(idct_8x8_internal).end + lea tx2q, [o(m(idct_8x32_internal_8bpc).end3)] + jmp m(idct_8x8_internal_8bpc).end .end3: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal).end4)] - jmp m(idct_8x8_internal).end + lea tx2q, [o(m(idct_8x32_internal_8bpc).end4)] + jmp m(idct_8x8_internal_8bpc).end .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal).end5)] - jmp m(idct_8x8_internal).end + lea tx2q, [o(m(idct_8x32_internal_8bpc).end5)] + jmp m(idct_8x8_internal_8bpc).end .end5: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] mov tx2q, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end6: ret @@ -3870,13 +3870,13 @@ ALIGN function_align ret -cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x8_internal) + call m(idct_32x8_internal_8bpc) RET .dconly: @@ -3885,7 +3885,7 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] .body: pmulhrsw m0, m2 @@ -3920,14 +3920,14 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 RET -cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -3943,7 +3943,7 @@ cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cmp eobd, 106 jg .full - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: @@ -3956,66 +3956,66 @@ cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call 
m(idct_8x32_internal_8bpc).main .pass2: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x8_internal).end)] - jmp m(idct_8x32_internal).end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end)] + jmp m(idct_8x32_internal_8bpc).end1 .end: mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end1: lea r3, [dstq+8] - lea tx2q, [o(m(idct_32x8_internal).end2)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end2)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end3: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal).end4)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end4)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end5)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end5: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal).end6)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end6)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end6: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end7)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end7: mov dstq, r3 - lea tx2q, [o(m(idct_32x8_internal).end8)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end8: ret -cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r5d, 4 mov tx2d, 2 cmp eobd, 107 @@ -4024,19 +4024,19 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, %if ARCH_X86_32 LEA r5, $$ %endif - lea tx2q, [o(m(idct_32x8_internal).end8)] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] .loop: LOAD_8ROWS coeffq+16*0, 64 paddsw m6, [o(pw_5)] mova [rsp+16*1], m6 mova m6, [o(pw_5)] REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - call m(idct_8x8_internal).pass1_end3 + call m(idct_8x8_internal_8bpc).pass1_end3 REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 mova [rsp+16*2], m5 mova [rsp+16*1], m6 mova [rsp+16*0], m7 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 @@ -4045,7 +4045,7 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, jg .loop RET -cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r5d, 4 mov tx2d, 2 cmp eobd, 107 @@ -4061,15 +4061,15 @@ cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, mova [rsp+16*1], m6 mova m6, [o(pw_4096)] REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - lea tx2q, 
[o(m(idct_32x8_internal).end8)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + call m(idct_8x8_internal_8bpc).pass1_end3 mov [rsp+16*3], dstq mova [rsp+16*2], m5 mova [rsp+16*1], m6 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_8x8_internal).end4)] - call m(idct_8x8_internal).end3 + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + call m(idct_8x8_internal_8bpc).end3 add coeffq, 16*8 mov dstq, [rsp+16*3] @@ -4080,13 +4080,13 @@ cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, RET -cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_16x32_internal) + call m(idct_16x32_internal_8bpc) RET .dconly: @@ -4096,29 +4096,29 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET -cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp LOAD_8ROWS coeffq+16*1, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*5, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: mova [coeffq+16*1 ], m0 ;in8 @@ -4130,19 +4130,19 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: mova [rsp+gprsize+16*11], m2 ;in2 @@ -4160,7 +4160,7 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*5 ] ;in12 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [rsp+gprsize+16*11] ;in2 mova m1, [rsp+gprsize+16*12] ;in6 @@ -4168,11 +4168,11 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [rsp+gprsize+16*14] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, 
m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: @@ -4180,19 +4180,19 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [coeffq+16*4 ], m4 ;in4 LOAD_8ROWS coeffq+16*2, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*6, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: mova [coeffq+16*2 ], m0 ;in16 @@ -4205,19 +4205,19 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*32], m7 ;in23 LOAD_8ROWS coeffq+16*3, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*7, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: mova [rsp+gprsize+16*17], m2 ;in26 @@ -4235,21 +4235,21 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*5 ] ;in12 mova m4, [coeffq+16*2 ] ;in16 mova m5, [coeffq+16*6 ] ;in20 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .pass2: mov [rsp+gprsize*1+16*35], eobd lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 - lea r3, [o(m(idct_16x32_internal).end)] - jmp m(idct_8x32_internal).end + lea r3, [o(m(idct_16x32_internal_8bpc).end)] + jmp m(idct_8x32_internal_8bpc).end .end: mov dstq, [rsp+gprsize*2+16*35] @@ -4284,7 +4284,7 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] ;in2 @@ -4293,11 +4293,11 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*25] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp .end1 .full1: @@ -4305,7 +4305,7 @@ cglobal idct_16x32_internal, 0, 0, 0, 
dst, stride, coeff, eob, tx2 mova m5, [coeffq+16*18] ;in20 mova m6, [coeffq+16*3 ] ;in24 mova m7, [coeffq+16*19] ;in26 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] ;in2 @@ -4316,7 +4316,7 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [coeffq+16*26] ;in22 mova m6, [coeffq+16*11] ;in26 mova m7, [coeffq+16*27] ;in30 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4338,46 +4338,46 @@ cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .end1: - jmp m(idct_8x32_internal).pass2 + jmp m(idct_8x32_internal_8bpc).pass2 -cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x16_internal) - call m(idct_8x16_internal).pass2 + call m(idct_32x16_internal_8bpc) + call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*11, 16 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*19, 16 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*27, 16 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 RET .dconly: @@ -4387,22 +4387,22 @@ cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body -cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp add coeffq, 16 - lea r3, [o(m(idct_32x16_internal).pass1_end1)] + lea r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)] .pass1: LOAD_8ROWS coeffq+16*0, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4425,46 +4425,46 @@ cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .pass1_end: mova [rsp+gprsize+16*0 ], m7 mov tx2q, r3 - jmp m(idct_8x8_internal).pass1_end + jmp 
m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+16*32, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*48, 32 sub coeffq, 16 - lea r3, [o(m(idct_32x16_internal).end)] + lea r3, [o(m(idct_32x16_internal_8bpc).end)] jmp .pass1 .end: ret -cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, eobd @@ -4488,8 +4488,8 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, mova [rsp+16*1], m6 pxor m6, m6 REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*0], m2 mova [rsp+16*1], m3 mova [rsp+16*2], m4 @@ -4513,7 +4513,7 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, pmulhrsw m2, m4 pmulhrsw m3, m4 pmulhrsw m4, m7 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] add coeffq, 16 dec r3d @@ -4529,7 +4529,7 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, RET -cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 12 ;0100b @@ -4550,8 +4550,8 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, LOAD_8ROWS coeffq, 32, 1 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 mova [rsp+16*1], m6 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*1], m5 mova [rsp+16*2], m6 mova m6, [o(pw_1697x16)] @@ -4566,7 +4566,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 mova [rsp+16*2], m5 mova [rsp+16*1], m7 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 @@ -4589,14 +4589,14 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, RET -cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x32_internal) + call m(idct_32x32_internal_8bpc) RET .dconly: @@ -4605,11 +4605,11 @@ cglobal 
inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body -cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -4641,10 +4641,10 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .full: LOAD_8ROWS coeffq+64*0, 64*4 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4658,7 +4658,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: @@ -4668,7 +4668,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+256*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+128*1] @@ -4677,41 +4677,41 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+128*7] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 @@ -4724,7 +4724,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: mov coeffq, [rsp+gprsize*2+16*35] mov r3d, 4 - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] .pass2_loop: mov [rsp+gprsize*3+16*35], r3d @@ -4761,7 +4761,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [coeffq+16*18] mova m6, [coeffq+16*3 ] mova m7, [coeffq+16*19] - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS 
rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] @@ -4772,7 +4772,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m5, [coeffq+16*26] mova m6, [coeffq+16*11] mova m7, [coeffq+16*27] - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4793,7 +4793,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main jmp tx2q .fast1: @@ -4803,7 +4803,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*17] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] @@ -4812,19 +4812,19 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*25] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp tx2q .pass2_end: - lea r3, [o(m(idct_32x32_internal).pass2_end1)] - jmp m(idct_8x32_internal).end + lea r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end .pass2_end1: - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] @@ -4834,7 +4834,7 @@ cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ret -cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -4855,15 +4855,15 @@ cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, .loop: LOAD_8ROWS coeffq, 64 mova [rsp+16*1], m6 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 pmulhrsw m7, [o(pw_8192)] mova [rsp+16*0], m7 mova m7, [o(pw_8192)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+16*1], m6 mova [rsp+16*2], m5 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 @@ -4891,14 +4891,14 @@ cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, RET -cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_16x64_internal) + call m(idct_16x64_internal_8bpc) RET .dconly: @@ -4907,14 +4907,14 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET -cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -4931,21 +4931,21 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, 
stride, coeff, eob, tx2 .pass1_loop: LOAD_8ROWS coeffq+64*0, 64*2 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 @@ -4958,7 +4958,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 2 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] .pass2_loop: mov [rsp+gprsize*3+16*67], r3d @@ -4993,7 +4993,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*3] REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 @@ -5003,7 +5003,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [coeffq+16*19] REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -5024,7 +5024,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*6 ] ;in17 @@ -5049,7 +5049,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .fast: REPX {mova x, m4}, m2, m3, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 @@ -5057,7 +5057,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, [coeffq+16*17] REPX {mova x, m4}, m2, m3, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -5070,7 +5070,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 - call m(idct_8x32_internal).main_veryfast + call m(idct_8x32_internal_8bpc).main_veryfast SAVE_8ROWS rsp+gprsize+16*3, 16 call .main_fast @@ -5079,14 +5079,14 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, r4 - jmp m(idct_8x32_internal).end2 + jmp m(idct_8x32_internal_8bpc).end2 .end1: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] add rsp, 16*32 - lea r3, [o(m(idct_16x64_internal).end2)] - jmp m(idct_8x32_internal).end + lea r3, [o(m(idct_16x64_internal_8bpc).end2)] + jmp m(idct_8x32_internal_8bpc).end .end2: add coeffq, 16*32 @@ -5096,7 +5096,7 @@ cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] dec r3d jg .pass2_loop @@ 
-5751,14 +5751,14 @@ ALIGN function_align ret -cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_64x16_internal) + call m(idct_64x16_internal_8bpc) RET .dconly: @@ -5767,7 +5767,7 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)] .body: pmulhrsw m0, m2 @@ -5839,7 +5839,7 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx mova m7, [%1+%2*3] %endmacro -cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 2 mov [rsp+gprsize*2+16*67], dstq lea dstq, [rsp+gprsize+16*68] @@ -5848,14 +5848,14 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_4ROWS coeffq+32*0, 32*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+32*4, 32*8 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -5869,7 +5869,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+32*1, 32*2 @@ -5892,69 +5892,69 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 - call m(idct_16x64_internal).main + call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+32*24, 32 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+32*0, 32 LOAD_8ROWS 
rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+32*24, 32 @@ -5974,23 +5974,23 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+16*2, 32*2 LOAD_4ROWS_H coeffq+16*3, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal).end)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end)] lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal).end1)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -6011,23 +6011,23 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+16*2, 32*2 LOAD_4ROWS_H coeffq+16*3, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal).end2)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end2)] lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal).end3)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end3)] mov dstq, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end3: @@ -6041,14 +6041,14 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ret -cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x64_internal) + call m(idct_32x64_internal_8bpc) RET .dconly: @@ -6058,14 +6058,14 @@ cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body .end: RET -cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -6097,10 +6097,10 @@ cglobal 
idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .full: LOAD_8ROWS coeffq+64*0, 64*4, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4, 1 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -6114,50 +6114,50 @@ cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: LOAD_4ROWS coeffq, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+128*1, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 @@ -6171,18 +6171,18 @@ cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - jmp m(idct_16x64_internal).pass2_loop + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop -cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_64x32_internal) + call m(idct_64x32_internal_8bpc) RET .dconly: @@ -6192,13 +6192,13 @@ cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx pmulhrsw m0, m1 mov [coeffq], eobd mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] - jmp m(inv_txfm_add_dct_dct_64x16).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body .end: RET -cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -6220,14 +6220,14 @@ cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_4ROWS coeffq+64*0, 64*8, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call 
m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8, 1 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -6241,7 +6241,7 @@ cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2, 1 @@ -6264,61 +6264,61 @@ cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 - call m(idct_16x64_internal).main + call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: SAVE_8ROWS dstq+64*24, 64 @@ -6334,39 +6334,39 @@ cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov eobd, [rsp+gprsize*1+16*67] lea dstq, [dstq+32] mov [rsp+gprsize*1+16*35], eobd - lea tx2q, [o(m(idct_64x32_internal).pass2_end)] + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] mov r3d, 4 - jmp m(idct_32x32_internal).pass2_loop + jmp m(idct_32x32_internal_8bpc).pass2_loop .pass2_end: mova [rsp+gprsize+16*0], m7 - lea r3, 
[o(m(idct_64x32_internal).pass2_end1)] - jmp m(idct_8x32_internal).end2 + lea r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: - lea tx2q, [o(m(idct_64x32_internal).pass2_end)] + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] dec r3d - jg m(idct_32x32_internal).pass2_loop + jg m(idct_32x32_internal_8bpc).pass2_loop .pass2_end2: mov dstq, [rsp+gprsize*3+16*67] mov coeffq, [rsp+gprsize*2+16*67] - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] mov r3d, 4 - jmp m(idct_32x32_internal).pass2_loop + jmp m(idct_32x32_internal_8bpc).pass2_loop -cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_64x64_internal) + call m(idct_64x64_internal_8bpc) RET .dconly: @@ -6375,10 +6375,10 @@ cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] - jmp m(inv_txfm_add_dct_dct_64x16).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body -cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r5d, 4 @@ -6401,14 +6401,14 @@ cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_4ROWS coeffq+64*0, 64*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -6422,7 +6422,7 @@ cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 @@ -6445,69 +6445,69 @@ cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 - call m(idct_16x64_internal).main + call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova 
m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+64*24, 64 @@ -6524,16 +6524,16 @@ cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal).pass2_end)] - jmp m(idct_16x64_internal).pass2_loop + lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + jmp m(idct_16x64_internal_8bpc).pass2_loop .pass2_end: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] add rsp, 16*32 mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x64_internal).pass2_end1)] - jmp m(idct_8x32_internal).end2 + lea r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: add coeffq, 16*32 @@ -6543,10 +6543,10 @@ cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal).pass2_end)] + lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] dec r3d - jg m(idct_16x64_internal).pass2_loop + jg m(idct_16x64_internal_8bpc).pass2_loop .pass2_end2: mov coeffq, [rsp+gprsize*4+16*67] @@ -6555,5 +6555,5 @@ cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 sub dstq, 72 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - jmp m(idct_16x64_internal).pass2_loop + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop From 97c3ccfe0bd4b57717514a7431b8d959ae17f2fa Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 18 Jun 2021 13:37:15 +0200 Subject: [PATCH 119/188] x86: Fix warp_affine_8x8t_16bpc_ssse3 on 64-bit Windows + LLVM The stack size calculation ended up being incorrect when the stack alignment was larger than 16 due to auto-generated alignment padding. 
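[Editorial note, not part of the original patch: the short sketch below restates the fix for clarity. On WIN64, x86inc spills callee-saved xmm registers and rounds the requested stack space up to STACK_ALIGNMENT; since warp8x8t spills one xmm register fewer than warp8x8, the auto-generated alignment padding can differ between the two entry points, and the shared code would then read its temporaries at mismatched stack offsets. Requesting one extra 16-byte slot for warp8x8t in exactly that configuration keeps both padded sizes equal, which the added ASSERT verifies. The identifiers used (stksz, stack_size_padded_8x8t) are the ones introduced by the patch itself.]

    ; illustrative sketch mirroring the patch (x86inc macro syntax)
    %if WIN64 && STACK_ALIGNMENT == 16
    %assign stksz 16*14 ; one extra 16-byte slot compensates for the xmm spill difference
    %else
    %assign stksz 16*13
    %endif
    ; warp_affine_8x8t is declared with stksz bytes of stack, then its padded size is recorded:
    %assign stack_size_padded_8x8t stack_size_padded
    ; warp_affine_8x8 (declared with 16*13 bytes) must end up with the same padded size:
    ASSERT stack_size_padded == stack_size_padded_8x8t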
--- src/x86/mc16_sse.asm | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index 312d95cfed..a277585f51 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -2537,10 +2537,15 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my %if ARCH_X86_64 ; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that ; by allocating 16 bytes more stack space so that stack offsets match up. -cglobal warp_affine_8x8t_16bpc, 4, 13, 9, 16*(13+WIN64), dst, ds, src, ss, \ - delta, mx, tmp, \ - alpha, beta, filter, \ - my, gamma, cnt +%if WIN64 && STACK_ALIGNMENT == 16 +%assign stksz 16*14 +%else +%assign stksz 16*13 +%endif +cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +%assign stack_size_padded_8x8t stack_size_padded %else cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ filter, mx, my @@ -2609,6 +2614,7 @@ cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ mx, tmp, alpha, beta, \ filter, my, gamma, cnt +ASSERT stack_size_padded == stack_size_padded_8x8t %else cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ filter, mx, my From 14d7098914f79dbead320772b41d4667b96780ef Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sat, 12 Jun 2021 00:16:34 +0200 Subject: [PATCH 120/188] x86: itx: wht: Minor fixes * Rename macro for consistency. WHT has exactly one line per register. * Use REPX to make code more readable. --- src/x86/itx16_sse.asm | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 48c7674d12..6785064ab7 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -30,16 +30,24 @@ SECTION .text -%macro IWHT4_1D_PACKED 0 +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro IWHT4_1D 0 ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 - paddd m0, m1 ; in0 += in1 - psubd m4, m2, m3 ; tmp0 = in2 - in3 - psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 - psrad m5, 1 - psubd m2, m5, m1 ; in2 = tmp1 - in1 - psubd m5, m3 ; in1 = tmp1 - in3 - psubd m0, m5 ; in0 -= in1 - paddd m4, m2 ; in3 = tmp0 + in2 + paddd m0, m1 ; in0 += in1 + psubd m4, m2, m3 ; tmp0 = in2 - in3 + psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 + psrad m5, 1 + psubd m2, m5, m1 ; in2 = tmp1 - in1 + psubd m5, m3 ; in1 = tmp1 - in3 + psubd m0, m5 ; in0 -= in1 + paddd m4, m2 ; in3 = tmp0 + in2 ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 ; m4 = out3, m5 = out1 %endmacro @@ -50,11 +58,8 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax mova m1, [cq+16*1] mova m2, [cq+16*2] mova m3, [cq+16*3] - psrad m0, 2 - psrad m1, 2 - psrad m2, 2 - psrad m3, 2 - IWHT4_1D_PACKED + REPX {psrad x, 2}, m0, m1, m2, m3 + IWHT4_1D punpckldq m1, m0, m5 punpckhdq m3, m0, m5 punpckldq m5, m2, m4 @@ -64,7 +69,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax punpcklqdq m4, m3, m2 punpckhqdq m3, m2 mova m2, m4 - IWHT4_1D_PACKED + IWHT4_1D packssdw m0, m4 ; low: out3, high: out0 packssdw m2, m5 ; low: out2, high: out1 pxor m4, m4 From 8d1e82ca0a599cf81bc37cadabc015ce0fedede1 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sat, 19 Jun 2021 20:44:56 +0200 Subject: [PATCH 121/188] x86: itx: Port 10-bit 4x4 transforms to SSE4 64-bit 
32-bit inv_txfm_add_4x4_adst_adst_0_10bpc_c: 257.0 346.3 inv_txfm_add_4x4_adst_adst_0_10bpc_sse4: 47.1 51.7 inv_txfm_add_4x4_adst_adst_0_10bpc_avx2: 57.4 inv_txfm_add_4x4_adst_adst_1_10bpc_c: 259.8 345.6 inv_txfm_add_4x4_adst_adst_1_10bpc_sse4: 47.1 52.0 inv_txfm_add_4x4_adst_adst_1_10bpc_avx2: 56.9 inv_txfm_add_4x4_adst_dct_0_10bpc_c: 284.6 369.9 inv_txfm_add_4x4_adst_dct_0_10bpc_sse4: 42.2 46.0 inv_txfm_add_4x4_adst_dct_0_10bpc_avx2: 51.9 inv_txfm_add_4x4_adst_dct_1_10bpc_c: 285.2 369.8 inv_txfm_add_4x4_adst_dct_1_10bpc_sse4: 42.4 45.9 inv_txfm_add_4x4_adst_dct_1_10bpc_avx2: 51.9 inv_txfm_add_4x4_adst_flipadst_0_10bpc_c: 262.9 345.0 inv_txfm_add_4x4_adst_flipadst_0_10bpc_sse4: 46.8 50.1 inv_txfm_add_4x4_adst_flipadst_0_10bpc_avx2: 57.0 inv_txfm_add_4x4_adst_flipadst_1_10bpc_c: 262.1 345.6 inv_txfm_add_4x4_adst_flipadst_1_10bpc_sse4: 46.8 50.3 inv_txfm_add_4x4_adst_flipadst_1_10bpc_avx2: 57.1 inv_txfm_add_4x4_adst_identity_0_10bpc_c: 225.6 302.9 inv_txfm_add_4x4_adst_identity_0_10bpc_sse4: 38.0 42.3 inv_txfm_add_4x4_adst_identity_0_10bpc_avx2: 41.4 inv_txfm_add_4x4_adst_identity_1_10bpc_c: 225.7 303.1 inv_txfm_add_4x4_adst_identity_1_10bpc_sse4: 37.8 42.3 inv_txfm_add_4x4_adst_identity_1_10bpc_avx2: 41.4 inv_txfm_add_4x4_dct_adst_0_10bpc_c: 274.6 378.0 inv_txfm_add_4x4_dct_adst_0_10bpc_sse4: 44.8 48.5 inv_txfm_add_4x4_dct_adst_0_10bpc_avx2: 50.7 inv_txfm_add_4x4_dct_adst_1_10bpc_c: 274.0 377.4 inv_txfm_add_4x4_dct_adst_1_10bpc_sse4: 44.6 48.6 inv_txfm_add_4x4_dct_adst_1_10bpc_avx2: 51.0 inv_txfm_add_4x4_dct_dct_0_10bpc_c: 39.2 50.6 inv_txfm_add_4x4_dct_dct_0_10bpc_sse4: 29.1 33.8 inv_txfm_add_4x4_dct_dct_0_10bpc_avx2: 29.3 inv_txfm_add_4x4_dct_dct_1_10bpc_c: 300.6 399.0 inv_txfm_add_4x4_dct_dct_1_10bpc_sse4: 39.7 44.3 inv_txfm_add_4x4_dct_dct_1_10bpc_avx2: 48.6 inv_txfm_add_4x4_dct_flipadst_0_10bpc_c: 278.6 377.8 inv_txfm_add_4x4_dct_flipadst_0_10bpc_sse4: 45.3 49.6 inv_txfm_add_4x4_dct_flipadst_0_10bpc_avx2: 50.2 inv_txfm_add_4x4_dct_flipadst_1_10bpc_c: 277.1 378.3 inv_txfm_add_4x4_dct_flipadst_1_10bpc_sse4: 45.0 49.7 inv_txfm_add_4x4_dct_flipadst_1_10bpc_avx2: 50.2 inv_txfm_add_4x4_dct_identity_0_10bpc_c: 246.9 335.8 inv_txfm_add_4x4_dct_identity_0_10bpc_sse4: 37.1 41.7 inv_txfm_add_4x4_dct_identity_0_10bpc_avx2: 37.4 inv_txfm_add_4x4_dct_identity_1_10bpc_c: 247.2 336.2 inv_txfm_add_4x4_dct_identity_1_10bpc_sse4: 37.1 41.6 inv_txfm_add_4x4_dct_identity_1_10bpc_avx2: 37.3 inv_txfm_add_4x4_flipadst_adst_0_10bpc_c: 259.4 351.7 inv_txfm_add_4x4_flipadst_adst_0_10bpc_sse4: 47.1 51.8 inv_txfm_add_4x4_flipadst_adst_0_10bpc_avx2: 57.9 inv_txfm_add_4x4_flipadst_adst_1_10bpc_c: 258.7 350.8 inv_txfm_add_4x4_flipadst_adst_1_10bpc_sse4: 47.1 51.8 inv_txfm_add_4x4_flipadst_adst_1_10bpc_avx2: 57.4 inv_txfm_add_4x4_flipadst_dct_0_10bpc_c: 282.3 375.4 inv_txfm_add_4x4_flipadst_dct_0_10bpc_sse4: 42.2 45.8 inv_txfm_add_4x4_flipadst_dct_0_10bpc_avx2: 52.5 inv_txfm_add_4x4_flipadst_dct_1_10bpc_c: 283.0 375.8 inv_txfm_add_4x4_flipadst_dct_1_10bpc_sse4: 42.5 45.9 inv_txfm_add_4x4_flipadst_dct_1_10bpc_avx2: 52.4 inv_txfm_add_4x4_flipadst_flipadst_0_10bpc_c: 258.8 356.1 inv_txfm_add_4x4_flipadst_flipadst_0_10bpc_sse4: 47.3 50.1 inv_txfm_add_4x4_flipadst_flipadst_0_10bpc_avx2: 57.4 inv_txfm_add_4x4_flipadst_flipadst_1_10bpc_c: 259.0 355.3 inv_txfm_add_4x4_flipadst_flipadst_1_10bpc_sse4: 47.8 50.2 inv_txfm_add_4x4_flipadst_flipadst_1_10bpc_avx2: 57.4 inv_txfm_add_4x4_flipadst_identity_0_10bpc_c: 228.6 309.4 inv_txfm_add_4x4_flipadst_identity_0_10bpc_sse4: 37.8 42.0 inv_txfm_add_4x4_flipadst_identity_0_10bpc_avx2: 
41.4 inv_txfm_add_4x4_flipadst_identity_1_10bpc_c: 229.1 309.6 inv_txfm_add_4x4_flipadst_identity_1_10bpc_sse4: 37.9 42.2 inv_txfm_add_4x4_flipadst_identity_1_10bpc_avx2: 41.3 inv_txfm_add_4x4_identity_adst_0_10bpc_c: 200.8 275.8 inv_txfm_add_4x4_identity_adst_0_10bpc_sse4: 39.0 43.9 inv_txfm_add_4x4_identity_adst_0_10bpc_avx2: 47.4 inv_txfm_add_4x4_identity_adst_1_10bpc_c: 200.8 276.5 inv_txfm_add_4x4_identity_adst_1_10bpc_sse4: 39.0 44.0 inv_txfm_add_4x4_identity_adst_1_10bpc_avx2: 47.2 inv_txfm_add_4x4_identity_dct_0_10bpc_c: 226.4 300.3 inv_txfm_add_4x4_identity_dct_0_10bpc_sse4: 36.9 41.7 inv_txfm_add_4x4_identity_dct_0_10bpc_avx2: 42.8 inv_txfm_add_4x4_identity_dct_1_10bpc_c: 229.0 300.6 inv_txfm_add_4x4_identity_dct_1_10bpc_sse4: 36.8 41.6 inv_txfm_add_4x4_identity_dct_1_10bpc_avx2: 42.7 inv_txfm_add_4x4_identity_flipadst_0_10bpc_c: 202.6 278.9 inv_txfm_add_4x4_identity_flipadst_0_10bpc_sse4: 39.2 43.7 inv_txfm_add_4x4_identity_flipadst_0_10bpc_avx2: 47.1 inv_txfm_add_4x4_identity_flipadst_1_10bpc_c: 202.6 279.3 inv_txfm_add_4x4_identity_flipadst_1_10bpc_sse4: 39.2 43.8 inv_txfm_add_4x4_identity_flipadst_1_10bpc_avx2: 47.0 inv_txfm_add_4x4_identity_identity_0_10bpc_c: 168.7 235.9 inv_txfm_add_4x4_identity_identity_0_10bpc_sse4: 31.7 37.6 inv_txfm_add_4x4_identity_identity_0_10bpc_avx2: 33.9 inv_txfm_add_4x4_identity_identity_1_10bpc_c: 169.1 235.7 inv_txfm_add_4x4_identity_identity_1_10bpc_sse4: 31.7 37.4 inv_txfm_add_4x4_identity_identity_1_10bpc_avx2: 33.8 --- src/x86/itx16_sse.asm | 465 ++++++++++++++++++++++++++++++++++++++++++ src/x86/itx_sse.asm | 3 +- 2 files changed, 467 insertions(+), 1 deletion(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 6785064ab7..3bd9ab5546 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -2,6 +2,7 @@ ; Copyright © 2021, Two Orioles, LLC ; Copyright © 2017-2021, The rav1e contributors ; Copyright © 2020, Nathan Egge +; Copyright © 2021, Matthias Dressel ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without @@ -28,6 +29,62 @@ %include "config.asm" %include "ext/x86/x86inc.asm" +SECTION_RODATA +%macro COEF 1 +pd_%1: times 4 dd %1 +%endmacro + +COEF 201 +COEF 401 +COEF 601 +COEF 799 +COEF 995 +COEF 1189 +COEF 1380 +COEF 1567 +COEF 1751 +COEF 1931 +COEF 2106 +COEF 2276 +COEF 2440 +COEF 2598 +COEF 2751 +COEF 2896 +COEF 3035 +COEF 3166 +COEF 3290 +COEF 3406 +COEF 3513 +COEF 3612 +COEF 3703 +COEF 3784 +COEF 3857 +COEF 3920 +COEF 3973 +COEF 4017 +COEF 4052 +COEF 4076 +COEF 4091 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +pd_1321: times 4 dd 1321 +pd_2482: times 4 dd 2482 +pd_m3344: times 4 dd -3344 +pd_2048: times 4 dd 2048 +pw_2048: times 8 dw 2048 +pd_3803: times 4 dd 3803 +pd_5793: times 4 dd 5793 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pixel_10bpc_max: times 8 dw 0x03ff + +pw_1567_3784: times 4 dw 1567, 3784 +pw_m3784_1567: times 4 dw -3784, 1567 + +cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 +cextern iadst_4x4_internal_8bpc_ssse3.main + SECTION .text %macro REPX 2-* @@ -38,6 +95,26 @@ SECTION .text %endrep %endmacro +%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) +%define m(x) m_suffix(x, SUFFIX) + +; This refers to the first function in itx_sse i.e. the start of the text section +; which is needed as a base pointer for constants. 
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r6-$$+x ; PIC +%endif + +%macro TRANSPOSE4X4_PACKED 3 ; src/dst[1-2], tmp + punpckhwd m%3, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%1, m%3 ; low: out2 ; high: out3 + punpcklwd m%1, m%3 ; low: out0 ; high: out1 +%endmacro + %macro IWHT4_1D 0 ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 paddd m0, m1 ; in0 += in1 @@ -96,3 +173,391 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax movq [r2 +strideq*0], m2 ; write out2 movq [dstq+strideq*0], m0 ; write out3 RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +; %1 dst/src[1] +; %2 dst/src[2] +; %3 tmp[1] +; %4 tmp[2] +; %5 tmp[3] +; %6 rnd +; %7 coef[1] +; %8 coef[2] +; %9 flags +%ifnidn %7,%8 ; optimize when coef1 == coef2 +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else + mova m%3, [o(pd_%8)] + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else + mova m%5, [o(pd_%7)] + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 ; invert dst2 + paddd m%4, m%2 + psubd m%2, m%6, m%4 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%4, m%6 +%else + paddd m%1, m%6 +%endif +%endif +%ifnidn %7,%8 + paddd m%2, m%4 +%else + mova m%3, m%2 + paddd m%2, m%1 +%endif +%endif +%if %9 & 2 ; invert dst1 + psubd m%3, m%1 + paddd m%1, m%3, m%6 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%1, m%6 +%endif +%endif + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, 8, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_16bpc) +%if ARCH_X86_32 + LEA r6, $$ +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] + call %%p1 + RET +%%end: +%else + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + movd m1, [o(pw_2896x8)] + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + movd m0, r5d + packssdw m0, m0 + pmulhrsw m0, m1 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova m1, m0 + jmp m(iadst_4x4_internal_16bpc).end +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ; butterfly rotation + ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0 + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3 + ; Hadamard rotation + psubd m%5, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%3, m%4 + psubd m%3, m%4 + ; %1 (src1) = out0 + ; %2 (src2) = out1 + ; %3 (src3) = out3 + ; $5 (tmp1) = out2 +%endmacro + +INIT_XMM sse4 + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + ; transpose + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_m3784_1567)] + punpckhwd m2, m1, m0 + psubw m3, m0, m1 + paddw m0, m1 + punpcklqdq m0, m3 + pmaddwd m4, m2 + pmaddwd m2, [o(pw_1567_3784)] + pmulhrsw m0, [o(pw_2896x8)] ; t0 t1 + paddd m4, m5 + paddd m2, m5 + psrad m4, 12 + psrad m2, 12 + packssdw m2, m4 ; t3 t2 + psubsw m1, m0, m2 ; tmp3 tmp2 + paddsw m0, m2 ; tmp0 tmp1 + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*1] + movhps m3, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movhps [r5 +strideq*0], m1 + movq [r5 +strideq*1], m1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call .main + TRANSPOSE4X4_PACKED 0, 1, 2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main +.end: + mova m4, [o(pw_2048)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 
+strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET +ALIGN function_align +.main: + mova m1, [cq+16*2] + mova m3, [cq+16*3] + mova m0, [o(pd_1321)] ; SINPI_1_9 + mova m2, [o(pd_2482)] ; SINPI_2_9 + mova m6, [o(pd_3803)] ; SINPI_4_9 + mova m5, [cq+16*0] + pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] + pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] + pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] + pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] + psubd m1, m3 ; T[2] - T[3] + pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] + pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] + paddd m0, m6 ; s[0] += s[3] + paddd m0, m3 ; s[0] += s[5] + mova m3, [o(pd_m3344)] ; -SINPI_3_9 + psubd m2, m4 ; s[1] -= s[4] + psubd m2, m7 ; s[1] -= s[6] + psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] + pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 + pmulld m3, [cq+16*1] ; -s[3] = -SINPI_3_9 * T[1] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 + paddd m4, m0, m2 ; x[3] = s[0] + s[1] + psubd m2, m3 ; x[1] = s[1] + s[3] + psubd m0, m3 ; x[0] = s[0] + s[3] + paddd m4, m3 ; x[3] -= s[3] + paddd m2, m5 ; x[1] + 2048 + REPX {psrad x, 12}, m0, m2, m1, m4 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ret + + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + ; transpose + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main + mova m4, [o(pw_2048)] + movq m3, [dstq+strideq*1] + movhps m3, [dstq+strideq*0] + lea r5, [dstq+strideq*2] + movq m2, [r5 +strideq*1] + movhps m2, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movhps [dstq+strideq*0], m1 + movq [dstq+strideq*1], m1 + movhps [r5 +strideq*0], m0 + movq [r5 +strideq*1], m0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m3, [o(pd_5793)] + pmulld m0, m3, [cq+16*0] + pmulld m1, m3, [cq+16*1] + pmulld m2, m3, [cq+16*2] + pmulld m3, [cq+16*3] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m1, m2, m3 + TRANSPOSE4X4_PACKED 0, 1, 2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_1697x8)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + pmulhrsw m3, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m3 + paddsw m1, m4 + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m4, [o(pixel_10bpc_max)] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pxor m5, m5 + mova [cq+16*0], m5 + mova [cq+16*1], m5 + mova [cq+16*2], m5 + mova [cq+16*3], m5 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m5 + pmaxsw m1, m5 + pminsw m0, m4 + pminsw m1, m4 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 
+strideq*1], m1 + RET diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index 89ad56bda9..9ff8a01519 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -282,6 +282,7 @@ ALIGN function_align %endmacro INIT_XMM ssse3 +; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst @@ -337,7 +338,7 @@ cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX4_END 0, 1, 2, 3 ALIGN function_align -.main: +cglobal_label .main punpcklwd m2, m0, m1 ;unpacked in0 in2 punpckhwd m0, m1 ;unpacked in1 in3 mova m3, m0 From bd6a2bc348097d826a34a0ea799f2986371c4ecb Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sun, 20 Jun 2021 18:25:29 +0200 Subject: [PATCH 122/188] x86: Add bpc suffix to ipred functions --- src/asm/x86/predict.rs | 132 ++++++++++++++++++++++------------------- src/x86/ipred_avx2.asm | 48 +++++++-------- src/x86/ipred_sse.asm | 48 +++++++-------- 3 files changed, 118 insertions(+), 110 deletions(-) diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index 0883de6aa1..0545fae1da 100644 --- a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -32,28 +32,28 @@ macro_rules! decl_angular_ipred_fn { } decl_angular_ipred_fn! { - rav1e_ipred_dc_avx2, - rav1e_ipred_dc_ssse3, - rav1e_ipred_dc_128_avx2, - rav1e_ipred_dc_128_ssse3, - rav1e_ipred_dc_left_avx2, - rav1e_ipred_dc_left_ssse3, - rav1e_ipred_dc_top_avx2, - rav1e_ipred_dc_top_ssse3, - rav1e_ipred_v_avx2, - rav1e_ipred_v_ssse3, - rav1e_ipred_h_avx2, - rav1e_ipred_h_ssse3, - rav1e_ipred_z1_avx2, - rav1e_ipred_z3_avx2, - rav1e_ipred_smooth_avx2, - rav1e_ipred_smooth_ssse3, - rav1e_ipred_smooth_v_avx2, - rav1e_ipred_smooth_v_ssse3, - rav1e_ipred_smooth_h_avx2, - rav1e_ipred_smooth_h_ssse3, - rav1e_ipred_paeth_avx2, - rav1e_ipred_paeth_ssse3 + rav1e_ipred_dc_8bpc_avx2, + rav1e_ipred_dc_8bpc_ssse3, + rav1e_ipred_dc_128_8bpc_avx2, + rav1e_ipred_dc_128_8bpc_ssse3, + rav1e_ipred_dc_left_8bpc_avx2, + rav1e_ipred_dc_left_8bpc_ssse3, + rav1e_ipred_dc_top_8bpc_avx2, + rav1e_ipred_dc_top_8bpc_ssse3, + rav1e_ipred_v_8bpc_avx2, + rav1e_ipred_v_8bpc_ssse3, + rav1e_ipred_h_8bpc_avx2, + rav1e_ipred_h_8bpc_ssse3, + rav1e_ipred_z1_8bpc_avx2, + rav1e_ipred_z3_8bpc_avx2, + rav1e_ipred_smooth_8bpc_avx2, + rav1e_ipred_smooth_8bpc_ssse3, + rav1e_ipred_smooth_v_8bpc_avx2, + rav1e_ipred_smooth_v_8bpc_ssse3, + rav1e_ipred_smooth_h_8bpc_avx2, + rav1e_ipred_smooth_h_8bpc_ssse3, + rav1e_ipred_paeth_8bpc_avx2, + rav1e_ipred_paeth_8bpc_ssse3 } macro_rules! decl_angular_ipred_hbd_fn { @@ -90,7 +90,7 @@ decl_angular_ipred_hbd_fn! { // the distance between the predicted block's top-left pixel and the frame's edge. // It is required for the intra edge filtering process. extern { - fn rav1e_ipred_z2_avx2( + fn rav1e_ipred_z2_8bpc_avx2( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, angle: libc::c_int, dx: libc::c_int, dy: libc::c_int, @@ -118,14 +118,14 @@ macro_rules! decl_cfl_pred_fn { } decl_cfl_pred_fn! 
{ - rav1e_ipred_cfl_avx2, - rav1e_ipred_cfl_ssse3, - rav1e_ipred_cfl_128_avx2, - rav1e_ipred_cfl_128_ssse3, - rav1e_ipred_cfl_left_avx2, - rav1e_ipred_cfl_left_ssse3, - rav1e_ipred_cfl_top_avx2, - rav1e_ipred_cfl_top_ssse3 + rav1e_ipred_cfl_8bpc_avx2, + rav1e_ipred_cfl_8bpc_ssse3, + rav1e_ipred_cfl_128_8bpc_avx2, + rav1e_ipred_cfl_128_8bpc_ssse3, + rav1e_ipred_cfl_left_8bpc_avx2, + rav1e_ipred_cfl_left_8bpc_ssse3, + rav1e_ipred_cfl_top_8bpc_avx2, + rav1e_ipred_cfl_top_8bpc_ssse3 } macro_rules! decl_cfl_pred_hbd_fn { @@ -178,17 +178,17 @@ pub fn dispatch_predict_intra( match mode { PredictionMode::DC_PRED => { (match variant { - PredictionVariant::NONE => rav1e_ipred_dc_128_avx2, - PredictionVariant::LEFT => rav1e_ipred_dc_left_avx2, - PredictionVariant::TOP => rav1e_ipred_dc_top_avx2, - PredictionVariant::BOTH => rav1e_ipred_dc_avx2, + PredictionVariant::NONE => rav1e_ipred_dc_128_8bpc_avx2, + PredictionVariant::LEFT => rav1e_ipred_dc_left_8bpc_avx2, + PredictionVariant::TOP => rav1e_ipred_dc_top_8bpc_avx2, + PredictionVariant::BOTH => rav1e_ipred_dc_8bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED if angle == 90 => { - rav1e_ipred_v_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_v_8bpc_avx2(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::H_PRED if angle == 180 => { - rav1e_ipred_h_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_h_8bpc_avx2(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED | PredictionMode::H_PRED @@ -224,42 +224,46 @@ pub fn dispatch_predict_intra( ); if angle <= 90 { - rav1e_ipred_z1_avx2( + rav1e_ipred_z1_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } else if angle < 180 { - rav1e_ipred_z2_avx2( + rav1e_ipred_z2_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, ); } else { - rav1e_ipred_z3_avx2( + rav1e_ipred_z3_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } } PredictionMode::SMOOTH_PRED => { - rav1e_ipred_smooth_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_smooth_8bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, + ); } PredictionMode::SMOOTH_V_PRED => { - rav1e_ipred_smooth_v_avx2( + rav1e_ipred_smooth_v_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_H_PRED => { - rav1e_ipred_smooth_h_avx2( + rav1e_ipred_smooth_h_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::PAETH_PRED => { - rav1e_ipred_paeth_avx2(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_paeth_8bpc_avx2( + dst_ptr, stride, edge_ptr, w, h, angle, + ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { - PredictionVariant::NONE => rav1e_ipred_cfl_128_avx2, - PredictionVariant::LEFT => rav1e_ipred_cfl_left_avx2, - PredictionVariant::TOP => rav1e_ipred_cfl_top_avx2, - PredictionVariant::BOTH => rav1e_ipred_cfl_avx2, + PredictionVariant::NONE => rav1e_ipred_cfl_128_8bpc_avx2, + PredictionVariant::LEFT => rav1e_ipred_cfl_left_8bpc_avx2, + PredictionVariant::TOP => rav1e_ipred_cfl_top_8bpc_avx2, + PredictionVariant::BOTH => rav1e_ipred_cfl_8bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); } _ => call_rust(dst), @@ -268,41 +272,45 @@ pub fn dispatch_predict_intra( match mode { PredictionMode::DC_PRED => { (match variant { - PredictionVariant::NONE => rav1e_ipred_dc_128_ssse3, - PredictionVariant::LEFT => rav1e_ipred_dc_left_ssse3, - PredictionVariant::TOP => rav1e_ipred_dc_top_ssse3, - PredictionVariant::BOTH => rav1e_ipred_dc_ssse3, + PredictionVariant::NONE => 
rav1e_ipred_dc_128_8bpc_ssse3, + PredictionVariant::LEFT => rav1e_ipred_dc_left_8bpc_ssse3, + PredictionVariant::TOP => rav1e_ipred_dc_top_8bpc_ssse3, + PredictionVariant::BOTH => rav1e_ipred_dc_8bpc_ssse3, })(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED if angle == 90 => { - rav1e_ipred_v_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_v_8bpc_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::H_PRED if angle == 180 => { - rav1e_ipred_h_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_h_8bpc_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::SMOOTH_PRED => { - rav1e_ipred_smooth_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_smooth_8bpc_ssse3( + dst_ptr, stride, edge_ptr, w, h, angle, + ); } PredictionMode::SMOOTH_V_PRED => { - rav1e_ipred_smooth_v_ssse3( + rav1e_ipred_smooth_v_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_H_PRED => { - rav1e_ipred_smooth_h_ssse3( + rav1e_ipred_smooth_h_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::PAETH_PRED => { - rav1e_ipred_paeth_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); + rav1e_ipred_paeth_8bpc_ssse3( + dst_ptr, stride, edge_ptr, w, h, angle, + ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { - PredictionVariant::NONE => rav1e_ipred_cfl_128_ssse3, - PredictionVariant::LEFT => rav1e_ipred_cfl_left_ssse3, - PredictionVariant::TOP => rav1e_ipred_cfl_top_ssse3, - PredictionVariant::BOTH => rav1e_ipred_cfl_ssse3, + PredictionVariant::NONE => rav1e_ipred_cfl_128_8bpc_ssse3, + PredictionVariant::LEFT => rav1e_ipred_cfl_left_8bpc_ssse3, + PredictionVariant::TOP => rav1e_ipred_cfl_top_8bpc_ssse3, + PredictionVariant::BOTH => rav1e_ipred_cfl_8bpc_ssse3, })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); } _ => call_rust(dst), diff --git a/src/x86/ipred_avx2.asm b/src/x86/ipred_avx2.asm index 6838110d33..dd188a7f37 100644 --- a/src/x86/ipred_avx2.asm +++ b/src/x86/ipred_avx2.asm @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; @@ -141,7 +141,7 @@ pw_512: times 2 dw 512 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) - %xdefine %%base mangle(private_prefix %+ _%1_%2) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) @@ -178,7 +178,7 @@ cextern filter_intra_taps SECTION .text INIT_YMM avx2 -cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h lea r5, [ipred_dc_left_avx2_table] tzcnt wd, wm inc tlq @@ -196,7 +196,7 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h add wq, r5 jmp r6 -cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq @@ -235,7 +235,7 @@ cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 mova m1, m0 jmp wq -cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd @@ -446,7 +446,7 @@ ALIGN function_align jg .s64 RET -cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm @@ -457,7 +457,7 @@ cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 lea stride3q, [strideq*3] jmp wq -cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movu m0, [tlq+ 1] @@ -486,7 +486,7 @@ ALIGN function_align %endmacro INIT_XMM avx2 -cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3 +cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 lea r5, [ipred_h_avx2_table] tzcnt wd, wm movifnidn hd, hm @@ -543,7 +543,7 @@ INIT_YMM avx2 vpblendvb m0, m5, m0, m1 %endmacro -cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h +cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h %define base r5-ipred_paeth_avx2_table lea r5, [ipred_paeth_avx2_table] tzcnt wd, wm @@ -677,7 +677,7 @@ ALIGN function_align packuswb m0, m1 %endmacro -cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights +cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_avx2_table lea r6, [ipred_smooth_v_avx2_table] tzcnt wd, wm @@ -835,7 +835,7 @@ ALIGN function_align ALLOC_STACK %1, %3 %endmacro -cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h +cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h %define base r6-ipred_smooth_h_avx2_table lea r6, [ipred_smooth_h_avx2_table] mov wd, wm @@ -1045,7 +1045,7 @@ ALIGN function_align packuswb m0, m1 %endmacro -cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights +cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_avx2_table lea r6, [ipred_smooth_avx2_table] mov wd, wm @@ -1315,7 +1315,7 @@ ALIGN function_align sub r3, hq ret -cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase +cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z1_avx2_table] tzcnt wd, wm @@ -2144,7 +2144,7 @@ ALIGN function_align .w64_end: RET -cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy +cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_avx2_table] tzcnt wd, wm @@ -3000,7 
+3000,7 @@ ALIGN function_align movu [rsp+97], m0 jmp .w32_filter_above -cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase +cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z3_avx2_table] tzcnt hd, hm @@ -4211,7 +4211,7 @@ ALIGN function_align ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ ; 5 8 8 i -cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter +cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter %define base r6-ipred_filter_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm @@ -4435,7 +4435,7 @@ DECLARE_REG_TMP 7 paddw m%1, m0 %endmacro -cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_left_avx2_table] tzcnt wd, wm inc tlq @@ -4454,7 +4454,7 @@ cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn acq, acmp jmp r6 -cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq @@ -4488,7 +4488,7 @@ cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha vpbroadcastw m0, xm0 jmp wq -cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd @@ -4692,7 +4692,7 @@ ALIGN function_align jg .s32_loop RET -cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm @@ -4702,7 +4702,7 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn acq, acmp jmp wq -cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak +cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm @@ -4883,7 +4883,7 @@ cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak jg .sub_loop RET -cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak +cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm @@ -5076,7 +5076,7 @@ cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak jg .sub_loop RET -cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak +cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm @@ -5306,7 +5306,7 @@ cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak jg .sub_loop RET -cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m4, [palq] lea r2, [pal_pred_avx2_table] tzcnt wd, wm diff --git a/src/x86/ipred_sse.asm b/src/x86/ipred_sse.asm index 06ee256645..9f548aadb1 100644 --- a/src/x86/ipred_sse.asm +++ b/src/x86/ipred_sse.asm @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; @@ -74,7 +74,7 @@ pd_32768 : times 1 dd 32768 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) - %xdefine %%base mangle(private_prefix %+ _%1_%2) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) @@ -156,7 +156,7 @@ SECTION .text %endmacro INIT_XMM ssse3 -cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3 +cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 LEA r5, ipred_h_ssse3_table tzcnt wd, wm movifnidn hd, hm @@ -179,7 +179,7 @@ cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3 ;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movu m0, [tlq+ 1] @@ -196,7 +196,7 @@ cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 ;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd @@ -438,7 +438,7 @@ ALIGN function_align ;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_ssse3_table mov hd, hm ; zero upper half tzcnt r6d, hd @@ -488,7 +488,7 @@ cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 ;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movifnidn hd, hm @@ -505,7 +505,7 @@ cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 ;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_ssse3_table tzcnt wd, wm inc tlq @@ -540,7 +540,7 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h packuswb m6, m0 %endmacro -cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_ssse3_table LEA r6, ipred_smooth_v_ssse3_table tzcnt wd, wm @@ -701,7 +701,7 @@ ALIGN function_align ;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); 
;--------------------------------------------------------------------------------------- -cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h +cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h %define base r6-ipred_smooth_h_ssse3_table LEA r6, ipred_smooth_h_ssse3_table mov wd, wm @@ -958,7 +958,7 @@ ALIGN function_align mova m5, [rsp+16*%12] ; recovery %endmacro -cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights +cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_ssse3_table mov wd, wm mov hd, hm @@ -1194,7 +1194,7 @@ ALIGN function_align ;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, ; const uint8_t *idx, const int w, const int h); ;--------------------------------------------------------------------------------------- -cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h mova m4, [palq] LEA r2, pal_pred_ssse3_table tzcnt wd, wm @@ -1295,7 +1295,7 @@ DECLARE_REG_TMP 7 DECLARE_REG_TMP 5 %endif -cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn wd, wm movifnidn hd, hm tzcnt r6d, hd @@ -1535,7 +1535,7 @@ ALIGN function_align ;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq @@ -1576,7 +1576,7 @@ cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha LEA t0, ipred_cfl_left_ssse3_table tzcnt wd, wm inc tlq @@ -1600,7 +1600,7 @@ cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha tzcnt wd, wm movifnidn hd, hm LEA r6, ipred_cfl_splat_ssse3_table @@ -1615,11 +1615,11 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha %endmacro %if ARCH_X86_64 -cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak DECLARE_REG_TMP 7 movddup m2, [pb_2] %else -cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h DECLARE_REG_TMP 4 %define ac_bakq acmp mov t0d, 0x02020202 @@ -1855,10 +1855,10 @@ DECLARE_REG_TMP 4 RET %if ARCH_X86_64 -cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +cglobal ipred_cfl_ac_422_8bpc, 4, 
8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else -cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 @@ -2128,10 +2128,10 @@ cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h RET %if ARCH_X86_64 -cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak +cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else -cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h +cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h %define ac_bakq [rsp+16*4] mov t0d, 0x04040404 movd m2, t0d @@ -2769,7 +2769,7 @@ cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h BLEND m1, m0, m5 %endmacro -cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h +cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h %define base r5-ipred_paeth_ssse3_table tzcnt wd, wm movifnidn hd, hm @@ -2937,7 +2937,7 @@ ALIGN function_align packuswb m%1, m%1 %endmacro -cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter +cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter %define base r6-$$ LEA r6, $$ tzcnt wd, wm From 0c02334f2cad0861023486f459752eda068229e1 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sun, 20 Jun 2021 18:47:17 +0200 Subject: [PATCH 123/188] x86: Add bpc suffix to loopfilter functions --- src/x86/loopfilter_avx2.asm | 10 +++++----- src/x86/loopfilter_sse.asm | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/x86/loopfilter_avx2.asm b/src/x86/loopfilter_avx2.asm index 8cf20b685b..c8eda1f0ff 100644 --- a/src/x86/loopfilter_avx2.asm +++ b/src/x86/loopfilter_avx2.asm @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; @@ -1457,7 +1457,7 @@ SECTION .text %endmacro INIT_YMM avx2 -cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \ +cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp shl l_strideq, 2 @@ -1495,7 +1495,7 @@ cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \ RET INIT_YMM avx2 -cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \ +cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp shl l_strideq, 2 @@ -1535,7 +1535,7 @@ cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \ RET INIT_YMM avx2 -cglobal lpf_v_sb_uv, 7, 10, 16, \ +cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp shl l_strideq, 2 @@ -1566,7 +1566,7 @@ cglobal lpf_v_sb_uv, 7, 10, 16, \ RET INIT_YMM avx2 -cglobal lpf_h_sb_uv, 7, 10, 16, \ +cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp shl l_strideq, 2 diff --git a/src/x86/loopfilter_sse.asm b/src/x86/loopfilter_sse.asm index cc70051a88..cd0eb54702 100644 --- a/src/x86/loopfilter_sse.asm +++ b/src/x86/loopfilter_sse.asm @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; @@ -1977,11 +1977,11 @@ SECTION .text INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \ +cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits %else -cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \ +cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS w SETUP_PIC @@ -2075,11 +2075,11 @@ cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \ +cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits %else -cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \ +cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS h SETUP_PIC @@ -2179,11 +2179,11 @@ cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \ +cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits %else -cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \ +cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS w SETUP_PIC @@ -2261,11 +2261,11 @@ cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \ +cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits %else -cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \ +cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS h SETUP_PIC From b855c918288b80121f9cc4292587d9f9b747bb75 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sun, 20 Jun 2021 19:52:31 +0200 Subject: [PATCH 124/188] x86: Add bpc suffix to filmgrain functions --- src/x86/film_grain_avx2.asm | 38 +++++++++++++++--------------- src/x86/film_grain_sse.asm | 46 ++++++++++++++++++------------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/x86/film_grain_avx2.asm b/src/x86/film_grain_avx2.asm index bfd7a22085..130c4075dc 100644 --- a/src/x86/film_grain_avx2.asm +++ b/src/x86/film_grain_avx2.asm @@ -1,4 +1,4 @@ -; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019-2021, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. 
; @@ -49,22 +49,22 @@ min: dw 0, 16 pb_27_17_17_27: db 27, 17, 17, 27 pw_1: dw 1 -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro ALIGN 4 -JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -91,7 +91,7 @@ cextern gaussian_sequence SECTION .text INIT_XMM avx2 -cglobal generate_grain_y, 2, 9, 16, buf, fg_data +cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data lea r4, [pb_mask] %define base r4-pb_mask movq xm1, [base+rnd_next_upperbit_mask] @@ -132,8 +132,8 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data ; auto-regression code movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_avx2_table+r2*4] - lea r2, [r2+base+generate_grain_y_avx2_table] + movsxd r2, [base+generate_grain_y_8bpc_avx2_table+r2*4] + lea r2, [r2+base+generate_grain_y_8bpc_avx2_table] jmp r2 .ar1: @@ -420,7 +420,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 -cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv +cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv lea r4, [pb_mask] %define base r4-pb_mask movq xm1, [base+rnd_next_upperbit_mask] @@ -478,8 +478,8 @@ cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_avx2_table] + movsxd r5, [base+generate_grain_uv_%1_8bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_avx2_table] jmp r5 .ar0: @@ -975,7 +975,7 @@ generate_grain_uv_fn 422, 1, 0 generate_grain_uv_fn 444, 0, 0 INIT_YMM avx2 -cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut +cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut pcmpeqw m10, m10 psrld m10, 24 mov r7d, [fg_dataq+FGData.scaling_shift] @@ -1461,7 +1461,7 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver -cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id pcmpeqw m10, m10 psrld m10, 24 diff --git a/src/x86/film_grain_sse.asm b/src/x86/film_grain_sse.asm index 9e47ea0659..8cba258162 100644 --- a/src/x86/film_grain_sse.asm +++ b/src/x86/film_grain_sse.asm @@ -1,4 +1,4 @@ -; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019-2021, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. 
; @@ -48,21 +48,21 @@ pw_1: dw 1 %define pb_27_17_17_27 pb_17_27 - 2 -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro -JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -98,7 +98,7 @@ SECTION .text %endmacro INIT_XMM ssse3 -cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data +cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data LEA r4, $$ %define base r4-$$ movq m1, [base+rnd_next_upperbit_mask] @@ -164,8 +164,8 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data ; auto-regression code movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_ssse3_table+r2*4] - lea r2, [r2+base+generate_grain_y_ssse3_table] + movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] + lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] jmp r2 .ar1: @@ -507,7 +507,7 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 -cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv +cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv movifnidn r2, r2mp movifnidn r3, r3mp LEA r4, $$ @@ -606,8 +606,8 @@ cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_ssse3_table] + movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] jmp r5 .ar0: @@ -1284,7 +1284,7 @@ INIT_XMM ssse3 ; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize -cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ dst, src, scaling, unused1, fg_data, picptr, unused2 ; copy stack arguments to new position post-alignment, so that we ; don't have to keep the old stack location in a separate register @@ -1302,7 +1302,7 @@ cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ mov [rsp+6*mmsize+10*gprsize], r4 mov [rsp+6*mmsize+11*gprsize], r5 %else -cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ dst, src, scaling, unused1, fg_data, picptr, unused2 %endif mov srcq, srcm @@ -1323,7 +1323,7 @@ cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ %define base r5-pb_mask mov r5m, picptrq %else -cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut +cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut lea r7, [pb_mask] %define base r7-pb_mask 
%endif @@ -2079,7 +2079,7 @@ INIT_XMM ssse3 ; sby, luma, lstride, uv_pl, is_id) %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 -cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov r0, r0m mov r1, r2m @@ -2102,7 +2102,7 @@ cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ mov [rsp+8*mmsize+13*gprsize], r2 mov [rsp+8*mmsize+14*gprsize], r4 %else -cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused %endif mov srcq, srcm @@ -2127,7 +2127,7 @@ cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ %define base r5-pb_mask mov r5m, r5 %else -cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id lea r8, [pb_mask] %define base r8-pb_mask From eef97217f1a7f065536bab2ac4816ad3f6eb3f70 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Mon, 21 Jun 2021 19:30:39 +0200 Subject: [PATCH 125/188] x86: itx4: Inline transpose Saves one move. --- src/x86/itx16_sse.asm | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 3bd9ab5546..ee12707c3a 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -108,13 +108,6 @@ SECTION .text %define o(x) r6-$$+x ; PIC %endif -%macro TRANSPOSE4X4_PACKED 3 ; src/dst[1-2], tmp - punpckhwd m%3, m%1, m%2 - punpcklwd m%1, m%2 - punpckhwd m%2, m%1, m%3 ; low: out2 ; high: out3 - punpcklwd m%1, m%3 ; low: out0 ; high: out1 -%endmacro - %macro IWHT4_1D 0 ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 paddd m0, m1 ; in0 += in1 @@ -383,7 +376,11 @@ INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 call .main - TRANSPOSE4X4_PACKED 0, 1, 2 + ; transpose + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 ; m0 = out0 out1 ; m1 = out2 out3 ; m5 = pd_2048 @@ -392,7 +389,7 @@ cglobal iadst_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 ; m0 = in0 in1 ; m1 = in2 in3 %if ARCH_X86_32 - lea r5, [o(itx8_start)] + lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main .end: @@ -521,8 +518,12 @@ cglobal iidentity_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 packssdw m0, m1 - packssdw m1, m2, m3 - TRANSPOSE4X4_PACKED 0, 1, 2 + packssdw m2, m3 + ; transpose + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 ; m0 = out0 out1 ; m1 = out2 out3 ; m5 = pd_2048 From 30708a421634b3a3b1cb54ce43870483c6d9d55d Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 21 Jun 2021 12:59:15 +0200 Subject: [PATCH 126/188] x86: Add high bitdepth dc/h/v ipred SSSE3 asm --- src/x86/ipred16_sse.asm | 511 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 511 insertions(+) create mode 100644 src/x86/ipred16_sse.asm diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm new file mode 100644 index 0000000000..6afd15e647 --- /dev/null +++ b/src/x86/ipred16_sse.asm @@ -0,0 +1,511 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +pb_0_1: times 4 db 0, 1 +pb_2_3: times 4 db 2, 3 +pw_512: times 4 dw 512 +pw_2048: times 4 dw 2048 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) +%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) + +JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ + s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 +JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 + +SECTION .text + +INIT_XMM ssse3 +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_16bpc_ssse3_table + movd m4, wm + tzcnt wd, wm + add tlq, 2 + movifnidn hd, hm + pxor m3, m3 + pavgw m4, m3 + movd m5, wd + movu m0, [tlq] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_16bpc_ssse3_table + mov hd, hm + movd m4, hm + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + pxor m3, m3 + sub tlq, hq + pavgw m4, m3 + movd m5, r6d + movu m0, [tlq] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m2, [tlq+112] + movu m1, [tlq+ 96] + paddw m0, m2 + movu m2, [tlq+ 80] + paddw m1, m2 + movu m2, [tlq+ 64] + paddw m0, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+ 48] + movu m2, [tlq+ 32] + paddw m1, m2 + paddw m0, m1 +.h16: + movu m1, [tlq+ 16] + paddw m0, m1 +.h8: + movhlps m1, m0 + paddw m0, m1 +.h4: + punpcklwd m0, m3 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + lea stride3q, [strideq*3] + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 
+ movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_16bpc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw m4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m1, m0 + punpckhwd m0, m3 + punpcklwd m1, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 + jmp .w4_end +.w4_mul: + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + psrld m0, 2 + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 +.s4: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 32 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + test hd, 8|32 + cmovz r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16c: + mova m1, m0 +.s16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*2+16*0], m0 + mova [dstq+strideq*2+16*1], m1 + mova [dstq+stride3q +16*0], m0 + mova [dstq+stride3q +16*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m0, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 8 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32c: + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s32: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*0+16*2], m2 + mova [dstq+strideq*0+16*3], m3 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s32 + RET +.h64: + mova m0, [tlq-128] + mova m1, [tlq-112] + paddw m0, [tlq- 96] + paddw m1, [tlq- 80] + paddw m0, [tlq- 64] + paddw m1, [tlq- 48] + paddw m0, [tlq- 32] + paddw m1, [tlq- 16] + paddw m0, m1 + jmp wq +.w64: + movu m1, 
[tlq+ 2] + movu m2, [tlq+ 18] + paddw m1, m2 + movu m2, [tlq+ 34] + paddw m0, m2 + movu m2, [tlq+ 50] + paddw m1, m2 + movu m2, [tlq+ 66] + paddw m0, m2 + movu m2, [tlq+ 82] + paddw m1, m2 + movu m2, [tlq+ 98] + paddw m0, m2 + movu m2, [tlq+114] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 64 + je .w64_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w64_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + mov r6d, r8m + LEA r5, ipred_dc_128_16bpc_ssse3_table + tzcnt wd, wm + shr r6d, 11 + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_16bpc_ssse3_table + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+ 18] + movu m2, [tlq+ 34] + movu m3, [tlq+ 50] + cmp wd, 64 + je .w64 + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w64: + WIN64_SPILL_XMM 8 + movu m4, [tlq+ 66] + movu m5, [tlq+ 82] + movu m6, [tlq+ 98] + movu m7, [tlq+114] +.w64_loop: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + mova [dstq+16*4], m4 + mova [dstq+16*5], m5 + mova [dstq+16*6], m6 + mova [dstq+16*7], m7 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 +%define base r5-ipred_h_16bpc_ssse3_table + tzcnt wd, wm + LEA r5, ipred_h_16bpc_ssse3_table + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m2, [base+pb_0_1] + movddup m3, [base+pb_2_3] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + sub tlq, 8 + movq m3, [tlq] + pshuflw m0, m3, q3333 + pshuflw m1, m3, q2222 + pshuflw m2, m3, q1111 + pshuflw m3, m3, q0000 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + sub tlq, 8 + movq m3, [tlq] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16 + RET +.w32: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*0+16*3], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m1 + mova [dstq+strideq*1+16*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + sub tlq, 2 + movd m0, [tlq] + pshufb m0, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], 
m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .w64 + RET From 679618561439d2329f3d7a615f145c25bfd41cf7 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 21 Jun 2021 12:59:22 +0200 Subject: [PATCH 127/188] x86: Add high bitdepth paeth ipred SSSE3 asm --- src/x86/ipred16_avx2.asm | 1 - src/x86/ipred16_sse.asm | 83 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index a1a7f6ba44..c712bca350 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -561,7 +561,6 @@ ALIGN function_align .w8: vbroadcasti128 m2, [tlq+2] movsldup m6, [base+ipred_hv_shuf] - lea r3, [strideq*3] psubw m4, m2, m3 pabsw m5, m4 .w8_loop: diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm index 6afd15e647..6bd66a5248 100644 --- a/src/x86/ipred16_sse.asm +++ b/src/x86/ipred16_sse.asm @@ -509,3 +509,86 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 dec hd jg .w64 RET + +cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left +%define base r5-ipred_paeth_16bpc_ssse3_table + movifnidn hd, hm + pshuflw m4, [tlq], q0000 + mov leftq, tlq + add hd, hd + punpcklqdq m4, m4 ; topleft + sub leftq, hq + and wd, ~7 + jnz .w8 + movddup m5, [tlq+2] ; top + psubw m6, m5, m4 + pabsw m7, m6 +.w4_loop: + movd m1, [leftq+hq-4] + punpcklwd m1, m1 + punpckldq m1, m1 ; left +%macro PAETH 0 + paddw m0, m6, m1 + psubw m2, m4, m0 ; tldiff + psubw m0, m5 ; tdiff + pabsw m2, m2 + pabsw m0, m0 + pminsw m2, m0 + pcmpeqw m0, m2 + pand m3, m5, m0 + pandn m0, m4 + por m0, m3 + pcmpgtw m3, m7, m2 + pand m0, m3 + pandn m3, m1 + por m0, m3 +%endmacro + PAETH + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %define r7d hm + %assign regs_used 7 +%elif WIN64 + movaps r4m, m8 + PUSH r7 + %assign regs_used 8 +%endif +%if ARCH_X86_64 + movddup m8, [pb_0_1] +%endif + lea tlq, [tlq+wq*2+2] + neg wq + mov r7d, hd +.w8_loop0: + movu m5, [tlq+wq*2] + mov r6, dstq + add dstq, 16 + psubw m6, m5, m4 + pabsw m7, m6 +.w8_loop: + movd m1, [leftq+hq-2] +%if ARCH_X86_64 + pshufb m1, m8 +%else + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 +%endif + PAETH + mova [r6], m0 + add r6, strideq + sub hd, 1*2 + jg .w8_loop + mov hd, r7d + add wq, 8 + jl .w8_loop0 +%if WIN64 + movaps m8, r4m +%endif + RET From d86b58455d17b227acd3b7ee4ecf6529831f9b6b Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 21 Jun 2021 12:59:25 +0200 Subject: [PATCH 128/188] x86: Add high bitdepth pal_pred SSSE3 asm --- src/x86/ipred16_sse.asm | 114 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm index 6bd66a5248..eee1f0e21b 100644 --- a/src/x86/ipred16_sse.asm +++ b/src/x86/ipred16_sse.asm @@ -28,6 +28,8 @@ SECTION_RODATA +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + pb_0_1: times 4 db 0, 1 pb_2_3: times 4 db 2, 3 pw_512: times 4 dw 512 @@ -51,6 +53,7 @@ JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 SECTION .text @@ -592,3 +595,114 @@ cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, 
tl, w, h, left movaps m8, r4m %endif RET + +cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h +%define base r2-pal_pred_16bpc_ssse3_table +%if ARCH_X86_32 + %define hd r2d +%endif + mova m3, [palq] + LEA r2, pal_pred_16bpc_ssse3_table + tzcnt wd, wm + pshufb m3, [base+pal_pred_shuf] + movsxd wq, [r2+wq*4] + pshufd m4, m3, q1032 + add wq, r2 + movifnidn hd, hm + jmp wq +.w4: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4 + RET +.w8: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + add idxq, 16*2 + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m0 + add dstq, strideq + dec hd + jg .w32 + RET +.w64: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova m1, [idxq+16*2] + mova [dstq+16*2], m2 + pshufb m2, m3, m1 + mova [dstq+16*3], m0 + pshufb m0, m4, m1 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + mova m0, [idxq+16*3] + add idxq, 16*4 + mova [dstq+16*4], m1 + pshufb m1, m3, m0 + mova [dstq+16*5], m2 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + add dstq, strideq + dec hd + jg .w64 + RET From e454e86449d772b3a41eb3cbddb2c179ec506b21 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 23 Jun 2021 22:31:45 +0200 Subject: [PATCH 129/188] x86: Optimize high bitdepth smooth ipred AVX2 asm --- src/x86/ipred16_avx2.asm | 608 +++++++++++++++++++-------------------- 1 file changed, 289 insertions(+), 319 deletions(-) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index c712bca350..e0a8c6eb4c 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -30,31 +30,35 @@ SECTION_RODATA 32 -%macro SMOOTH_WEIGHT_TABLE 1-* +%macro SMOOTH_WEIGHTS 1-* +const smooth_weights_1d_16bpc ; sm_weights[] << 7 + %rep %0 + dw %1*128 + %rotate 1 + %endrep +const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] %rep %0 dw %1, 256-%1 %rotate 1 %endrep %endmacro -; sm_weights[], but modified to precalculate x and 256-x -smooth_weights: SMOOTH_WEIGHT_TABLE \ - 0, 0, 255, 128, 255, 149, 85, 64, \ - 255, 197, 146, 105, 73, 50, 37, 32, \ - 255, 225, 196, 170, 145, 123, 102, 84, \ - 68, 54, 43, 33, 26, 20, 17, 16, \ - 255, 240, 225, 210, 196, 182, 169, 157, \ - 145, 133, 122, 111, 101, 92, 83, 74, \ - 66, 59, 52, 45, 39, 34, 29, 25, \ - 21, 17, 14, 12, 10, 9, 8, 8, \ - 255, 248, 240, 233, 225, 218, 210, 203, \ - 196, 189, 182, 176, 169, 163, 156, 150, \ - 144, 138, 133, 127, 121, 116, 111, 106, \ - 101, 96, 91, 
86, 82, 77, 73, 69, \ - 65, 61, 57, 54, 50, 47, 44, 41, \ - 38, 35, 32, 29, 27, 25, 22, 20, \ - 18, 16, 15, 13, 12, 10, 9, 8, \ - 7, 6, 6, 5, 5, 4, 4, 4 +SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 @@ -92,8 +96,6 @@ pw_62: times 2 dw 62 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 -pd_128: dd 128 -pd_256: dd 256 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -132,8 +134,15 @@ cextern filter_intra_taps SECTION .text -INIT_YMM avx2 +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro +INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h movifnidn hd, hm add tlq, 2 @@ -646,268 +655,244 @@ ALIGN function_align jg .w64_loop RET -%macro SMOOTH 4 ; src[1-2], mul[1-2] - pmaddwd m0, m%3, m%1 - pmaddwd m1, m%4, m%2 - paddd m0, m2 - paddd m1, m2 - psrld m0, 8 - psrld m1, 8 - packssdw m0, m1 -%endmacro - cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_16bpc_avx2_table - lea r6, [ipred_smooth_v_16bpc_avx2_table] - tzcnt wd, wm - mov hd, hm - movsxd wq, [r6+wq*4] - vpbroadcastd m2, [base+pd_128] - lea weightsq, [base+smooth_weights+hq*8] - neg hq - vpbroadcastw m5, [tlq+hq*2] ; bottom - add wq, r6 - jmp wq + lea r6, [ipred_smooth_v_16bpc_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m5, [tlq+hq*2] ; bottom + add wq, r6 + jmp wq .w4: - vpbroadcastq m3, [tlq+2] - punpcklwd m3, m5 ; top, bottom - movshdup m5, [base+ipred_hv_shuf] - lea r3, [strideq*3] - punpcklqdq m4, m5, m5 - punpckhqdq m5, m5 + vpbroadcastq m4, [tlq+2] ; top + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 ; top - bottom .w4_loop: - vbroadcasti128 m1, [weightsq+hq*4] - pshufb m0, m1, m4 - pshufb m1, m5 - SMOOTH 3, 3, 0, 1 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - add hq, 4 + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m3 + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 + movhps [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movq [dstq+r6 ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 jl .w4_loop .ret: RET -ALIGN function_align .w8: - vbroadcasti128 m4, [tlq+2] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 - movshdup m5, [base+ipred_hv_shuf] + vbroadcasti128 m4, [tlq+2] + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 .w8_loop: - vpbroadcastq m1, [weightsq+hq*4] - pshufb m1, m5 - SMOOTH 3, 4, 1, 1 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - add hq, 2 + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m3 + pshufb 
m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + vextracti128 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], xm0 + vextracti128 [dstq+strideq*2], m1, 1 + mova [dstq+r6 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 jl .w8_loop RET -ALIGN function_align .w16: - movu m4, [tlq+2] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 + movu m4, [tlq+2] + lea r6, [strideq*3] + psubw m4, m5 .w16_loop: - vpbroadcastd m1, [weightsq+hq*4] - vpbroadcastd m5, [weightsq+hq*4+4] - SMOOTH 3, 4, 1, 1 - mova [dstq+strideq*0], m0 - SMOOTH 3, 4, 5, 5 - mova [dstq+strideq*1], m0 - lea dstq, [dstq+strideq*2] - add hq, 2 + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r6 ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 jl .w16_loop RET -ALIGN function_align .w32: - WIN64_SPILL_XMM 8 - movu m4, [tlq+2] - movu m7, [tlq+34] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 - punpcklwd m6, m7, m5 - punpckhwd m7, m5 + WIN64_SPILL_XMM 7 + movu m4, [tlq+ 2] + movu m6, [tlq+34] + psubw m4, m5 + psubw m6, m5 .w32_loop: - vpbroadcastd m5, [weightsq+hq*4] - SMOOTH 3, 4, 5, 5 - mova [dstq+32*0], m0 - SMOOTH 6, 7, 5, 5 - mova [dstq+32*1], m0 - add dstq, strideq - inc hq + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 jl .w32_loop RET -ALIGN function_align .w64: - WIN64_SPILL_XMM 12 - movu m4, [tlq+ 2] - movu m7, [tlq+34] - movu m9, [tlq+66] - movu m11, [tlq+98] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 - punpcklwd m6, m7, m5 - punpckhwd m7, m5 - punpcklwd m8, m9, m5 - punpckhwd m9, m5 - punpcklwd m10, m11, m5 - punpckhwd m11, m5 + WIN64_SPILL_XMM 8 + movu m3, [tlq+ 2] + movu m4, [tlq+34] + movu m6, [tlq+66] + movu m7, [tlq+98] + REPX {psubw x, m5}, m3, m4, m6, m7 .w64_loop: - vpbroadcastd m5, [weightsq+hq*4] - SMOOTH 3, 4, 5, 5 - mova [dstq+32*0], m0 - SMOOTH 6, 7, 5, 5 - mova [dstq+32*1], m0 - SMOOTH 8, 9, 5, 5 - mova [dstq+32*2], m0 - SMOOTH 10, 11, 5, 5 - mova [dstq+32*3], m0 - add dstq, strideq - inc hq + vpbroadcastw m2, [weightsq+hq*2] + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + inc hq jl .w64_loop RET -cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h +cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 %define base r6-ipred_smooth_h_16bpc_avx2_table - lea r6, [ipred_smooth_h_16bpc_avx2_table] - mov wd, wm - mov hd, hm - vpbroadcastw m3, [tlq+wq*2] ; right - tzcnt wd, wd - movsxd wq, [r6+wq*4] - vpbroadcastd m2, [base+pd_128] - add wq, r6 - jmp wq + lea r6, [ipred_smooth_h_16bpc_avx2_table] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m5, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [r6+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + add wq, r6 + jmp wq .w4: - vbroadcasti128 m4, [base+smooth_weights+4*4] - movsldup m5, [base+ipred_hv_shuf] - sub tlq, 8 - sub tlq, hq - 
sub tlq, hq - lea r3, [strideq*3] + vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] + movsldup m3, [base+ipred_hv_shuf] .w4_loop: - vpbroadcastq m1, [tlq+hq*2] - pshufb m1, m5 - punpcklwd m0, m1, m3 ; left, right - punpckhwd m1, m3 - SMOOTH 0, 1, 4, 4 - vextracti128 xm1, m0, 1 + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m3 + psubw m0, m5 ; left - right + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 jg .w4_loop RET -ALIGN function_align .w8: - WIN64_SPILL_XMM 7 - vbroadcasti128 m4, [base+smooth_weights+8*4+16*0] - vbroadcasti128 m5, [base+smooth_weights+8*4+16*1] - movsldup m6, [base+ipred_hv_shuf] - sub tlq, 4 - sub tlq, hq - sub tlq, hq + vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] + movsldup m3, [base+ipred_hv_shuf] .w8_loop: - vpbroadcastd m1, [tlq+hq*2] - pshufb m1, m6 - punpcklwd m0, m1, m3 - punpckhwd m1, m3 - SMOOTH 0, 1, 4, 5 + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m3 + pshufb m1, m3 + psubw m0, m5 + psubw m1, m5 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hq, 2 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 jg .w8_loop RET -ALIGN function_align .w16: - WIN64_SPILL_XMM 6 - mova xm4, [base+smooth_weights+16*4+16*0] - mova xm5, [base+smooth_weights+16*4+16*1] - vinserti128 m4, [base+smooth_weights+16*4+16*2], 1 - vinserti128 m5, [base+smooth_weights+16*4+16*3], 1 - sub tlq, 2 - sub tlq, hq - sub tlq, hq + movu m4, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: - vpbroadcastw m1, [tlq+hq*2] - punpcklwd m0, m1, m3 - punpckhwd m1, m3 - SMOOTH 0, 1, 4, 5 - mova [dstq], m0 - add dstq, strideq - dec hq + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 jg .w16_loop RET -ALIGN function_align .w32: - WIN64_SPILL_XMM 10 - mova xm6, [base+smooth_weights+32*4+16*0] - mova xm7, [base+smooth_weights+32*4+16*1] - vinserti128 m6, [base+smooth_weights+32*4+16*2], 1 - vinserti128 m7, [base+smooth_weights+32*4+16*3], 1 - mova xm8, [base+smooth_weights+32*4+16*4] - mova xm9, [base+smooth_weights+32*4+16*5] - vinserti128 m8, [base+smooth_weights+32*4+16*6], 1 - vinserti128 m9, [base+smooth_weights+32*4+16*7], 1 - sub tlq, 2 - sub tlq, hq - sub tlq, hq + WIN64_SPILL_XMM 7 + movu m4, [base+smooth_weights_1d_16bpc+32*2] + movu m6, [base+smooth_weights_1d_16bpc+32*3] .w32_loop: - vpbroadcastw m5, [tlq+hq*2] - punpcklwd m4, m5, m3 - punpckhwd m5, m3 - SMOOTH 4, 5, 6, 7 - mova [dstq+32*0], m0 - SMOOTH 4, 5, 8, 9 - mova [dstq+32*1], m0 - add dstq, strideq - dec hq + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m5 + psubw m3, m5 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, 
[dstq+strideq*2] + sub hq, 2*2 jg .w32_loop RET -ALIGN function_align .w64: -%assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 14 - mova xm6, [base+smooth_weights+64*4+16* 0] - mova xm7, [base+smooth_weights+64*4+16* 1] - vinserti128 m6, [base+smooth_weights+64*4+16* 2], 1 - vinserti128 m7, [base+smooth_weights+64*4+16* 3], 1 - mova xm8, [base+smooth_weights+64*4+16* 4] - mova xm9, [base+smooth_weights+64*4+16* 5] - vinserti128 m8, [base+smooth_weights+64*4+16* 6], 1 - vinserti128 m9, [base+smooth_weights+64*4+16* 7], 1 - mova xm10, [base+smooth_weights+64*4+16* 8] - mova xm11, [base+smooth_weights+64*4+16* 9] - vinserti128 m10, [base+smooth_weights+64*4+16*10], 1 - vinserti128 m11, [base+smooth_weights+64*4+16*11], 1 - mova xm12, [base+smooth_weights+64*4+16*12] - mova xm13, [base+smooth_weights+64*4+16*13] - vinserti128 m12, [base+smooth_weights+64*4+16*14], 1 - vinserti128 m13, [base+smooth_weights+64*4+16*15], 1 - sub tlq, 2 - sub tlq, hq - sub tlq, hq + WIN64_SPILL_XMM 8 + movu m3, [base+smooth_weights_1d_16bpc+32*4] + movu m4, [base+smooth_weights_1d_16bpc+32*5] + movu m6, [base+smooth_weights_1d_16bpc+32*6] + movu m7, [base+smooth_weights_1d_16bpc+32*7] .w64_loop: - vpbroadcastw m5, [tlq+hq*2] - punpcklwd m4, m5, m3 - punpckhwd m5, m3 - SMOOTH 4, 5, 6, 7 - mova [dstq+32*0], m0 - SMOOTH 4, 5, 8, 9 - mova [dstq+32*1], m0 - SMOOTH 4, 5, 10, 11 - mova [dstq+32*2], m0 - SMOOTH 4, 5, 12, 13 - mova [dstq+32*3], m0 - add dstq, strideq - dec hq + vpbroadcastw m2, [tlq+hq-2] + psubw m2, m5 + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + sub hq, 1*2 jg .w64_loop RET @@ -916,11 +901,10 @@ ALIGN function_align pmaddwd m1, m%2, m%4 paddd m0, m%5 paddd m1, m%6 - paddd m0, m5 - paddd m1, m5 - psrld m0, 9 - psrld m1, 9 + psrld m0, 8 + psrld m1, 8 packssdw m0, m1 + pavgw m0, m5 %endmacro cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights @@ -933,9 +917,9 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights sub tlq, hq sub tlq, hq movsxd wq, [r6+wq*4] - vpbroadcastd m5, [base+pd_256] + pxor m5, m5 add wq, r6 - lea v_weightsq, [base+smooth_weights+hq*4] + lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] jmp wq .w4: WIN64_SPILL_XMM 11 @@ -943,15 +927,14 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights vpbroadcastq m6, [tlq+hq*2+2] movsldup m7, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] - vbroadcasti128 m10, [base+smooth_weights+4*4] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] punpcklwd m6, m0 ; top, bottom punpcklqdq m8, m9, m9 punpckhqdq m9, m9 lea r3, [strideq*3] - sub tlq, 8 .w4_loop: + vpbroadcastq m3, [tlq+hq*2-8] vbroadcasti128 m1, [v_weightsq] - vpbroadcastq m3, [tlq+hq*2] pshufb m3, m7 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 @@ -959,7 +942,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights pmaddwd m3, m10 pshufb m0, m1, m8 pshufb m1, m9 - SMOOTH_2D_END 6, 6, 0, 1, 2, 3 + SMOOTH_2D_END 0, 1, 6, 6, 2, 3 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 @@ -970,7 +953,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights sub hd, 4 jg .w4_loop RET -ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 @@ -978,21 +960,20 @@ ALIGN function_align 
vbroadcasti128 m7, [tlq+hq*2+2] movsldup m8, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] - vbroadcasti128 m10, [base+smooth_weights+8*4+16*0] - vbroadcasti128 m11, [base+smooth_weights+8*4+16*1] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] + vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 - sub tlq, 4 .w8_loop: + vpbroadcastd m3, [tlq+hq*2-4] vpbroadcastq m1, [v_weightsq] - vpbroadcastd m3, [tlq+hq*2] pshufb m3, m8 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 pmaddwd m2, m10 pmaddwd m3, m11 pshufb m1, m9 - SMOOTH_2D_END 6, 7, 1, 1, 2, 3 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] @@ -1000,88 +981,79 @@ ALIGN function_align sub hd, 2 jg .w8_loop RET -ALIGN function_align .w16: %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 14 + WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+2] - mova xm8, [base+smooth_weights+16*4+16*0] - mova xm9, [base+smooth_weights+16*4+16*1] - vinserti128 m8, [base+smooth_weights+16*4+16*2], 1 - vinserti128 m9, [base+smooth_weights+16*4+16*3], 1 + mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] + mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] + vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 + vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 - sub tlq, 2 .w16_loop: - vpbroadcastd m10, [v_weightsq+0] - vpbroadcastd m11, [v_weightsq+4] - vpbroadcastw m3, [tlq+hq*2-0] - vpbroadcastw m13, [tlq+hq*2-2] - punpcklwd m2, m3, m4 ; left, right - punpckhwd m3, m4 - punpcklwd m12, m13, m4 - punpckhwd m13, m4 - pmaddwd m2, m8 + vpbroadcastd m3, [tlq+hq*2-4] + vpbroadcastd m1, [v_weightsq+0] + punpcklwd m3, m4 ; left, right + pshufd m2, m3, q1111 + pmaddwd m10, m8, m2 + pmaddwd m2, m9 + pshufd m3, m3, q0000 + SMOOTH_2D_END 1, 1, 6, 7, 10, 2 + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m2, m8, m3 pmaddwd m3, m9 - pmaddwd m12, m8 - pmaddwd m13, m9 - SMOOTH_2D_END 6, 7, 10, 10, 2, 3 mova [dstq+strideq*0], m0 - SMOOTH_2D_END 6, 7, 11, 11, 12, 13 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add v_weightsq, 8 sub hq, 2 jg .w16_loop RET -ALIGN function_align .w32: %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 15 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+ 2] movu m9, [tlq+hq*2+34] - mova xm10, [base+smooth_weights+32*4+16*0] - mova xm11, [base+smooth_weights+32*4+16*1] - vinserti128 m10, [base+smooth_weights+32*4+16*2], 1 - vinserti128 m11, [base+smooth_weights+32*4+16*3], 1 - mova xm12, [base+smooth_weights+32*4+16*4] - mova xm13, [base+smooth_weights+32*4+16*5] - vinserti128 m12, [base+smooth_weights+32*4+16*6], 1 - vinserti128 m13, [base+smooth_weights+32*4+16*7], 1 + mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 - sub tlq, 2 .w32_loop: - vpbroadcastw m3, [tlq+hq*2] - punpcklwd m2, m3, m4 - punpckhwd m3, 
m4 - pmaddwd m14, m2, m10 - pmaddwd m15, m3, m11 - pmaddwd m2, m12 + vpbroadcastw m3, [tlq+hq*2-2] + vpbroadcastd m14, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m1, m10, m3 + pmaddwd m2, m11, m3 + pmaddwd m0, m6, m14 + paddd m0, m1 + pmaddwd m1, m7, m14 + paddd m1, m2 + pmaddwd m2, m12, m3 pmaddwd m3, m13 - vpbroadcastd m1, [v_weightsq] - pmaddwd m0, m6, m1 - paddd m0, m14 - paddd m0, m5 - psrld m0, 9 - pmaddwd m14, m7, m1 - paddd m14, m15 - paddd m14, m5 - psrld m14, 9 - packssdw m0, m14 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m5 mova [dstq+32*0], m0 - SMOOTH_2D_END 8, 9, 1, 1, 2, 3 + SMOOTH_2D_END 14, 14, 8, 9, 2, 3 mova [dstq+32*1], m0 add dstq, strideq add v_weightsq, 4 dec hd jg .w32_loop RET -ALIGN function_align .w64: %assign stack_offset stack_offset - stack_size_padded PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base @@ -1095,37 +1067,35 @@ ALIGN function_align vpbroadcastw m0, [tl_baseq] ; bottom movu m7, [tlq+xq*2+ 2] movu m9, [tlq+xq*2+34] - mova xm10, [base+smooth_weights+64*4+16*0] - mova xm11, [base+smooth_weights+64*4+16*1] - vinserti128 m10, [base+smooth_weights+64*4+16*2], 1 - vinserti128 m11, [base+smooth_weights+64*4+16*3], 1 - mova xm12, [base+smooth_weights+64*4+16*4] - mova xm13, [base+smooth_weights+64*4+16*5] - vinserti128 m12, [base+smooth_weights+64*4+16*6], 1 - vinserti128 m13, [base+smooth_weights+64*4+16*7], 1 + mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 lea tlq, [tl_baseq-2] .w64_loop_y: - vpbroadcastd m1, [v_weightsq] vpbroadcastw m3, [tlq+yq*2] - punpcklwd m2, m3, m4 - punpckhwd m3, m4 - pmaddwd m14, m2, m10 - pmaddwd m15, m3, m11 - pmaddwd m2, m12 + vpbroadcastd m1, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m14, m10, m3 + pmaddwd m15, m11, m3 + pmaddwd m2, m12, m3 pmaddwd m3, m13 pmaddwd m0, m6, m1 paddd m0, m14 - paddd m0, m5 - psrld m0, 9 pmaddwd m14, m7, m1 paddd m14, m15 - paddd m14, m5 - psrld m14, 9 + psrld m0, 8 + psrld m14, 8 packssdw m0, m14 + pavgw m0, m5 mova [dstq+32*0], m0 SMOOTH_2D_END 8, 9, 1, 1, 2, 3 mova [dstq+32*1], m0 From 0dd8574fbb68dd78c33a001a1764ffc1013bf8c2 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 23 Jun 2021 22:33:17 +0200 Subject: [PATCH 130/188] x86: Add high bitdepth smooth ipred SSSE3 asm --- src/x86/ipred16_avx2.asm | 4 +- src/x86/ipred16_sse.asm | 283 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 285 insertions(+), 2 deletions(-) diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index e0a8c6eb4c..c25768feda 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -26,8 +26,6 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if ARCH_X86_64 - SECTION_RODATA 32 %macro SMOOTH_WEIGHTS 1-* @@ -60,6 +58,8 @@ SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 +%if ARCH_X86_64 + ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, 
-1, -1 diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm index eee1f0e21b..ad35c52074 100644 --- a/src/x86/ipred16_sse.asm +++ b/src/x86/ipred16_sse.asm @@ -55,8 +55,19 @@ JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc + SECTION .text +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + INIT_XMM ssse3 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_16bpc_ssse3_table @@ -596,6 +607,278 @@ cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left %endif RET +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 4 +%endif + +cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov hd, hm + lea weightsq, [weightsq+hq*4] + neg hq + movd m5, [tlq+hq*2] ; bottom + pshuflw m5, m5, q0000 + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [tlq+2] ; top + lea r3, [strideq*3] + psubw m4, m5 ; top - bottom +.w4_loop: + movq m1, [weightsq+hq*2] + punpcklwd m1, m1 + pshufd m0, m1, q1100 + punpckhdq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + mov hm, hq + %define hq hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0, hq + movu m4, [tlq+2] + add tlq, 16 + mov r6, dstq + add dstq, 16 + psubw m4, m5 +.w8_loop: + movq m3, [weightsq+t0*2] + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + add t0, 4 + jl .w8_loop + sub wd, 8 + jg .w8_loop0 + RET + +cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov wd, wm + movifnidn hd, hm + movd m5, [tlq+wq*2] ; right + sub tlq, 8 + add hd, hd + pshuflw m5, m5, q0000 + sub tlq, hq + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [weightsq+4*2] + lea r3, [strideq*3] +.w4_loop: + movq m1, [tlq+hq] ; left + punpcklwd m1, m1 + psubw m1, m5 ; left - right + pshufd m0, m1, q3322 + punpckldq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movhps [dstq+strideq*2], m1 + movq [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w4_loop + RET +.w8: + lea weightsq, [weightsq+wq*4] + neg wq +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + %define hd hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0d, hd + mova m4, [weightsq+wq*2] + mov r6, dstq + add dstq, 16 +.w8_loop: + movq m3, [tlq+t0*(1+ARCH_X86_32)] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + sub t0d, 
4*(1+ARCH_X86_64) + jg .w8_loop + add wq, 8 + jl .w8_loop0 + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 10 +%else +DECLARE_REG_TMP 3 +%endif + +cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ + h_weights, v_weights, top + LEA h_weightsq, smooth_weights_2d_16bpc + mov wd, wm + mov hd, hm + movd m7, [tlq+wq*2] ; right + lea v_weightsq, [h_weightsq+hq*8] + neg hq + movd m6, [tlq+hq*2] ; bottom + pshuflw m7, m7, q0000 + pshuflw m6, m6, q0000 + cmp wd, 4 + jne .w8 + movq m4, [tlq+2] ; top + mova m5, [h_weightsq+4*4] + punpcklwd m4, m6 ; top, bottom + pxor m6, m6 +.w4_loop: + movq m1, [v_weightsq+hq*4] + sub tlq, 4 + movd m3, [tlq] ; left + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pmaddwd m0, m4 + punpcklwd m3, m7 ; left, right + pmaddwd m1, m4 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + lea h_weightsq, [h_weightsq+wq*4] + mov t0, tlq + mov r1m, tlq + mov r2m, hq + %define m8 [h_weightsq+16*0] + %define m9 [h_weightsq+16*1] +%else +%if WIN64 + movaps r4m, m8 + movaps r6m, m9 + PUSH r7 + PUSH r8 +%endif + PUSH r9 + PUSH r10 + %assign regs_used 11 + lea h_weightsq, [h_weightsq+wq*8] + lea topq, [tlq+wq*2] + neg wq + mov r8, tlq + mov r9, hq +%endif + punpcklqdq m6, m6 +.w8_loop0: +%if ARCH_X86_32 + movu m5, [t0+2] + add t0, 16 + mov r0m, t0 +%else + movu m5, [topq+wq*2+2] + mova m8, [h_weightsq+wq*4+16*0] + mova m9, [h_weightsq+wq*4+16*1] +%endif + mov t0, dstq + add dstq, 16 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 +.w8_loop: + movd m1, [v_weightsq+hq*4] + sub tlq, 2 + movd m3, [tlq] ; left + pshufd m1, m1, q0000 + pmaddwd m0, m4, m1 + pshuflw m3, m3, q0000 + pmaddwd m1, m5 + punpcklwd m3, m7 ; left, right + pmaddwd m2, m8, m3 + pmaddwd m3, m9 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pxor m1, m1 + pavgw m0, m1 + mova [t0], m0 + add t0, strideq + inc hq + jl .w8_loop +%if ARCH_X86_32 + mov t0, r0m + mov tlq, r1m + add h_weightsq, 16*2 + mov hq, r2m + sub dword wm, 8 + jg .w8_loop0 +%else + mov tlq, r8 + mov hq, r9 + add wq, 8 + jl .w8_loop0 +%endif +%if WIN64 + movaps m8, r4m + movaps m9, r6m +%endif + RET + cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 From 6d4d002d6471ca4afe0608ece41d2ab2ccc229f4 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Wed, 30 Jun 2021 19:46:12 -0400 Subject: [PATCH 131/188] x86/deblock_hbd_avx2: use vpblendvb instead of pand/pandn/por in flat16/8/6 --- src/x86/loopfilter16_avx2.asm | 77 ++++++++++++----------------------- 1 file changed, 26 insertions(+), 51 deletions(-) diff --git a/src/x86/loopfilter16_avx2.asm b/src/x86/loopfilter16_avx2.asm index 2fbce77353..0c8618655c 100644 --- a/src/x86/loopfilter16_avx2.asm +++ b/src/x86/loopfilter16_avx2.asm @@ -623,9 +623,7 @@ SECTION .text paddw m8, m5 ; p6*7+p3+p1+q0 paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m2 - por m10, m9 + vpblendvb m10, m2, m10, m1 %ifidn %2, v mova [tmpq+strideq*2], m10 ; p5 %else @@ -638,9 +636,7 @@ SECTION .text paddw m8, m6 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m7 - por m10, m9 + vpblendvb m10, m7, m10, m1 %ifidn %2, v mova [tmpq+stride3q], m10 ; p4 %else @@ -653,9 +649,7 @@ SECTION .text psubw m8, m2 paddw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m11 - por m10, m9 + vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*4], m10 ; p3 lea tmpq, [dstq+strideq*4] @@ -669,9 +663,7 @@ SECTION .text paddw m8, m15 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m13 - por m10, m9 + vpblendvb m10, m13, m10, m1 mova [rsp+1*32], m10 ; don't clobber p2/m13 ; sub p6/p3, add p0/q4 @@ -684,9 +676,7 @@ SECTION .text %endif psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m3 - por m10, m9 + vpblendvb m10, m3, m10, m1 mova [rsp+2*32], m10 ; don't clobber p1/m3 ; sub p6/p2, add q0/q5 @@ -699,9 +689,7 @@ SECTION .text %endif psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m4 - por m10, m9 + vpblendvb m10, m4, m10, m1 mova [rsp+3*32], m10 ; don't clobber p0/m4 ; sub p6/p1, add q1/q6 @@ -715,9 +703,7 @@ SECTION .text paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m5 - por m10, m9 + vpblendvb m10, m5, m10, m1 mova [rsp+4*32], m10 ; don't clobber q0/m5 ; sub p5/p0, add q2/q6 @@ -726,9 +712,7 @@ SECTION .text paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m6 - por m2, m10, m9 ; don't clobber q1/m6 + vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6 ; sub p4/q0, add q3/q6 paddw m8, m15 @@ -736,9 +720,7 @@ SECTION .text paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m14 - por m7, m10, m9 ; don't clobber q2/m14 + vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v @@ -750,9 +732,7 @@ SECTION .text paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m15 - por m10, m9 + vpblendvb m10, m15, m10, m1 %ifidn %2, v mova [tmpq+mstrideq], m10 ; q3 %else @@ -769,13 +749,12 @@ SECTION .text paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*0] + mova m9, [tmpq+strideq*0] %else - pandn m9, m1, [rsp+10*32] + mova m9, [rsp+10*32] %endif - por m10, m9 + vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*0], m10 ; q4 %else @@ -790,11 +769,11 @@ SECTION .text psrlw m10, m8, 4 pand m10, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*1] + mova m9, [tmpq+strideq*1] %else - pandn m9, m1, [rsp+11*32] + mova m9, [rsp+11*32] %endif - por m10, m9 + vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else @@ -859,14 +838,12 @@ SECTION .text paddw m2, m0 pmulhrsw m2, [pw_4096] - REPX {pand x, m9}, m7, m8, m10, m11, m1, m2 - REPX {pandn x, m9, x}, m13, m3, m4, m5, m6, m14 - por m13, m7 - por m3, m8 - por m4, m10 - por m5, m11 
- por m6, m1 - por m14, m2 + vpblendvb m13, m13, m7, m9 + vpblendvb m3, m3, m8, m9 + vpblendvb m4, m4, m10, m9 + vpblendvb m5, m5, m11, m9 + vpblendvb m6, m6, m1, m9 + vpblendvb m14, m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 @@ -984,12 +961,10 @@ SECTION .text paddw m8, m14 pmulhrsw m8, [pw_4096] - REPX {pand x, m9}, m2, m10, m11, m8 - REPX {pandn x, m9, x}, m3, m4, m5, m6 - por m3, m2 - por m4, m10 - por m5, m11 - por m6, m8 + vpblendvb m3, m3, m2, m9 + vpblendvb m4, m4, m10, m9 + vpblendvb m5, m5, m11, m9 + vpblendvb m6, m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 From 89fbfce2e2cdd582f29d63a1589f69ea503cdc75 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 1 Jul 2021 08:16:03 -0400 Subject: [PATCH 132/188] x86/deblock_avx2: use vpblendvb instead of pand/pandn/por in flat16/8/6 --- src/x86/loopfilter_avx2.asm | 90 +++++++++++-------------------------- 1 file changed, 25 insertions(+), 65 deletions(-) diff --git a/src/x86/loopfilter_avx2.asm b/src/x86/loopfilter_avx2.asm index c8eda1f0ff..d6b296b19e 100644 --- a/src/x86/loopfilter_avx2.asm +++ b/src/x86/loopfilter_avx2.asm @@ -822,9 +822,7 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m7 - por m8, m9 + vpblendvb m8, m7, m8, m1 %ifidn %2, v mova [tmpq+stride3q], m8 ; p4 %else @@ -850,9 +848,7 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m12 - por m8, m9 + vpblendvb m8, m12, m8, m1 %ifidn %2, v mova [tmpq+strideq*4], m8 ; p3 %else @@ -878,9 +874,7 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m13 - por m8, m9 + vpblendvb m8, m13, m8, m1 mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E @@ -910,9 +904,7 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m3 - por m8, m9 + vpblendvb m8, m3, m8, m1 mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F @@ -940,9 +932,7 @@ SECTION .text pmulhrsw m0, m10, [pw_2048] pmulhrsw m8, m11, [pw_2048] packuswb m0, m8 - pand m0, m1 - pandn m8, m1, m4 - por m0, m8 + vpblendvb m0, m4, m0, m1 mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G @@ -966,9 +956,7 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m5 - por m8, m9 + vpblendvb m8, m5, m8, m1 mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H @@ -985,9 +973,7 @@ SECTION .text pmulhrsw m2, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m2, m9 - pand m2, m1 - pandn m9, m1, m6 - por m2, m9 ; don't clobber q1/m6 since we need it in K + vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I ; write +2 @@ -1003,9 +989,7 @@ SECTION .text pmulhrsw m7, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m7, m9 - pand m7, m1 - pandn m9, m1, m14 - por m7, m9 ; don't clobber q2/m14 since we need it in K + vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 @@ -1021,9 +1005,7 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] 
packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m15 - por m8, m9 + vpblendvb m8, m15, m8, m1 %ifidn %2, v mova [tmpq+mstrideq], m8 ; q3 %else @@ -1044,13 +1026,12 @@ SECTION .text pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*0] + mova m9, [tmpq+strideq*0] %else - pandn m9, m1, [rsp+15*32] + mova m9, [rsp+15*32] %endif - por m8, m9 + vpblendvb m8, m9, m8, m1 %ifidn %2, v mova [tmpq+strideq*0], m8 ; q4 %else @@ -1070,13 +1051,12 @@ SECTION .text pmulhrsw m10, [pw_2048] pmulhrsw m11, [pw_2048] packuswb m10, m11 - pand m10, m1 %ifidn %2, v - pandn m11, m1, [tmpq+strideq*1] + mova m11, [tmpq+strideq*1] %else - pandn m11, m1, [rsp+16*32] + mova m11, [rsp+16*32] %endif - por m10, m11 + vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else @@ -1109,9 +1089,7 @@ SECTION .text psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m13 - por m10, m8, m11 ; p2 + vpblendvb m10, m13, m8, m9 ; p2 %ifidn %2, v mova [tmpq+strideq*1], m10 ; p2 %endif @@ -1129,9 +1107,7 @@ SECTION .text psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m3 - por m8, m11 ; p1 + vpblendvb m8, m3, m8, m9 ; p1 %ifidn %2, v mova [tmpq+strideq*2], m8 ; p1 %else @@ -1151,9 +1127,7 @@ SECTION .text psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m4 - por m8, m11 ; p0 + vpblendvb m8, m4, m8, m9 ; p0 %ifidn %2, v mova [tmpq+stride3q ], m8 ; p0 %else @@ -1175,9 +1149,7 @@ SECTION .text psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m5 - por m11, m8, m11 ; q0 + vpblendvb m11, m5, m8, m9 ; q0 %ifidn %2, v mova [dstq+strideq*0], m11 ; q0 %endif @@ -1195,9 +1167,7 @@ SECTION .text psrlw m8, m2, 3 psrlw m13, m7, 3 packuswb m8, m13 - pand m8, m9 - pandn m13, m9, m6 - por m13, m8, m13 ; q1 + vpblendvb m13, m6, m8, m9 ; q1 %ifidn %2, v mova [dstq+strideq*1], m13 ; q1 %endif @@ -1217,9 +1187,7 @@ SECTION .text psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 - pand m2, m9 - pandn m7, m9, m14 - por m2, m7 ; q2 + vpblendvb m2, m14, m2, m9 ; q2 %ifidn %2, v mova [dstq+strideq*2], m2 ; q2 %else @@ -1380,9 +1348,7 @@ SECTION .text pmulhrsw m2, m0, [pw_4096] pmulhrsw m12, m1, [pw_4096] packuswb m2, m12 - pand m2, m9 - pandn m12, m9, m3 - por m2, m12 + vpblendvb m2, m3, m2, m9 %ifidn %2, v mova [tmpq+strideq*2], m2 ; p1 %endif @@ -1400,9 +1366,7 @@ SECTION .text pmulhrsw m12, m0, [pw_4096] pmulhrsw m13, m1, [pw_4096] packuswb m12, m13 - pand m12, m9 - pandn m13, m9, m4 - por m12, m13 + vpblendvb m12, m4, m12, m9 %ifidn %2, v mova [tmpq+stride3q], m12 ; p0 %endif @@ -1418,9 +1382,7 @@ SECTION .text pmulhrsw m14, m0, [pw_4096] pmulhrsw m13, m1, [pw_4096] packuswb m14, m13 - pand m14, m9 - pandn m13, m9, m5 - por m14, m13 + vpblendvb m14, m5, m14, m9 %ifidn %2, v mova [dstq+strideq*0], m14 ; q0 %endif @@ -1436,9 +1398,7 @@ SECTION .text pmulhrsw m0, [pw_4096] pmulhrsw m1, [pw_4096] packuswb m0, m1 - pand m0, m9 - pandn m9, m6 - por m0, m9 + vpblendvb m0, m6, m0, m9 %ifidn %2, v mova [dstq+strideq*1], m0 ; q1 %else From a96f910a2dbae465478420b7a0af896579483ddd Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Tue, 29 Jun 2021 18:11:22 +0000 Subject: [PATCH 133/188] x86/deblock: make hbd/ssse3 implementations 32bit-compatible --- src/x86/loopfilter16_sse.asm | 2020 ++++++++++++++++++++++------------ 1 file changed, 1343 insertions(+), 677 deletions(-) diff --git a/src/x86/loopfilter16_sse.asm b/src/x86/loopfilter16_sse.asm index 82549b5b35..3ec3fd81fe 100644 --- a/src/x86/loopfilter16_sse.asm +++ b/src/x86/loopfilter16_sse.asm @@ -26,10 +26,15 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if ARCH_X86_64 - SECTION_RODATA 16 +%if ARCH_X86_64 +%define PIC_sym(a) a +%else +%define PIC_base $$ +%define PIC_sym(a) pic_regq+a-PIC_base +%endif + pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 @@ -47,6 +52,60 @@ pb_mask: dd 1, 1, 2, 2 SECTION .text +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 +%define extra_stack 2 +%else +%define extra_stack 0 +%endif +%endif + +%macro RELOC_ARGS 2 ; h/v, off +ASSERT ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + mov r5d, [rstk + stack_offset + 4*4 + 4] +%define lstridem [esp+%2+0*gprsize] + mov lstridem, r5d + mov r5d, [rstk + stack_offset + 4*5 + 4] +%define lutm [esp+%2+1*gprsize] + mov lutm, r5d + mov r5d, [rstk + stack_offset + 4*6 + 4] +%ifidn %1, v +%define wm [esp+%2+2*gprsize] + mov wm, r5d + mov r5d, [rstk + stack_offset + 4*3 + 4] +%define lm [esp+%2+3*gprsize] + mov lm, r5d +%else ; %1 == h +%define hm [esp+%2+2*gprsize] + mov hm, r5d +%endif ; %1==v + mov r5d, r7m +%define bdmulm [esp+%2+4*gprsize] + mov bdmulm, r5d +%else +%define lstridem r4m +%define lutm r5m +%ifidn %1, v +%define wm r6m +%define lm r3m +%else +%define hm r6m +%endif +%define bdmulm r7m +%endif ; STACK_ALIGNMENT +%endmacro + +%macro UNRELOC_ARGS 0 +%if ARCH_X86_32 +%undef lm +%undef lstridem +%undef wm +%undef hm +%undef lutm +%endif +%endmacro + %macro REPX 2-* %xdefine %%f(x) %1 %rep %0 - 1 @@ -86,35 +145,36 @@ SECTION .text %endmacro ; in: out: -; xmm%1 a b c d e f g h a i q y 6 E M U -; xmm%2 i j k l m n o p b j r z 7 F N V -; xmm%3 q r s t u v w x c k s 0 8 G O W -; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X -; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y -; xmm%6 E F G H I J K L f n v 3 B J R Z -; xmm%7 M N O P Q R S T g o w 4 C K S + -; xmm%8 U V W X Y Z + = h p x 5 D L T = +; m%1 a b c d e f g h a i q y 6 E M U +; m%2 i j k l m n o p b j r z 7 F N V +; m%3 q r s t u v w x c k s 0 8 G O W +; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; m%6 E F G H I J K L f n v 3 B J R Z +; m%7 M N O P Q R S T g o w 4 C K S + +; m%8 U V W X Y Z + = h p x 5 D L T = +%if ARCH_X86_64 %macro TRANSPOSE8X8W 9 - ; xmm%1 a b c d e f g h a i q y b j r z - ; xmm%2 i j k l m n o p c k s 0 d l t 1 - ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 - ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + ; m%1 a b c d e f g h a i q y b j r z + ; m%2 i j k l m n o p c k s 0 d l t 1 + ; m%3 q r s t u v w x -> e m u 2 f n v 3 + ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 TRANSPOSE4X4W %1, %2, %3, %4, %9 - ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V - ; xmm%6 E F G H I J K L 8 G O W 9 H P X - ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z - ; xmm%8 U V W X Y Z + = C K S + D L T = + ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; m%6 E F G H I J K L 8 G O W 9 H P X + ; m%7 M N O P Q R S T -> A I Q Y B J R Z + ; m%8 U V W X Y Z + = C K S + D L T = TRANSPOSE4X4W %5, %6, %7, %8, %9 - ; xmm%1 a i q y b j r z a i q y 6 E M U - ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V - ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W - ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X - ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y - ; xmm%6 8 G O W 
9 H P X f n v 3 B J R Z - ; xmm%7 A I Q Y B J R Z g o w 4 C K S + - ; xmm%8 C K S + D L T = h p x 5 D L T = + ; m%1 a i q y b j r z a i q y 6 E M U + ; m%2 c k s 0 d l t 1 b j r z 7 F N V + ; m%3 e m u 2 f n v 3 c k s 0 8 G O W + ; m%4 g o w 4 h p x 5 d l t 1 9 H P X + ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; m%6 8 G O W 9 H P X f n v 3 B J R Z + ; m%7 A I Q Y B J R Z g o w 4 C K S + + ; m%8 C K S + D L T = h p x 5 D L T = punpckhqdq m%9, m%1, m%5 punpcklqdq m%1, m%5 punpckhqdq m%5, m%2, m%6 @@ -126,29 +186,62 @@ SECTION .text SWAP %8, %7, %4, %5, %3, %2, %9 %endmacro +%else ; x86-32 +; input: 1-7 in registers, 8 in first memory [read-only] +; second memory is scratch, and may overlap with first or third memory +; output: 1-5,7-8 in registers, 6 in third memory [write-only] +%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] + TRANSPOSE4X4W %1, %2, %3, %4, %8 +%ifnidn %9, "" + mov%12 m%8, %9 +%else + mova m%8, %10 +%endif + mova %10, m%4 + TRANSPOSE4X4W %5, %6, %7, %8, %4 + punpckhqdq m%4, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + mova m%7, %10 +%ifnidn %11, "" + mov%13 %11, m%6 +%else + mova %10, m%6 +%endif + punpckhqdq m%6, m%7, m%8 + punpcklqdq m%7, m%8 + + ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 + SWAP %2, %4, %5, %3 + SWAP %6, %8 +%endmacro +%endif ; x86-32/64 -; transpose and write m3-6, everything else is scratch -%macro TRANSPOSE_8x4_AND_WRITE_4x8 0 +; transpose and write m8-11, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp ; transpose 8x4 - punpcklwd m0, m3, m4 - punpckhwd m3, m4 - punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpckldq m6, m0, m4 - punpckhdq m0, m4 - punpckldq m4, m3, m5 - punpckhdq m3, m5 + punpcklwd %5, %1, %2 + punpckhwd %1, %2 + punpcklwd %2, %3, %4 + punpckhwd %3, %4 + punpckldq %4, %5, %2 + punpckhdq %5, %2 + punpckldq %2, %1, %3 + punpckhdq %1, %3 ; write out - movq [dstq+strideq*0-4], xm6 - movhps [dstq+strideq*1-4], xm6 - movq [dstq+strideq*2-4], xm0 - movhps [dstq+stride3q -4], xm0 + movq [dstq+strideq*0-4], %4 + movhps [dstq+strideq*1-4], %4 + movq [dstq+strideq*2-4], %5 + movhps [dstq+stride3q -4], %5 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm4 - movhps [dstq+strideq*1-4], xm4 - movq [dstq+strideq*2-4], xm3 - movhps [dstq+stride3q -4], xm3 + movq [dstq+strideq*0-4], %2 + movhps [dstq+strideq*1-4], %2 + movq [dstq+strideq*2-4], %1 + movhps [dstq+stride3q -4], %1 lea dstq, [dstq+strideq*4] %endmacro @@ -156,285 +249,475 @@ SECTION .text ; load data %ifidn %2, v %if %1 == 4 - lea tmpq, [dstq+mstrideq*2] - mova m3, [tmpq+strideq*0] ; p1 - mova m4, [tmpq+strideq*1] ; p0 - mova m5, [tmpq+strideq*2] ; q0 - mova m6, [tmpq+stride3q] ; q1 -%else +%if ARCH_X86_64 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 + mova P1, [dstq+mstrideq*2] ; p1 + mova P0, [dstq+mstrideq*1] ; p0 + mova Q0, [dstq+strideq*0] ; q0 + mova Q1, [dstq+strideq*1] ; q1 +%else ; x86-32 +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%endif ; x86-32/64 +%else ; %1 != 4 ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] +%if ARCH_X86_64 ; we load p3 later - mova m13, [tmpq+strideq*1] - mova m3, [tmpq+strideq*2] - mova m4, [tmpq+stride3q] - mova m5, [dstq+strideq*0] - mova m6, [dstq+strideq*1] - mova m14, [dstq+strideq*2] +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 
m14 + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq+strideq*0] + mova Q1, [dstq+strideq*1] + mova Q2, [dstq+strideq*2] %if %1 != 6 - mova m15, [dstq+stride3q] -%endif -%endif -%else +%define P3 [tmpq+strideq*0] +%define Q3 m15 + mova Q3, [dstq+stride3q] +%endif ; %1 != 6 +%else ; x86-32 +%define P2 [tmpq+strideq*1] +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%define Q2 [dstq+strideq*2] +%if %1 != 6 +%define P3 [dstq+mstrideq*4] +%define Q3 [dstq+stride3q] +%endif ; %1 != 6 +%endif ; x86-32/64 +%endif ; %1 ==/!= 4 +%else ; %2 != v ; load lines %if %1 == 4 - movq xm3, [dstq+strideq*0-4] - movq xm4, [dstq+strideq*1-4] - movq xm5, [dstq+strideq*2-4] - movq xm6, [dstq+stride3q -4] + movq m0, [dstq+strideq*0-4] + movq m2, [dstq+strideq*1-4] + movq m4, [dstq+strideq*2-4] + movq m5, [dstq+stride3q -4] lea tmpq, [dstq+strideq*4] - movq xm11, [tmpq+strideq*0-4] - movq xm13, [tmpq+strideq*1-4] - movq xm14, [tmpq+strideq*2-4] - movq xm15, [tmpq+stride3q -4] + movq m3, [tmpq+strideq*0-4] + movq m6, [tmpq+strideq*1-4] + movq m1, [tmpq+strideq*2-4] + movq m7, [tmpq+stride3q -4] ; transpose 4x8 - ; xm3: A-D0,A-D4 - ; xm4: A-D1,A-D5 - ; xm5: A-D2,A-D6 - ; xm6: A-D3,A-D7 - punpcklwd m7, m3, m4 - punpcklwd m3, m11, m13 - punpcklwd m4, m5, m6 - punpcklwd m5, m14, m15 - ; xm7: A0-1,B0-1,C0-1,D0-1 - ; xm3: A4-5,B4-5,C4-5,D4-5 - ; xm4: A2-3,B2-3,C2-3,D2-3 - ; xm5: A6-7,B6-7,C6-7,D6-7 - punpckldq m6, m7, m4 - punpckhdq m7, m4 - punpckldq m8, m3, m5 - punpckhdq m3, m5 - SWAP 3, 5 - ; xm6: A0-3,B0-3 - ; xm7: C0-3,D0-3 - ; xm8: A4-7,B4-7 - ; xm5: C4-7,D4-7 - punpcklqdq m3, m6, m8 - punpckhqdq m4, m6, m8 - punpckhqdq m6, m7, m5 - punpcklqdq m7, m5 - SWAP 7, 5 - ; xm3: A0-7 - ; xm4: B0-7 - ; xm5: C0-7 - ; xm6: D0-7 + ; m0: A-D0 + ; m2: A-D1 + ; m4: A-D2 + ; m5: A-D3 + ; m3: A-D4 + ; m6: A-D5 + ; m1: A-D6 + ; m7: A-D7 + punpcklwd m0, m2 + punpcklwd m4, m5 + punpcklwd m3, m6 + punpcklwd m1, m7 + ; m0: A0-1,B0-1,C0-1,D0-1 + ; m4: A2-3,B2-3,C2-3,D2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m1: A6-7,B6-7,C6-7,D6-7 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + ; m0: A0-3,B0-3 + ; m2: C0-3,D0-3 + ; m3: A4-7,B4-7 + ; m4: C4-7,D4-7 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + punpckhqdq m3, m2, m4 + punpcklqdq m2, m4 + ; m0: A0-7 + ; m1: B0-7 + ; m2: C0-7 + ; m3: D0-7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%else +%define P1 [esp+3*mmsize] +%define P0 [esp+4*mmsize] +%define Q0 [esp+5*mmsize] +%define Q1 [esp+6*mmsize] + mova P1, m0 + mova P0, m1 + mova Q0, m2 + mova Q1, m3 +%endif %elif %1 == 6 || %1 == 8 - movu xm3, [dstq+strideq*0-8] - movu xm4, [dstq+strideq*1-8] - movu xm5, [dstq+strideq*2-8] - movu xm6, [dstq+stride3q -8] + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] - movu xm11, [tmpq+strideq*0-8] - movu xm13, [tmpq+strideq*1-8] - movu xm14, [tmpq+strideq*2-8] - movu xm15, [tmpq+stride3q -8] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] +%if ARCH_X86_64 + movu m7, [tmpq+stride3q -8] +%endif ; transpose 8x16 - ; xm3: A-H0,A-H8 - ; xm4: A-H1,A-H9 - ; xm5: A-H2,A-H10 - ; xm6: A-H3,A-H11 - ; xm11: A-H4,A-H12 - ; xm13: A-H5,A-H13 - ; xm14: A-H6,A-H14 - ; xm15: A-H7,A-H15 - punpcklwd m7, m3, m4 - punpckhwd m3, m4 - punpcklwd m4, m5, m6 - 
punpckhwd m5, m6 - punpcklwd m6, m11, m13 - punpckhwd m11, m13 - punpcklwd m13, m14, m15 - punpckhwd m14, m15 - ; xm7: A0-1,B0-1,C0-1,D0-1 - ; xm3: E0-1,F0-1,G0-1,H0-1 - ; xm4: A2-3,B2-3,C2-3,D2-3 - ; xm5: E2-3,F2-3,G2-3,H2-3 - ; xm6: A4-5,B4-5,C4-5,D4-5 - ; xm11: E4-5,F4-5,G4-5,H4-5 - ; xm13: A6-7,B6-7,C6-7,D6-7 - ; xm14: E6-7,F6-7,G6-7,H6-7 - punpckldq m15, m7, m4 - punpckhdq m7, m4 - punpckldq m9, m3, m5 - punpckhdq m8, m3, m5 - punpckldq m3, m6, m13 - punpckhdq m6, m13 - punpckldq m10, m11, m14 - punpckhdq m11, m14 - ; xm15: A0-3,B0-3 - ; xm7: C0-3,D0-3 - ; xm9: E0-3,F0-3 - ; xm8: G0-3,H0-3 - ; xm3: A4-7,B4-7 - ; xm6: C4-7,D4-7 - ; xm10: E4-7,F4-7 - ; xm11: G4-7,H4-7 + ; m0: A-H0,A-H8 + ; m1: A-H1,A-H9 + ; m2: A-H2,A-H10 + ; m3: A-H3,A-H11 + ; m4: A-H4,A-H12 + ; m5: A-H5,A-H13 + ; m6: A-H6,A-H14 + ; m7: A-H7,A-H15 +%if ARCH_X86_64 + punpcklwd m8, m0, m1 +%else + punpcklwd m7, m0, m1 +%endif + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 +%if ARCH_X86_64 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 +%else + mova [rsp+3*16], m4 + movu m4, [tmpq+stride3q -8] + punpcklwd m5, m6, m4 + punpckhwd m6, m4 +%endif + ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] + ; m0: E0-1,F0-1,G0-1,H0-1 + ; m1: A2-3,B2-3,C2-3,D2-3 + ; m2: E2-3,F2-3,G2-3,H2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] + ; m5: A6-7,B6-7,C6-7,D6-7 + ; m6: E6-7,F6-7,G6-7,H6-7 +%if ARCH_X86_64 + punpckldq m7, m8, m1 + punpckhdq m8, m1 +%else + punpckldq m4, m7, m1 + punpckhdq m7, m1 +%endif + punpckldq m1, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 +%if ARCH_X86_64 + punpckldq m5, m4, m6 + punpckhdq m4, m6 +%else + mova [rsp+4*16], m3 + mova m3, [rsp+3*16] + punpckldq m5, m3, m6 + punpckhdq m3, m6 +%endif + ; m7: A0-3,B0-3 [m4 on x86-32] + ; m8: C0-3,D0-3 [m7 on x86-32] + ; m1: E0-3,F0-3 + ; m0: G0-3,H0-3 + ; m2: A4-7,B4-7 + ; m3: C4-7,D4-7 [r4 on x86-32] + ; m5: E4-7,F4-7 + ; m4: G4-7,H4-7 [m3 on x86-32] +%if ARCH_X86_64 %if %1 != 6 - punpcklqdq m0, m15, m3 -%endif - punpckhqdq m13, m15, m3 - punpcklqdq m3, m7, m6 - punpckhqdq m4, m7, m6 - punpcklqdq m5, m9, m10 - punpckhqdq m6, m9, m10 - punpcklqdq m14, m8, m11 + punpcklqdq m6, m7, m2 +%endif + punpckhqdq m7, m2 + punpcklqdq m2, m8, m3 + punpckhqdq m8, m3 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 %if %1 != 6 - punpckhqdq m15, m8, m11 - mova [rsp+5*32], m0 + punpckhqdq m5, m0, m4 +%endif + punpcklqdq m0, m4 +%if %1 == 8 + mova [rsp+1*16], m6 +%define P3 [rsp+1*16] %endif + ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 + SWAP 7, 13 + SWAP 8, 2, 9 + SWAP 3, 10 + SWAP 1, 11 + SWAP 0, 14 + SWAP 5, 15 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%if %1 == 8 +%define Q3 m15 +%endif +%else ; x86-32 +%if %1 == 8 +%define P3 [rsp+ 6*16] + punpcklqdq m6, m4, m2 + mova P3, m6 +%endif + mova m6, [rsp+4*16] + punpckhqdq m4, m2 + punpcklqdq m2, m7, m6 + punpckhqdq m7, m6 + punpcklqdq m6, m1, m5 + punpckhqdq m1, m5 +%if %1 == 8 +%define Q3 [rsp+24*16] + punpckhqdq m5, m0, m3 + mova Q3, m5 +%endif + punpcklqdq m0, m3 +%if %1 == 8 +%define P2 [rsp+18*16] +%define P1 [rsp+19*16] +%define P0 [rsp+20*16] +%define Q0 [rsp+21*16] +%define Q1 [rsp+22*16] +%define Q2 [rsp+23*16] %else +%define P2 [rsp+3*16] +%define P1 [rsp+4*16] +%define P0 [rsp+5*16] +%define Q0 [rsp+6*16] +%define Q1 [rsp+7*16] +%define Q2 [rsp+8*16] +%endif + mova P2, m4 + mova P1, m2 + mova P0, m7 + mova Q0, m6 + mova Q1, m1 + mova Q2, m0 +%endif ; x86-32/64 +%else ; %1 == 16 ; We only use 
14 pixels but we'll need the remainder at the end for ; the second transpose - mova xm0, [dstq+strideq*0-16] - mova xm1, [dstq+strideq*1-16] - mova xm2, [dstq+strideq*2-16] - mova xm3, [dstq+stride3q -16] + mova m0, [dstq+strideq*0-16] + mova m1, [dstq+strideq*1-16] + mova m2, [dstq+strideq*2-16] + mova m3, [dstq+stride3q -16] lea tmpq, [dstq+strideq*4] - mova xm4, [tmpq+strideq*0-16] - mova xm5, [tmpq+strideq*1-16] - mova xm6, [tmpq+strideq*2-16] - mova xm7, [tmpq+stride3q -16] + mova m4, [tmpq+strideq*0-16] + mova m5, [tmpq+strideq*1-16] + mova m6, [tmpq+strideq*2-16] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q -16] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 - - mova [rsp+6*32], m0 - mova [rsp+7*32], m1 - mova [rsp+8*32], m2 - mova [rsp+9*32], m3 - mova [rsp+5*32], m4 - - mova xm0, [dstq+strideq*0] - mova xm1, [dstq+strideq*1] - mova xm2, [dstq+strideq*2] - mova xm3, [dstq+stride3q ] + SWAP 5, 13 + SWAP 6, 8 + SWAP 7, 9 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%else ; x86-32 +%define P2 [esp+18*16] +%define P1 [esp+19*16] +%define P0 [esp+20*16] + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q -16], P2, "", a, a + mova P1, m6 + mova P0, m7 +%endif ; x86-32/64 + mova [rsp+ 7*16], m0 + mova [rsp+ 8*16], m1 + mova [rsp+ 9*16], m2 + mova [rsp+10*16], m3 +%define P3 [rsp+6*16] + mova P3, m4 + + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + mova m2, [dstq+strideq*2] + mova m3, [dstq+stride3q ] lea tmpq, [dstq+strideq*4] - mova xm8, [tmpq+strideq*0] - mova xm9, [tmpq+strideq*1] - mova xm10, [tmpq+strideq*2] - mova xm11, [tmpq+stride3q ] - - TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 - - mova [rsp+10*32], m8 - mova [rsp+11*32], m9 - mova [rsp+12*32], m10 - mova [rsp+13*32], m11 - - ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 - SWAP 13, 5, 0 - SWAP 3, 6, 1, 15 - SWAP 4, 7 + mova m4, [tmpq+strideq*0] + mova m5, [tmpq+strideq*1] + mova m6, [tmpq+strideq*2] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q ] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 + SWAP 0, 10 + SWAP 1, 11 SWAP 2, 14 + SWAP 3, 15 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%define Q3 m15 +%else ; x86-32 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q ], [rsp+12*16], "", a, a +%define Q0 [esp+21*16] +%define Q1 [esp+22*16] +%define Q2 [esp+23*16] +%define Q3 [esp+24*16] + mova Q0, m0 + mova Q1, m1 + mova Q2, m2 + mova Q3, m3 +%endif ; x86-32/64 + + mova [rsp+11*16], m4 +%if ARCH_X86_64 + mova [rsp+12*16], m5 %endif -%endif + mova [rsp+13*16], m6 + mova [rsp+14*16], m7 +%endif ; %1 == 4/6/8/16 +%endif ; %2 ==/!= v ; load L/E/I/H +%if ARCH_X86_32 +%define l_strideq r5 + mov l_strideq, dword lstridem +%ifidn %2, v +%define lq r3 + mov lq, dword lm +%endif +%endif %ifidn %2, v %if cpuflag(sse4) pmovzxbw m1, [lq] pmovzxbw m0, [lq+l_strideq] pxor m2, m2 -%else +%else ; ssse3 movq m1, [lq] movq m0, [lq+l_strideq] pxor m2, m2 REPX {punpcklbw x, m2}, m1, m0 -%endif -%else +%endif ; ssse3/sse4 +%else ; %2 != v movq m0, [lq] ; l0, l1 movq m1, [lq+l_strideq] ; l2, l3 punpckldq m0, m1 ; l0, l2, l1, l3 pxor m2, m2 punpcklbw m1, m0, m2 ; l0, l2 punpckhbw m0, m2 ; l1, l3 +%endif ; %2==/!=v +%if ARCH_X86_32 +%ifidn %2, v +%undef lq + mov mstrideq, mstridem +%endif %endif - pcmpeqw m10, m2, m0 - pand m1, m10 + pcmpeqw m5, m2, m0 + pand m1, m5 por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] - pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] - pcmpeqw m10, m2, m0 ; !L - psrlw m10, 1 + pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] + pcmpeqw m5, m2, m0 ; !L + psrlw m5, 1 +%if ARCH_X86_64 psrlw m2, m0, [lutq+128] SPLATW m1, [lutq+136] +%else ; x86-32 + mov r5, lutm + psrlw m2, m0, [r5+128] + SPLATW m1, [r5+136] +%endif ; x86-32/64 pminsw m2, m1 - pmaxsw m2, [pw_1] ; I + pmaxsw m2, [PIC_sym(pw_1)] ; I psrlw m1, m0, 4 ; H - paddw m0, [pw_2] + paddw m0, [PIC_sym(pw_2)] paddw m0, m0 paddw m0, m2 ; E - REPX {pmullw x, [r11]}, m0, m1, m2 - - psubw m8, m3, m4 ; p1-p0 - psubw m9, m5, m6 ; q1-q0 - REPX {pabsw x, x}, m8, m9 - pmaxsw m8, m10 - pmaxsw m8, m9 - pcmpgtw m7, m8, m1 ; hev + REPX {pmullw x, [bdmulq]}, m0, m1, m2 +%if ARCH_X86_32 +%undef l_strideq + lea stride3q, [strideq*3] +%endif + + psubw m3, P1, P0 ; p1-p0 + psubw m4, Q0, Q1 ; q0-q1 + REPX {pabsw x, x}, m3, m4 + pmaxsw m3, m5 + pmaxsw m3, m4 + pcmpgtw m7, m3, m1 ; hev %if %1 != 4 - psubw m9, m13, m4 ; p2-p0 - pabsw m9, m9 - pmaxsw m9, m8 + psubw m4, P2, P0 ; p2-p0 + pabsw m4, m4 + pmaxsw m4, m3 %if %1 != 6 -%ifidn %2, v - mova m11, [tmpq+strideq*0] ; p3 -%else - mova m11, [rsp+5*32] ; p3 -%endif - psubw m10, m11, m4 ; p3-p0 - pabsw m10, m10 - pmaxsw m9, m10 -%endif - psubw m10, m5, m14 ; q2-q0 - pabsw m10, m10 - pmaxsw m9, m10 + mova m6, P3 ; p3 + psubw m5, m6, P0 ; p3-p0 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + psubw m5, Q0, Q2 ; q0-q2 + pabsw m5, m5 + pmaxsw m4, m5 %if %1 != 6 - psubw m10, m5, m15 ; q3-q0 - pabsw m10, m10 - pmaxsw m9, m10 -%endif - pcmpgtw m9, [r11] ; !flat8in - - psubw m10, m13, m3 ; p2-p1 - pabsw m10, m10 + psubw m5, Q0, Q3 ; q0-q3 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + pcmpgtw m4, [bdmulq] ; !flat8in + + psubw m5, P2, P1 ; p2-p1 + pabsw m5, m5 %if %1 != 6 - psubw m11, m13 ; p3-p2 - pabsw m11, m11 - pmaxsw m10, m11 - psubw m11, m14, m15 ; q3-q2 - pabsw m11, m11 - pmaxsw m10, m11 -%endif - psubw m11, m14, m6 ; q2-q1 - pabsw m11, m11 - pmaxsw m10, m11 + psubw m6, P2 ; p3-p2 + pabsw m6, m6 + pmaxsw m5, m6 + psubw m6, Q2, Q3 ; q2-q3 + pabsw m6, m6 + pmaxsw m5, m6 +%endif ; %1 != 6 + psubw m6, Q2, Q1 ; q2-q1 + pabsw m6, m6 + pmaxsw m5, m6 %if %1 == 16 - SPLATD m11, [maskq+8] + SPLATD m6, [maskq+8] SPLATD m1, [maskq+4] - por m11, m1 - pand m11, m12 - pcmpeqd m11, m12 - pand m10, m11 -%else - SPLATD m11, [maskq+4] - pand m11, m12 - pcmpeqd m11, m12 - pand m10, m11 ; only apply fm-wide to wd>4 blocks -%endif - pmaxsw m8, m10 -%endif - pcmpgtw m8, m2 - - psubw m10, m3, m6 ; p1-q1 - psubw m11, m4, m5 ; p0-q0 - REPX {pabsw x, x}, m10, m11 - paddw m11, m11 - psrlw m10, 1 - paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) - pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E - por m8, m10 + por m6, m1 + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 +%else ; %1 != 16 + SPLATD m6, [maskq+4] + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 ; only apply fm-wide to wd>4 blocks +%endif ; %1==/!=16 + pmaxsw m3, m5 +%endif ; %1 != 4 + pcmpgtw m3, m2 + + psubw m5, P1, Q1 ; p1-q1 + psubw m6, P0, Q0 ; p0-q0 + REPX {pabsw x, x}, m5, m6 + paddw m6, m6 + psrlw m5, 1 + paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m3, m5 %if %1 == 16 @@ -443,12 +726,12 @@ SECTION .text mova m0, [tmpq+strideq*1] mova m1, [tmpq+strideq*2] mova m2, [tmpq+stride3q] -%else - mova m0, [rsp+7*32] - mova m1, [rsp+8*32] - mova m2, [rsp+9*32] -%endif - REPX {psubw x, m4}, m0, m1, m2 +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m1, [rsp+ 9*16] + mova m2, [rsp+10*16] 
+%endif ; %2==/!=v + REPX {psubw x, P0}, m0, m1, m2 REPX {pabsw x, x}, m0, m1, m2 pmaxsw m1, m0 pmaxsw m1, m2 @@ -456,106 +739,129 @@ SECTION .text lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] mova m2, [tmpq+strideq*1] - mova m10, [tmpq+strideq*2] -%else - mova m0, [rsp+10*32] - mova m2, [rsp+11*32] - mova m10, [rsp+12*32] -%endif - REPX {psubw x, m5}, m0, m2, m10 - REPX {pabsw x, x}, m0, m2, m10 + mova m5, [tmpq+strideq*2] +%else ; %2 != v + mova m0, [rsp+11*16] + mova m2, [rsp+12*16] + mova m5, [rsp+13*16] +%endif ; %2==/!=v + REPX {psubw x, Q0}, m0, m2, m5 + REPX {pabsw x, x}, m0, m2, m5 pmaxsw m0, m2 - pmaxsw m1, m10 + pmaxsw m1, m5 pmaxsw m1, m0 - pcmpgtw m1, [r11] ; !flat8out - por m1, m9 ; !flat8in | !flat8out + pcmpgtw m1, [bdmulq] ; !flat8out + por m1, m4 ; !flat8in | !flat8out SPLATD m2, [maskq+8] - pand m10, m2, m12 - pcmpeqd m10, m12 - pandn m1, m10 ; flat16 - pandn m10, m8, m1 ; flat16 & fm - SWAP 1, 10 - - SPLATD m10, [maskq+4] - por m10, m2 - pand m2, m10, m12 + pand m5, m2, m12 + pcmpeqd m5, m12 + pandn m1, m5 ; flat16 + pandn m5, m3, m1 ; flat16 & fm + SWAP 1, 5 + + SPLATD m5, [maskq+4] + por m5, m2 + pand m2, m5, m12 pcmpeqd m2, m12 - pandn m9, m2 ; flat8in - pandn m2, m8, m9 - SWAP 2, 9 + pandn m4, m2 ; flat8in + pandn m2, m3, m4 + SWAP 2, 4 SPLATD m2, [maskq+0] - por m2, m10 + por m2, m5 pand m2, m12 pcmpeqd m2, m12 - pandn m8, m2 - pandn m0, m9, m8 ; fm & !flat8 & !flat16 - SWAP 0, 8 - pandn m0, m1, m9 ; flat8 & !flat16 - SWAP 0, 9 + pandn m3, m2 + pandn m0, m4, m3 ; fm & !flat8 & !flat16 + SWAP 0, 3 + pandn m0, m1, m4 ; flat8 & !flat16 + SWAP 0, 4 %elif %1 != 4 SPLATD m0, [maskq+4] pand m2, m0, m12 pcmpeqd m2, m12 - pandn m9, m2 - pandn m2, m8, m9 ; flat8 & fm - SWAP 2, 9 + pandn m4, m2 + pandn m2, m3, m4 ; flat8 & fm + SWAP 2, 4 SPLATD m2, [maskq+0] por m0, m2 pand m0, m12 pcmpeqd m0, m12 - pandn m8, m0 - pandn m0, m9, m8 ; fm & !flat8 - SWAP 0, 8 -%else + pandn m3, m0 + pandn m0, m4, m3 ; fm & !flat8 + SWAP 0, 3 +%else ; %1 == 4 SPLATD m0, [maskq+0] pand m0, m12 pcmpeqd m0, m12 - pandn m8, m0 ; fm -%endif + pandn m3, m0 ; fm +%endif ; %1==/!=4 ; short filter - +%if ARCH_X86_64 SPLATW m0, r7m +%else + SPLATW m0, bdmulm +%endif pcmpeqw m2, m2 psrlw m0, 1 ; 511 or 2047 pxor m2, m0 ; -512 or -2048 - psubw m10, m5, m4 - paddw m11, m10, m10 - paddw m11, m10 - psubw m10, m3, m6 ; iclip_diff(p1-q1) - pminsw m10, m0 - pmaxsw m10, m2 - pand m10, m7 ; f=iclip_diff(p1-q1)&hev - paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f) - pminsw m10, m0 - pmaxsw m10, m2 - pand m8, m10 ; f&=fm - paddw m10, m8, [pw_3] - paddw m8, [pw_4] - REPX {pminsw x, m0}, m10, m8 - psraw m10, 3 ; f2 - psraw m8, 3 ; f1 - paddw m4, m10 - psubw m5, m8 - - paddw m8, [pw_1] - psraw m8, 1 ; f=(f1+1)>>1 - pandn m7, m8 ; f&=!hev - SWAP 7, 8 - paddw m3, m8 - psubw m6, m8 - pxor m8, m8 + psubw m5, Q0, P0 ; q0-p0 + paddw m6, m5, m5 + paddw m6, m5 ; 3*(q0-p0) + psubw m5, P1, Q1 ; iclip_diff(p1-q1) + pminsw m5, m0 + pmaxsw m5, m2 + pand m5, m7 ; f=iclip_diff(p1-q1)&hev + paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m5, m0 + pmaxsw m5, m2 + pand m3, m5 ; f&=fm + paddw m5, m3, [PIC_sym(pw_3)] + paddw m3, [PIC_sym(pw_4)] + REPX {pminsw x, m0}, m5, m3 + psraw m5, 3 ; f2 + psraw m3, 3 ; f1 psubw m0, m2 ; 1023 or 4095 - REPX {pminsw x, m0}, m3, m4, m5, m6 - REPX {pmaxsw x, m8}, m3, m4, m5, m6 + pxor m2, m2 +%if ARCH_X86_64 + paddw P0, m5 + psubw Q0, m3 +%else + paddw m5, P0 + psubw m6, Q0, m3 + REPX {pminsw x, m0}, m5, m6 + REPX {pmaxsw x, m2}, m5, m6 +%endif + + paddw m3, [PIC_sym(pw_1)] + psraw m3, 1 ; 
f=(f1+1)>>1 + pandn m7, m3 ; f&=!hev + SWAP 7, 3 +%if ARCH_X86_64 + paddw P1, m3 + psubw Q1, m3 + REPX {pminsw x, m0}, P1, P0, Q0, Q1 + REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 +%else + psubw m7, Q1, m3 + paddw m3, P1 + REPX {pminsw x, m0}, m7, m3 + REPX {pmaxsw x, m2}, m7, m3 +%if %1 > 4 + mova P1, m3 + mova P0, m5 + mova Q0, m6 + mova Q1, m7 +%endif +%endif %if %1 == 16 -; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 +; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 ; m12=filter bits mask ; m13-15=p2/q2/q3 -; m0,2,7-8,10-11 = free +; m0,2-3,5-7 = free ; flat16 filter %ifidn %2, v @@ -563,479 +869,722 @@ SECTION .text mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 - mova m11, [tmpq+strideq*4] ; p3 -%else - mova m0, [rsp+7*32] - mova m2, [rsp+8*32] - mova m7, [rsp+9*32] - mova m11, [rsp+5*32] -%endif + mova m6, [tmpq+strideq*4] ; p3 + lea tmpq, [dstq+mstrideq*4] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m2, [rsp+ 9*16] + mova m7, [rsp+10*16] + mova m6, [rsp+ 6*16] +%endif ; %2==/!=v - mova [rsp+ 0*32], m9 + mova [rsp+ 0*16], m4 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - psllw m8, m0, 3 ; p6*8 - paddw m8, [pw_8] - paddw m10, m2, m7 ; p5+p4 - psubw m8, m0 - paddw m10, m10 ; (p5+p4)*2 - paddw m8, m11 ; p6*7+p3 - paddw m10, m13 ; (p5+p4)*2+p2 - paddw m8, m3 ; p6*7+p3+p1 - paddw m10, m4 ; (p5+p4)*2+p2+p0 - paddw m8, m5 ; p6*7+p3+p1+q0 - paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m2 - por m10, m9 + psllw m3, m0, 3 ; p6*8 + paddw m3, [PIC_sym(pw_8)] + paddw m5, m2, m7 ; p5+p4 + psubw m3, m0 + paddw m5, m5 ; (p5+p4)*2 + paddw m3, m6 ; p6*7+p3 + paddw m5, P2 ; (p5+p4)*2+p2 + paddw m3, P1 ; p6*7+p3+p1 + paddw m5, P0 ; (p5+p4)*2+p2+p0 + paddw m3, Q0 ; p6*7+p3+p1+q0 + paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m2 + por m5, m4 %ifidn %2, v - mova [tmpq+strideq*2], m10 ; p5 -%else - mova [rsp+8*32], m10 -%endif + mova [tmpq+mstrideq*2], m5 ; p5 +%else ; %2 != v + mova [rsp+9*16], m5 +%endif ; %2==/!=v ; sub p6*2, add p3/q1 - paddw m8, m11 - paddw m10, m0, m0 - paddw m8, m6 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m7 - por m10, m9 + paddw m3, m6 + paddw m5, m0, m0 + paddw m3, Q1 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m7 + por m5, m4 %ifidn %2, v - mova [tmpq+stride3q], m10 ; p4 -%else - mova [rsp+9*32], m10 -%endif + mova [tmpq+mstrideq*1], m5 ; p4 +%else ; %2 != v + mova [rsp+10*16], m5 +%endif ; %2==/!=v ; sub p6/p5, add p2/q2 - psubw m8, m0 - paddw m10, m13, m14 - psubw m8, m2 - paddw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m11 - por m10, m9 + psubw m3, m0 + paddw m5, P2, Q2 + psubw m3, m2 + paddw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m6 + por m5, m4 %ifidn %2, v - mova [tmpq+strideq*4], m10 ; p3 - lea tmpq, [dstq+strideq*4] -%else - mova [rsp+5*32], m10 + mova [tmpq+strideq*0], m5 ; p3 +%else ; %2 != v + mova [rsp+6*16], m5 +%endif ; %2==/!=v + +%define WRITE_IN_PLACE 0 +%ifidn %2, v +%if ARCH_X86_64 +%define WRITE_IN_PLACE 1 +%endif %endif ; sub p6/p4, add p1/q3 - paddw m8, m3 - paddw m10, m0, m7 - paddw m8, m15 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m13 - por m10, m9 - mova [rsp+1*32], m10 ; don't clobber p2/m13 + paddw m3, P1 + paddw m5, m0, m7 + paddw m3, Q3 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P2 + por m5, m4 +%if WRITE_IN_PLACE + mova [tmpq+strideq*1], m5 +%else + mova [rsp+1*16], m5 ; don't clobber p2/m13 +%endif ; sub p6/p3, add 
p0/q4 - paddw m8, m4 - paddw m10, m0, m11 + paddw m3, P0 + paddw m5, m0, m6 %ifidn %2, v - paddw m8, [tmpq+strideq*0] + paddw m3, [dstq+strideq*4] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P1 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*2], m5 %else - paddw m8, [rsp+10*32] + mova [rsp+2*16], m5 ; don't clobber p1/m3 %endif - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m3 - por m10, m9 - mova [rsp+2*32], m10 ; don't clobber p1/m3 ; sub p6/p2, add q0/q5 - paddw m8, m5 - paddw m10, m0, m13 + paddw m3, Q0 + paddw m5, m0, P2 %ifidn %2, v - paddw m8, [tmpq+strideq*1] +%if ARCH_X86_32 + lea r4, P2 +%endif + lea tmpq, [dstq+strideq*4] + paddw m3, [tmpq+strideq*1] +%else ; %2 != v + paddw m3, [rsp+12*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*1], m5 %else - paddw m8, [rsp+11*32] + mova [rsp+3*16], m5 ; don't clobber p0/m4 %endif - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m4 - por m10, m9 - mova [rsp+3*32], m10 ; don't clobber p0/m4 ; sub p6/p1, add q1/q6 - paddw m8, m6 - paddw m10, m0, m3 + paddw m3, Q1 + paddw m5, m0, P1 %ifidn %2, v mova m0, [tmpq+strideq*2] ; q6 +%else ; %2 != v + mova m0, [rsp+13*16] ; q6 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq], m5 %else - mova m0, [rsp+12*32] ; q6 + mova [rsp+4*16], m5 ; don't clobber q0/m5 %endif - paddw m8, m0 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m5 - por m10, m9 - mova [rsp+4*32], m10 ; don't clobber q0/m5 ; sub p5/p0, add q2/q6 - paddw m8, m14 - paddw m10, m2, m4 - paddw m8, m0 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m6 - por m2, m10, m9 ; don't clobber q1/m6 + paddw m3, Q2 + paddw m5, m2, P0 + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q1 + por m2, m5, m4 ; don't clobber q1/m6 ; sub p4/q0, add q3/q6 - paddw m8, m15 - paddw m10, m7, m5 - paddw m8, m0 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m14 - por m7, m10, m9 ; don't clobber q2/m14 + paddw m3, Q3 + paddw m7, Q0 + paddw m3, m0 + psubw m3, m7 + psrlw m7, m3, 4 + pand m7, m1 + pandn m4, m1, Q2 + por m7, m4 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v - paddw m8, [tmpq+strideq*0] -%else - paddw m8, [rsp+10*32] -%endif - paddw m10, m11, m6 - paddw m8, m0 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m15 - por m10, m9 -%ifidn %2, v - mova [tmpq+mstrideq], m10 ; q3 -%else - mova [rsp+14*32], m10 -%endif + paddw m3, [tmpq+strideq*0] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + paddw m6, Q1 + paddw m3, m0 + psubw m3, m6 + psrlw m6, m3, 4 + pand m6, m1 + pandn m4, m1, Q3 + por m6, m4 +%if WRITE_IN_PLACE + mova [tmpq+mstrideq], m6 ; q3 +%else ; %2 != v + mova [rsp+5*16], m6 +%endif ; %2==/!=v ; sub p2/q2, add q5/q6 %ifidn %2, v - paddw m8, [tmpq+strideq*1] + paddw m3, [tmpq+strideq*1] +%if ARCH_X86_64 + paddw m5, P2, Q2 %else - paddw m8, [rsp+11*32] + ; because tmpq is clobbered, so we use a backup pointer for P2 instead + paddw m5, [r4], Q2 + mov pic_regq, pic_regm %endif - paddw m10, m13, m14 - paddw m8, m0 - psubw m8, m10 - psrlw m10, m8, 4 - pand m10, m1 +%else ; %2 != v + paddw m3, [rsp+12*16] + paddw m5, P2, Q2 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 %ifidn %2, v - pandn 
m9, m1, [tmpq+strideq*0] -%else - pandn m9, m1, [rsp+10*32] -%endif - por m10, m9 + pandn m4, m1, [tmpq+strideq*0] +%else ; %2 != v + pandn m4, m1, [rsp+11*16] +%endif ; %2==/!=v + por m5, m4 %ifidn %2, v - mova [tmpq+strideq*0], m10 ; q4 -%else - mova [rsp+10*32], m10 -%endif + mova [tmpq+strideq*0], m5 ; q4 +%else ; %2 != v + mova [rsp+11*16], m5 +%endif ; %2==/!=v ; sub p1/q3, add q6*2 - psubw m8, m3 + psubw m3, P1 paddw m0, m0 - psubw m8, m15 - paddw m8, m0 - psrlw m10, m8, 4 - pand m10, m1 + psubw m3, Q3 + paddw m3, m0 + psrlw m5, m3, 4 + pand m5, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*1] -%else - pandn m9, m1, [rsp+11*32] -%endif - por m10, m9 + pandn m4, m1, [tmpq+strideq*1] +%else ; %2 != v + pandn m4, m1, [rsp+12*16] +%endif ; %2==/!=v + por m5, m4 %ifidn %2, v - mova [tmpq+strideq*1], m10 ; q5 -%else - mova [rsp+11*32], m10 -%endif + mova [tmpq+strideq*1], m5 ; q5 +%else ; %2 != v + mova [rsp+12*16], m5 +%endif ; %2==/!=v - mova m9, [rsp+0*32] - mova m13, [rsp+1*32] - mova m3, [rsp+2*32] - mova m4, [rsp+3*32] - mova m5, [rsp+4*32] - SWAP 2, 6 - SWAP 7, 14 + mova m4, [rsp+0*16] %ifidn %2, v lea tmpq, [dstq+mstrideq*4] -%else - mova m15, [rsp+14*32] -%endif %endif +%if ARCH_X86_64 + SWAP 2, 11 + SWAP 7, 14 + SWAP 6, 15 +%else ; x86-32 + mova Q1, m2 + mova Q2, m7 +%endif ; x86-32/64 +%if WRITE_IN_PLACE + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq] +%elif ARCH_X86_64 + mova P2, [rsp+1*16] + mova P1, [rsp+2*16] + mova P0, [rsp+3*16] + mova Q0, [rsp+4*16] +%else ; !WRITE_IN_PLACE & x86-32 + mova m0, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m3, [rsp+4*16] + mova m7, [rsp+5*16] + mova P2, m0 + mova P1, m1 + mova P0, m2 + mova Q0, m3 + mova Q3, m7 +%endif ; WRITE_IN_PLACE / x86-32/64 +%undef WRITE_IN_PLACE +%endif ; %1 == 16 %if %1 >= 8 + ; flat8 filter -%ifidn %2, v - mova m0, [tmpq+strideq*0] ; p3 -%else - mova m0, [rsp+5*32] ; p3 -%endif - paddw m1, m0, m13 ; p3+p2 - paddw m2, m3, m4 ; p1+p0 - paddw m8, m1, m1 ; 2*(p3+p2) + mova m0, P3 ; p3 + paddw m1, m0, P2 ; p3+p2 + paddw m2, P1, P0 ; p1+p0 + paddw m3, m1, m1 ; 2*(p3+p2) paddw m2, m0 ; p1+p0+p3 - paddw m8, m5 ; 2*(p3+p2)+q0 - paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 - pmulhrsw m7, m2, [pw_4096] - - paddw m8, m3, m6 - psubw m2, m1 - paddw m2, m8 - pmulhrsw m8, m2, [pw_4096] - - paddw m10, m0, m3 - paddw m11, m4, m14 - psubw m2, m10 - paddw m2, m11 - pmulhrsw m10, m2, [pw_4096] - - paddw m11, m0, m4 - paddw m1, m5, m15 - psubw m2, m11 - paddw m2, m1 - pmulhrsw m11, m2, [pw_4096] - - paddw m2, m6 - paddw m2, m15 - paddw m1, m13, m5 - psubw m2, m1 - pmulhrsw m1, m2, [pw_4096] - - psubw m2, m3 - psubw m2, m6 - paddw m0, m15, m14 - paddw m2, m0 - pmulhrsw m2, [pw_4096] - - REPX {pand x, m9}, m7, m8, m10, m11, m1, m2 -%if avx_enabled - REPX {pandn x, m9, x}, m13, m3, m4, m5, m6, m14 -%else - pcmpeqw m0, m0 - pxor m0, m9 - REPX {pand x, m0}, m13, m3, m4, m5, m6, m14 -%endif - por m13, m7 - por m3, m8 - por m4, m10 - por m5, m11 - por m6, m1 - por m14, m2 + paddw m3, Q0 ; 2*(p3+p2)+q0 + paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [PIC_sym(pw_4096)] + psubw m7, P2 + pand m7, m4 + + paddw m3, P1, Q1 ; p1+q1 + psubw m2, m1 ; 2*p3+p2+p1+p0+q0 + paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 + pmulhrsw m3, m2, [PIC_sym(pw_4096)] + psubw m3, P1 + pand m3, m4 + + paddw m5, m0, P1 ; p3+p1 + paddw m6, P0, Q2 ; p0+q2 + psubw m2, m5 ; p3+p2+p1+p0+q0+q1 + paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 + pmulhrsw m5, m2, [PIC_sym(pw_4096)] + psubw m5, P0 + pand m5, m4 + + paddw m6, m0, P0 
; p3+p0 + paddw m1, Q0, Q3 ; q0+q3 + psubw m2, m6 ; p2+p1+p0+q0+q1+q2 + paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 + pmulhrsw m6, m2, [PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 + paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 + paddw m1, P2, Q0 ; p2+q0 + psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 + pmulhrsw m1, m2, [PIC_sym(pw_4096)] + psubw m1, Q1 + pand m1, m4 + + psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 + psubw m2, Q1 ; p0+q0+q1+q2+2*q3 + paddw m0, Q3, Q2 ; q3+q2 + paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 + pmulhrsw m2, [PIC_sym(pw_4096)] + psubw m2, Q2 + pand m2, m4 + + paddw m7, P2 + paddw m3, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m1, Q1 + paddw m2, Q2 %ifidn %2, v - mova [tmpq+strideq*1], m13 ; p2 + mova [tmpq+strideq*1], m7 ; p2 mova [tmpq+strideq*2], m3 ; p1 - mova [tmpq+stride3q ], m4 ; p0 - mova [dstq+strideq*0], m5 ; q0 - mova [dstq+strideq*1], m6 ; q1 - mova [dstq+strideq*2], m14 ; q2 -%else - mova m0, [rsp+5*32] + mova [tmpq+stride3q ], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m1 ; q1 + mova [dstq+strideq*2], m2 ; q2 +%else ; %2 != v + mova m0, P3 + %if %1 == 8 - TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + SWAP 4, 15 + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 +%else + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ + Q3, [tmpq+strideq*1-8], a, u +%endif ; write 8x8 - movu [dstq+strideq*0-8], xm0 - movu [dstq+strideq*1-8], xm13 - movu [dstq+strideq*2-8], xm3 - movu [dstq+stride3q -8], xm4 - lea dstq, [dstq+strideq*4] - movu [dstq+strideq*0-8], xm5 - movu [dstq+strideq*1-8], xm6 - movu [dstq+strideq*2-8], xm14 - movu [dstq+stride3q -8], xm15 - lea dstq, [dstq+strideq*4] -%else - mova m0, [rsp+6*32] - mova m1, [rsp+7*32] - mova m2, [rsp+8*32] - mova m7, [rsp+9*32] - mova m8, [rsp+5*32] - TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 - - mova [dstq+strideq*0-16], xm0 - mova [dstq+strideq*1-16], xm1 - mova [dstq+strideq*2-16], xm2 - mova [dstq+stride3q -16], xm7 - lea tmpq, [dstq+strideq*4] - mova [tmpq+strideq*0-16], xm8 - mova [tmpq+strideq*1-16], xm13 - mova [tmpq+strideq*2-16], xm3 - mova [tmpq+stride3q -16], xm4 - - mova m0, [rsp+10*32] - mova m1, [rsp+11*32] - mova m2, [rsp+12*32] - mova m3, [rsp+13*32] - TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 - mova [dstq+strideq*0], xm5 - mova [dstq+strideq*1], xm6 - mova [dstq+strideq*2], xm14 - mova [dstq+stride3q ], xm15 - lea dstq, [dstq+strideq*4] - mova [dstq+strideq*0], xm0 - mova [dstq+strideq*1], xm1 - mova [dstq+strideq*2], xm2 - mova [dstq+stride3q ], xm3 - lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], m0 + movu [dstq+strideq*1-8], m7 + movu [dstq+strideq*2-8], m3 + movu [dstq+stride3q -8], m5 + movu [tmpq+strideq*0-8], m6 +%if ARCH_X86_64 + movu [tmpq+strideq*1-8], m1 %endif + movu [tmpq+strideq*2-8], m2 + movu [tmpq+stride3q -8], m4 + lea dstq, [dstq+strideq*8] +%else ; %1 != 8 +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 +%else + mova [rsp+1*16], m6 + mova [rsp+2*16], m1 + mova [rsp+3*16], m2 %endif -%elif %1 == 6 - ; flat6 filter - paddw m8, m3, m4 - paddw m8, m13 ; p2+p1+p0 - paddw m11, m13, m5 - paddw m8, m8 - paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 - pmulhrsw m2, m8, [pw_4096] - - paddw m8, m5 - paddw m11, m13, m13 - paddw m8, m6 - psubw m8, m11 - pmulhrsw m10, m8, [pw_4096] - - paddw m8, m6 - paddw m11, m13, m3 - paddw m8, m14 - psubw m8, m11 - pmulhrsw m11, m8, [pw_4096] - - psubw m8, m3 - paddw m14, m14 - psubw m8, m4 - paddw m8, m14 - pmulhrsw m8, [pw_4096] - - REPX {pand x, m9}, m2, m10, m11, m8 -%if avx_enabled - REPX 
{pandn x, m9, x}, m3, m4, m5, m6 + mova m1, [rsp+ 7*16] + mova m2, [rsp+ 8*16] + mova m4, [rsp+ 9*16] + mova m6, [rsp+10*16] + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 %else - pcmpeqw m0, m0 - pxor m0, m9 - REPX {pand x, m0}, m3, m4, m5, m6 + mova [rsp+7*16], m5 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ + [rsp+7*16], [tmpq+strideq*1-16], a, a %endif - por m3, m2 - por m4, m10 - por m5, m11 - por m6, m8 -%ifidn %2, v - mova [tmpq+strideq*2], m3 ; p1 - mova [tmpq+stride3q ], m4 ; p0 - mova [dstq+strideq*0], m5 ; q0 - mova [dstq+strideq*1], m6 ; q1 + mova [dstq+strideq*0-16], m1 + mova [dstq+strideq*1-16], m2 + mova [dstq+strideq*2-16], m4 + mova [dstq+stride3q -16], m6 + mova [tmpq+strideq*0-16], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1-16], m7 +%endif + mova [tmpq+strideq*2-16], m3 + mova [tmpq+stride3q -16], m5 + +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 4, 15 %else - TRANSPOSE_8x4_AND_WRITE_4x8 + mova m6, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m4, Q3 %endif + mova m0, [rsp+11*16] + mova m3, [rsp+12*16] + mova m5, [rsp+13*16] +%if ARCH_X86_64 + mova m7, [rsp+14*16] + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 %else -%ifidn %2, v - mova [tmpq+strideq*0], m3 ; p1 - mova [tmpq+strideq*1], m4 ; p0 - mova [tmpq+strideq*2], m5 ; q0 - mova [tmpq+stride3q ], m6 ; q1 + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ + [rsp+14*16], [tmpq+strideq*1], a, a +%endif + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + mova [tmpq+strideq*0], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1], m3 +%endif + mova [tmpq+strideq*2], m5 + mova [tmpq+stride3q ], m7 + lea dstq, [dstq+strideq*8] +%endif ; %1==/!=8 +%endif ; %2==/!=v +%elif %1 == 6 + ; flat6 filter + paddw m3, P1, P0 ; p1+p0 + paddw m3, P2 ; p2+p1+p0 + paddw m6, P2, Q0 ; p2+q0 + paddw m3, m3 ; 2*(p2+p1+p0) + paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m3, [PIC_sym(pw_4096)] + psubw m2, P1 + pand m2, m4 + + paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) + paddw m6, P2, P2 ; 2*p2 + paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 + psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 + pmulhrsw m5, m3, [PIC_sym(pw_4096)] + psubw m5, P0 + pand m5, m4 + + paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) + paddw m6, P2, P1 ; p2+p1 + paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 + psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 + pmulhrsw m6, m3, [PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + psubw m3, P1 ; 2*(p0+q0+q1)+q2 +%if ARCH_X86_64 + paddw Q2, Q2 ; q2*2 %else - TRANSPOSE_8x4_AND_WRITE_4x8 + mova m0, Q2 + paddw m0, m0 %endif + psubw m3, P0 ; p0+2*(q0+q1)+q2 +%if ARCH_X86_64 + paddw m3, Q2 ; p0+q*(q0+q1+q2)+q2 +%else + paddw m3, m0 %endif + pmulhrsw m3, [PIC_sym(pw_4096)] + psubw m3, Q1 + pand m3, m4 + + paddw m2, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m3, Q1 + +%ifidn %2, v + mova [dstq+mstrideq*2], m2 ; p1 + mova [dstq+mstrideq*1], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m3 ; q1 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0 +%endif ; %2==/!=v +%else ; %1 == 4 +%if ARCH_X86_64 +%ifidn %2, v + mova [dstq+mstrideq*2], P1 ; p1 + mova [dstq+mstrideq*1], P0 ; p0 + mova [dstq+strideq*0], Q0 ; q0 + mova [dstq+strideq*1], Q1 ; q1 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0 +%endif ; %2==/!=v +%else ; x86-32 +%ifidn %2, v + mova [dstq+mstrideq*2], m3 + mova [dstq+mstrideq*1], m5 + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m7 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0 +%endif ; %2==/!=v 
+%endif ; x86-32/64 +%endif ; %1 +%undef P3 +%undef P2 +%undef P1 +%undef P0 +%undef Q0 +%undef Q1 +%undef Q2 +%undef Q3 %endmacro INIT_XMM ssse3 -cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ +; stack layout: +; r0 - flat8 backup inside flat16 code +%if ARCH_X86_64 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \ dst, stride, mask, l, l_stride, lut, \ - w, stride3, mstride, tmp, mask_bits + w, stride3, mstride, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc - lea r11, [pw_4] - add r11, r6 + lea bdmulq, [pw_4] + add bdmulq, r6 mov wd, wm shl l_strideq, 2 sub lq, l_strideq +%else +; stack layout [32bit only]: +; r1-4 - p2-q0 post-filter16 +; r5 - p3 +; r6 - q3 post-filter16 +; r7 - GPRs [mask_bitsm, mstridem] +; r8 - m12/pb_mask +; r9 - bdmulq +cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 10*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base +%define pic_regm dword [esp+7*16+2*gprsize] + mov pic_regm, pic_regq + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+9*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] +%if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] +%else +%define mstridem dword [esp+7*16+1*gprsize] + mov mstridem, mstrideq +%define mask_bitsm dword [esp+7*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+8*16] + mova m12, m0 +%endif .loop: +%if ARCH_X86_64 test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif jz .no_flat16 FILTER 16, v jmp .end .no_flat16: +%if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif jz .no_flat FILTER 8, v jmp .end .no_flat: +%if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif jz .end FILTER 4, v .end: +%if ARCH_X86_64 pslld m12, 2 add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif add dstq, 16 +%if ARCH_X86_64 shl mask_bitsd, 2 sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS RET INIT_XMM ssse3 -cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ +; stack layout: +; r0 - flat8 backup inside flat16 +; r1-4 - p2-q0 post-filter16 backup +; r5 - q3 post-filter16 backup +; r6 - p3 +; r7-10 - p7-4 +; r11-14 - q4-7 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ dst, stride, mask, l, l_stride, lut, \ - h, stride3, l_stride3, tmp, mask_bits + h, stride3, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc - lea r11, [pw_4] - add r11, r6 + lea bdmulq, [pw_4] + add bdmulq, r6 mov hd, hm shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r15 - GPRs [mask_bitsm] +; r16 - m12/pb_mask +; r17 - bdmulq +; r18-24 - p2-q3 +cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 25*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+17*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif sub lq, 4 lea stride3q, [strideq*3] - lea l_stride3q, [l_strideq*3] +%if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] +%else +%define mask_bitsm dword 
[esp+15*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+16*16] + mova m12, m0 +%endif .loop: +%if ARCH_X86_64 test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif jz .no_flat16 FILTER 16, h jmp .end .no_flat16: +%if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif jz .no_flat FILTER 8, h jmp .end .no_flat: +%if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif jz .no_filter FILTER 4, h @@ -1044,79 +1593,185 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ .no_filter: lea dstq, [dstq+strideq*8] .end: +%if ARCH_X86_64 pslld m12, 2 lea lq, [lq+l_strideq*2] shl mask_bitsd, 2 sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS RET INIT_XMM ssse3 +%if ARCH_X86_64 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ - w, stride3, mstride, tmp, mask_bits + w, stride3, mstride, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc - lea r11, [pw_4] - add r11, r6 + lea bdmulq, [pw_4] + add bdmulq, r6 mov wd, wm shl l_strideq, 2 sub lq, l_strideq +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm, mstridem] +; r1 - m12/pb_mask +; r2 - bdmulq +cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 3*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] +%if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] +%define mstridem dword [esp+1*gprsize] + mov mask_bitsm, 0x3 + mov mstridem, mstrideq + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif .loop: +%if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif jz .no_flat FILTER 6, v jmp .end .no_flat: +%if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif jz .end FILTER 4, v .end: +%if ARCH_X86_64 pslld m12, 2 add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif add dstq, 16 +%if ARCH_X86_64 shl mask_bitsd, 2 sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS RET INIT_XMM ssse3 -cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ +%if ARCH_X86_64 +cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \ dst, stride, mask, l, l_stride, lut, \ - h, stride3, l_stride3, tmp, mask_bits + h, stride3, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc - lea r11, [pw_4] - add r11, r6 + lea bdmulq, [pw_4] + add bdmulq, r6 mov hd, hm shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm] +; r1 - m12/pb_mask +; r2 - bdmulq +; r3-8 - p2-q2 +cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 9*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] 
+%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif sub lq, 4 lea stride3q, [strideq*3] - lea l_stride3q, [l_strideq*3] +%if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif .loop: +%if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif jz .no_flat FILTER 6, h jmp .end .no_flat: +%if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif jz .no_filter FILTER 4, h @@ -1125,11 +1780,22 @@ cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ .no_filter: lea dstq, [dstq+strideq*8] .end: +%if ARCH_X86_64 pslld m12, 2 lea lq, [lq+l_strideq*2] shl mask_bitsd, 2 sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS RET - -%endif ; ARCH_X86_64 From 88cf358d76e2fba13b90f6b09e84fcd30d666ab0 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 2 Jul 2021 15:38:24 +0200 Subject: [PATCH 134/188] x86: Add high bitdepth cfl_pred SSSE3 asm --- src/x86/ipred16_sse.asm | 309 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm index ad35c52074..d7cf7211d8 100644 --- a/src/x86/ipred16_sse.asm +++ b/src/x86/ipred16_sse.asm @@ -47,12 +47,16 @@ pw_2048: times 4 dw 2048 %define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) %define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) +%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc @@ -879,6 +883,311 @@ cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ %endif RET +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac + LEA t0, ipred_cfl_left_16bpc_ssse3_table + movd m4, wd + tzcnt wd, wd + movifnidn hd, hm + add tlq, 2 + movsxd r6, [t0+wq*4] + movd m5, wd + jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + LEA t0, ipred_cfl_left_16bpc_ssse3_table + tzcnt wd, wm + lea r6d, [hq*2] + movd m4, hd + sub tlq, r6 + tzcnt r6d, hd + movd m5, r6d + movsxd r6, [t0+r6*4] +.start: + movd m7, r7m + movu m0, [tlq] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table + movsxd wq, [t0+wq*4] + pxor m6, m6 + pshuflw m7, m7, q0000 + pcmpeqw m3, m3 + add wq, t0 + movifnidn acq, acmp + pavgw m4, m6 + punpcklqdq m7, m7 + jmp r6 +.h32: + movu m1, [tlq+48] + movu m2, [tlq+32] + paddw m0, m1 + paddw m0, m2 +.h16: + movu m1, [tlq+16] + paddw m0, m1 +.h8: + pshufd m1, m0, q1032 + paddw m0, m1 +.h4: + pmaddwd m0, m3 + psubd m4, m0 + pshuflw m0, m4, q1032 + paddd m0, 
m4 + psrld m0, m5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +%macro IPRED_CFL 2 ; dst, src + pabsw m%1, m%2 + pmulhrsw m%1, m2 + psignw m%2, m1 + psignw m%1, m%2 + paddw m%1, m0 + pmaxsw m%1, m6 + pminsw m%1, m7 +%endmacro + +cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd m4, t0d + tzcnt t0d, t0d + movd m5, t0d + LEA t0, ipred_cfl_16bpc_ssse3_table + tzcnt wd, wd + movd m7, r7m + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw m4, 1 + pxor m6, m6 + pshuflw m7, m7, q0000 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + pcmpeqw m3, m3 + punpcklqdq m7, m7 + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + cmp hd, 4 + jg .w4_mul + psrld m0, 3 + jmp .w4_end +.w4_mul: + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 16 + cmove r6d, r2d + movd m1, r6d + psrld m0, 2 + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s4: + movd m1, alpham + lea r6, [strideq*3] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + movq [dstq+strideq*0], m3 + movhps [dstq+strideq*1], m3 + movq [dstq+strideq*2], m4 + movhps [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4_loop + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+strideq*0], m3 + mova [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s8_loop + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + add dstq, strideq + dec hd + jg .s16_loop + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m1, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 8 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32: + movd m1, alpham + pshuflw 
m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + mova m4, [acq+16*2] + mova m5, [acq+16*3] + add acq, 16*4 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*2], m3 + mova [dstq+16*3], m4 + add dstq, strideq + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac + tzcnt wd, wm + LEA t0, ipred_cfl_splat_16bpc_ssse3_table + mov r6d, r7m + movifnidn hd, hm + shr r6d, 11 + movd m7, r7m + movsxd wq, [t0+wq*4] + movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] + pshuflw m7, m7, q0000 + pxor m6, m6 + add wq, t0 + movifnidn acq, acmp + punpcklqdq m7, m7 + jmp wq + cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 From 3051dbc73eb014a0a2e150478e9eba10ee8bfe12 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 2 Jul 2021 15:38:30 +0200 Subject: [PATCH 135/188] x86: Add high bitdepth cfl_ac SSSE3 asm --- src/x86/ipred16_sse.asm | 486 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 486 insertions(+) diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm index d7cf7211d8..48d797b27a 100644 --- a/src/x86/ipred16_sse.asm +++ b/src/x86/ipred16_sse.asm @@ -32,6 +32,9 @@ pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 pb_0_1: times 4 db 0, 1 pb_2_3: times 4 db 2, 3 +pw_1: times 4 dw 1 +pw_2: times 4 dw 2 +pw_4: times 4 dw 4 pw_512: times 4 dw 512 pw_2048: times 4 dw 2048 @@ -57,6 +60,7 @@ JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc @@ -1188,6 +1192,488 @@ cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac punpcklqdq m7, m7 jmp wq +cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + paddw m5, m5 +%else + movddup m5, [pw_2] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + pmaddwd m2, m5, [ypxq+strideq*2] + pmaddwd m3, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m0, m1 + paddd m2, m3 + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .dc + punpckhqdq m0, m0 + pslld m2, 2 +.w4_hpad: + mova [acq+16*0], m0 + paddd m4, m2 + mova [acq+16*1], m0 + add acq, 16*2 + sub hpadd, 4 + jg .w4_hpad + jmp .dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*1+16*0] + pmaddwd m1, m5, [ypxq+strideq*0+16*1] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m2 + paddd m1, m3 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz .dc + pslld m2, 2 + mova m1, m0 + jmp .hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + paddd 
m0, m1 + pshufd m1, m0, q3333 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m6, m5, [ypxq+strideq*1+16*0] + paddd m0, m6 + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+strideq*0+16*1] + pmaddwd m6, m5, [ypxq+strideq*1+16*1] + paddd m3, m6 + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+strideq*0+16*2] + pmaddwd m6, m5, [ypxq+strideq*1+16*2] + paddd m1, m6 + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+strideq*0+16*3] + pmaddwd m6, m5, [ypxq+strideq*1+16*3] + paddd m2, m6 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + paddd m6, m0, m3 + packssdw m0, m3 + paddd m6, m1 + mova [acq+16*0], m0 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz .dc + paddd m2, m2 +.hpad: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m0 + mova [acq+16*3], m1 + add acq, 16*4 + sub hpadd, 4 + jg .hpad +.dc: + sub r5, acq ; -w*h*2 + pshufd m2, m4, q1032 + tzcnt r1d, r5d + paddd m2, m4 + sub r1d, 2 + pshufd m4, m2, q2301 + movd m0, r1d + paddd m2, m4 + psrld m2, m0 + pxor m0, m0 + pavgw m2, m0 + packssdw m2, m2 +.dc_loop: + mova m0, [acq+r5+16*0] + mova m1, [acq+r5+16*1] + psubw m0, m2 + psubw m1, m2 + mova [acq+r5+16*0], m0 + mova [acq+r5+16*1], m1 + add r5, 16*2 + jl .dc_loop + RET + +cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + psllw m5, 2 +%else + movddup m5, [pw_4] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m3, m5, [ypxq+strideq*1] + pmaddwd m1, m5, [ypxq+strideq*2] + pmaddwd m2, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m4, m0 + packssdw m0, m3 + paddd m3, m1 + packssdw m1, m2 + paddd m4, m2 + paddd m4, m3 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + pslld m2, 3 + mova [acq+16*0], m1 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m1 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*0+16*1] + pmaddwd m1, m5, [ypxq+strideq*1+16*0] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq+16*0], m0 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + pshufd m2, m0, q3333 + pshufd m3, m1, q3333 + paddd m4, m0 + packssdw 
m0, m2 + paddd m4, m2 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+16*0] + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+16*1] + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+16*2] + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+16*3] +.w16_wpad_end: + add ypxq, strideq + paddd m6, m0, m3 + packssdw m0, m3 + mova [acq+16*0], m0 + paddd m6, m1 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad + +cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h +%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table + LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table + tzcnt wd, wm + movifnidn hpadd, hpadm + pxor m4, m4 + movsxd wq, [r6+wq*4] + movddup m5, [base+pw_1] + add wq, r6 + mov hd, hm + shl hpadd, 2 + sub hd, hpadd + jmp wq +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + movq m0, [ypxq+strideq*0] + movhps m0, [ypxq+strideq*1] + movq m1, [ypxq+strideq*2] + movhps m1, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + mova [acq+16*0], m1 + pslld m2, 2 + mova [acq+16*1], m1 + punpckhqdq m2, m2 + mova [acq+16*2], m1 + paddd m4, m2 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: + mov r5, acq +.w8_loop: + mova m0, [ypxq+strideq*0] + mova m1, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w16_wpad2: + pshufhw m3, m2, q3333 + pshufhw m1, m0, q3333 + punpckhqdq m3, m3 + punpckhqdq m1, m1 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + mov r5, acq +.w16_loop: + mova m2, [ypxq+strideq*0+16*0] + mova m0, [ypxq+strideq*1+16*0] + psllw m2, 3 + psllw m0, 3 + test wpadd, wpadd + jnz .w16_wpad2 + mova m3, [ypxq+strideq*0+16*1] + mova m1, [ypxq+strideq*1+16*1] + psllw m3, 3 + psllw m1, 3 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + mova [acq+16*0], m2 + pmaddwd m2, m5 + mova [acq+16*1], m3 + pmaddwd m3, m5 + paddd m4, m2 + pmaddwd m2, m5, m0 + mova [acq+16*2], m0 + paddd m4, m3 + pmaddwd m3, m5, m1 + mova [acq+16*3], m1 + add acq, 16*4 + paddd m2, m3 + paddd m4, m2 + sub hd, 2 + jg .w16_loop + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w32_wpad6: + pshufhw m1, m0, q3333 + punpckhqdq m1, m1 + mova m2, m1 + mova m3, m1 + jmp .w32_wpad_end 
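; [annotation, not part of the upstream patch] The .w32_wpad* branches around
; this point handle right-edge padding in the 32-wide 4:4:4 cfl_ac path:
; pshufhw+punpckhqdq broadcast the last valid (already left-shifted) sample of
; the preceding 8-sample block into the padded blocks, so the store/sum code at
; .w32_wpad_end can treat padded and real columns identically.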
+.w32_wpad4: + pshufhw m2, m1, q3333 + punpckhqdq m2, m2 + mova m3, m2 + jmp .w32_wpad_end +.w32_wpad2: + pshufhw m3, m2, q3333 + punpckhqdq m3, m3 + jmp .w32_wpad_end +.w32: + movifnidn wpadd, wpadm + mov r5, acq + WIN64_SPILL_XMM 8 +.w32_loop: + mova m0, [ypxq+16*0] + psllw m0, 3 + cmp wpadd, 4 + jg .w32_wpad6 + mova m1, [ypxq+16*1] + psllw m1, 3 + je .w32_wpad4 + mova m2, [ypxq+16*2] + psllw m2, 3 + jnp .w32_wpad2 + mova m3, [ypxq+16*3] + psllw m3, 3 +.w32_wpad_end: + add ypxq, strideq + pmaddwd m6, m5, m0 + mova [acq+16*0], m0 + pmaddwd m7, m5, m1 + mova [acq+16*1], m1 + paddd m6, m7 + pmaddwd m7, m5, m2 + mova [acq+16*2], m2 + paddd m6, m7 + pmaddwd m7, m5, m3 + mova [acq+16*3], m3 + add acq, 16*4 + paddd m6, m7 + paddd m4, m6 + dec hd + jg .w32_loop +%if WIN64 + mova m5, m6 + WIN64_RESTORE_XMM + SWAP 5, 6 +%endif + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w32_hpad_loop: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m6 + mova [acq+16*2], m2 + mova [acq+16*3], m3 + add acq, 16*4 + dec hpadd + jg .w32_hpad_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 From bd5a77889afe6b03433d486446656107130537ac Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 29 Jun 2021 14:47:43 +0200 Subject: [PATCH 136/188] x86: Add minor improvements to wiener16 SSSE3 asm --- src/x86/looprestoration16_sse.asm | 310 ++++++++++++------------------ 1 file changed, 121 insertions(+), 189 deletions(-) diff --git a/src/x86/looprestoration16_sse.asm b/src/x86/looprestoration16_sse.asm index 0da068b11b..e682f37d99 100644 --- a/src/x86/looprestoration16_sse.asm +++ b/src/x86/looprestoration16_sse.asm @@ -49,22 +49,35 @@ wiener_round: dd 1049600, 1048832 SECTION .text +%macro movif64 2 ; dst, src + %if ARCH_X86_64 + mov %1, %2 + %endif +%endmacro + +%macro movif32 2 ; dst, src + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + INIT_XMM ssse3 %if ARCH_X86_32 DECLARE_REG_TMP 4, 6 %if STACK_ALIGNMENT < 16 - %assign stack_size 13*16+384*12 + %assign extra_stack 14*16 %else - %assign stack_size 11*16+384*12 + %assign extra_stack 12*16 %endif -cglobal wiener_filter7_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ - lpf, lpf_stride, w, flt +cglobal wiener_filter7_16bpc, 5, 7, 8, -384*12-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, flt %if STACK_ALIGNMENT < 16 - %define lpfm dword [esp+calloff+16*10+0] - %define lpf_stridem dword [esp+calloff+16*10+4] - %define wm dword [esp+calloff+16*10+8] - %define hd dword [esp+calloff+16*10+12] - %define edgeb byte [esp+calloff+16*10+16] + %define lpfm dword [esp+calloff+16*12+ 0] + %define lpf_stridem dword [esp+calloff+16*12+ 4] + %define wm dword [esp+calloff+16*12+ 8] + %define hd dword [esp+calloff+16*12+12] + %define edgeb byte [esp+calloff+16*12+16] + %define edged dword [esp+calloff+16*12+16] %else %define hd dword r6m %define edgeb byte r8m @@ -90,6 +103,7 @@ cglobal wiener_filter7_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ %define m13 [esp+calloff+16*7] %define m14 [esp+calloff+16*8] %define m15 [esp+calloff+16*9] + %define r10 r5 %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 @@ -99,7 +113,7 @@ cglobal wiener_filter7_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ mov r4, [rstk+stack_offset+28] mov hd, r4 mov r4, [rstk+stack_offset+36] - mov [esp+16*11], r4 ; edge + mov edged, r4 ; edge %endif 
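; [annotation, not part of the upstream patch] In the 32-bit,
; STACK_ALIGNMENT < 16 prologue above, the h and edge arguments are copied out
; of the caller's frame ([rstk+stack_offset+28] and [rstk+stack_offset+36])
; into the local hd/edged stack slots defined near the top of the function.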
%else DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers @@ -124,15 +138,14 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts - mov PICmem, t0 %else LEA t0, wiener_shifts mov fltq, r7m movq m1, [fltq] movq m3, [fltq+16] mov t1, r9m ; pixel_max - mov PICmem, t0 %endif + mov PICmem, t0 %endif mova m6, [base+wiener_shufA] mova m7, [base+wiener_shufB] @@ -162,6 +175,8 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, %define lpfm [rsp+0] %define lpf_stridem [rsp+8] %define base + %define wiener_lshuf7_mem [wiener_lshuf7] + %define pd_m262128_mem [pd_m262128] %else add wd, wd mova m4, [base+wiener_shufC] @@ -176,13 +191,10 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, mova m15, m3 shr t1, 11 add lpfq, wq + mova m3, [base+pd_m262128] movd m4, [base+wiener_round+t1*4] movq m5, [base+wiener_shifts+t1*8] - %if STACK_ALIGNMENT < 16 - lea t1, [esp+16*12+wq+16] - %else - lea t1, [esp+16*10+wq+16] - %endif + lea t1, [esp+extra_stack+wq+16] add dstq, wq neg wq pshufd m4, m4, q0000 @@ -191,10 +203,15 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, mov wm, wq pmullw m0, m2 pmullw m1, m2 + mova m2, [base+wiener_lshuf7] + %define pd_m262128_mem [esp+calloff+16*10] + mova pd_m262128_mem, m3 mova m10, m4 mova m11, m5 mova m12, m0 mova m13, m1 + %define wiener_lshuf7_mem [esp+calloff+16*11] + mova wiener_lshuf7_mem, m2 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top @@ -208,25 +225,14 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, mov t5, t1 add t1, 384*2 call .h_top -%if ARCH_X86_64 - lea r7, [lpfq+lpf_strideq*4] + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 - mov lpf_stridem, lpf_strideq - add r7, lpf_strideq - mov lpfm, r7 ; below -%else - mov t4m, t1 - mov t0, lpf_stridem - lea t1, [lpfq+t0*4] - mov lpfq, dstq - add t1, t0 - mov lpfm, t1 ; below - mov t1, t4m - mov t0, PICmem - add t1, 384*2 -%endif + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below call .h mov t3, t1 mov t2, t1 @@ -259,24 +265,13 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, call .v RET .no_top: -%if ARCH_X86_64 - lea r7, [lpfq+lpf_strideq*4] - mov lpfq, dstq - mov lpf_stridem, lpf_strideq - lea r7, [r7+lpf_strideq*2] - mov lpfm, r7 - call .h -%else - mov t1m, t1 - mov t0, lpf_stridem - lea t1, [lpfq+t0*4] + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] mov lpfq, dstq - lea t1, [t1+t0*2] - mov lpfm, t1 - mov t0, PICmem - mov t1, t1m + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 call .h -%endif mov t6, t1 mov t5, t1 mov t4, t1 @@ -305,19 +300,15 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, jnz .main .v3: call .v -%if ARCH_X86_32 - mov wq, wm -%endif + movif32 wq, wm .v2: call .v -%if ARCH_X86_32 - mov wq, wm -%endif + movif32 wq, wm jmp .v1 .extend_right: -%assign stack_offset_tmp stack_offset %assign stack_offset stack_offset+8 %assign calloff 8 + movif32 t0, PICmem pxor m0, m0 movd m1, wd mova m2, [base+pb_0to15] @@ -334,15 +325,13 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, psubb m0, m1 pminub m0, m2 pshufb m5, m0 + movif32 t0, t0m ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: -%if ARCH_X86_64 - mov wq, r5 
-%else - mov wq, wm -%endif + movif64 wq, r5 + movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq m3, [leftq] @@ -350,13 +339,11 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, add leftq, 8 jmp .h_main .h_extend_left: - mova m3, [lpfq+wq] ; avoid accessing memory located - pshufb m3, [base+wiener_lshuf7] ; before the start of the buffer + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, wiener_lshuf7_mem ; before the start of the buffer jmp .h_main .h_top: -%if ARCH_X86_64 - mov wq, r5 -%endif + movif64 wq, r5 test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: @@ -381,7 +368,7 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 - mova m2, [base+pd_m262128] ; (1 << 4) - (1 << 18) + mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 @@ -398,20 +385,14 @@ cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, mova [t1+wq], m0 add wq, 16 jl .h_loop -%if ARCH_X86_32 - mov wq, wm -%endif + movif32 wq, wm ret ALIGN function_align .hv: add lpfq, dst_strideq -%if ARCH_X86_64 - mov wq, r5 -%else - mov t0m, t0 - mov t1m, t1 - mov t0, PICmem -%endif + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movq m3, [leftq] @@ -420,16 +401,12 @@ ALIGN function_align jmp .hv_main .hv_extend_left: mova m3, [lpfq+wq] - pshufb m3, [base+wiener_lshuf7] + pshufb m3, wiener_lshuf7_mem jmp .hv_main .hv_bottom: -%if ARCH_X86_64 - mov wq, r5 -%else - mov t0m, t0 - mov t1m, t1 - mov t0, PICmem -%endif + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: @@ -443,9 +420,8 @@ ALIGN function_align jl .hv_have_right call .extend_right .hv_have_right: -%if ARCH_X86_32 - mov t1, t4m -%endif + movif32 t1, t4m + movif32 t0, t2m pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 @@ -457,7 +433,7 @@ ALIGN function_align pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 - mova m2, [base+pd_m262128] + mova m2, pd_m262128_mem pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 @@ -470,13 +446,12 @@ ALIGN function_align paddw m2, [t2+wq] mova m5, [t3+wq] %else - mov t0, t0m mova m2, [t1+wq] - mov t1, t2m - paddw m2, [t1+wq] + paddw m2, [t0+wq] mov t1, t3m + mov t0, t5m mova m5, [t1+wq] - mov t1, t5m + mov t1, t1m %endif paddd m0, m3 paddd m1, m4 @@ -489,11 +464,11 @@ ALIGN function_align psraw m0, 1 paddw m3, m0, [t6+wq] %else - mova m4, [t1+wq] - mov t1, t1m + mova m4, [t0+wq] paddw m4, [t1+wq] - psraw m0, 1 + mov t0, t0m mov t1, t6m + psraw m0, 1 paddw m3, m0, [t1+wq] %endif mova [t0+wq], m0 @@ -517,8 +492,8 @@ ALIGN function_align pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 -%if ARCH_X86_64 jl .hv_loop +%if ARCH_X86_64 mov t6, t5 mov t5, t4 mov t4, t3 @@ -527,10 +502,6 @@ ALIGN function_align mov t1, t0 mov t0, t6 %else - jge .hv_end - mov t0, PICmem - jmp .hv_loop -.hv_end: mov r5, t5m mov t1, t4m mov t6m, r5 @@ -548,9 +519,11 @@ ALIGN function_align add dstq, dst_strideq ret .v: -%if ARCH_X86_64 - mov wq, r5 + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 .v_loop: +%if ARCH_X86_64 mova m1, [t4+wq] paddw m1, [t2+wq] mova m2, [t3+wq] @@ -558,19 +531,17 @@ ALIGN function_align paddw m3, m4, [t6+wq] paddw m4, [t5+wq] %else - mov t1m, t1 -.v_loop: - mov t1, t4m - mova m1, [t1+wq] + mov t0, t4m mov t1, t2m + mova m1, [t0+wq] paddw m1, [t1+wq] - mov t1, t3m - mova m2, [t1+wq] + mov t0, t3m mov t1, t1m + mova m2, [t0+wq] mova 
m4, [t1+wq] - mov t1, t6m - paddw m3, m4, [t1+wq] + mov t0, t6m mov t1, t5m + paddw m3, m4, [t0+wq] paddw m4, [t1+wq] %endif punpcklwd m0, m1, m2 @@ -601,15 +572,16 @@ ALIGN function_align mov t3, t2 mov t2, t1 %else - mov t1, t5m - mov r5, t4m - mov t6m, t1 - mov t5m, r5 - mov t1, t3m + mov t0, t5m + mov t1, t4m + mov r5, t3m + mov t6m, t0 + mov t5m, t1 + mov t4m, r5 mov r5, t2m - mov t4m, t1 - mov t3m, r5 mov t1, t1m + mov t0, t0m + mov t3m, r5 mov t2m, t1 %endif add dstq, dst_strideq @@ -629,6 +601,7 @@ cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ %define wm dword [esp+calloff+16*10+0] %define hd dword [esp+calloff+16*10+4] %define edgeb byte [esp+calloff+16*10+8] + %define edged dword [esp+calloff+16*10+8] %else %define hd dword r6m %define edgeb byte r8m @@ -659,10 +632,10 @@ cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ mov r4, [rstk+stack_offset+28] mov hd, r4 mov r4, [rstk+stack_offset+36] - mov [esp+16*10+8], r4 ; edge + mov edged, r4 ; edge %endif %else -cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ +cglobal wiener_filter5_16bpc, 5, 14, 16, 384*8+16, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, flt, h %define base %endif @@ -683,15 +656,14 @@ cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts - mov PICmem, t0 %else LEA t0, wiener_shifts mov fltq, r7m movq m1, [fltq] movq m3, [fltq+16] mov t1, r9m ; pixel_max - mov PICmem, t0 %endif + mov PICmem, t0 %endif mova m5, [base+wiener_shufE] mova m6, [base+wiener_shufB] @@ -771,24 +743,14 @@ cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ mov t4, t1 add t1, 384*2 call .h_top -%if ARCH_X86_64 - lea r7, [lpfq+lpf_strideq*4] + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 - mov lpf_stridem, lpf_strideq - add r7, lpf_strideq - mov lpfm, r7 ; below -%else - mov t3m, t1 - mov t0, lpf_stridem - lea t1, [lpfq+t0*4] - mov lpfq, dstq - add t1, t0 - mov lpfm, t1 ; below - mov t1, t3m - add t1, 384*2 -%endif + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below call .h mov t2, t1 dec hd @@ -813,23 +775,13 @@ cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ .end: RET .no_top: -%if ARCH_X86_64 - lea r7, [lpfq+lpf_strideq*4] + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] mov lpfq, dstq - mov lpf_stridem, lpf_strideq - lea r7, [r7+lpf_strideq*2] - mov lpfm, r7 - call .h -%else - mov t1m, t1 - mov t0, lpf_stridem - lea t1, [lpfq+t0*4] - mov lpfq, dstq - lea t1, [t1+t0*2] - mov lpfm, t1 - mov t1, t1m + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 call .h -%endif mov t4, t1 mov t3, t1 mov t2, t1 @@ -868,12 +820,9 @@ cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ call .v jmp .end .extend_right: -%assign stack_offset_tmp stack_offset %assign stack_offset stack_offset+8 %assign calloff 8 -%if ARCH_X86_32 - mov t0, PICmem -%endif + movif32 t0, PICmem pxor m1, m1 movd m2, wd mova m0, [base+pb_2_3] @@ -890,11 +839,8 @@ cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ %assign stack_offset stack_offset-4 %assign calloff 4 .h: -%if ARCH_X86_64 - mov wq, r5 -%else - mov wq, wm -%endif + movif64 wq, r5 + movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left mova m4, [lpfq+wq] @@ -908,11 +854,8 @@ 
cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ pshufb m3, m15 ; before the start of the buffer jmp .h_main .h_top: -%if ARCH_X86_64 - mov wq, r5 -%else - mov wq, wm -%endif + movif64 wq, r5 + movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: @@ -948,19 +891,14 @@ cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \ mova [t1+wq], m0 add wq, 16 jl .h_loop -%if ARCH_X86_32 - mov wq, wm -%endif + movif32 wq, wm ret ALIGN function_align .hv: add lpfq, dst_strideq -%if ARCH_X86_64 - mov wq, r5 -%else - mov t0m, t0 - mov t1m, t1 -%endif + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left mova m4, [lpfq+wq] @@ -974,12 +912,9 @@ ALIGN function_align pshufb m3, m15 jmp .hv_main .hv_bottom: -%if ARCH_X86_64 - mov wq, r5 -%else - mov t0m, t0 - mov t1m, t1 -%endif + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: @@ -992,10 +927,8 @@ ALIGN function_align jl .hv_have_right call .extend_right .hv_have_right: -%if ARCH_X86_32 - mov t1, t1m - mov t0, t3m -%endif + movif32 t1, t1m + movif32 t0, t3m pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 @@ -1076,16 +1009,15 @@ ALIGN function_align add dstq, dst_strideq ret .v: -%if ARCH_X86_64 - mov wq, r5 + movif64 wq, r5 + movif32 t1m, t1 .v_loop: +%if ARCH_X86_64 mova m0, [t1+wq] paddw m2, m0, [t3+wq] mova m1, [t2+wq] mova m4, [t4+wq] %else - mov t1m, t1 -.v_loop: mov t0, t3m mova m0, [t1+wq] mov t1, t2m From 940eb4d53a77412f84fa552a2e251fed1fbff867 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Thu, 10 Jun 2021 15:30:16 +0200 Subject: [PATCH 137/188] x86: Add high bitdepth (10-bit) sgr SSSE3 asm --- src/x86/looprestoration16_sse.asm | 2721 +++++++++++++++++++++++++++++ 1 file changed, 2721 insertions(+) diff --git a/src/x86/looprestoration16_sse.asm b/src/x86/looprestoration16_sse.asm index e682f37d99..0ccb036939 100644 --- a/src/x86/looprestoration16_sse.asm +++ b/src/x86/looprestoration16_sse.asm @@ -35,18 +35,32 @@ wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_m14_m13: times 8 db -14,-13 pb_m10_m9: times 8 db -10, -9 pb_m6_m5: times 8 db -6, -5 pb_m2_m1: times 8 db -2, -1 pb_2_3: times 8 db 2, 3 pb_6_7: times 8 db 6, 7 +pw_25: times 8 dw 25 +pw_256: times 8 dw 256 +pw_1023: times 8 dw 1023 +pd_8: times 4 dd 8 +pd_4096: times 4 dd 4096 +pd_34816: times 4 dd 34816 pd_m262128: times 4 dd -262128 +pd_0xffff: times 4 dd 0xffff +pd_0xf00800a4: times 4 dd 0xf00800a4 +pd_0xf00801c7: times 4 dd 0xf00801c7 wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 wiener_round: dd 1049600, 1048832 +cextern sgr_x_by_x + SECTION .text %macro movif64 2 ; dst, src @@ -1055,3 +1069,2710 @@ ALIGN function_align .v_end: %endif ret + +%macro GATHERDD 3 ; dst, src, tmp + movd %3d, %2 + %if ARCH_X86_64 + movd %1, [r13+%3] + pextrw %3d, %2, 2 + pinsrw %1, [r13+%3+2], 3 + pextrw %3d, %2, 4 + pinsrw %1, [r13+%3+2], 5 + pextrw %3d, %2, 6 + pinsrw %1, [r13+%3+2], 7 + %else + movd %1, [base+sgr_x_by_x-0xf03+%3] + pextrw %3, %2, 2 + pinsrw %1, 
[base+sgr_x_by_x-0xf03+%3+2], 3 + pextrw %3, %2, 4 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 + pextrw %3, %2, 6 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 + %endif +%endmacro + +%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore + %if ARCH_X86_64 + %define tmp r14 + %else + %define tmp %4 + %endif + GATHERDD %1, %2, tmp + GATHERDD %2, %3, tmp + movif32 %4, %5 + psrld %1, 24 + psrld %2, 24 + packssdw %1, %2 +%endmacro + +%macro MAXSD 3-4 0 ; dst, src, restore_tmp + pcmpgtd %3, %1, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 + %if %4 == 1 + pxor %3, %3 + %endif +%endmacro + +%macro MULLD 3 ; dst, src, tmp + pmulhuw %3, %1, %2 + pmullw %1, %2 + pslld %3, 16 + paddd %1, %3 +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 0, 1, 2, 3, 4 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 5*16 + %else + %assign extra_stack 3*16 + %endif +cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*0+4*6] + %define dst_stridemp dword [esp+calloff+16*3+4*7] + %define leftm dword [esp+calloff+16*3+4*0] + %define lpfm dword [esp+calloff+16*3+4*1] + %define lpf_stridem dword [esp+calloff+16*3+4*2] + %define w0m dword [esp+calloff+16*3+4*3] + %define hd dword [esp+calloff+16*3+4*4] + %define edgeb byte [esp+calloff+16*3+4*5] + %define edged dword [esp+calloff+16*3+4*5] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t0m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define m8 [base+pd_8] + %define m9 [base+pw_25] + %define m10 [esp+calloff+16*2] + %define m11 [base+pd_0xf00800a4] + %define m12 [base+pw_256] + %define m13 [base+pd_34816] + %define m14 [base+pw_1023] + %define m15 [base+sgr_lshuf5] + %define r10 r5 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_5x5_16bpc, 5, 15, 16, -400*24-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + movu m10, [paramsq] + mova m12, [pw_256] + add lpfq, wq + mova m8, [pd_8] + lea t1, [rsp+wq+20] + mova m9, [pw_25] + add dstq, wq + lea t3, [rsp+wq*2+400*12+16] + mova m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + pshufhw m7, m10, q0000 + pshufb m10, m12 ; s0 + punpckhqdq m7, m7 ; w0 + neg wq + mova m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + mova m14, [pw_1023] + psllw m7, 4 + mova m15, [sgr_lshuf5] + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + movu m1, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, 
[rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*20+16] + mov t3m, t3 + pshufhw m7, m1, q0000 + mov t4m, t4 + pshufb m1, m12 ; s0 + punpckhqdq m7, m7 ; w0 + psllw m7, 4 + neg wq + mova m10, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + movif32 t2m, t1 + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t0m, t2 + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_stridemp + movif32 t4, t4m + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp +%if ARCH_X86_64 + test hb, hb +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + call .h + add lpfq, dst_stridemp + call .hv + movif32 dstq, dstm + call .n0 + call .n1 + sub hd, 2 + movif32 t0, t0m + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .h_top + add lpfq, lpf_stridem + call .hv_bottom +.end: + movif32 dstq, dstm + call .n0 + call .n1 +.end2: + RET +.height1: + movif32 t4, t4m + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + movif32 dstq, dstm + call .n0 + call .n1 +.odd_height_end: + call .v + movif32 dstq, dstm + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + lea t2, [t1+400*6] + movif32 t2m, t2 + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + movif32 t0m, t0 + jmp .main +.no_top_height1: + movif32 t3, t3m + movif32 t4, t4m + call .v + call .prep_n + jmp .odd_height_end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + mova m3, [base+pb_m14_m13] + mova m0, [base+pb_0to15] + pshufb m1, m6 + psubb m2, m12, m1 + psubb m3, m1 + movd m1, [lpfq-2] + pcmpgtb m2, m0 + pcmpgtb m3, m0 + pshufb m1, m12 + pand m4, m2 + pand m5, m3 + pandn m2, m1 + pandn m3, m1 + por m4, m2 + por m5, m3 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m15 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+wq+400*0] + paddd m1, 
[t1+wq+400*2] + paddd m2, [t1+wq+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+wq+400*0], m0 + mova [t1+wq+400*2], m1 + mova [t1+wq+400*4], m2 + add wq, 16 + jl .h_loop + ret +.top_fixup: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m15 + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv_loop_start +%endif +.hv_loop: + movif32 lpfq, hvsrcm +.hv_loop_start: + movu m4, [lpfq+wq- 2] +.hv_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t3, hd + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] +%if ARCH_X86_64 + test hd, hd +%else + test t3, t3 +%endif + jz .hv_last_row +.hv_main2: + paddw m1, [t2+wq+400*0] ; hv sum + paddd m4, [t2+wq+400*2] ; hv sumsq + paddd m5, [t2+wq+400*4] + mova [t0+wq+400*0], m0 + mova [t0+wq+400*2], m2 + mova [t0+wq+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + MULLD m4, m9, m0 ; a * 25 + MULLD m5, m9, m0 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, t2, t2m + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m5 + MULLD m1, m4, m5 + psubw m5, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m5 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + movif32 t2m, t2 + movif32 t0m, t0 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+wq+400*0], m1 + paddw m1, m0 + mova [t1+wq+400*2], m4 + paddd m4, m2 + mova [t1+wq+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v_loop: + 
mova m0, [t1+wq+400*0] + mova m2, [t1+wq+400*2] + mova m3, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + MULLD m4, m9, m0 ; a * 25 + MULLD m5, m9, m0 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, t2, t2m + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m5 + MULLD m1, m4, m5 + psubw m5, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m5 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+wq*1+400*2+ 0] + paddd m4, m1, [t3+wq*2+400*4+ 0] + paddd m5, m2, [t3+wq*2+400*4+16] + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + mova m0, [dstq+wq] + mova m3, [t4+wq*1+400*2+ 0] + mova m4, [t3+wq*2+400*4+ 0] + mova m5, [t3+wq*2+400*4+16] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 7) + paddd m3, m5 + psrld m2, 8 + psrld m3, 8 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + 
add dstq, dst_stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 4*16 + %else + %assign extra_stack 2*16 + %endif +cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*2+4*0] + %define dst_stridemp dword [esp+calloff+16*2+4*1] + %define leftm dword [esp+calloff+16*2+4*2] + %define lpfm dword [esp+calloff+16*2+4*3] + %define lpf_stridem dword [esp+calloff+16*2+4*4] + %define w0m dword [esp+calloff+16*2+4*5] + %define hd dword [esp+calloff+16*2+4*6] + %define edgeb byte [esp+calloff+16*2+4*7] + %define edged dword [esp+calloff+16*2+4*7] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %define m8 [base+pd_8] + %define m9 [esp+calloff+16*1] + %define m10 [base+pd_0xf00801c7] + %define m11 [base+pd_34816] + %define m12 [base+pw_256] + %define m13 [base+pw_1023] + %define m14 [base+sgr_lshuf3] + %define m15 m6 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_3x3_16bpc, 5, 15, 16, 400*42+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + movq m9, [paramsq+4] + mova m12, [pw_256] + add lpfq, wq + lea t1, [rsp+wq+12] + mova m8, [pd_8] + add dstq, wq + lea t3, [rsp+wq*2+400*12+8] + mova m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + mova m11, [pd_34816] + pshuflw m7, m9, q3333 + pshufb m9, m12 ; s1 + punpcklqdq m7, m7 ; w1 + neg wq + pxor m6, m6 + mova m13, [pw_1023] + psllw m7, 4 + mova m14, [sgr_lshuf3] + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + movq m1, [r1+4] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*32+16] + mov t3m, t3 + pshuflw m7, m1, q3333 + mov t4m, t4 + pshufb m1, m12 ; s1 + punpcklqdq m7, m7 ; w1 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv0 +%if ARCH_X86_64 + test hb, hb +%else + mov r5, hd + test r5, r5 +%endif + jz 
.odd_height + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + mov lpfq, hvsrcm + add lpfq, lpf_stridem +%endif + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + mova m2, [base+pb_m2_m1] + mova m3, [base+pb_0to15] + movd m5, [lpfq-2] + pshufb m1, m6 + pshufb m5, m12 + psubb m2, m1 + pcmpgtb m2, m3 + pand m4, m2 + pandn m2, m5 + por m4, m2 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq+ 0] +.h_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq+ 0] +.hv0_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -18 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + 
punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq+4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq+ 0] +.hv1_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -18 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m2 + mova [t2+wq+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 
15) + paddd m1, m11 + mova [t4+wq*1+400*2 +4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*0+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 4] + movu m1, [t3+wq*2+400*0+ 8] + movu m2, [t3+wq*2+400*0+24] + movu m3, [t4+wq*1+400*0+ 2] + movu m4, [t3+wq*2+400*0+ 4] + movu m5, [t3+wq*2+400*0+20] + paddw m0, [t4+wq*1+400*0+ 0] + paddd m1, [t3+wq*2+400*0+ 0] + paddd m2, [t3+wq*2+400*0+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[-1] 444 + pslld m4, 2 ; b[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a[-1] 343 + psubd m4, m1 ; b[-1] 343 + psubd m5, m2 + mova 
[t4+wq*1+400*4], m3 + mova [t3+wq*2+400*8+ 0], m4 + mova [t3+wq*2+400*8+16], m5 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[ 0] 444 + pslld m4, 2 ; b[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400* 6], m3 + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + psubw m3, m0 ; a[ 0] 343 + psubd m4, m1 ; b[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m3, [t4+wq*1+400*0+4] + movu m1, [t4+wq*1+400*0+2] + paddw m3, [t4+wq*1+400*0+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*4] + paddw m3, [t4+wq*1+400*6] + mova [t4+wq*1+400*4], m2 + mova [t4+wq*1+400*6], m1 + movu m4, [t3+wq*2+400*0+8] + movu m1, [t3+wq*2+400*0+4] + paddd m4, [t3+wq*2+400*0+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400* 8+ 0] + paddd m4, [t3+wq*2+400*12+ 0] + mova [t3+wq*2+400* 8+ 0], m2 + mova [t3+wq*2+400*12+ 0], m1 + movu m5, [t3+wq*2+400*0+24] + movu m1, [t3+wq*2+400*0+20] + paddd m5, [t3+wq*2+400*0+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400* 8+16] + paddd m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400* 8+16], m2 + mova [t3+wq*2+400*12+16], m1 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*2+4] + movu m1, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*6] + paddw m3, [t4+wq*1+400*8] + mova [t4+wq*1+400*6], m1 + mova [t4+wq*1+400*8], m2 + movu m4, [t3+wq*2+400*4+8] + movu m1, [t3+wq*2+400*4+4] + paddd m4, [t3+wq*2+400*4+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400*12+ 0] + paddd m4, [t3+wq*2+400*16+ 0] + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*16+ 0], m2 + movu m5, [t3+wq*2+400*4+24] + movu m1, [t3+wq*2+400*4+20] + paddd m5, [t3+wq*2+400*4+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400*12+16] + paddd m5, [t3+wq*2+400*16+16] + mova [t3+wq*2+400*12+16], m1 + mova [t3+wq*2+400*16+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 
16 + %assign extra_stack 10*16 + %else + %assign extra_stack 8*16 + %endif +cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*8+4*0] + %define dst_stridemp dword [esp+calloff+16*8+4*1] + %define leftm dword [esp+calloff+16*8+4*2] + %define lpfm dword [esp+calloff+16*8+4*3] + %define lpf_stridem dword [esp+calloff+16*8+4*4] + %define w0m dword [esp+calloff+16*8+4*5] + %define hd dword [esp+calloff+16*8+4*6] + %define edgeb byte [esp+calloff+16*8+4*7] + %define edged dword [esp+calloff+16*8+4*7] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %xdefine m8 m6 + %define m9 [base+pd_8] + %define m10 [base+pd_34816] + %define m11 [base+pd_0xf00801c7] + %define m12 [base+pw_256] + %define m13 [esp+calloff+16*4] + %define m14 [esp+calloff+16*5] + %define m15 [esp+calloff+16*6] + %define m6 [esp+calloff+16*7] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_mix_16bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \ + lpf, lpf_stride, w, edge, \ + params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + mova m15, [paramsq] + add lpfq, wq + mova m9, [pd_8] + lea t1, [rsp+wq+44] + mova m10, [pd_34816] + add dstq, wq + mova m12, [pw_256] + lea t3, [rsp+wq*2+400*24+40] + mova m11, [pd_0xf00801c7] + lea t4, [rsp+wq+400*52+40] + neg wq + pshuflw m13, m15, q0000 + pshuflw m14, m15, q2222 + pshufhw m15, m15, q1010 + punpcklqdq m13, m13 ; s0 + punpcklqdq m14, m14 ; s1 + punpckhqdq m15, m15 ; w0 w1 + pxor m6, m6 + psllw m15, 2 + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + mova m2, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+52] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*24+48] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*52+48] + mov t3m, t3 + mov t4m, t4 + neg wq + pshuflw m0, m2, q0000 + pshuflw m1, m2, q2222 + pshufhw m2, m2, q1010 + punpcklqdq m0, m0 ; s0 + punpcklqdq m1, m1 ; s1 + punpckhqdq m2, m2 ; w0 w1 + mov w1m, wd + pxor m3, m3 + psllw m2, 2 + mova m13, m0 + mova m14, m1 + sub wd, 4 + mova m15, m2 + mova m6, m3 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 +%if ARCH_X86_64 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup +%else + mov wq, w0m + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop +%endif + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t4, 
t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv0 +%if ARCH_X86_64 + test hd, hd +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + mov lpfq, hvsrcm + add lpfq, lpf_stridem +%endif + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+wq+400* 0] + mova m1, [t1+wq+400* 2] + mova m2, [t1+wq+400* 4] + paddw m0, m0 + mova m3, [t1+wq+400* 6] + paddd m1, m1 + mova m4, [t1+wq+400* 8] + paddd m2, m2 + mova m5, [t1+wq+400*10] + mova [t2+wq+400* 0], m0 + mova [t2+wq+400* 2], m1 + mova [t2+wq+400* 4], m2 + mova [t2+wq+400* 6], m3 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.h: ; horizontal boxsum +%assign stack_offset stack_offset+4 +%assign calloff 4 +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m8, m1 ; sum5 + paddd m7, m2 ; sumsq5 + paddd m5, m3 + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv0_main 
+.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq- 2] +.hv0_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -20 + jl .hv0_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + movif32 t3, t3m + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; h sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m7, m2 ; h sumsq5 + paddd m5, m3 + mova [t3+wq*2+400*8+ 8], m8 + mova [t3+wq*2+400*0+ 8], m7 + mova [t3+wq*2+400*0+24], m5 + paddw m8, [t1+wq+400* 0] + paddd m7, [t1+wq+400* 2] + paddd m5, [t1+wq+400* 4] + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + paddw m0, m1, [t1+wq+400* 6] + paddd m4, m2, [t1+wq+400* 8] + paddd m5, m3, [t1+wq+400*10] + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq- 2] +.hv1_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -20 + jl .hv1_have_right +%if 
ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv1_have_right: + palignr m7, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m7, m3 + punpcklwd m0, m7, m3 + pmaddwd m0, m0 + punpckhwd m7, m3 + pmaddwd m7, m7 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m6 + pmaddwd m1, m1 + punpckhwd m3, m6 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + palignr m5, m4, 8 + punpckhwd m1, m4, m5 + paddw m8, m4, m5 + pmaddwd m1, m1 + punpcklwd m4, m5 + pmaddwd m4, m4 + paddd m7, m3 + paddw m5, m2, [t2+wq+400* 6] + mova [t2+wq+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 8], m0 + mova [t2+wq+400*10], m7 + paddd m4, m0 ; h sumsq5 + paddd m1, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m7, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m7 + psrlw m7, m5, 1 + pavgw m7, m6 ; (b3 + 2) >> 2 + punpcklwd m0, m7, m6 + pmaddwd m0, m0 + punpckhwd m7, m6 + pmaddwd m7, m7 +%if ARCH_X86_32 + mova [esp+20], m8 +%else + SWAP m8, m6 +%endif + MAXSD m2, m0, m8 + MAXSD m3, m7, m8 + pxor m8, m8 + psubd m2, m0 ; p3 + psubd m3, m7 + punpcklwd m0, m5, m8 ; b3 + punpckhwd m5, m8 + MULLD m2, m14, m8 ; p3 * s1 + MULLD m3, m14, m8 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m5, m11 + paddusw m2, m11 + paddusw m3, m11 + psrld m8, m2, 20 ; min(z3, 255) + movif32 t3, t3m + psrld m2, m3, 20 + GATHER_X_BY_X m7, m8, m2, r0, dstm + punpcklwd m2, m7, m7 + punpckhwd m8, m7, m7 + MULLD m0, m2, m3 + MULLD m5, m8, m3 + psubw m3, m12, m7 +%if ARCH_X86_32 + mova m8, [esp+20] +%endif + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m5, m10 + psrld m0, 12 + psrld m5, 12 + mova [t4+wq*1+400*4+4], m3 + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m5 +%if ARCH_X86_64 + SWAP m6, m8 + pxor m6, m6 +%endif + paddw m5, m8, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m1, [t2+wq+400*4] + paddw m5, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m8 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m1 + mova m4, [base+pw_25] + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 + MULLD m3, m4, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m1, m5, 1 + pavgw m1, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m1, m7 + pmaddwd m4, m4 + punpckhwd m1, m7 + pmaddwd m1, m1 + punpcklwd m0, m5, m7 ; b5 + punpckhwd m5, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + mova m4, [base+pd_0xf00800a4] + MAXSD m3, m1, m7 + psubd m3, m1 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m5, m4 + paddusw m2, m4 + paddusw m3, m4 + psrld m1, m2, 20 ; min(z5, 255) + psrld m2, m3, 20 + GATHER_X_BY_X m4, m1, m2, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + psubw m1, m12, m4 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m5, m10 + mova [t4+wq*1+400*0+ 4], m1 + psrld m0, 12 + psrld m5, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m5 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400* 6] + mova m4, [t1+wq+400* 8] + mova m5, [t1+wq+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400* 6] + paddd 
m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+4], m5 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + mova [t3+wq*2+400*8+ 8], m3 + mova [t3+wq*2+400*0+ 8], m4 + mova [t3+wq*2+400*0+24], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+wq+400*0], m3 + mova [t1+wq+400*2], m4 + mova [t1+wq+400*4], m5 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m4, [t1+wq+400* 6] + mova m5, [t1+wq+400* 8] + mova m7, [t1+wq+400*10] + paddw m1, m4, [t2+wq+400* 6] + paddd m2, m5, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 6], m4 + mova [t2+wq+400* 8], m5 + mova [t2+wq+400*10], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*4+4], m5 + psrld m0, 12 + psrld m8, m1, 12 + mova m4, [t3+wq*2+400*8+ 8] + mova m5, [t3+wq*2+400*0+ 8] + mova m7, [t3+wq*2+400*0+24] + paddw m1, m4, [t2+wq+400*0] + paddd m2, m5, [t2+wq+400*2] + paddd m3, m7, [t2+wq+400*4] + paddw m1, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m4 + mova [t2+wq+400*2], m5 + mova [t2+wq+400*4], m7 + mova m4, [base+pw_25] + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m8 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 + MULLD m3, m4, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + 
punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + mova m4, [base+pd_0xf00800a4] + MAXSD m3, m5, m7 + psubd m3, m5 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrld m5, m2, 20 ; min(z5, 255) + psrld m2, m3, 20 + GATHER_X_BY_X m4, m5, m2, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + psubw m5, m12, m4 + MULLD m0, m2, m7 + MULLD m1, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*0+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 2] + movu m1, [t3+wq*2+400*0+ 4] + movu m2, [t3+wq*2+400*0+20] + movu m7, [t4+wq*1+400*0+ 4] + movu m8, [t3+wq*2+400*0+ 8] + paddw m3, m0, [t4+wq*1+400*0+ 0] + paddd m4, m1, [t3+wq*2+400*0+ 0] + paddd m5, m2, [t3+wq*2+400*0+16] + paddw m3, m7 + paddd m4, m8 + movu m7, [t3+wq*2+400*0+24] + paddw m0, m3 + paddd m1, m4 + psllw m3, 2 + pslld m4, 2 + paddd m5, m7 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a5 565 + paddd m1, m4 ; b5 565 + paddd m2, m5 + mova [t4+wq*1+400* 6+ 0], m0 + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*12+16], m2 + movu m0, [t4+wq*1+400*2+ 4] + movu m3, [t4+wq*1+400*2+ 2] + paddw m0, [t4+wq*1+400*2+ 0] + movu m1, [t3+wq*2+400*4+ 8] + movu m4, [t3+wq*2+400*4+ 4] + paddd m1, [t3+wq*2+400*4+ 0] + movu m2, [t3+wq*2+400*4+24] + movu m5, [t3+wq*2+400*4+20] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[-1] 444 + pslld m4, 2 ; b3[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a3[-1] 343 + psubd m4, m1 ; b3[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8+ 0], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + movu m0, [t4+wq*1+400*4+ 4] + movu m3, [t4+wq*1+400*4+ 2] + paddw m0, [t4+wq*1+400*4+ 0] + movu m1, [t3+wq*2+400*8+ 8] + movu m4, [t3+wq*2+400*8+ 4] + paddd m1, [t3+wq*2+400*8+ 0] + movu m2, [t3+wq*2+400*8+24] + movu m5, [t3+wq*2+400*8+20] + paddd m2, [t3+wq*2+400*8+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[ 0] 444 + pslld m4, 2 ; b3[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400*10+ 0], m3 + mova [t3+wq*2+400*20+ 0], m4 + mova [t3+wq*2+400*20+16], m5 + psubw m3, m0 ; a3[ 0] 343 + psubd m4, m1 ; b3[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400*12+ 0], m3 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 4] + movu m2, [t4+wq*1+ 2] + paddw m0, [t4+wq*1+ 0] + paddw m0, m2 + paddw m2, m0 + psllw m0, 2 + paddw m0, m2 ; a5 + movu m4, [t3+wq*2+ 8] + movu m5, [t3+wq*2+24] + movu m1, [t3+wq*2+ 4] + movu m3, [t3+wq*2+20] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddd m4, m1 + paddd m5, m3 + paddd m1, m4 + paddd m3, m5 + pslld m4, 2 + pslld m5, 2 + paddd m4, m1 ; b5 + paddd m5, m3 + movu m2, [t4+wq*1+400* 6] + paddw m2, m0 + mova [t4+wq*1+400* 6], m0 + paddd m0, m4, [t3+wq*2+400*12+ 0] + paddd m1, m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + mova [rsp+16+ARCH_X86_32*4], m1 + movu m3, [t4+wq*1+400*2+4] + movu m5, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw 
m4, m5, m3 ; a3[ 1] 343 + movu m3, [t4+wq*1+400* 8] + paddw m3, [t4+wq*1+400*10] + paddw m3, m4 + mova [t4+wq*1+400* 8], m4 + mova [t4+wq*1+400*10], m5 + movu m1, [t3+wq*2+400*4+ 8] + movu m5, [t3+wq*2+400*4+ 4] + movu m7, [t3+wq*2+400*4+24] + movu m8, [t3+wq*2+400*4+20] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m7, [t3+wq*2+400*4+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 +%if ARCH_X86_32 + mova [esp+52], m8 + psubd m8, m7 +%else + psubd m6, m8, m7 + SWAP m8, m6 +%endif + paddd m1, m4, [t3+wq*2+400*16+ 0] + paddd m7, m8, [t3+wq*2+400*16+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m8 + mova [t3+wq*2+400*20+ 0], m5 +%if ARCH_X86_32 + mova m8, [esp+52] +%else + SWAP m8, m6 + pxor m6, m6 +%endif + mova [t3+wq*2+400*20+16], m8 + mova [rsp+32+ARCH_X86_32*4], m7 + movu m4, [dstq+wq] + punpcklwd m7, m2, m6 + punpckhwd m2, m6 + punpcklwd m8, m3, m6 + punpckhwd m3, m6 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + pmaddwd m7, m5 ; a5 * src + pmaddwd m8, m5 ; a3 * src + pmaddwd m2, m4 + pmaddwd m3, m4 + pslld m5, 13 + pslld m4, 13 + psubd m0, m5 + psubd m1, m5 + paddd m0, m7 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m8 ; a3 * src + b3 + (1 << 8) - (src << 13) + mova m7, [base+pd_0xffff] + psrld m0, 9 + pslld m1, 7 + pand m0, m7 + pandn m8, m7, m1 + por m0, m8 + psubd m1, m4, [rsp+16+ARCH_X86_32*4] + psubd m8, m4, [rsp+32+ARCH_X86_32*4] + psubd m2, m1 + psubd m3, m8 + mova m1, [base+pd_4096] + psrld m2, 9 + pslld m3, 7 + pand m2, m7 + pandn m7, m3 + por m2, m7 + pmaddwd m0, m15 + pmaddwd m2, m15 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + paddd m5, m1 + paddd m4, m1 + paddd m0, m5 + paddd m2, m4 + psrad m0, 8 + psrad m2, 8 + packssdw m0, m2 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +%if ARCH_X86_64 + SWAP m6, m7 +%endif +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*4+4] + movu m5, [t4+wq*1+400*4+2] + paddw m3, [t4+wq*1+400*4+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + paddw m3, m4, [t4+wq*1+400*12] + paddw m3, [t4+wq*1+400*10] + mova [t4+wq*1+400*10], m5 + mova [t4+wq*1+400*12], m4 + movu m1, [t3+wq*2+400*8+ 8] + movu m5, [t3+wq*2+400*8+ 4] + movu m7, [t3+wq*2+400*8+24] + movu m8, [t3+wq*2+400*8+20] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m7, [t3+wq*2+400*8+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 + psubd m0, m8, m7 + paddd m1, m4, [t3+wq*2+400*24+ 0] + paddd m7, m0, [t3+wq*2+400*24+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*20+ 0], m5 + mova [t3+wq*2+400*20+16], m8 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m0 + mova m5, [dstq+wq] + mova m8, [t4+wq*1+400* 6] + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m0, m8, m6 + punpckhwd m8, m6 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m0, m4 ; a5 * src + pmaddwd m2, m4 ; a3 * src + pmaddwd m8, m5 + pmaddwd m3, m5 + paddd m1, m2 ; a3 * src + b3 + (1 << 8) - (src << 13) + pslld m4, 12 + pslld m5, 12 + psubd m2, m4, [t3+wq*2+400*12+ 0] + psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + psubd m2, m5, [t3+wq*2+400*12+16] + psubd m8, m2 + paddd m4, m4 + paddd m5, m5 + paddd m7, m3 + mova m2, [base+pd_0xffff] + psubd m1, m4 + psubd m7, m5 + psrld m0, 8 + psrld m8, 
8 + pslld m1, 7 + pslld m7, 7 + pand m0, m2 + pand m8, m2 + pandn m3, m2, m1 + pandn m2, m7 + por m0, m3 + por m8, m2 + mova m1, [base+pd_4096] + pmaddwd m0, m15 + pmaddwd m8, m15 +%if ARCH_X86_64 + pxor m6, m6 + SWAP m7, m6 +%else + pxor m7, m7 +%endif + paddd m4, m1 + paddd m5, m1 + paddd m0, m4 + paddd m8, m5 + psrad m0, 8 + psrad m8, 8 + packssdw m0, m8 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret From 01968b9ce7649285bf42916bb1ba64e9dddb3846 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 5 Jul 2021 13:15:11 +0000 Subject: [PATCH 138/188] x86/film_grain: make generate_grain_y/uv_420 32-bit compatible --- src/x86/film_grain16_sse.asm | 634 ++++++++++++++++++++++++++--------- 1 file changed, 472 insertions(+), 162 deletions(-) diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm index 45389c1ec8..fc572b04a9 100644 --- a/src/x86/film_grain16_sse.asm +++ b/src/x86/film_grain16_sse.asm @@ -26,8 +26,6 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if ARCH_X86_64 - SECTION_RODATA 16 pd_16: times 4 dd 16 pw_1: times 8 dw 1 @@ -147,40 +145,52 @@ SECTION .text INIT_XMM ssse3 -cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax +%if ARCH_X86_64 +cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax lea r4, [pb_mask] %define base r4-pb_mask +%else +cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax + LEA r4, $$ +%define base r4-$$ +%endif movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r3d, [fg_dataq+FGData.grain_scale_shift] - lea r6d, [bdmaxq+1] - shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc - sub r3, r6 - SPLATW m8, [base+round+r3*2-2] + lea r5d, [bdmaxq+1] + shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r5 + SPLATW m6, [base+round+r3*2-2] mova m5, [base+pb_mask] SPLATW m0, [fg_dataq+FGData.seed] mov r3, -73*82*2 sub bufq, r3 +%if ARCH_X86_64 lea r6, [gaussian_sequence] +%endif .loop: pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set - pshufb m6, m5, m2 ; set 15th bit for next 4 seeds - psllq m2, m6, 30 - por m2, m6 - psllq m6, m2, 15 - por m2, m6 ; aggregate each bit into next seed's high bit + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit pmulhuw m3, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 +%if ARCH_X86_64 vpgatherdw m3, m2, r6, r5, r7, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 +%endif paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 ; shifts by 0, which pmulhrsw does not support - pmulhrsw m3, m8 + pmulhrsw m3, m6 movq [bufq+r3], m3 add r3, 4*2 jl .loop @@ -195,23 +205,29 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax %if WIN64 DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 lea bufq, [r0-2*(82*73-(82*3+79))] -%elif ARCH_X86_64 + PUSH r8 +%else +%if ARCH_X86_64 DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 + DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 + PUSH r6 +%define shiftd r1d +%endif sub bufq, 2*(82*73-(82*3+79)) -%else - ; FIXME shift goes into r1 (x86-32 code) - .. 
%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] %if WIN64 DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 %elif ARCH_X86_64 DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 -%else - ; x86-32 code - .. +%else ; x86-32 +%undef shiftd + DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 +%define hd dword r0m +%define maxd dword minm %endif %if cpuflag(sse4) pmovsxbw m4, m4 @@ -220,7 +236,7 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax pcmpgtb m3, m4 punpcklbw m4, m3 %endif - pinsrw m4, [pw_1], 3 + pinsrw m4, [base+pw_1], 3 pshufd m5, m4, q1111 pshufd m4, m4, q0000 SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd @@ -264,35 +280,80 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax add bufq, 82*2 dec hd jg .y_loop_ar1 +%if WIN64 + POP r8 +%elif ARCH_X86_32 + POP r6 +%undef maxd +%undef hd +%endif .ar0: RET .ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif DEFINE_ARGS buf, fg_data, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - SPLATW m12, [base+round_vals-12+shiftq*2] + movd m0, [base+round_vals-12+shiftq*2] + pshuflw m0, m0, q0000 movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 - pxor m9, m9 - punpcklwd m12, m9 - pcmpgtb m9, m6 - punpckhbw m10, m6, m9 - punpcklbw m6, m9 - pshufd m9, m6, q3333 - pshufd m8, m6, q2222 + pxor m2, m2 + punpcklwd m0, m2 + pcmpgtb m2, m6 + punpckhbw m3, m6, m2 + punpcklbw m6, m2 + pshufd m2, m6, q3333 + pshufd m1, m6, q2222 pshufd m7, m6, q1111 pshufd m6, m6, q0000 - pshufd m11, m10, q1111 - pshufd m10, m10, q0000 + pshufd m4, m3, q1111 + pshufd m3, m3, q0000 +%if ARCH_X86_64 + SWAP 0, 12 + SWAP 1, 8 + SWAP 2, 9 + SWAP 3, 10 + SWAP 4, 11 +%else +%define m12 [rsp+0*16] +%define m8 [rsp+1*16] +%define m9 [rsp+2*16] +%define m10 [rsp+3*16] +%define m11 [rsp+4*16] + mova m12, m0 + mova m8, m1 + mova m9, m2 + mova m10, m3 + mova m11, m4 + mov bdmaxd, bdmaxm +%endif sar bdmaxd, 1 - SPLATW m13, bdmaxd ; max_grain - pcmpeqw m14, m14 + SPLATW m0, bdmaxd ; max_grain + pcmpeqw m1, m1 %if !cpuflag(sse4) - pcmpeqw m15, m15 - psrldq m15, 14 - pslldq m15, 2 - pxor m15, m14 + pcmpeqw m2, m2 + psrldq m2, 14 + pslldq m2, 2 + pxor m2, m1 +%endif + pxor m1, m0 ; min_grain +%if ARCH_X86_64 + SWAP 0, 13 + SWAP 1, 14 + SWAP 2, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] + mova m13, m0 + mova m14, m1 +%if !cpuflag(sse4) +%define m15 [rsp+7*16] + mova m15, m2 +%endif %endif - pxor m14, m13 ; min_grain sub bufq, 2*(82*73-(82*3+79)) DEFINE_ARGS buf, fg_data, h, x mov hd, 70 @@ -362,6 +423,16 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax add bufq, 82*2 dec hd jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif RET .ar3: @@ -371,21 +442,36 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax and rsp, ~15 sub rsp, 64 %define tmp rsp -%else +%elif ARCH_X86_64 %define tmp rsp+stack_offset-72 +%else +%assign stack_offset stack_offset_old + ALLOC_STACK -16*12 + %define tmp rsp + mov bdmaxd, bdmaxm %endif sar bdmaxd, 1 - SPLATW m15, bdmaxd ; max_grain - pcmpeqw m14, m14 + SPLATW m7, bdmaxd ; max_grain + pcmpeqw m6, m6 %if !cpuflag(sse4) - pcmpeqw m12, m12 - psrldq m12, 14 - pslldq m12, 4 - pxor m12, m14 + pcmpeqw m4, m4 + psrldq m4, 14 + pslldq m4, 4 + pxor m4, m6 %endif - pxor m14, m15 ; min_grain + pxor m6, m7 ; min_grain mov shiftd, 
[fg_dataq+FGData.ar_coeff_shift] +%if ARCH_X86_64 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m14 [rsp+10*16] +%define m15 [esp+11*16] + mova m14, m6 + mova m15, m7 +%endif + ; build cf0-1 until 18-19 in m5-12 and r0/1 pxor m1, m1 movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 @@ -394,18 +480,37 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax punpcklbw m0, m1 %if cpuflag(sse4) - pshufd m12, m2, q3333 + pshufd m4, m2, q3333 %else - pshufd m13, m2, q3333 - mova [tmp+48], m13 + pshufd m5, m2, q3333 + mova [tmp+48], m5 %endif - pshufd m11, m2, q2222 - pshufd m10, m2, q1111 - pshufd m9, m2, q0000 - pshufd m8, m0, q3333 + pshufd m3, m2, q2222 + pshufd m1, m2, q0000 + pshufd m2, m2, q1111 pshufd m7, m0, q2222 pshufd m6, m0, q1111 pshufd m5, m0, q0000 + pshufd m0, m0, q3333 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+4*16] +%define m9 [esp+5*16] +%define m10 [rsp+6*16] +%define m11 [esp+7*16] +%define m12 [rsp+8*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif ; build cf20,round in r2 ; build cf21-23,round*2 in m13 @@ -417,8 +522,16 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax pshufd m2, m0, q1111 mova [tmp+ 0], m1 mova [tmp+16], m2 - psrldq m13, m0, 10 - pinsrw m13, [base+round_vals+shiftq*2-10], 3 + psrldq m3, m0, 10 + pinsrw m3, [base+round_vals+shiftq*2-10], 3 + +%if ARCH_X86_64 + SWAP 3, 13 +%else +%define m13 [esp+9*16] + mova m13, m3 +%endif + pinsrw m0, [base+round_vals+shiftq*2-12], 5 pshufd m3, m0, q2222 mova [tmp+32], m3 @@ -471,7 +584,7 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] - punpcklwd m2, [pw_1] + punpcklwd m2, [base+pw_1] %if cpuflag(sse4) pmaddwd m1, m12 @@ -521,54 +634,83 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax jg .y_loop_ar3 %if WIN64 mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 %endif RET INIT_XMM ssse3 -cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax +%if ARCH_X86_64 +cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg %define base r8-pb_mask lea r8, [pb_mask] movifnidn bdmaxd, bdmaxm + lea r6d, [bdmaxq+1] +%else +cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h +%define base r2-$$ + LEA r2, $$ + mov fg_dataq, r2m + mov r6d, r4m + inc r6d +%endif movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r5d, [fg_dataq+FGData.grain_scale_shift] - lea r6d, [bdmaxq+1] shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc sub r5, r6 - SPLATW m8, [base+round+r5*2-2] + SPLATW m6, [base+round+r5*2-2] mova m5, [base+pb_mask] SPLATW m0, [fg_dataq+FGData.seed] - SPLATW m9, [base+pw_seed_xor+uvq*4] - pxor m0, m9 +%if ARCH_X86_64 + SPLATW m2, [base+pw_seed_xor+uvq*4] +%else + mov r5d, r3m + SPLATW m2, [base+pw_seed_xor+r5*4] +%endif + pxor m0, m2 +%if ARCH_X86_64 lea r6, [gaussian_sequence] - mov r7d, 38 +%endif + mov hd, 38 add bufq, 44*2 .loop_y: - mov r5, -44 + mov xq, -44 .loop_x: pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set - pshufb m6, m5, m2 ; set 15th bit for next 4 seeds - psllq m2, m6, 30 - por m2, m6 - psllq m6, m2, 15 - por m2, m6 ; aggregate each 
bit into next seed's high bit + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit pmulhuw m3, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 +%if ARCH_X86_64 vpgatherdw m3, m2, r6, r9, r10, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 +%endif paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 ; shifts by 0, which pmulhrsw does not support - pmulhrsw m3, m8 - movq [bufq+r5*2], m3 - add r5, 4 + pmulhrsw m3, m6 + movq [bufq+xq*2], m3 + add xq, 4 jl .loop_x add bufq, 82*2 - dec r7d + dec hd jg .loop_y ; auto-regression code @@ -578,25 +720,51 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax jmp r5 .ar0: +%if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset_old stack_offset + ALLOC_STACK -16*2 + mov bufyq, r1m + mov uvd, r3m +%endif imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] SPLATW m3, [base+hmul_bits+shiftq*2-10] +%if ARCH_X86_64 sar bdmaxd, 1 - SPLATW m14, bdmaxd ; max_gain + SPLATW m1, bdmaxd ; max_gain +%else + SPLATW m1, r4m + psraw m1, 1 +%endif pcmpeqw m7, m7 - pxor m7, m14 ; min_grain + pxor m7, m1 ; min_grain +%if ARCH_X86_64 + SWAP 1, 14 DEFINE_ARGS buf, bufy, h, x +%else +%define m14 [rsp+0*16] + mova m14, m1 + DEFINE_ARGS buf, bufy, pic_reg, h, x +%endif pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 - SPLATW m6, [hmul_bits+4] + SPLATW m6, [base+hmul_bits+4] SPLATW m4, m4 pxor m5, m5 %if !cpuflag(sse4) - pcmpeqw m12, m12 - pslldq m12, 12 + pcmpeqw m2, m2 + pslldq m2, 12 +%if ARCH_X86_64 + SWAP 2, 12 +%else +%define m12 [rsp+1*16] + mova m12, m2 +%endif %endif sub bufq, 2*(82*38+82-(82*3+41)) add bufyq, 2*(3+82*3) @@ -605,63 +773,84 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax ; first 32 pixels xor xd, xd .x_loop_ar0: - movu m8, [bufyq+xq*4] - movu m9, [bufyq+xq*4+82*2] - movu m10, [bufyq+xq*4 +16] - movu m11, [bufyq+xq*4+82*2+16] - paddw m8, m9 - paddw m10, m11 - phaddw m8, m10 - pmulhrsw m8, m6 - punpckhwd m9, m8, m5 - punpcklwd m8, m5 - REPX {pmaddwd x, m4}, m8, m9 - REPX {psrad x, 5}, m8, m9 - packssdw m8, m9 - pmulhrsw m8, m3 - movu m0, [bufq+xq*2] - paddw m8, m0 - pminsw m8, m14 - pmaxsw m8, m7 + movu m0, [bufyq+xq*4] + movu m2, [bufyq+xq*4+82*2] + paddw m0, m2 + movu m1, [bufyq+xq*4 +16] + movu m2, [bufyq+xq*4+82*2+16] + paddw m1, m2 + phaddw m0, m1 + pmulhrsw m0, m6 + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + REPX {pmaddwd x, m4}, m0, m1 + REPX {psrad x, 5}, m0, m1 + packssdw m0, m1 + pmulhrsw m0, m3 + movu m1, [bufq+xq*2] + paddw m0, m1 + pminsw m0, m14 + pmaxsw m0, m7 cmp xd, 32 je .end - movu [bufq+xq*2], m8 + movu [bufq+xq*2], m0 add xd, 8 jmp .x_loop_ar0 ; last 6 pixels .end: %if cpuflag(sse4) - pblendw m8, m0, 11000000b + pblendw m0, m1, 11000000b %else - pand m0, m12 - pandn m9, m12, m8 - por m8, m0, m9 + pand m1, m12 + pandn m2, m12, m0 + por m0, m1, m2 %endif - movu [bufq+xq*2], m8 + movu [bufq+xq*2], m0 add bufq, 82*2 add bufyq, 82*4 dec hd jg .y_loop_ar0 +%if ARCH_X86_32 +%undef m12 +%undef m14 +%endif RET .ar1: +%if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x +%else +%assign stack_offset stack_offset_old +%xdefine rstk rsp +%assign stack_size_padded 0 + DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 + mov bufyq, r1m + mov uvd, r3m +%endif imul uvd, 28 
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] %if WIN64 DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 lea bufq, [r0-2*(82*38+44-(82*3+41))] -%elif ARCH_X86_64 +%else +%if ARCH_X86_64 DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 - sub bufq, 2*(82*38+44-(82*3+41)) %else - ; x86-32 code - move shift into r1 [ecx] - .. + DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 +%define hd dword r1m +%define mind dword r3m +%define maxd dword r4m %endif + sub bufq, 2*(82*38+44-(82*3+41)) +%endif +%if ARCH_X86_64 mov shiftd, [r2+FGData.ar_coeff_shift] +%else + mov shiftd, [r3+FGData.ar_coeff_shift] +%endif pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 ; cf0-4 in words @@ -672,28 +861,35 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax movd m3, [base+round_vals+shiftq*2-12] ; rnd pxor m6, m6 punpcklwd m3, m6 - SPLATW m6, [hmul_bits+4] + SPLATW m6, [base+hmul_bits+4] SPLATD m3, m3 add bufyq, 2*(79+82*3) mov hd, 35 sar maxd, 1 +%if ARCH_X86_64 mov mind, maxd xor mind, -1 +%else + DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 + mov r2, maxd + xor r2, -1 + mov mind, r2 +%endif .y_loop_ar1: mov xq, -38 movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu m0, [bufq+xq*2-82*2-2] ; top/left - movu m8, [bufyq+xq*4] - movu m9, [bufyq+xq*4+82*2] + movu m7, [bufyq+xq*4] + movu m1, [bufyq+xq*4+82*2] + phaddw m7, m1 psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right - phaddw m8, m9 - pshufd m9, m8, q3232 - paddw m8, m9 - pmulhrsw m8, m6 punpcklwd m0, m2 - punpcklwd m1, m8 + pshufd m2, m7, q3232 + paddw m7, m2 + pmulhrsw m7, m6 + punpcklwd m1, m7 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 @@ -723,24 +919,54 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax add bufyq, 82*4 dec hd jg .y_loop_ar1 +%if ARCH_X86_32 +%undef maxd +%undef mind +%undef hd +%endif RET .ar2: +%if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift + ALLOC_STACK -16*8 + mov bufyq, r1m + mov uvd, r3m +%endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 +%if ARCH_X86_64 sar bdmaxd, 1 - SPLATW m13, bdmaxd ; max_grain - pcmpeqw m14, m14 + SPLATW m5, bdmaxd ; max_grain +%else + SPLATW m5, r4m + psraw m5, 1 +%endif + pcmpeqw m6, m6 %if !cpuflag(sse4) - pcmpeqw m15, m15 - psrldq m15, 14 - pslldq m15, 2 - pxor m15, m14 + pcmpeqw m7, m7 + psrldq m7, 14 + pslldq m7, 2 + pxor m7, m6 %endif - pxor m14, m13 ; min_grain + pxor m6, m5 ; min_grain %if cpuflag(sse4) - SPLATW m15, [hmul_bits+4] + SPLATW m7, [base+hmul_bits+4] +%endif + +%if ARCH_X86_64 + SWAP 5, 13 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] +%define m15 [rsp+7*16] + mova m13, m5 + mova m14, m6 + mova m15, m7 %endif ; coef values @@ -753,13 +979,36 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax pshufd m6, m0, q0000 pshufd m7, m0, q1111 - pshufd m8, m0, q2222 - pshufd m9, m0, q3333 - pshufd m10, m2, q0000 - pshufd m11, m2, q1111 - pshufd m12, m2, q2222 + pshufd m1, m0, q3333 + pshufd m0, m0, q2222 + pshufd m3, m2, q1111 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+0*16] +%define m9 [rsp+1*16] +%define m10 [rsp+2*16] +%define m11 [rsp+3*16] +%define m12 [rsp+4*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif +%if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, 
h, x +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif sub bufq, 2*(82*38+44-(82*3+41)) add bufyq, 2*(79+82*3) mov hd, 35 @@ -801,9 +1050,9 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax %if cpuflag(sse4) pmulhrsw m1, m15 %else - pmulhrsw m1, [pw_8192] + pmulhrsw m1, [base+pw_8192] %endif - punpcklwd m1, [pw_1] + punpcklwd m1, [base+pw_1] pmaddwd m1, m12 paddd m0, m1 @@ -845,9 +1094,15 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax add bufyq, 82*4 dec hd jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m13 +%undef m14 +%undef m15 +%endif RET .ar3: +%if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %if WIN64 mov r6, rsp @@ -856,25 +1111,54 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax %define tmp rsp %else %define tmp rsp+stack_offset-120 +%endif +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 + mov bufyq, r1m + mov uvd, r3m + %define tmp rsp %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 - SPLATW m12, [base+round_vals-12+shiftq*2] - pxor m13, m13 - pcmpgtw m13, m12 - punpcklwd m12, m13 + SPLATW m4, [base+round_vals-12+shiftq*2] + pxor m5, m5 + pcmpgtw m5, m4 + punpcklwd m4, m5 +%if ARCH_X86_64 sar bdmaxd, 1 - SPLATW m14, bdmaxd ; max_grain - pcmpeqw m15, m15 + SPLATW m6, bdmaxd ; max_grain +%else + SPLATW m6, r4m + psraw m6, 1 +%endif + pcmpeqw m7, m7 %if !cpuflag(sse4) - pcmpeqw m11, m11 - psrldq m11, 14 - pslldq m11, 4 - pxor m11, m15 + pcmpeqw m3, m3 + psrldq m3, 14 + pslldq m3, 4 + pxor m3, m7 %endif - pxor m15, m14 ; min_grain + pxor m7, m6 ; min_grain %if cpuflag(sse4) - SPLATW m11, [base+hmul_bits+4] + SPLATW m3, [base+hmul_bits+4] +%endif + +%if ARCH_X86_64 + SWAP 3, 11 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m11 [rsp+ 9*16] +%define m12 [rsp+10*16] +%define m14 [rsp+12*16] +%define m15 [rsp+13*16] + mova m11, m3 + mova m12, m4 + mova m14, m6 + mova m15, m7 %endif ; cf from y=-3,x=-3 until y=-3,x=-2 @@ -883,18 +1167,18 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax pcmpgtb m1, m0 punpckhbw m2, m0, m1 punpcklbw m0, m1 - pshufd m6, m0, q0000 - pshufd m7, m0, q1111 - pshufd m8, m0, q2222 - pshufd m9, m0, q3333 - pshufd m10, m2, q0000 - pshufd m13, m2, q1111 - mova [tmp+16*0], m6 - mova [tmp+16*1], m7 - mova [tmp+16*2], m8 - mova [tmp+16*3], m9 - mova [tmp+16*4], m10 - mova [tmp+16*5], m13 + pshufd m1, m0, q0000 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m0, m0, q3333 + pshufd m5, m2, q0000 + pshufd m6, m2, q1111 + mova [tmp+16*0], m1 + mova [tmp+16*1], m3 + mova [tmp+16*2], m4 + mova [tmp+16*3], m0 + mova [tmp+16*4], m5 + mova [tmp+16*5], m6 pshufd m6, m2, q2222 pshufd m7, m2, q3333 @@ -904,18 +1188,34 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax pcmpgtb m1, m0 punpckhbw m2, m0, m1 ; luma punpcklbw m0, m1 - pshufd m10, m0, q3232 - psrldq m13, m0, 10 + pshufd m3, m0, q3232 + psrldq m5, m0, 10 ; y=0,x=[-3 to -1] + "1.0" for current pixel - pinsrw m13, [base+round_vals-10+shiftq*2], 3 + pinsrw m5, [base+round_vals-10+shiftq*2], 3 ; y=-1,x=[-1 to +2] - pshufd m8, m0, q0000 - pshufd m9, m0, q1111 + pshufd m1, m0, q0000 + pshufd m0, m0, q1111 ; y=-1,x=+3 + luma - punpcklwd m10, m2 - pshufd m10, m10, q0000 + punpcklwd m3, m2 + pshufd m3, m3, q0000 - DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if ARCH_X86_64 + SWAP 1, 8 + SWAP 0, 9 + SWAP 3, 10 + SWAP 5, 13 + DEFINE_ARGS buf, bufy, 
fg_data, h, x +%else +%define m8 [rsp+ 6*16] +%define m9 [rsp+ 7*16] +%define m10 [rsp+ 8*16] +%define m13 [rsp+11*16] + mova m8, m1 + mova m9, m0 + mova m10, m3 + mova m13, m5 + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif sub bufq, 2*(82*38+44-(82*3+41)) add bufyq, 2*(79+82*3) mov hd, 35 @@ -969,7 +1269,7 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax %if cpuflag(sse4) pmulhrsw m5, m11 %else - pmulhrsw m5, [pw_8192] + pmulhrsw m5, [base+pw_8192] %endif punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] @@ -1020,9 +1320,19 @@ cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax jg .y_loop_ar3 %if WIN64 mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 %endif RET +%if ARCH_X86_64 INIT_XMM ssse3 cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut mov r7d, [fg_dataq+FGData.scaling_shift] From 16fad1adc5c9650306b410e4bca57a36a8541906 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 12 Jul 2021 12:33:55 +0200 Subject: [PATCH 139/188] x86: Add high bitdepth ipred_filter SSSE3 asm --- src/x86/ipred16_sse.asm | 145 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/src/x86/ipred16_sse.asm b/src/x86/ipred16_sse.asm index 48d797b27a..eaa56b67bc 100644 --- a/src/x86/ipred16_sse.asm +++ b/src/x86/ipred16_sse.asm @@ -28,6 +28,7 @@ SECTION_RODATA +filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 pb_0_1: times 4 db 0, 1 @@ -65,6 +66,7 @@ JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc +cextern filter_intra_taps SECTION .text @@ -887,6 +889,149 @@ cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ %endif RET +%if ARCH_X86_64 +cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter +%else +cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%endif +%define base r6-$$ + movifnidn hd, hm + movd m6, r8m ; bitdepth_max +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + LEA r6, $$ + shl filterd, 6 + movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 + mova m1, [base+filter_intra_taps+filterq+16*0] + mova m2, [base+filter_intra_taps+filterq+16*1] + mova m3, [base+filter_intra_taps+filterq+16*2] + mova m4, [base+filter_intra_taps+filterq+16*3] + pxor m5, m5 +%if ARCH_X86_64 + punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper + punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid + punpcklbw m10, m5, m2 ; having to perform sign-extension. 
+ punpckhbw m11, m5, m2 + punpcklbw m12, m5, m3 + punpckhbw m13, m5, m3 + punpcklbw m14, m5, m4 + punpckhbw m15, m5, m4 +%else + punpcklbw m7, m5, m1 + mova m8, m7 + punpckhbw m7, m5, m1 + mova m9, m7 + punpcklbw m7, m5, m2 + mova m10, m7 + punpckhbw m7, m5, m2 + mova m11, m7 + punpcklbw m7, m5, m3 + mova m12, m7 + punpckhbw m7, m5, m3 + mova m13, m7 + punpcklbw m7, m5, m4 + mova m14, m7 + punpckhbw m7, m5, m4 + mova m15, m7 +%endif + mova m7, [base+filter_shuf] + add hd, hd + mov r5, dstq + pshuflw m6, m6, q0000 + mov r6, tlq + punpcklqdq m6, m6 + sub tlq, hq +.left_loop: + pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ + pshufd m1, m0, q0000 + pmaddwd m2, m8, m1 + pmaddwd m1, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + paddd m2, m3 + paddd m1, m4 + pshufd m4, m0, q2222 + pmaddwd m3, m12, m4 + pmaddwd m4, m13 + paddd m2, m3 + paddd m1, m4 + pshufd m3, m0, q3333 + pmaddwd m0, m14, m3 + pmaddwd m3, m15 + paddd m0, m2 + paddd m1, m3 + psrad m0, 11 ; x >> 3 + psrad m1, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 ; (x + 8) >> 4 + pminsw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movlps m0, [tlq+hq-10] + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .left_loop + sub wd, 4 + jz .end + sub tld, r6d ; -h*2 + sub r6, r5 ; tl-dst +.right_loop0: + add r5, 8 + mov hd, tld + movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ + mov dstq, r5 +.right_loop: + pshufd m2, m0, q0000 + pmaddwd m1, m8, m2 + pmaddwd m2, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + pinsrw m0, [dstq+strideq*0-2], 5 + paddd m1, m3 + paddd m2, m4 + pshufd m0, m0, q2222 + movddup m4, [dstq+strideq*1-8] + pmaddwd m3, m12, m0 + pmaddwd m0, m13 + paddd m1, m3 + paddd m0, m2 + pshuflw m2, m4, q3333 + punpcklwd m2, m5 + pmaddwd m3, m14, m2 + pmaddwd m2, m15 + paddd m1, m3 + paddd m0, m2 + psrad m1, 11 + psrad m0, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 + pminsw m0, m6 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + palignr m0, m4, 14 + lea dstq, [dstq+strideq*2] + add hd, 2*2 + jl .right_loop + sub wd, 4 + jg .right_loop0 +.end: + RET + %if UNIX64 DECLARE_REG_TMP 7 %else From cc923e1ef7662591741cdeb3dd0a7d8cee7f95d9 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Tue, 13 Jul 2021 16:16:24 +0000 Subject: [PATCH 140/188] x86/filmgrain: make fgy_32x32xn HBD/SSSE3 32bit-compatible --- src/x86/film_grain16_sse.asm | 508 +++++++++++++++++++++++++++++------ 1 file changed, 431 insertions(+), 77 deletions(-) diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm index fc572b04a9..8a1e1d014b 100644 --- a/src/x86/film_grain16_sse.asm +++ b/src/x86/film_grain16_sse.asm @@ -1332,60 +1332,155 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %endif RET -%if ARCH_X86_64 +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] + + mov r0m, r0 + mov r2m, r1 + mov r4m, r2 + mov r6m, r3 + mov r7m, r4 + mov r8m, r5 +%else +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov scalingq, r5m + mov fg_dataq, r3m +%if STACK_ALIGNMENT < mmsize + mov r6, r9m + +%define r9m [rsp+8*mmsize+ 4*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + + mov r9m, r6 +%endif + LEA r5, $$ +%define base r5-$$ + mov r5m, picptrq +%else cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut - mov r7d, [fg_dataq+FGData.scaling_shift] lea r8, [pb_mask] %define base r8-pb_mask - SPLATW m11, [base+mul_bits+r7*2-14] +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - mov r9d, r9m ; bdmax - sar r9d, 11 ; is_12bpc - inc r9d - mov r10d, r6d - imul r10d, r9d - dec r9d - SPLATW m13, [base+min+r10*2] - lea r9d, [r9d*3] - lea r9d, [r6d*2+r9d] - SPLATW m12, [base+max+r9*2] - SPLATW m10, r9m - - pcmpeqw m9, m9 - psraw m7, m10, 1 ; max_grain - pxor m9, m7 ; min_grain +%if ARCH_X86_32 + DECLARE_REG_TMP 0, 3 +%else + DECLARE_REG_TMP 9, 10 +%endif + mov t0d, r9m ; bdmax + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t0d, [t0d*3] + lea t0d, [r6d*2+t0d] + SPLATW m4, [base+max+t0*2] + SPLATW m2, r9m + + pcmpeqw m1, m1 + psraw m7, m2, 1 ; max_grain + pxor m1, m7 ; min_grain + SPLATD m6, [base+pd_16] + + SCRATCH 1, 9, 0 + SCRATCH 2, 10, 1 + SCRATCH 3, 11, 2 + SCRATCH 4, 12, 3 + SCRATCH 5, 13, 4 + SCRATCH 6, 14, 5 + SCRATCH 7, 15, 6 + %if !cpuflag(sse4) pcmpeqw m6, m6 pslldq m6, 4 %endif - SPLATD m14, [pd_16] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 + DECLARE_REG_TMP 0 +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ sby, see + DECLARE_REG_TMP 7 +%endif - movifnidn sbyd, sbym + mov sbyd, r8m + movzx t0d, byte [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz .no_vertical_overlap test sbyd, sbyd - setnz r7b - test r7b, byte [fg_dataq+FGData.overlap_flag] jnz 
.vertical_overlap - mov dword sbym, 0 +.no_vertical_overlap: + mov dword r8m, t0d +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else imul seed, sbyd, (173 << 24) | 37 +%endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, src_bak +%endif lea src_bakq, [srcq+wq*2] mov r9mp, src_bakq neg wq - sub dstq, srcq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif .loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 @@ -1393,22 +1488,34 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak mov offyd, seed mov offxd, seed +%endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak +%endif .loop_x_odd: - movzx hd, word hm + movzx hd, word r7m mov grain_lutq, grain_lutmp .loop_y: ; src @@ -1416,8 +1523,13 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pand m1, m10, [srcq+16] ; m0-1: src as word ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 +%else vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 +%endif REPX {psrlw x, 8}, m2, m3 ; grain = grain_lut[offy+y][offx+x] @@ -1436,38 +1548,67 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 - add srcq, strideq + add srcq, r2mp ; src += stride add grain_lutq, 82*2 dec hd jg .loop_y +%if ARCH_X86_32 + add r4mp, 16 +%else add wq, 16 +%endif jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] - btc dword hm, 16 +%endif + btc dword r8m, 2 jc .next_blk add offxyd, 16 - cmp dword r8m, 0 - je .loop_x_odd - SPLATD m15, [pw_27_17_17_27] + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + SPLATD m7, [pw_27_17_17_27] add r12d, 16 ; top_offxy += 16 +%endif jmp .loop_x_odd_v_overlap .next_blk: - cmp byte [fg_dataq+FGData.overlap_flag], 0 - je .loop_x + test dword r8m, 1 + jz .loop_x ; r8m = sbym - movq m15, [pw_27_17_17_27] - cmp dword r8m, 0 - jne .loop_x_hv_overlap +%if ARCH_X86_32 + mov r5, r5m + movq m7, [base+pw_27_17_17_27] +%else + movq m7, [pw_27_17_17_27] +%endif + test dword r8m, 2 + jnz .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + mov seed, r3m +%endif + mov r6d, seed or seed, 0xEFF4 shr r6d, 1 @@ -1475,29 +1616,47 @@ cglobal 
fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offyd, seed mov offxd, seed +%endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy +%endif - movzx hd, word hm + mov hd, dword r7m mov grain_lutq, grain_lutmp .loop_y_h_overlap: ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m5, [grain_lutq+r5*2] +%else movd m5, [grain_lutq+left_offxyq*2] +%endif punpcklwd m5, m4 - pmaddwd m5, m15 + pmaddwd m5, m7 paddd m5, m14 psrad m5, 5 packssdw m5, m5 @@ -1508,7 +1667,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pandn m0, m6, m5 por m4, m0 %endif - pminsw m4, m7 + pminsw m4, m15 pmaxsw m4, m9 ; src @@ -1516,8 +1675,13 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pand m1, m10, [srcq+16] ; m0-1: src as word ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 +%else vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 +%endif REPX {psrlw x, 8}, m2, m3 ; noise = round2(scaling[src] * grain, scaling_shift) @@ -1533,76 +1697,131 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 - add srcq, strideq + add srcq, r2mp add grain_lutq, 82*2 dec hd jg .loop_y_h_overlap +%if ARCH_X86_32 + add r4mp, 16 +%else add wq, 16 +%endif jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] - or dword hm, 0x10000 +%endif + or dword r8m, 4 add offxyd, 16 ; r8m = sbym - cmp dword r8m, 0 - je .loop_x_odd - SPLATD m15, [pw_27_17_17_27] + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + SPLATD m7, [pw_27_17_17_27] add r12d, 16 ; top_offxy += 16 +%endif jmp .loop_x_odd_v_overlap .end: RET .vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ sby, see +%endif movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused +%else imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 + add t0d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff + and t0d, 0x00ff00ff and sbyd, 0xff00ff00 - xor seed, r7d + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else xor seed, sbyd ; 
(cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, src_bak +%endif lea src_bakq, [srcq+wq*2] mov r9mp, src_bakq neg wq - sub dstq, srcq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif .loop_x_v_overlap: - SPLATD m15, [pw_27_17_17_27] +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + mov seed, r3m +%else + SPLATD m7, [pw_27_17_17_27] +%endif ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh - setp r7b ; parity of top_seed + setp t0b ; parity of top_seed shr seed, 16 - shl r7d, 16 + shl t0d, 16 test seeb, seeh - setp r7b ; parity of cur_seed + setp t0b ; parity of cur_seed or r6d, 0x00010001 - xor r7d, r6d - mov seed, r7d + xor t0d, r6d + mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, unused, top_offxy mov offyd, seed mov offxd, seed +%endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f @@ -1611,36 +1830,54 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, unused, top_offxy +%endif movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif shr offxyd, 16 .loop_x_odd_v_overlap: - movzx hd, word hm + mov hd, dword r7m mov grain_lutq, grain_lutmp .loop_y_v_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+1*gprsize] + movu m2, [grain_lutq+r5*2] +%else movu m2, [grain_lutq+top_offxyq*2] +%endif punpckhwd m4, m2, m3 punpcklwd m2, m3 - REPX {pmaddwd x, m15}, m4, m2 + REPX {pmaddwd x, m7}, m4, m2 REPX {paddd x, m14}, m4, m2 REPX {psrad x, 5}, m4, m2 packssdw m2, m4 - pminsw m2, m7 + pminsw m2, m15 pmaxsw m2, m9 movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m3, [grain_lutq+r5*2+16] +%else movu m3, [grain_lutq+top_offxyq*2+16] +%endif punpckhwd m5, m3, m4 punpcklwd m3, m4 - REPX {pmaddwd x, m15}, m5, m3 + REPX {pmaddwd x, m7}, m5, m3 REPX {paddd x, m14}, m5, m3 REPX {psrad x, 5}, m5, m3 packssdw m3, m5 - pminsw m3, m7 + pminsw m3, m15 pmaxsw m3, m9 ; src @@ -1649,11 +1886,19 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra ; scaling[src] ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 +%endif psrlw m4, 8 pmullw m4, m11 pmulhrsw m4, m2 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 +%else vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 +%endif psrlw m5, 8 pmullw m5, m11 pmulhrsw m5, m3 @@ -1665,11 +1910,17 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 - SPLATD m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line - add srcq, strideq +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, 
[base+pw_27_17_17_27+4] +%else + SPLATD m7, [pw_27_17_17_27+4] ; swap weights for second v-overlap line +%endif + add srcq, r2mp add grain_lutq, 82*2 dec hw jz .end_y_v_overlap @@ -1681,15 +1932,31 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra jmp .loop_y .end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else add wq, 16 +%endif jge .end_hv +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] - btc dword hm, 16 +%endif + btc dword r8m, 2 jc .next_blk_v - SPLATD m15, [pw_27_17_17_27] - add offxyd, 16 +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + SPLATD m7, [pw_27_17_17_27] add top_offxyd, 16 +%endif + add offxyd, 16 jmp .loop_x_odd_v_overlap .next_blk_v: @@ -1697,24 +1964,55 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap - movq m15, [pw_27_17_17_27] +%if ARCH_X86_32 + mov r5, r5m + movq m7, [base+pw_27_17_17_27] +%else + movq m7, [pw_27_17_17_27] +%endif + .loop_x_hv_overlap: +%if ARCH_X86_32 + mov r5, r5m + SPLATD m0, [base+pw_27_17_17_27] + mova [rsp+7*mmsize], m0 +%define m8 [rsp+7*mmsize] + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r0, [rsp+8*mmsize+1*gprsize] + add r3, 16 + add r0, 16 + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy + + mov seed, r3m + xor r0, r0 +%else SPLATD m8, [pw_27_17_17_27] ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh - setp r7b ; parity of top_seed + setp t0b ; parity of top_seed shr seed, 16 - shl r7d, 16 + shl t0d, 16 test seeb, seeh - setp r7b ; parity of cur_seed + setp t0b ; parity of cur_seed or r6d, 0x00010001 - xor r7d, r6d - mov seed, r7d + xor t0d, r6d + mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy @@ -1722,6 +2020,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed +%endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f @@ -1730,28 +2029,46 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy +%endif movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif shr offxyd, 16 - movzx hd, word hm + movzx hd, word r7m mov grain_lutq, grain_lutmp .loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + movu m5, [grain_lutq+r0*2] + movd m4, [grain_lutq+r5*2] + mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy + movd m2, [grain_lutq+r5*2] +%else movu m5, [grain_lutq+top_offxyq*2] movd m4, 
[grain_lutq+left_offxyq*2] movd m2, [grain_lutq+topleft_offxyq*2] +%endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklwd m4, m3 punpcklwd m2, m5 - REPX {pmaddwd x, m15}, m4, m2 + REPX {pmaddwd x, m7}, m4, m2 REPX {paddd x, m14}, m4, m2 REPX {psrad x, 5}, m4, m2 REPX {packssdw x, x}, m4, m2 - REPX {pminsw x, m7}, m4, m2 + REPX {pminsw x, m15}, m4, m2 REPX {pmaxsw x, m9}, m4, m2 %if cpuflag(sse4) pblendw m3, m4, 00000011b @@ -1766,7 +2083,11 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra %endif ; followed by v interpolation (top | cur -> cur) movu m0, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m1, [grain_lutq+r0*2+16] +%else movu m1, [grain_lutq+top_offxyq*2+16] +%endif punpcklwd m2, m5, m3 punpckhwd m5, m3 punpcklwd m3, m1, m0 @@ -1776,7 +2097,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra REPX {psrad x, 5}, m2, m5, m3, m1 packssdw m2, m5 packssdw m3, m1 - REPX {pminsw x, m7}, m2, m3 + REPX {pminsw x, m15}, m2, m3 REPX {pmaxsw x, m9}, m2, m3 ; src @@ -1785,11 +2106,19 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra ; scaling[src] ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 +%endif psrlw m4, 8 pmullw m4, m11 pmulhrsw m2, m4 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 +%else vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 +%endif psrlw m5, 8 pmullw m5, m11 pmulhrsw m3, m5 @@ -1801,11 +2130,18 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 +%if ARCH_X86_32 + mov r5, r5m + SPLATD m0, [base+pw_27_17_17_27+4] + mova m8, m0 +%else SPLATD m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line - add srcq, strideq +%endif + add srcq, r2mp add grain_lutq, 82*2 dec hw jz .end_y_hv_overlap @@ -1817,19 +2153,37 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra jmp .loop_y_h_overlap .end_y_hv_overlap: - or dword hm, 0x10000 + or dword r8m, 4 +%if ARCH_X86_32 + add r4mp, 16 +%else add wq, 16 +%endif jge .end_hv - SPLATD m15, [pw_27_17_17_27] +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + add offxyd, 16 + add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + SPLATD m7, [pw_27_17_17_27] add offxyd, 16 add top_offxyd, 16 mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] +%endif jmp .loop_x_odd_v_overlap .end_hv: RET +%if ARCH_X86_32 + DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +%endif +%if ARCH_X86_64 cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r8-pb_mask From 3aae8d6ff0ad403c17a07613640c3af634063bc1 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Tue, 13 Jul 2021 23:24:25 +0000 Subject: [PATCH 141/188] x86/filmgrain: make fguv_i420_32x32xn HBD/SSSE3 32bit-compatible --- src/x86/film_grain16_sse.asm | 612 +++++++++++++++++++++++++++++------ 1 file changed, 520 insertions(+), 92 deletions(-) diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm index 8a1e1d014b..05bdd03c84 100644 --- a/src/x86/film_grain16_sse.asm +++ b/src/x86/film_grain16_sse.asm @@ -1341,6 +1341,13 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %endif %endmacro +%if ARCH_X86_32 +%undef base +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + INIT_XMM ssse3 %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize @@ -2183,39 +2190,118 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 %endif -%if ARCH_X86_64 +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fguv_32x32xn_i420_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r1m + mov r2, r2m + mov r4, r3m + mov r3, r4m + mov r5, r5m +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r1m [rsp+8*mmsize+ 4*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + mov r0m, r0 + mov r2m, r2 + mov r4m, r3 + mov r5m, r5 + + mov r0, r6m + mov r2, r7m + mov r3, r8m + mov r5, r9m +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] +%define r9m [rsp+8*mmsize+12*gprsize] + mov r6m, r0 + mov r7m, r2 + mov r8m, r3 + mov r9m, r5 + + mov r2, r10m + mov r3, r11m + mov r5, r12m + mov r0, r13m +%define r10m [rsp+8*mmsize+13*gprsize] +%define r11m [rsp+8*mmsize+14*gprsize] +%define r12m [rsp+8*mmsize+15*gprsize] + mov r10m, r2 + mov r11m, r3 + mov r12m, r5 + + SPLATW m2, r13m +%else +cglobal fguv_32x32xn_i420_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused + mov srcq, srcm + mov fg_dataq, r3m +%endif + LEA r5, $$ +%define base r5-$$ + + DECLARE_REG_TMP 0, 2, 3 +%else cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r8-pb_mask lea r8, [pb_mask] - mov r7d, [fg_dataq+FGData.scaling_shift] - SPLATW m11, [base+mul_bits+r7*2-14] + + DECLARE_REG_TMP 9, 10, 11 +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - mov r9d, r13m ; bdmax - sar r9d, 11 ; is_12bpc - inc r9d - mov r10d, r6d - imul r10d, r9d - dec r9d - SPLATW m13, [base+min+r10*2] - lea r10d, [r9d*3] - mov r11d, is_idm - inc r11d - imul r6d, r11d - add r10d, r6d - SPLATW m12, [base+max+r10*2] - SPLATW m10, r13m +%if STACK_ALIGNMENT >= mmsize + mov t0d, r13m ; bdmax +%endif + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t1d, [t0d*3] + mov t2d, r12m + inc t2d + imul r6d, t2d + add t1d, r6d + SPLATW m4, [base+max+t1*2] +%if STACK_ALIGNMENT >= mmsize + SPLATW m2, r13m +%endif + + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + %if cpuflag(sse4) pxor m2, m2 %define mzero m2 %else %define mzero m7 %endif +%if ARCH_X86_32 + mov scalingq, r5m + mov r5m, r5 +%else mov r13mp, strideq +%endif + + pcmpeqw m0, m0 + psraw m1, m10, 1 + pxor m0, m1 + + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 
- pcmpeqw m8, m8 - psraw m9, m10, 1 - pxor m8, m9 %if !cpuflag(sse4) pcmpeqw m2, m2 pslldq m2, 2 @@ -2225,47 +2311,95 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin jne .csfl %macro FGUV_32x32xN_LOOP 1 ; not-csfl +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap + + DECLARE_REG_TMP 0 +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + DECLARE_REG_TMP 9 +%endif + %if %1 - mov r7d, r11m - SPLATW m0, [fg_dataq+FGData.uv_mult+r7*4] - SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r7*4] - punpcklwd m14, m1, m0 - SPLATW m15, [fg_dataq+FGData.uv_offset+r7*4] - SPLATD m7, [base+pw_4+r9*4] - pmullw m15, m7 + mov r6d, r11m + SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] + SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklwd m6, m1, m0 + SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] + SPLATD m7, [base+pw_4+t0*4] + pmullw m5, m7 %else - SPLATD m14, [pd_16] - SPLATD m15, [pw_23_22] + SPLATD m6, [base+pd_16] + SPLATD m5, [base+pw_23_22] %endif - movifnidn sbyd, sbym + SCRATCH 6, 14, 6 + SCRATCH 5, 15, 7 + +%if ARCH_X86_32 + DECLARE_REG_TMP 0 +%else + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + mov t0d, [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz %%no_vertical_overlap test sbyd, sbyd - setnz r7b - test r7b, byte [fg_dataq+FGData.overlap_flag] jnz %%vertical_overlap +%%no_vertical_overlap: + mov r8m, t0d +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else imul seed, sbyd, (173 << 24) | 37 +%endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] +%if ARCH_X86_32 + mov r3m, seed + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*4] + mov r12mp, r3 + shl r10mp, 1 +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused2, unused3, see, unused4, unused5, unused6, luma, lstride - mov lumaq, r9mp mov lstrideq, r10mp + mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*4] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 +%endif neg wq +%if ARCH_X86_32 + mov r4mp, wq +%endif %%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed or seed, 0xEFF4 shr r6d, 1 @@ -2273,21 +2407,33 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, unused2, unused3, luma, lstride mov offxd, seed mov offyd, seed +%endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 82 lea offyq, [offyq+offxq+498] ; offy*stride+offx +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride +%endif - mov hd, hm + mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: ; src @@ -2298,10 +2444,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if !cpuflag(sse4) pxor mzero, mzero %endif - mova m4, [lumaq+lstrideq*0+ 0] - mova m6, [lumaq+lstrideq*0+32] - phaddw m4, [lumaq+lstrideq*0+16] - phaddw m6, [lumaq+lstrideq*0+48] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, 
offxy, h, luma, grain_lut + + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+32] + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif pavgw m4, mzero pavgw m6, mzero @@ -2322,8 +2477,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif REPX {psrlw x, 8}, m3, m5 ; grain = grain_lut[offy+y][offx+x] @@ -2342,33 +2502,62 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else add srcq, r13mp add dstq, r13mp lea lumaq, [lumaq+lstrideq*2] +%endif add grain_lutq, 82*2 - dec hb + dec hd jg %%loop_y +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma + + mov wq, r4mp +%endif add wq, 16 jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else mov srcq, r10mp +%endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*4] - cmp byte [fg_dataq+FGData.overlap_flag], 0 +%if ARCH_X86_32 + mov r0m, dstq + mov r9m, lumaq + mov r4m, wq +%endif + test dword r8m, 1 je %%loop_x ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap + test dword r8m, 2 + jnz %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 @@ -2376,22 +2565,34 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx mov offxd, seed mov offyd, seed +%endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 82 lea offyq, [offyq+offxq+498] ; offy*stride+offx +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride +%endif - mov hd, hm + mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_h_overlap: mova m0, [srcq] @@ -2401,10 +2602,18 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if !cpuflag(sse4) pxor mzero, mzero %endif - mova m4, [lumaq+lstrideq*0+ 0] - mova m6, [lumaq+lstrideq*0+32] - phaddw m4, [lumaq+lstrideq*0+16] - phaddw m6, [lumaq+lstrideq*0+48] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+32] + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif pavgw m4, mzero pavgw m6, mzero @@ -2426,11 +2635,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ; grain = grain_lut[offy+y][offx+x] movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, 
[rsp+8*mmsize+0*gprsize] + movd m5, [grain_lutq+r5*2] +%else movd m5, [grain_lutq+left_offxyq*2+ 0] +%endif punpcklwd m5, m7 ; {left0, cur0} %if %1 - pmaddwd m5, [pw_23_22] - paddd m5, [pd_16] +%if ARCH_X86_32 + mov r5, r5m +%endif + pmaddwd m5, [PIC_ptr(pw_23_22)] + paddd m5, [PIC_ptr(pd_16)] %else pmaddwd m5, m15 paddd m5, m14 @@ -2449,8 +2666,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin movu m3, [grain_lutq+offxyq*2+16] ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 +%else vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 +%endif REPX {psrlw x, 8}, m7, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) @@ -2465,27 +2687,47 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else add srcq, r13mp add dstq, r13mp lea lumaq, [lumaq+lstrideq*2] +%endif add grain_lutq, 82*2 - dec hb + dec hd jg %%loop_y_h_overlap +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + mov wq, r4mp +%endif add wq, 16 jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else mov srcq, r10mp +%endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*4] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif ; r8m = sbym - cmp dword r8m, 0 + test dword r8m, 2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap @@ -2493,53 +2735,99 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin RET %%vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, unused1, unused2, unused3, lstride +%endif movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 + add t0d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff + and t0d, 0x00ff00ff and sbyd, 0xff00ff00 - xor seed, r7d + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov r3m, seed + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*4] + mov r12mp, r3 + shl r10mp, 1 +%else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, unused3, unused4, unused5, luma, lstride - mov lumaq, r9mp mov lstrideq, r10mp + mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*4] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 +%endif neg wq +%if ARCH_X86_32 + mov r4m, wq +%endif %%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor t0d, t0d +%else ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh - setp r7b ; parity of top_seed + setp t0b ; parity of top_seed shr seed, 16 - shl r7d, 16 + shl t0d, 16 test seeb, seeh - setp r7b ; parity of 
cur_seed + setp t0b ; parity of cur_seed or r6d, 0x00010001 - xor r7d, r6d - mov seed, r7d + xor t0d, r6d + mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, top_offxy, unused2, luma, lstride mov offyd, seed mov offxd, seed +%endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f @@ -2548,23 +2836,38 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq+0x10001*498+16*82] +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, top_offxy, unused2, luma, lstride - +%endif movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif shr offxyd, 16 - mov hd, hm + mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_v_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy + movu m5, [grain_lutq+r0*2] +%else movu m5, [grain_lutq+top_offxyq*2] +%endif punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 - REPX {paddd x, [pd_16]}, m7, m5 +%if ARCH_X86_32 + mov r5, r5m +%endif + REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5 + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else REPX {pmaddwd x, m15}, m7, m5 REPX {paddd x, m14}, m7, m5 @@ -2576,12 +2879,16 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m5, [grain_lutq+r0*2+16] +%else movu m5, [grain_lutq+top_offxyq*2+16] +%endif punpckhwd m7, m5, m4 punpcklwd m5, m4 ; {top/cur interleaved} %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 - REPX {paddd x, [pd_16]}, m7, m5 + REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5 + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else REPX {pmaddwd x, m15}, m7, m5 REPX {paddd x, m14}, m7, m5 @@ -2599,10 +2906,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if !cpuflag(sse4) pxor mzero, mzero %endif - mova m5, [lumaq+lstrideq*0+ 0] - mova m6, [lumaq+lstrideq*0+32] - phaddw m5, [lumaq+lstrideq*0+16] - phaddw m6, [lumaq+lstrideq*0+48] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m5, [lumaq+ 0] + mova m6, [lumaq+32] + phaddw m5, [lumaq+16] + phaddw m6, [lumaq+48] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif pavgw m5, mzero pavgw m6, mzero @@ -2628,8 +2944,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif REPX {psrlw x, 8}, m7, m5 ; noise = round2(scaling[luma_src] * grain, scaling_shift) @@ -2644,46 +2965,88 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 - dec hb + dec hd jle %%end_y_v_overlap +%if 
ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else add srcq, r13mp add dstq, r13mp lea lumaq, [lumaq+lstrideq*2] +%endif add grain_lutq, 82*2 jmp %%loop_y %%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif add wq, 16 jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else mov srcq, r10mp +%endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*4] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut + + mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy + add offxyd, 16 + add t0d, 16 + mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd + mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m + xor t0d, t0d +%else ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh - setp r7b ; parity of top_seed + setp t0b ; parity of top_seed shr seed, 16 - shl r7d, 16 + shl t0d, 16 test seeb, seeh - setp r7b ; parity of cur_seed + setp t0b ; parity of cur_seed or r6d, 0x00010001 - xor r7d, r6d - mov seed, r7d + xor t0d, r6d + mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride @@ -2691,6 +3054,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed +%endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f @@ -2699,25 +3063,48 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq+0x10001*498+16*82] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy +%else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride - +%endif movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif shr offxyd, 16 - mov hd, hm + mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + movd m5, [grain_lutq+r5*2] +%else movd m5, [grain_lutq+left_offxyq*2] - pinsrw m5, [grain_lutq+topleft_offxyq*2], 1 ; { left, top/left } +%endif movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+2*gprsize] + movu m4, [grain_lutq+r0*2] + pinsrw m5, [grain_lutq+r5*2], 1 +%else movu m4, [grain_lutq+top_offxyq*2] + pinsrw m5, [grain_lutq+topleft_offxyq*2], 1 ; { left, top/left } +%endif punpcklwd m7, m3, m4 ; { cur0, top0 } punpcklwd m5, m7 ; { cur/left } interleaved %if %1 - pmaddwd m5, [pw_23_22] - paddd m5, [pd_16] +%if ARCH_X86_32 + mov r5, r5m +%endif + pmaddwd m5, [PIC_ptr(pw_23_22)] + paddd m5, [PIC_ptr(pd_16)] %else pmaddwd m5, m15 paddd m5, m14 @@ -2743,8 +3130,8 @@ cglobal 
fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 - REPX {paddd x, [pd_16]}, m5, m7 + REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5 + REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 %else REPX {pmaddwd x, m15}, m7, m5 REPX {paddd x, m14}, m5, m7 @@ -2756,12 +3143,16 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ; right half movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m0, [grain_lutq+r0*2+16] +%else movu m0, [grain_lutq+top_offxyq*2+16] +%endif punpckhwd m1, m0, m4 punpcklwd m0, m4 ; {top/cur interleaved} %if %1 - REPX {pmaddwd x, [pw_23_22]}, m1, m0 - REPX {paddd x, [pd_16]}, m1, m0 + REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m1, m0 + REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 %else REPX {pmaddwd x, m15}, m1, m0 REPX {paddd x, m14}, m1, m0 @@ -2779,10 +3170,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if !cpuflag(sse4) pxor mzero, mzero %endif - mova m6, [lumaq+lstrideq*0+ 0] - mova m5, [lumaq+lstrideq*0+32] - phaddw m6, [lumaq+lstrideq*0+16] - phaddw m5, [lumaq+lstrideq*0+48] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m6, [lumaq+ 0] + mova m5, [lumaq+32] + phaddw m6, [lumaq+16] + phaddw m5, [lumaq+48] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif pavgw m6, mzero pavgw m5, mzero @@ -2808,8 +3208,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 + vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 +%else vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 +%endif REPX {psrlw x, 8}, m7, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) @@ -2824,25 +3229,46 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 + movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else add srcq, r13mp add dstq, r13mp lea lumaq, [lumaq+lstrideq*2] +%endif add grain_lutq, 82*2 - dec hb + dec hd jg %%loop_y_h_overlap %%end_y_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif add wq, 16 jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else mov srcq, r10mp +%endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*4] +%if ARCH_X86_32 + mov dstmp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif jmp %%loop_x_hv_overlap %%end_hv: @@ -2853,4 +3279,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin .csfl: FGUV_32x32xN_LOOP 0 -%endif ; ARCH_X86_64 +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif From 7237a27adb1e24b17ee3e9fbd55ac9555601a775 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Wed, 14 Jul 2021 13:18:33 -0400 Subject: [PATCH 142/188] x86/filmgrain: add generate_grain_uv_i422/i444 HBD AVX2 & SSSE3 --- src/x86/film_grain16_avx2.asm | 166 ++++++++++++++++++++++++++++------ src/x86/film_grain16_sse.asm | 152 ++++++++++++++++++++++++------- 2 files changed, 258 insertions(+), 60 deletions(-) diff --git a/src/x86/film_grain16_avx2.asm b/src/x86/film_grain16_avx2.asm index 6f4a4aa5fb..ea5611d4bc 100644 --- a/src/x86/film_grain16_avx2.asm +++ b/src/x86/film_grain16_avx2.asm @@ -62,6 +62,8 @@ pw_16: times 2 dw 16 JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -403,8 +405,9 @@ cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax %endif RET +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 -cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax +cglobal generate_grain_uv_%1_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax %define base r8-pb_mask lea r8, [pb_mask] movifnidn bdmaxd, bdmaxm @@ -422,10 +425,15 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pxor xm0, xm9 vpbroadcastd xm9, [base+pd_m65536] lea r6, [gaussian_sequence] - mov r7d, 38 +%if %2 + mov r7d, 73-35*%3 add bufq, 44*2 .loop_y: mov r5, -44 +%else + mov r5, -82*73 + add bufq, 2*82*73 +%endif .loop_x: pand xm2, xm0, xm1 psrlw xm3, xm2, 10 @@ -451,14 +459,16 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax movq [bufq+r5*2], xm2 add r5, 4 jl .loop_x +%if %2 add bufq, 82*2 dec r7d jg .loop_y +%endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_420_16bpc_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_420_16bpc_avx2_table] + movsxd r5, [base+generate_grain_uv_%1_16bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_avx2_table] jmp r5 .ar0: @@ -473,40 +483,61 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pcmpeqw m7, m7 vpbroadcastw m14, xm14 ; max_gain pxor m7, m14 ; min_grain - DEFINE_ARGS buf, bufy, h + DEFINE_ARGS buf, bufy, h, x pmovsxbw xm4, xm4 - vpbroadcastw m6, [hmul_bits+4] +%if %2 + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif vpbroadcastw m4, xm4 pxor m5, m5 - sub bufq, 2*(82*38+82-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif add bufyq, 2*(3+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar0: +%if %2 ; first 32 pixels movu xm8, [bufyq] - movu xm9, [bufyq+82*2] movu xm10, [bufyq+ 16] +%if %3 + movu xm9, [bufyq+82*2] movu xm11, [bufyq+82*2+16] +%endif vinserti128 m8, [bufyq+ 32], 1 - vinserti128 m9, [bufyq+82*2+32], 1 vinserti128 m10, [bufyq+ 48], 1 +%if %3 + vinserti128 m9, [bufyq+82*2+32], 1 vinserti128 m11, [bufyq+82*2+48], 1 paddw m8, m9 paddw m10, m11 +%endif phaddw m8, m10 movu xm10, [bufyq+ 64] - movu xm11, [bufyq+82*2+64] movu xm12, [bufyq+ 80] +%if %3 + movu xm11, [bufyq+82*2+64] movu xm13, [bufyq+82*2+80] +%endif vinserti128 m10, [bufyq+ 96], 1 - vinserti128 m11, [bufyq+82*2+96], 1 vinserti128 m12, [bufyq+ 112], 1 +%if %3 + vinserti128 m11, [bufyq+82*2+96], 1 vinserti128 m13, [bufyq+82*2+112], 1 paddw m10, m11 paddw m12, m13 +%endif phaddw m10, m12 pmulhrsw m8, m6 pmulhrsw m10, m6 +%else + xor xd, xd +.x_loop_ar0: + movu m8, [bufyq+xq*2] + movu m10, [bufyq+xq*2+32] +%endif punpckhwd m9, m8, m5 punpcklwd m8, m5 
punpckhwd m11, m10, m5 @@ -516,20 +547,28 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax packssdw m8, m9 packssdw m10, m11 REPX {pmulhrsw x, m3}, m8, m10 +%if %2 paddw m8, [bufq+ 0] paddw m10, [bufq+32] +%else + paddw m8, [bufq+xq*2+ 0] + paddw m10, [bufq+xq*2+32] +%endif pminsw m8, m14 pminsw m10, m14 pmaxsw m8, m7 pmaxsw m10, m7 +%if %2 movu [bufq+ 0], m8 movu [bufq+32], m10 ; last 6 pixels movu xm8, [bufyq+32*4] movu xm10, [bufyq+32*4+16] +%if %3 paddw xm8, [bufyq+32*4+82*2] paddw xm10, [bufyq+32*4+82*2+16] +%endif phaddw xm8, xm10 pmulhrsw xm8, xm6 punpckhwd xm9, xm8, xm5 @@ -544,9 +583,31 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pmaxsw xm8, xm7 vpblendw xm0, xm8, xm0, 11000000b movu [bufq+32*2], xm0 +%else + movu [bufq+xq*2+ 0], m8 + movu [bufq+xq*2+32], m10 + add xd, 32 + cmp xd, 64 + jl .x_loop_ar0 + + ; last 12 pixels + movu m8, [bufyq+64*2] + punpckhwd m9, m8, m5 + punpcklwd m8, m5 + REPX {pmaddwd x, m4}, m8, m9 + REPX {psrad x, 5}, m8, m9 + packssdw m8, m9 + pmulhrsw m8, m3 + movu m0, [bufq+64*2] + paddw m8, m0 + pminsw m8, m14 + pmaxsw m8, m7 + vpblendd m0, m8, m0, 11000000b + movu [bufq+64*2], m0 +%endif add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar0 RET @@ -564,26 +625,40 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pshufd xm5, xm4, q1111 pshufd xm4, xm4, q0000 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd - vpbroadcastw xm6, [hmul_bits+4] + vpbroadcastw xm6, [hmul_bits+2+%3*2] vpbroadcastd xm3, xm3 - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 sar maxd, 1 mov mind, maxd xor mind, -1 .y_loop_ar1: - mov xq, -38 + mov xq, -(76>>%2) movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu xm0, [bufq+xq*2-82*2-2] ; top/left +%if %2 movu xm8, [bufyq+xq*4] +%else + movq xm8, [bufyq+xq*2] +%endif psrldq xm2, xm0, 2 ; top psrldq xm1, xm0, 4 ; top/right +%if %2 +%if %3 phaddw xm8, [bufyq+xq*4+82*2] pshufd xm9, xm8, q3232 paddw xm8, xm9 +%else + phaddw xm8, xm8 +%endif pmulhrsw xm8, xm6 +%endif punpcklwd xm0, xm2 punpcklwd xm1, xm8 pmaddwd xm0, xm4 @@ -612,7 +687,7 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax .x_loop_ar1_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar1 RET @@ -627,7 +702,9 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pcmpeqd xm5, xm5 vpbroadcastd xm6, xm6 ; max_grain pxor xm5, xm6 ; min_grain - vpbroadcastw xm7, [base+hmul_bits+4] +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] +%endif vpbroadcastw xm15, [base+round_vals-12+shiftq*2] movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5] @@ -644,11 +721,15 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pshufd m10, m0, q2222 DEFINE_ARGS buf, bufy, fg_data, h, x - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar2: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar2: movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] @@ -657,17 +738,27 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] +%if %2 movu xm8, [bufyq+xq*4] +%if %3 paddw xm8, [bufyq+xq*4+82*2] +%endif phaddw xm8, xm8 +%else + movq xm8, 
[bufyq+xq*2] +%endif vinserti128 m4, xm0, 1 ; y=-1,x=[-2,+5] punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] punpckhwd m4, m0, m4 ; y=-2/-1 interleaved, x=[+2,+5] punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] +%if %2 pmulhrsw xm1, xm8, xm7 punpcklwd xm1, xm15 ; luma, round interleaved +%else + punpcklwd xm1, xm8, xm15 +%endif vpblendd m1, m1, m4, 11110000b pmaddwd m2, m11 @@ -703,7 +794,7 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax .x_loop_ar2_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar2 RET @@ -730,7 +821,9 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pcmpeqd xm13, xm13 vpbroadcastd xm15, xm15 ; max_grain pxor xm13, xm15 ; min_grain - vpbroadcastw xm12, [base+hmul_bits+4] +%if %2 + vpbroadcastw xm12, [base+hmul_bits+2+%3*2] +%endif movq xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma @@ -757,11 +850,15 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax pinsrw xm11, [base+round_vals-10+shiftq*2], 3 DEFINE_ARGS buf, bufy, fg_data, h, unused, x - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar3: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar3: movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] @@ -771,16 +868,24 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] +%if %2 movu xm7, [bufyq+xq*4] +%if %3 paddw xm7, [bufyq+xq*4+82*2] +%endif phaddw xm7, xm7 +%else + movq xm7, [bufyq+xq*2] +%endif palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] +%if %2 pmulhrsw xm7, xm12 +%endif punpcklwd m1, m7 psrldq m3, m2, 2 @@ -830,13 +935,18 @@ cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax .x_loop_ar3_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar3 %if WIN64 mov rsp, r6 %endif RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 INIT_YMM avx2 cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm index 05bdd03c84..26e7ecbc0e 100644 --- a/src/x86/film_grain16_sse.asm +++ b/src/x86/film_grain16_sse.asm @@ -29,6 +29,7 @@ SECTION_RODATA 16 pd_16: times 4 dd 16 pw_1: times 8 dw 1 +pw_16384: times 8 dw 16384 pw_8192: times 8 dw 8192 pw_23_22: times 4 dw 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 @@ -60,6 +61,8 @@ pw_16: times 2 dw 16 JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -646,15 +649,16 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax %endif RET +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 %if ARCH_X86_64 -cglobal generate_grain_uv_420_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, 
pic_reg +cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg %define base r8-pb_mask lea r8, [pb_mask] movifnidn bdmaxd, bdmaxm lea r6d, [bdmaxq+1] %else -cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h +cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %define base r2-$$ LEA r2, $$ mov fg_dataq, r2m @@ -680,10 +684,15 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %if ARCH_X86_64 lea r6, [gaussian_sequence] %endif - mov hd, 38 +%if %2 + mov hd, 73-35*%3 add bufq, 44*2 .loop_y: mov xq, -44 +%else + mov xq, -82*73 + add bufq, 82*73*2 +%endif .loop_x: pand m2, m0, m1 psrlw m3, m2, 10 @@ -709,14 +718,16 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h movq [bufq+xq*2], m3 add xq, 4 jl .loop_x +%if %2 add bufq, 82*2 dec hd jg .loop_y +%endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_420_16bpc_ssse3_table+r5*4] - lea r5, [r5+base+generate_grain_uv_420_16bpc_ssse3_table] + movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] jmp r5 .ar0: @@ -753,9 +764,12 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 - SPLATW m6, [base+hmul_bits+4] +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif SPLATW m4, m4 pxor m5, m5 +%if %2 %if !cpuflag(sse4) pcmpeqw m2, m2 pslldq m2, 12 @@ -766,21 +780,32 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h mova m12, m2 %endif %endif - sub bufq, 2*(82*38+82-(82*3+41)) +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif add bufyq, 2*(3+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar0: ; first 32 pixels xor xd, xd .x_loop_ar0: - movu m0, [bufyq+xq*4] + movu m0, [bufyq+xq*(2<<%2)] +%if %2 +%if %3 movu m2, [bufyq+xq*4+82*2] paddw m0, m2 +%endif movu m1, [bufyq+xq*4 +16] +%if %3 movu m2, [bufyq+xq*4+82*2+16] paddw m1, m2 +%endif phaddw m0, m1 pmulhrsw m0, m6 +%endif punpckhwd m1, m0, m5 punpcklwd m0, m5 REPX {pmaddwd x, m4}, m0, m1 @@ -791,14 +816,15 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h paddw m0, m1 pminsw m0, m14 pmaxsw m0, m7 - cmp xd, 32 + cmp xd, 72-40*%2 je .end movu [bufq+xq*2], m0 add xd, 8 jmp .x_loop_ar0 - ; last 6 pixels + ; last 6/4 pixels .end: +%if %2 %if cpuflag(sse4) pblendw m0, m1, 11000000b %else @@ -807,9 +833,12 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h por m0, m1, m2 %endif movu [bufq+xq*2], m0 +%else + movq [bufq+xq*2], m0 +%endif add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*(2<<%3) dec hd jg .y_loop_ar0 %if ARCH_X86_32 @@ -834,7 +863,11 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] %if WIN64 DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 - lea bufq, [r0-2*(82*38+44-(82*3+41))] +%if %2 + lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] +%else + lea bufq, [r0-2*(82*69+3)] +%endif %else %if ARCH_X86_64 DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 @@ -844,7 +877,11 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %define mind dword r3m %define maxd dword r4m %endif - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif %endif %if ARCH_X86_64 mov shiftd, 
[r2+FGData.ar_coeff_shift] @@ -861,10 +898,12 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h movd m3, [base+round_vals+shiftq*2-12] ; rnd pxor m6, m6 punpcklwd m3, m6 - SPLATW m6, [base+hmul_bits+4] +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif SPLATD m3, m3 add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 sar maxd, 1 %if ARCH_X86_64 mov mind, maxd @@ -876,19 +915,31 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h mov mind, r2 %endif .y_loop_ar1: - mov xq, -38 + mov xq, -(76>>%2) movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu m0, [bufq+xq*2-82*2-2] ; top/left +%if %2 movu m7, [bufyq+xq*4] +%if %3 movu m1, [bufyq+xq*4+82*2] phaddw m7, m1 +%else + phaddw m7, m7 +%endif +%else + movq m7, [bufyq+xq*2] +%endif psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right punpcklwd m0, m2 +%if %2 +%if %3 pshufd m2, m7, q3232 paddw m7, m2 +%endif pmulhrsw m7, m6 +%endif punpcklwd m1, m7 pmaddwd m0, m4 pmaddwd m1, m5 @@ -916,7 +967,7 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h .x_loop_ar1_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar1 %if ARCH_X86_32 @@ -952,8 +1003,8 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h pxor m7, m6 %endif pxor m6, m5 ; min_grain -%if cpuflag(sse4) - SPLATW m7, [base+hmul_bits+4] +%if %2 && cpuflag(sse4) + SPLATW m7, [base+hmul_bits+2+%3*2] %endif %if ARCH_X86_64 @@ -1009,11 +1060,15 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x %endif - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar2: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar2: movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] @@ -1042,15 +1097,25 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h paddd m0, m3 ; luma component & rounding +%if %2 movu m1, [bufyq+xq*4] +%if %3 movu m2, [bufyq+xq*4+82*2] phaddw m1, m2 pshufd m2, m1, q3232 paddw m1, m2 +%else + phaddw m1, m1 +%endif %if cpuflag(sse4) pmulhrsw m1, m15 -%else +%elif %3 pmulhrsw m1, [base+pw_8192] +%else + pmulhrsw m1, [base+pw_16384] +%endif +%else + movq m1, [bufyq+xq*2] %endif punpcklwd m1, [base+pw_1] pmaddwd m1, m12 @@ -1091,7 +1156,7 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h .x_loop_ar2_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar2 %if ARCH_X86_32 @@ -1141,8 +1206,8 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h pxor m3, m7 %endif pxor m7, m6 ; min_grain -%if cpuflag(sse4) - SPLATW m3, [base+hmul_bits+4] +%if %2 && cpuflag(sse4) + SPLATW m3, [base+hmul_bits+2+%3*2] %endif %if ARCH_X86_64 @@ -1216,11 +1281,15 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h mova m13, m5 DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x %endif - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar3: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar3: ; first line @@ -1259,17 +1328,31 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h ; third line [m0 is busy] & luma + round movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] +%if %2 movu m5, 
[bufyq+xq*4] +%if %3 movu m4, [bufyq+xq*4+82*2] phaddw m5, m4 +%else + phaddw m5, m5 +%endif +%else + movq m5, [bufyq+xq*2] +%endif palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] +%if %3 pshufd m4, m5, q3232 paddw m5, m4 +%endif +%if %2 %if cpuflag(sse4) pmulhrsw m5, m11 -%else +%elif %3 pmulhrsw m5, [base+pw_8192] +%else + pmulhrsw m5, [base+pw_16384] +%endif %endif punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] @@ -1315,7 +1398,7 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h .x_loop_ar3_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar3 %if WIN64 @@ -1331,6 +1414,11 @@ cglobal generate_grain_uv_420_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %undef m15 %endif RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 %macro SCRATCH 3 %if ARCH_X86_32 From 6e7abd9b216152996c5954c4dfc9336580f213e3 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 15 Jul 2021 11:09:03 -0400 Subject: [PATCH 143/188] x86/filmgrain: simplify post-horizontal filter blending --- src/x86/film_grain16_avx2.asm | 101 ++++---- src/x86/film_grain16_sse.asm | 236 ++++++----------- src/x86/film_grain_avx2.asm | 167 +++++------- src/x86/film_grain_sse.asm | 465 ++++++++++++++++------------------ 4 files changed, 414 insertions(+), 555 deletions(-) diff --git a/src/x86/film_grain16_avx2.asm b/src/x86/film_grain16_avx2.asm index ea5611d4bc..af450647f4 100644 --- a/src/x86/film_grain16_avx2.asm +++ b/src/x86/film_grain16_avx2.asm @@ -29,8 +29,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pw_1024: times 16 dw 1024 -pw_23_22: times 8 dw 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 pw_seed_xor: times 2 dw 0xb524 @@ -48,6 +46,7 @@ pw_27_17_17_27: dw 27, 17, 17, 27 ; these two should be next to each other pw_4: times 2 dw 4 pw_16: times 2 dw 16 +pw_23_22: dw 23, 22, 0, 32 %macro JMP_TABLE 1-* %xdefine %1_table %%table @@ -1480,8 +1479,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpbroadcastd m9, [base+pw_4+r9*4] pmullw m15, m9 %else - vpbroadcastd m14, [pw_1024] - vpbroadcastd m15, [pw_23_22] + vpbroadcastd m14, [pd_16] + vpbroadcastq m15, [pw_23_22] %endif movifnidn sbyd, sbym @@ -1689,16 +1688,18 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin movu m9, [grain_lutq+offxyq*2] movu m3, [grain_lutq+offxyq*2+82*2] movd xm5, [grain_lutq+left_offxyq*2+ 0] - pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 ; {left0, left1} - punpcklwd xm7, xm9, xm3 ; {cur0, cur1} + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} + punpckldq xm7, xm9, xm3 ; {cur0, cur1} punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1} %if %1 - pmaddwd xm5, [pw_23_22] + vpbroadcastq xm8, [pw_23_22] + pmaddwd xm5, xm8 + vpbroadcastd xm8, [pd_16] + paddd xm5, xm8 %else pmaddwd xm5, xm15 + paddd xm5, xm14 %endif - vpbroadcastd xm8, [pd_16] - paddd xm5, xm8 psrad xm5, 5 packssdw xm5, xm5 pcmpeqw xm8, xm8 @@ -1706,11 +1707,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pxor xm8, xm7 pmaxsw xm5, xm8 pminsw xm5, xm7 - vpblendw xm7, xm5, xm9, 11111110b - psrldq xm5, 2 - vpblendw xm5, xm3, 11111110b - vpblendd m9, m7, 00001111b - vpblendd m3, m5, 00001111b + vpblendd m9, m9, m5, 00000001b + psrldq xm5, 4 + vpblendd m3, m3, m5, 00000001b ; 
scaling[luma_src] punpckhwd m5, m4, m2 @@ -1875,13 +1874,14 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin movu m5, [grain_lutq+top_offxyq*2] punpckhwd m7, m5, m9 punpcklwd m5, m9 ; {top/cur interleaved} + vpbroadcastd m3, [pw_23_22] + REPX {pmaddwd x, m3}, m7, m5 %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 -%else - REPX {pmaddwd x, m15}, m7, m5 -%endif vpbroadcastd m3, [pd_16] REPX {paddd x, m3}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif REPX {psrad x, 5}, m7, m5 packssdw m9, m5, m7 pcmpeqw m7, m7 @@ -1989,48 +1989,51 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movd xm5, [grain_lutq+left_offxyq*2] - pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 - pinsrw xm5, [grain_lutq+topleft_offxyq*2], 2 ; { left0, left1, top/left } + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 + vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } movu m9, [grain_lutq+offxyq*2] movu m3, [grain_lutq+offxyq*2+82*2] movu m8, [grain_lutq+top_offxyq*2] - punpcklwd xm7, xm9, xm3 ; { cur0, cur1 } - punpckldq xm7, xm8 ; { cur0, cur1, top0 } - punpcklwd xm5, xm7 ; { cur/left } interleaved - pmaddwd xm5, [pw_23_22] - vpbroadcastd xm0, [pd_16] - paddd xm5, xm0 - psrad xm5, 5 - packssdw xm5, xm5 - pcmpeqw xm0, xm0 - psraw xm7, xm10, 1 - pxor xm0, xm7 + punpckldq xm7, xm9, xm3 ; { cur0, cur1 } + vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 } + punpcklwd m5, m7 ; { cur/left } interleaved +%if %1 + vpbroadcastq m0, [pw_23_22] + pmaddwd m5, m0 + vpbroadcastd m0, [pd_16] + paddd m5, m0 +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + vextracti128 xm0, m5, 1 + packssdw xm5, xm0 + pcmpeqw m0, m0 + psraw m7, m10, 1 + pxor m0, m7 pminsw xm5, xm7 pmaxsw xm5, xm0 - pcmpeqw xm7, xm7 - psrldq xm7, 14 ; 0xffff, 0..... 
- vpblendvb m9, m5, m7 ; line 0 - psrldq xm5, 2 - vpblendvb m3, m5, m7 ; line 1 - psrldq xm5, 2 - vpblendvb m5, m8, m5, m7 ; top line + vpblendd m9, m9, m5, 00000001b + psrldq xm5, 4 + vpblendd m3, m3, m5, 00000001b + psrldq xm5, 4 + vpblendd m5, m8, m5, 00000001b - punpckhwd m7, m5, m9 + punpckhwd m8, m5, m9 punpcklwd m5, m9 ; {top/cur interleaved} + vpbroadcastd m9, [pw_23_22] + REPX {pmaddwd x, m9}, m8, m5 %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 + vpbroadcastd m9, [pd_16] + REPX {paddd x, m9}, m5, m8 %else - REPX {pmaddwd x, m15}, m7, m5 + REPX {paddd x, m14}, m5, m8 %endif - vpbroadcastd m9, [pd_16] - REPX {paddd x, m9}, m5, m7 - REPX {psrad x, 5}, m5, m7 - packssdw m9, m5, m7 - pcmpeqw m5, m5 - psraw m7, m10, 1 - pxor m5, m7 - pmaxsw m9, m5 + REPX {psrad x, 5}, m5, m8 + packssdw m9, m5, m8 pminsw m9, m7 + pmaxsw m9, m0 ; src mova m0, [srcq] diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm index 26e7ecbc0e..114952a3e7 100644 --- a/src/x86/film_grain16_sse.asm +++ b/src/x86/film_grain16_sse.asm @@ -31,8 +31,11 @@ pd_16: times 4 dd 16 pw_1: times 8 dw 1 pw_16384: times 8 dw 16384 pw_8192: times 8 dw 8192 -pw_23_22: times 4 dw 23, 22 +pw_23_22: dw 23, 22 + times 3 dw 0, 32 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +pw_27_17_17_27: dw 27, 17, 17, 27 + times 2 dw 0, 32 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 @@ -43,7 +46,6 @@ mul_bits: dw 256, 128, 64, 32, 16 round_vals: dw 32, 64, 128, 256, 512, 1024 max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 min: dw 0, 16*4, 16*16 -pw_27_17_17_27: dw 27, 17, 17, 27 ; these two should be next to each other pw_4: times 2 dw 4 pw_16: times 2 dw 16 @@ -96,6 +98,13 @@ SECTION .text %endrep %endmacro +%if ARCH_X86_32 +%undef base +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg @@ -1429,13 +1438,6 @@ generate_grain_uv_fn 444, 0, 0 %endif %endmacro -%if ARCH_X86_32 -%undef base -%define PIC_ptr(a) base+a -%else -%define PIC_ptr(a) a -%endif - INIT_XMM ssse3 %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize @@ -1520,10 +1522,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra SCRATCH 6, 14, 5 SCRATCH 7, 15, 6 -%if !cpuflag(sse4) - pcmpeqw m6, m6 - pslldq m6, 4 -%endif + mova m6, [base+pw_27_17_17_27] ; for horizontal filter %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 @@ -1672,11 +1671,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra test dword r8m, 2 jz .loop_x_odd %if ARCH_X86_32 - mov r5, r5m - SPLATD m7, [base+pw_27_17_17_27] add dword [rsp+8*mmsize+1*gprsize], 16 %else - SPLATD m7, [pw_27_17_17_27] add r12d, 16 ; top_offxy += 16 %endif jmp .loop_x_odd_v_overlap @@ -1686,12 +1682,6 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra jz .loop_x ; r8m = sbym -%if ARCH_X86_32 - mov r5, r5m - movq m7, [base+pw_27_17_17_27] -%else - movq m7, [pw_27_17_17_27] -%endif test dword r8m, 2 jnz .loop_x_hv_overlap @@ -1743,27 +1733,21 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra mov grain_lutq, grain_lutmp .loop_y_h_overlap: ; grain = grain_lut[offy+y][offx+x] - movu m4, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+0*gprsize] - movd m5, [grain_lutq+r5*2] 
-%else - movd m5, [grain_lutq+left_offxyq*2] -%endif - punpcklwd m5, m4 - pmaddwd m5, m7 - paddd m5, m14 - psrad m5, 5 - packssdw m5, m5 -%if cpuflag(sse4) - pblendw m4, m5, 00000011b + movd m4, [grain_lutq+r5*2] %else - pand m4, m6 - pandn m0, m6, m5 - por m4, m0 + movd m4, [grain_lutq+left_offxyq*2] %endif + punpcklwd m4, m5 + pmaddwd m4, m6 + paddd m4, m14 + psrad m4, 5 + packssdw m4, m4 pminsw m4, m15 pmaxsw m4, m9 + shufps m4, m5, q3210 ; src pand m0, m10, [srcq+ 0] @@ -1822,11 +1806,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra test dword r8m, 2 jz .loop_x_odd %if ARCH_X86_32 - mov r5, r5m - SPLATD m7, [base+pw_27_17_17_27] add dword [rsp+8*mmsize+1*gprsize], 16 %else - SPLATD m7, [pw_27_17_17_27] add r12d, 16 ; top_offxy += 16 %endif jmp .loop_x_odd_v_overlap @@ -1941,6 +1922,10 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra shr offxyd, 16 .loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] mov hd, dword r7m mov grain_lutq, grain_lutmp .loop_y_v_overlap: @@ -2009,18 +1994,16 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 -%if ARCH_X86_32 - mov r5, r5m - SPLATD m7, [base+pw_27_17_17_27+4] -%else - SPLATD m7, [pw_27_17_17_27+4] ; swap weights for second v-overlap line -%endif add srcq, r2mp add grain_lutq, 82*2 dec hw jz .end_y_v_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] xor hd, 0x10000 test hd, 0x10000 jnz .loop_y_v_overlap @@ -2044,11 +2027,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra btc dword r8m, 2 jc .next_blk_v %if ARCH_X86_32 - mov r5, r5m - SPLATD m7, [base+pw_27_17_17_27] add dword [rsp+8*mmsize+1*gprsize], 16 %else - SPLATD m7, [pw_27_17_17_27] add top_offxyd, 16 %endif add offxyd, 16 @@ -2059,20 +2039,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap -%if ARCH_X86_32 - mov r5, r5m - movq m7, [base+pw_27_17_17_27] -%else - movq m7, [pw_27_17_17_27] -%endif - .loop_x_hv_overlap: %if ARCH_X86_32 - mov r5, r5m - SPLATD m0, [base+pw_27_17_17_27] - mova [rsp+7*mmsize], m0 -%define m8 [rsp+7*mmsize] - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r0, [rsp+8*mmsize+1*gprsize] @@ -2084,8 +2052,6 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra mov seed, r3m xor r0, r0 %else - SPLATD m8, [pw_27_17_17_27] - ; we assume from the block above that bits 8-15 of r7d are zero'ed %endif mov r6d, seed @@ -2139,43 +2105,39 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra %endif shr offxyd, 16 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + movzx hd, word r7m mov grain_lutq, grain_lutmp .loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq*2] + movu m2, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy - movu m5, [grain_lutq+r0*2] - movd m4, [grain_lutq+r5*2] + movu m4, [grain_lutq+r0*2] + movd m5, [grain_lutq+r5*2] mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy - movd m2, [grain_lutq+r5*2] + movd m3, [grain_lutq+r5*2] %else - movu m5, [grain_lutq+top_offxyq*2] - movd 
m4, [grain_lutq+left_offxyq*2] - movd m2, [grain_lutq+topleft_offxyq*2] + movu m4, [grain_lutq+top_offxyq*2] + movd m5, [grain_lutq+left_offxyq*2] + movd m3, [grain_lutq+topleft_offxyq*2] %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklwd m4, m3 - punpcklwd m2, m5 - REPX {pmaddwd x, m7}, m4, m2 - REPX {paddd x, m14}, m4, m2 - REPX {psrad x, 5}, m4, m2 - REPX {packssdw x, x}, m4, m2 - REPX {pminsw x, m15}, m4, m2 - REPX {pmaxsw x, m9}, m4, m2 -%if cpuflag(sse4) - pblendw m3, m4, 00000011b - pblendw m5, m2, 00000011b -%else - pand m3, m6 - pand m5, m6 - pandn m0, m6, m4 - pandn m1, m6, m2 - por m3, m0 - por m5, m1 -%endif + punpcklwd m5, m2 + punpcklwd m3, m4 + REPX {pmaddwd x, m6}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m5, m3 + pminsw m5, m15 + pmaxsw m5, m9 + shufps m3, m5, m2, q3210 + shufps m5, m4, q3232 ; followed by v interpolation (top | cur -> cur) movu m0, [grain_lutq+offxyq*2+16] %if ARCH_X86_32 @@ -2187,7 +2149,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra punpckhwd m5, m3 punpcklwd m3, m1, m0 punpckhwd m1, m0 - REPX {pmaddwd x, m8}, m2, m5, m3, m1 + REPX {pmaddwd x, m7}, m2, m5, m3, m1 REPX {paddd x, m14}, m2, m5, m3, m1 REPX {psrad x, 5}, m2, m5, m3, m1 packssdw m2, m5 @@ -2229,19 +2191,16 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 -%if ARCH_X86_32 - mov r5, r5m - SPLATD m0, [base+pw_27_17_17_27+4] - mova m8, m0 -%else - SPLATD m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line -%endif add srcq, r2mp add grain_lutq, 82*2 dec hw jz .end_y_hv_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] xor hd, 0x10000 test hd, 0x10000 jnz .loop_y_hv_overlap @@ -2257,14 +2216,12 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra jge .end_hv %if ARCH_X86_32 mov r5, r5m - SPLATD m7, [base+pw_27_17_17_27] add offxyd, 16 add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 mov srcq, r9mp add srcq, r4mp add srcq, r4mp %else - SPLATD m7, [pw_27_17_17_27] add offxyd, 16 add top_offxyd, 16 mov src_bakq, r9mp @@ -2370,12 +2327,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 -%if cpuflag(sse4) - pxor m2, m2 -%define mzero m2 -%else %define mzero m7 -%endif + + SPLATD m2, [base+pw_23_22] + %if ARCH_X86_32 mov scalingq, r5m mov r5m, r5 @@ -2390,11 +2345,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 -%if !cpuflag(sse4) - pcmpeqw m2, m2 - pslldq m2, 2 -%endif - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl @@ -2419,7 +2369,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmullw m5, m7 %else SPLATD m6, [base+pd_16] - SPLATD m5, [base+pw_23_22] + mova m5, [base+pw_23_22] %endif SCRATCH 6, 14, 6 @@ -2529,9 +2479,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova m1, [srcq+16] ; m0-1: src as word ; luma_src -%if !cpuflag(sse4) pxor mzero, mzero -%endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut @@ -2687,9 +2635,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova m1, [srcq+16] ; luma_src -%if !cpuflag(sse4) pxor mzero, 
mzero -%endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9m @@ -2744,13 +2690,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin packssdw m5, m5 pmaxsw m5, m8 pminsw m5, m9 -%if cpuflag(sse4) - pblendw m5, m7, 11111110b -%else - pand m7, m2 - pandn m3, m2, m5 - por m5, m7, m3 -%endif + shufps m5, m7, q3210 movu m3, [grain_lutq+offxyq*2+16] ; scaling[luma_src] @@ -2950,14 +2890,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 %if %1 %if ARCH_X86_32 mov r5, r5m %endif - REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5 REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else - REPX {pmaddwd x, m15}, m7, m5 REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 @@ -2974,11 +2913,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif punpckhwd m7, m5, m4 punpcklwd m5, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 %if %1 - REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5 REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else - REPX {pmaddwd x, m15}, m7, m5 REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 @@ -2991,9 +2929,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova m1, [srcq+16] ; luma_src -%if !cpuflag(sse4) pxor mzero, mzero -%endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut @@ -3021,9 +2957,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin REPX {pmaddwd x, m14}, m7, m6 REPX {psrad x, 6}, m7, m6 packssdw m6, m7 -%if !cpuflag(sse4) pxor mzero, mzero -%endif REPX {paddw x, m15}, m5, m6 REPX {pmaxsw x, mzero}, m5, m6 REPX {pminsw x, m10}, m5, m6 ; clip_pixel() @@ -3176,52 +3110,45 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %else movd m5, [grain_lutq+left_offxyq*2] %endif - movu m3, [grain_lutq+offxyq*2] + movu m7, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+2*gprsize] movu m4, [grain_lutq+r0*2] - pinsrw m5, [grain_lutq+r5*2], 1 + pinsrw m5, [grain_lutq+r5*2], 2 %else movu m4, [grain_lutq+top_offxyq*2] - pinsrw m5, [grain_lutq+topleft_offxyq*2], 1 ; { left, top/left } + pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } %endif - punpcklwd m7, m3, m4 ; { cur0, top0 } - punpcklwd m5, m7 ; { cur/left } interleaved + punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } + punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } %if %1 %if ARCH_X86_32 mov r5, r5m %endif - pmaddwd m5, [PIC_ptr(pw_23_22)] + pshufd m0, [PIC_ptr(pw_23_22)], q1010 +%else + pshufd m0, m15, q1010 +%endif + pmaddwd m5, m0 +%if %1 paddd m5, [PIC_ptr(pd_16)] %else - pmaddwd m5, m15 paddd m5, m14 %endif psrad m5, 5 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, m9 -%if cpuflag(sse4) - pblendw m3, m5, 00000001b - psrldq m5, 2 - pblendw m5, m4, 11111110b -%else - pand m3, m2 - pandn m7, m2, m5 - por m3, m7 - psrldq m5, 2 - pand m4, m2 - pandn m7, m2, m5 - por m5, m4, m7 -%endif + shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 + shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter + shufps m5, m4, q3231 ; top0-7 post-h_filter punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 %if %1 - REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5 REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 %else - REPX {pmaddwd x, m15}, m7, m5 REPX {paddd x, m14}, m5, m7 %endif REPX {psrad x, 
5}, m5, m7 @@ -3238,11 +3165,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif punpckhwd m1, m0, m4 punpcklwd m0, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m1, m0 %if %1 - REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m1, m0 REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 %else - REPX {pmaddwd x, m15}, m1, m0 REPX {paddd x, m14}, m1, m0 %endif REPX {psrad x, 5}, m1, m0 @@ -3255,9 +3181,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova m1, [srcq+16] ; luma_src -%if !cpuflag(sse4) pxor mzero, mzero -%endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut @@ -3285,9 +3209,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin REPX {pmaddwd x, m14}, m7, m5 REPX {psrad x, 6}, m7, m5 packssdw m5, m7 -%if !cpuflag(sse4) pxor mzero, mzero -%endif REPX {paddw x, m15}, m6, m5 REPX {pmaxsw x, mzero}, m6, m5 REPX {pminsw x, m10}, m6, m5 ; clip_pixel() diff --git a/src/x86/film_grain_avx2.asm b/src/x86/film_grain_avx2.asm index 130c4075dc..0c3910b7cf 100644 --- a/src/x86/film_grain_avx2.asm +++ b/src/x86/film_grain_avx2.asm @@ -38,7 +38,8 @@ byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 pd_m65536: dd ~0xffff -pb_23_22: times 2 db 23, 22 +pb_23_22: db 23, 22 + times 3 db 0, 32 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 @@ -47,6 +48,7 @@ round_vals: dw 32, 64, 128, 256, 512 max: dw 255, 240, 235 min: dw 0, 16 pb_27_17_17_27: db 27, 17, 17, 27 + times 2 db 0, 32 pw_1: dw 1 %macro JMP_TABLE 2-* @@ -90,6 +92,14 @@ cextern gaussian_sequence SECTION .text +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + INIT_XMM avx2 cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data lea r4, [pb_mask] @@ -1092,12 +1102,12 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai jz .loop_x ; r8m = sbym - movd xm15, [pb_27_17_17_27] + movq xm15, [pb_27_17_17_27] cmp dword r8m, 0 jne .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) - movd xm14, [pw_1024] + movq xm14, [pw_1024] .loop_x_h_overlap: mov r6d, seed or seed, 0xEFF4 @@ -1156,8 +1166,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai pmaddubsw xm4, xm15, xm4 pmulhrsw xm4, xm14 packsswb xm4, xm4 - vpblendw xm4, xm3, 11111110b - vpblendd m3, m4, 00001111b + vpblendd m3, m3, m4, 00000001b pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 @@ -1329,7 +1338,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap - movd xm15, [pb_27_17_17_27] + movq xm15, [pb_27_17_17_27] .loop_x_hv_overlap: vpbroadcastw m8, [pb_27_17_17_27] @@ -1409,10 +1418,8 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai pmulhrsw xm7, xm14 packsswb xm4, xm4 packsswb xm7, xm7 - vpblendw xm4, xm3, 11111110b - vpblendw xm7, xm6, 11111110b - vpblendd m3, m4, 00001111b - vpblendd m6, m7, 00001111b + vpblendd m3, m4, 00000001b + vpblendd m6, m7, 00000001b ; followed by v interpolation (top | cur -> cur) punpckhbw m7, m6, m3 punpcklbw m6, m3 @@ -1463,8 +1470,6 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai %macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id - pcmpeqw m10, 
m10 - psrld m10, 24 mov r7d, [fg_dataq+FGData.scaling_shift] lea r8, [pb_mask] %define base r8-pb_mask @@ -1490,10 +1495,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %else vpbroadcastd m14, [pw_1024] %if %2 - vpbroadcastd m15, [pb_23_22] + vpbroadcastq m15, [pb_23_22] %else - vpbroadcastd xm15, [pb_27_17_17_27] + vpbroadcastq xm15, [pb_27_17_17_27] %endif +%endif +%if %3 + vpbroadcastw m10, [pb_23_22] +%elif %2 + mova m10, [pb_8x_27_17_8x_17_27] %endif mov overlapd, [fg_dataq+FGData.overlap_flag] @@ -1593,16 +1603,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -1743,16 +1750,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -1763,7 +1767,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; grain = grain_lut[offy+y][offx+x] %if %2 %if %1 - vpbroadcastd m6, [pb_23_22] ; FIXME + vpbroadcastq m6, [pb_23_22] %endif movu xm3, [grain_lutq+offxyq+ 0] movd xm4, [grain_lutq+left_offxyq+ 0] @@ -1778,12 +1782,10 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, pmulhrsw m4, m14 %endif packsswb m4, m4 - pcmpeqw m6, m6 ; FIXME - psrldq m6, 15 ; FIXME - vpblendvb m3, m3, m4, m6 + vpblendd m3, m3, m4, 00010001b %else %if %1 - vpbroadcastd xm6, [pb_27_17_17_27] + movq xm6, [pb_27_17_17_27] %endif movu m3, [grain_lutq+offxyq] movd xm4, [grain_lutq+left_offxyq] @@ -1796,9 +1798,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, pmulhrsw xm4, xm14 %endif packsswb xm4, xm4 - pcmpeqw xm6, xm6 - psrldq xm6, 14 - vpblendvb m3, m3, m4, m6 + vpblendd m3, m3, m4, 00000001b %endif pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 @@ -1915,7 +1915,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, mov hd, hm mov grain_lutq, grain_lutmp %if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] %endif %%loop_y_v_overlap: ; src @@ -1966,16 +1966,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + 
REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -1988,7 +1985,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; grain = grain_lut[offy+y][offx+x] %if %3 == 0 %if %2 - mova m6, [pb_8x_27_17_8x_17_27] movu xm3, [grain_lutq+offxyq] movu xm4, [grain_lutq+top_offxyq] vinserti128 m3, [grain_lutq+offxyq+82], 1 @@ -1999,13 +1995,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %endif punpckhbw m9, m4, m3 punpcklbw m4, m3 -%if %2 - pmaddubsw m9, m6, m9 - pmaddubsw m4, m6, m4 -%else - pmaddubsw m9, m1, m9 - pmaddubsw m4, m1, m4 -%endif + pmaddubsw m9, m10, m9 + pmaddubsw m4, m10, m4 %if %1 pmulhrsw m9, [pw_1024] pmulhrsw m4, [pw_1024] @@ -2015,19 +2006,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %endif packsswb m3, m4, m9 %else -%if %1 - vpbroadcastd m6, [pb_23_22] -%endif movq xm3, [grain_lutq+offxyq] movq xm4, [grain_lutq+top_offxyq] vinserti128 m3, [grain_lutq+offxyq+8], 1 vinserti128 m4, [grain_lutq+top_offxyq+8], 1 punpcklbw m4, m3 + pmaddubsw m4, m10, m4 %if %1 - pmaddubsw m4, m6, m4 pmulhrsw m4, [pw_1024] %else - pmaddubsw m4, m15, m4 pmulhrsw m4, m14 %endif packsswb m4, m4 @@ -2084,7 +2071,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %endif add grain_lutq, 82<<%2 %if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] btc hd, 16 jnc %%loop_y_v_overlap %endif @@ -2139,7 +2126,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, mov hd, hm mov grain_lutq, grain_lutmp %if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] %endif %%loop_y_hv_overlap: ; src @@ -2190,16 +2177,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[src] pcmpeqw m9, m9 pcmpeqw m3, m3 - vpgatherdd m8, [scalingq+m4], m9 - vpgatherdd m4, [scalingq+m5], m3 + vpgatherdd m8, [scalingq-3+m4], m9 + vpgatherdd m4, [scalingq-3+m5], m3 pcmpeqw m9, m9 pcmpeqw m3, m3 - vpgatherdd m5, [scalingq+m6], m9 - vpgatherdd m6, [scalingq+m7], m3 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m9 + vpgatherdd m6, [scalingq-3+m7], m3 + REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -2212,9 +2196,9 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; grain = grain_lut[offy+y][offx+x] %if %1 %if %2 - vpbroadcastd m9, [pb_23_22] + vpbroadcastq m9, [pb_23_22] %else - vpbroadcastd xm9, [pb_27_17_17_27] + vpbroadcastq xm9, [pb_27_17_17_27] %endif %endif @@ -2252,7 +2236,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %else punpcklbw m7, m6 %endif - punpcklwd m4, m7 + punpcklqdq m4, m7 %if %1 pmaddubsw m4, m9, m4 pmulhrsw m4, [pw_1024] @@ -2261,18 +2245,17 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, pmulhrsw m4, m14 %endif packsswb m4, m4 - pcmpeqw m9, m9 ; this is kind of ugly - psrldq m9, 15 - vpblendvb m3, m3, m4, m9 - psrldq m4, 1 + vpblendd m3, m4, 00010001b + psrldq m4, 4 %if %3 - shufpd m9, m9, m9, 1110b ; clear upper lane + vpblendd m6, m6, m4, 00000001b +%else + vpblendd m6, m6, m4, 00010001b %endif - vpblendvb m6, m6, m4, m9 %else punpcklbw xm4, xm3 punpcklbw xm7, xm6 - punpckldq xm4, xm7 + punpcklqdq xm4, xm7 %if %1 pmaddubsw xm4, xm9, xm4 pmulhrsw xm4, [pw_1024] @@ -2281,23 +2264,19 @@ cglobal 
fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, pmulhrsw xm4, xm14 %endif packsswb xm4, xm4 - pcmpeqw xm9, xm9 ; this is kind of ugly - psrldq xm9, 14 - vpblendvb m3, m3, m4, m9 - psrldq xm4, 2 - vpblendvb m6, m6, m4, m9 + vpblendd m3, m3, m4, 00000001b + psrldq xm4, 4 + vpblendd m6, m6, m4, 00000001b %endif ; followed by v interpolation (top | cur -> cur) %if %3 vpermq m9, m3, q3120 punpcklbw m6, m9 + pmaddubsw m6, m10, m6 %if %1 - vpbroadcastd m9, [pb_23_22] - pmaddubsw m6, m9, m6 pmulhrsw m6, [pw_1024] %else - pmaddubsw m6, m15, m6 pmulhrsw m6, m14 %endif packsswb m6, m6 @@ -2306,14 +2285,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %else punpckhbw m9, m6, m3 punpcklbw m6, m3 -%if %2 - mova m3, [pb_8x_27_17_8x_17_27] - pmaddubsw m9, m3, m9 - pmaddubsw m6, m3, m6 -%else - pmaddubsw m9, m1, m9 - pmaddubsw m6, m1, m6 -%endif + pmaddubsw m9, m10, m9 + pmaddubsw m6, m10, m6 %if %1 pmulhrsw m9, [pw_1024] pmulhrsw m6, [pw_1024] @@ -2373,7 +2346,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jg %%loop_y_h_overlap %else je %%end_y_hv_overlap - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] btc hd, 16 jnc %%loop_y_hv_overlap jmp %%loop_y_h_overlap diff --git a/src/x86/film_grain_sse.asm b/src/x86/film_grain_sse.asm index 8cba258162..20334591a9 100644 --- a/src/x86/film_grain_sse.asm +++ b/src/x86/film_grain_sse.asm @@ -29,14 +29,18 @@ SECTION_RODATA pw_1024: times 8 dw 1024 +pb_27_17_17_27: db 27, 17, 17, 27 + times 6 db 0, 32 +pb_23_22_h: db 23, 22 + times 7 db 0, 32 pb_27_17: times 8 db 27, 17 pb_17_27: times 8 db 17, 27 +pb_23_22: times 8 db 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 -pb_23_22: times 2 db 23, 22 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 @@ -46,8 +50,6 @@ max: dw 255, 240, 235 min: dw 0, 16 pw_1: dw 1 -%define pb_27_17_17_27 pb_17_27 - 2 - %macro JMP_TABLE 2-* %xdefine %1_8bpc_%2_table %%table %xdefine %%base %1_8bpc_%2_table @@ -88,6 +90,20 @@ cextern gaussian_sequence SECTION .text +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if ARCH_X86_32 +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 @@ -1284,7 +1300,7 @@ INIT_XMM ssse3 ; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize -cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ dst, src, scaling, unused1, fg_data, picptr, unused2 ; copy stack arguments to new position post-alignment, so that we ; don't have to keep the old stack location in a separate register @@ -1295,29 +1311,29 @@ cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ mov r4, r7m mov r5, r8m - mov [rsp+6*mmsize+ 3*gprsize], r0 - mov [rsp+6*mmsize+ 5*gprsize], r1 - mov [rsp+6*mmsize+ 7*gprsize], r2 - mov [rsp+6*mmsize+ 9*gprsize], r3 - mov [rsp+6*mmsize+10*gprsize], r4 - mov [rsp+6*mmsize+11*gprsize], r5 + mov [rsp+5*mmsize+ 4*gprsize], r0 + mov [rsp+5*mmsize+ 6*gprsize], r1 + mov [rsp+5*mmsize+ 8*gprsize], r2 + mov [rsp+5*mmsize+10*gprsize], r3 + mov 
[rsp+5*mmsize+11*gprsize], r4 + mov [rsp+5*mmsize+12*gprsize], r5 %else -cglobal fgy_32x32xn_8bpc, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ dst, src, scaling, unused1, fg_data, picptr, unused2 %endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize -%define r0m [rsp+6*mmsize+ 3*gprsize] -%define r1m [rsp+6*mmsize+ 4*gprsize] -%define r2m [rsp+6*mmsize+ 5*gprsize] -%define r3m [rsp+6*mmsize+ 6*gprsize] -%define r4m [rsp+6*mmsize+ 7*gprsize] -%define r5m [rsp+6*mmsize+ 8*gprsize] -%define r6m [rsp+6*mmsize+ 9*gprsize] -%define r7m [rsp+6*mmsize+10*gprsize] -%define r8m [rsp+6*mmsize+11*gprsize] +%define r0m [rsp+5*mmsize+ 4*gprsize] +%define r1m [rsp+5*mmsize+ 5*gprsize] +%define r2m [rsp+5*mmsize+ 6*gprsize] +%define r3m [rsp+5*mmsize+ 7*gprsize] +%define r4m [rsp+5*mmsize+ 8*gprsize] +%define r5m [rsp+5*mmsize+ 9*gprsize] +%define r6m [rsp+5*mmsize+10*gprsize] +%define r7m [rsp+5*mmsize+11*gprsize] +%define r8m [rsp+5*mmsize+12*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask @@ -1330,8 +1346,6 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai mov r6d, [fg_dataq+FGData.scaling_shift] movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - pcmpeqw m2, m2 - psrldq m2, 14 movd m4, [base+max+r6*4] movd m5, [base+min+r6*2] punpcklwd m3, m3 @@ -1340,10 +1354,9 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai pshufd m3, m3, q0000 pshufd m4, m4, q0000 pshufd m5, m5, q0000 - SCRATCH 2, 10, 0 - SCRATCH 3, 11, 1 - SCRATCH 4, 12, 2 - SCRATCH 5, 13, 3 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap @@ -1356,9 +1369,9 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai test overlapd, overlapd jz .no_vertical_overlap mova m6, [base+pw_1024] - movd m7, [base+pb_27_17_17_27] - SCRATCH 6, 14, 4 - SCRATCH 7, 15, 5 + mova m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 test sbyd, sbyd jnz .vertical_overlap ; fall-through @@ -1445,16 +1458,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai ; scaling[src] %if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 + REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] @@ -1504,7 +1514,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai jz .loop_x_odd %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif @@ -1525,7 +1535,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 add offxyd, 16 ; left_offxyd - mov [rsp+6*mmsize+0*gprsize], offxyd + mov [rsp+5*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 @@ -1578,21 +1588,18 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai ; scaling[src] %if ARCH_X86_32 - 
vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 + REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r5, [rsp+6*mmsize+0*gprsize] + mov r5, [rsp+5*mmsize+0*gprsize] movd m7, [grain_lutq+r5] %else movd m7, [grain_lutq+left_offxyq] @@ -1601,9 +1608,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai pmaddubsw m6, m15, m7 pmulhrsw m6, m14 packsswb m6, m6 - pand m6, m10 - pandn m7, m10, m3 - por m6, m7 + shufps m6, m3, q3210 pcmpgtb m2, m6 punpcklbw m7, m6, m2 punpckhbw m6, m2 @@ -1649,7 +1654,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai test dword r8m, 2 ; have_top_overlap jz .loop_x_odd %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif @@ -1754,7 +1759,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai movzx top_offxyd, offxyw %if ARCH_X86_32 - mov [rsp+6*mmsize+1*gprsize], top_offxyd + mov [rsp+5*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif @@ -1764,7 +1769,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+8], r5 + mov [rsp+5*mmsize+12], r5 %else mova m8, [pb_27_17] %endif @@ -1779,21 +1784,18 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai ; scaling[src] %if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 + REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r5, [rsp+6*mmsize+1*gprsize] + mov r5, [rsp+5*mmsize+1*gprsize] movu m7, [grain_lutq+r5] %else movu m7, [grain_lutq+top_offxyq] @@ -1801,7 +1803,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai punpckhbw m6, m7, m3 punpcklbw m7, m3 %if ARCH_X86_32 - mov r5, [rsp+5*mmsize+8] + mov r5, [rsp+5*mmsize+12] pmaddubsw m3, [r5], m6 pmaddubsw m6, [r5], m7 %else @@ -1833,7 +1835,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai mova [dstq+srcq], m0 %if ARCH_X86_32 - add dword [rsp+5*mmsize+8], mmsize + add dword [rsp+5*mmsize+12], mmsize %else mova m8, [pb_17_27] %endif @@ -1864,7 +1866,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai jc .loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif @@ -1874,16 +1876,16 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+8], r5 + mov 
[rsp+5*mmsize+12], r5 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak - mov r5, [rsp+6*mmsize+1*gprsize] + mov r5, [rsp+5*mmsize+1*gprsize] mov r4, offxyd add r5, 16 add r4, 16 - mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy - mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy + mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak @@ -1937,7 +1939,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut movzx r5, offxyw ; top_offxy - mov [rsp+6*mmsize+1*gprsize], r5 + mov [rsp+5*mmsize+1*gprsize], r5 %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy @@ -1952,10 +1954,10 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy - mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy + mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy movu m6, [grain_lutq+r5] - mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy movd m4, [grain_lutq+r0] movd m7, [grain_lutq+r5] %else @@ -1972,17 +1974,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai pmulhrsw m4, m14 packsswb m2, m2 packsswb m4, m4 - pand m2, m10 - pand m4, m10 - pandn m7, m10, m3 - pandn m3, m10, m6 - por m7, m2 - por m3, m4 + shufps m2, m3, q3210 + shufps m4, m6, q3210 ; followed by v interpolation (top | cur -> cur) - punpckhbw m4, m3, m7 - punpcklbw m3, m7 + punpcklbw m3, m4, m2 + punpckhbw m4, m2 %if ARCH_X86_32 - mov r5, [rsp+5*mmsize+8] + mov r5, [rsp+5*mmsize+12] pmaddubsw m7, [r5], m4 pmaddubsw m4, [r5], m3 %else @@ -2004,16 +2002,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai ; scaling[src] %if ARCH_X86_32 - vpgatherdw m5, m0, scalingq, r0, r5, m7 - vpgatherdw m6, m1, scalingq, r0, r5, m7 + vpgatherdw m5, m0, scalingq-1, r0, r5, m7 + vpgatherdw m6, m1, scalingq-1, r0, r5, m7 %else - vpgatherdw m5, m0, scalingq, r13, r14, m7 - vpgatherdw m6, m1, scalingq, r13, r14, m7 + vpgatherdw m5, m0, scalingq-1, r13, r14, m7 + vpgatherdw m6, m1, scalingq-1, r13, r14, m7 %endif - pcmpeqw m7, m7 - psrlw m7, 8 - pand m5, m7 - pand m6, m7 + REPX {psrlw x, 8}, m5, m6 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m3, m5 @@ -2033,7 +2028,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai mova [dstq+srcq], m0 %if ARCH_X86_32 - add dword [rsp+5*mmsize+8], mmsize + add dword [rsp+5*mmsize+12], mmsize %else mova m8, [pb_17_27] %endif @@ -2063,7 +2058,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai xor dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif @@ -2079,49 +2074,49 @@ INIT_XMM ssse3 ; sby, luma, lstride, uv_pl, is_id) %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 -cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov r0, r0m mov r1, r2m mov r2, r4m mov r3, r6m mov r4, r7m - mov [rsp+8*mmsize+3*gprsize], r0 - mov 
[rsp+8*mmsize+5*gprsize], r1 - mov [rsp+8*mmsize+7*gprsize], r2 - mov [rsp+8*mmsize+9*gprsize], r3 - mov [rsp+8*mmsize+10*gprsize], r4 + mov [rsp+7*mmsize+3*gprsize], r0 + mov [rsp+7*mmsize+5*gprsize], r1 + mov [rsp+7*mmsize+7*gprsize], r2 + mov [rsp+7*mmsize+9*gprsize], r3 + mov [rsp+7*mmsize+10*gprsize], r4 mov r0, r8m mov r1, r9m mov r2, r10m mov r4, r11m mov r3, r12m - mov [rsp+8*mmsize+11*gprsize], r0 - mov [rsp+8*mmsize+12*gprsize], r1 - mov [rsp+8*mmsize+13*gprsize], r2 - mov [rsp+8*mmsize+14*gprsize], r4 + mov [rsp+7*mmsize+11*gprsize], r0 + mov [rsp+7*mmsize+12*gprsize], r1 + mov [rsp+7*mmsize+13*gprsize], r2 + mov [rsp+7*mmsize+14*gprsize], r4 %else -cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused %endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize -%define r0m [rsp+8*mmsize+ 3*gprsize] -%define r1m [rsp+8*mmsize+ 4*gprsize] -%define r2m [rsp+8*mmsize+ 5*gprsize] -%define r3m [rsp+8*mmsize+ 6*gprsize] -%define r4m [rsp+8*mmsize+ 7*gprsize] -%define r5m [rsp+8*mmsize+ 8*gprsize] -%define r6m [rsp+8*mmsize+ 9*gprsize] -%define r7m [rsp+8*mmsize+10*gprsize] -%define r8m [rsp+8*mmsize+11*gprsize] -%define r9m [rsp+8*mmsize+12*gprsize] -%define r10m [rsp+8*mmsize+13*gprsize] -%define r11m [rsp+8*mmsize+14*gprsize] -%define r12m [rsp+8*mmsize+15*gprsize] +%define r0m [rsp+7*mmsize+ 3*gprsize] +%define r1m [rsp+7*mmsize+ 4*gprsize] +%define r2m [rsp+7*mmsize+ 5*gprsize] +%define r3m [rsp+7*mmsize+ 6*gprsize] +%define r4m [rsp+7*mmsize+ 7*gprsize] +%define r5m [rsp+7*mmsize+ 8*gprsize] +%define r6m [rsp+7*mmsize+ 9*gprsize] +%define r7m [rsp+7*mmsize+10*gprsize] +%define r8m [rsp+7*mmsize+11*gprsize] +%define r9m [rsp+7*mmsize+12*gprsize] +%define r10m [rsp+7*mmsize+13*gprsize] +%define r11m [rsp+7*mmsize+14*gprsize] +%define r12m [rsp+7*mmsize+15*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask @@ -2133,7 +2128,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %define base r8-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] - pcmpeqw m2, m2 movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] lea tmpd, [r6d*2] @@ -2145,17 +2139,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, movd m5, [base+min+r6*2] cmovne r6d, tmpd movd m4, [base+max+r6*2] - psrldq m2, 14+%2 punpcklwd m3, m3 punpcklwd m5, m5 punpcklwd m4, m4 pshufd m3, m3, q0000 pshufd m5, m5, q0000 pshufd m4, m4, q0000 - SCRATCH 2, 10, 0 - SCRATCH 3, 11, 1 - SCRATCH 4, 12, 2 - SCRATCH 5, 13, 3 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl @@ -2177,8 +2169,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 - SCRATCH 6, 14, 4 - SCRATCH 7, 15, 5 + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 %endif mov sbyd, r8m @@ -2187,22 +2179,21 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jz %%no_vertical_overlap %if ARCH_X86_32 %if %2 - movd m1, [base+pb_23_22] + mova m1, [base+pb_23_22_h] %else - movd m1, [base+pb_27_17_17_27] + mova m1, [base+pb_27_17_17_27] %endif mova m0, [base+pw_1024] %else %if %2 - movd m1, [pb_23_22] + mova m1, [pb_23_22_h] %else - movd m1, [pb_27_17_17_27] + mova m1, [pb_27_17_17_27] %endif mova m0, [pw_1024] %endif - pshufd m1, m1, 
q0000 - SCRATCH 0, 8, 6 - SCRATCH 1, 9, 7 + SCRATCH 0, 8, 5 + SCRATCH 1, 9, 6 test sbyd, sbyd jnz %%vertical_overlap ; fall-through @@ -2347,16 +2338,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[luma_src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 %endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 + REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 @@ -2426,7 +2414,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %if %2 == 0 ; adjust top_offxy %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 16 %endif @@ -2450,9 +2438,9 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %if ARCH_X86_32 %if %2 lea r6, [offxyd+16] - mov [rsp+8*mmsize+0*gprsize], r6 + mov [rsp+7*mmsize+0*gprsize], r6 %else - mov [rsp+8*mmsize+0*gprsize], offxyd + mov [rsp+7*mmsize+0*gprsize], offxyd %endif DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut @@ -2558,36 +2546,31 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[luma_src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 %endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 + REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq+ 0] + movu m4, [grain_lutq+offxyq+ 0] %if ARCH_X86_32 - mov r0, [rsp+8*mmsize+0*gprsize] - movd m4, [grain_lutq+r0+ 0] + mov r0, [rsp+7*mmsize+0*gprsize] + movd m2, [grain_lutq+r0+ 0] %else - movd m4, [grain_lutq+left_offxyq+ 0] + movd m2, [grain_lutq+left_offxyq+ 0] %endif - punpcklbw m2, m4, m3 - pmaddubsw m4, m9, m2 - pmulhrsw m4, m8 - packsswb m4, m4 - pand m4, m10 - pandn m2, m10, m3 - por m3, m4, m2 + punpcklbw m2, m4 + pmaddubsw m3, m9, m2 + pmulhrsw m3, m8 + packsswb m3, m3 + shufps m3, m4, q3210 pxor m4, m4 pcmpgtb m4, m3 punpcklbw m2, m3, m4 @@ -2652,7 +2635,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, xor dword r8m, 4 ; adjust top_offxyd %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 16 %endif @@ -2780,7 +2763,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd + mov [rsp+7*mmsize+1*gprsize], top_offxyd DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %endif @@ -2790,9 +2773,11 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m - mova m1, [base+pb_27_17] +%endif +%if %3 + mova m1, [PIC_ptr(pb_23_22)] %else - mova m1, [pb_27_17] + mova m1, [PIC_ptr(pb_27_17)] %endif %%loop_y_v_overlap: %if ARCH_X86_32 @@ -2848,34 +2833,26 @@ 
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[luma_src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 %endif - pcmpeqw m4, m4 - psrlw m4, 8 - pand m7, m4 - pand m5, m4 + REPX {psrlw x, 8}, m7, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r0, [rsp+8*mmsize+1*gprsize] + mov r0, [rsp+7*mmsize+1*gprsize] movu m4, [grain_lutq+r0] %else movu m4, [grain_lutq+top_offxyq] %endif punpckhbw m6, m4, m3 punpcklbw m4, m3 -%if %3 - pmaddubsw m2, m9, m6 - pmaddubsw m3, m9, m4 -%else pmaddubsw m2, m1, m6 pmaddubsw m3, m1, m4 -%endif pmulhrsw m2, m8 pmulhrsw m3, m8 packsswb m3, m2 @@ -2928,10 +2905,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, btc hd, 16 %if ARCH_X86_32 mov r5, r5m - mova m1, [base+pb_17_27] -%else - mova m1, [pb_17_27] %endif + mova m1, [PIC_ptr(pb_17_27)] jnc %%loop_y_v_overlap %endif jmp %%loop_y @@ -2963,7 +2938,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; h+v overlap %else %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif @@ -2976,15 +2951,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused - mov r6, [rsp+8*mmsize+1*gprsize] + mov r6, [rsp+7*mmsize+1*gprsize] %if %2 lea r0, [r3d+16] add r6, 16 - mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy + mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy %else - mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy %endif - mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy + mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused @@ -3048,18 +3023,55 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd + mov [rsp+7*mmsize+1*gprsize], top_offxyd %endif mov hd, r7m mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m - mova m3, [base+pb_27_17] +%endif +%if %3 + mova m3, [PIC_ptr(pb_23_22)] %else - mova m3, [pb_27_17] + mova m3, [PIC_ptr(pb_27_17)] %endif %%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy + movd m1, [grain_lutq+r0] + mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy +%else + movd m1, [grain_lutq+topleft_offxyq] +%endif + movu m2, [grain_lutq+offxyq] +%if ARCH_X86_32 + movu m6, [grain_lutq+r5] + movd m4, [grain_lutq+r0] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m1, m6 + punpcklbw m4, m2 + pmaddubsw m0, m9, m1 + pmaddubsw m1, m9, m4 + REPX {pmulhrsw x, m8}, m0, m1 + packsswb m0, m1 + shufps m4, m0, m2, q3232 + shufps m0, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m2, m0, m4 + punpckhbw m0, m4 + pmaddubsw m4, m3, m0 + pmaddubsw m1, m3, m2 + pmulhrsw m4, m8 + pmulhrsw m1, m8 + packsswb m1, m4 + ; src %if 
ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut @@ -3116,69 +3128,20 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, ; scaling[src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - movd m1, [grain_lutq+topleft_offxyq] %if %3 - vpgatherdw m7, m4, scalingq, r2, r12 - vpgatherdw m5, m6, scalingq, r2, r12 + vpgatherdw m7, m4, scalingq-1, r2, r12 + vpgatherdw m5, m6, scalingq-1, r2, r12 %else - vpgatherdw m7, m4, scalingq, r2, r13 - vpgatherdw m5, m6, scalingq, r2, r13 + vpgatherdw m7, m4, scalingq-1, r2, r13 + vpgatherdw m5, m6, scalingq-1, r2, r13 %endif %endif - pcmpeqw m2, m2 - psrlw m2, 8 - pand m7, m2 - pand m5, m2 + REPX {psrlw x, 8}, m7, m5 - ; grain = grain_lut[offy+y][offx+x] -%if ARCH_X86_32 - mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy - mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy - movd m1, [grain_lutq+r0] - mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy -%endif - movu m2, [grain_lutq+offxyq] -%if ARCH_X86_32 - movu m6, [grain_lutq+r5] - movd m4, [grain_lutq+r0] -%else - movu m6, [grain_lutq+top_offxyq] - movd m4, [grain_lutq+left_offxyq] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw m1, m6 - punpcklbw m4, m2 -%if %2 - punpcklwd m4, m1 -%else - punpckldq m4, m1 -%endif - pmaddubsw m1, m9, m4 - pmulhrsw m1, m8 - packsswb m1, m1 - pandn m4, m10, m2 - pandn m2, m10, m6 - psrldq m6, m1, 2-%2 - pand m1, m10 - pand m6, m10 - por m4, m1 - por m2, m6 - ; followed by v interpolation (top | cur -> cur) - punpckhbw m1, m2, m4 - punpcklbw m2, m4 -%if %3 - pmaddubsw m4, m9, m1 - pmaddubsw m1, m9, m2 -%else - pmaddubsw m4, m3, m1 - pmaddubsw m1, m3, m2 -%endif - pmulhrsw m4, m8 - pmulhrsw m1, m8 - packsswb m1, m4 + ; unpack grain pxor m4, m4 pcmpgtb m4, m1 punpcklbw m2, m1, m4 @@ -3229,10 +3192,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jle %%end_y_hv_overlap %if ARCH_X86_32 mov r5, r5m - mova m3, [base+pb_17_27] -%else - mova m3, [pb_17_27] %endif + mova m3, [PIC_ptr(pb_17_27)] btc hd, 16 jnc %%loop_y_hv_overlap %if ARCH_X86_64 @@ -3268,7 +3229,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jmp %%loop_x_hv_overlap %else %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif From 528559579bba0c409606d74fe1a8e529bd249862 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 17 Jul 2021 17:37:35 -0400 Subject: [PATCH 144/188] x86/itx: change function signatures of itx_4x4 to 0 GPRs The wrapper function already backs up GPRs, and declaring 7 here means we will backup/restore twice on x86-32. --- src/x86/itx16_sse.asm | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index ee12707c3a..283ea6b727 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -239,6 +239,8 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, 8, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_16bpc) %if ARCH_X86_32 LEA r6, $$ +%endif +%if has_epilogue %ifidn %1_%2, dct_dct test eobd, eobd jz %%end @@ -250,7 +252,7 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, 8, dst, stride, c, eob, tx2 %else ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
- lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] + lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 @@ -280,7 +282,7 @@ ALIGN function_align pshuflw m0, m0, q0000 punpcklqdq m0, m0 mova m1, m0 - jmp m(iadst_4x4_internal_16bpc).end + TAIL_CALL m(iadst_4x4_internal_16bpc).end %endif %endmacro @@ -306,7 +308,7 @@ INV_TXFM_4X4_FN dct, identity INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst -cglobal idct_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 +cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] mova m2, [cq+16*2] @@ -374,7 +376,7 @@ INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity -cglobal iadst_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .main ; transpose punpckhwd m2, m0, m1 @@ -459,7 +461,7 @@ INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity -cglobal iflipadst_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 +cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_16bpc).main ; transpose punpcklwd m2, m1, m0 @@ -508,7 +510,7 @@ INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity -cglobal iidentity_4x4_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 +cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m3, [o(pd_5793)] pmulld m0, m3, [cq+16*0] pmulld m1, m3, [cq+16*1] From 7f965a7c296981f1b1ae575f9c36b0cc28e51b2f Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 16 Jul 2021 11:10:34 -0400 Subject: [PATCH 145/188] x86/filmgrain: add fguv_32x32xn_i422/444 HBD/SSSE3 --- src/x86/film_grain16_sse.asm | 224 +++++++++++++++++++++++++++++------ 1 file changed, 190 insertions(+), 34 deletions(-) diff --git a/src/x86/film_grain16_sse.asm b/src/x86/film_grain16_sse.asm index 114952a3e7..3f86e7d9a5 100644 --- a/src/x86/film_grain16_sse.asm +++ b/src/x86/film_grain16_sse.asm @@ -2235,10 +2235,11 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 %endif +%macro FGUV_FN 3 ; name, ss_hor, ss_ver INIT_XMM ssse3 %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize -cglobal fguv_32x32xn_i420_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov r0, r0m mov r1, r1m @@ -2283,7 +2284,7 @@ cglobal fguv_32x32xn_i420_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ SPLATW m2, r13m %else -cglobal fguv_32x32xn_i420_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused mov srcq, srcm mov fg_dataq, r3m @@ -2293,7 +2294,7 @@ cglobal fguv_32x32xn_i420_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ DECLARE_REG_TMP 0, 2, 3 %else -cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r8-pb_mask lea r8, [pb_mask] @@ -2329,7 +2330,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %define mzero m7 +%if %3 SPLATD m2, [base+pw_23_22] +%endif %if ARCH_X86_32 mov scalingq, r5m @@ -2348,7 +2351,7 @@ cglobal 
fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl -%macro FGUV_32x32xN_LOOP 1 ; not-csfl +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap @@ -2369,7 +2372,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmullw m5, m7 %else SPLATD m6, [base+pd_16] +%if %2 mova m5, [base+pw_23_22] +%else + mova m5, [base+pw_27_17_17_27] +%endif %endif SCRATCH 6, 14, 6 @@ -2412,18 +2419,23 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov r1mp, r3 lea r3, [dstq+wq*2] mov r11mp, r3 - lea r3, [lumaq+wq*4] + lea r3, [lumaq+wq*(2<<%2)] mov r12mp, r3 +%if %3 shl r10mp, 1 +%endif %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused2, unused3, see, unused4, unused5, unused6, luma, lstride mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*4] + lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 @@ -2461,8 +2473,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ror offyd, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut @@ -2471,6 +2483,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin h, offxy, see, unused1, unused2, unused3, luma, lstride %endif +%if %2 == 0 +%%loop_x_odd: +%endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: @@ -2486,15 +2501,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r9m %endif mova m4, [lumaq+ 0] - mova m6, [lumaq+32] + mova m6, [lumaq+(16<<%2)] +%if %2 phaddw m4, [lumaq+16] phaddw m6, [lumaq+48] +%endif %if ARCH_X86_32 add lumaq, r10mp mov r9m, lumaq %endif +%if %2 pavgw m4, mzero pavgw m6, mzero +%endif %if %1 punpckhwd m3, m4, m0 @@ -2549,7 +2568,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %else add srcq, r13mp add dstq, r13mp - lea lumaq, [lumaq+lstrideq*2] + add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hd @@ -2571,11 +2590,25 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0m, dstq mov r9m, lumaq mov r4m, wq +%endif +%if %2 == 0 + btc dword r8m, 2 + jc %%next_blk + add offxyd, 16 + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%%next_blk: %endif test dword r8m, 1 je %%loop_x @@ -2618,8 +2651,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ror offyd, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut @@ -2641,15 +2674,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r9m %endif mova m4, [lumaq+ 0] - mova m6, [lumaq+32] + mova m6, 
[lumaq+(16<<%2)] +%if %2 phaddw m4, [lumaq+16] phaddw m6, [lumaq+48] +%endif %if ARCH_X86_32 add lumaq, r10mp mov r9m, lumaq %endif +%if %2 pavgw m4, mzero pavgw m6, mzero +%endif %if %1 punpckhwd m3, m4, m0 @@ -2680,7 +2717,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if ARCH_X86_32 mov r5, r5m %endif +%if %2 pmaddwd m5, [PIC_ptr(pw_23_22)] +%else + pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] +%endif paddd m5, [PIC_ptr(pd_16)] %else pmaddwd m5, m15 @@ -2726,7 +2767,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %else add srcq, r13mp add dstq, r13mp - lea lumaq, [lumaq+lstrideq*2] + add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hd @@ -2747,17 +2788,32 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0mp, dstq mov r9mp, lumaq mov r4m, wq %endif +%if %2 ; r8m = sbym test dword r8m, 2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap +%else + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif %%end: RET @@ -2801,9 +2857,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov r1mp, r3 lea r3, [dstq+wq*2] mov r11mp, r3 - lea r3, [lumaq+wq*4] + lea r3, [lumaq+wq*(2<<%2)] mov r12mp, r3 +%if %3 shl r10mp, 1 +%endif %else xor seed, sbyd ; (cur_seed << 16) | top_seed @@ -2811,10 +2869,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin unused1, unused2, see, unused3, unused4, unused5, luma, lstride mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*4] + lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 @@ -2860,9 +2921,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut @@ -2877,6 +2938,16 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif shr offxyd, 16 +%if %2 == 0 +%%loop_x_odd_v_overlap: +%endif +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_v_overlap: @@ -2936,15 +3007,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r9mp %endif mova m5, [lumaq+ 0] - mova m6, [lumaq+32] + mova m6, [lumaq+(16<<%2)] +%if %2 phaddw m5, [lumaq+16] phaddw m6, [lumaq+48] +%endif %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif +%if %2 pavgw m5, mzero pavgw m6, mzero +%endif %if %1 punpckhwd m7, m5, m0 @@ -2991,7 +3066,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova [dstq+ 0], m0 mova [dstq+16], m1 - dec hd + dec hw jle %%end_y_v_overlap %if ARCH_X86_32 add srcq, r2mp @@ -3000,10 +3075,20 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %else add srcq, r13mp add 
dstq, r13mp - lea lumaq, [lumaq+lstrideq*2] + add lumaq, lstrideq %endif add grain_lutq, 82*2 +%if %3 jmp %%loop_y +%else + btc hd, 16 + jc %%loop_y +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_v_overlap +%endif %%end_y_v_overlap: %if ARCH_X86_32 @@ -3022,16 +3107,28 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0mp, dstq mov r9mp, lumaq mov r4m, wq %endif +%if %2 ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap +%else + btc dword r8m, 2 + jc %%loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif %%loop_x_hv_overlap: %if ARCH_X86_32 @@ -3081,9 +3178,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy @@ -3099,6 +3196,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %endif shr offxyd, 16 +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_hv_overlap: @@ -3114,10 +3218,21 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if ARCH_X86_32 mov r5, [rsp+8*mmsize+2*gprsize] movu m4, [grain_lutq+r0*2] +%if %2 pinsrw m5, [grain_lutq+r5*2], 2 +%else + movd m3, [grain_lutq+r5*2] +%endif %else movu m4, [grain_lutq+top_offxyq*2] +%if %2 pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } +%else + movd m3, [grain_lutq+topleft_offxyq*2] +%endif +%endif +%if %2 == 0 + punpckldq m5, m3 %endif punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } @@ -3125,7 +3240,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %if ARCH_X86_32 mov r5, r5m %endif - pshufd m0, [PIC_ptr(pw_23_22)], q1010 +%if %2 + movddup m0, [PIC_ptr(pw_23_22)] +%else + movddup m0, [PIC_ptr(pw_27_17_17_27)] +%endif %else pshufd m0, m15, q1010 %endif @@ -3188,15 +3307,19 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r9mp %endif mova m6, [lumaq+ 0] - mova m5, [lumaq+32] + mova m5, [lumaq+(16<<%2)] +%if %2 phaddw m6, [lumaq+16] phaddw m5, [lumaq+48] +%endif %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif +%if %2 pavgw m6, mzero pavgw m5, mzero +%endif %if %1 punpckhwd m7, m6, m0 @@ -3222,8 +3345,15 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 %else +%if %3 == 0 + ; register shortage :) + push r12 +%endif vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 +%if %3 == 0 + pop r12 +%endif %endif REPX {psrlw x, 8}, m7, m6 @@ -3250,13 +3380,23 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin %else add srcq, r13mp add dstq, 
r13mp - lea lumaq, [lumaq+lstrideq*2] + add lumaq, lstrideq %endif add grain_lutq, 82*2 - dec hd + dec hw +%if %3 jg %%loop_y_h_overlap - +%else + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_hv_overlap %%end_y_hv_overlap: +%endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut @@ -3273,22 +3413,38 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov dstmp, dstq mov r9mp, lumaq mov r4m, wq %endif +%if %2 jmp %%loop_x_hv_overlap +%else + or dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif %%end_hv: RET %endmacro - FGUV_32x32xN_LOOP 1 + %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: - FGUV_32x32xN_LOOP 0 + %%FGUV_32x32xN_LOOP 0, %2, %3 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 From 9d9e3be5ba163978e775178a7d6e7c61168b3c82 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 16 Jul 2021 14:35:23 -0400 Subject: [PATCH 146/188] x86/filmgrain: add fguv_32x32xn_i422 HBD/AVX2 --- src/x86/film_grain16_avx2.asm | 146 ++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 40 deletions(-) diff --git a/src/x86/film_grain16_avx2.asm b/src/x86/film_grain16_avx2.asm index af450647f4..e0fab38dcf 100644 --- a/src/x86/film_grain16_avx2.asm +++ b/src/x86/film_grain16_avx2.asm @@ -1445,7 +1445,8 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra .end_hv: RET -cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r8-pb_mask lea r8, [pb_mask] @@ -1467,7 +1468,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl -%macro FGUV_32x32xN_LOOP 1 ; not-csfl +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap %if %1 @@ -1523,8 +1524,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin rorx offyd, seed, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride @@ -1541,10 +1542,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 @@ -1605,7 
+1606,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] + lea lumaq, [lumaq+lstrideq*(2<<%3)] add grain_lutq, 82*4 sub hb, 2 jg %%loop_y @@ -1642,8 +1643,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin rorx offyd, seed, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride @@ -1659,10 +1660,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 @@ -1751,7 +1752,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] + lea lumaq, [lumaq+lstrideq*(2<<%3)] add grain_lutq, 82*4 sub hb, 2 jg %%loop_y_h_overlap @@ -1822,9 +1823,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, top_offxy, unused2, luma, lstride @@ -1844,10 +1845,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 @@ -1874,21 +1875,45 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin movu m5, [grain_lutq+top_offxyq*2] punpckhwd m7, m5, m9 punpcklwd m5, m9 ; {top/cur interleaved} +%if %3 vpbroadcastd m3, [pw_23_22] +%else + vpbroadcastd m3, [pw_27_17_17_27] +%endif REPX {pmaddwd x, m3}, m7, m5 %if %1 - vpbroadcastd m3, [pd_16] - REPX {paddd x, m3}, m7, m5 + vpbroadcastd m8, [pd_16] + REPX {paddd x, m8}, m7, m5 %else REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 packssdw m9, m5, m7 + movu m3, [grain_lutq+offxyq*2+82*2] +%if %3 == 0 + movu m5, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + vpbroadcastd m3, [pw_27_17_17_27+4] + REPX {pmaddwd x, m3}, m7, m5 +%if %1 + REPX {paddd x, m8}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 
+%endif ; %3 == 0 pcmpeqw m7, m7 psraw m5, m10, 1 pxor m7, m5 +%if %3 pmaxsw m9, m7 pminsw m9, m5 +%else + REPX {pmaxsw x, m7}, m9, m3 + REPX {pminsw x, m5}, m9, m3 +%endif ; scaling[luma_src] punpckhwd m5, m4, m2 @@ -1904,7 +1929,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin pmullw m8, m11 pmulhrsw m9, m8 - ; same for the other half + ; scaling for the other half punpckhwd m7, m6, m2 punpcklwd m6, m2 ; m4-7: luma_src as dword pcmpeqw m8, m8 @@ -1915,7 +1940,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - movu m3, [grain_lutq+offxyq*2+82*2] pmullw m5, m11 pmulhrsw m3, m5 @@ -1933,7 +1957,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin jle %%end_y_v_overlap lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] + lea lumaq, [lumaq+lstrideq*(2<<%3)] add grain_lutq, 82*4 jmp %%loop_y @@ -1974,9 +1998,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride @@ -1990,12 +2014,26 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin ; grain = grain_lut[offy+y][offx+x] movd xm5, [grain_lutq+left_offxyq*2] pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 +%if %3 vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } +%else + ; insert both top/left lines + movd xm9, [grain_lutq+topleft_offxyq*2+82*2] + pinsrw xm9, [grain_lutq+topleft_offxyq*2], 2 + vinserti128 m5, xm9, 1 +%endif movu m9, [grain_lutq+offxyq*2] movu m3, [grain_lutq+offxyq*2+82*2] movu m8, [grain_lutq+top_offxyq*2] punpckldq xm7, xm9, xm3 ; { cur0, cur1 } +%if %3 vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 } +%else + ; insert both top lines + movu m1, [grain_lutq+top_offxyq*2+82*2] + punpckldq xm0, xm1, xm8 + vinserti128 m7, xm0, 1 +%endif punpcklwd m5, m7 ; { cur/left } interleaved %if %1 vpbroadcastq m0, [pw_23_22] @@ -2017,23 +2055,47 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin vpblendd m9, m9, m5, 00000001b psrldq xm5, 4 vpblendd m3, m3, m5, 00000001b +%if %3 == 0 + psrldq xm5, 4 + vpblendd m1, m1, m5, 00000001b +%endif psrldq xm5, 4 vpblendd m5, m8, m5, 00000001b punpckhwd m8, m5, m9 punpcklwd m5, m9 ; {top/cur interleaved} +%if %3 vpbroadcastd m9, [pw_23_22] +%else + vpbroadcastd m9, [pw_27_17_17_27] +%endif REPX {pmaddwd x, m9}, m8, m5 %if %1 - vpbroadcastd m9, [pd_16] - REPX {paddd x, m9}, m5, m8 + vpbroadcastd m4, [pd_16] + REPX {paddd x, m4}, m8, m5 %else - REPX {paddd x, m14}, m5, m8 + REPX {paddd x, m14}, m8, m5 %endif - REPX {psrad x, 5}, m5, m8 + REPX {psrad x, 5}, m8, m5 packssdw m9, m5, m8 +%if %3 pminsw m9, m7 pmaxsw m9, m0 +%else + punpckhwd m8, m1, m3 + punpcklwd m1, m3 ; {top/cur interleaved} + vpbroadcastd m3, [pw_27_17_17_27+4] + REPX {pmaddwd x, m3}, m8, m1 +%if %1 + REPX {paddd x, m4}, m8, m1 +%else + REPX {paddd x, m14}, m8, m1 +%endif + REPX {psrad x, 5}, m8, m1 + packssdw m3, m1, m8 + REPX {pminsw x, m7}, m9, m3 + REPX {pmaxsw x, m0}, m9, m3 +%endif ; src mova m0, 
[srcq] @@ -2044,10 +2106,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 @@ -2109,7 +2171,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] + lea lumaq, [lumaq+lstrideq*(2<<%3)] add grain_lutq, 82*4 sub hb, 2 jg %%loop_y_h_overlap @@ -2129,8 +2191,12 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin RET %endmacro - FGUV_32x32xN_LOOP 1 + %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: - FGUV_32x32xN_LOOP 0 + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 From eeba794417ba7a5509d9c1410c02b861f5b20072 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 16 Jul 2021 17:31:42 -0400 Subject: [PATCH 147/188] x86/filmgrain: add fguv_32x32xn_i444 HBD/AVX2 --- src/x86/film_grain16_avx2.asm | 211 +++++++++++++++++++++++++++++++--- 1 file changed, 192 insertions(+), 19 deletions(-) diff --git a/src/x86/film_grain16_avx2.asm b/src/x86/film_grain16_avx2.asm index e0fab38dcf..a30192e162 100644 --- a/src/x86/film_grain16_avx2.asm +++ b/src/x86/film_grain16_avx2.asm @@ -1481,7 +1481,11 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling pmullw m15, m9 %else vpbroadcastd m14, [pd_16] +%if %2 vpbroadcastq m15, [pw_23_22] +%else + vpbroadcastq m15, [pw_27_17_17_27] +%endif %endif movifnidn sbyd, sbym @@ -1503,7 +1507,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*4] + lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 @@ -1535,9 +1539,14 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %%loop_y: ; src mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] ; m0-1: src as word +%else + mova m1, [srcq+32] +%endif ; luma_src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 @@ -1550,6 +1559,10 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m3, m4, m0 @@ -1587,7 +1600,11 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m8, m5 @@ -1602,23 +1619,34 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] - add grain_lutq, 82*4 +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, 
lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 sub hb, 2 +%else + dec hb +%endif jg %%loop_y - add wq, 16 + add wq, 32>>%2 jge %%end mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] cmp byte [fg_dataq+FGData.overlap_flag], 0 je %%loop_x @@ -1638,7 +1666,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx mov offxd, seed rorx offyd, seed, 8 shr offxd, 12 @@ -1653,6 +1681,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling mov grain_lutq, grain_lutmp %%loop_y_h_overlap: mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] ; luma_src @@ -1668,6 +1697,13 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m1, [srcq+32] + + ; luma_src + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m3, m4, m0 @@ -1687,13 +1723,25 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif movd xm5, [grain_lutq+left_offxyq*2+ 0] +%if %2 pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} punpckldq xm7, xm9, xm3 ; {cur0, cur1} punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1} +%else + punpcklwd xm5, xm9 +%endif %if %1 +%if %2 vpbroadcastq xm8, [pw_23_22] +%else + movq xm8, [pw_27_17_17_27] +%endif pmaddwd xm5, xm8 vpbroadcastd xm8, [pd_16] paddd xm5, xm8 @@ -1709,8 +1757,10 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling pmaxsw xm5, xm8 pminsw xm5, xm7 vpblendd m9, m9, m5, 00000001b +%if %2 psrldq xm5, 4 vpblendd m3, m3, m5, 00000001b +%endif ; scaling[luma_src] punpckhwd m5, m4, m2 @@ -1748,23 +1798,36 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] - add grain_lutq, 82*4 +%else + mova [dstq+32], m1 + + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + + add grain_lutq, 82*(2<<%2) +%if %2 sub hb, 2 +%else + dec hb +%endif jg %%loop_y_h_overlap - add wq, 16 + add wq, 32>>%2 jge %%end mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] ; r8m = sbym cmp dword r8m, 0 @@ -1796,7 +1859,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*4] + lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 @@ -1833,11 +1896,15 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling movzx top_offxyd, offxyw shr offxyd, 16 +%if %2 == 0 + lea r10, [pw_27_17_17_27] +%endif mov hd, hm mov grain_lutq, grain_lutmp %%loop_y_v_overlap: ; src mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] ; luma_src @@ -1853,6 +1920,13 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling 
phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m1, [srcq+32] + + ; luma_src + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m3, m4, m0 @@ -1877,8 +1951,10 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling punpcklwd m5, m9 ; {top/cur interleaved} %if %3 vpbroadcastd m3, [pw_23_22] -%else +%elif %2 vpbroadcastd m3, [pw_27_17_17_27] +%else + vpbroadcastd m3, [r10] %endif REPX {pmaddwd x, m3}, m7, m5 %if %1 @@ -1889,12 +1965,24 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %endif REPX {psrad x, 5}, m7, m5 packssdw m9, m5, m7 +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif %if %3 == 0 +%if %2 movu m5, [grain_lutq+top_offxyq*2+82*2] +%else + movu m5, [grain_lutq+top_offxyq*2+32] +%endif punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} +%if %2 vpbroadcastd m3, [pw_27_17_17_27+4] +%else + vpbroadcastd m3, [r10] +%endif REPX {pmaddwd x, m3}, m7, m5 %if %1 REPX {paddd x, m8}, m7, m5 @@ -1951,25 +2039,43 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 sub hb, 2 +%else + mova [dstq+32], m1 + dec hb +%endif jle %%end_y_v_overlap +%if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] - add grain_lutq, 82*4 +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 jmp %%loop_y +%else + btc hd, 16 + jc %%loop_y + add r10, 4 + jmp %%loop_y_v_overlap +%endif %%end_y_v_overlap: - add wq, 16 + add wq, 32>>%2 jge %%end_hv mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to @@ -1992,8 +2098,12 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] +%if %2 == 0 + lea r12, [pw_27_17_17_27] + mov r13mp, r12 +%endif + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f @@ -2013,6 +2123,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movd xm5, [grain_lutq+left_offxyq*2] +%if %2 pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 %if %3 vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } @@ -2021,10 +2132,18 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling movd xm9, [grain_lutq+topleft_offxyq*2+82*2] pinsrw xm9, [grain_lutq+topleft_offxyq*2], 2 vinserti128 m5, xm9, 1 +%endif +%else + pinsrd xm5, [grain_lutq+topleft_offxyq*2], 1 %endif movu m9, [grain_lutq+offxyq*2] +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif movu m8, [grain_lutq+top_offxyq*2] +%if %2 punpckldq xm7, xm9, xm3 ; { cur0, cur1 } %if %3 vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 } @@ -2033,8 +2152,13 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling movu m1, [grain_lutq+top_offxyq*2+82*2] punpckldq xm0, xm1, xm8 vinserti128 m7, 
xm0, 1 +%endif +%else + movu m1, [grain_lutq+top_offxyq*2+32] + punpckldq xm7, xm9, xm8 %endif punpcklwd m5, m7 ; { cur/left } interleaved +%if %2 %if %1 vpbroadcastq m0, [pw_23_22] pmaddwd m5, m0 @@ -2047,17 +2171,32 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling psrad m5, 5 vextracti128 xm0, m5, 1 packssdw xm5, xm0 +%else +%if %1 + movddup xm0, [pw_27_17_17_27] + pmaddwd xm5, xm0 + vpbroadcastd m0, [pd_16] + paddd xm5, xm0 +%else + pmaddwd xm5, xm15 + paddd xm5, xm14 +%endif + psrad xm5, 5 + packssdw xm5, xm5 +%endif pcmpeqw m0, m0 psraw m7, m10, 1 pxor m0, m7 pminsw xm5, xm7 pmaxsw xm5, xm0 vpblendd m9, m9, m5, 00000001b +%if %2 psrldq xm5, 4 vpblendd m3, m3, m5, 00000001b %if %3 == 0 psrldq xm5, 4 vpblendd m1, m1, m5, 00000001b +%endif %endif psrldq xm5, 4 vpblendd m5, m8, m5, 00000001b @@ -2066,8 +2205,11 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling punpcklwd m5, m9 ; {top/cur interleaved} %if %3 vpbroadcastd m9, [pw_23_22] -%else +%elif %2 vpbroadcastd m9, [pw_27_17_17_27] +%else + xchg r12, r13mp + vpbroadcastd m9, [r12] %endif REPX {pmaddwd x, m9}, m8, m5 %if %1 @@ -2084,7 +2226,12 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %else punpckhwd m8, m1, m3 punpcklwd m1, m3 ; {top/cur interleaved} +%if %2 vpbroadcastd m3, [pw_27_17_17_27+4] +%else + vpbroadcastd m3, [r12] + xchg r12, r13mp +%endif REPX {pmaddwd x, m3}, m8, m1 %if %1 REPX {paddd x, m4}, m8, m1 @@ -2099,9 +2246,14 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling ; src mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif ; luma_src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 @@ -2114,6 +2266,10 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m8, m4, m0 @@ -2167,24 +2323,41 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] - add grain_lutq, 82*4 +%else + mova [dstq+32], m1 + + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 sub hb, 2 jg %%loop_y_h_overlap +%else + dec hb + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap + add r13mp, 4 + jmp %%loop_y_hv_overlap +%endif %%end_y_hv_overlap: - add wq, 16 + add wq, 32>>%2 jge %%end_hv mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] jmp %%loop_x_hv_overlap %%end_hv: From 63000bd5df93d163998781b4b7b1b81d82a06812 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Sat, 17 Jul 2021 10:50:42 -0400 Subject: [PATCH 148/188] x86/itx: 4x8 inverse transforms hbd/sse4 --- src/x86/itx16_sse.asm | 259 +++++++++++++++++++++++++++++++++++++++++- src/x86/itx_sse.asm | 4 +- 2 files changed, 256 insertions(+), 7 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 283ea6b727..0536414916 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -72,8 +72,13 @@ pd_1321: times 4 dd 1321 pd_2482: times 4 dd 2482 pd_m3344: times 4 dd -3344 pd_2048: times 4 dd 2048 +pw_4x2048_4xm2048: times 4 dw 2048 + times 4 dw -2048 +pw_4xm2048_4x2048: times 4 dw -2048 + times 4 dw 2048 pw_2048: times 8 dw 2048 pd_3803: times 4 dd 3803 +pw_4096: times 8 dw 4096 pd_5793: times 4 dd 5793 pw_1697x8: times 8 dw 1697*8 pw_2896x8: times 8 dw 2896*8 @@ -84,6 +89,8 @@ pw_m3784_1567: times 4 dw -3784, 1567 cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 cextern iadst_4x4_internal_8bpc_ssse3.main +cextern idct_4x8_internal_8bpc_ssse3.main +cextern iadst_4x8_internal_8bpc_ssse3.main SECTION .text @@ -314,9 +321,7 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m2, [cq+16*2] mova m3, [cq+16*3] mova m5, [o(pd_2048)] - IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 - packssdw m0, m1 ; out0 out1 - packssdw m4, m2 ; out2 out3 + call .pass1_main ; transpose punpckhwd m2, m0, m4 punpcklwd m0, m4 @@ -326,6 +331,11 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; m1 = out2 out3 ; m5 = pd_2048 jmp tx2q +.pass1_main: + IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + ret .pass2: ; m0 = in0 in1 ; m1 = in2 in3 @@ -424,10 +434,12 @@ ALIGN function_align .main: mova m1, [cq+16*2] mova m3, [cq+16*3] + mova m5, [cq+16*0] + lea r3, [cq+16*1] +.main2: mova m0, [o(pd_1321)] ; SINPI_1_9 mova m2, [o(pd_2482)] ; SINPI_2_9 mova m6, [o(pd_3803)] ; SINPI_4_9 - mova m5, [cq+16*0] pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] @@ -442,7 +454,7 @@ ALIGN function_align psubd m2, m7 ; s[1] -= s[6] psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 - pmulld m3, [cq+16*1] ; -s[3] = -SINPI_3_9 * T[1] + pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 paddd m4, m0, m2 ; x[3] = s[0] + s[1] @@ -564,3 +576,240 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 movq [r5 +strideq*0], m1 movhps [r5 +strideq*1], m1 RET + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x8 +%ifidn %1_%2, dct_dct +%if 1 + pshufd m0, [cq], q0000 + mova m1, [o(pw_2896x8)] + mov [cq], eobd ; 0 + packssdw m0, m0 + REPX {pmulhrsw x, m1}, m0, m0, m0 + pmulhrsw m0, [o(pw_2048)] +%else + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 +%endif + pxor m4, m4 + REPX {mova x, m0}, m1, m2, m3 + TAIL_CALL m(idct_4x8_internal_16bpc).write_4x8 +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m5, [o(pd_2048)] + mov r3d, 16 +.loop_pass1: + mova m3, [o(pd_2896)] + pmulld m0, m3, [cq+32*0+r3] + pmulld m1, m3, [cq+32*1+r3] + pmulld m2, m3, [cq+32*2+r3] + pmulld m3, [cq+32*3+r3] + REPX {paddd x, m5}, m0, m1, m2, m3 + 
REPX {psrad x, 12}, m0, m1, m2, m3 + call m(idct_4x4_internal_16bpc).pass1_main + test r3d, r3d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova [cq+32*1+16], m4 + xor r3d, r3d + jmp .loop_pass1 +.end_pass1: + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*0+16] + mova m6, [cq+32*1+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_4x8_internal_8bpc, _ssse3).main + ; m0-3 is now out0/1,3/2,4/5,7/6 + mova m4, [o(pw_2048)] + shufps m1, m1, q1032 + shufps m3, m3, q1032 +.end: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 +.write_4x8: + mova m7, [o(pixel_10bpc_max)] + lea r2, [strideq*3] + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r2] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r2] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r2 ], m3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .pass1_main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*2+16] + mova m6, [cq+32*3+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass1_main: + mov r5d, 16 + lea r3, [cq+32*1+16] +.loop_pass1: + mova m0, [o(pd_2048)] + mova m3, [o(pd_2896)] + pmulld m5, m3, [cq+32*0+r5] + pmulld m2, m3, [cq+32*1+r5] + pmulld m1, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m0}, m5, m2, m1, m3 + REPX {psrad x, 12}, m5, m2, m1, m3 + mova [r3], m2 + call m(iadst_4x4_internal_16bpc).main2 + test r5d, r5d + jz .end_pass1 + mova [cq+32*2+16], m0 + mova [cq+32*3+16], m1 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + ret +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, [o(pw_4x2048_4xm2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).pass1_main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + mova m6, [cq+32*2+16] + mova m2, [cq+32*3+16] + punpcklwd m4, m2, m6 + punpckhwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m4, [o(pw_4xm2048_4x2048)] + jmp m(idct_4x8_internal_16bpc).end + 
+INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m5, [o(pd_2048)] + mova m4, [o(pd_2896)] + mova m6, [o(pd_5793)] + mov r3d, 16 +.loop_pass1: + pmulld m0, m4, [cq+32*0+r3] + pmulld m1, m4, [cq+32*1+r3] + pmulld m2, m4, [cq+32*2+r3] + pmulld m3, m4, [cq+32*3+r3] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + test r3d, r3d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova m7, m2 + xor r3d, r3d + jmp .loop_pass1 +.end_pass1: + punpckhwd m4, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m4 + punpcklwd m0, m4 + mova m2, [cq+32*0+16] + punpckhwd m4, m2, m7 + punpcklwd m2, m7 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + mova m4, [o(pw_4096)] + jmp m(idct_4x8_internal_16bpc).end diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index 9ff8a01519..03cec677a5 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -590,7 +590,7 @@ cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -641,7 +641,7 @@ cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 RET ALIGN function_align -.main: +cglobal_label .main mova m6, [o(pd_2048)] punpckhwd m4, m3, m0 ;unpacked in7 in0 punpckhwd m5, m2, m1 ;unpacked in5 in2 From 359a5445c48b1a6edafe9401afd4c47a0b1a5d41 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 18 Jul 2021 09:20:17 -0400 Subject: [PATCH 149/188] x86/itx: 4x16 inverse transforms hbd/sse4 --- src/x86/itx16_sse.asm | 346 ++++++++++++++++++++++++++++++++++++++++-- src/x86/itx_sse.asm | 6 +- 2 files changed, 333 insertions(+), 19 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 0536414916..b3cfb17fab 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -80,8 +80,11 @@ pw_2048: times 8 dw 2048 pd_3803: times 4 dd 3803 pw_4096: times 8 dw 4096 pd_5793: times 4 dd 5793 +pd_6144: times 4 dd 6144 pw_1697x8: times 8 dw 1697*8 pw_2896x8: times 8 dw 2896*8 +pw_1697x16: times 8 dw 1697*16 +pw_16384: times 8 dw 16384 pixel_10bpc_max: times 8 dw 0x03ff pw_1567_3784: times 4 dw 1567, 3784 @@ -91,6 +94,9 @@ cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 cextern iadst_4x4_internal_8bpc_ssse3.main cextern idct_4x8_internal_8bpc_ssse3.main cextern iadst_4x8_internal_8bpc_ssse3.main +cextern idct_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end SECTION .text @@ -322,6 +328,8 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m3, [cq+16*3] mova m5, [o(pd_2048)] call .pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 ; transpose punpckhwd m2, m0, m4 punpcklwd m0, m4 @@ -333,8 +341,6 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 jmp tx2q .pass1_main: IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 - packssdw m0, m1 ; out0 out1 - packssdw m4, m2 ; out2 out3 ret .pass2: ; m0 = in0 in1 @@ -388,6 +394,8 @@ INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 ; transpose punpckhwd m2, m0, 
m1 punpcklwd m0, m1 @@ -463,8 +471,6 @@ ALIGN function_align paddd m4, m3 ; x[3] -= s[3] paddd m2, m5 ; x[1] + 2048 REPX {psrad x, 12}, m0, m2, m1, m4 - packssdw m0, m2 ; out0 out1 - packssdw m1, m4 ; out2 out3 ret @@ -475,6 +481,8 @@ INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_16bpc).main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 ; transpose punpcklwd m2, m1, m0 punpckhwd m1, m0 @@ -580,30 +588,40 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 0, 4x8 %ifidn %1_%2, dct_dct -%if 1 - pshufd m0, [cq], q0000 - mova m1, [o(pw_2896x8)] - mov [cq], eobd ; 0 - packssdw m0, m0 - REPX {pmulhrsw x, m1}, m0, m0, m0 - pmulhrsw m0, [o(pw_2048)] -%else imul r5d, [cq], 2896 mov [cq], eobd ; 0 + mov r3d, 2 add r5d, 2048 sar r5d, 12 imul r5d, 2896 add r5d, 2048 sar r5d, 12 +.end: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 -%endif pxor m4, m4 - REPX {mova x, m0}, m1, m2, m3 - TAIL_CALL m(idct_4x8_internal_16bpc).write_4x8 + mova m3, [o(pixel_10bpc_max)] + lea r2, [strideq*3] +.loop: + movq m1, [dstq+strideq*0] + movq m2, [dstq+strideq*2] + movhps m1, [dstq+strideq*1] + movhps m2, [dstq+r2] + paddw m1, m0 + paddw m2, m0 + REPX {pminsw x, m3}, m1, m2 + REPX {pmaxsw x, m4}, m1, m2 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movhps [dstq+r2 ], m2 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET %endif %endmacro @@ -624,6 +642,8 @@ cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 call m(idct_4x4_internal_16bpc).pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 test r3d, r3d jz .end_pass1 mova [cq+32*0+16], m0 @@ -656,7 +676,6 @@ cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 pxor m4, m4 REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 -.write_4x8: mova m7, [o(pixel_10bpc_max)] lea r2, [strideq*3] movq m5, [dstq+strideq*0] @@ -717,6 +736,8 @@ cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {psrad x, 12}, m5, m2, m1, m3 mova [r3], m2 call m(iadst_4x4_internal_16bpc).main2 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 test r5d, r5d jz .end_pass1 mova [cq+32*2+16], m0 @@ -813,3 +834,296 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 .pass2: mova m4, [o(pw_4096)] jmp m(idct_4x8_internal_16bpc).end + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x16 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 4 + add r5d, 6144 + sar r5d, 13 + jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m5, [o(pd_2048)] + mov r3d, 48 +.loop_pass1: + mova m0, [cq+64*0+r3] + mova m1, [cq+64*1+r3] + mova m2, [cq+64*2+r3] + mova m3, [cq+64*3+r3] + call m(idct_4x4_internal_16bpc).pass1_main + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m1, m4, m2 + REPX {psrad x, 1}, m0, m1, m4, m2 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r3d, r3d + jz .end_pass1 + mova [cq+64*0+r3], m0 + 
mova [cq+64*1+r3], m1 + sub r3d, 16 + jmp .loop_pass1 +.end_pass1: + mova m2, [cq+64*0+16] + mova m3, [cq+64*1+16] + mova m4, [cq+64*0+32] + mova m5, [cq+64*1+32] + mova m6, [cq+64*0+48] + mova m7, [cq+64*1+48] + ; m0-7 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_16x4_internal_8bpc, _ssse3).main + ; m0-6 is out0-13 [with odd registers having inversed output] + ; [coeffq+16*7] has out15/14 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [cq+16*7] + REPX {shufps x, x, q1032}, m1, m3, m5, m7 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova [cq+16*2], m6 + mova [cq+16*3], m7 +.end: + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova m7, [o(pixel_10bpc_max)] + mov r5d, 2 + lea r3, [strideq*3] +.loop: + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r3] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r3] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r3 ], m3 + dec r5d + jz .end2 + lea dstq, [dstq+strideq*8] + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 + jmp .loop +.end2: + RET + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mov r5d, 48 +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 + ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out4/11,5/10,6/9,7/8 + ; m0/3/6/1 = out0/15,3/12,1/14,2/13 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movhps [cq+0*8], m4 + movhps [cq+1*8], m2 + movhps [cq+2*8], m5 + movhps [cq+3*8], m7 + movhps [cq+4*8], m3 + movhps [cq+5*8], m1 + movhps [cq+6*8], m6 + movhps [cq+7*8], m0 + punpcklqdq m0, m6 + punpcklqdq m1, m3 + punpcklqdq m3, m2, m4 + punpcklqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mov r5d, 48 +.loop_pass1: + mova m5, 
[cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out3 out2 + packssdw m1, m4 ; out1 out0 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 + ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out11/4,10/5,9/6,8/7 + ; m0/3/6/1 = out15/0,12/3,14/1,13/2 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movq [cq+0*8], m4 + movq [cq+1*8], m2 + movq [cq+2*8], m5 + movq [cq+3*8], m7 + movq [cq+4*8], m3 + movq [cq+5*8], m1 + movq [cq+6*8], m6 + movq [cq+7*8], m0 + punpckhqdq m0, m6 + punpckhqdq m1, m3 + punpckhqdq m3, m2, m4 + punpckhqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m5, [o(pd_6144)] + mova m4, [o(pd_5793)] + mov r3d, 48 +.loop_pass1: + pmulld m0, m4, [cq+64*0+r3] + pmulld m1, m4, [cq+64*1+r3] + pmulld m2, m4, [cq+64*2+r3] + pmulld m3, m4, [cq+64*3+r3] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + test r3d, r3d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r3], m0 + mova [cq+64*1+r3], m1 + sub r3d, 16 + jmp .loop_pass1 +.pass2: + mova [cq+16*4], m0 + mova [cq+16*5], m1 + mova [cq+16*6], m2 + mova [cq+16*7], m7 + mova m0, [o(pw_1697x16)] + mova m7, [o(pw_2048)] + pmulhrsw m1, m0, m4 + pmulhrsw m2, m0, m5 + REPX {paddsw x, x}, m4, m5 + paddsw m4, m1 + paddsw m5, m2 + REPX {pmulhrsw x, m7}, m4, m5 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova m4, [cq+16*7] + pmulhrsw m1, m0, m6 + pmulhrsw m2, m0, m4 + REPX {paddsw x, x}, m6, m4 + paddsw m6, m1 + paddsw m4, m2 + REPX {pmulhrsw x, m7}, m6, m4 + mova [cq+16*2], m6 + mova [cq+16*3], m4 + mova m4, [cq+16*4] + mova m1, [cq+16*5] + mova m2, [cq+16*6] + pmulhrsw m5, m0, m2 + pmulhrsw m6, m0, m3 + REPX {paddsw x, x}, m2, m3 + paddsw m2, m5 + paddsw m3, m6 + pmulhrsw m6, m0, m1 + pmulhrsw m0, m4 + REPX {paddsw x, x}, m1, m4 + paddsw m1, m6 + paddsw m0, m4 + REPX {pmulhrsw x, m7}, m2, m3, m1, m0 + jmp m(idct_4x16_internal_16bpc).end diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index 03cec677a5..802aa94112 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -1869,7 +1869,7 @@ cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q ALIGN function_align -.main: +cglobal_label .main punpckhqdq m7, m0, m1 ;low:in1 high:in3 punpcklqdq m0, m1 punpcklqdq m1, m2, m3 @@ -1947,7 +1947,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(idct_16x4_internal_8bpc).pass2_end ALIGN function_align -.main: 
+cglobal_label .main mova [coeffq+16*6], m0 pshufd m0, m1, q1032 pshufd m2, m2, q1032 @@ -2070,7 +2070,7 @@ ALIGN function_align mova m3, [coeffq+16*5] ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] punpckhqdq m6, m2, m1 ;low:t11 high:t15a punpcklqdq m2, m1 ;low:t10 high:t14a From ebe91763b2d742f19eada6fe70f720d4f951edb6 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 18 Jul 2021 19:28:48 -0400 Subject: [PATCH 150/188] x86/itx: 8x4 inverse transforms hbd/sse4 --- src/x86/itx16_sse.asm | 416 +++++++++++++++++++++++++++++++++++++++++- src/x86/itx_sse.asm | 4 +- 2 files changed, 416 insertions(+), 4 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index b3cfb17fab..fa3d463c0b 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -90,6 +90,9 @@ pixel_10bpc_max: times 8 dw 0x03ff pw_1567_3784: times 4 dw 1567, 3784 pw_m3784_1567: times 4 dw -3784, 1567 +clip_min: times 4 dd -0x20000 +clip_max: times 4 dd 0x1ffff + cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 cextern iadst_4x4_internal_8bpc_ssse3.main cextern idct_4x8_internal_8bpc_ssse3.main @@ -97,6 +100,8 @@ cextern iadst_4x8_internal_8bpc_ssse3.main cextern idct_16x4_internal_8bpc_ssse3.main cextern iadst_16x4_internal_8bpc_ssse3.main cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end +cextern idct_8x4_internal_8bpc_ssse3.main +cextern iadst_8x4_internal_8bpc_ssse3.main SECTION .text @@ -247,8 +252,8 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax %endif %endmacro -%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size -cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, 8, dst, stride, c, eob, tx2 +%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_16bpc) %if ARCH_X86_32 LEA r6, $$ @@ -1127,3 +1132,410 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 paddsw m0, m4 REPX {pmulhrsw x, m7}, m2, m3, m1, m0 jmp m(idct_4x16_internal_16bpc).end + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, 0, 8x4, 14 +%else + INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .load + call .main_pass1 +.pack_transpose: + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 +.transpose: + ; transpose + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m7, m5, m4 + punpcklwd m5, m4 + + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + punpcklwd m2, m3, m7 + punpckhwd m3, m7 + ; m0-3 = packed & transposed output + jmp tx2q +.load: + mova m7, 
[o(pd_2896)] + pmulld m0, m7, [cq+0*16] + pmulld m1, m7, [cq+1*16] + pmulld m2, m7, [cq+2*16] + pmulld m3, m7, [cq+3*16] + pmulld m4, m7, [cq+4*16] + pmulld m5, m7, [cq+5*16] + pmulld m6, m7, [cq+6*16] + pmulld m7, [cq+7*16] +%if ARCH_X86_64 + mova m8, [o(pd_2048)] + REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [cq+0*16], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [cq+0*16] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_min)] + mova m13, [o(clip_max)] + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m3, [o(pd_2896)] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 +%else + mova [rsp+0*16+2*gprsize], m0 + mova [rsp+1*16+2*gprsize], m2 + mova [rsp+2*16+2*gprsize], m4 + mova [rsp+3*16+2*gprsize], m6 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m6, [o(clip_min)] + REPX {pmaxsd x, m6 }, m1, m2, m7, m4 + mova m6, [o(clip_max)] + REPX {pminsd x, m6 }, m1, m2, m7, m4 + mova m6, [rsp+3*16+2*gprsize] + mova [rsp+3*16+2*gprsize], m2 + mova m2, [rsp+1*16+2*gprsize] + mova [rsp+1*16+2*gprsize], m4 + + ITX_MULSUB_2D 2, 6, 4, 3, 5, 0, 1567, 3784 ; t2 t3 + mova m3, [o(pd_2896)] + mova m5, [rsp+0*16+2*gprsize] + mova m4, [rsp+2*16+2*gprsize] + REPX {pmulld x, m3 }, m5, m4, m7, m1 + paddd m7, m0 + paddd m0, m5 + + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + + mova m1, [o(clip_min)] + REPX {pmaxsd x, m1 }, m0, m6, m5, m3 + mova m1, [o(clip_max)] + REPX {pminsd x, m1 }, m0, m6, m5, m3 + + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + mova [rsp+0*16+2*gprsize], m6 + mova m6, [rsp+1*16+2*gprsize] + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + mova m6, [rsp+3*16+2*gprsize] + psubd m4, m3, m6 ; out4 + paddd m3, m6 ; out3 + mova m6, [rsp+0*16+2*gprsize] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x4_internal_8bpc, _ssse3).main +.end: + lea r3, [strideq*3] +.end2: + ; output is in m0-3 + mova m4, [o(pw_2048)] +.end3: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, m7}, m0, m1, m2, m3 + 
REPX {pmaxsw x, m4}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + RET + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load + call .main_pass1 + jmp m(idct_8x4_internal_16bpc).pack_transpose +.main_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_min)] + mova m13, [o(clip_max)] + + ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 + ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + mova m8, [o(pd_2896)] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m12}, m5, m3, m2, m9 + REPX {pminsd x, m13}, m5, m3, m2, m9 + REPX {pmulld x, m8 }, m5, m3, m2, m9 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m2, m9 ; (t6 - t7) * 2896 + paddd m2, m9 ; (t6 + t7) * 2896 + + ; m0=out0,m1=-out1,m6=out6,m7=-out7 + + pcmpeqd m8, m8 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 +%else + mova [rsp+0*16+2*gprsize], m2 + mova [rsp+1*16+2*gprsize], m3 + mova [rsp+2*16+2*gprsize], m4 + mova [rsp+3*16+2*gprsize], m5 + mova m5, [o(pd_2048)] + + ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a + mova m2, [rsp+0*16+2*gprsize] + mova m3, [rsp+1*16+2*gprsize] + mova m4, [rsp+2*16+2*gprsize] + mova [rsp+0*16+2*gprsize], m0 + mova [rsp+1*16+2*gprsize], m1 + mova [rsp+2*16+2*gprsize], m6 + mova m1, [rsp+3*16+2*gprsize] + mova [rsp+3*16+2*gprsize], m7 + ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a + mova m0, [rsp+0*16+2*gprsize] + mova m6, [rsp+2*16+2*gprsize] + psubd m7, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + mova [rsp+0*16+2*gprsize], m7 + mova m5, [rsp+1*16+2*gprsize] + mova m7, [rsp+3*16+2*gprsize] + psubd m4, m1, m5 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + mova m3, [o(clip_min)] + REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 + mova [rsp+1*16+2*gprsize], m7 + mova m7, [o(clip_max)] + pmaxsd m3, [rsp+0*16+2*gprsize] + REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 + pminsd m7, [rsp+1*16+2*gprsize] + mova [rsp+0*16+2*gprsize], m0 + mova [rsp+1*16+2*gprsize], m2 + mova [rsp+2*16+2*gprsize], m5 + mova [rsp+3*16+2*gprsize], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a + mova m5, [rsp+2*16+2*gprsize] + mova m7, [rsp+3*16+2*gprsize] + psubd m2, m6, m3 ; t7 + paddd m6, m3 ; out6 + mova [rsp+3*16+2*gprsize], m6 + mova m0, [rsp+0*16+2*gprsize] + mova m6, [rsp+1*16+2*gprsize] 
+ psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m6 ; t2 + paddd m0, m6 ; out0 + psubd m6, m1, m4 ; t6 + paddd m1, m4 ; -out1 + mova m4, [o(clip_min)] + REPX {pmaxsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(clip_max)] + REPX {pminsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(pd_2896)] + REPX {pmulld x, m4 }, m5, m3, m6, m2 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m6, m2 ; (t6 - t7) * 2896 + paddd m2, m6 ; (t6 + t7) * 2896 + mova [rsp+2*16+2*gprsize], m0 + + pcmpeqd m0, m0 + mova m6, [o(pd_2048)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 + + mova m6, [rsp+3*16+2*gprsize] + mova m0, [rsp+2*16+2*gprsize] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load + call m(iadst_8x4_internal_16bpc).main_pass1 + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x4_internal_16bpc).transpose +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + lea r3, [strideq*3] + add dstq, r3 + neg strideq + neg r3 + jmp m(idct_8x4_internal_16bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load + REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(idct_8x4_internal_16bpc).pack_transpose +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x4_internal_16bpc).end diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index 802aa94112..8808b10121 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -837,7 +837,7 @@ cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_8x4_internal_8bpc).end ALIGN function_align -.main: +cglobal_label .main mova m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret @@ -896,7 +896,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 RET ALIGN function_align -.main: +cglobal_label .main punpckhwd m6, m0, m2 ;unpacked in0 in2 punpcklwd m0, m2 ;unpacked in0 in2 punpckhwd m7, m1, m3 ;unpacked in1 in3 From 5bdb9c7f0a912b00e58ec971c8ca7f446298d0cb Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Tue, 20 Jul 2021 12:15:03 -0400 Subject: [PATCH 151/188] x86/itx: add eob-based fast path to 4x8 hbd/sse4 itx --- src/x86/itx16_sse.asm | 77 +++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index fa3d463c0b..636d4cc566 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -264,6 +264,9 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 jz %%end %endif lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%if %3 + add eobd, %3 +%endif call %%p1 RET %%end: @@ -590,8 +593,8 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 movhps [r5 +strideq*1], m1 RET -%macro INV_TXFM_4X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x8 +%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2 + INV_TXFM_FN %1, %2, %3, 4x8 %ifidn %1_%2, dct_dct imul r5d, [cq], 2896 mov [cq], eobd ; 0 @@ -631,29 +634,39 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %endmacro INV_TXFM_4X8_FN dct, dct -INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, identity, 9 INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp mova m5, [o(pd_2048)] - mov r3d, 16 +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 .loop_pass1: mova m3, [o(pd_2896)] - pmulld m0, m3, [cq+32*0+r3] - pmulld m1, m3, [cq+32*1+r3] - pmulld m2, m3, [cq+32*2+r3] - pmulld m3, [cq+32*3+r3] + pmulld m0, m3, [cq+32*0+r5] + pmulld m1, m3, [cq+32*1+r5] + pmulld m2, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 call m(idct_4x4_internal_16bpc).pass1_main packssdw m0, m1 ; out0 out1 packssdw m4, m2 ; out2 out3 - test r3d, r3d + test r5d, r5d jz .end_pass1 mova [cq+32*0+16], m0 mova [cq+32*1+16], m4 - xor r3d, r3d + xor r5d, r5d jmp .loop_pass1 .end_pass1: punpckhwd m2, m0, m4 @@ -711,7 +724,7 @@ cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst -INV_TXFM_4X8_FN adst, identity +INV_TXFM_4X8_FN adst, identity, 9 cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .pass1_main @@ -728,7 +741,17 @@ cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; m0-3 = packed & transposed output jmp tx2q .pass1_main: - mov r5d, 16 +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 lea r3, [cq+32*1+16] .loop_pass1: mova m0, [o(pd_2048)] @@ -764,7 +787,7 @@ cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst -INV_TXFM_4X8_FN flipadst, identity +INV_TXFM_4X8_FN flipadst, identity, 9 cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_16bpc).pass1_main @@ -799,18 +822,30 @@ cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst -INV_TXFM_4X8_FN identity, identity +INV_TXFM_4X8_FN identity, identity, 3 cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp mova m5, [o(pd_2048)] mova m4, [o(pd_2896)] mova m6, [o(pd_5793)] - mov r3d, 16 + ; clear m7 in case we skip the 
bottom square + pxor m7, m7 +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 16 + setge r5b +%else + mov r5d, 1 + cmp eobd, 16 + sbb r5d, 0 +%endif + shl r5d, 4 .loop_pass1: - pmulld m0, m4, [cq+32*0+r3] - pmulld m1, m4, [cq+32*1+r3] - pmulld m2, m4, [cq+32*2+r3] - pmulld m3, m4, [cq+32*3+r3] + pmulld m0, m4, [cq+32*0+r5] + pmulld m1, m4, [cq+32*1+r5] + pmulld m2, m4, [cq+32*2+r5] + pmulld m3, m4, [cq+32*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {pmulld x, m6}, m0, m1, m2, m3 @@ -818,11 +853,11 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {psrad x, 12}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 - test r3d, r3d + test r5d, r5d jz .end_pass1 mova [cq+32*0+16], m0 mova m7, m2 - xor r3d, r3d + xor r5d, r5d jmp .loop_pass1 .end_pass1: punpckhwd m4, m0, m2 From 2fae040d106420817951bbd5c1644705e00d956b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 20 Jul 2021 13:57:18 -0400 Subject: [PATCH 152/188] x86/itx: add eob-based fast path to 4x16 hbd/sse4 itx --- src/x86/itx16_sse.asm | 130 ++++++++++++++++++++++++++++++++---------- 1 file changed, 99 insertions(+), 31 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 636d4cc566..3b7a066761 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -103,6 +103,10 @@ cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end cextern idct_8x4_internal_8bpc_ssse3.main cextern iadst_8x4_internal_8bpc_ssse3.main +tbl_4x16_2d: db 0, 13, 29, 45 +tbl_4x16_h: db 0, 16, 32, 48 +tbl_4x16_v: db 0, 4, 8, 12 + SECTION .text %macro REPX 2-* @@ -264,8 +268,12 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 jz %%end %endif lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 %if %3 add eobd, %3 +%endif +%else + lea r5, [o(%3)] %endif call %%p1 RET @@ -274,13 +282,17 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else -%if %3 - add eobd, %3 -%endif ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align @@ -875,8 +887,8 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m4, [o(pw_4096)] jmp m(idct_4x8_internal_16bpc).end -%macro INV_TXFM_4X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x16 +%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2 + INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 %ifidn %1_%2, dct_dct imul r5d, [cq], 2896 mov [cq], eobd ; 0 @@ -888,18 +900,32 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %endmacro INV_TXFM_4X16_FN dct, dct -INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, identity, v INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif mova m5, [o(pd_2048)] - mov r3d, 48 .loop_pass1: - mova m0, [cq+64*0+r3] - mova m1, [cq+64*1+r3] - mova m2, [cq+64*2+r3] - mova m3, [cq+64*3+r3] + mova m0, [cq+64*0+r5] + mova m1, [cq+64*1+r5] + mova m2, [cq+64*2+r5] + mova m3, [cq+64*3+r5] call m(idct_4x4_internal_16bpc).pass1_main pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m1, m4, m2 @@ -910,11 +936,11 @@ cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 punpcklwd m0, m4 punpckhwd m1, m0, m2 punpcklwd m0, m2 - test r3d, r3d + test r5d, r5d jz .end_pass1 - mova [cq+64*0+r3], m0 - mova [cq+64*1+r3], m1 - sub r3d, 16 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 jmp .loop_pass1 .end_pass1: mova m2, [cq+64*0+16] @@ -985,10 +1011,24 @@ cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst -INV_TXFM_4X16_FN adst, identity +INV_TXFM_4X16_FN adst, identity, v cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 - mov r5d, 48 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r6+r5] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif .loop_pass1: mova m5, [cq+64*0+r5] lea r3, [cq+64*1+r5] @@ -1044,10 +1084,24 @@ cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst -INV_TXFM_4X16_FN flipadst, identity +INV_TXFM_4X16_FN flipadst, identity, v cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 - mov r5d, 48 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif .loop_pass1: mova m5, [cq+64*0+r5] lea r3, [cq+64*1+r5] @@ -1100,20 +1154,34 @@ cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 punpckhqdq m2, m7, m5 jmp m(idct_4x16_internal_16bpc).end -INV_TXFM_4X16_FN identity, dct -INV_TXFM_4X16_FN identity, adst -INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, dct, h +INV_TXFM_4X16_FN identity, adst, h +INV_TXFM_4X16_FN identity, flipadst, h 
INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif mova m5, [o(pd_6144)] mova m4, [o(pd_5793)] - mov r3d, 48 .loop_pass1: - pmulld m0, m4, [cq+64*0+r3] - pmulld m1, m4, [cq+64*1+r3] - pmulld m2, m4, [cq+64*2+r3] - pmulld m3, m4, [cq+64*3+r3] + pmulld m0, m4, [cq+64*0+r5] + pmulld m1, m4, [cq+64*1+r5] + pmulld m2, m4, [cq+64*2+r5] + pmulld m3, m4, [cq+64*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 13}, m0, m1, m2, m3 packssdw m0, m1 @@ -1122,11 +1190,11 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 punpcklwd m0, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 - test r3d, r3d + test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 - mova [cq+64*0+r3], m0 - mova [cq+64*1+r3], m1 - sub r3d, 16 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 jmp .loop_pass1 .pass2: mova [cq+16*4], m0 From 8d3369d6f5435397eb2c678fe1493a86c5361026 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 21 Jul 2021 08:46:48 -0400 Subject: [PATCH 153/188] x86/itx: 8x8 inverse transforms hbd/sse4 --- src/x86/itx16_sse.asm | 426 +++++++++++++++++++++++++++++++++++++----- src/x86/itx_sse.asm | 8 +- 2 files changed, 382 insertions(+), 52 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 3b7a066761..66648a1cb3 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -77,6 +77,7 @@ pw_4x2048_4xm2048: times 4 dw 2048 pw_4xm2048_4x2048: times 4 dw -2048 times 4 dw 2048 pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 pd_3803: times 4 dd 3803 pw_4096: times 8 dw 4096 pd_5793: times 4 dd 5793 @@ -102,6 +103,10 @@ cextern iadst_16x4_internal_8bpc_ssse3.main cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end cextern idct_8x4_internal_8bpc_ssse3.main cextern iadst_8x4_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.pass1_end3 +cextern iadst_8x8_internal_8bpc_ssse3.main +cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end tbl_4x16_2d: db 0, 13, 29, 45 tbl_4x16_h: db 0, 16, 32, 48 @@ -1280,7 +1285,11 @@ INV_TXFM_8X4_FN dct, flipadst cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif call .main_pass1 + call .round .pack_transpose: packssdw m0, m1 packssdw m2, m3 @@ -1354,7 +1363,8 @@ cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 psubd m5, m2 ; dct4 out2 REPX {pmaxsd x, m12}, m0, m6, m5, m3 REPX {pminsd x, m13}, m0, m6, m5, m3 - + ret +.round: paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 psubd m7, m0, m9 ; out7 @@ -1364,10 +1374,10 @@ cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 %else - mova [rsp+0*16+2*gprsize], m0 - mova [rsp+1*16+2*gprsize], m2 - mova [rsp+2*16+2*gprsize], m4 - mova [rsp+3*16+2*gprsize], m6 + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m4 + mova [r3+3*16], m6 mova m0, [o(pd_2048)] ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a @@ -1379,15 +1389,15 @@ cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {pmaxsd x, m6 }, m1, m2, m7, m4 mova m6, [o(clip_max)] REPX {pminsd x, m6 }, m1, m2, m7, m4 - mova m6, [rsp+3*16+2*gprsize] - mova 
[rsp+3*16+2*gprsize], m2 - mova m2, [rsp+1*16+2*gprsize] - mova [rsp+1*16+2*gprsize], m4 + mova m6, [r3+3*16] + mova [r3+3*16], m2 + mova m2, [r3+1*16] + mova [r3+1*16], m4 ITX_MULSUB_2D 2, 6, 4, 3, 5, 0, 1567, 3784 ; t2 t3 mova m3, [o(pd_2896)] - mova m5, [rsp+0*16+2*gprsize] - mova m4, [rsp+2*16+2*gprsize] + mova m5, [r3+0*16] + mova m4, [r3+2*16] REPX {pmulld x, m3 }, m5, m4, m7, m1 paddd m7, m0 paddd m0, m5 @@ -1406,19 +1416,20 @@ cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {pmaxsd x, m1 }, m0, m6, m5, m3 mova m1, [o(clip_max)] REPX {pminsd x, m1 }, m0, m6, m5, m3 - + ret +.round: paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 - mova [rsp+0*16+2*gprsize], m6 - mova m6, [rsp+1*16+2*gprsize] + mova [r3+0*16], m6 + mova m6, [r3+1*16] psubd m7, m0, m6 ; out7 paddd m0, m6 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 - mova m6, [rsp+3*16+2*gprsize] + mova m6, [r3+3*16] psubd m4, m3, m6 ; out4 paddd m3, m6 ; out3 - mova m6, [rsp+0*16+2*gprsize] + mova m6, [r3+0*16] %endif ret @@ -1456,7 +1467,11 @@ INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(idct_8x4_internal_16bpc).load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif call .main_pass1 + call .round jmp m(idct_8x4_internal_16bpc).pack_transpose .main_pass1: %if ARCH_X86_64 @@ -1496,6 +1511,8 @@ cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 paddd m3, m5 ; (t2 + t3) * 2896 psubd m5, m2, m9 ; (t6 - t7) * 2896 paddd m2, m9 ; (t6 + t7) * 2896 + ret +.round: ; m0=out0,m1=-out1,m6=out6,m7=-out7 @@ -1505,58 +1522,58 @@ cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {paddd x, m11}, m2, m3, m4, m5 REPX {psrad x, 12 }, m2, m3, m4, m5 %else - mova [rsp+0*16+2*gprsize], m2 - mova [rsp+1*16+2*gprsize], m3 - mova [rsp+2*16+2*gprsize], m4 - mova [rsp+3*16+2*gprsize], m5 + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 mova m5, [o(pd_2048)] ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a - mova m2, [rsp+0*16+2*gprsize] - mova m3, [rsp+1*16+2*gprsize] - mova m4, [rsp+2*16+2*gprsize] - mova [rsp+0*16+2*gprsize], m0 - mova [rsp+1*16+2*gprsize], m1 - mova [rsp+2*16+2*gprsize], m6 - mova m1, [rsp+3*16+2*gprsize] - mova [rsp+3*16+2*gprsize], m7 + mova m2, [r3+0*16] + mova m3, [r3+1*16] + mova m4, [r3+2*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m6 + mova m1, [r3+3*16] + mova [r3+3*16], m7 ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a - mova m0, [rsp+0*16+2*gprsize] - mova m6, [rsp+2*16+2*gprsize] + mova m0, [r3+0*16] + mova m6, [r3+2*16] psubd m7, m2, m6 ; t6 paddd m2, m6 ; t2 psubd m6, m0, m4 ; t4 paddd m0, m4 ; t0 - mova [rsp+0*16+2*gprsize], m7 - mova m5, [rsp+1*16+2*gprsize] - mova m7, [rsp+3*16+2*gprsize] + mova [r3+0*16], m7 + mova m5, [r3+1*16] + mova m7, [r3+3*16] psubd m4, m1, m5 ; t7 paddd m5, m1 ; t3 psubd m1, m7, m3 ; t5 paddd m7, m3 ; t1 mova m3, [o(clip_min)] REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 - mova [rsp+1*16+2*gprsize], m7 + mova [r3+1*16], m7 mova m7, [o(clip_max)] - pmaxsd m3, [rsp+0*16+2*gprsize] + pmaxsd m3, [r3+0*16] REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 - pminsd m7, [rsp+1*16+2*gprsize] - mova [rsp+0*16+2*gprsize], m0 - mova [rsp+1*16+2*gprsize], m2 - mova [rsp+2*16+2*gprsize], m5 - mova [rsp+3*16+2*gprsize], m7 + pminsd m7, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova 
[r3+3*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a - mova m5, [rsp+2*16+2*gprsize] - mova m7, [rsp+3*16+2*gprsize] + mova m5, [r3+2*16] + mova m7, [r3+3*16] psubd m2, m6, m3 ; t7 paddd m6, m3 ; out6 - mova [rsp+3*16+2*gprsize], m6 - mova m0, [rsp+0*16+2*gprsize] - mova m6, [rsp+1*16+2*gprsize] + mova [r3+3*16], m6 + mova m0, [r3+0*16] + mova m6, [r3+1*16] psubd m3, m7, m5 ; t3 paddd m7, m5 ; -out7 psubd m5, m0, m6 ; t2 @@ -1573,7 +1590,9 @@ cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 paddd m3, m5 ; (t2 + t3) * 2896 psubd m5, m6, m2 ; (t6 - t7) * 2896 paddd m2, m6 ; (t6 + t7) * 2896 - mova [rsp+2*16+2*gprsize], m0 + ret +.round: + mova [r3+2*16], m0 pcmpeqd m0, m0 mova m6, [o(pd_2048)] @@ -1582,8 +1601,8 @@ cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 REPX {paddd x, m6 }, m2, m3, m4, m5 REPX {psrad x, 12 }, m2, m3, m4, m5 - mova m6, [rsp+3*16+2*gprsize] - mova m0, [rsp+2*16+2*gprsize] + mova m6, [r3+3*16] + mova m0, [r3+2*16] %endif ret @@ -1601,7 +1620,11 @@ INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(idct_8x4_internal_16bpc).load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x4_internal_16bpc).round packssdw m7, m6 packssdw m5, m4 packssdw m3, m2 @@ -1642,3 +1665,310 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 paddsw m2, m6 paddsw m3, m7 jmp m(idct_8x4_internal_16bpc).end + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, %3, 8x8, 14, 0-3*16 +%else + INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 6144 + sar r5d, 13 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] + mov r5d, 2 +.loop: + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + lea dstq, [dstq+strideq*4] + dec r5d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity, 6 +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + mov [rsp+4*16+1*gprsize], r1 +%else + DECLARE_REG_TMP 6 +%endif + lea t0, [o(.pass1_main)] + +.pass1_full: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 10 + setge r5b +%else + mov r5d, 1 + cmp eobd, 10 + sbb r5d, 0 +%endif + shl r5d, 4 +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*32+r5] + mova m1, [cq+1*32+r5] + mova m2, [cq+2*32+r5] + mova m3, [cq+3*32+r5] + mova m4, [cq+4*32+r5] + mova m5, [cq+5*32+r5] + mova m6, [cq+6*32+r5] + mova m7, [cq+7*32+r5] + call t0 + + test r5d, r5d + jz .end_pass1 + + mova [cq+0*32+16], m0 + mova [cq+1*32+16], m1 + mova [cq+2*32+16], m2 + mova [cq+3*32+16], m3 + + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m4, [cq+0*32+16] + mova m5, [cq+1*32+16] + mova m6, [cq+2*32+16] + mova m7, [cq+3*32+16] +%if ARCH_X86_32 + mov r1, [rsp+4*16+1*gprsize] +%endif + jmp tx2q +.pass1_main: + call 
m(idct_8x4_internal_16bpc).main_pass1 + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + call m(idct_8x4_internal_16bpc).round + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 +.pack_and_transpose: + packssdw m2, m3 + packssdw m6, m7 + packssdw m0, m1 + packssdw m4, m5 +.transpose: + punpcklwd m7, m2, m6 + punpckhwd m2, m6 + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + + punpckhwd m4, m5, m2 + punpcklwd m5, m2 + punpckhwd m2, m0, m7 + punpcklwd m0, m7 + + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+0*16] +.end: + lea r3, [strideq*3] +%if ARCH_X86_64 +%define mzero m8 +%define mlim m11 +%else + mova [rsp+0*16+gprsize], m6 + mova [rsp+1*16+gprsize], m7 +%define mzero m6 +%define mlim m7 +%endif + pxor mzero, mzero + REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova mlim, [o(pixel_10bpc_max)] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, mlim }, m0, m1, m2, m3 + REPX {pmaxsw x, mzero}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] +%if ARCH_X86_32 + SWAP 2, 6 + SWAP 3, 7 + mova m6, [rsp+0*16+gprsize] + mova m7, [rsp+1*16+gprsize] +%define mzero m2 +%define mlim m3 +%endif + paddw m4, [dstq+strideq*0] + paddw m5, [dstq+strideq*1] + paddw m6, [dstq+strideq*2] + paddw m7, [dstq+r3] + REPX {pminsw x, mlim }, m4, m5, m6, m7 + REPX {pmaxsw x, mzero}, m4, m5, m6, m7 + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m5 + mova [dstq+strideq*2], m6 + mova [dstq+r3 ], m7 +%undef mzero +%undef mlim + RET + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity, 6 + +cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call .round + jmp m(idct_8x8_internal_16bpc).pack_and_transpose +.round: +%if ARCH_X86_64 + pcmpeqd m8, m8 ; -1 + mova m11, [o(pd_6144)] + REPX {psubd x, m8 }, m0, m6 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m0, m1, m6, m7 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 +%else + mova [r3+2*16], m0 + + pcmpeqd m0, m0 ; -1 + mova m6, [o(pd_6144)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m1, m7 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 + + mova m0, [r3+2*16] + psrld m6, 12 ; +1 + paddd m0, m6 + paddd m6, [r3+3*16] + REPX {psrad x, 1 }, m0, m6 +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova m7, [o(pw_m2048)] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + jmp m(idct_8x8_internal_16bpc).end + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity, 6 + +cglobal 
iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x8_internal_16bpc).round + ; invert registers + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x8_internal_16bpc).transpose + +.pass2: + lea dstq, [dstq+strideq*8] + sub dstq, strideq + neg strideq + jmp m(iadst_8x8_internal_16bpc).pass2 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+0*32] + mova m1, [cq+1*32] + mova m2, [cq+2*32] + mova m3, [cq+3*32] + mova m4, [cq+4*32] + mova m5, [cq+5*32] + mova m6, [cq+6*32] + mova m7, [cq+7*32] + packssdw m0, [cq+0*32+16] + packssdw m1, [cq+1*32+16] + packssdw m2, [cq+2*32+16] + packssdw m3, [cq+3*32+16] + packssdw m4, [cq+4*32+16] + packssdw m5, [cq+5*32+16] + packssdw m6, [cq+6*32+16] + packssdw m7, [cq+7*32+16] + mova [rsp+gprsize+16*1], m6 + jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_4096)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize+0*16], m7 + mova m7, [o(pw_4096)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+0*16] +%endif + jmp m(idct_8x8_internal_16bpc).end diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index 8808b10121..d547ca4a56 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -1128,7 +1128,7 @@ cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize+16*0] -.pass1_end3: +cglobal_label .pass1_end3 punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 @@ -1190,7 +1190,7 @@ cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ret ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m1 @@ -1259,7 +1259,7 @@ ALIGN function_align jmp m(idct_8x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m4 @@ -1344,7 +1344,7 @@ ALIGN function_align mova m6, [rsp+gprsize*2+16*2] ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end paddsw m7, m4, m3 ;t2 + t3 psubsw m4, m3 ;t2 - t3 paddsw m3, m5, m2 ;t6 + t7 From 59b6e82e0ebea8e5b1ad076d568a5ec8fddbfd9f Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Wed, 21 Jul 2021 13:01:34 -0400 Subject: [PATCH 154/188] x86/itx: 8x16 inverse transforms hbd/sse4 --- src/x86/itx16_sse.asm | 375 +++++++++++++++++++++++++++++++++++++++++- src/x86/itx_sse.asm | 6 +- 2 files changed, 376 insertions(+), 5 deletions(-) diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm index 66648a1cb3..b55c7e8997 100644 --- a/src/x86/itx16_sse.asm +++ b/src/x86/itx16_sse.asm @@ -107,11 +107,18 @@ cextern idct_8x8_internal_8bpc_ssse3.main cextern idct_8x8_internal_8bpc_ssse3.pass1_end3 cextern iadst_8x8_internal_8bpc_ssse3.main cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end +cextern idct_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end tbl_4x16_2d: db 0, 13, 29, 45 tbl_4x16_h: db 0, 16, 32, 48 tbl_4x16_v: db 0, 4, 8, 12 +tbl_8x16_2d: db 0, 14, 30, 46 +tbl_8x16_v: db 0, 4, 8, 12 +tbl_8x16_h: db 0, 32, 64, 96 + SECTION .text %macro REPX 2-* @@ -1675,6 +1682,8 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct imul r5d, [cq], 2896 mov [cq], eobd ; 0 + mov r3d, 2 +.end: add r5d, 6144 sar r5d, 13 imul r5d, 2896 @@ -1685,7 +1694,6 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 lea r2, [strideq*3] - mov r5d, 2 .loop: mova m1, [dstq+strideq*0] mova m2, [dstq+strideq*1] @@ -1699,7 +1707,7 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova [dstq+strideq*2], m3 mova [dstq+r2 ], m4 lea dstq, [dstq+strideq*4] - dec r5d + dec r3d jg .loop RET %endif @@ -1972,3 +1980,366 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 pmulhrsw m7, [rsp+gprsize+0*16] %endif jmp m(idct_8x8_internal_16bpc).end + +%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 14, 0-16*16 +%else + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + mov r3d, 4 +%if stack_size_padded > 0 + ; adjust to caller's stack allocation + add rsp, (12+ARCH_X86_64)*16 +%endif + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, v +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%endif + +cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] +.pass1_full: +%undef cmp + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, [rsp+16*16+2*gprsize] + ; setup stack pointer + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m7, [o(pd_2896)] + pmulld m0, m7, [cq+0*64+r5] + pmulld m1, m7, [cq+1*64+r5] + pmulld m2, m7, [cq+2*64+r5] + pmulld m3, m7, [cq+3*64+r5] + pmulld m4, m7, [cq+4*64+r5] + pmulld m5, m7, [cq+5*64+r5] + pmulld m6, m7, [cq+6*64+r5] + pmulld m7, [cq+7*64+r5] +%if ARCH_X86_64 + mova m8, [o(pd_2048)] + REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize+0*16], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [rsp+gprsize+0*16] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + call t0 + + mova [cq+0*64+r5], m0 + mova 
[cq+1*64+r5], m1 + mova [cq+2*64+r5], m2 + mova [cq+3*64+r5], m3 + sub r5d, 16 + jge .loop_pass1 +%if WIN64 + POP r7 +%elif ARCH_X86_32 + mov r1, [rsp+16*16+1*gprsize] +%endif + jmp tx2q + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + + ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 + ; some are still pre-loaded from the final loop iteration in pass=1 + + mova m1, m2 + mova m2, [cq+ 1*16] + mova m3, [cq+ 9*16] + mova m4, [cq+ 2*16] + mova m5, [cq+10*16] + mova m6, [cq+ 3*16] + mova m7, [cq+11*16] + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova [rsp+gprsize+3*16], m0 + mova [rsp+gprsize+4*16], m1 + mova [rsp+gprsize+5*16], m2 + mova [rsp+gprsize+6*16], m3 + mova [rsp+gprsize+7*16], m4 + mova [rsp+gprsize+8*16], m5 + mova [rsp+gprsize+9*16], m6 + ; m7 is already stored in [rsp+gprsize+0*16] + mova m0, [cq+ 4*16] + mova m1, [cq+12*16] + mova m2, [cq+ 5*16] + mova m3, [cq+13*16] + mova m4, [cq+ 6*16] + mova m5, [cq+14*16] + mova m6, [cq+ 7*16] + mova m7, [cq+15*16] + call m_suffix(idct_16x8_internal_8bpc, _ssse3).main + + ; out0-7 is in rsp+gprsize+3-10*mmsize + ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize + +%if ARCH_X86_64 +%define mzero m8 +%define mlim m9 +%define mula m10 +%define mulb m11 +%else +%define mzero m4 +%define mlim m5 +%define mula m6 +%define mulb m7 +%endif + mova m7, [rsp+gprsize+0*16] +%if ARCH_X86_32 + mova [rsp+gprsize+11*16], m4 + mova [rsp+gprsize+12*16], m5 + mova [rsp+gprsize+13*16], m6 + mova [rsp+gprsize+14*16], m7 +%endif + + mova mula, [o(pw_2048)] + mova mulb, mula +.end: + lea r3, [strideq*3] + lea r5, [dstq+strideq*8] + pxor mzero, mzero + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + mova mlim, [o(pixel_10bpc_max)] + call .write_8x4 + lea r5, [dstq+r3*4] +%if ARCH_X86_64 + mova m0, m4 + mova m1, m5 + mova m2, m6 + mova m3, m7 +%else + mova m0, [rsp+gprsize+11*16] + mova m1, [rsp+gprsize+12*16] + mova m2, [rsp+gprsize+13*16] + mova m3, [rsp+gprsize+14*16] +%endif + call .write_8x4 + mov r5, dstq + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + call .write_8x4 + lea r5, [dstq+strideq*4] + mova m0, [rsp+gprsize+ 7*16] + mova m1, [rsp+gprsize+ 8*16] + mova m2, [rsp+gprsize+ 9*16] + mova m3, [rsp+gprsize+10*16] + call .write_8x4 + RET +.write_8x4: + REPX {pmulhrsw x, mula}, m0, m2 + REPX {pmulhrsw x, mulb}, m1, m3 + paddw m0, [r5+strideq*0] + paddw m1, [r5+strideq*1] + paddw m2, [r5+strideq*2] + paddw m3, [r5+r3] + REPX {pminsw x, mlim }, m0, m1, m2, m3 + REPX {pmaxsw x, mzero}, m0, m1, m2, m3 + mova [r5+strideq*0], m0 + mova [r5+strideq*1], m1 + mova [r5+strideq*2], m2 + mova [r5+r3 ], m3 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, v + +cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m4, [cq+ 9*16] + mova m5, [cq+13*16] + mova [rsp+gprsize+7*16], m0 + mova [rsp+gprsize+8*16], m1 + mova [rsp+gprsize+5*16], m4 + mova [rsp+gprsize+6*16], m5 + mova m0, m2 + mova m1, m3 + mova m2, [cq+ 1*16] + mova m3, [cq+ 5*16] + mova m4, [cq+ 
2*16] + mova m5, [cq+ 6*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + mova [rsp+gprsize+ 3*16], m4 + mova [rsp+gprsize+ 4*16], m5 + mova [rsp+gprsize+ 9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m4, [cq+10*16] + mova m5, [cq+14*16] + mova m6, [cq+ 3*16] + mova m7, [cq+ 7*16] + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end + mova m7, [rsp+gprsize+0*16] +%if ARCH_X86_32 + mova [rsp+gprsize+11*16], m4 + mova [rsp+gprsize+12*16], m5 + mova [rsp+gprsize+13*16], m6 + mova [rsp+gprsize+14*16], m7 +%endif + mova mula, [o(pw_2048)] + mova mulb, [o(pw_m2048)] + jmp m(idct_8x16_internal_16bpc).end + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, v + +cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: + lea r3, [strideq*3] + lea r3, [r3*5] + add dstq, r3 + neg strideq + jmp m(iadst_8x16_internal_16bpc).pass2 + +INV_TXFM_8X16_FN identity, dct, h +INV_TXFM_8X16_FN identity, adst, h +INV_TXFM_8X16_FN identity, flipadst, h +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_1697x16)] +%endif + call .main + mova [rsp+ 3*16+gprsize], m0 + mova [rsp+ 4*16+gprsize], m1 + mova [rsp+ 5*16+gprsize], m2 + mova [rsp+ 6*16+gprsize], m3 + mova m0, [cq+ 1*16] + mova m1, [cq+ 5*16] + mova m2, [cq+ 9*16] + mova m3, [cq+13*16] + call .main + mova [rsp+ 7*16+gprsize], m0 + mova [rsp+ 8*16+gprsize], m1 + mova [rsp+ 9*16+gprsize], m2 + mova [rsp+10*16+gprsize], m3 +%if ARCH_X86_32 + mova m0, [cq+ 3*16] + mova m1, [cq+ 7*16] + mova m2, [cq+11*16] + mova m3, [cq+15*16] + call .main + mova [rsp+11*16+gprsize], m0 + mova [rsp+12*16+gprsize], m1 + mova [rsp+13*16+gprsize], m2 + mova [rsp+14*16+gprsize], m3 +%endif + mova m0, [cq+ 2*16] + mova m1, [cq+ 6*16] + mova m2, [cq+10*16] + mova m3, [cq+14*16] + call .main +%if ARCH_X86_64 + mova m4, [cq+ 3*16] + mova m5, [cq+ 7*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + pmulhrsw m9, m8, m4 + pmulhrsw m10, m8, m5 + pmulhrsw m11, m8, m6 + pmulhrsw m8, m7 + REPX {paddsw x, x}, m4, m5, m6, m7 + paddsw m4, m9 + paddsw m5, m10 + paddsw m6, m11 + paddsw m7, m8 +%endif + mova mula, [o(pw_2048)] + mova mulb, mula + jmp m(idct_8x16_internal_16bpc).end +.main: + ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) +%if ARCH_X86_32 + mova m7, [o(pw_1697x16)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 +%else + pmulhrsw m4, m8, m0 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 +%endif + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + ret +%undef mula +%undef mulb +%undef mlim +%undef mzero diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm index d547ca4a56..5fcbdaa27e 100644 --- a/src/x86/itx_sse.asm +++ b/src/x86/itx_sse.asm @@ -2482,7 +2482,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ALIGN 
function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 mova [rsp+gprsize*2+32*5], m5 @@ -2620,7 +2620,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_8x8_internal_8bpc).pass2_main ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*0], m1 mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 @@ -2813,7 +2813,7 @@ ALIGN function_align mova m6, [rsp+gprsize*2+16*15] ;out14 ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] mova m1, [rsp+gprsize*2+16* 9] mova m2, [rsp+gprsize*2+16*14] From d34234b00fc366b3034a53c79df9919425a3198c Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Tue, 3 Aug 2021 16:36:32 -0400 Subject: [PATCH 155/188] Enable HBD SSSE3 put/prep/avg x86 assembly --- build.rs | 1 + src/asm/x86/mc.rs | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/build.rs b/build.rs index 4fd2bcfd12..ccfe0e4669 100644 --- a/build.rs +++ b/build.rs @@ -105,6 +105,7 @@ fn build_nasm_files() { "src/x86/mc16_avx2.asm", "src/x86/mc_avx512.asm", "src/x86/mc_sse.asm", + "src/x86/mc16_sse.asm", "src/x86/me.asm", "src/x86/sad_sse2.asm", "src/x86/sad_avx.asm", diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index 41897c3742..541deb41a3 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -332,6 +332,14 @@ macro_rules! decl_mc_hbd_fns { )* } + static PUT_HBD_FNS_SSSE3: [Option; 16] = { + let mut out: [Option; 16] = [None; 16]; + $( + out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); + )* + out + }; + static PUT_HBD_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( @@ -358,7 +366,7 @@ decl_mc_hbd_fns!( cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], - [AVX2] + [SSSE3, AVX2] ); macro_rules! decl_mct_fns { @@ -440,6 +448,14 @@ macro_rules! 
decl_mct_hbd_fns { )* } + static PREP_HBD_FNS_SSSE3: [Option; 16] = { + let mut out: [Option; 16] = [None; 16]; + $( + out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); + )* + out + }; + static PREP_HBD_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( @@ -466,7 +482,7 @@ decl_mct_hbd_fns!( cpu_function_lookup_table!( PREP_HBD_FNS: [[Option; 16]], default: [None; 16], - [AVX2] + [SSSE3, AVX2] ); extern { @@ -480,6 +496,11 @@ extern { tmp2: *const i16, w: i32, h: i32, ); + fn rav1e_avg_16bpc_ssse3( + dst: *mut u16, dst_stride: libc::ptrdiff_t, tmp1: *const i16, + tmp2: *const i16, w: i32, h: i32, bitdepth_max: i32, + ); + fn rav1e_avg_16bpc_avx2( dst: *mut u16, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, bitdepth_max: i32, @@ -495,7 +516,7 @@ cpu_function_lookup_table!( cpu_function_lookup_table!( AVG_HBD_FNS: [Option], default: None, - [(AVX2, Some(rav1e_avg_16bpc_avx2))] + [(SSSE3, Some(rav1e_avg_16bpc_ssse3)), (AVX2, Some(rav1e_avg_16bpc_avx2))] ); #[cfg(test)] From 984515f4eda3b3f22c9fc6d0525d8e3857cce5cb Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Fri, 6 Aug 2021 09:25:55 -0400 Subject: [PATCH 156/188] Enable SSE4.1 HBD Inverse Transform x86 assembly --- build.rs | 1 + src/asm/x86/transform/inverse.rs | 30 +++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index ccfe0e4669..502c759697 100644 --- a/build.rs +++ b/build.rs @@ -99,6 +99,7 @@ fn build_nasm_files() { "src/x86/itx_avx2.asm", "src/x86/itx_sse.asm", "src/x86/itx16_avx2.asm", + "src/x86/itx16_sse.asm", "src/x86/looprestoration_avx2.asm", "src/x86/looprestoration16_avx2.asm", "src/x86/mc_avx2.asm", diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs index b5736b6fa7..2fc0941eb6 100644 --- a/src/asm/x86/transform/inverse.rs +++ b/src/asm/x86/transform/inverse.rs @@ -107,6 +107,7 @@ macro_rules! 
decl_itx_hbd_fns { )* // Create a lookup table for the tx types declared above const []: [Option; TX_TYPES] = { + #[allow(unused_mut)] let mut out: [Option; 16] = [None; 16]; $( $( @@ -306,10 +307,37 @@ impl_itx_hbd_fns!( [(avx2, AVX2)] ); +impl_itx_hbd_fns!( + // 64x + [], + [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], + // 32x + [], + [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], + // 16x + [], + [(16, 16), (16, 8), (16, 4)], + // 8x and 4x + [ + (TxType::DCT_ADST, dct, adst), + (TxType::ADST_DCT, adst, dct), + (TxType::DCT_FLIPADST, dct, flipadst), + (TxType::FLIPADST_DCT, flipadst, dct), + (TxType::V_DCT, dct, identity), + (TxType::H_DCT, identity, dct), + (TxType::ADST_ADST, adst, adst), + (TxType::ADST_FLIPADST, adst, flipadst), + (TxType::FLIPADST_ADST, flipadst, adst), + (TxType::FLIPADST_FLIPADST, flipadst, flipadst) + ], + [(8, 16), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)], + [(sse4, SSE4_1)] +); + cpu_function_lookup_table!( INV_TXFM_HBD_FNS: [[[Option; TX_TYPES]; 32]], default: [[None; TX_TYPES]; 32], - [AVX2] + [SSE4_1, AVX2] ); #[cfg(test)] From 1793baa1a084ca21f084ce6b80ebcfe14b2567ac Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 9 Aug 2021 18:07:47 +0900 Subject: [PATCH 157/188] Update copyright headers of files touched in 2021 --- build.rs | 2 +- fuzz/fuzz_targets/encode_decode.rs | 2 +- ivf/src/lib.rs | 2 +- src/activity.rs | 2 +- src/api/channel/data.rs | 2 +- src/api/channel/mod.rs | 2 +- src/api/color.rs | 2 +- src/api/config/encoder.rs | 2 +- src/api/config/mod.rs | 2 +- src/api/config/speedsettings.rs | 2 +- src/api/context.rs | 2 +- src/api/internal.rs | 2 +- src/api/test.rs | 2 +- src/api/util.rs | 2 +- src/asm/aarch64/cdef.rs | 2 +- src/asm/aarch64/mc.rs | 2 +- src/asm/aarch64/predict.rs | 2 +- src/asm/shared/transform/inverse.rs | 2 +- src/asm/x86/cdef.rs | 2 +- src/asm/x86/ec.rs | 2 +- src/asm/x86/mc.rs | 2 +- src/asm/x86/predict.rs | 2 +- src/asm/x86/quantize.rs | 2 +- src/asm/x86/transform/inverse.rs | 2 +- src/bin/common.rs | 2 +- src/bin/decoder/mod.rs | 2 +- src/bin/rav1e-ch.rs | 2 +- src/bin/rav1e.rs | 2 +- src/capi.rs | 2 +- src/cdef.rs | 2 +- src/context/block_unit.rs | 2 +- src/context/cdf_context.rs | 2 +- src/context/frame_header.rs | 2 +- src/context/mod.rs | 2 +- src/context/partition_unit.rs | 2 +- src/context/superblock_unit.rs | 2 +- src/context/transform_unit.rs | 2 +- src/deblock.rs | 2 +- src/ec.rs | 2 +- src/encoder.rs | 2 +- src/entropymode.rs | 2 +- src/frame/mod.rs | 2 +- src/fuzzing.rs | 2 +- src/header.rs | 2 +- src/lib.rs | 2 +- src/lrf.rs | 2 +- src/me.rs | 2 +- src/predict.rs | 2 +- src/quantize.rs | 2 +- src/rate.rs | 2 +- src/rdo.rs | 2 +- src/scenechange/mod.rs | 2 +- src/segmentation.rs | 2 +- src/stats.rs | 2 +- src/test_encode_decode/mod.rs | 2 +- src/token_cdfs.rs | 2 +- src/transform/mod.rs | 2 +- src/util/align.rs | 2 +- src/util/cdf.rs | 2 +- v_frame/src/pixel.rs | 2 +- v_frame/src/plane.rs | 2 +- 61 files changed, 61 insertions(+), 61 deletions(-) diff --git a/build.rs b/build.rs index 502c759697..c4c44077c8 100644 --- a/build.rs +++ b/build.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/fuzz/fuzz_targets/encode_decode.rs b/fuzz/fuzz_targets/encode_decode.rs index d8041bdb5d..956587314b 100644 --- a/fuzz/fuzz_targets/encode_decode.rs +++ b/fuzz/fuzz_targets/encode_decode.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/ivf/src/lib.rs b/ivf/src/lib.rs index 04be426111..74227de0ba 100644 --- a/ivf/src/lib.rs +++ b/ivf/src/lib.rs @@ -1,5 +1,5 @@ // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved -// Copyright (c) 2017-2018, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/activity.rs b/src/activity.rs index 5c0a764d0c..6efdcb5a75 100644 --- a/src/activity.rs +++ b/src/activity.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/channel/data.rs b/src/api/channel/data.rs index 999b8e72b7..51ac473cdb 100644 --- a/src/api/channel/data.rs +++ b/src/api/channel/data.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/channel/mod.rs b/src/api/channel/mod.rs index 542aa9d247..c6ae91bc50 100644 --- a/src/api/channel/mod.rs +++ b/src/api/channel/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/color.rs b/src/api/color.rs index 5b022051f2..f81e0a2162 100644 --- a/src/api/color.rs +++ b/src/api/color.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/config/encoder.rs b/src/api/config/encoder.rs index c91d7f4098..a94e99f55f 100644 --- a/src/api/config/encoder.rs +++ b/src/api/config/encoder.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2020, The rav1e contributors. All rights reserved +// Copyright (c) 2020-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index 51421a3c8a..230e35c624 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2020, The rav1e contributors. All rights reserved +// Copyright (c) 2020-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index 5524396d3e..8af5adb67a 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2020, The rav1e contributors. All rights reserved +// Copyright (c) 2020-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/context.rs b/src/api/context.rs index e607bbc22e..88ea9da84e 100644 --- a/src/api/context.rs +++ b/src/api/context.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/internal.rs b/src/api/internal.rs index 4dbb474052..14687b3bdf 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/test.rs b/src/api/test.rs index 9bb36aae56..8c04c93318 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/api/util.rs b/src/api/util.rs index 71b8611a9e..c26295de88 100644 --- a/src/api/util.rs +++ b/src/api/util.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/aarch64/cdef.rs b/src/asm/aarch64/cdef.rs index 9fc5b242ef..560bd28995 100644 --- a/src/asm/aarch64/cdef.rs +++ b/src/asm/aarch64/cdef.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2020, The rav1e contributors. All rights reserved +// Copyright (c) 2020-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/aarch64/mc.rs b/src/asm/aarch64/mc.rs index e9b50cad61..5cab18be3b 100644 --- a/src/asm/aarch64/mc.rs +++ b/src/asm/aarch64/mc.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. 
All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/aarch64/predict.rs b/src/asm/aarch64/predict.rs index 9d22f8e312..36c124cfe4 100644 --- a/src/asm/aarch64/predict.rs +++ b/src/asm/aarch64/predict.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs index 7641cef966..fc8d6a362a 100644 --- a/src/asm/shared/transform/inverse.rs +++ b/src/asm/shared/transform/inverse.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs index fd5296cb9a..67a3842a55 100644 --- a/src/asm/x86/cdef.rs +++ b/src/asm/x86/cdef.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/x86/ec.rs b/src/asm/x86/ec.rs index f6db6a5f3f..03197c48dd 100644 --- a/src/asm/x86/ec.rs +++ b/src/asm/x86/ec.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index 541deb41a3..81dc07d849 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index 0545fae1da..67373b9f5b 100644 --- a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index d4ba56695e..58b026c329 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs index 2fc0941eb6..5c172a78ca 100644 --- a/src/asm/x86/transform/inverse.rs +++ b/src/asm/x86/transform/inverse.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/bin/common.rs b/src/bin/common.rs index 759e6627ad..5c5766a8da 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/bin/decoder/mod.rs b/src/bin/decoder/mod.rs index d9cecc64cb..7c7ca0e1b7 100644 --- a/src/bin/decoder/mod.rs +++ b/src/bin/decoder/mod.rs @@ -1,5 +1,5 @@ // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index 73b01f9be2..3034885df8 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/bin/rav1e.rs b/src/bin/rav1e.rs index 19f36de8e4..9bde699f8a 100644 --- a/src/bin/rav1e.rs +++ b/src/bin/rav1e.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/capi.rs b/src/capi.rs index 8a3c81c8da..4cc8ed1481 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -1,5 +1,5 @@ // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/cdef.rs b/src/cdef.rs index de895bc594..08a222677b 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index fb06c3901a..4f3dc7f456 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 848ea3e156..8d9926292f 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/context/frame_header.rs b/src/context/frame_header.rs index 711f52a74c..a8e8957a02 100644 --- a/src/context/frame_header.rs +++ b/src/context/frame_header.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/context/mod.rs b/src/context/mod.rs index 38400a90b4..0ed74e94f5 100644 --- a/src/context/mod.rs +++ b/src/context/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/context/partition_unit.rs b/src/context/partition_unit.rs index 46e4e0d18f..8a80429c5f 100644 --- a/src/context/partition_unit.rs +++ b/src/context/partition_unit.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/context/superblock_unit.rs b/src/context/superblock_unit.rs index 7f0aa827cf..e97d87de0c 100644 --- a/src/context/superblock_unit.rs +++ b/src/context/superblock_unit.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index d4c876e903..fe61f4fa6b 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/src/deblock.rs b/src/deblock.rs index e837e2b112..1d25017d41 100644 --- a/src/deblock.rs +++ b/src/deblock.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/ec.rs b/src/ec.rs index ce9b5237d0..3beb0db745 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -1,5 +1,5 @@ // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/encoder.rs b/src/encoder.rs index 1ccf8c8310..43d696e9b7 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/entropymode.rs b/src/entropymode.rs index 1efbfe9caa..94f86a14b3 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/frame/mod.rs b/src/frame/mod.rs index 2ea9c4fd8b..c42586a879 100644 --- a/src/frame/mod.rs +++ b/src/frame/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/fuzzing.rs b/src/fuzzing.rs index 8189c4784c..8362783689 100644 --- a/src/fuzzing.rs +++ b/src/fuzzing.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/header.rs b/src/header.rs index e650b16ac7..56d9e2d5af 100644 --- a/src/header.rs +++ b/src/header.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/lib.rs b/src/lib.rs index 2294ccfc45..13af936b13 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/lrf.rs b/src/lrf.rs index db4600348b..522f350e23 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/me.rs b/src/me.rs index 4836bbbf2c..7306a47ae4 100644 --- a/src/me.rs +++ b/src/me.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/predict.rs b/src/predict.rs index cdea8740a3..fbf1e9526d 100644 --- a/src/predict.rs +++ b/src/predict.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/quantize.rs b/src/quantize.rs index 597c185699..4f3c63332d 100644 --- a/src/quantize.rs +++ b/src/quantize.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/rate.rs b/src/rate.rs index 5da59889e9..6bc23f3236 100644 --- a/src/rate.rs +++ b/src/rate.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/rdo.rs b/src/rdo.rs index 8c4f69de17..b054958eba 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -1,5 +1,5 @@ // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 76d0d08f1d..fef1072373 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/segmentation.rs b/src/segmentation.rs index fd68095b61..319e9c6c9e 100644 --- a/src/segmentation.rs +++ b/src/segmentation.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2019, The rav1e contributors. 
All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/stats.rs b/src/stats.rs index 5d428a8648..b042d5daaf 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2019-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/test_encode_decode/mod.rs b/src/test_encode_decode/mod.rs index 928017e15a..d220185ca0 100644 --- a/src/test_encode_decode/mod.rs +++ b/src/test_encode_decode/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/token_cdfs.rs b/src/token_cdfs.rs index 980553e6e1..044e441fc2 100644 --- a/src/token_cdfs.rs +++ b/src/token_cdfs.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2019, The rav1e contributors. All rights reserved +// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/transform/mod.rs b/src/transform/mod.rs index ede695ec10..304df733a1 100644 --- a/src/transform/mod.rs +++ b/src/transform/mod.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/util/align.rs b/src/util/align.rs index 5134b769ff..a571f631e1 100644 --- a/src/util/align.rs +++ b/src/util/align.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/src/util/cdf.rs b/src/util/cdf.rs index 6a0b44ee16..4309dcc28f 100644 --- a/src/util/cdf.rs +++ b/src/util/cdf.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/v_frame/src/pixel.rs b/v_frame/src/pixel.rs index 69972b3609..1f45f9cd77 100644 --- a/v_frame/src/pixel.rs +++ b/v_frame/src/pixel.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/v_frame/src/plane.rs b/v_frame/src/plane.rs index 10c8abfec1..b4daf85388 100644 --- a/v_frame/src/plane.rs +++ b/v_frame/src/plane.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved +// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License From 218691d11f68c4e8535afb49f073ef63ce41d927 Mon Sep 17 00:00:00 2001 From: mahanstreamer <84676642+mahanstreamer@users.noreply.github.com> Date: Tue, 1 Jun 2021 11:30:31 -0400 Subject: [PATCH 158/188] Update License year --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 0770036335..4c6c3029a9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017-2020, the rav1e contributors +Copyright (c) 2017-2021, the rav1e contributors All rights reserved. Redistribution and use in source and binary forms, with or without From 7960160f186149a47a99f7a3e4c52851018266dc Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 9 Aug 2021 18:33:34 +0900 Subject: [PATCH 159/188] Update license year for ivf and v_frame --- ivf/LICENSE | 2 +- v_frame/LICENSE | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ivf/LICENSE b/ivf/LICENSE index 1c2cd3b4a6..4c6c3029a9 100644 --- a/ivf/LICENSE +++ b/ivf/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017-2018, the rav1e contributors +Copyright (c) 2017-2021, the rav1e contributors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/v_frame/LICENSE b/v_frame/LICENSE index 0770036335..4c6c3029a9 100644 --- a/v_frame/LICENSE +++ b/v_frame/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017-2020, the rav1e contributors +Copyright (c) 2017-2021, the rav1e contributors All rights reserved. 
Redistribution and use in source and binary forms, with or without From 325d0b0ceb51c190f12acf59938ab4f121cbf515 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sun, 8 Aug 2021 14:56:35 +0200 Subject: [PATCH 160/188] Prepare for the beta release --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3e1fc33a2b..cc3170d924 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rav1e" -version = "0.5.0-alpha" +version = "0.5.0-beta" authors = ["Thomas Daede "] edition = "2018" build = "build.rs" @@ -112,7 +112,7 @@ features = ["parallel"] signal-hook = { version = "0.3", optional = true } [dev-dependencies] -assert_cmd = "1.0" +assert_cmd = "2.0" criterion = "0.3" pretty_assertions = "0.7" interpolate_name = "0.2.2" From 54489d2653f580ed17bd96ad2be815db07a27bb2 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Mon, 9 Aug 2021 13:59:18 +0200 Subject: [PATCH 161/188] Prepare for v_frame-0.2.2 --- v_frame/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v_frame/Cargo.toml b/v_frame/Cargo.toml index 3450d36b28..964d2ef30f 100644 --- a/v_frame/Cargo.toml +++ b/v_frame/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "v_frame" -version = "0.2.1" +version = "0.2.2" description = "Video Frame data structures, part of rav1e" license = "BSD-2-Clause" authors = ["Luca Barbato "] From 8d1d0a6c4263024dc13bc88b00c5d5fe9be3cf12 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Mon, 9 Aug 2021 14:16:32 +0200 Subject: [PATCH 162/188] Depend on v_frame-0.2.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cc3170d924..e365b61c7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,7 +73,7 @@ dav1d-sys = { version = "0.3.4", optional = true } aom-sys = { version = "0.3.0", optional = true } scan_fmt = { version = "0.2.3", optional = true, default-features = false } ivf = { version = "0.1", path = "ivf/", optional = true } -v_frame = { version = "0.2.1", path = "v_frame/" } +v_frame = { version = "0.2.2", path = "v_frame/" } av-metrics = { version = "0.6.2", optional = true, default-features = false } rayon = "1.0" crossbeam = { version = "0.8", optional = true } From e09f555d279e7d4dbd0063f072d57695dcc4da7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Lauzier?= Date: Mon, 9 Aug 2021 14:39:12 -0400 Subject: [PATCH 163/188] Fix some warnings present in nightly --- benches/mc.rs | 6 +++--- clippy.toml | 1 + src/asm/shared/predict.rs | 4 ++-- src/asm/x86/dist/mod.rs | 8 ++++---- src/asm/x86/ec.rs | 2 +- src/asm/x86/mc.rs | 6 +++--- src/capi.rs | 23 ++++++++++------------- src/context/partition_unit.rs | 1 - src/predict.rs | 4 ++-- tests/binary.rs | 4 ++-- 10 files changed, 28 insertions(+), 31 deletions(-) diff --git a/benches/mc.rs b/benches/mc.rs index 95a80b0603..3267e76dd1 100644 --- a/benches/mc.rs +++ b/benches/mc.rs @@ -543,9 +543,9 @@ fn new_plane( p } -fn get_params<'a, T: Pixel>( - rec_plane: &'a Plane, po: PlaneOffset, mv: MotionVector, -) -> (i32, i32, PlaneSlice<'a, T>) { +fn get_params( + rec_plane: &Plane, po: PlaneOffset, mv: MotionVector, +) -> (i32, i32, PlaneSlice) { let rec_cfg = &rec_plane.cfg; let shift_row = 3 + rec_cfg.ydec; let shift_col = 3 + rec_cfg.xdec; diff --git a/clippy.toml b/clippy.toml index a2998c565c..13aa72450a 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,3 +1,4 @@ too-many-arguments-threshold = 16 cognitive-complexity-threshold = 40 trivial-copy-size-limit = 16 # 128-bits = 2 64-bit registers +msrv = 
"1.51" diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs index a9f274e973..6359123bde 100644 --- a/src/asm/shared/predict.rs +++ b/src/asm/shared/predict.rs @@ -71,7 +71,7 @@ mod test { }; for angle in angles { let expected = { - let mut plane = Plane::from_slice(&vec![0u8; 4 * 4], 4); + let mut plane = Plane::from_slice(&[0u8; 4 * 4], 4); rust::dispatch_predict_intra( *mode, *variant, @@ -91,7 +91,7 @@ mod test { data }; - let mut output = Plane::from_slice(&vec![0u8; 4 * 4], 4); + let mut output = Plane::from_slice(&[0u8; 4 * 4], 4); dispatch_predict_intra( *mode, *variant, diff --git a/src/asm/x86/dist/mod.rs b/src/asm/x86/dist/mod.rs index 36c17b28f6..5cfdbb1c45 100644 --- a/src/asm/x86/dist/mod.rs +++ b/src/asm/x86/dist/mod.rs @@ -495,9 +495,9 @@ mod test { let bsize = BlockSize::[]; if $BD > 8 { // dynamic allocation: test - let mut src = Plane::from_slice(&vec![0u16; $W * $H], $W); + let mut src = Plane::from_slice(&[0u16; $W * $H], $W); // dynamic allocation: test - let mut dst = Plane::from_slice(&vec![0u16; $W * $H], $W); + let mut dst = Plane::from_slice(&[0u16; $W * $H], $W); for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { *s = random::() as u16 * $BD / 8; *d = random::() as u16 * $BD / 8; @@ -508,9 +508,9 @@ mod test { assert_eq!(rust_result, result); } else { // dynamic allocation: test - let mut src = Plane::from_slice(&vec![0u8; $W * $H], $W); + let mut src = Plane::from_slice(&[0u8; $W * $H], $W); // dynamic allocation: test - let mut dst = Plane::from_slice(&vec![0u8; $W * $H], $W); + let mut dst = Plane::from_slice(&[0u8; $W * $H], $W); for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { *s = random::(); *d = random::(); diff --git a/src/asm/x86/ec.rs b/src/asm/x86/ec.rs index 03197c48dd..6723188e99 100644 --- a/src/asm/x86/ec.rs +++ b/src/asm/x86/ec.rs @@ -106,7 +106,7 @@ mod test { } let mut cdf = [7297, 3820, 1617, 0]; - let mut cdf2 = cdf.clone(); + let mut cdf2 = cdf; for i in 0..4 { rust::update_cdf(&mut cdf, i); unsafe { diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index 81dc07d849..d638436d58 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -710,9 +710,9 @@ mod test { 8 ); - fn get_params<'a, T: Pixel>( - rec_plane: &'a Plane, po: PlaneOffset, mv: MotionVector, - ) -> (i32, i32, PlaneSlice<'a, T>) { + fn get_params( + rec_plane: &Plane, po: PlaneOffset, mv: MotionVector, + ) -> (i32, i32, PlaneSlice) { let rec_cfg = &rec_plane.cfg; let shift_row = 3 + rec_cfg.ydec; let shift_col = 3 + rec_cfg.xdec; diff --git a/src/capi.rs b/src/capi.rs index 4cc8ed1481..cda43d8d40 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -209,14 +209,11 @@ impl EncContext { ) -> Result { ctx.receive_packet().map(|p| { let mut p = std::mem::ManuallyDrop::new(p); - let opaque = p.opaque.take().map_or_else( - || std::ptr::null_mut(), - |o| { - let mut opaque = o.downcast::().unwrap(); - opaque.cb = None; - opaque.opaque - }, - ); + let opaque = p.opaque.take().map_or_else(std::ptr::null_mut, |o| { + let mut opaque = o.downcast::().unwrap(); + opaque.cb = None; + opaque.opaque + }); let p = std::mem::ManuallyDrop::into_inner(p); let rav1e::Packet { data, input_frameno, frame_type, .. 
} = p; let len = data.len(); @@ -754,7 +751,7 @@ pub unsafe extern fn rav1e_frame_new(ctx: *const Context) -> *mut Frame { let fi = (*ctx).ctx.new_frame(); let frame_type = rav1e::FrameTypeOverride::No; let f = Frame { fi, frame_type, opaque: None }; - let frame = Box::new(f.into()); + let frame = Box::new(f); Box::into_raw(frame) } @@ -952,7 +949,7 @@ pub unsafe extern fn rav1e_rc_send_pass_data( .ctx .rc_send_pass_data(maybe_buf.unwrap()) .map(|_v| None) - .unwrap_or_else(|e| Some(e)); + .unwrap_or_else(Some); (*ctx).last_err = ret; @@ -1025,14 +1022,14 @@ pub unsafe extern fn rav1e_send_frame( let maybe_opaque = if frame.is_null() { None } else { - (*frame).opaque.take().map(|o| rav1e::Opaque::new(o)) + (*frame).opaque.take().map(rav1e::Opaque::new) }; let ret = (*ctx) .ctx .send_frame(frame_internal, frame_type, maybe_opaque) .map(|_v| None) - .unwrap_or_else(|e| Some(e)); + .unwrap_or_else(Some); (*ctx).last_err = ret; @@ -1075,7 +1072,7 @@ pub unsafe extern fn rav1e_receive_packet( *pkt = Box::into_raw(Box::new(packet)); None }) - .unwrap_or_else(|e| Some(e)); + .unwrap_or_else(Some); (*ctx).last_err = ret; diff --git a/src/context/partition_unit.rs b/src/context/partition_unit.rs index 8a80429c5f..350e708183 100644 --- a/src/context/partition_unit.rs +++ b/src/context/partition_unit.rs @@ -500,7 +500,6 @@ impl<'a> BlockContext<'a> { pub fn update_partition_context( &mut self, bo: TileBlockOffset, subsize: BlockSize, bsize: BlockSize, ) { - #[allow(dead_code)] assert!(bsize.is_sqr()); let bw = bsize.width_mi(); diff --git a/src/predict.rs b/src/predict.rs index fbf1e9526d..862becfe36 100644 --- a/src/predict.rs +++ b/src/predict.rs @@ -1415,7 +1415,7 @@ mod test { let above = &edge_buf.data[MAX_TX_SIZE + 1..MAX_TX_SIZE + 5]; let top_left = edge_buf.data[MAX_TX_SIZE]; - let mut output = Plane::from_slice(&vec![0u8; 4 * 4], 4); + let mut output = Plane::from_slice(&[0u8; 4 * 4], 4); pred_dc(&mut output.as_region_mut(), above, left, 4, 4, 8); assert_eq!(&output.data[..], [32u8; 16]); @@ -1467,7 +1467,7 @@ mod test { let left = &edge_buf.data[MAX_TX_SIZE - 8..MAX_TX_SIZE]; let above = &edge_buf.data[MAX_TX_SIZE + 1..MAX_TX_SIZE + 9]; - let top_left = &edge_buf.data[MAX_TX_SIZE..MAX_TX_SIZE + 1]; + let top_left = &edge_buf.data[MAX_TX_SIZE..=MAX_TX_SIZE]; let angles = [ 3, 6, 9, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45, 48, 51, 54, 58, 61, 64, 67, 70, 73, 76, 81, 84, 87, diff --git a/tests/binary.rs b/tests/binary.rs index dfb41526ef..5d7f1fa7fe 100644 --- a/tests/binary.rs +++ b/tests/binary.rs @@ -6,7 +6,7 @@ mod binary { use std::env::temp_dir; use std::fs::File; use std::io::Read; - use std::path::PathBuf; + use std::path::{Path, PathBuf}; fn get_y4m_input() -> Vec { let mut input = File::open(&format!( @@ -43,7 +43,7 @@ mod binary { Command::cargo_bin("rav1e").unwrap() } - fn get_common_cmd(outfile: &PathBuf) -> Command { + fn get_common_cmd(outfile: &Path) -> Command { let mut cmd = get_rav1e_command(); cmd.args(&["--bitrate", "1000"]).arg("-o").arg(outfile).arg("-y"); cmd From 45d6754f6389179fac085aedfa97af3b483aa896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Lauzier?= Date: Mon, 9 Aug 2021 18:37:34 -0400 Subject: [PATCH 164/188] Replace some deprecated call --- src/api/channel/by_gop.rs | 2 +- src/api/channel/mod.rs | 2 +- src/api/test.rs | 48 +++++++++++++++++++-------------------- src/asm/x86/quantize.rs | 2 +- src/scenechange/mod.rs | 2 +- src/tiling/tiler.rs | 2 +- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/api/channel/by_gop.rs 
b/src/api/channel/by_gop.rs index 799faf6e55..907ab5414b 100644 --- a/src/api/channel/by_gop.rs +++ b/src/api/channel/by_gop.rs @@ -327,7 +327,7 @@ impl Config { // TODO: make it user-settable let input_len = self.enc.rdo_lookahead_frames as usize * 4; - let frame_limit = std::i32::MAX as u64; + let frame_limit = i32::MAX as u64; let (send_frame, receive_frame) = bounded(input_len); let (send_packet, receive_packet) = unbounded(); diff --git a/src/api/channel/mod.rs b/src/api/channel/mod.rs index c6ae91bc50..95593f9fd0 100644 --- a/src/api/channel/mod.rs +++ b/src/api/channel/mod.rs @@ -258,7 +258,7 @@ impl Config { frame_limit, ) } else { - (None, None, std::i32::MAX as u64) + (None, None, i32::MAX as u64) }; let config = Arc::new(self.enc); diff --git a/src/api/test.rs b/src/api/test.rs index 8c04c93318..b4eff5711a 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -1367,9 +1367,9 @@ fn output_frameno_no_scene_change_at_short_flash(flash_at: u64) { let limit = 5; for i in 0..limit { if i == flash_at { - send_test_frame(&mut ctx, u8::min_value()); + send_test_frame(&mut ctx, u8::MIN); } else { - send_test_frame(&mut ctx, u8::max_value()); + send_test_frame(&mut ctx, u8::MAX); } } ctx.flush(); @@ -1419,14 +1419,14 @@ fn output_frameno_no_scene_change_at_flash_smaller_than_max_len_flash() { assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); assert_eq!(ctx.inner.inter_cfg.group_input_len, 4); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); ctx.flush(); let data = get_frame_invariants(ctx) @@ -1479,18 +1479,18 @@ fn output_frameno_scene_change_before_flash_longer_than_max_flash_len() { assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); assert_eq!(ctx.inner.inter_cfg.group_input_len, 4); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::max_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MAX); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); ctx.flush(); let data = get_frame_invariants(ctx) @@ -1545,8 +1545,8 @@ fn output_frameno_scene_change_after_multiple_flashes() { assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); 
assert_eq!(ctx.inner.inter_cfg.group_input_len, 4); - send_test_frame(&mut ctx, u8::min_value()); - send_test_frame(&mut ctx, u8::min_value()); + send_test_frame(&mut ctx, u8::MIN); + send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, 40); send_test_frame(&mut ctx, 100); send_test_frame(&mut ctx, 160); diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index 58b026c329..83e3986a81 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -192,7 +192,7 @@ mod test { let mut rcoeffs = Aligned::new([0i16; 32 * 32]); // Generate quantized coefficients up to the eob - let between = Uniform::from(-std::i16::MAX..=std::i16::MAX); + let between = Uniform::from(-i16::MAX..=i16::MAX); for (i, qcoeff) in qcoeffs.data.iter_mut().enumerate().take(eob) { *qcoeff = between.sample(&mut rng) / if i == 0 { dc_quant } else { ac_quant }; diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index fef1072373..fe79a87cd3 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -404,7 +404,7 @@ fn detect_scale_factor(sequence: &Arc) -> usize { 481..=720 => 4, 721..=1080 => 8, 1081..=1600 => 16, - 1601..=std::usize::MAX => 32, + 1601..=usize::MAX => 32, _ => 1, } as usize; debug!( diff --git a/src/tiling/tiler.rs b/src/tiling/tiler.rs index 0b7eb57b02..08ebe5fdda 100644 --- a/src/tiling/tiler.rs +++ b/src/tiling/tiler.rs @@ -813,7 +813,7 @@ pub mod test { #[test] fn tile_log2_overflow() { - assert_eq!(TilingInfo::tile_log2(1, usize::max_value()), None); + assert_eq!(TilingInfo::tile_log2(1, usize::MAX), None); } #[test] From 5b4b21afa631da967457ee425800c9523b230375 Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Thu, 26 Aug 2021 11:44:54 -0400 Subject: [PATCH 165/188] Do not compare invisible padding in fast scenecut Fixes #2781 --- src/scenechange/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index fe79a87cd3..67030e3089 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -384,6 +384,7 @@ impl SceneChangeDetector { let delta_line = l1 .iter() .zip(l2.iter()) + .take(plane1.cfg.width) .map(|(&p1, &p2)| { (i16::cast_from(p1) - i16::cast_from(p2)).abs() as u32 }) From cdd415dafe1f7811a6a432c45a8ae6f31f546f56 Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Thu, 26 Aug 2021 10:56:38 -0400 Subject: [PATCH 166/188] Bump av-metrics to 0.7 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e365b61c7d..d09452ab94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ aom-sys = { version = "0.3.0", optional = true } scan_fmt = { version = "0.2.3", optional = true, default-features = false } ivf = { version = "0.1", path = "ivf/", optional = true } v_frame = { version = "0.2.2", path = "v_frame/" } -av-metrics = { version = "0.6.2", optional = true, default-features = false } +av-metrics = { version = "0.7.1", optional = true, default-features = false } rayon = "1.0" crossbeam = { version = "0.8", optional = true } toml = { version = "0.5", optional = true } From 787e427faf27f87809bacbe5f3db5c290f341b54 Mon Sep 17 00:00:00 2001 From: Aleksandr Date: Fri, 30 Jul 2021 21:58:44 +0300 Subject: [PATCH 167/188] Scene detection improvement Adjusting values of thresholds for both versions of the scene detection algorithm. Improved metric for the fast version. The peaks of the metric are now more distinguishable from the other values, so the threshold cuts off only relevent values. 
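Concretely, for the cost-based path the change below subtracts the previous frame's score from the current one before the threshold test, so a sustained stretch of high inter cost no longer reads as a cut while a sudden jump still does. A minimal standalone sketch of that differencing step follows; the function name and sample values are illustrative only, not the encoder's actual score_deque handling:

// Emphasize sudden jumps by differencing consecutive scene-change scores.
// A flat run of large scores yields small differences; only a real jump
// survives the subtraction and can cross the scenecut threshold.
fn peak_score(scores: &[f64], idx: usize) -> f64 {
  if idx == 0 {
    scores[0]
  } else {
    scores[idx] - scores[idx - 1]
  }
}

fn main() {
  let scores = [2.0, 2.4, 2.3, 9.1, 3.2];
  for i in 0..scores.len() {
    println!("frame {}: raw {:.1}, differenced {:.1}", i, scores[i], peak_score(&scores, i));
  }
}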
--- src/scenechange/mod.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 67030e3089..d5ab405ca3 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -56,14 +56,12 @@ impl SceneChangeDetector { // conversion, but the deltas needed to be scaled down. The deltas for keyframes // in YUV were about 1/3 to 1/2 of what they were in HSV, but non-keyframes were // very unlikely to have a delta greater than 3 in YUV, whereas they may reach into - // the double digits in HSV. Therefore, 12 was chosen as a reasonable default threshold. - // This may be adjusted later. + // the double digits in HSV. // // This threshold is only used for the fast scenecut implementation. // - // Testing shown that default threshold of 12 overallocates keyframes by almost double, - // compared to other scene change implementations - const BASE_THRESHOLD: usize = 12; + // Experiments have shown that this threshold is optimal. + const BASE_THRESHOLD: usize = 18; let bit_depth = encoder_config.bit_depth; let fast_mode = encoder_config.speed_settings.fast_scene_detection || encoder_config.low_latency; @@ -239,6 +237,14 @@ impl SceneChangeDetector { let mut cloned_deque = self.score_deque.to_vec(); cloned_deque.remove(self.deque_offset); + + // Subtract the previous metric value from the current one + // It makes the peaks in the metric more distinctive + if !self.fast_mode && self.deque_offset > 0 { + let previous_scene_score = self.score_deque[self.deque_offset-1].0; + self.score_deque[self.deque_offset].0 -= previous_scene_score; + } + let scene_score = self.score_deque[self.deque_offset].0; let scene_threshold = self.score_deque[self.deque_offset].1; @@ -367,7 +373,7 @@ impl SceneChangeDetector { * (distance_from_keyframe - min_keyint) as f64 / (max_keyint - min_keyint) as f64 }; - let threshold = intra_cost * (1.0 - bias); + let threshold = intra_cost * (1.0 - bias) / 2.2; ScenecutResult { intra_cost, inter_cost, threshold } } From e27832c8c07002464893c1c5170ba1cf7eefd681 Mon Sep 17 00:00:00 2001 From: Aleksandr Date: Mon, 2 Aug 2021 22:44:10 +0300 Subject: [PATCH 168/188] format fixed --- src/scenechange/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index d5ab405ca3..b611e7c7a4 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -56,7 +56,7 @@ impl SceneChangeDetector { // conversion, but the deltas needed to be scaled down. The deltas for keyframes // in YUV were about 1/3 to 1/2 of what they were in HSV, but non-keyframes were // very unlikely to have a delta greater than 3 in YUV, whereas they may reach into - // the double digits in HSV. + // the double digits in HSV. // // This threshold is only used for the fast scenecut implementation. 
// @@ -237,11 +237,10 @@ impl SceneChangeDetector { let mut cloned_deque = self.score_deque.to_vec(); cloned_deque.remove(self.deque_offset); - // Subtract the previous metric value from the current one // It makes the peaks in the metric more distinctive if !self.fast_mode && self.deque_offset > 0 { - let previous_scene_score = self.score_deque[self.deque_offset-1].0; + let previous_scene_score = self.score_deque[self.deque_offset - 1].0; self.score_deque[self.deque_offset].0 -= previous_scene_score; } From d4f51a82faa2e22f3e453166aedd89ad2bdc5a9d Mon Sep 17 00:00:00 2001 From: Aleksandr Date: Thu, 19 Aug 2021 13:23:18 +0300 Subject: [PATCH 169/188] Improve scene detection Three versions of detection based on speed mode. This pr makes scene detection algorithm more accurate and flexible. Goals: More accurate scene detection in all three versions, by adjusting threshold Greater flexibility with the addition of a slow version of the algotihm Improved metric values - the peaks of the metric are now more distinguishable from the low values, so the threshold cuts off only relevant values. Downsample for medium version for faster decisions --- Cargo.toml | 1 + src/api/config/speedsettings.rs | 48 +++++++++++-- src/api/lookahead.rs | 60 ++++++++++++++++ src/scenechange/mod.rs | 117 ++++++++++++++++++++++---------- 4 files changed, 185 insertions(+), 41 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d09452ab94..da3fd646a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ wasm = ["wasm-bindgen"] dump_lookahead_data = ["byteorder", "image"] [dependencies] +histogram = "*" arg_enum_proc_macro = "0.3" bitstream-io = "1" cfg-if = "1.0" diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index 8af5adb67a..43a101465b 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -57,7 +57,7 @@ pub struct SpeedSettings { /// Enabled is faster. pub no_scene_detection: bool, /// Fast scene detection mode, uses simple SAD instead of encoder cost estimates. - pub fast_scene_detection: bool, + pub fast_scene_detection: SceneDetectionSpeed, /// Enables CDEF. pub cdef: bool, /// Enables LRF. @@ -110,7 +110,7 @@ impl Default for SpeedSettings { prediction_modes: PredictionModesSetting::ComplexAll, include_near_mvs: true, no_scene_detection: false, - fast_scene_detection: false, + fast_scene_detection: SceneDetectionSpeed::Medium, cdef: true, lrf: false, sgr_complexity: SGRComplexityLevel::Full, @@ -246,8 +246,14 @@ impl SpeedSettings { false } - const fn fast_scene_detection_preset(speed: usize) -> bool { - speed == 10 + const fn fast_scene_detection_preset(speed: usize) -> SceneDetectionSpeed { + if speed <= 6 { + SceneDetectionSpeed::Slow + } else if speed <= 9 { + SceneDetectionSpeed::Medium + } else { + SceneDetectionSpeed::Fast + } } const fn cdef_preset(_speed: usize) -> bool { @@ -313,6 +319,40 @@ impl PartitionRange { } } +/// Prediction modes to search. 
+#[derive( + Clone, + Copy, + Debug, + PartialOrd, + PartialEq, + FromPrimitive, + Serialize, + Deserialize, +)] +pub enum SceneDetectionSpeed { + /// Fastest scene detection using pixel-wise comparison + Fast, + /// Scene detection using motion vectors + Medium, + /// Scene detection using histogram block-based comparison + Slow, +} + +impl fmt::Display for SceneDetectionSpeed { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!( + f, + "{}", + match self { + SceneDetectionSpeed::Fast => "Fast", + SceneDetectionSpeed::Medium => "Medium", + SceneDetectionSpeed::Slow => "Slow", + } + ) + } +} + /// Prediction modes to search. #[derive( Clone, diff --git a/src/api/lookahead.rs b/src/api/lookahead.rs index d058f13c04..ad91dbf22c 100644 --- a/src/api/lookahead.rs +++ b/src/api/lookahead.rs @@ -14,8 +14,10 @@ use crate::rayon::iter::*; use crate::tiling::{Area, TileRect}; use crate::transform::TxSize; use crate::{Frame, Pixel}; +use histogram::Histogram; use rust_hawktracer::*; use std::sync::Arc; +use v_frame::pixel::CastFromPrimitive; pub(crate) const IMP_BLOCK_MV_UNITS_PER_PIXEL: i64 = 8; pub(crate) const IMP_BLOCK_SIZE_IN_MV_UNITS: i64 = @@ -115,6 +117,64 @@ pub(crate) fn estimate_intra_costs( intra_costs.into_boxed_slice() } +#[hawktracer(estimate_inter_costs_histogram)] +pub(crate) fn estimate_inter_costs_histogram_blocks( + frame: Arc>, ref_frame: Arc>, +) -> Box<[u32]> { + let plane_org = &frame.planes[0]; + let plane_ref = &ref_frame.planes[0]; + let h_in_imp_b = plane_org.cfg.height / IMPORTANCE_BLOCK_SIZE; + let w_in_imp_b = plane_org.cfg.width / IMPORTANCE_BLOCK_SIZE; + let mut inter_costs = Vec::with_capacity(h_in_imp_b * w_in_imp_b); + + (0..h_in_imp_b).for_each(|y| { + (0..w_in_imp_b).for_each(|x| { + // Coordinates of the top-left corner of the reference block, in MV + // units. + let region_org = plane_org.region(Area::Rect { + x: (x * IMPORTANCE_BLOCK_SIZE) as isize, + y: (y * IMPORTANCE_BLOCK_SIZE) as isize, + width: IMPORTANCE_BLOCK_SIZE, + height: IMPORTANCE_BLOCK_SIZE, + }); + + let region_ref = plane_ref.region(Area::Rect { + x: (x * IMPORTANCE_BLOCK_SIZE) as isize, + y: (y * IMPORTANCE_BLOCK_SIZE) as isize, + width: IMPORTANCE_BLOCK_SIZE, + height: IMPORTANCE_BLOCK_SIZE, + }); + + let mut histogram_org = + Histogram::configure().max_value(256).build().unwrap(); + let iter_org = region_org.rows_iter(); + for row in iter_org { + for pixel in row { + let cur = i16::cast_from(*pixel); + histogram_org.increment(cur as u64).unwrap(); + } + } + + let mut histogram_ref = + Histogram::configure().max_value(256).build().unwrap(); + let iter_ref = region_ref.rows_iter(); + for row in iter_ref { + for pixel in row { + let cur = i16::cast_from(*pixel); + histogram_ref.increment(cur as u64).unwrap(); + } + } + + let mean = (histogram_org.mean().unwrap() as i32 + - histogram_ref.mean().unwrap() as i32) + .abs(); + + inter_costs.push(mean as u32); + }); + }); + inter_costs.into_boxed_slice() +} + #[hawktracer(estimate_inter_costs)] pub(crate) fn estimate_inter_costs( frame: Arc>, ref_frame: Arc>, bit_depth: usize, diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index b611e7c7a4..60ce63e8eb 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -8,7 +8,7 @@ // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
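For the slow path added above, each importance block is summarized by the mean of its pixel-value histogram and the per-block cost is the absolute difference of the two means. Since a pixel-value histogram's mean is essentially the block's average pixel value, the same statistic can be sketched without the histogram crate; this is an illustrative, dependency-free version assuming two equally sized 8-bit blocks, not the lookahead code itself:

// Per-block cost stand-in: the absolute difference between the average
// pixel values of a source block and the co-located reference block.
fn block_mean_diff(org: &[u8], reference: &[u8]) -> u32 {
  assert_eq!(org.len(), reference.len());
  let mean = |b: &[u8]| b.iter().map(|&p| p as u64).sum::<u64>() / b.len() as u64;
  (mean(org) as i64 - mean(reference) as i64).unsigned_abs() as u32
}

fn main() {
  let flat = [100u8; 64];
  let brighter = [140u8; 64];
  // A large mean shift between co-located blocks hints at a scene change.
  println!("cost = {}", block_mean_diff(&flat, &brighter));
}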
use crate::api::lookahead::*; -use crate::api::EncoderConfig; +use crate::api::{EncoderConfig, SceneDetectionSpeed}; use crate::cpu_features::CpuFeatureLevel; use crate::encoder::Sequence; use crate::frame::*; @@ -17,13 +17,14 @@ use itertools::Itertools; use rust_hawktracer::*; use std::sync::Arc; use std::{cmp, u64}; +// use crate::api::*; /// Runs keyframe detection on frames from the lookahead queue. pub struct SceneChangeDetector { /// Minimum average difference between YUV deltas that will trigger a scene change. - threshold: usize, + threshold: f64, /// Fast scene cut detection mode, uses simple SAD instead of encoder cost estimates. - fast_mode: bool, + speed_mode: SceneDetectionSpeed, /// scaling factor for fast scene detection scale_factor: usize, // Frame buffer for scaled frames @@ -58,17 +59,23 @@ impl SceneChangeDetector { // very unlikely to have a delta greater than 3 in YUV, whereas they may reach into // the double digits in HSV. // - // This threshold is only used for the fast scenecut implementation. - // - // Experiments have shown that this threshold is optimal. - const BASE_THRESHOLD: usize = 18; + // Experiments have shown that these thresholds is optimal. + const FAST_THRESHOLD: f64 = 18.0; + const SLOW_THRESHOLD: f64 = 7.0; + let bit_depth = encoder_config.bit_depth; - let fast_mode = encoder_config.speed_settings.fast_scene_detection - || encoder_config.low_latency; + let speed_mode = if encoder_config.low_latency { + SceneDetectionSpeed::Fast + } else { + encoder_config.speed_settings.fast_scene_detection + }; - // Scale factor for fast scene detection - let scale_factor = - if fast_mode { detect_scale_factor(&sequence) } else { 1_usize }; + // Scale factor for fast and medium scene detection + let scale_factor = if speed_mode != SceneDetectionSpeed::Slow { + detect_scale_factor(&sequence, speed_mode) + } else { + 1_usize + }; // Set lookahead offset to 5 if normal lookahead available let lookahead_offset = if lookahead_distance >= 5 { 5 } else { 0 }; @@ -77,19 +84,28 @@ impl SceneChangeDetector { let score_deque = Vec::with_capacity(5 + lookahead_distance); // Pixel count for fast scenedetect - let pixels = if fast_mode { + let pixels = if speed_mode == SceneDetectionSpeed::Fast { (sequence.max_frame_height as usize / scale_factor) * (sequence.max_frame_width as usize / scale_factor) } else { 1 }; - let frame_buffer = - if fast_mode { Vec::with_capacity(2) } else { Vec::new() }; + let frame_buffer = if speed_mode == SceneDetectionSpeed::Fast { + Vec::with_capacity(2) + } else { + Vec::new() + }; + + let threshold = if speed_mode == SceneDetectionSpeed::Fast { + FAST_THRESHOLD * (bit_depth as f64) / 8.0 + } else { + SLOW_THRESHOLD * (bit_depth as f64) / 8.0 + }; Self { - threshold: BASE_THRESHOLD * bit_depth / 8, - fast_mode, + threshold, + speed_mode, scale_factor, frame_buffer, lookahead_offset, @@ -220,7 +236,7 @@ impl SceneChangeDetector { &mut self, frame1: Arc>, frame2: Arc>, input_frameno: u64, previous_keyframe: u64, ) { - let result = if self.fast_mode { + let result = if self.speed_mode == SceneDetectionSpeed::Fast { self.fast_scenecut(frame1, frame2) } else { self.cost_scenecut(frame1, frame2, input_frameno, previous_keyframe) @@ -239,7 +255,8 @@ impl SceneChangeDetector { // Subtract the previous metric value from the current one // It makes the peaks in the metric more distinctive - if !self.fast_mode && self.deque_offset > 0 { + if (self.speed_mode != SceneDetectionSpeed::Fast) && self.deque_offset > 0 + { let previous_scene_score = 
self.score_deque[self.deque_offset - 1].0; self.score_deque[self.deque_offset].0 -= previous_scene_score; } @@ -324,6 +341,7 @@ impl SceneChangeDetector { previous_keyframe: u64, ) -> ScenecutResult { let frame2_ref2 = Arc::clone(&frame2); + let (intra_cost, inter_cost) = crate::rayon::join( move || { let intra_costs = estimate_intra_costs( @@ -335,13 +353,18 @@ impl SceneChangeDetector { / intra_costs.len() as f64 }, move || { - let inter_costs = estimate_inter_costs( - frame2_ref2, - frame1, - self.bit_depth, - self.encoder_config, - self.sequence.clone(), - ); + let inter_costs = if self.speed_mode == SceneDetectionSpeed::Medium { + estimate_inter_costs( + frame2_ref2, + frame1, + self.bit_depth, + self.encoder_config, + self.sequence.clone(), + ) + } else { + estimate_inter_costs_histogram_blocks(frame2_ref2, frame1) + }; + inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 / inter_costs.len() as f64 }, @@ -359,6 +382,7 @@ impl SceneChangeDetector { // This also matches the default scenecut threshold in x264. const THRESH_MAX: f64 = 0.4; const THRESH_MIN: f64 = THRESH_MAX * 0.25; + const SCALE_FACTOR: f64 = 3.6; let distance_from_keyframe = frameno - previous_keyframe; let min_keyint = self.encoder_config.min_key_frame_interval; let max_keyint = self.encoder_config.max_key_frame_interval; @@ -372,7 +396,13 @@ impl SceneChangeDetector { * (distance_from_keyframe - min_keyint) as f64 / (max_keyint - min_keyint) as f64 }; - let threshold = intra_cost * (1.0 - bias) / 2.2; + + // Adaptive threshold for medium version, static thresholf for the slow one + let threshold = if self.speed_mode == SceneDetectionSpeed::Medium { + intra_cost * (1.0 - bias) / SCALE_FACTOR + } else { + self.threshold as f64 + }; ScenecutResult { intra_cost, inter_cost, threshold } } @@ -401,18 +431,31 @@ impl SceneChangeDetector { } /// Scaling factor for frame in scene detection -fn detect_scale_factor(sequence: &Arc) -> usize { +fn detect_scale_factor( + sequence: &Arc, speed_mode: SceneDetectionSpeed, +) -> usize { let small_edge = cmp::min(sequence.max_frame_height, sequence.max_frame_width) as usize; - let scale_factor = match small_edge { - 0..=240 => 1, - 241..=480 => 2, - 481..=720 => 4, - 721..=1080 => 8, - 1081..=1600 => 16, - 1601..=usize::MAX => 32, - _ => 1, - } as usize; + let scale_factor; + if speed_mode == SceneDetectionSpeed::Fast { + scale_factor = match small_edge { + 0..=240 => 1, + 241..=480 => 2, + 481..=720 => 4, + 721..=1080 => 8, + 1081..=1600 => 16, + 1601..=usize::MAX => 32, + _ => 1, + } as usize + } else { + scale_factor = match small_edge { + 0..=1600 => 1, + 1601..=2160 => 2, + 2161..=usize::MAX => 4, + _ => 1, + } as usize + }; + debug!( "Scene detection scale factor {}, [{},{}] -> [{},{}]", scale_factor, From 84d54b4e92e86b1ae444192eaa89d673a4fd0651 Mon Sep 17 00:00:00 2001 From: Aleksandr Date: Thu, 19 Aug 2021 15:48:27 +0300 Subject: [PATCH 170/188] tests fixed --- src/api/test.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/test.rs b/src/api/test.rs index b4eff5711a..cf4dc865e9 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -1941,7 +1941,7 @@ fn log_q_exp_overflow() { prediction_modes: PredictionModesSetting::Simple, include_near_mvs: false, no_scene_detection: true, - fast_scene_detection: false, + fast_scene_detection: SceneDetectionSpeed::Fast, cdef: true, lrf: true, use_satd_subpel: false, @@ -2006,7 +2006,7 @@ fn guess_frame_subtypes_assert() { prediction_modes: PredictionModesSetting::Simple, include_near_mvs: false, 
no_scene_detection: true, - fast_scene_detection: false, + fast_scene_detection: SceneDetectionSpeed::Fast, cdef: true, lrf: true, use_satd_subpel: false, From 3fa28dfcd722722090062c312f667ca394c87304 Mon Sep 17 00:00:00 2001 From: Aleksandr Date: Thu, 19 Aug 2021 16:52:38 +0300 Subject: [PATCH 171/188] Add CLI option of scene change detection speed mode --- src/api/config/speedsettings.rs | 2 +- src/bin/common.rs | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index 43a101465b..81d4c159f1 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -110,7 +110,7 @@ impl Default for SpeedSettings { prediction_modes: PredictionModesSetting::ComplexAll, include_near_mvs: true, no_scene_detection: false, - fast_scene_detection: SceneDetectionSpeed::Medium, + fast_scene_detection: SceneDetectionSpeed::Fast, cdef: true, lrf: false, sgr_complexity: SGRComplexityLevel::Full, diff --git a/src/bin/common.rs b/src/bin/common.rs index 5c5766a8da..89f477b7e8 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -175,6 +175,13 @@ pub fn parse_cli() -> Result { .takes_value(true) .default_value("6") ) + .arg( + Arg::with_name("SCENE_CHANGE_DETECTION_SPEED") + .help("Speed level (0 is best quality, 2 is fastest)") + .long("scd_speed") + .takes_value(true) + .default_value("2") + ) .arg( Arg::with_name("MIN_KEYFRAME_INTERVAL") .help("Minimum interval between keyframes") @@ -563,6 +570,8 @@ fn parse_config(matches: &ArgMatches<'_>) -> Result { } let speed = matches.value_of("SPEED").unwrap().parse().unwrap(); + let scene_detection_speed: u32 = + matches.value_of("SCENE_CHANGE_DETECTION_SPEED").unwrap().parse().unwrap(); let max_interval: u64 = matches.value_of("KEYFRAME_INTERVAL").unwrap().parse().unwrap(); let mut min_interval: u64 = @@ -577,6 +586,9 @@ fn parse_config(matches: &ArgMatches<'_>) -> Result { } else if min_interval > max_interval { panic!("Maximum keyframe interval must be greater than or equal to minimum keyframe interval"); } + if scene_detection_speed > 2 { + panic!("Scene change detection speed must be between 0-2"); + } let color_primaries = matches.value_of("COLOR_PRIMARIES").unwrap().parse().unwrap_or_default(); @@ -592,6 +604,16 @@ fn parse_config(matches: &ArgMatches<'_>) -> Result { .unwrap_or_default(); let mut cfg = EncoderConfig::with_speed_preset(speed); + + if matches.occurrences_of("SCENE_CHANGE_DETECTION_SPEED") != 0 { + cfg.speed_settings.fast_scene_detection = match scene_detection_speed { + 0 => SceneDetectionSpeed::Slow, + 1 => SceneDetectionSpeed::Medium, + 2 => SceneDetectionSpeed::Fast, + 3..=u32::MAX => cfg.speed_settings.fast_scene_detection, + }; + } + cfg.set_key_frame_interval(min_interval, max_interval); cfg.switch_frame_interval = matches.value_of("SWITCH_FRAME_INTERVAL").unwrap().parse().unwrap(); From f2e6bb3cd5717f5790a5bffaf6feb06e7758cfcf Mon Sep 17 00:00:00 2001 From: Aleksandr Date: Thu, 19 Aug 2021 17:34:07 +0300 Subject: [PATCH 172/188] Fixes: crate version and CLI --- Cargo.toml | 2 +- src/bin/common.rs | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index da3fd646a8..ab3aa8cb1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ wasm = ["wasm-bindgen"] dump_lookahead_data = ["byteorder", "image"] [dependencies] -histogram = "*" +histogram = "0.6.9" arg_enum_proc_macro = "0.3" bitstream-io = "1" cfg-if = "1.0" diff --git a/src/bin/common.rs 
b/src/bin/common.rs index 89f477b7e8..c41de23af1 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -177,7 +177,8 @@ pub fn parse_cli() -> Result { ) .arg( Arg::with_name("SCENE_CHANGE_DETECTION_SPEED") - .help("Speed level (0 is best quality, 2 is fastest)") + .help("Speed level for scene-change detection, 0: best quality, 1: speed-to-quality trade-off, 2: fastest mode\n\ + [default: 0 for s0-s6, 1 for s7-s9, 2 for s10]") .long("scd_speed") .takes_value(true) .default_value("2") @@ -606,11 +607,12 @@ fn parse_config(matches: &ArgMatches<'_>) -> Result { let mut cfg = EncoderConfig::with_speed_preset(speed); if matches.occurrences_of("SCENE_CHANGE_DETECTION_SPEED") != 0 { - cfg.speed_settings.fast_scene_detection = match scene_detection_speed { - 0 => SceneDetectionSpeed::Slow, - 1 => SceneDetectionSpeed::Medium, - 2 => SceneDetectionSpeed::Fast, - 3..=u32::MAX => cfg.speed_settings.fast_scene_detection, + cfg.speed_settings.fast_scene_detection = if scene_detection_speed == 0 { + SceneDetectionSpeed::Slow + } else if scene_detection_speed == 1 { + SceneDetectionSpeed::Medium + } else { + SceneDetectionSpeed::Fast }; } From 803b7071c953e4a7e2a3ed5e21ff63a6ff486d14 Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Mon, 30 Aug 2021 16:09:12 -0400 Subject: [PATCH 173/188] Minor scenechange code cleanup Removes a pair of unnecessary allocations --- src/scenechange/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 60ce63e8eb..20db3c018e 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -265,8 +265,8 @@ impl SceneChangeDetector { let scene_threshold = self.score_deque[self.deque_offset].1; if scene_score >= scene_threshold as f64 { - let back_deque = self.score_deque[self.deque_offset + 1..].to_vec(); - let forward_deque = self.score_deque[..self.deque_offset].to_vec(); + let back_deque = &self.score_deque[self.deque_offset + 1..]; + let forward_deque = &self.score_deque[..self.deque_offset]; let back_over_tr = back_deque.iter().filter(|(x, y)| x > y).collect_vec(); From afcda68f52c001f706aabf9ca764b53b3ab6fbeb Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Tue, 31 Aug 2021 09:03:39 -0400 Subject: [PATCH 174/188] Remove dead branches in scenecut code --- src/scenechange/mod.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 20db3c018e..64dd3d7163 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -386,16 +386,11 @@ impl SceneChangeDetector { let distance_from_keyframe = frameno - previous_keyframe; let min_keyint = self.encoder_config.min_key_frame_interval; let max_keyint = self.encoder_config.max_key_frame_interval; - let bias = if distance_from_keyframe <= min_keyint / 4 { - THRESH_MIN / 4.0 - } else if distance_from_keyframe <= min_keyint { - THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 - } else { - THRESH_MIN - + (THRESH_MAX - THRESH_MIN) - * (distance_from_keyframe - min_keyint) as f64 - / (max_keyint - min_keyint) as f64 - }; + debug_assert!(distance_from_keyframe >= min_keyint); + let bias = THRESH_MIN + + (THRESH_MAX - THRESH_MIN) + * (distance_from_keyframe - min_keyint) as f64 + / (max_keyint - min_keyint) as f64; // Adaptive threshold for medium version, static thresholf for the slow one let threshold = if self.speed_mode == SceneDetectionSpeed::Medium { From 54b667cdc4ea49709ba744e37c3a515252d23c09 Mon Sep 17 00:00:00 2001 From: Thomas Daede Date: Tue, 31 
Aug 2021 01:19:35 -0700 Subject: [PATCH 175/188] Remove scale factor parameter for medium scenecut detection. It is redundant with THRESH_MAX and THRESH_MIN. No change on AWCY. https://beta.arewecompressedyet.com/?job=sc-cleanup-2%402021-09-01T05%3A08%3A17.086Z&job=sc-cleanup-1%402021-08-31T13%3A05%3A27.050Z --- src/scenechange/mod.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 64dd3d7163..c6e09d8cb3 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -377,12 +377,10 @@ impl SceneChangeDetector { // `THRESH_MAX` determines how likely we are // to choose a keyframe, between 0.0-1.0. // Higher values mean we are more likely to choose a keyframe. - // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, - // as it appeared to provide the best average compression. - // This also matches the default scenecut threshold in x264. - const THRESH_MAX: f64 = 0.4; - const THRESH_MIN: f64 = THRESH_MAX * 0.25; - const SCALE_FACTOR: f64 = 3.6; + // `0.833` was chosen based on trials using the new + // adaptive scenecut code. + const THRESH_MAX: f64 = 0.833; + const THRESH_MIN: f64 = 0.75; let distance_from_keyframe = frameno - previous_keyframe; let min_keyint = self.encoder_config.min_key_frame_interval; let max_keyint = self.encoder_config.max_key_frame_interval; @@ -394,7 +392,7 @@ impl SceneChangeDetector { // Adaptive threshold for medium version, static thresholf for the slow one let threshold = if self.speed_mode == SceneDetectionSpeed::Medium { - intra_cost * (1.0 - bias) / SCALE_FACTOR + intra_cost * (1.0 - bias) } else { self.threshold as f64 }; From 9dba7a3dbb86fcec5ab16847401e51898f00e5b6 Mon Sep 17 00:00:00 2001 From: Thomas Daede Date: Wed, 1 Sep 2021 08:48:32 -0700 Subject: [PATCH 176/188] Don't print the Error twice in the CLI. The second print came from the stdlib's Result return from main(), so manually return via std::process::exit. --- src/bin/rav1e.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/bin/rav1e.rs b/src/bin/rav1e.rs index 9bde699f8a..b9a14cb7fd 100644 --- a/src/bin/rav1e.rs +++ b/src/bin/rav1e.rs @@ -48,6 +48,7 @@ use crate::decoder::{Decoder, FrameBuilder, VideoDetails}; use crate::muxer::*; use std::fs::File; use std::io::{Read, Seek, Write}; +use std::process::exit; use std::sync::Arc; impl FrameBuilder for Context { @@ -293,7 +294,7 @@ fn do_encode( Ok(()) } -fn main() -> Result<(), Box> { +fn main() { #[cfg(feature = "tracing")] use rust_hawktracer::*; init_logger(); @@ -306,10 +307,10 @@ fn main() -> Result<(), Box> { buffer_size: 4096, }); - run().map_err(|e| { + run().unwrap_or_else(|e| { error::print_error(&e); - Box::new(e) as Box - }) + exit(1); + }); } fn init_logger() { From 0c6321176925791e089f6cfc83d38c186e2052cb Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sun, 22 Aug 2021 15:08:31 +0200 Subject: [PATCH 177/188] Add a configuration attribute to use nightly features It will be used to experiment with the aarch64 and wasm SIMD intrinsics mainly. 
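The change below only makes build.rs emit the cfg flag. As a rough, illustrative sketch of how crate code typically consumes such a build-script flag (the function name here is a placeholder, not something added by this patch):

#[cfg(nightly_rustc)]
fn simd_experiment() {
  // Nightly-only path: aarch64 / wasm SIMD intrinsics experiments would be
  // gated behind the `nightly_rustc` cfg emitted by build.rs.
}

#[cfg(not(nightly_rustc))]
fn simd_experiment() {
  // Fallback path compiled on stable toolchains.
}
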
--- build.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index c4c44077c8..3dab07df6e 100644 --- a/build.rs +++ b/build.rs @@ -10,7 +10,7 @@ #![allow(clippy::print_literal)] #![allow(clippy::unused_io_amount)] -use rustc_version::{version, Version}; +use rustc_version::{version, version_meta, Channel, Version}; #[allow(unused_imports)] use std::env; use std::fs; @@ -220,6 +220,10 @@ fn rustc_version_check() { eprintln!("rav1e requires rustc >= {}.", REQUIRED_VERSION); exit(1); } + + if version_meta().unwrap().channel == Channel::Nightly { + println!("cargo:rustc-cfg=nightly_rustc"); + } } #[cfg(feature = "asm")] From 58208ce14b3d313253d132f709b672ab2aa6a622 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 1 Sep 2021 18:21:40 +0200 Subject: [PATCH 178/188] Fix the histogram max_value setting --- src/api/lookahead.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/api/lookahead.rs b/src/api/lookahead.rs index ad91dbf22c..4699884c70 100644 --- a/src/api/lookahead.rs +++ b/src/api/lookahead.rs @@ -145,8 +145,10 @@ pub(crate) fn estimate_inter_costs_histogram_blocks( height: IMPORTANCE_BLOCK_SIZE, }); + let max_value = 1u64 << 12; // At most 12 bit per pixel + let mut histogram_org = - Histogram::configure().max_value(256).build().unwrap(); + Histogram::configure().max_value(max_value).build().unwrap(); let iter_org = region_org.rows_iter(); for row in iter_org { for pixel in row { @@ -156,7 +158,7 @@ pub(crate) fn estimate_inter_costs_histogram_blocks( } let mut histogram_ref = - Histogram::configure().max_value(256).build().unwrap(); + Histogram::configure().max_value(max_value).build().unwrap(); let iter_ref = region_ref.rows_iter(); for row in iter_ref { for pixel in row { From 8a88d7aa2b6d844965094e963f36afe3ae164220 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 7 Sep 2021 22:02:49 +0900 Subject: [PATCH 179/188] CI: Update libdav1d to 0.9.2-dmo1 --- .github/workflows/rav1e.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index 451daf59da..6e199721cd 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -161,11 +161,11 @@ jobs: matrix.conf == 'grcov-coveralls' || matrix.conf == 'fuzz' || matrix.conf == 'no-asm-tests' env: LINK: https://www.deb-multimedia.org/pool/main/d/dav1d-dmo - DAV1D_VERSION: 0.9.1-dmo1 + DAV1D_VERSION: 0.9.2-dmo1 DAV1D_DEV_SHA256: >- - df760b1124c121289f40cf25d6f4a6ee2fb1d20a988853fa33b9e947a1cd263a + 2ed10b35fa2663d2e7ba04d8fe01f0518602008d066cadcb311a5b8105f70f14 DAV1D_LIB_SHA256: >- - a6a3cf5b9d08250780b5661d40388267cd4dae42acdfc4d7b132ca19815e0301 + 1c4336743115b8a512fb984d289cac65c49ad249ba1456940258e53d9c91bd0c run: | echo "$LINK/libdav1d-dev_${DAV1D_VERSION}_amd64.deb" >> DEBS echo "$LINK/libdav1d5_${DAV1D_VERSION}_amd64.deb" >> DEBS From 45bde4a2a667b0ef625cd1fd01331020155580f7 Mon Sep 17 00:00:00 2001 From: Redzic Date: Tue, 7 Sep 2021 08:12:02 -0500 Subject: [PATCH 180/188] Remove unused clone of `score_deque` in `adaptive_scenecut` --- src/scenechange/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index c6e09d8cb3..954e610f75 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -250,9 +250,6 @@ impl SceneChangeDetector { /// Value of current frame is offset by lookahead, if lookahead >=5 /// Returns true if current scene score is higher than adapted threshold fn 
adaptive_scenecut(&mut self) -> bool { - let mut cloned_deque = self.score_deque.to_vec(); - cloned_deque.remove(self.deque_offset); - // Subtract the previous metric value from the current one // It makes the peaks in the metric more distinctive if (self.speed_mode != SceneDetectionSpeed::Fast) && self.deque_offset > 0 From c6a52d217078e73566ef2485590da8a24d6a9a88 Mon Sep 17 00:00:00 2001 From: redzic <48274562+redzic@users.noreply.github.com> Date: Tue, 7 Sep 2021 13:02:51 -0500 Subject: [PATCH 181/188] =?UTF-8?q?Simplify=20`adaptive=5Fscenecut`=20by?= =?UTF-8?q?=20not=20collecting=20into=20a=20Vec=20for=20`*=5Fover=E2=80=A6?= =?UTF-8?q?=20(#2797)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Simplify `adaptive_scenecut` by not collecting into a Vec for `*_over_tr` The variables `back_over_tr` and `forward_over_tr` (which have now been renamed to be suffixed with `_count`) were previously initialized by performing an allocation (through the use of `.collect_vec()`) of the filtered values, even though only the *length* of the `Vec` was actually used. We now get the length directly by using `.count()`, which does not perform any allocations, and allows the compiler to make use of SIMD instructions. Additionally, the previous code essentially did this (simplified for example): `x != 0 && x > 1` (where `x` is a `usize`). This has been simplified to just `x > 1`. * Further simplification Do not check if `back_over_tr_count` and `back_deque` are both greater than one, as just checking `back_over_tr_count` is sufficient --- src/scenechange/mod.rs | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 954e610f75..909e7e321a 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -13,7 +13,6 @@ use crate::cpu_features::CpuFeatureLevel; use crate::encoder::Sequence; use crate::frame::*; use crate::util::{CastFromPrimitive, Pixel}; -use itertools::Itertools; use rust_hawktracer::*; use std::sync::Arc; use std::{cmp, u64}; @@ -264,33 +263,29 @@ impl SceneChangeDetector { if scene_score >= scene_threshold as f64 { let back_deque = &self.score_deque[self.deque_offset + 1..]; let forward_deque = &self.score_deque[..self.deque_offset]; - let back_over_tr = - back_deque.iter().filter(|(x, y)| x > y).collect_vec(); - let forward_over_tr = - forward_deque.iter().filter(|(x, y)| x > y).collect_vec(); + let back_over_tr_count = + back_deque.iter().filter(|(x, y)| x > y).count(); + let forward_over_tr_count = + forward_deque.iter().filter(|(x, y)| x > y).count(); // Check for scenecut after the flashes // No frames over threshold forward // and some frames over threshold backward - if !back_over_tr.is_empty() - && forward_over_tr.is_empty() - && back_deque.len() > 1 - && back_over_tr.len() > 1 - { + if forward_over_tr_count == 0 && back_over_tr_count > 1 { return true; } // Check for scenecut before flash // If distance longer than max flash length - if back_over_tr.is_empty() - && forward_over_tr.len() == 1 + if back_over_tr_count == 0 + && forward_over_tr_count == 1 && forward_deque[0].0 > forward_deque[0].1 { return true; } - if !back_over_tr.is_empty() || !forward_over_tr.is_empty() { + if back_over_tr_count != 0 || forward_over_tr_count != 0 { return false; } } From 54a87a12e0d706e19ecda4cc0c3cf3bd324fb5cb Mon Sep 17 00:00:00 2001 From: Josh Holmer Date: Thu, 9 Sep 2021 18:20:15 -0400 Subject: [PATCH 182/188] Remove the histogram crate (#2794) It was 
extremely slow, especially at higher max values. We can just calculate the mean directly by summing and division. --- Cargo.toml | 1 - src/api/lookahead.rs | 23 ++++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ab3aa8cb1d..d09452ab94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,6 @@ wasm = ["wasm-bindgen"] dump_lookahead_data = ["byteorder", "image"] [dependencies] -histogram = "0.6.9" arg_enum_proc_macro = "0.3" bitstream-io = "1" cfg-if = "1.0" diff --git a/src/api/lookahead.rs b/src/api/lookahead.rs index 4699884c70..d5659e44a1 100644 --- a/src/api/lookahead.rs +++ b/src/api/lookahead.rs @@ -14,7 +14,6 @@ use crate::rayon::iter::*; use crate::tiling::{Area, TileRect}; use crate::transform::TxSize; use crate::{Frame, Pixel}; -use histogram::Histogram; use rust_hawktracer::*; use std::sync::Arc; use v_frame::pixel::CastFromPrimitive; @@ -145,30 +144,28 @@ pub(crate) fn estimate_inter_costs_histogram_blocks( height: IMPORTANCE_BLOCK_SIZE, }); - let max_value = 1u64 << 12; // At most 12 bit per pixel - - let mut histogram_org = - Histogram::configure().max_value(max_value).build().unwrap(); + let mut count = 0i64; + let mut histogram_org_sum = 0i64; let iter_org = region_org.rows_iter(); for row in iter_org { for pixel in row { - let cur = i16::cast_from(*pixel); - histogram_org.increment(cur as u64).unwrap(); + let cur = u16::cast_from(*pixel); + histogram_org_sum += cur as i64; + count += 1; } } - let mut histogram_ref = - Histogram::configure().max_value(max_value).build().unwrap(); + let mut histogram_ref_sum = 0i64; let iter_ref = region_ref.rows_iter(); for row in iter_ref { for pixel in row { - let cur = i16::cast_from(*pixel); - histogram_ref.increment(cur as u64).unwrap(); + let cur = u16::cast_from(*pixel); + histogram_ref_sum += cur as i64; } } - let mean = (histogram_org.mean().unwrap() as i32 - - histogram_ref.mean().unwrap() as i32) + let mean = (((histogram_org_sum + count / 2) / count) + - ((histogram_ref_sum + count / 2) / count)) .abs(); inter_costs.push(mean as u32); From 0c58582863afb7e018a053f6af84897ec7de97a5 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Fri, 10 Sep 2021 10:50:06 +0200 Subject: [PATCH 183/188] Add more Frame -> FrameInternal to the C-API --- src/capi.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/capi.rs b/src/capi.rs index cda43d8d40..f3885b8c2d 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -61,6 +61,18 @@ impl From> for FrameInternal { } } +impl From>> for FrameInternal { + fn from(f: Arc>) -> FrameInternal { + FrameInternal::U8(f) + } +} + +impl From>> for FrameInternal { + fn from(f: Arc>) -> FrameInternal { + FrameInternal::U16(f) + } +} + struct FrameOpaque { opaque: *mut c_void, cb: FrameOpaqueCb, From dfa3925ac0281ca7eed3b539338145859d5e4cee Mon Sep 17 00:00:00 2001 From: Thomas Daede Date: Fri, 10 Sep 2021 01:35:54 -0700 Subject: [PATCH 184/188] Add rec and source to capi. 
--- src/capi.rs | 93 ++++++++++++++++++++++++++++++++++++++++++-- v_frame/Cargo.toml | 2 +- v_frame/src/plane.rs | 62 +++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 4 deletions(-) diff --git a/src/capi.rs b/src/capi.rs index f3885b8c2d..615993c674 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -218,7 +218,10 @@ impl EncContext { fn receive_packet(&mut self) -> Result { fn receive_packet( ctx: &mut rav1e::Context, - ) -> Result { + ) -> Result + where + FrameInternal: From>>, + { ctx.receive_packet().map(|p| { let mut p = std::mem::ManuallyDrop::new(p); let opaque = p.opaque.take().map_or_else(std::ptr::null_mut, |o| { @@ -227,10 +230,32 @@ impl EncContext { opaque.opaque }); let p = std::mem::ManuallyDrop::into_inner(p); - let rav1e::Packet { data, input_frameno, frame_type, .. } = p; + let rav1e::Packet { + data, rec, source, input_frameno, frame_type, .. + } = p; let len = data.len(); let data = Box::into_raw(data.into_boxed_slice()) as *const u8; - Packet { data, len, input_frameno, frame_type, opaque } + let rec = if let Some(rec) = rec { + let rec = FrameInternal::from(rec); + Box::into_raw(Box::new(Frame { + fi: rec, + frame_type: FrameTypeOverride::No, + opaque: None, + })) + } else { + std::ptr::null_mut() + }; + let source = if let Some(source) = source { + let source = FrameInternal::from(source); + Box::into_raw(Box::new(Frame { + fi: source, + frame_type: FrameTypeOverride::No, + opaque: None, + })) + } else { + std::ptr::null_mut() + }; + Packet { data, rec, source, len, input_frameno, frame_type, opaque } }) } match self { @@ -328,6 +353,12 @@ pub struct Packet { pub frame_type: FrameType, /// User provided opaque data pub opaque: *mut c_void, + /// The reconstruction of the shown frame. + /// This is freed automatically by rav1e_packet_unref(). + pub rec: *mut Frame, + /// The Reference Frame + /// This is freed automatically by rav1e_packet_unref(). + pub source: *mut Frame, } /// Version information as presented in `[package]` `version`. @@ -1101,6 +1132,8 @@ pub unsafe extern fn rav1e_packet_unref(pkt: *mut Packet) { pkt.len as usize, pkt.len as usize, ); + rav1e_frame_unref(pkt.rec); + rav1e_frame_unref(pkt.source); } } @@ -1133,6 +1166,17 @@ fn rav1e_frame_fill_plane_internal( ); } +fn rav1e_frame_extract_plane_internal( + f: &Arc>, plane: c_int, data_slice: &mut [u8], + stride: ptrdiff_t, bytewidth: c_int, +) { + f.planes[plane as usize].copy_to_raw_u8( + data_slice, + stride as usize, + bytewidth as usize, + ); +} + /// Fill a frame plane /// /// Currently the frame contains 3 planes, the first is luminance followed by @@ -1163,6 +1207,39 @@ pub unsafe extern fn rav1e_frame_fill_plane( } } +/// Extract a frame plane +/// +/// This is the reverse of rav1e_frame_fill_plane(), primarily used for +/// extracting the source and reconstruction data from a RaPacket. +/// +/// Currently the frame contains 3 planes, the first is luminance followed by +/// chrominance. +/// +/// The data is copied out of the frame for a single plane. 
+/// +/// frame: A frame provided inside a packet returned by rav1e_receive_packet() +/// plane: The index of the plane starting from 0 +/// data: The destination for the data +/// data_len: Length of the buffer +/// stride: Plane line in bytes, including padding +/// bytewidth: Number of bytes per component, either 1 or 2 +#[no_mangle] +pub unsafe extern fn rav1e_frame_extract_plane( + frame: *const Frame, plane: c_int, data: *mut u8, data_len: size_t, + stride: ptrdiff_t, bytewidth: c_int, +) { + let data_slice = slice::from_raw_parts_mut(data, data_len as usize); + + match (*frame).fi { + FrameInternal::U8(ref f) => rav1e_frame_extract_plane_internal( + f, plane, data_slice, stride, bytewidth, + ), + FrameInternal::U16(ref f) => rav1e_frame_extract_plane_internal( + f, plane, data_slice, stride, bytewidth, + ), + } +} + #[cfg(test)] mod test { use super::*; @@ -1201,6 +1278,16 @@ mod test { let ret = rav1e_receive_packet(rax, &mut p); if ret == EncoderStatus::Success { + let mut source = vec![1; 64 * 64]; + rav1e_frame_extract_plane( + (*p).source, + 0, + source.as_mut_ptr(), + 64 * 64, + 64, + 1, + ); + assert_eq!(source, vec![128; 64 * 64]); let v = Box::from_raw((*p).opaque as *mut u8); eprintln!("Opaque {}", v); } diff --git a/v_frame/Cargo.toml b/v_frame/Cargo.toml index 964d2ef30f..df484ca52d 100644 --- a/v_frame/Cargo.toml +++ b/v_frame/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "v_frame" -version = "0.2.2" +version = "0.2.3" description = "Video Frame data structures, part of rav1e" license = "BSD-2-Clause" authors = ["Luca Barbato "] diff --git a/v_frame/src/plane.rs b/v_frame/src/plane.rs index b4daf85388..bb19f4c34f 100644 --- a/v_frame/src/plane.rs +++ b/v_frame/src/plane.rs @@ -435,6 +435,41 @@ impl Plane { } } + /// Copies data from a plane into a pixel array. + pub fn copy_to_raw_u8( + &self, dest: &mut [u8], dest_stride: usize, dest_bytewidth: usize, + ) { + let stride = self.cfg.stride; + for (self_row, dest_row) in + self.data_origin().chunks(stride).zip(dest.chunks_mut(dest_stride)) + { + match dest_bytewidth { + 1 => { + for (self_pixel, dest_pixel) in + self_row[..self.cfg.width].iter().zip(dest_row.iter_mut()) + { + *dest_pixel = u8::cast_from(*self_pixel); + } + } + 2 => { + assert!( + mem::size_of::() >= 2, + "dest bytewidth ({}) cannot fit in Plane", + dest_bytewidth + ); + for (self_pixel, bytes) in + self_row[..self.cfg.width].iter().zip(dest_row.chunks_mut(2)) + { + bytes[0] = u16::cast_from(*self_pixel) as u8; + bytes[1] = (u16::cast_from(*self_pixel) >> 8) as u8; + } + } + + _ => {} + } + } + } + /// Returns plane with half the resolution for width and height. /// Downscaled with 2x2 box filter. /// Padded to dimensions with frame_width and frame_height. 
@@ -840,6 +875,33 @@ pub mod test { assert_eq!(&input[..64], &plane.data[..64]); } + #[test] + fn copy_to_raw_u8() { + #[rustfmt::skip] + let plane = Plane::from_slice(& + vec![ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 3, 4, 0, 0, + 0, 0, 8, 7, 6, 5, 0, 0, + 0, 0, 9, 8, 7, 6, 0, 0, + 0, 0, 2, 3, 4, 5, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + 8, + ); + + let mut output = vec![42u8; 64]; + + plane.copy_to_raw_u8(&mut output, 8, 1); + + println!("{:?}", &plane.data[..10]); + + assert_eq!(&output[..64], &plane.data[..64]); + } + #[test] fn test_plane_downsample() { #[rustfmt::skip] From b4b06e9f1b7318f30038df7528c689d2b07a9943 Mon Sep 17 00:00:00 2001 From: redzic Date: Wed, 8 Sep 2021 14:34:47 -0500 Subject: [PATCH 185/188] Reuse allocations of `Plane` for downscaling when using fast scene detection The implicit size of the frame buffer for downscaled frames was 2, but the type of `frame_buffer` was `Vec>`. This has now been changed to `Option<(Box<[Plane; 2]>, bool)>` to explicitly handle the size of the frame buffer, where the `bool` represents whether the `Plane` is in a valid state (i.e. if `true`, push a new frame onto the buffer, and if `false`, which will happen on scenecuts, reuse the allocations for new downscaled frames). --- src/scenechange/mod.rs | 85 +++++++++++++++++++++++++++--------------- v_frame/src/plane.rs | 47 ++++++++++++----------- 2 files changed, 81 insertions(+), 51 deletions(-) diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 909e7e321a..32d5986a7c 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -26,13 +26,18 @@ pub struct SceneChangeDetector { speed_mode: SceneDetectionSpeed, /// scaling factor for fast scene detection scale_factor: usize, - // Frame buffer for scaled frames - frame_buffer: Vec>, - // Deque offset for current + /// Frame buffer for scaled frames + frame_buffer: Option<( + Box<[Plane; 2]>, + // `true` if the data is valid and initialized, or `false` + // if it should be assumed that the data is uninitialized. 
+ bool, + )>, + /// Deque offset for current lookahead_offset: usize, - // Start deque offset based on lookahead + /// Start deque offset based on lookahead deque_offset: usize, - // Scenechange results for adaptive threshold + /// Scenechange results for adaptive threshold score_deque: Vec<(f64, f64)>, /// Number of pixels in scaled frame for fast mode pixels: usize, @@ -90,12 +95,6 @@ impl SceneChangeDetector { 1 }; - let frame_buffer = if speed_mode == SceneDetectionSpeed::Fast { - Vec::with_capacity(2) - } else { - Vec::new() - }; - let threshold = if speed_mode == SceneDetectionSpeed::Fast { FAST_THRESHOLD * (bit_depth as f64) / 8.0 } else { @@ -106,7 +105,7 @@ impl SceneChangeDetector { threshold, speed_mode, scale_factor, - frame_buffer, + frame_buffer: None, lookahead_offset, deque_offset, score_deque, @@ -146,6 +145,13 @@ impl SceneChangeDetector { return false; } if distance >= self.encoder_config.max_key_frame_interval { + // Clear buffers and `score_deque` + if let Some((_, is_initialized)) = &mut self.frame_buffer { + *is_initialized = false; + } + debug!("[SC-score-deque]{:.0?}", self.score_deque); + self.score_deque.clear(); + return true; } @@ -199,8 +205,10 @@ impl SceneChangeDetector { ); if scenecut { - // Clear buffers and deque - self.frame_buffer.clear(); + // Clear buffers and `score_deque` + if let Some((_, is_initialized)) = &mut self.frame_buffer { + *is_initialized = false; + } debug!("[SC-score-deque]{:.0?}", self.score_deque); self.score_deque.clear(); } else { @@ -299,26 +307,43 @@ impl SceneChangeDetector { fn fast_scenecut( &mut self, frame1: Arc>, frame2: Arc>, ) -> ScenecutResult { - // Downscaling both frames for comparison - // Moving scaled frames to buffer - if self.frame_buffer.is_empty() { - let frame1_scaled = frame1.planes[0].downscale(self.scale_factor); - self.frame_buffer.push(frame1_scaled); - - let frame2_scaled = frame2.planes[0].downscale(self.scale_factor); - self.frame_buffer.push(frame2_scaled); + // downscale both frames for faster comparison + if let Some((frame_buffer, is_initialized)) = &mut self.frame_buffer { + let frame_buffer = &mut **frame_buffer; + if *is_initialized { + frame_buffer.swap(0, 1); + frame2.planes[0] + .downscale_in_place(self.scale_factor, &mut frame_buffer[1]); + } else { + // both frames are in an irrelevant and invalid state, so we have to reinitialize + // them, but we can reuse their allocations + frame1.planes[0] + .downscale_in_place(self.scale_factor, &mut frame_buffer[0]); + frame2.planes[0] + .downscale_in_place(self.scale_factor, &mut frame_buffer[1]); + *is_initialized = true; + } } else { - self.frame_buffer.remove(0); - self.frame_buffer.push(frame2.planes[0].downscale(self.scale_factor)); + self.frame_buffer = Some(( + Box::new([ + frame1.planes[0].downscale(self.scale_factor), + frame2.planes[0].downscale(self.scale_factor), + ]), + true, // the frame buffer is initialized and in a valid state + )); } - let delta = - self.delta_in_planes(&self.frame_buffer[0], &self.frame_buffer[1]); + if let Some((frame_buffer, _)) = &self.frame_buffer { + let frame_buffer = &**frame_buffer; + let delta = self.delta_in_planes(&frame_buffer[0], &frame_buffer[1]); - ScenecutResult { - intra_cost: self.threshold as f64, - threshold: self.threshold as f64, - inter_cost: delta as f64, + ScenecutResult { + intra_cost: self.threshold as f64, + threshold: self.threshold as f64, + inter_cost: delta as f64, + } + } else { + unreachable!() } } diff --git a/v_frame/src/plane.rs b/v_frame/src/plane.rs index 
bb19f4c34f..7f9edb3ac0 100644 --- a/v_frame/src/plane.rs +++ b/v_frame/src/plane.rs @@ -477,7 +477,7 @@ impl Plane { &self, frame_width: usize, frame_height: usize, ) -> Plane { let src = self; - // unsafe: all pixels initialized in this function + // SAFETY: all pixels initialized in this function let mut new = unsafe { Plane::new_uninitialized( (src.cfg.width + 1) / 2, @@ -519,22 +519,13 @@ impl Plane { new } - /// Returns plane with downscaled resolution - /// Downscaling the plane by integer value - /// Not padded - #[hawktracer(downscale)] + /// Returns a plane downscaled from the source plane by a factor of `scale` (not padded) pub fn downscale(&self, scale: usize) -> Plane { - let box_pixels = scale * scale; - let half_box_pixels = box_pixels as u32 / 2; // Used for rounding int division - - let src = self; - let data_origin = src.data_origin(); - - // unsafe: all pixels initialized in this function + // SAFETY: all pixels initialized when `downscale_in_place` is called let mut new_plane = unsafe { Plane::new_uninitialized( - src.cfg.width / scale, - src.cfg.height / scale, + self.cfg.width / scale, + self.cfg.height / scale, 0, 0, 0, @@ -542,19 +533,35 @@ impl Plane { ) }; - let stride = new_plane.cfg.stride; - let width = new_plane.cfg.width; - let height = new_plane.cfg.height; + self.downscale_in_place(scale, &mut new_plane); + + new_plane + } + + /// Downscales the source plane by a factor of `scale`, writing the result to `in_plane` (not padded) + /// + /// `in_plane`'s width and height must be sufficient for `scale`. + #[hawktracer(downscale)] + pub fn downscale_in_place(&self, scale: usize, in_plane: &mut Plane) { + let src = self; + let box_pixels = scale * scale; + let half_box_pixels = box_pixels as u32 / 2; // Used for rounding int division + + let data_origin = src.data_origin(); + + let stride = in_plane.cfg.stride; + let width = in_plane.cfg.width; + let height = in_plane.cfg.height; // Par iter over dst chunks - let np_raw_slice = new_plane.data.deref_mut(); + let plane_data_mut_slice = in_plane.data.deref_mut(); let threads = current_num_threads(); let chunk_rows = cmp::max((height + threads / 2) / threads, 1); let chunk_size = chunk_rows * stride; let height_limit = height * stride; - np_raw_slice[0..height_limit] + plane_data_mut_slice[0..height_limit] .par_chunks_mut(chunk_size) .enumerate() .for_each(|(chunk_idx, chunk)| { @@ -586,8 +593,6 @@ impl Plane { } } }); - - new_plane } /// Iterates over the pixels in the plane, skipping the padding. From e27cff00b4dac55d2eaad0dd9d058912a8dc3edc Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 9 Sep 2021 15:47:17 -0700 Subject: [PATCH 186/188] Write the 'monochrome' bit correctly The original code write the 'monochrome' bit with the value seq.bit_depth == 1. I suspect this comes from misunderstanding what monochrome means in AV1. In AV1, monochrome is what is usually known as grayscale, rather than black and white. Therefore the correct check for monochrome should be seq.chroma_sampling == ChromaSampling::Cs400. Also change the comment "sample_position" to "chroma_sample_position" because chroma_sample_position is the name of that field in the AV1 ISOBMFF specification. 
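As a minimal sketch of the corrected mapping (the helper name is hypothetical; it only restates the bit expressions applied in the diff below, using rav1e's ChromaSampling enum):

fn chroma_header_bits(cs: ChromaSampling) -> (bool, bool, bool) {
  let monochrome = cs == ChromaSampling::Cs400; // grayscale (4:0:0), not 1-bit depth
  let chroma_subsampling_x = cs != ChromaSampling::Cs444; // set unless 4:4:4
  let chroma_subsampling_y = cs == ChromaSampling::Cs420; // set only for 4:2:0
  (monochrome, chroma_subsampling_x, chroma_subsampling_y)
}
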
--- src/api/channel/data.rs | 4 ++-- src/api/context.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api/channel/data.rs b/src/api/channel/data.rs index 51ac473cdb..64dc10f366 100644 --- a/src/api/channel/data.rs +++ b/src/api/channel/data.rs @@ -209,10 +209,10 @@ impl PacketReceiver { bw.write_bit(false)?; // tier bw.write_bit(seq.bit_depth > 8)?; // high_bitdepth bw.write_bit(seq.bit_depth == 12)?; // twelve_bit - bw.write_bit(seq.bit_depth == 1)?; // monochrome + bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs400)?; // monochrome bw.write_bit(seq.chroma_sampling != ChromaSampling::Cs444)?; // chroma_subsampling_x bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs420)?; // chroma_subsampling_y - bw.write(2, 0)?; // sample_position + bw.write(2, 0)?; // chroma_sample_position bw.write(3, 0)?; // reserved bw.write_bit(false)?; // initial_presentation_delay_present diff --git a/src/api/context.rs b/src/api/context.rs index 88ea9da84e..458ce98ca0 100644 --- a/src/api/context.rs +++ b/src/api/context.rs @@ -327,10 +327,10 @@ impl Context { bw.write_bit(false)?; // tier bw.write_bit(seq.bit_depth > 8)?; // high_bitdepth bw.write_bit(seq.bit_depth == 12)?; // twelve_bit - bw.write_bit(seq.bit_depth == 1)?; // monochrome + bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs400)?; // monochrome bw.write_bit(seq.chroma_sampling != ChromaSampling::Cs444)?; // chroma_subsampling_x bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs420)?; // chroma_subsampling_y - bw.write(2, 0)?; // sample_position + bw.write(2, 0)?; // chroma_sample_position bw.write(3, 0)?; // reserved bw.write_bit(false)?; // initial_presentation_delay_present From 3d33e2c319772e1a03c3c1fc4db0a62ab240bc8d Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 14 Sep 2021 12:34:10 +0200 Subject: [PATCH 187/188] Prepare for release --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d09452ab94..0ff425158b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rav1e" -version = "0.5.0-beta" +version = "0.5.0-beta.2" authors = ["Thomas Daede "] edition = "2018" build = "build.rs" @@ -73,7 +73,7 @@ dav1d-sys = { version = "0.3.4", optional = true } aom-sys = { version = "0.3.0", optional = true } scan_fmt = { version = "0.2.3", optional = true, default-features = false } ivf = { version = "0.1", path = "ivf/", optional = true } -v_frame = { version = "0.2.2", path = "v_frame/" } +v_frame = { version = "0.2.3", path = "v_frame/" } av-metrics = { version = "0.7.1", optional = true, default-features = false } rayon = "1.0" crossbeam = { version = "0.8", optional = true } From 2ec4e675b0298c3513110cd67f8d229112feb468 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 14 Sep 2021 20:57:35 +0200 Subject: [PATCH 188/188] Bump the v_frame version as additional API got added --- Cargo.toml | 2 +- v_frame/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0ff425158b..09d0389bda 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,7 +73,7 @@ dav1d-sys = { version = "0.3.4", optional = true } aom-sys = { version = "0.3.0", optional = true } scan_fmt = { version = "0.2.3", optional = true, default-features = false } ivf = { version = "0.1", path = "ivf/", optional = true } -v_frame = { version = "0.2.3", path = "v_frame/" } +v_frame = { version = "0.2.4", path = "v_frame/" } av-metrics = { version = "0.7.1", optional = true, default-features = false } rayon = 
"1.0" crossbeam = { version = "0.8", optional = true } diff --git a/v_frame/Cargo.toml b/v_frame/Cargo.toml index df484ca52d..6d617d5e49 100644 --- a/v_frame/Cargo.toml +++ b/v_frame/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "v_frame" -version = "0.2.3" +version = "0.2.4" description = "Video Frame data structures, part of rav1e" license = "BSD-2-Clause" authors = ["Luca Barbato "]