author     Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-02-28 21:03:50 +0000
committer  Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-02-28 21:03:50 +0000
commit     68e705e1d4218c6eac88d65295236faa1e619011 (patch)
tree       6c2b5c30b2320802f44e5fabf1058c74c3f7df25
parent     5b270c69b51dcee979ba09f9ca892175eea25a9c (diff)
parent     024ac3cafd57de34e57b0c0ee023f747a82b61c9 (diff)
download   libdav1d-simpleperf-release.tar.gz

Snap for 11510257 from 024ac3cafd57de34e57b0c0ee023f747a82b61c9 to simpleperf-release
Change-Id: I976328a79aa611b52b12f99e8c95ea3c2cba2c55
-rw-r--r--  .gitlab-ci.yml | 113
-rw-r--r--  METADATA | 29
-rw-r--r--  NEWS | 14
-rw-r--r--  config/arm32/config.h | 12
-rw-r--r--  config/arm64/config.h | 12
-rw-r--r--  config/riscv64/config.h | 16
-rw-r--r--  config/x86_32/config.h | 12
-rw-r--r--  config/x86_64/config.h | 12
-rw-r--r--  include/common/attributes.h | 2
-rw-r--r--  meson.build | 26
-rw-r--r--  meson_options.txt | 5
-rw-r--r--  package/crossfiles/loongarch64-linux.meson | 13
-rw-r--r--  package/crossfiles/riscv64-linux.meson | 12
-rw-r--r--  src/arm/64/itx.S | 7
-rw-r--r--  src/cpu.c | 4
-rw-r--r--  src/cpu.h | 8
-rw-r--r--  src/decode.c | 55
-rw-r--r--  src/internal.h | 2
-rw-r--r--  src/itx_tmpl.c | 10
-rw-r--r--  src/loongarch/cpu.c | 47
-rw-r--r--  src/loongarch/cpu.h | 37
-rw-r--r--  src/loongarch/itx.S | 8104
-rw-r--r--  src/loongarch/itx.h | 195
-rw-r--r--  src/loongarch/loongson_asm.S | 776
-rw-r--r--  src/loongarch/loopfilter.S | 1108
-rw-r--r--  src/loongarch/loopfilter.h | 52
-rw-r--r--  src/loongarch/looprestoration.S | 1407
-rw-r--r--  src/loongarch/looprestoration.h | 78
-rw-r--r--  src/loongarch/looprestoration_tmpl.c | 274
-rw-r--r--  src/loongarch/mc.S | 4758
-rw-r--r--  src/loongarch/mc.h | 118
-rw-r--r--  src/loongarch/msac.S | 368
-rw-r--r--  src/loongarch/msac.h | 46
-rw-r--r--  src/loongarch/refmvs.S | 152
-rw-r--r--  src/loongarch/refmvs.h | 44
-rw-r--r--  src/loopfilter_tmpl.c | 4
-rw-r--r--  src/looprestoration_tmpl.c | 4
-rw-r--r--  src/mc_tmpl.c | 4
-rw-r--r--  src/meson.build | 27
-rw-r--r--  src/msac.h | 2
-rw-r--r--  src/picture.c | 44
-rw-r--r--  src/qm.c | 1499
-rw-r--r--  src/refmvs.c | 4
-rw-r--r--  src/refmvs.h | 1
-rw-r--r--  src/riscv/64/itx.S | 1339
-rw-r--r--  src/riscv/asm.S | 126
-rw-r--r--  src/riscv/cpu.c | 49
-rw-r--r--  src/riscv/cpu.h | 37
-rw-r--r--  src/riscv/itx.h | 111
-rw-r--r--  src/x86/cpu.c | 7
-rw-r--r--  src/x86/filmgrain.h | 10
-rw-r--r--  src/x86/filmgrain16_avx512.asm | 10
-rw-r--r--  src/x86/ipred.h | 4
-rw-r--r--  src/x86/ipred16_avx512.asm | 1812
-rw-r--r--  src/x86/ipred_avx2.asm | 56
-rw-r--r--  src/x86/ipred_avx512.asm | 1701
-rw-r--r--  src/x86/loopfilter.h | 7
-rw-r--r--  src/x86/loopfilter_avx512.asm | 21
-rw-r--r--  src/x86/looprestoration16_avx2.asm | 16
-rw-r--r--  src/x86/looprestoration_avx2.asm | 11
-rw-r--r--  src/x86/mc.h | 9
-rw-r--r--  src/x86/pal.asm | 2
-rw-r--r--  tests/checkasm/cdef.c | 2
-rw-r--r--  tests/checkasm/checkasm.c | 170
-rw-r--r--  tests/checkasm/checkasm.h | 72
-rw-r--r--  tests/checkasm/ipred.c | 15
-rw-r--r--  tests/checkasm/msac.c | 8
-rw-r--r--  tests/checkasm/riscv/checkasm_64.S | 252
-rwxr-xr-x  tests/dav1d_argon.bash | 13
-rw-r--r--  tests/meson.build | 4
-rw-r--r--  tools/dav1d_cli_parse.c | 9
71 files changed, 23648 insertions, 1742 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6b80a35..702f284 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,56 +4,56 @@ stages:
- test
.debian-amd64-common:
- image: registry.videolan.org/dav1d-debian-unstable:20230512061205
+ image: registry.videolan.org/dav1d-debian-unstable:20240113214804
stage: build
tags:
- docker
- amd64
.debian-amd64-minimum:
- image: registry.videolan.org/dav1d-debian-minimum:20230211045249
+ image: registry.videolan.org/dav1d-debian-minimum:20231024033032
stage: build
tags:
- docker
- amd64
.debian-llvm-mingw-common:
- image: registry.videolan.org/vlc-debian-llvm-msvcrt:20230212072216
+ image: registry.videolan.org/vlc-debian-llvm-msvcrt:20231024033032
stage: build
tags:
- docker
- amd64
.debian-aarch64-common:
- image: registry.videolan.org/dav1d-debian-bullseye-aarch64:20230512061045
+ image: registry.videolan.org/dav1d-debian-bookworm-aarch64:20231018041418
stage: build
tags:
- docker
- aarch64
.debian-armv7-common:
- image: registry.videolan.org/dav1d-debian-bullseye-armv7:20230513182209
+ image: registry.videolan.org/dav1d-debian-bookworm-armv7:20231018042237
stage: build
tags:
- docker
- armv7
.debian-ppc64le-common:
- image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20230211050439
+ image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20231020040221
stage: build
tags:
- docker
- ppc64le
.android-common:
- image: registry.videolan.org/vlc-debian-android:20230212071537
+ image: registry.videolan.org/vlc-debian-android:20231013040434
stage: build
tags:
- docker
- amd64
.debian-wasm-emscripten-common:
- image: registry.videolan.org/vlc-debian-wasm-emscripten:20221213104631
+ image: registry.videolan.org/vlc-debian-wasm-emscripten:20231024033032
stage: build
tags:
- docker
@@ -418,6 +418,32 @@ build-debian-wasm:
matrix:
- CROSSFILE: [wasm32, wasm64]
+build-debian-riscv64:
+ extends: .debian-amd64-common
+ variables:
+ QEMU_CPU: rv64,v=true,vext_spec=v1.0,vlen=256,elen=64
+ QEMU_LD_PREFIX: /usr/riscv64-linux-gnu/
+ script:
+ - meson setup build --buildtype release
+ -Dtrim_dsp=false
+ --werror
+ --cross-file package/crossfiles/riscv64-linux.meson
+ - ninja -C build
+ - cd build && meson test -v
+
+build-debian-loongarch64:
+ extends: .debian-amd64-common
+ variables:
+ QEMU_CPU: max-loongarch-cpu
+ QEMU_LD_PREFIX: /opt/cross-tools/target/
+ script:
+ - meson setup build --buildtype release
+ -Dtrim_dsp=false
+ --werror
+ --cross-file package/crossfiles/loongarch64-linux.meson
+ - ninja -C build
+ - cd build && meson test -v
+
.test-common:
stage: test
@@ -450,11 +476,11 @@ build-debian-wasm:
- ninja -C build
- cd build
- exit_code=0
- - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
- - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse2" || exit_code=$((exit_code + $?))
- - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?))
- - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse41" || exit_code=$((exit_code + $?))
- - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx2" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata --test-args "--cpumask sse2" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata --test-args "--cpumask sse41" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata --test-args "--cpumask avx2" || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
.test-argon:
@@ -484,6 +510,7 @@ test-debian:
script:
- meson setup build --buildtype release
-Dtestdata_tests=true
+ -Denable_seek_stress=true
-Dlogging=false
-Db_coverage=true
-Dtrim_dsp=false
@@ -496,7 +523,7 @@ test-debian:
grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } '
- time meson test -v --suite testdata_seek-stress --test-args "--threads 2 --framedelay 1"
- time meson test -v --suite testdata_seek-stress --test-args "--threads 2 --framedelay 2"
- - time meson test -v --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads=1 --negstride"
+ - time meson test -v --suite testdata --test-args "--threads=1 --negstride"
coverage: '/^coverage: (\d+.\d+)$/'
artifacts:
expose_as: 'Coverage HTML report'
@@ -534,8 +561,8 @@ test-debian-avx512:
-Dtestdata_tests=true
-Dtrim_dsp=false
- ninja -C build
- - cd build && time meson test --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx512icl"
- - time meson test --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2 --cpumask avx512icl"
+ - cd build && time meson test --suite testdata --test-args "--cpumask avx512icl"
+ - time meson test --suite testdata --test-args "--threads 2 --framedelay 2 --cpumask avx512icl"
test-debian-unaligned-stack:
extends:
@@ -549,6 +576,7 @@ test-debian-unaligned-stack:
script:
- meson setup build --buildtype release
-Dtestdata_tests=true
+ -Denable_seek_stress=true
-Dlogging=false
-Dstack_alignment=16
-Dtrim_dsp=false
@@ -570,8 +598,9 @@ test-debian-asan:
- ninja -C build
- cd build
- exit_code=0
- - time meson test -v --setup=sanitizer --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite checkasm || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-debian-msan:
@@ -585,6 +614,7 @@ test-debian-msan:
script:
- meson setup build --buildtype debugoptimized
-Dtestdata_tests=true
+ -Denable_seek_stress=true
-Dlogging=false
-Db_sanitize=memory
-Db_lundef=false
@@ -603,6 +633,7 @@ test-debian-ubsan:
script:
- meson setup build --buildtype debugoptimized
-Dtestdata_tests=true
+ -Denable_seek_stress=true
-Dlogging=false
-Db_sanitize=undefined
-Db_lundef=false
@@ -621,15 +652,16 @@ test-debian-tsan:
script:
- meson setup build --buildtype debugoptimized
-Dtestdata_tests=true
+ -Denable_seek_stress=true
-Dlogging=false
-Db_sanitize=thread
-Db_lundef=false
- ninja -C build
- cd build
- exit_code=0
- - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 1" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2 --negstride" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata --test-args "--threads 2 --framedelay 1" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata --test-args "--threads 2 --framedelay 2" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata --test-args "--threads 2 --framedelay 2 --negstride" || exit_code=$((exit_code + $?))
- time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--threads 2 --framedelay 1" || exit_code=$((exit_code + $?))
- time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--threads 2 --framedelay 2" || exit_code=$((exit_code + $?))
- time meson test -v --setup=sanitizer --suite oss-fuzz-asan --suite oss-fuzz-msan --suite oss-fuzz-ubsan || exit_code=$((exit_code + $?))
@@ -680,6 +712,28 @@ test-debian-ppc64le:
- ninja -C build
- cd build && time meson test -v
+test-debian-riscv64:
+ extends:
+ - .debian-amd64-common
+ - .test-common
+ needs: ["build-debian-riscv64"]
+ script:
+ - meson setup build --buildtype release
+ -Dtestdata_tests=true
+ -Dlogging=false
+ -Dtrim_dsp=false
+ --cross-file package/crossfiles/riscv64-linux.meson
+ - ninja -C build
+ - cd build && time meson test -v --timeout-multiplier 2
+ variables:
+ QEMU_LD_PREFIX: /usr/riscv64-linux-gnu/
+ parallel:
+ matrix:
+ - QEMU_CPU: [ "rv64,v=true,vext_spec=v1.0,vlen=128,elen=64",
+ "rv64,v=true,vext_spec=v1.0,vlen=256,elen=64",
+ "rv64,v=true,vext_spec=v1.0,vlen=512,elen=64",
+ "rv64,v=true,vext_spec=v1.0,vlen=1024,elen=64" ]
+
test-debian-armv7-clang-5:
extends:
- .debian-armv7-common
@@ -696,6 +750,23 @@ test-debian-armv7-clang-5:
- ninja -C build
- cd build && time meson test -v
+test-debian-loongarch64:
+ extends:
+ - .debian-amd64-common
+ - .test-common
+ needs: ["build-debian-loongarch64"]
+ variables:
+ QEMU_CPU: max-loongarch-cpu
+ QEMU_LD_PREFIX: /opt/cross-tools/target/
+ script:
+ - meson setup build --buildtype release
+ -Dtestdata_tests=true
+ -Dlogging=false
+ -Dtrim_dsp=false
+ --cross-file package/crossfiles/loongarch64-linux.meson
+ - ninja -C build
+ - cd build && time meson test -v --timeout-multiplier 2
+
.test-argon-script: &test-argon-script
- meson setup build --buildtype release
-Dlogging=false
diff --git a/METADATA b/METADATA
index 43f6fd0..8298a7f 100644
--- a/METADATA
+++ b/METADATA
@@ -1,21 +1,20 @@
-name: "libdav1d"
-description:
- "An AV1 cross-platform decoder, open-source, and focused on speed and correctness."
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update external/libdav1d
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
+name: "libdav1d"
+description: "An AV1 cross-platform decoder, open-source, and focused on speed and correctness."
third_party {
- url {
- type: HOMEPAGE
- value: "https://code.videolan.org/videolan/dav1d/"
- }
- url {
- type: GIT
- value: "https://code.videolan.org/videolan/dav1d.git"
- }
- version: "1.3.0"
license_type: NOTICE
last_upgrade_date {
- year: 2023
- month: 10
- day: 22
+ year: 2024
+ month: 2
+ day: 14
+ }
+ homepage: "https://code.videolan.org/videolan/dav1d/"
+ identifier {
+ type: "Git"
+ value: "https://code.videolan.org/videolan/dav1d.git"
+ version: "1.4.0"
}
}
diff --git a/NEWS b/NEWS
index 54f8557..f74af58 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,17 @@
+Changes for 1.4.0 'Road Runner':
+------------------------------------------------------
+
+1.4.0 is a medium release of dav1d, focusing on new architecture support and optimizations
+
+- AVX-512 optimizations for z1, z2, z3 in 8bit and high-bitdepth
+- New architecture supported: loongarch
+- Loongarch optimizations for 8bit
+- New architecture supported: RISC-V
+- RISC-V optimizations for itx
+- Misc improvements in threading and in reducing binary size
+- Fix potential integer overflow with extremely large frame sizes
+
+
Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)':
------------------------------------------------------
diff --git a/config/arm32/config.h b/config/arm32/config.h
index 99d56c7..538dfae 100644
--- a/config/arm32/config.h
+++ b/config/arm32/config.h
@@ -9,8 +9,20 @@
#define ARCH_ARM 1
+#define ARCH_LOONGARCH 0
+
+#define ARCH_LOONGARCH32 0
+
+#define ARCH_LOONGARCH64 0
+
#define ARCH_PPC64LE 0
+#define ARCH_RISCV 0
+
+#define ARCH_RV32 0
+
+#define ARCH_RV64 0
+
#define ARCH_X86 0
#define ARCH_X86_32 0
diff --git a/config/arm64/config.h b/config/arm64/config.h
index c7b8a55..b04ba7c 100644
--- a/config/arm64/config.h
+++ b/config/arm64/config.h
@@ -9,8 +9,20 @@
#define ARCH_ARM 0
+#define ARCH_LOONGARCH 0
+
+#define ARCH_LOONGARCH32 0
+
+#define ARCH_LOONGARCH64 0
+
#define ARCH_PPC64LE 0
+#define ARCH_RISCV 0
+
+#define ARCH_RV32 0
+
+#define ARCH_RV64 0
+
#define ARCH_X86 0
#define ARCH_X86_32 0
diff --git a/config/riscv64/config.h b/config/riscv64/config.h
index fd02400..f76dcd0 100644
--- a/config/riscv64/config.h
+++ b/config/riscv64/config.h
@@ -9,13 +9,25 @@
#define ARCH_ARM 0
+#define ARCH_LOONGARCH 0
+
+#define ARCH_LOONGARCH32 0
+
+#define ARCH_LOONGARCH64 0
+
#define ARCH_PPC64LE 0
-#define ARCH_X86 1
+#define ARCH_RISCV 1
+
+#define ARCH_RV32 0
+
+#define ARCH_RV64 0
+
+#define ARCH_X86 0
#define ARCH_X86_32 0
-#define ARCH_X86_64 1
+#define ARCH_X86_64 0
#define CONFIG_16BPC 1
diff --git a/config/x86_32/config.h b/config/x86_32/config.h
index 8bddaa6..35c0070 100644
--- a/config/x86_32/config.h
+++ b/config/x86_32/config.h
@@ -9,8 +9,20 @@
#define ARCH_ARM 0
+#define ARCH_LOONGARCH 0
+
+#define ARCH_LOONGARCH32 0
+
+#define ARCH_LOONGARCH64 0
+
#define ARCH_PPC64LE 0
+#define ARCH_RISCV 0
+
+#define ARCH_RV32 0
+
+#define ARCH_RV64 0
+
#define ARCH_X86 1
#define ARCH_X86_32 1
diff --git a/config/x86_64/config.h b/config/x86_64/config.h
index fd02400..fb2e14d 100644
--- a/config/x86_64/config.h
+++ b/config/x86_64/config.h
@@ -9,8 +9,20 @@
#define ARCH_ARM 0
+#define ARCH_LOONGARCH 0
+
+#define ARCH_LOONGARCH32 0
+
+#define ARCH_LOONGARCH64 0
+
#define ARCH_PPC64LE 0
+#define ARCH_RISCV 0
+
+#define ARCH_RV32 0
+
+#define ARCH_RV64 0
+
#define ARCH_X86 1
#define ARCH_X86_32 0
diff --git a/include/common/attributes.h b/include/common/attributes.h
index 71c34f2..cd058ab 100644
--- a/include/common/attributes.h
+++ b/include/common/attributes.h
@@ -60,7 +60,7 @@
#define ALIGN_64_VAL 64
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
-#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
+#elif ARCH_AARCH64 || ARCH_ARM || ARCH_LOONGARCH || ARCH_PPC64LE || ARCH_X86_32
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_64_VAL 16
#define ALIGN_32_VAL 16
diff --git a/meson.build b/meson.build
index 2b88f3c..6e49852 100644
--- a/meson.build
+++ b/meson.build
@@ -23,7 +23,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
- version: '1.3.0',
+ version: '1.4.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
@@ -62,11 +62,13 @@ endforeach
# ASM option
is_asm_enabled = (get_option('enable_asm') == true and
- (host_machine.cpu_family() == 'x86' or
- (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or
- host_machine.cpu_family() == 'aarch64' or
+ (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
- host_machine.cpu() == 'ppc64le'))
+ host_machine.cpu() == 'ppc64le' or
+ host_machine.cpu_family().startswith('riscv') or
+ host_machine.cpu_family().startswith('loongarch') or
+ host_machine.cpu_family() == 'x86' or
+ (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '')))
cdata.set10('HAVE_ASM', is_asm_enabled)
if is_asm_enabled and get_option('b_sanitize') == 'memory'
@@ -134,7 +136,7 @@ if host_machine.system() == 'windows'
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
- rc_data.set('COPYRIGHT_YEARS', '2018-2023')
+ rc_data.set('COPYRIGHT_YEARS', '2018-2024')
else
thread_dependency = dependency('threads')
thread_compat_dep = []
@@ -232,7 +234,9 @@ endif
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
- host_machine.cpu() == 'ppc64le')
+ host_machine.cpu_family().startswith('loongarch') or
+ host_machine.cpu() == 'ppc64le' or
+ host_machine.cpu_family().startswith('riscv'))
if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_GETAUXVAL', 1)
endif
@@ -379,6 +383,14 @@ endif
cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
+cdata.set10('ARCH_RISCV', host_machine.cpu_family().startswith('riscv'))
+cdata.set10('ARCH_RV32', host_machine.cpu_family() == 'riscv32')
+cdata.set10('ARCH_RV64', host_machine.cpu_family() == 'riscv64')
+
+cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch'))
+cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32')
+cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64')
+
# meson's cc.symbols_have_underscore_prefix() is unfortunately unreliable
# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
# see following meson issue https://github.com/mesonbuild/meson/issues/5482
diff --git a/meson_options.txt b/meson_options.txt
index 91a0f6c..c04deff 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -25,6 +25,11 @@ option('enable_tests',
value: true,
description: 'Build dav1d tests')
+option('enable_seek_stress',
+ type: 'boolean',
+ value: false,
+ description: 'Build seek_stress test tool')
+
option('enable_docs',
type: 'boolean',
value: false,
diff --git a/package/crossfiles/loongarch64-linux.meson b/package/crossfiles/loongarch64-linux.meson
new file mode 100644
index 0000000..04c29ac
--- /dev/null
+++ b/package/crossfiles/loongarch64-linux.meson
@@ -0,0 +1,13 @@
+[binaries]
+c = 'loongarch64-unknown-linux-gnu-gcc'
+cpp = 'loongarch64-unknown-linux-gnu-c++'
+ar = 'loongarch64-unknown-linux-gnu-ar'
+strip = 'loongarch64-unknown-linux-gnu-strip'
+pkgconfig = 'pkg-config'
+exe_wrapper = 'qemu-loongarch64'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'loongarch64'
+cpu = 'loongarch64'
+endian = 'little'
diff --git a/package/crossfiles/riscv64-linux.meson b/package/crossfiles/riscv64-linux.meson
new file mode 100644
index 0000000..e3eda5e
--- /dev/null
+++ b/package/crossfiles/riscv64-linux.meson
@@ -0,0 +1,12 @@
+[binaries]
+c = 'riscv64-linux-gnu-gcc'
+cpp = 'riscv64-linux-gnu-g++'
+ar = 'riscv64-linux-gnu-ar'
+strip = 'riscv64-linux-gnu-strip'
+exe_wrapper = 'qemu-riscv64'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'riscv64'
+cpu = 'riscv64'
+endian = 'little'
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index b1b2f8f..53490cd 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -1426,6 +1426,7 @@ endfunc
function inv_txfm_add_16x16_neon
mov x15, x30
sub sp, sp, #512
+ mov x8, #16*2
.irp i, 0, 8
add x6, sp, #(\i*16*2)
.if \i == 8
@@ -1433,7 +1434,6 @@ function inv_txfm_add_16x16_neon
b.lt 1f
.endif
add x7, x2, #(\i*2)
- mov x8, #16*2
blr x9
.endr
b 2f
@@ -1449,7 +1449,6 @@ function inv_txfm_add_16x16_neon
.irp i, 0, 8
add x6, x0, #(\i)
add x7, sp, #(\i*2)
- mov x8, #32
bl inv_txfm_add_vert_8x16_neon
.endr
@@ -2461,10 +2460,10 @@ function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
b.gt 2b
3:
+ mov x8, #32*2
.irp i, 0, 8, 16, 24
add x6, x0, #(\i)
add x7, sp, #(\i*2)
- mov x8, #32*2
bl inv_txfm_add_vert_8x16_neon
.endr
@@ -3205,10 +3204,10 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
3:
adr x5, inv_dct_8h_x16_neon
+ mov x8, #64*2
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
add x6, x0, #(\i)
add x7, x4, #(\i*2)
- mov x8, #64*2
bl inv_txfm_add_vert_8x16_neon
.endr
diff --git a/src/cpu.c b/src/cpu.c
index d24148c..9bb85f1 100644
--- a/src/cpu.c
+++ b/src/cpu.c
@@ -56,8 +56,12 @@ COLD void dav1d_init_cpu(void) {
// memory sanitizer is inherently incompatible with asm
#if ARCH_AARCH64 || ARCH_ARM
dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
+#elif ARCH_LOONGARCH
+ dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch();
#elif ARCH_PPC64LE
dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
+#elif ARCH_RISCV
+ dav1d_cpu_flags = dav1d_get_cpu_flags_riscv();
#elif ARCH_X86
dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
#endif
diff --git a/src/cpu.h b/src/cpu.h
index 8f70fef..c9009c7 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -37,8 +37,12 @@
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/cpu.h"
+#elif ARCH_LOONGARCH
+#include "src/loongarch/cpu.h"
#elif ARCH_PPC64LE
#include "src/ppc/cpu.h"
+#elif ARCH_RISCV
+#include "src/riscv/cpu.h"
#elif ARCH_X86
#include "src/x86/cpu.h"
#endif
@@ -64,6 +68,10 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
#if defined(__VSX__)
flags |= DAV1D_PPC_CPU_FLAG_VSX;
#endif
+#elif ARCH_RISCV
+#if defined(__riscv_v)
+ flags |= DAV1D_RISCV_CPU_FLAG_V;
+#endif
#elif ARCH_X86
#if defined(__AVX512F__) && defined(__AVX512CD__) && \
defined(__AVX512BW__) && defined(__AVX512DQ__) && \
diff --git a/src/decode.c b/src/decode.c
index 94ef17c..eed9dfb 100644
--- a/src/decode.c
+++ b/src/decode.c
@@ -2470,7 +2470,7 @@ static void setup_tile(Dav1dTileState *const ts,
const Dav1dFrameContext *const f,
const uint8_t *const data, const size_t sz,
const int tile_row, const int tile_col,
- const int tile_start_off)
+ const unsigned tile_start_off)
{
const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
@@ -2616,6 +2616,25 @@ static void read_restoration_info(Dav1dTaskContext *const t,
}
}
+// modeled after the equivalent function in aomdec:decodeframe.c
+static int check_trailing_bits_after_symbol_coder(const MsacContext *const msac) {
+ // check marker bit (single 1), followed by zeroes
+ const int n_bits = -(msac->cnt + 14);
+ assert(n_bits <= 0); // this assumes we errored out when cnt <= -15 in caller
+ const int n_bytes = (n_bits + 7) >> 3;
+ const uint8_t *p = &msac->buf_pos[n_bytes];
+ const int pattern = 128 >> ((n_bits - 1) & 7);
+ if ((p[-1] & (2 * pattern - 1)) != pattern)
+ return 1;
+
+ // check remainder zero bytes
+ for (; p < msac->buf_end; p++)
+ if (*p)
+ return 1;
+
+ return 0;
+}
+
int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
const Dav1dFrameContext *const f = t->f;
const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
@@ -2659,9 +2678,6 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
return 0;
}
- // error out on symbol decoder overread
- if (ts->msac.cnt < -15) return 1;
-
if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row,
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
@@ -2767,7 +2783,12 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
&t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
- return 0;
+ // error out on symbol decoder overread
+ if (ts->msac.cnt <= -15) return 1;
+
+ return c->strict_std_compliance &&
+ (t->by >> f->sb_shift) + 1 >= f->frame_hdr->tiling.row_start_sb[tile_row + 1] &&
+ check_trailing_bits_after_symbol_coder(&ts->msac);
}
int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
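
The check_trailing_bits_after_symbol_coder() helper added above encodes AV1's trailing-bits rule: once the symbol decoder has consumed its last symbol, the remainder of the tile data must be a single marker 1 bit followed only by 0 bits. As a minimal standalone sketch of that rule (hypothetical names and forward bit addressing, not dav1d's msac->cnt bookkeeping):

#include <stddef.h>
#include <stdint.h>

/* Sketch only: returns 1 if, starting at MSB-first bit position `bitpos`
 * in `buf`, the data is a single 1 bit followed exclusively by 0 bits. */
static int trailing_bits_ok(const uint8_t *buf, size_t size, size_t bitpos) {
    const size_t byte = bitpos >> 3;
    const int bit = 7 - (int)(bitpos & 7);      /* bit index within the byte */
    if (byte >= size) return 0;
    if (!((buf[byte] >> bit) & 1)) return 0;    /* marker bit must be 1 */
    if (buf[byte] & ((1 << bit) - 1)) return 0; /* rest of this byte must be 0 */
    for (size_t i = byte + 1; i < size; i++)    /* all later bytes must be 0 */
        if (buf[i]) return 0;
    return 1;
}

The real helper works backwards from the decoder's bit-count instead, and is only invoked for the last superblock row of a tile when strict_std_compliance is enabled, as the hunk above shows.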
@@ -2822,15 +2843,16 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
const int hbd = !!f->seq_hdr->hbd;
if (c->n_fc > 1) {
+ const unsigned sb_step4 = f->sb_step * 4;
int tile_idx = 0;
for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
- int row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
- f->sb_step * 4 * f->sb128w * 128;
- int b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
- f->frame_hdr->tiling.row_start_sb[tile_row]) * f->sb_step * 4;
+ const unsigned row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
+ sb_step4 * f->sb128w * 128;
+ const unsigned b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+ f->frame_hdr->tiling.row_start_sb[tile_row]) * sb_step4;
for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
- f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4;
+ f->frame_hdr->tiling.col_start_sb[tile_col] * sb_step4;
}
}
@@ -3261,7 +3283,7 @@ error:
return retval;
}
-void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) {
+void dav1d_decode_frame_exit(Dav1dFrameContext *const f, int retval) {
const Dav1dContext *const c = f->c;
if (f->sr_cur.p.data[0])
@@ -3272,8 +3294,16 @@ void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) {
(size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
}
for (int i = 0; i < 7; i++) {
- if (f->refp[i].p.frame_hdr)
+ if (f->refp[i].p.frame_hdr) {
+ if (!retval && c->n_fc > 1 && c->strict_std_compliance &&
+ atomic_load(&f->refp[i].progress[1]) == FRAME_ERROR)
+ {
+ retval = DAV1D_ERR(EINVAL);
+ atomic_store(&f->task_thread.error, 1);
+ atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
+ }
dav1d_thread_picture_unref(&f->refp[i]);
+ }
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
@@ -3327,6 +3357,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
}
dav1d_decode_frame_exit(f, res);
+ res = f->task_thread.retval;
f->n_tile_data = 0;
return res;
}
diff --git a/src/internal.h b/src/internal.h
index 631c5a8..72f6560 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -289,7 +289,7 @@ struct Dav1dFrameContext {
int prog_sz;
int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
// start offsets per tile
- int *tile_start_off;
+ unsigned *tile_start_off;
} frame_thread;
// loopfilter
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
index d385989..8ff245a 100644
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -183,6 +183,10 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/itx.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/itx.h"
+#elif ARCH_RISCV
+#include "src/riscv/itx.h"
#elif ARCH_X86
#include "src/x86/itx.h"
#endif
@@ -257,6 +261,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if ARCH_AARCH64 || ARCH_ARM
itx_dsp_init_arm(c, bpc);
#endif
+#if ARCH_LOONGARCH64
+ itx_dsp_init_loongarch(c, bpc);
+#endif
+#if ARCH_RISCV
+ itx_dsp_init_riscv(c, bpc);
+#endif
#if ARCH_X86
itx_dsp_init_x86(c, bpc);
#endif
diff --git a/src/loongarch/cpu.c b/src/loongarch/cpu.c
new file mode 100644
index 0000000..a79ade5
--- /dev/null
+++ b/src/loongarch/cpu.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "common/attributes.h"
+#include "src/loongarch/cpu.h"
+
+#if defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX ( 1 << 4 )
+#define LA_HWCAP_LASX ( 1 << 5 )
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
+ unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL)
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
+ flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
+#endif
+
+ return flags;
+}
diff --git a/src/loongarch/cpu.h b/src/loongarch/cpu.h
new file mode 100644
index 0000000..d00ff67
--- /dev/null
+++ b/src/loongarch/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_CPU_H
+#define DAV1D_SRC_LOONGARCH_CPU_H
+
+enum CpuFlags {
+ DAV1D_LOONGARCH_CPU_FLAG_LSX = 1 << 0,
+ DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
+};
+
+unsigned dav1d_get_cpu_flags_loongarch(void);
+
+#endif /* DAV1D_SRC_LOONGARCH_CPU_H */
diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S
new file mode 100644
index 0000000..fc0c79e
--- /dev/null
+++ b/src/loongarch/itx.S
@@ -0,0 +1,8104 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+ coef *const coeff, const int eob
+ HIGHBD_DECL_SUFFIX)
+*/
+function inv_txfm_add_wht_wht_4x4_8bpc_lsx
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+
+ vreplgr2vr.h vr20, zero
+
+ vsrai.h vr0, vr0, 2
+ vsrai.h vr2, vr2, 2
+
+ vst vr20, a2, 0
+
+ vpickod.d vr1, vr0, vr0
+ vpickod.d vr3, vr2, vr2
+
+ vadd.h vr4, vr0, vr1
+ vsub.h vr5, vr2, vr3
+ vsub.h vr6, vr4, vr5
+ vsrai.h vr6, vr6, 1
+ vsub.h vr0, vr6, vr3
+ vsub.h vr2, vr6, vr1
+ vsub.h vr1, vr4, vr0
+ vadd.h vr3, vr5, vr2
+
+ vst vr20, a2, 16
+
+ vilvl.h vr4, vr0, vr1
+ vilvl.h vr5, vr3, vr2
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr2, vr5, vr4
+ vilvh.d vr1, vr0, vr0
+ vilvh.d vr3, vr2, vr2
+
+ vadd.h vr4, vr0, vr1
+ vsub.h vr5, vr2, vr3
+ vsub.h vr6, vr4, vr5
+ vsrai.h vr6, vr6, 1
+ vsub.h vr0, vr6, vr3
+ vsub.h vr2, vr6, vr1
+ vsub.h vr1, vr4, vr0
+ vadd.h vr3, vr5, vr2
+
+ vld vr4, a0, 0
+ vldx vr5, a0, a1
+ alsl.d t0, a1, a0, 1
+ vld vr6, t0, 0
+ vldx vr7, t0, a1
+
+ vsllwil.hu.bu vr4, vr4, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.hu.bu vr6, vr6, 0
+ vsllwil.hu.bu vr7, vr7, 0
+ vilvl.d vr1, vr0, vr1
+ vilvl.d vr2, vr3, vr2
+ vilvl.d vr4, vr5, vr4
+ vilvl.d vr6, vr7, vr6
+ vadd.h vr1, vr1, vr4
+ vadd.h vr2, vr2, vr6
+ vssrani.bu.h vr2, vr1, 0
+
+ vstelm.w vr2, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr2, a0, 0, 1
+ add.d a0, a0, a1
+ vstelm.w vr2, a0, 0, 2
+ add.d a0, a0, a1
+ vstelm.w vr2, a0, 0, 3
+endfunc
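
For readability, the butterfly that the vector code above applies twice (columns, then rows after the transpose) corresponds to the following scalar model of one 4-point inverse Walsh-Hadamard pass, reconstructed from the register comments (inputs are the coefficients already shifted right by 2):

/* Scalar model of one inverse WHT pass as performed by the LSX code above. */
static void iwht4_1d_model(int c[4]) {
    const int in0 = c[0], in1 = c[1], in2 = c[2], in3 = c[3];
    const int t0 = in0 + in1;          /* vadd.h  vr4 */
    const int t2 = in2 - in3;          /* vsub.h  vr5 */
    const int t4 = (t0 - t2) >> 1;     /* vsub.h + vsrai.h vr6 */
    const int t3 = t4 - in3;           /* vsub.h  vr0 */
    const int t1 = t4 - in1;           /* vsub.h  vr2 */
    c[0] = t0 - t3;                    /* vsub.h  vr1 */
    c[1] = t3;
    c[2] = t1;
    c[3] = t2 + t1;                    /* vadd.h  vr3 */
}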
+
+const idct_coeffs, align=4
+ // idct4
+ .word 2896, 2896*8, 1567, 3784
+ // idct8
+ .word 799, 4017, 3406, 2276
+ // idct16
+ .word 401, 4076, 3166, 2598
+ .word 1931, 3612, 3920, 1189
+ // idct32
+ .word 201, 4091, 3035, 2751
+ .word 1751, 3703, 3857, 1380
+ .word 995, 3973, 3513, 2106
+ .word 2440, 3290, 4052, 601
+endconst
+
+.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
+ vld \in0, \src, \start
+ vld \in1, \src, \start+(\stride*1)
+ vld \in2, \src, \start+(\stride*2)
+ vld \in3, \src, \start+(\stride*3)
+ vld \in4, \src, \start+(\stride*4)
+ vld \in5, \src, \start+(\stride*5)
+ vld \in6, \src, \start+(\stride*6)
+ vld \in7, \src, \start+(\stride*7)
+.endm
+
+.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
+ vst \in0, \src, \start
+ vst \in1, \src, \start+(\stride*1)
+ vst \in2, \src, \start+(\stride*2)
+ vst \in3, \src, \start+(\stride*3)
+ vst \in4, \src, \start+(\stride*4)
+ vst \in5, \src, \start+(\stride*5)
+ vst \in6, \src, \start+(\stride*6)
+ vst \in7, \src, \start+(\stride*7)
+.endm
+
+.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15
+
+ vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+
+ vld \in8, \src, \start+(\stride*8)
+ vld \in9, \src, \start+(\stride*9)
+ vld \in10, \src, \start+(\stride*10)
+ vld \in11, \src, \start+(\stride*11)
+ vld \in12, \src, \start+(\stride*12)
+ vld \in13, \src, \start+(\stride*13)
+ vld \in14, \src, \start+(\stride*14)
+ vld \in15, \src, \start+(\stride*15)
+.endm
+
+.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15
+
+ vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+
+ vst \in8, \src, \start+(\stride*8)
+ vst \in9, \src, \start+(\stride*9)
+ vst \in10, \src, \start+(\stride*10)
+ vst \in11, \src, \start+(\stride*11)
+ vst \in12, \src, \start+(\stride*12)
+ vst \in13, \src, \start+(\stride*13)
+ vst \in14, \src, \start+(\stride*14)
+ vst \in15, \src, \start+(\stride*15)
+.endm
+
+.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
+ vilvl.w vr10, \in1, \in0 // 0 1 2 3 4 5 6 7 x ...
+ vilvl.w vr12, \in3, \in2 // 8 9 10 11 12 13 14 15 x ...
+ vsllwil.hu.bu vr10, vr10, 0
+ vsllwil.hu.bu vr12, vr12, 0
+ vadd.h vr10, \in4, vr10
+ vadd.h vr12, \in5, vr12
+ vssrani.bu.h vr12, vr10, 0
+ vstelm.w vr12, a0, 0, 0
+ add.d t8, a0, a1
+ vstelm.w vr12, t8, 0, 1
+ vstelm.w vr12, t2, 0, 2
+ add.d t8, t2, a1
+ vstelm.w vr12, t8, 0, 3
+.endm
+
+.macro VLD_DST_ADD_W4 in0, in1
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vld vr2, t2, 0
+ vldx vr3, t2, a1
+
+ DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1
+.endm
+
+.macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1
+ vexth.w.h vr4, \in0 // in1
+ vexth.w.h vr5, \in1 // in3
+ vmul.w vr6, vr4, \in4
+ vmul.w vr7, vr4, \in5
+ vmadd.w vr6, vr5, \in5 // t3
+ vmsub.w vr7, vr5, \in4 // t2
+ vsllwil.w.h vr4, \in2, 0 // in0
+ vsllwil.w.h vr5, \in3, 0 // in2
+ vmul.w vr9, vr4, \in6
+ vmul.w vr10, vr4, \in7
+ vmadd.w vr9, vr5, \in7 // t0
+ vmsub.w vr10, vr5, \in6 // t1
+ vssrarni.h.w vr10, vr9, 12 // t0 t1
+ vssrarni.h.w vr7, vr6, 12 // t3 t2
+ vsadd.h \out0, vr10, vr7 // 0 4 8 12 1 5 9 13 c[0] c[1]
+ vssub.h \out1, vr10, vr7 // 3 7 11 15 2 6 10 14 c[3] c[2]
+.endm
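
To make the data flow easier to follow, here is a scalar model of one dct_4x4_core_lsx pass, reconstructed from the register comments above; the vector code additionally saturates to 16 bits (vssrarni/vsadd/vssub), which is omitted here:

/* Scalar model of the inverse 4-point DCT pass above. Coefficients come from
 * idct_coeffs: 2896 ~ 4096/sqrt(2), 1567 ~ 4096*sin(pi/8), 3784 ~ 4096*cos(pi/8). */
static void idct4_1d_model(int c[4]) {
    const int in0 = c[0], in1 = c[1], in2 = c[2], in3 = c[3];
    const int t0 = ((in0 + in2) * 2896 + 2048) >> 12;
    const int t1 = ((in0 - in2) * 2896 + 2048) >> 12;
    const int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
    const int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
    c[0] = t0 + t3;
    c[1] = t1 + t2;
    c[2] = t1 - t2;
    c[3] = t0 - t3;
}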
+
+.macro inv_dct_dct_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ vldrepl.w vr2, t0, 8 // 1567
+ vldrepl.w vr3, t0, 12 // 3784
+ vldrepl.w vr8, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12
+
+ vreplgr2vr.h vr15, zero
+ vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+ vshuf4i.d vr14, vr14, 0x01
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro identity_4x4_lsx in0, in1, in2, in3, out0
+ vsllwil.w.h vr2, \in0, 0
+ vexth.w.h vr3, \in1
+ vmul.w vr4, vr2, \in2
+ vmul.w vr5, vr3, \in2
+ vssrarni.h.w vr5, vr4, 12
+ vsadd.h \out0, vr5, \in3
+.endm
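
The 4-point identity "transform" is simply a scale by sqrt(2); a scalar model of the macro above (saturation omitted) is:

/* Scalar model of identity_4x4_lsx: out = in * sqrt(2), computed as
 * in + round(in * 1697 / 4096), since 1697/4096 ~ sqrt(2) - 1. */
static int iidentity4_model(const int in) {
    return in + ((in * 1697 + 2048) >> 12);
}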
+
+.macro inv_identity_identity_4x4_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr6
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr7
+
+ vsrari.h vr6, vr6, 4
+ vsrari.h vr7, vr7, 4
+ vilvh.d vr8, vr6, vr6
+ vilvh.d vr9, vr7, vr7
+ vilvl.h vr4, vr8, vr6
+ vilvl.h vr5, vr9, vr7
+ vilvl.w vr6, vr5, vr4
+ vilvh.w vr7, vr5, vr4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr6, vr7
+.endm
+
+const iadst4_coeffs, align=4
+ .word 1321, 3803, 2482, 3344
+endconst
+
+.macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3
+ vsub.w vr6, \in0, \in2 // in0-in2
+ vmul.w vr7, \in0, vr20 // in0*1321
+ vmadd.w vr7, \in2, vr21 // in0*1321+in2*3803
+ vmadd.w vr7, \in3, vr22 // in0*1321+in2*3803+in3*2482
+ vmul.w vr8, \in1, vr23 // in1*3344
+ vadd.w vr6, vr6, \in3 // in0-in2+in3
+ vmul.w vr9, \in0, vr22 // in0*2482
+ vmsub.w vr9, \in2, vr20 // in2*1321
+ vmsub.w vr9, \in3, vr21 // in0*2482-in2*1321-in3*3803
+ vadd.w vr5, vr7, vr9
+ vmul.w \out2, vr6, vr23 // out[2] 8 9 10 11
+ vadd.w \out0, vr7, vr8 // out[0] 0 1 2 3
+ vadd.w \out1, vr9, vr8 // out[1] 4 5 6 7
+ vsub.w \out3, vr5, vr8 // out[3] 12 13 14 15
+.endm
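
For reference, a scalar model of adst4x4_1d_lsx reconstructed from the comments above; the callers apply the rounding shift by 12 (vssrarni/vsrari) afterwards, so it is not part of this model:

/* Scalar model of the 4-point inverse ADST pass above (pre-shift values).
 * Coefficients come from iadst4_coeffs: 1321, 3803, 2482, 3344. */
static void iadst4_1d_model(const int in0, const int in1, const int in2,
                            const int in3, int out[4]) {
    const int a = in0 * 1321 + in2 * 3803 + in3 * 2482;
    const int b = in1 * 3344;
    const int c = in0 * 2482 - in2 * 1321 - in3 * 3803;
    out[0] = a + b;
    out[1] = c + b;
    out[2] = (in0 - in2 + in3) * 3344;
    out[3] = a + c - b;
}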
+
+.macro inv_adst_dct_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+
+ vreplgr2vr.h vr15, zero
+ la.local t0, idct_coeffs
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14
+
+ vshuf4i.d vr14, vr14, 0x01
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_adst_adst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+
+ vsrari.w vr11, vr11, 12
+ vsrari.w vr13, vr13, 12
+ vsrari.w vr12, vr12, 12
+ vsrari.w vr14, vr14, 12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_dct_adst_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr12, 0 // in2
+ vexth.w.h vr5, vr12 // in3
+
+ la.local t0, iadst4_coeffs
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_dct_flipadst_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr12, 0 // in2
+ vexth.w.h vr5, vr12 // in3
+
+ la.local t0, iadst4_coeffs
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14
+
+ vssrarni.h.w vr11, vr12, 12 // 0 1 2 3 4 5 6 7
+ vssrarni.h.w vr13, vr14, 12 // 8 9 10 11 12 13 14 15
+ vsrari.h vr11, vr11, 4
+ vsrari.h vr13, vr13, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_flipadst_adst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vsrari.w vr0, vr0, 12
+ vsrari.w vr1, vr1, 12
+ vsrari.w vr2, vr2, 12
+ vsrari.w vr3, vr3, 12
+
+ vilvl.w vr4, vr0, vr1
+ vilvh.w vr5, vr0, vr1
+ vilvl.w vr6, vr2, vr3
+ vilvh.w vr7, vr2, vr3
+ vilvl.d vr11, vr4, vr6
+ vilvh.d vr12, vr4, vr6
+ vilvl.d vr13, vr5, vr7
+ vilvh.d vr14, vr5, vr7
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_adst_flipadst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+ vsrari.w vr11, vr11, 12
+ vsrari.w vr12, vr12, 12
+ vsrari.w vr13, vr13, 12
+ vsrari.w vr14, vr14, 12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14
+
+ vssrarni.h.w vr11, vr12, 12
+ vssrarni.h.w vr13, vr14, 12
+ vsrari.h vr11, vr11, 4
+ vsrari.h vr13, vr13, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_flipadst_dct_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vilvl.w vr4, vr0, vr1
+ vilvh.w vr5, vr0, vr1
+ vilvl.w vr6, vr2, vr3
+ vilvh.w vr7, vr2, vr3
+
+ vilvl.d vr11, vr4, vr6
+ vilvh.d vr12, vr4, vr6
+ vilvl.d vr13, vr5, vr7
+ vilvh.d vr14, vr5, vr7
+
+ vssrarni.h.w vr12, vr11, 12
+ vssrarni.h.w vr14, vr13, 12
+
+ vreplgr2vr.h vr15, zero
+ la.local t0, idct_coeffs
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14
+
+ vshuf4i.d vr14, vr14, 0x01
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_flipadst_flipadst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vilvl.w vr4, vr0, vr1
+ vilvh.w vr5, vr0, vr1
+ vilvl.w vr6, vr2, vr3
+ vilvh.w vr7, vr2, vr3
+ vilvl.d vr11, vr4, vr6
+ vilvh.d vr12, vr4, vr6
+ vilvl.d vr13, vr5, vr7
+ vilvh.d vr14, vr5, vr7
+
+ vsrari.w vr11, vr11, 12
+ vsrari.w vr12, vr12, 12
+ vsrari.w vr13, vr13, 12
+ vsrari.w vr14, vr14, 12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14
+
+ vssrarni.h.w vr11, vr12, 12
+ vssrarni.h.w vr13, vr14, 12
+ vsrari.h vr11, vr11, 4
+ vsrari.h vr13, vr13, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_dct_identity_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ vldrepl.w vr2, t0, 8 // 1567
+ vldrepl.w vr3, t0, 12 // 3784
+ vldrepl.w vr8, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12
+ vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15
+
+ vreplgr2vr.h vr15, zero
+ li.w t0, 1697
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr10, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr10, vr10, vr20, vr10, vr6
+ identity_4x4_lsx vr12, vr12, vr20, vr12, vr7
+ vsrari.h vr11, vr6, 4
+ vsrari.h vr13, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro inv_identity_dct_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+
+ vreplgr2vr.h vr15, zero
+
+ vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
+ vilvl.h vr13, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr14, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14
+
+ vshuf4i.d vr14, vr14, 0x01
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_flipadst_identity_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13
+
+ vssrarni.h.w vr12, vr13, 12
+ vssrarni.h.w vr10, vr11, 12
+
+ vilvl.h vr4, vr10, vr12 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr10, vr12 // 1 3 5 7 9 11 13 15
+ vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr13, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vreplgr2vr.h vr15, zero
+ li.w t0, 1697
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr11, vr11, vr20, vr11, vr6
+ identity_4x4_lsx vr13, vr13, vr20, vr13, vr7
+ vsrari.h vr11, vr6, 4
+ vsrari.h vr13, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro inv_identity_flipadst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr11, vr5, vr4
+ vilvh.h vr13, vr5, vr4
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr13, 0 // in2
+ vexth.w.h vr5, vr13 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vssrarni.h.w vr0, vr1, 12 // 8 9 10 11 12 13 14 15
+ vssrarni.h.w vr2, vr3, 12 // 0 1 2 3 4 5 6 7
+ vsrari.h vr11, vr0, 4
+ vsrari.h vr13, vr2, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_identity_adst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr11, vr5, vr4
+ vilvh.h vr13, vr5, vr4
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr13, 0 // in2
+ vexth.w.h vr5, vr13 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vssrarni.h.w vr1, vr0, 12
+ vssrarni.h.w vr3, vr2, 12
+ vsrari.h vr11, vr1, 4
+ vsrari.h vr13, vr3, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro inv_adst_identity_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+
+ vreplgr2vr.h vr15, zero
+ li.w t0, 1697
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr13, vr13, vr20, vr13, vr6
+ identity_4x4_lsx vr14, vr14, vr20, vr14, vr7
+ vsrari.h vr11, vr6, 4
+ vsrari.h vr13, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro fun4x4 type1, type2
+function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx
+.ifc \type1\()_\type2, dct_dct
+ bnez a3, .LLL
+
+ vldi vr0, 0x8b5 // 181
+ ld.h t2, a2, 0 // dc
+ st.h zero, a2, 0
+ vreplgr2vr.w vr1, t2
+ vldi vr3, 0x880 // 128
+ vmul.w vr2, vr0, vr1
+ vld vr10, a0, 0
+ vsrari.w vr2, vr2, 8
+ vldx vr11, a0, a1
+ vmadd.w vr3, vr2, vr0
+ alsl.d t2, a1, a0, 1
+ vssrarni.h.w vr3, vr3, 12
+ vld vr12, t2, 0
+ vldx vr13, t2, a1
+
+ DST_ADD_W4 vr10, vr11, vr12, vr13, vr3, vr3
+
+ b .IDST_\type1\()_\type2\()_4X4_END
+.LLL:
+.endif
+
+ inv_\type1\()_\type2\()_4x4_lsx
+.IDST_\type1\()_\type2\()_4X4_END:
+endfunc
+.endm
+
+fun4x4 dct, dct
+fun4x4 identity, identity
+fun4x4 adst, dct
+fun4x4 dct, adst
+fun4x4 adst, adst
+fun4x4 dct, flipadst
+fun4x4 flipadst, adst
+fun4x4 adst, flipadst
+fun4x4 flipadst, dct
+fun4x4 flipadst, flipadst
+fun4x4 dct, identity
+fun4x4 identity, dct
+fun4x4 flipadst, identity
+fun4x4 identity, flipadst
+fun4x4 identity, adst
+fun4x4 adst, identity
+
+function inv_txfm_add_dct_dct_4x8_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_4x8
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8
+ vld vr10, a0, 0
+ vmul.w vr2, vr2, vr0
+ vldx vr11, a0, a1
+ vsrari.w vr2, vr2, 8
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1
+
+ DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+
+ VLD_DST_ADD_W4 vr5, vr5
+ b .DCT_DCT_4x8_END
+
+.NO_HAS_DCONLY_4x8:
+ // sh=8 sw=4
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr20, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr21, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ vldrepl.w vr2, t0, 8 // 1567
+ vldrepl.w vr3, t0, 12 // 3784
+ vldrepl.w vr8, t0, 0 // 2896
+
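+ // First pass: 4-point inverse DCT on in0..in3 with rect2 scaling (* 2896 >> 12),
+ // vectorized over 8 lanes (low/high halves), narrowed back to 16 bit.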
+.macro DCT4_4Wx8H_1D_LSX
+ // in1 in3
+ vsllwil.w.h vr4, vr1, 0 // in1
+ vsllwil.w.h vr5, vr21, 0 // in3
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr6, vr4, vr3
+ vmul.w vr7, vr4, vr2
+ vmadd.w vr6, vr5, vr2 // t3 0 1 2 3
+ vmsub.w vr7, vr5, vr3 // t2 0 1 2 3
+ vexth.w.h vr4, vr1 // in1
+ vexth.w.h vr5, vr21 // in3
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr9, vr4, vr3
+ vmul.w vr10, vr4, vr2
+ vmadd.w vr9, vr5, vr2 // t3 4 5 6 7
+ vmsub.w vr10, vr5, vr3 // t2 4 5 6 7
+
+ // in0 in2
+ vsllwil.w.h vr4, vr0, 0 // in0
+ vsllwil.w.h vr5, vr20, 0 // in2
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr11, vr4, vr8
+ vmul.w vr12, vr4, vr8
+ vmadd.w vr11, vr5, vr8 // t0 0 1 2 3
+ vmsub.w vr12, vr5, vr8 // t1 0 1 2 3
+ vexth.w.h vr4, vr0 // in0
+ vexth.w.h vr5, vr20 // in2
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr13, vr4, vr8
+ vmul.w vr14, vr4, vr8
+ vmadd.w vr13, vr5, vr8 // t0 4 5 6 7
+ vmsub.w vr14, vr5, vr8 // t1 4 5 6 7
+ vssrarni.h.w vr9, vr6, 12 // t3
+ vssrarni.h.w vr10, vr7, 12 // t2
+ vssrarni.h.w vr14, vr12, 12 // t1
+ vssrarni.h.w vr13, vr11, 12 // t0
+ vsadd.h vr4, vr13, vr9 // c[0] 0 4 8 12 16 20 24 28
+ vsadd.h vr5, vr14, vr10 // c[1] 1 5 9 13 17 21 25 29
+ vssub.h vr20, vr14, vr10 // c[2] 2 6 10 14 18 22 26 30
+ vssub.h vr21, vr13, vr9 // c[3] 3 7 11 15 19 23 27 31
+.endm
+
+ DCT4_4Wx8H_1D_LSX
+
+ vreplgr2vr.h vr22, zero
+ vst vr22, a2, 0
+ vst vr22, a2, 16
+ vst vr22, a2, 32
+ vst vr22, a2, 48
+
+ vilvl.h vr0, vr5, vr4 // 0 1 4 5 8 9 12 13
+ vilvl.h vr1, vr21, vr20 // 2 3 6 7 10 11 14 15
+ vilvh.h vr6, vr5, vr4 // 16 17 20 21 24 25 28 29
+ vilvh.h vr7, vr21, vr20 // 18 19 22 23 26 27 30 31
+ vilvl.w vr9, vr1, vr0 // 0 1 2 3 4 5 6 7 in0
+ vilvh.w vr10, vr1, vr0 // 8 9 10 11 12 13 14 15 in1
+ vilvl.w vr11, vr7, vr6 // 16 17 18 19 20 21 22 23 in2
+ vilvh.w vr12, vr7, vr6 // 24 25 26 27 28 29 30 31 in3
+
+ vilvl.d vr0, vr10, vr9
+ vilvl.d vr1, vr12, vr11
+ vilvh.d vr20, vr9, vr11 // in5 in1
+ vilvh.d vr21, vr12, vr10 // in3 in7
+
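+ // Second pass: 8-point inverse DCT; even half via dct_4x4_core_lsx, odd half with
+ // the 799/4017 and 3406/2276 rotations and a final 181 >> 8 step for t5/t6.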
+.macro DCT8_4Wx8H_1D_LSX
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14
+
+ vldrepl.w vr17, t0, 16 // 799
+ vldrepl.w vr18, t0, 20 // 4017
+ vldrepl.w vr11, t0, 24 // 3406
+ vldrepl.w vr12, t0, 28 // 2276
+
+ vexth.w.h vr4, vr20
+ vexth.w.h vr5, vr21
+ vmul.w vr6, vr4, vr18 // in1 * 4017
+ vmul.w vr7, vr4, vr17 // in1 * 799
+ vmadd.w vr6, vr5, vr17 // in7 * 799
+ vmsub.w vr7, vr5, vr18 // in7 * 4017
+ vsllwil.w.h vr4, vr20, 0
+ vsllwil.w.h vr5, vr21, 0
+ vmul.w vr9, vr4, vr12
+ vmul.w vr10, vr4, vr11
+ vmadd.w vr9, vr5, vr11
+ vmsub.w vr10, vr5, vr12
+ vssrarni.h.w vr10, vr9, 12 // t6a t5a
+ vssrarni.h.w vr7, vr6, 12 // t7a t4a
+ vsadd.h vr15, vr7, vr10 // t7 t4
+ vssub.h vr16, vr7, vr10 // t6a t5a
+
+ vexth.w.h vr4, vr16 // t5a
+ vsllwil.w.h vr5, vr16, 0 // t6a
+ vldi vr2, 0x8b5 // 181
+ vsub.w vr6, vr5, vr4
+ vadd.w vr7, vr5, vr4
+ vmul.w vr6, vr6, vr2
+ vmul.w vr7, vr7, vr2
+ vssrarni.h.w vr7, vr6, 8 // t5 t6
+ vaddi.hu vr18, vr7, 0
+ vshuf4i.d vr7, vr15, 0x06 // t7 t6
+ vshuf4i.d vr15, vr18, 0x09 // t4 t5
+
+ // vr17 -> vr7 vr18 -> vr15
+ vsadd.h vr4, vr13, vr7
+ vsadd.h vr5, vr14, vr15
+ vssub.h vr6, vr14, vr15
+ vssub.h vr7, vr13, vr7
+.endm
+
+ DCT8_4Wx8H_1D_LSX
+
+ vshuf4i.d vr5, vr5, 0x01
+ vshuf4i.d vr7, vr7, 0x01
+
+ vsrari.h vr4, vr4, 4
+ vsrari.h vr5, vr5, 4
+ vsrari.h vr6, vr6, 4
+ vsrari.h vr7, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W4 vr4, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+
+ VLD_DST_ADD_W4 vr6, vr7
+.DCT_DCT_4x8_END:
+endfunc
+
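+ // Widen the low half of \in0 and the high half of \in1 to 32 bit and apply the
+ // rect2 scale: multiply by \in2 (2896) with a rounding shift of 12.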
+.macro rect2_w4_lsx in0, in1, in2, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in1
+ vmul.w vr22, vr22, \in2
+ vmul.w vr23, vr23, \in2
+ vsrari.w \out0, vr22, 12
+ vsrari.w \out1, vr23, 12
+.endm
+
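+ // 1-D 8-point inverse DCT on 32-bit inputs (vr18, vr19, vr6..vr11 = in0..in7);
+ // outputs are packed 16-bit pairs: \out0 = c[0]c[1], \out1 = c[3]c[2],
+ // \out2 = c[4]c[5], \out3 = c[7]c[6].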
+.macro dct_8x4_core_lsx1 out0, out1, out2, out3
+ // dct4 stride=1<<1
+ vmul.w vr0, vr6, vr21
+ vmul.w vr1, vr6, vr20
+ vmadd.w vr0, vr10, vr20 // t3
+ vmsub.w vr1, vr10, vr21 // t2
+ vmul.w vr2, vr18, vr22
+ vmul.w vr3, vr18, vr22
+ vmadd.w vr2, vr8, vr22 // t0
+ vmsub.w vr3, vr8, vr22 // t1
+ vssrarni.h.w vr1, vr0, 12 // t3 t2
+ vssrarni.h.w vr3, vr2, 12 // t0 t1
+ vsadd.h vr8, vr3, vr1 // t0 t1
+ vssub.h vr10, vr3, vr1 // t3 t2
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vldrepl.w vr22, t0, 24 // 3406
+ vldrepl.w vr23, t0, 28 // 2276
+
+ vmul.w vr0, vr19, vr21 // in1 * 4017
+ vmul.w vr1, vr19, vr20 // in1 * 799
+ vmadd.w vr0, vr11, vr20 // in7 * 799 // t7a
+ vmsub.w vr1, vr11, vr21 // in7 * 4017 // t4a
+ vmul.w vr2, vr9, vr23 // in5 * 2276
+ vmul.w vr3, vr9, vr22 // in5 * 3406
+ vmadd.w vr2, vr7, vr22 // in3 * 3406 // t6a
+ vmsub.w vr3, vr7, vr23 // in3 * 2276 // t5a
+ vssrarni.h.w vr0, vr1, 12 // t4a t7a
+ vssrarni.h.w vr2, vr3, 12 // t5a t6a
+ vsadd.h vr9, vr0, vr2 // t4 t7
+ vssub.h vr11, vr0, vr2 // t5a t6a
+
+ vldrepl.w vr22, t0, 0 // 2896
+ vexth.w.h vr18, vr11 // t6a
+ vsllwil.w.h vr19, vr11, 0 // t5a
+ vmul.w vr6, vr18, vr22
+ vmul.w vr7, vr18, vr22
+ vmadd.w vr6, vr19, vr22 // t6
+ vmsub.w vr7, vr19, vr22 // t5
+ vssrarni.h.w vr6, vr7, 12 // t5 t6
+
+ vilvh.d vr11, vr6, vr9 // t7 t6
+ vilvl.d vr9, vr6, vr9 // t4 t5
+
+ vsadd.h \out0, vr8, vr11 // c[0] c[1]
+ vsadd.h \out1, vr10, vr9 // c[3] c[2]
+ vssub.h \out2, vr10, vr9 // c[4] c[5]
+ vssub.h \out3, vr8, vr11 // c[7] c[6]
+.endm
+
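+ // 4-point inverse DCT on four packed 16-bit vectors: odd part (\in4/\in5) from the
+ // high halves, even part (\in6/\in7) from the low halves; \out0..\out3 = c[0]..c[3].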
+.macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3
+ vexth.w.h vr4, \in0 // in1
+ vexth.w.h vr5, \in1 // in3
+ vmul.w vr6, vr4, \in4
+ vmul.w vr7, vr4, \in5
+ vmadd.w vr6, vr5, \in5 // t3
+ vmsub.w vr7, vr5, \in4 // t2
+ vexth.w.h vr4, \in2 // in1
+ vexth.w.h vr5, \in3 // in3
+ vmul.w vr8, vr4, \in4
+ vmul.w vr9, vr4, \in5
+ vmadd.w vr8, vr5, \in5 // t3
+ vmsub.w vr9, vr5, \in4 // t2
+ vssrarni.h.w vr8, vr6, 12 // t3
+ vssrarni.h.w vr9, vr7, 12 // t2
+
+ vsllwil.w.h vr4, \in0, 0
+ vsllwil.w.h vr5, \in1, 0
+ vmul.w vr11, vr4, \in6
+ vmul.w vr12, vr4, \in7
+ vmadd.w vr11, vr5, \in7 // t0
+ vmsub.w vr12, vr5, \in6 // t1
+ vsllwil.w.h vr4, \in2, 0
+ vsllwil.w.h vr5, \in3, 0
+ vmul.w vr13, vr4, \in6
+ vmul.w vr14, vr4, \in7
+ vmadd.w vr13, vr5, \in7 // t0
+ vmsub.w vr14, vr5, \in6 // t1
+ vssrarni.h.w vr13, vr11, 12 // t0
+ vssrarni.h.w vr14, vr12, 12 // t1
+
+ vsadd.h \out0, vr13, vr8
+ vsadd.h \out1, vr14, vr9
+ vssub.h \out2, vr14, vr9
+ vssub.h \out3, vr13, vr8
+.endm
+
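+ // Add four 8-wide rows of residuals (\in4..\in7) to the destination rows \in0..\in3,
+ // saturate to 8 bit and store at a0, a0 + a1, t2 and t2 + a1.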
+.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
+ vsllwil.hu.bu vr10, \in0, 0
+ vsllwil.hu.bu vr11, \in1, 0
+ vsllwil.hu.bu vr12, \in2, 0
+ vsllwil.hu.bu vr13, \in3, 0
+ vadd.h vr10, \in4, vr10
+ vadd.h vr11, \in5, vr11
+ vadd.h vr12, \in6, vr12
+ vadd.h vr13, \in7, vr13
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vstelm.d vr11, a0, 0, 0
+ add.d t8, a0, a1
+ vstelm.d vr11, t8, 0, 1
+ vstelm.d vr13, t2, 0, 0
+ add.d t8, t2, a1
+ vstelm.d vr13, t8, 0, 1
+.endm
+
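+ // Load four 8-wide destination rows from a0, a0 + a1, t2, t2 + a1 and feed them to DST_ADD_W8.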
+.macro VLD_DST_ADD_W8 in0, in1, in2, in3
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vld vr2, t2, 0
+ vldx vr3, t2, a1
+
+ DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
+.endm
+
+function inv_txfm_add_dct_dct_8x4_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x4
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8
+ vld vr10, a0, 0
+ vmul.w vr2, vr2, vr0
+ vldx vr11, a0, a1
+ vsrari.w vr2, vr2, 8
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+ b .DCT_DCT_8X4_END
+
+.NO_HAS_DCONLY_8x4:
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
+ vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 in0
+ vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 in1
+ vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15
+ vilvl.h vr2, vr5, vr4 // 16 - 23 in2
+ vilvh.h vr3, vr5, vr4 // 24 - 31 in3
+
+ la.local t0, idct_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+
+ dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+
+.DCT_DCT_8X4_END:
+endfunc
+
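+ // identity8: narrow the eight 32-bit inputs to four packed 16-bit vectors and
+ // double them with saturation (identity8 scales by 2).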
+.macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3
+ vssrarni.h.w \in1, \in0, 0
+ vssrarni.h.w \in3, \in2, 0
+ vssrarni.h.w \in5, \in4, 0
+ vssrarni.h.w \in7, \in6, 0
+ vsadd.h \out0, \in1, \in1
+ vsadd.h \out1, \in3, \in3
+ vsadd.h \out2, \in5, \in5
+ vsadd.h \out3, \in7, \in7
+.endm
+
+function inv_txfm_add_identity_identity_8x4_8bpc_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr19, vr7, vr9, vr11
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+ identity_4x4_lsx vr19, vr19, vr20, vr19, vr19
+ identity_4x4_lsx vr7, vr7, vr20, vr7, vr7
+ identity_4x4_lsx vr9, vr9, vr20, vr9, vr9
+ identity_4x4_lsx vr11, vr11, vr20, vr11, vr11
+
+ vsrari.h vr15, vr19, 4
+ vsrari.h vr16, vr7, 4
+ vsrari.h vr17, vr9, 4
+ vsrari.h vr18, vr11, 4
+
+ vilvl.h vr4, vr16, vr15
+ vilvh.h vr5, vr16, vr15
+ vilvl.h vr11, vr5, vr4
+ vilvh.h vr12, vr5, vr4
+ vilvl.h vr4, vr18, vr17
+ vilvh.h vr5, vr18, vr17
+ vilvl.h vr13, vr5, vr4
+ vilvh.h vr14, vr5, vr4
+ vilvl.d vr15, vr13, vr11
+ vilvh.d vr16, vr13, vr11
+ vilvl.d vr17, vr14, vr12
+ vilvh.d vr18, vr14, vr12
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+const iadst8_coeffs, align=4
+ .word 4076, 401, 3612, 1931
+ .word 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .word 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
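+ // Two multiply-add/multiply-sub rotation pairs; each pair is narrowed to 16 bit
+ // into \out1 and \out3 with a rounding shift of 12.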
+.macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, out0, out1, out2, out3
+ vmul.w \out0, \in0, \in4
+ vmul.w \out1, \in0, \in5
+ vmadd.w \out0, \in1, \in6 // t0a
+ vmsub.w \out1, \in1, \in7 // t1a
+ vmul.w \out2, \in2, \in8
+ vmul.w \out3, \in2, \in9
+ vmadd.w \out2, \in3, \in10 // t2a
+ vmsub.w \out3, \in3, \in11 // t3a
+ vssrarni.h.w \out1, \out0, 12 // t0a t1a
+ vssrarni.h.w \out3, \out2, 12 // t2a t3a
+.endm
+
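+ // 1-D 8-point inverse ADST on the widened inputs (vr18, vr19, vr6..vr11 = in0..in7);
+ // packed output pairs: vr13 = out0/out7, vr17 = out1/out6, vr18 = out2/out5, vr15 = out3/out4.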
+.macro adst8x4_1d_lsx
+ la.local t0, iadst8_coeffs
+
+ vldrepl.w vr20, t0, 0 // 4076
+ vldrepl.w vr21, t0, 4 // 401
+ vldrepl.w vr22, t0, 8 // 3612
+ vldrepl.w vr23, t0, 12 // 1931
+
+ // vr13 t0a t1a vr15 t2a t3a
+ vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
+ vldrepl.w vr20, t0, 16 // 2598
+ vldrepl.w vr21, t0, 20 // 3166
+ vldrepl.w vr22, t0, 24 // 1189
+ vldrepl.w vr23, t0, 28 // 3920
+
+ // vr18 t4a t5a vr6 t6a t7a
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6
+
+ vsadd.h vr12, vr13, vr18 // t0 t1
+ vsadd.h vr14, vr15, vr6 // t2 t3
+ vssub.h vr16, vr13, vr18 // t4 t5
+ vssub.h vr18, vr15, vr6 // t6 t7
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr7, vr16, 0 // t4
+ vexth.w.h vr8, vr16 // t5
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr13 out0 out7 vr17 out1 out6
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
+ vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19
+ vshuf4i.d vr19, vr19, 0x01
+
+ vsadd.h vr13, vr12, vr14 // out0 out7
+ vssub.h vr16, vr12, vr14 // t2 t3
+ vsadd.h vr17, vr15, vr19 // out1 out6
+ vssub.h vr18, vr15, vr19 // t6 t7
+
+ vexth.w.h vr20, vr13 // out7
+ vsllwil.w.h vr21, vr17, 0 // out1
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out7 out1
+ vilvl.d vr13, vr21, vr13 // out0 out7
+ vilvh.d vr17, vr17, vr21 // out1 out6
+
+ vsllwil.w.h vr7, vr16, 0 // t2
+ vexth.w.h vr8, vr16 // t3
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr15 out[3] out[4] vr18 out[2] out[5]
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
+ vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18
+
+ vexth.w.h vr20, vr18 // out5
+ vsllwil.w.h vr21, vr15, 0 // out3
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out5 out3
+ vilvl.d vr18, vr21, vr18 // out2 out5
+ vilvh.d vr15, vr15, vr21 // out3 out4
+.endm
+
+function inv_txfm_add_adst_dct_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr2, vr5, vr4
+ vilvh.w vr3, vr5, vr4
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_dct_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr3, vr2
+ vilvh.h vr5, vr3, vr2
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_adst_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr2, vr5, vr4
+ vilvh.w vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr2, 0
+ vexth.w.h vr11, vr2
+ vsllwil.w.h vr12, vr3, 0
+ vexth.w.h vr13, vr3
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr0, 0
+ vexth.w.h vr15, vr0
+ vsllwil.w.h vr16, vr1, 0
+ vexth.w.h vr17, vr1
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr2, vr5, vr4
+ vilvh.w vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr3, vr2
+ vilvh.h vr5, vr3, vr2
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0 // in0
+ vexth.w.h vr11, vr0 // in1
+ vsllwil.w.h vr12, vr1, 0 // in2
+ vexth.w.h vr13, vr1 // in3
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr2, 0 // in0
+ vexth.w.h vr11, vr2 // in1
+ vsllwil.w.h vr12, vr3, 0 // in2
+ vexth.w.h vr13, vr3 // in3
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr0, 0
+ vexth.w.h vr15, vr0
+ vsllwil.w.h vr16, vr1, 0
+ vexth.w.h vr17, vr1
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_dct_identity_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr3, vr2
+ vilvh.h vr5, vr3, vr2
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+ vilvl.d vr14, vr2, vr0
+ vilvh.d vr15, vr2, vr0
+ vilvl.d vr16, vr3, vr1
+ vilvh.d vr17, vr3, vr1
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
+ identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
+ identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
+ identity_4x4_lsx vr17, vr17, vr20, vr17, vr17
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_identity_dct_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr19, vr7, vr9, vr11
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vilvl.h vr4, vr7, vr19
+ vilvh.h vr5, vr7, vr19
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr11, vr9
+ vilvh.h vr5, vr11, vr9
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+ vilvl.d vr14, vr0, vr2 // in0
+ vilvh.d vr15, vr0, vr2 // in1
+ vilvl.d vr16, vr1, vr3 // in2
+ vilvh.d vr17, vr1, vr3 // in3
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
+ identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
+ identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
+ identity_4x4_lsx vr17, vr17, vr20, vr17, vr17
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr19, vr7, vr9, vr11
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vilvl.h vr4, vr7, vr19
+ vilvh.h vr5, vr7, vr19
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr11, vr9
+ vilvh.h vr5, vr11, vr9
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0 // in0
+ vexth.w.h vr11, vr0 // in1
+ vsllwil.w.h vr12, vr1, 0 // in2
+ vexth.w.h vr13, vr1 // in3
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_adst_identity_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr14, vr5, vr4 // in0 in1
+ vilvh.w vr16, vr5, vr4 // in2 in3
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr17, vr5, vr4
+ vilvh.w vr18, vr5, vr4
+ vilvl.d vr10, vr17, vr14 // in0
+ vilvh.d vr11, vr17, vr14 // in1
+ vilvl.d vr12, vr18, vr16 // in2
+ vilvh.d vr13, vr18, vr16 // in3
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr10, vr10, vr20, vr10, vr15
+ identity_4x4_lsx vr11, vr11, vr20, vr11, vr16
+ identity_4x4_lsx vr12, vr12, vr20, vr12, vr17
+ identity_4x4_lsx vr13, vr13, vr20, vr13, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_identity_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr0, vr1, vr2, vr3
+
+ vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
+ vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15
+ vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15
+ vilvl.h vr2, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr3, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ la.local t0, iadst4_coeffs
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
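+ // 8x8 identity_identity: both passes double the coefficients (identity8), with a
+ // rounding shift of 1 between the passes and 4 after the second.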
+function inv_txfm_add_identity_identity_8x8_8bpc_lsx
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr14, 1
+ vsllwil.w.h vr13, vr15, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr14, vr12, 1 // in6
+ vssrarni.h.w vr15, vr13, 1 // in7
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr6, vr16, 1
+ vsllwil.w.h vr7, vr17, 1
+ vsllwil.w.h vr8, vr18, 1
+ vsllwil.w.h vr9, vr19, 1
+ vsllwil.w.h vr10, vr20, 1
+ vsllwil.w.h vr11, vr21, 1
+ vsllwil.w.h vr12, vr22, 1
+ vsllwil.w.h vr13, vr23, 1
+
+.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr16, vr6, 4 // in0
+ vssrarni.h.w vr17, vr7, 4 // in1
+ vssrarni.h.w vr18, vr8, 4 // in2
+ vssrarni.h.w vr19, vr9, 4 // in3
+ vssrarni.h.w vr20, vr10, 4 // in4
+ vssrarni.h.w vr21, vr11, 4 // in5
+ vssrarni.h.w vr22, vr12, 4 // in6
+ vssrarni.h.w vr23, vr13, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+endfunc
+
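+ // 1-D 8-point inverse ADST (same flow as adst8x4_1d_lsx) with selectable outputs:
+ // \out0 = out0/out7, \out1 = out1/out6, \out2 = out2/out5, \out3 = out3/out4.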
+.macro adst8x8_1d_lsx out0, out1, out2, out3
+ la.local t0, iadst8_coeffs
+
+ vldrepl.w vr20, t0, 0 // 4076
+ vldrepl.w vr21, t0, 4 // 401
+ vldrepl.w vr22, t0, 8 // 3612
+ vldrepl.w vr23, t0, 12 // 1931
+
+ // vr13 t0a t1a vr15 t2a t3a
+ vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
+ vldrepl.w vr20, t0, 16 // 2598
+ vldrepl.w vr21, t0, 20 // 3166
+ vldrepl.w vr22, t0, 24 // 1189
+ vldrepl.w vr23, t0, 28 // 3920
+
+ // vr18 t4a t5a vr6 t6a t7a
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6
+
+ vsadd.h vr12, vr13, vr18 // t0 t1
+ vsadd.h vr14, vr15, vr6 // t2 t3
+ vssub.h vr9, vr13, vr18 // t4 t5
+ vssub.h vr18, vr15, vr6 // t6 t7
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr7, vr9, 0 // t4
+ vexth.w.h vr8, vr9 // t5
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr13 out0 out7 vr7 out1 out6
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
+ vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19
+ vshuf4i.d vr19, vr19, 0x01
+
+ vsadd.h vr13, vr12, vr14 // out0 out7
+ vssub.h vr6, vr12, vr14 // t2 t3
+ vsadd.h vr7, vr15, vr19 // out1 out6
+ vssub.h vr18, vr15, vr19 // t6 t7
+
+ vexth.w.h vr20, vr13 // out7
+ vsllwil.w.h vr21, vr7, 0 // out1
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out7 out1
+ vilvl.d \out0, vr21, vr13 // out0 out7
+ vilvh.d \out1, vr7, vr21 // out1 out6
+
+ vsllwil.w.h vr7, vr6, 0 // t2
+ vexth.w.h vr8, vr6 // t3
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr15 out[3] out[4] vr18 out[2] out[5]
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
+ vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18
+
+ vexth.w.h vr20, vr18 // out5
+ vsllwil.w.h vr21, vr15, 0 // out3
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out5 out3
+ vilvl.d \out2, vr21, vr18 // out2 out5
+ vilvh.d \out3, vr15, vr21 // out3 out4
+.endm
+
+function inv_txfm_add_adst_dct_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr14, vr14, 0x1b
+ vshuf4i.h vr15, vr15, 0x1b
+ vshuf4i.h vr24, vr24, 0x1b
+ vshuf4i.h vr25, vr25, 0x1b
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr12
+ vexth.w.h vr11, vr13
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr4, vr5, vr12, vr13
+
+ vshuf4i.d vr5, vr5, 0x01
+ vshuf4i.d vr13, vr13, 0x01
+
+ vsllwil.w.h vr18, vr14, 0
+ vsllwil.w.h vr19, vr15, 0
+ vsllwil.w.h vr6, vr24, 0
+ vsllwil.w.h vr7, vr25, 0
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr14, vr15, vr24, vr25
+
+ vshuf4i.d vr15, vr15, 0x01
+ vshuf4i.d vr25, vr25, 0x01
+
+ vilvl.d vr20, vr14, vr4
+ vilvh.d vr21, vr14, vr4
+ vilvl.d vr22, vr15, vr5
+ vilvh.d vr23, vr15, vr5
+ vilvl.d vr16, vr24, vr12
+ vilvh.d vr17, vr24, vr12
+ vilvl.d vr18, vr25, vr13
+ vilvh.d vr19, vr25, vr13
+
+.irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_dct_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+
+ dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
+
+ vshuf4i.d vr27, vr27, 0x01
+ vshuf4i.d vr29, vr29, 0x01
+
+ vilvl.h vr8, vr27, vr26 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr27, vr26 // 1 3 5 7 9 11 13 15
+ vilvl.h vr26, vr9, vr8 // 0 - 7 in0
+ vilvh.h vr27, vr9, vr8 // 8 - 15 in1
+ vilvl.h vr8, vr29, vr28 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr29, vr28 // 1 3 5 7 9 11 13 15
+ vilvl.h vr28, vr9, vr8 // 16 - 23 in2
+ vilvh.h vr29, vr9, vr8 // 24 - 31 in3
+
+ vsrari.h vr26, vr26, 1 // in0low in1low
+ vsrari.h vr27, vr27, 1 // in2low in3low
+ vsrari.h vr28, vr28, 1 // in0high in1high
+ vsrari.h vr29, vr29, 1 // in2high in3high
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
+
+ vshuf4i.d vr13, vr13, 0x01
+ vshuf4i.d vr15, vr15, 0x01
+
+ vilvl.h vr8, vr13, vr12 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr13, vr12 // 1 3 5 7 9 11 13 15
+ vilvl.h vr12, vr9, vr8 // 0 - 7 in0
+ vilvh.h vr13, vr9, vr8 // 8 - 15 in1
+ vilvl.h vr8, vr15, vr14 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr15, vr14 // 1 3 5 7 9 11 13 15
+ vilvl.h vr14, vr9, vr8 // 16 - 23 in2
+ vilvh.h vr15, vr9, vr8 // 24 - 31 in3
+
+ vsrari.h vr0, vr12, 1 // in4low in5low
+ vsrari.h vr1, vr13, 1 // in6low in7low
+ vsrari.h vr2, vr14, 1 // in4high in5high
+ vsrari.h vr3, vr15, 1 // in6high in7high
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr26, vr27, vr0, vr1
+
+ vsllwil.w.h vr18, vr28, 0 // in0
+ vexth.w.h vr19, vr28 // in1
+ vsllwil.w.h vr6, vr29, 0 // in2
+ vexth.w.h vr7, vr29 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr28, vr29, vr16, vr17
+
+ vilvl.d vr4, vr28, vr26 // 0 ... 7
+ vilvl.d vr5, vr29, vr27 // 8 ... 15
+ vilvl.d vr6, vr16, vr0 // 16 ... 23
+ vilvl.d vr7, vr17, vr1 // 24 ... 31
+ vilvh.d vr14, vr17, vr1 // 32 ... 39
+ vilvh.d vr15, vr16, vr0 // 40 ... 47
+ vilvh.d vr16, vr29, vr27 // 48 ... 55
+ vilvh.d vr17, vr28, vr26 // 56 ... 63
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_adst_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0 // in0
+ vexth.w.h vr19, vr1 // in1
+ vexth.w.h vr6, vr2 // in2
+ vexth.w.h vr7, vr3 // in3
+ vexth.w.h vr8, vr4 // in4
+ vexth.w.h vr9, vr5 // in5
+ vexth.w.h vr10, vr16 // in6
+ vexth.w.h vr11, vr17 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr4, vr4, 0x1b
+ vshuf4i.h vr5, vr5, 0x1b
+ vshuf4i.h vr24, vr24, 0x1b
+ vshuf4i.h vr25, vr25, 0x1b
+
+ vsllwil.w.h vr18, vr14, 0
+ vsllwil.w.h vr19, vr15, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vexth.w.h vr8, vr14 // in4
+ vexth.w.h vr9, vr15 // in5
+ vexth.w.h vr10, vr12 // in6
+ vexth.w.h vr11, vr13 // in7
+
+ adst8x8_1d_lsx vr26, vr27, vr0, vr1
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr24, 0
+ vsllwil.w.h vr7, vr25, 0
+ vexth.w.h vr8, vr4 // in4
+ vexth.w.h vr9, vr5 // in5
+ vexth.w.h vr10, vr24 // in6
+ vexth.w.h vr11, vr25 // in7
+
+ adst8x8_1d_lsx vr24, vr25, vr16, vr17
+
+ vilvl.d vr4, vr24, vr26 // 0 ... 7
+ vilvl.d vr5, vr25, vr27 // 8 ... 15
+ vilvl.d vr6, vr16, vr0 // 16 ... 23
+ vilvl.d vr7, vr17, vr1 // 24 ... 31
+ vilvh.d vr14, vr17, vr1 // 32 ... 39
+ vilvh.d vr15, vr16, vr0 // 40 ... 47
+ vilvh.d vr16, vr25, vr27 // 48 ... 55
+ vilvh.d vr17, vr24, vr26 // 56 ... 63
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr0, vr20, vr21
+ vilvh.w vr1, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr2, vr20, vr21
+ vilvh.w vr3, vr20, vr21
+ vshuf4i.h vr2, vr2, 0x1b
+ vshuf4i.h vr3, vr3, 0x1b
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr24, 0 // in0
+ vexth.w.h vr19, vr24 // in1
+ vsllwil.w.h vr6, vr25, 0 // in2
+ vexth.w.h vr7, vr25 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvl.d vr20, vr0, vr4 // 0 ... 7
+ vilvl.d vr21, vr1, vr5 // 8 ... 15
+ vilvl.d vr22, vr2, vr16 // 16 ... 23
+ vilvl.d vr23, vr3, vr17 // 24 ... 31
+ vilvh.d vr14, vr3, vr17 // 32 ... 39
+ vilvh.d vr15, vr2, vr16 // 40 ... 47
+ vilvh.d vr16, vr1, vr5 // 48 ... 55
+ vilvh.d vr17, vr0, vr4 // 56 ... 63
+
+.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr0, vr0, 0x1b
+ vshuf4i.h vr1, vr1, 0x1b
+ vshuf4i.h vr2, vr2, 0x1b
+ vshuf4i.h vr3, vr3, 0x1b
+
+ vsllwil.w.h vr18, vr0, 0 // in0
+ vsllwil.w.h vr19, vr1, 0 // in1
+ vsllwil.w.h vr6, vr2, 0 // in2
+ vsllwil.w.h vr7, vr3, 0 // in3
+ vexth.w.h vr8, vr0 // in4
+ vexth.w.h vr9, vr1 // in5
+ vexth.w.h vr10, vr2 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr24, 0 // in0
+ vsllwil.w.h vr19, vr25, 0 // in1
+ vsllwil.w.h vr6, vr26, 0 // in2
+ vsllwil.w.h vr7, vr27, 0 // in3
+ vexth.w.h vr8, vr24 // in4
+ vexth.w.h vr9, vr25 // in5
+ vexth.w.h vr10, vr26 // in6
+ vexth.w.h vr11, vr27 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr20, vr4, vr0
+ vilvh.d vr21, vr5, vr1
+ vilvh.d vr22, vr16, vr2
+ vilvh.d vr23, vr17, vr3
+ vilvl.d vr14, vr17, vr3
+ vilvl.d vr15, vr16, vr2
+ vilvl.d vr18, vr5, vr1
+ vilvl.d vr19, vr4, vr0
+
+.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr18, vr19
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr0, vr20, vr21
+ vilvh.w vr1, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr2, vr20, vr21
+ vilvh.w vr3, vr20, vr21
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsrari.h vr24, vr24, 1
+ vsrari.h vr25, vr25, 1
+ vsrari.h vr26, vr26, 1
+ vsrari.h vr27, vr27, 1
+ vsrari.h vr14, vr0, 1
+ vsrari.h vr15, vr1, 1
+ vsrari.h vr16, vr2, 1
+ vsrari.h vr17, vr3, 1
+
+ vsllwil.w.h vr18, vr26, 0
+ vexth.w.h vr19, vr26
+ vsllwil.w.h vr6, vr27, 0
+ vexth.w.h vr7, vr27
+ vsllwil.w.h vr8, vr16, 0
+ vexth.w.h vr9, vr16
+ vsllwil.w.h vr10, vr17, 0
+ vexth.w.h vr11, vr17
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr26, vr27, vr16, vr17
+
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+ vshuf4i.h vr16, vr16, 0x1b
+ vshuf4i.h vr17, vr17, 0x1b
+
+ vsllwil.w.h vr18, vr24, 0
+ vexth.w.h vr19, vr24
+ vsllwil.w.h vr6, vr25, 0
+ vexth.w.h vr7, vr25
+ vsllwil.w.h vr8, vr14, 0
+ vexth.w.h vr9, vr14
+ vsllwil.w.h vr10, vr15, 0
+ vexth.w.h vr11, vr15
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr24, vr25, vr14, vr15
+
+ vilvl.d vr4, vr24, vr26
+ vilvh.d vr5, vr24, vr26
+ vilvh.d vr6, vr25, vr27
+ vilvl.d vr7, vr25, vr27
+ vilvl.d vr24, vr14, vr16
+ vilvh.d vr25, vr14, vr16
+ vilvh.d vr26, vr15, vr17
+ vilvl.d vr27, vr15, vr17
+
+.irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr24, vr25, vr26, vr27
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
+ vshuf4i.d vr27, vr27, 0x01
+ vshuf4i.d vr29, vr29, 0x01
+
+ vilvl.h vr8, vr27, vr26
+ vilvh.h vr9, vr27, vr26
+ vilvl.h vr26, vr9, vr8
+ vilvh.h vr27, vr9, vr8
+ vilvl.h vr8, vr29, vr28
+ vilvh.h vr9, vr29, vr28
+ vilvl.h vr28, vr9, vr8
+ vilvh.h vr29, vr9, vr8
+
+ vsrari.h vr26, vr26, 1 // in0low in1low
+ vsrari.h vr27, vr27, 1 // in2low in3low
+ vsrari.h vr28, vr28, 1 // in0high in1high
+ vsrari.h vr29, vr29, 1 // in2high in3high
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+ dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
+ vshuf4i.d vr13, vr13, 0x01
+ vshuf4i.d vr15, vr15, 0x01
+
+ vilvl.h vr8, vr13, vr12
+ vilvh.h vr9, vr13, vr12
+ vilvl.h vr12, vr9, vr8
+ vilvh.h vr13, vr9, vr8
+ vilvl.h vr8, vr15, vr14
+ vilvh.h vr9, vr15, vr14
+ vilvl.h vr14, vr9, vr8
+ vilvh.h vr15, vr9, vr8
+
+ vsrari.h vr0, vr12, 1
+ vsrari.h vr1, vr13, 1
+ vsrari.h vr2, vr14, 1
+ vsrari.h vr3, vr15, 1
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr28, 0 // in0
+ vexth.w.h vr19, vr28 // in1
+ vsllwil.w.h vr6, vr29, 0 // in2
+ vexth.w.h vr7, vr29 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr26, vr4, vr0
+ vilvh.d vr27, vr5, vr1
+ vilvh.d vr28, vr16, vr2
+ vilvh.d vr29, vr17, vr3
+ vilvl.d vr20, vr17, vr3
+ vilvl.d vr21, vr16, vr2
+ vilvl.d vr22, vr5, vr1
+ vilvl.d vr23, vr4, vr0
+
+.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr26, vr27, vr28, vr29
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr0, vr20, vr21
+ vilvh.w vr1, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr2, vr20, vr21
+ vilvh.w vr3, vr20, vr21
+ vshuf4i.h vr2, vr2, 0x1b
+ vshuf4i.h vr3, vr3, 0x1b
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr24, 0 // in0
+ vexth.w.h vr19, vr24 // in1
+ vsllwil.w.h vr6, vr25, 0 // in2
+ vexth.w.h vr7, vr25 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr24, vr0, vr4
+ vilvh.d vr25, vr1, vr5
+ vilvh.d vr26, vr2, vr16
+ vilvh.d vr27, vr3, vr17
+ vilvl.d vr20, vr3, vr17
+ vilvl.d vr21, vr2, vr16
+ vilvl.d vr22, vr1, vr5
+ vilvl.d vr23, vr0, vr4
+
+.irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr24, vr25, vr26, vr27
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_dct_identity_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
+ vshuf4i.d vr27, vr27, 0x01
+ vshuf4i.d vr29, vr29, 0x01
+
+ vilvl.h vr8, vr27, vr26
+ vilvh.h vr9, vr27, vr26
+ vilvl.h vr26, vr9, vr8
+ vilvh.h vr27, vr9, vr8
+ vilvl.h vr8, vr29, vr28
+ vilvh.h vr9, vr29, vr28
+ vilvl.h vr28, vr9, vr8
+ vilvh.h vr29, vr9, vr8
+
+ vsrari.h vr26, vr26, 1 // in0low in1low
+ vsrari.h vr27, vr27, 1 // in2low in3low
+ vsrari.h vr28, vr28, 1 // in0high in1high
+ vsrari.h vr29, vr29, 1 // in2high in3high
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
+
+ vshuf4i.d vr13, vr13, 0x01
+ vshuf4i.d vr15, vr15, 0x01
+
+ vilvl.h vr8, vr13, vr12
+ vilvh.h vr9, vr13, vr12
+ vilvl.h vr12, vr9, vr8
+ vilvh.h vr13, vr9, vr8
+ vilvl.h vr8, vr15, vr14
+ vilvh.h vr9, vr15, vr14
+ vilvl.h vr14, vr9, vr8
+ vilvh.h vr15, vr9, vr8
+
+ vsrari.h vr20, vr12, 1
+ vsrari.h vr21, vr13, 1
+ vsrari.h vr22, vr14, 1
+ vsrari.h vr23, vr15, 1
+
+ vreplgr2vr.h vr19, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr19, a2, \i
+.endr
+ // identity8
+ vsllwil.w.h vr10, vr26, 1
+ vsllwil.w.h vr11, vr27, 1
+ vsllwil.w.h vr16, vr28, 1
+ vsllwil.w.h vr17, vr29, 1
+ vsllwil.w.h vr6, vr20, 1
+ vsllwil.w.h vr7, vr21, 1
+ vsllwil.w.h vr18, vr22, 1
+ vsllwil.w.h vr19, vr23, 1
+
+.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr16, vr10, 4 // in0
+ vssrarni.h.w vr28, vr26, 4 // in1
+ vssrarni.h.w vr17, vr11, 4 // in2
+ vssrarni.h.w vr29, vr27, 4 // in3
+ vssrarni.h.w vr18, vr6, 4 // in4
+ vssrarni.h.w vr22, vr20, 4 // in5
+ vssrarni.h.w vr19, vr7, 4 // in6
+ vssrarni.h.w vr23, vr21, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr28, vr17, vr29
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr18, vr22, vr19, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_identity_dct_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr24, 1
+ vsllwil.w.h vr13, vr25, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vslli.w \i, \i, 1
+.endr
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr24, vr12, 1 // in6
+ vssrarni.h.w vr25, vr13, 1 // in7
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ // dct4 in0 in2 in4 in6
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ dct_8x4_core_lsx1 vr16, vr17, vr26, vr27
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+ dct_8x4_core_lsx1 vr4, vr5, vr24, vr25
+
+ vilvl.d vr8, vr4, vr16
+ vilvh.d vr9, vr4, vr16
+ vilvh.d vr6, vr5, vr17
+ vilvl.d vr7, vr5, vr17
+ vilvl.d vr16, vr24, vr26
+ vilvh.d vr17, vr24, vr26
+ vilvh.d vr18, vr25, vr27
+ vilvl.d vr19, vr25, vr27
+
+.irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr8, vr9, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+
+ vexth.w.h vr18, vr0 // in0
+ vexth.w.h vr19, vr1 // in1
+ vexth.w.h vr6, vr2 // in2
+ vexth.w.h vr7, vr3 // in3
+    vexth.w.h     vr8,    vr4       // in4
+    vexth.w.h     vr9,    vr5       // in5
+    vexth.w.h     vr10,   vr16      // in6
+    vexth.w.h     vr11,   vr17      // in7
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr16, vr20, vr21
+ vilvh.w vr17, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr18, vr20, vr21
+ vilvh.w vr19, vr20, vr21
+ vshuf4i.h vr18, vr18, 0x1b
+ vshuf4i.h vr19, vr19, 0x1b
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
+ vsrari.h \i, \i, 1
+.endr
+
+ // identity8
+ vsllwil.w.h vr20, vr24, 1
+ vsllwil.w.h vr21, vr25, 1
+ vsllwil.w.h vr12, vr26, 1
+ vsllwil.w.h vr13, vr27, 1
+ vsllwil.w.h vr22, vr16, 1
+ vsllwil.w.h vr23, vr17, 1
+ vsllwil.w.h vr14, vr18, 1
+ vsllwil.w.h vr15, vr19, 1
+
+.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr20, vr12, 4 // in0
+ vssrarni.h.w vr24, vr26, 4 // in1
+ vssrarni.h.w vr21, vr13, 4 // in2
+ vssrarni.h.w vr25, vr27, 4 // in3
+ vssrarni.h.w vr22, vr14, 4 // in4
+ vssrarni.h.w vr16, vr18, 4 // in5
+ vssrarni.h.w vr23, vr15, 4 // in6
+ vssrarni.h.w vr17, vr19, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr24, vr21, vr25
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr22, vr16, vr23, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr24, 1
+ vsllwil.w.h vr13, vr25, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr24, vr12, 1 // in6
+ vssrarni.h.w vr25, vr13, 1 // in7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr0, 0 // in0
+ vsllwil.w.h vr19, vr1, 0 // in1
+ vsllwil.w.h vr6, vr2, 0 // in2
+ vsllwil.w.h vr7, vr3, 0 // in3
+    vsllwil.w.h   vr8,    vr4,    0 // in4
+    vsllwil.w.h   vr9,    vr5,    0 // in5
+    vsllwil.w.h   vr10,   vr24,   0 // in6
+    vsllwil.w.h   vr11,   vr25,   0 // in7
+ adst8x8_1d_lsx vr26, vr27, vr28, vr29
+
+ vexth.w.h vr18, vr0 // in0
+ vexth.w.h vr19, vr1 // in1
+ vexth.w.h vr6, vr2 // in2
+ vexth.w.h vr7, vr3 // in3
+    vexth.w.h     vr8,    vr4       // in4
+    vexth.w.h     vr9,    vr5       // in5
+    vexth.w.h     vr10,   vr24      // in6
+    vexth.w.h     vr11,   vr25      // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr4, vr0, vr26
+ vilvh.d vr5, vr1, vr27
+ vilvh.d vr6, vr2, vr28
+ vilvh.d vr7, vr3, vr29
+ vilvl.d vr14, vr3, vr29
+ vilvl.d vr15, vr2, vr28
+ vilvl.d vr16, vr1, vr27
+ vilvl.d vr17, vr0, vr26
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_adst_identity_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+ vshuf4i.h vr22, vr22, 0x1b
+ vshuf4i.h vr23, vr23, 0x1b
+
+ // identity8
+ vsllwil.w.h vr16, vr24, 1
+ vsllwil.w.h vr17, vr25, 1
+ vsllwil.w.h vr10, vr20, 1
+ vsllwil.w.h vr11, vr21, 1
+ vsllwil.w.h vr18, vr26, 1
+ vsllwil.w.h vr19, vr27, 1
+ vsllwil.w.h vr14, vr22, 1
+ vsllwil.w.h vr15, vr23, 1
+
+.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr18, vr16, 4 // in0
+ vssrarni.h.w vr19, vr17, 4 // in1
+ vssrarni.h.w vr14, vr10, 4 // in2
+ vssrarni.h.w vr15, vr11, 4 // in3
+ vssrarni.h.w vr26, vr24, 4 // in4
+ vssrarni.h.w vr27, vr25, 4 // in5
+ vssrarni.h.w vr22, vr20, 4 // in6
+ vssrarni.h.w vr23, vr21, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr18, vr19, vr14, vr15
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr26, vr27, vr22, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_identity_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr24, 1
+ vsllwil.w.h vr13, vr25, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr24, vr12, 1 // in6
+ vssrarni.h.w vr25, vr13, 1 // in7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ adst8x8_1d_lsx vr26, vr27, vr28, vr29
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvl.d vr4, vr0, vr26 // 0 ... 7
+ vilvl.d vr5, vr1, vr27 // 8 ... 15
+ vilvl.d vr6, vr2, vr28 // 16 ... 23
+ vilvl.d vr7, vr3, vr29 // 24 ... 31
+ vilvh.d vr14, vr3, vr29 // 32 ... 39
+ vilvh.d vr15, vr2, vr28 // 40 ... 47
+ vilvh.d vr16, vr1, vr27 // 48 ... 55
+ vilvh.d vr17, vr0, vr26 // 56 ... 63
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
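+// Widening multiply-accumulate helpers.  The int16 inputs \in0/\in1 are split
+// into low/high int32 halves (vsllwil.w.h / vexth.w.h) so that
+//     \out0 = lo(\in0) * \in2 + lo(\in1) * \in3
+//     \out1 = hi(\in0) * \in2 + hi(\in1) * \in3
+// with \in2/\in3 holding replicated 32-bit coefficients.  vmul_vmsub_w is the
+// same but subtracts the second product.  Both clobber vr22/vr23.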
+.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w \out0, vr22, \in2
+ vmul.w \out1, vr23, \in2
+ vsllwil.w.h vr22, \in1, 0
+ vexth.w.h vr23, \in1
+ vmadd.w \out0, vr22, \in3
+ vmadd.w \out1, vr23, \in3
+.endm
+
+.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w \out0, vr22, \in2
+ vmul.w \out1, vr23, \in2
+ vsllwil.w.h vr22, \in1, 0
+ vexth.w.h vr23, \in1
+ vmsub.w \out0, vr22, \in3
+ vmsub.w \out1, vr23, \in3
+.endm
+
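+// Rectangular-transform prescale: \out0 = sat16((\in0 * \in1 + 2048) >> 12)
+// per element, with \in1 holding a replicated 2896 (2896/4096 ~= 1/sqrt(2))
+// at every call site.  Clobbers vr22.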
+.macro rect2_lsx in0, in1, out0
+    vsllwil.w.h   vr22,   \in0,   0     // \in0 low half
+    vexth.w.h     \in0,   \in0          // \in0 high half
+ vmul.w vr22, vr22, \in1
+ vmul.w \out0, \in0, \in1
+ vssrarni.h.w \out0, vr22, 12
+.endm
+
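+// One 8-point inverse DCT pass, applied per 16-bit lane across the eight
+// vectors \in0..\in7; the results c[0]..c[7] land in \out0..\out7.  Passing
+// rect2_lsx as \rect2 prescales the inputs by 2896/4096 first (rectangular
+// blocks); any other value (e.g. no_rect2) skips that step.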
+.macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7, rect2
+
+ la.local t0, idct_coeffs
+
+.ifc \rect2, rect2_lsx
+ vldrepl.w vr23, t0, 0 // 2896
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ rect2_lsx \i, vr23, \i
+.endr
+.endif
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+
+ vmul_vmadd_w \in2, \in6, vr21, vr20, vr8, vr9
+ vssrarni.h.w vr9, vr8, 12 // t3
+ vmul_vmsub_w \in2, \in6, vr20, vr21, vr8, vr10
+ vssrarni.h.w vr10, vr8, 12 // t2
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w \in0, \in4, vr20, vr20, vr8, \in2
+ vssrarni.h.w \in2, vr8, 12 // t0
+ vmul_vmsub_w \in0, \in4, vr20, vr20, vr8, \in6
+ vssrarni.h.w \in6, vr8, 12 // t1
+
+ vsadd.h vr8, \in2, vr9 // c[0]
+ vssub.h vr9, \in2, vr9 // c[3]
+ vsadd.h \in0, \in6, vr10 // c[1]
+ vssub.h vr10, \in6, vr10 // c[2]
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w \in1, \in7, vr21, vr20, \in2, \in4
+ vssrarni.h.w \in4, \in2, 12 // t7a
+ vmul_vmsub_w \in1, \in7, vr20, vr21, \in2, \in6
+ vssrarni.h.w \in6, \in2, 12 // t4a
+
+ vldrepl.w vr20, t0, 24 // 3406
+ vldrepl.w vr21, t0, 28 // 2276
+ vmul_vmadd_w \in5, \in3, vr21, vr20, \in2, \in1
+ vssrarni.h.w \in1, \in2, 12 // t6a
+ vmul_vmsub_w \in5, \in3, vr20, vr21, \in2, \in7
+ vssrarni.h.w \in7, \in2, 12 // t5a
+
+ vsadd.h \in3, \in6, \in7 // t4
+ vssub.h \in6, \in6, \in7 // t5a
+ vsadd.h \in5, \in4, \in1 // t7
+ vssub.h \in4, \in4, \in1 // t6a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w \in4, \in6, vr20, vr20, \in2, \in1
+ vssrarni.h.w \in1, \in2, 12 // t6
+ vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7
+ vssrarni.h.w \in7, \in2, 12 // t5
+
+ vsadd.h \out0, vr8, \in5 // c[0]
+ vssub.h \out7, vr8, \in5 // c[7]
+ vsadd.h \out1, \in0, \in1 // c[1]
+ vssub.h \out6, \in0, \in1 // c[6]
+ vsadd.h \out2, vr10, \in7 // c[2]
+ vssub.h \out5, vr10, \in7 // c[5]
+ vsadd.h \out3, vr9, \in3 // c[3]
+ vssub.h \out4, vr9, \in3 // c[4]
+.endm
+
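+// a0 = dst, a1 = stride, a2 = coeff, a3 = eob.  When a3 is zero the block is
+// DC only, so a single constant is computed from coeff[0] and added to all
+// 8x8 destination pixels before taking the early exit.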
+function inv_txfm_add_dct_dct_8x8_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x8
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
+ vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
+ vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ b .DCT_DCT_8X8_END
+
+.NO_HAS_DCONLY_8x8:
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2
+
+.irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+.DCT_DCT_8X8_END:
+
+endfunc
+
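+// One 16-point inverse DCT pass, per lane.  The even-indexed inputs go
+// through dct_8x8_core_lsx, the odd-indexed ones through the t8a..t15a
+// chain, and the final butterflies leave c[0]..c[15] in
+// vr22/vr18/vr17/vr28/vr20/vr14/vr15/vr16 and
+// vr27/vr30/vr23/vr21/vr29/vr26/vr25/vr24, as listed at the call sites.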
+.macro dct_8x16_core_lsx
+ dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 32 // 401
+ vldrepl.w vr21, t0, 36 // 4076
+ vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10
+ vssrarni.h.w vr10, vr0, 12 // t15a
+ vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29
+ vssrarni.h.w vr29, vr0, 12 // t8a
+
+ vldrepl.w vr20, t0, 40 // 3166 -> 1583
+ vldrepl.w vr21, t0, 44 // 2598 -> 1299
+ vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t14a
+ vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t9a
+
+ vldrepl.w vr20, t0, 48 // 1931
+ vldrepl.w vr21, t0, 52 // 3612
+ vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24
+ vssrarni.h.w vr24, vr0, 12 // t13a
+ vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25
+ vssrarni.h.w vr25, vr0, 12 // t10a
+
+ vldrepl.w vr20, t0, 56 // 3920
+ vldrepl.w vr21, t0, 60 // 1189
+ vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t12a
+ vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t11a
+
+ // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
+ vsadd.h vr28, vr29, vr31 // t8
+ vssub.h vr19, vr29, vr31 // t9
+ vssub.h vr29, vr27, vr25 // t10
+ vsadd.h vr9, vr27, vr25 // t11
+ vsadd.h vr31, vr26, vr24 // t12
+ vssub.h vr25, vr26, vr24 // t13
+ vssub.h vr27, vr10, vr30 // t14
+ vsadd.h vr24, vr10, vr30 // t15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t14a
+ vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t9a
+
+ vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
+ vneg.w vr0, vr0
+ vneg.w vr19, vr19
+ vssrarni.h.w vr19, vr0, 12 // t10a
+ vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t13a
+
+ vsadd.h vr25, vr28, vr9 // t8a
+ vssub.h vr29, vr28, vr9 // t11a
+ vssub.h vr28, vr24, vr31 // t12a
+ vsadd.h vr10, vr24, vr31 // t15a
+ vsadd.h vr9, vr30, vr19 // t9
+ vssub.h vr31, vr30, vr19 // t10
+ vssub.h vr30, vr26, vr27 // t13
+ vsadd.h vr24, vr26, vr27 // t14
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t13a
+ vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t10a
+
+ vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t12
+ vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t11
+
+ // vr11 vr12 ... vr18
+ vsadd.h vr28, vr14, vr31 // c[3]
+ vssub.h vr29, vr14, vr31 // c[12]
+ vsadd.h vr20, vr15, vr30 // c[4]
+ vssub.h vr21, vr15, vr30 // c[11]
+ vsadd.h vr14, vr16, vr27 // c[5]
+ vssub.h vr23, vr16, vr27 // c[10]
+ vsadd.h vr15, vr17, vr9 // c[6]
+ vssub.h vr30, vr17, vr9 // c[9]
+ vsadd.h vr16, vr18, vr25 // c[7]
+ vssub.h vr27, vr18, vr25 // c[8]
+ vsadd.h vr17, vr13, vr26 // c[2]
+ vssub.h vr26, vr13, vr26 // c[13]
+ vsadd.h vr18, vr12, vr24 // c[1]
+ vssub.h vr25, vr12, vr24 // c[14]
+ vsadd.h vr22, vr11, vr10 // c[0]
+ vssub.h vr24, vr11, vr10 // c[15]
+.endm
+
+function inv_txfm_add_dct_dct_8x16_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x16
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
+ vmul.w vr2, vr0, vr2
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
+ vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ b .DCT_DCT_8X16_END
+
+.NO_HAS_DCONLY_8x16:
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
+
+ vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ dct_8x16_core_lsx
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr22, vr18, vr17, vr28
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr14, vr15, vr16
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr27, vr30, vr23, vr21
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr29, vr26, vr25, vr24
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.DCT_DCT_8X16_END:
+endfunc
+
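+// identity8 with the pass-1 downshift fused in: optionally rect2-prescale,
+// then double each coefficient in 32-bit precision and narrow back in place
+// (\in0..\in7) with a rounding right shift by 1 and 16-bit saturation.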
+.macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2
+
+ la.local t0, idct_coeffs
+
+.ifc \rect2, rect2_lsx
+ vldrepl.w vr23, t0, 0 // 2896
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ rect2_lsx \i, vr23, \i
+.endr
+.endif
+ vsllwil.w.h vr8, \in0, 1
+ vsllwil.w.h vr9, \in1, 1
+ vsllwil.w.h vr10, \in2, 1
+ vsllwil.w.h vr11, \in3, 1
+ vsllwil.w.h vr12, \in4, 1
+ vsllwil.w.h vr13, \in5, 1
+ vsllwil.w.h vr14, \in6, 1
+ vsllwil.w.h vr15, \in7, 1
+
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vexth.w.h \i, \i
+.endr
+
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w \in0, vr8, 1
+ vssrarni.h.w \in1, vr9, 1
+ vssrarni.h.w \in2, vr10, 1
+ vssrarni.h.w \in3, vr11, 1
+ vssrarni.h.w \in4, vr12, 1
+ vssrarni.h.w \in5, vr13, 1
+ vssrarni.h.w \in6, vr14, 1
+ vssrarni.h.w \in7, vr15, 1
+.endm
+
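+// identity16 for one vector: \out0 = sat16((\in0 * vr20 + 1024) >> 11) added
+// (saturating) to 2 * \in0, with vr20 preloaded with 1697 by the caller.
+// Clobbers vr8 and vr10.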
+.macro identity_8x16_core_lsx in0, out0
+ vsadd.h vr10, \in0, \in0
+ vsllwil.w.h vr8, \in0, 0
+ vexth.w.h \out0, \in0
+ vmul.w vr8, vr8, vr20
+ vmul.w \out0, \out0, vr20
+ vssrarni.h.w \out0, vr8, 11
+ vsadd.h \out0, \out0, vr10
+.endm
+
+function inv_txfm_add_identity_identity_8x16_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx
+
+ vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27
+
+ identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \
+ vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21
+
+ LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \
+ vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+.irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
+ vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27
+ identity_8x16_core_lsx \i, \i
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr18, vr24, vr26
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr28, vr29, vr30, vr31
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr19, vr25, vr27
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
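+// One 8-point inverse ADST pass, per lane, using iadst8_coeffs.  \rect2
+// selects the optional 2896 prescale as in dct_8x8_core_lsx; the odd
+// outputs (\out1/\out3/\out5/\out7) are negated before being written.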
+.macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7, rect2
+
+ la.local t0, iadst8_coeffs
+
+.ifc \rect2, rect2_lsx
+ vldrepl.w vr23, t0, 32 // 2896
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ rect2_lsx \i, vr23, \i
+.endr
+.endif
+
+ vldrepl.w vr20, t0, 0 // 4076
+ vldrepl.w vr21, t0, 4 // 401
+
+ vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9
+ vssrarni.h.w vr9, vr8, 12 // t0a low
+ vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10
+ vssrarni.h.w vr10, vr8, 12 // t1a low
+
+ vldrepl.w vr20, t0, 8 // 3612
+ vldrepl.w vr21, t0, 12 // 1931
+ vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0
+ vssrarni.h.w vr0, vr8, 12 // t2a low
+ vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7
+ vssrarni.h.w vr7, vr8, 12 // t3a low
+
+ vldrepl.w vr20, t0, 16 // 2598 -> 1299
+ vldrepl.w vr21, t0, 20 // 3166 -> 1583
+ vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2
+ vssrarni.h.w vr2, vr8, 12 // t4a low
+ vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5
+ vssrarni.h.w vr5, vr8, 12 // t5a low
+
+ vldrepl.w vr20, t0, 24 // 1189
+ vldrepl.w vr21, t0, 28 // 3920
+ vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3
+ vssrarni.h.w vr3, vr8, 12 // t6a low
+ vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4
+ vssrarni.h.w vr4, vr8, 12 // t7a low
+
+ vsadd.h vr1, vr9, vr2 // t0
+ vssub.h vr6, vr9, vr2 // t4
+ vsadd.h vr8, vr10, vr5 // t1
+ vssub.h vr2, vr10, vr5 // t5
+ vsadd.h vr9, vr0, vr3 // t2
+ vssub.h vr5, vr0, vr3 // t6
+ vsadd.h vr10, vr7, vr4 // t3
+ vssub.h vr0, vr7, vr4 // t7
+
+ vldrepl.w vr20, t0, 40 // 1567
+ vldrepl.w vr21, t0, 44 // 3784
+ vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4
+ vssrarni.h.w vr4, vr3, 12 // t4a low
+ vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7
+ vssrarni.h.w vr7, vr3, 12 // t5a low
+
+ vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2
+ vssrarni.h.w vr2, vr3, 12 // t7a low
+ vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6
+ vssrarni.h.w vr6, vr3, 12 // t6a low
+
+ vsadd.h \out0, vr1, vr9 // out[0]
+ vssub.h vr5, vr1, vr9 // t2
+ vsadd.h vr3, vr8, vr10 // out[7]
+ vssub.h vr1, vr8, vr10 // t3
+ vexth.w.h vr9, vr3
+ vsllwil.w.h vr21, vr3, 0
+ vneg.w \out7, vr9
+ vneg.w vr21, vr21
+ vssrarni.h.w \out7, vr21, 0 // out[7]
+
+ vsadd.h vr8, vr4, vr6 // out[1]
+ vssub.h vr10, vr4, vr6 // t6
+ vexth.w.h vr20, vr8
+ vsllwil.w.h vr21, vr8, 0
+ vneg.w \out1, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w \out1, vr21, 0 // out[1]
+ vsadd.h \out6, vr7, vr2 // out[6]
+ vssub.h vr4, vr7, vr2 // t7
+
+ vldrepl.w vr20, t0, 32 // 2896
+ vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6
+ vssrarni.h.w vr6, vr9, 12 // out[3]
+ vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4
+ vssrarni.h.w \out4, vr9, 12 // out[4]
+
+ vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2
+ vssrarni.h.w \out2, vr9, 12 // out[2]
+ vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5
+ vssrarni.h.w vr5, vr9, 12 // out[5]
+
+ vexth.w.h vr20, vr6
+ vsllwil.w.h vr21, vr6, 0
+ vneg.w \out3, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w \out3, vr21, 0 // out[3]
+
+ vexth.w.h vr20, vr5
+ vsllwil.w.h vr21, vr5, 0
+ vneg.w \out5, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w \out5, vr21, 0 // out[5]
+.endm
+
+function inv_txfm_add_adst_dct_8x16_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
+
+ vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 32 // 401
+ vldrepl.w vr21, t0, 36 // 4076
+ vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10
+ vssrarni.h.w vr10, vr0, 12 // t15a
+ vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29
+ vssrarni.h.w vr29, vr0, 12 // t8a
+
+ vldrepl.w vr20, t0, 40 // 3166 -> 1583
+ vldrepl.w vr21, t0, 44 // 2598 -> 1299
+ vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t14a
+ vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t9a
+
+ vldrepl.w vr20, t0, 48 // 1931
+ vldrepl.w vr21, t0, 52 // 3612
+ vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24
+ vssrarni.h.w vr24, vr0, 12 // t13a
+ vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25
+ vssrarni.h.w vr25, vr0, 12 // t10a
+
+ vldrepl.w vr20, t0, 56 // 3920
+ vldrepl.w vr21, t0, 60 // 1189
+ vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t12a
+ vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t11a
+
+ // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
+ vsadd.h vr28, vr29, vr31 // t8
+ vssub.h vr19, vr29, vr31 // t9
+ vssub.h vr29, vr27, vr25 // t10
+ vsadd.h vr9, vr27, vr25 // t11
+ vsadd.h vr31, vr26, vr24 // t12
+ vssub.h vr25, vr26, vr24 // t13
+ vssub.h vr27, vr10, vr30 // t14
+ vsadd.h vr24, vr10, vr30 // t15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t14a
+ vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t9a
+
+ vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
+ vneg.w vr0, vr0
+ vneg.w vr19, vr19
+ vssrarni.h.w vr19, vr0, 12 // t10a
+ vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t13a
+
+ vsadd.h vr25, vr28, vr9 // t8a
+ vssub.h vr29, vr28, vr9 // t11a
+ vssub.h vr28, vr24, vr31 // t12a
+ vsadd.h vr10, vr24, vr31 // t15a
+ vsadd.h vr9, vr30, vr19 // t9
+ vssub.h vr31, vr30, vr19 // t10
+ vssub.h vr30, vr26, vr27 // t13
+ vsadd.h vr24, vr26, vr27 // t14
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t13a
+ vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t10a
+
+ vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t12
+ vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t11
+
+ // vr11 vr12 ... vr18
+ vsadd.h vr28, vr14, vr31 // c[3]
+ vssub.h vr29, vr14, vr31 // c[12]
+ vsadd.h vr20, vr15, vr30 // c[4]
+ vssub.h vr21, vr15, vr30 // c[11]
+ vsadd.h vr14, vr16, vr27 // c[5]
+ vssub.h vr23, vr16, vr27 // c[10]
+ vsadd.h vr15, vr17, vr9 // c[6]
+ vssub.h vr30, vr17, vr9 // c[9]
+ vsadd.h vr16, vr18, vr25 // c[7]
+ vssub.h vr27, vr18, vr25 // c[8]
+ vsadd.h vr17, vr13, vr26 // c[2]
+ vssub.h vr26, vr13, vr26 // c[13]
+ vsadd.h vr18, vr12, vr24 // c[1]
+ vssub.h vr25, vr12, vr24 // c[14]
+ vsadd.h vr22, vr11, vr10 // c[0]
+ vssub.h vr24, vr11, vr10 // c[15]
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr22, vr18, vr17, vr28
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr14, vr15, vr16
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr27, vr30, vr23, vr21
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr29, vr26, vr25, vr24
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+const iadst16_coeffs, align=4
+ .word 4091, 201, 3973, 995
+ .word 3703, 1751, 3290, 2440
+ .word 2751, 3035, 2106, 3513
+ .word 1380, 3857, 601, 4052
+endconst
+
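+// One 16-point inverse ADST pass on vr0..vr15.  The three arguments are
+// optional: \transpose8x8 transposes both 8x8 output halves, \shift applies
+// a rounding right shift, and \vst stores all 16 results to [t1]; the
+// output register map is given at the end of the macro.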
+.macro adst16_core_lsx transpose8x8, shift, vst
+ la.local t0, iadst16_coeffs
+ vldrepl.w vr20, t0, 0 // 4091
+ vldrepl.w vr21, t0, 4 // 201
+
+ vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18
+ vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19
+ vssrarni.h.w vr18, vr16, 12 // t0
+ vssrarni.h.w vr19, vr17, 12 // t1
+
+ vldrepl.w vr20, t0, 8 // 3973
+ vldrepl.w vr21, t0, 12 // 995
+ vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0
+ vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15
+ vssrarni.h.w vr0, vr16, 12 // t2
+ vssrarni.h.w vr15, vr17, 12 // t3
+
+ vldrepl.w vr20, t0, 16 // 3703
+ vldrepl.w vr21, t0, 20 // 1751
+ vmul_vmadd_w vr11, vr4, vr20, vr21, vr16, vr2
+ vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13
+ vssrarni.h.w vr2, vr16, 12 // t4
+ vssrarni.h.w vr13, vr17, 12 // t5
+
+ vldrepl.w vr20, t0, 24 // 3290 -> 1645
+ vldrepl.w vr21, t0, 28 // 2440 -> 1220
+ vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4
+ vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11
+ vssrarni.h.w vr4, vr16, 12 // t6
+ vssrarni.h.w vr11, vr17, 12 // t7
+
+ vldrepl.w vr20, t0, 32 // 2751
+ vldrepl.w vr21, t0, 36 // 3035
+ vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6
+ vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9
+ vssrarni.h.w vr6, vr16, 12 // t8
+ vssrarni.h.w vr9, vr17, 12 // t9
+
+ vldrepl.w vr20, t0, 40 // 2106
+ vldrepl.w vr21, t0, 44 // 3513
+ vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7
+ vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8
+ vssrarni.h.w vr7, vr16, 12 // t10
+ vssrarni.h.w vr8, vr17, 12 // t11
+
+ vldrepl.w vr20, t0, 48 // 1380
+ vldrepl.w vr21, t0, 52 // 3857
+ vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5
+ vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10
+ vssrarni.h.w vr5, vr16, 12 // t12
+ vssrarni.h.w vr10, vr17, 12 // t13
+
+ vldrepl.w vr20, t0, 56 // 601
+ vldrepl.w vr21, t0, 60 // 4052
+ vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3
+ vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12
+ vssrarni.h.w vr3, vr16, 12 // t14
+ vssrarni.h.w vr12, vr17, 12 // t15
+
+ vsadd.h vr1, vr18, vr6 // t0a
+ vssub.h vr14, vr18, vr6 // t8a
+ vsadd.h vr16, vr19, vr9 // t1a
+ vssub.h vr17, vr19, vr9 // t9a
+ vsadd.h vr6, vr0, vr7 // t2a
+ vssub.h vr18, vr0, vr7 // t10a
+ vsadd.h vr9, vr15, vr8 // t3a
+ vssub.h vr19, vr15, vr8 // t11a
+ vsadd.h vr0, vr2, vr5 // t4a
+ vssub.h vr7, vr2, vr5 // t12a
+ vsadd.h vr8, vr13, vr10 // t5a
+ vssub.h vr15, vr13, vr10 // t13a
+ vsadd.h vr2, vr4, vr3 // t6a
+ vssub.h vr5, vr4, vr3 // t14a
+ vsadd.h vr10, vr11, vr12 // t7a
+ vssub.h vr13, vr11, vr12 // t15a
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11
+ vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12
+ vssrarni.h.w vr11, vr3, 12 // t8
+ vssrarni.h.w vr12, vr4, 12 // t9
+
+ vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14
+ vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17
+ vssrarni.h.w vr14, vr3, 12 // t13
+ vssrarni.h.w vr17, vr4, 12 // t12
+
+ vldrepl.w vr20, t0, 24 // 3406
+ vldrepl.w vr21, t0, 28 // 2276
+ vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7
+ vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15
+ vssrarni.h.w vr7, vr3, 12 // t10
+ vssrarni.h.w vr15, vr4, 12 // t11
+
+ vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18
+ vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19
+ vssrarni.h.w vr18, vr3, 12 // t15
+ vssrarni.h.w vr19, vr4, 12 // t14
+
+ vsadd.h vr5, vr1, vr0 // t0
+ vssub.h vr13, vr1, vr0 // t4
+ vsadd.h vr3, vr16, vr8 // t1
+ vssub.h vr4, vr16, vr8 // t5
+ vsadd.h vr0, vr6, vr2 // t2
+ vssub.h vr1, vr6, vr2 // t6
+ vsadd.h vr8, vr9, vr10 // t3
+ vssub.h vr16, vr9, vr10 // t7
+ vsadd.h vr2, vr11, vr17 // t8a
+ vssub.h vr6, vr11, vr17 // t12a
+ vsadd.h vr9, vr12, vr14 // t9a
+ vssub.h vr10, vr12, vr14 // t13a
+ vsadd.h vr11, vr7, vr19 // t10a
+ vssub.h vr17, vr7, vr19 // t14a
+ vsadd.h vr12, vr15, vr18 // t11a
+ vssub.h vr14, vr15, vr18 // t15a
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18
+ vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19
+ vssrarni.h.w vr18, vr7, 12 // t4a
+ vssrarni.h.w vr19, vr15, 12 // t5a
+
+ vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4
+ vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13
+ vssrarni.h.w vr4, vr7, 12 // t7a
+ vssrarni.h.w vr13, vr15, 12 // t6a
+
+ vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1
+ vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16
+ vssrarni.h.w vr1, vr7, 12 // t12
+ vssrarni.h.w vr16, vr15, 12 // t13
+
+ vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6
+ vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10
+ vssrarni.h.w vr6, vr7, 12 // t15
+ vssrarni.h.w vr10, vr15, 12 // t14
+
+ vsadd.h vr14, vr5, vr0 // out[0]
+ vssub.h vr17, vr5, vr0 // t2a
+ vssub.h vr7, vr3, vr8 // t3a
+ vsadd.h vr15, vr3, vr8 // out[15]
+ vsllwil.w.h vr22, vr15, 0
+ vexth.w.h vr15, vr15
+ vneg.w vr22, vr22
+ vneg.w vr15, vr15
+ vssrarni.h.w vr15, vr22, 0 // out[15]
+ vsadd.h vr14, vr5, vr0 // out[0]
+ vssub.h vr17, vr5, vr0 // t2a
+ vssub.h vr7, vr3, vr8 // t3a
+
+ vsadd.h vr3, vr19, vr4 // out[12]
+ vssub.h vr8, vr19, vr4 // t7
+ vssub.h vr0, vr18, vr13 // t6
+ vsadd.h vr5, vr18, vr13 // out[3]
+ vsllwil.w.h vr22, vr5, 0
+ vexth.w.h vr5, vr5
+ vneg.w vr22, vr22
+ vneg.w vr5, vr5
+ vssrarni.h.w vr5, vr22, 0 // out[3]
+
+ vsadd.h vr13, vr9, vr12 // out[14]
+ vssub.h vr19, vr9, vr12 // t11
+ vssub.h vr4, vr2, vr11 // t10
+ vsadd.h vr18, vr2, vr11 // out[1]
+ vsllwil.w.h vr22, vr18, 0
+ vexth.w.h vr18, vr18
+ vneg.w vr22, vr22
+ vneg.w vr18, vr18
+ vssrarni.h.w vr18, vr22, 0 // out[1]
+
+ vsadd.h vr2, vr1, vr10 // out[2]
+ vssub.h vr11, vr1, vr10 // t14a
+ vssub.h vr12, vr16, vr6 // t15a
+ vsadd.h vr9, vr16, vr6 // out[13]
+ vsllwil.w.h vr22, vr9, 0
+ vexth.w.h vr9, vr9
+ vneg.w vr22, vr22
+ vneg.w vr9, vr9
+ vssrarni.h.w vr9, vr22, 0 // out[13]
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10
+ vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1
+ vssrarni.h.w vr10, vr6, 12 // out[7]
+
+ vsllwil.w.h vr7, vr10, 0
+ vexth.w.h vr10, vr10
+ vneg.w vr7, vr7
+ vneg.w vr10, vr10
+ vssrarni.h.w vr10, vr7, 0
+ vssrarni.h.w vr1, vr16, 12 // out[8]
+
+ vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17
+ vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7
+ vssrarni.h.w vr17, vr16, 12 // out[11]
+
+ vsllwil.w.h vr0, vr17, 0
+ vexth.w.h vr17, vr17
+ vneg.w vr0, vr0
+ vneg.w vr17, vr17
+ vssrarni.h.w vr17, vr0, 0
+ vssrarni.h.w vr7, vr6, 12 // out[4]
+
+ vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0
+ vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8
+ vssrarni.h.w vr0, vr16, 12 // out[9]
+
+ vsllwil.w.h vr4, vr0, 0
+ vexth.w.h vr0, vr0
+ vneg.w vr4, vr4
+ vneg.w vr0, vr0
+ vssrarni.h.w vr0, vr4, 0
+ vssrarni.h.w vr8, vr6, 12 // out[6]
+
+ vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4
+ vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19
+ vssrarni.h.w vr4, vr6, 12 // out[5]
+
+ vsllwil.w.h vr24, vr4, 0
+ vexth.w.h vr4, vr4
+ vneg.w vr24, vr24
+ vneg.w vr4, vr4
+ vssrarni.h.w vr4, vr24, 0
+ vssrarni.h.w vr19, vr16, 12 // out[10]
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
+ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+.ifnb \vst
+ vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+.endif
+// out0 out1 out2 out3 out4 out5 out6 out7
+// vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+// out8 out9 out10 out11 out12 out13 out14 out15
+// vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+.endm // adst16_core_lsx
+
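+// Add-and-store for eight 8-pixel rows: load the destination rows through
+// t2/t3 (stride a1), widen to 16 bit, add the >>4-rounded residuals
+// \in0..\in7, clip back to 8 bit and store through t4/t5.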
+.macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7
+ fld.d f20, t2, 0
+ fldx.d f21, t2, a1
+ fld.d f22, t3, 0
+ fldx.d f23, t3, a1
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ fld.d f24, t2, 0
+ fldx.d f25, t2, a1
+ fld.d f26, t3, 0
+ fldx.d f27, t3, a1
+
+.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
+ vsllwil.hu.bu \i, \i, 0
+.endr
+
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vsrari.h \i, \i, 4
+.endr
+
+ vadd.h vr20, vr20, \in0
+ vadd.h vr21, vr21, \in1
+ vadd.h vr22, vr22, \in2
+ vadd.h vr23, vr23, \in3
+ vadd.h vr24, vr24, \in4
+ vadd.h vr25, vr25, \in5
+ vadd.h vr26, vr26, \in6
+ vadd.h vr27, vr27, \in7
+
+ vssrani.bu.h vr21, vr20, 0
+ vssrani.bu.h vr23, vr22, 0
+ vssrani.bu.h vr25, vr24, 0
+ vssrani.bu.h vr27, vr26, 0
+
+ vstelm.d vr21, t4, 0, 0
+ vstelm.d vr21, t5, 0, 1
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+ vstelm.d vr23, t4, 0, 0
+ vstelm.d vr23, t5, 0, 1
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+ vstelm.d vr25, t4, 0, 0
+ vstelm.d vr25, t5, 0, 1
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+ vstelm.d vr27, t4, 0, 0
+ vstelm.d vr27, t5, 0, 1
+
+.endm // adst16_core_finish_lsx
+
+function inv_txfm_add_dct_adst_8x16_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
+
+ vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31
+
+ adst16_core_lsx , ,
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
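+// Stack helpers: malloc_space reserves \number bytes of scratch plus 64
+// bytes used to spill the callee-saved f24-f31; free_space reloads them and
+// releases the same amount.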
+.macro malloc_space number
+ li.w t0, \number
+ sub.d sp, sp, t0
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+.endm
+
+.macro free_space number
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ li.w t0, \number
+ add.d sp, sp, t0
+ addi.d sp, sp, 64
+.endm
+
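+// Add a 16-pixel-wide residual to four destination rows: \in0..\in3 are the
+// already-loaded rows, \in4..\in11 the low/high int16 halves of each row's
+// residual.  Results are clipped to 8 bit and stored at a0, a0+a1, t2 and
+// t2+a1.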
+.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
+ vsllwil.hu.bu vr10, \in0, 0
+ vexth.hu.bu vr0, \in0
+ vsllwil.hu.bu vr11, \in1, 0
+ vexth.hu.bu vr1, \in1
+ vsllwil.hu.bu vr12, \in2, 0
+ vexth.hu.bu vr2, \in2
+ vsllwil.hu.bu vr13, \in3, 0
+ vexth.hu.bu vr3, \in3
+ vadd.h vr10, vr10, \in4
+ vadd.h vr0, vr0, \in5
+ vadd.h vr11, vr11, \in6
+ vadd.h vr1, vr1, \in7
+ vadd.h vr12, vr12, \in8
+ vadd.h vr2, vr2, \in9
+ vadd.h vr13, vr13, \in10
+ vadd.h vr3, vr3, \in11
+ vssrani.bu.h vr0, vr10, 0
+ vssrani.bu.h vr1, vr11, 0
+ vssrani.bu.h vr2, vr12, 0
+ vssrani.bu.h vr3, vr13, 0
+ vst vr0, a0, 0
+ vstx vr1, a0, a1
+ vst vr2, t2, 0
+ vstx vr3, t2, a1
+.endm
+
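+// Same as DST_ADD_W16 but loads the four destination rows itself; the
+// optional \shift first applies a rounding right shift to the residuals.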
+.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift
+
+.ifnb \shift
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vld vr2, t2, 0
+ vldx vr3, t2, a1
+ DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
+ \in4, \in5, \in6, \in7
+.endm
+
+function inv_txfm_add_dct_dct_16x8_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_16x8
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ alsl.d t2, a1, a0, 1
+ vmul.w vr2, vr2, vr0
+ vldx vr1, a0, a1
+ vsrari.w vr2, vr2, 8
+ vldx vr3, t2, a1
+ vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
+ vmadd.w vr5, vr2, vr0
+ vld vr0, a0, 0
+ vssrarni.h.w vr5, vr5, 12
+ vld vr2, t2, 0
+
+ DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ b .DCT_DCT_16x8_END
+
+.NO_HAS_DCONLY_16x8:
+ malloc_space 512
+
+ vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ la.local t0, idct_coeffs
+
+    vldrepl.w     vr23,   t0,     0    // 2896
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ rect2_lsx \i, vr23, \i
+.endr
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
+ vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 1
+.endr
+
+ vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2
+
+ dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
+ vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4
+
+ free_space 512
+
+.DCT_DCT_16x8_END:
+
+endfunc
+
+function inv_txfm_add_adst_dct_16x8_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ la.local t0, idct_coeffs
+
+    vldrepl.w     vr23,   t0,     0    // 2896
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+ rect2_lsx \i, vr23, \i
+.endr
+
+ adst16_core_lsx , 1,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
+ vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ dct_8x8_core_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
+ vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2
+
+ dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
+ vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, vr24, 4
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
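+// 16x16: DCT+DCT. a0 = dst, a1 = stride, a2 = coeff; a3 == 0 selects the DC-only
+// path (dc * 181 scaling applied twice, then broadcast-added to the whole block).
+// The full path transforms each 8-lane half, rounds by 2 into the stack buffer,
+// then runs the second pass and adds the result to dst.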
+function inv_txfm_add_dct_dct_16x16_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_16x16
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ alsl.d t2, a1, a0, 1
+ vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift
+ vldx vr1, a0, a1
+ vmadd.w vr5, vr2, vr0
+ vldx vr3, t2, a1
+ vssrarni.h.w vr5, vr5, 12
+ vld vr0, a0, 0
+ vld vr2, t2, 0
+
+ DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ b .DCT_DCT_16x16_END
+
+.NO_HAS_DCONLY_16x16:
+
+ malloc_space 512
+
+ vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
+ vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 64
+ vld vr5, sp, 80
+ vld vr6, sp, 96
+ vld vr7, sp, 112
+ VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 128
+ vld vr5, sp, 144
+ vld vr6, sp, 160
+ vld vr7, sp, 176
+ VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 320
+ vld vr5, sp, 336
+ vld vr6, sp, 352
+ vld vr7, sp, 368
+ VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 384
+ vld vr5, sp, 400
+ vld vr6, sp, 416
+ vld vr7, sp, 432
+ VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4
+
+ free_space 512
+
+.DCT_DCT_16x16_END:
+endfunc
+
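+// 16x16: ADST+ADST. Both passes use adst16_core_lsx; the first pass transposes,
+// rounds by 2 and spills to the stack, the second pass adds straight to dst
+// through adst16_core_finish_lsx, 8 pixels at a time.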
+function inv_txfm_add_adst_adst_16x16_8bpc_lsx
+
+ malloc_space 256+256
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ addi.d t2, a2, 16
+ addi.d t1, t1, 256
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr23, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ addi.d a0, a0, 8
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ free_space 256+256
+endfunc
+
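+// 16x16: ADST+DCT. First pass adst16_core_lsx (transpose, round by 2, spill to the
+// stack), second pass dct_8x16_core_lsx, result added to dst via VLD_DST_ADD_W16.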
+function inv_txfm_add_adst_dct_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ addi.d t2, a2, 16
+ addi.d t1, t1, 256
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr23, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
+ vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 64
+ vld vr5, sp, 80
+ vld vr6, sp, 96
+ vld vr7, sp, 112
+ VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 128
+ vld vr5, sp, 144
+ vld vr6, sp, 160
+ vld vr7, sp, 176
+ VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 320
+ vld vr5, sp, 336
+ vld vr6, sp, 352
+ vld vr7, sp, 368
+ VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 384
+ vld vr5, sp, 400
+ vld vr6, sp, 416
+ vld vr7, sp, 432
+ VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4
+
+ free_space 256+256
+endfunc
+
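+// 16x16: DCT+ADST. First pass dct_8x16_core_lsx (round by 2, spill to the stack),
+// second pass adst16_core_lsx finished through adst16_core_finish_lsx.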
+function inv_txfm_add_dct_adst_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ addi.d a0, a0, 8
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ free_space 256+256
+endfunc
+
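+// Byte-shuffle mask that reverses the eight 16-bit lanes of an LSX vector; the
+// flipadst variants below use it to flip the output order.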
+const shufb
+ .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+endconst
+
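+// 16x16: FLIPADST+DCT. Same structure as adst_dct_16x16, but the second-pass
+// outputs are lane-reversed with the shufb mask before being added to dst, which
+// realizes the flip of the flipadst half.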
+function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ addi.d t2, a2, 16
+ addi.d t1, t1, 256
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr23, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ la.local t0, shufb
+ vld vr0, t0, 0
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vshuf.b \i, \i, \i, vr0
+.endr
+
+ vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
+ vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ la.local t0, shufb
+ vld vr0, t0, 0
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vshuf.b \i, \i, \i, vr0
+.endr
+
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 64
+ vld vr5, sp, 80
+ vld vr6, sp, 96
+ vld vr7, sp, 112
+ VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 128
+ vld vr5, sp, 144
+ vld vr6, sp, 160
+ vld vr7, sp, 176
+ VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 320
+ vld vr5, sp, 336
+ vld vr6, sp, 352
+ vld vr7, sp, 368
+ VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 384
+ vld vr5, sp, 400
+ vld vr6, sp, 416
+ vld vr7, sp, 432
+ VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4
+
+ free_space 256+256
+endfunc
+
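+// 16x16: DCT+FLIPADST. Same structure as dct_adst_16x16; the flipped output order
+// is handled by the reversed register lists passed to adst16_core_finish_lsx
+// together with the shufb mask loaded into vr31.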
+function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ la.local t0, shufb
+ vld vr31, t0, 0
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ addi.d a0, a0, 8
+
+ la.local t0, shufb
+ vld vr31, t0, 0
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14
+
+ free_space 256+256
+
+endfunc
+
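+// 8x32: DCT+DCT. a3 == 0 takes the DC-only fast path. Otherwise the first pass runs
+// four 8x8 DCTs (transpose, round by 2, spill to the stack) and the second pass is a
+// 32-point DCT built from dct_8x16_core_lsx (even half) plus the inline t16..t31
+// odd-half butterflies, with a final rounding shift of 4 before the add.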
+function inv_txfm_add_dct_dct_8x32_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x32
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
+ vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift
+ vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+.rept 7
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+.endr
+
+ b .DCT_DCT_8X32_END
+
+.NO_HAS_DCONLY_8x32:
+ malloc_space 512
+
+ vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+ addi.d t3, sp, 64
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ // in1 in3 in5 in7 in9 in11 in13 in15
+ // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ // in17 in19 in21 in23 in25 in27 in29 in31
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 64 // 201
+ vldrepl.w vr21, t0, 68 // 4091
+
+ vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
+ vssrarni.h.w vr9, vr8, 12 // t31a
+ vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
+ vssrarni.h.w vr10, vr11, 12 // t16a
+
+ vldrepl.w vr20, t0, 72 // 3035
+ vldrepl.w vr21, t0, 76 // 2751
+ vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0
+ vssrarni.h.w vr0, vr11, 12 // t30a
+ vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
+ vssrarni.h.w vr30, vr11, 12 // t17a
+
+ vldrepl.w vr20, t0, 80 // 1751
+ vldrepl.w vr21, t0, 84 // 3703
+ vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
+ vssrarni.h.w vr7, vr8, 12 // t29a
+ vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19
+ vssrarni.h.w vr19, vr8, 12 // t18a
+
+ vldrepl.w vr20, t0, 88 // 3857
+ vldrepl.w vr21, t0, 92 // 1380
+ vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
+ vssrarni.h.w vr4, vr8, 12 // t28a
+ vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26
+ vssrarni.h.w vr26, vr8, 12 // t19a
+
+ vldrepl.w vr20, t0, 96 // 995
+ vldrepl.w vr21, t0, 100 // 3973
+ vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
+ vssrarni.h.w vr3, vr8, 12 // t27a
+ vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27
+ vssrarni.h.w vr27, vr8, 12 // t20a
+
+ vldrepl.w vr20, t0, 104 // 3513
+ vldrepl.w vr21, t0, 108 // 2106
+ vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
+ vssrarni.h.w vr2, vr8, 12 // t26a
+ vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28
+ vssrarni.h.w vr28, vr8, 12 // t21a
+
+ vldrepl.w vr20, t0, 112 // 2440 -> 1220
+ vldrepl.w vr21, t0, 116 // 3290 -> 1645
+ vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
+ vssrarni.h.w vr5, vr8, 12 // t25a
+ vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25
+ vssrarni.h.w vr25, vr8, 12 // t22a
+
+ vldrepl.w vr20, t0, 120 // 4052
+ vldrepl.w vr21, t0, 124 // 601
+ vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
+ vssrarni.h.w vr6, vr8, 12 // t24a
+ vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24
+ vssrarni.h.w vr24, vr8, 12 // t23a
+
+ vsadd.h vr1, vr10, vr30 // t16
+ vssub.h vr29, vr10, vr30 // t17
+ vssub.h vr8, vr26, vr19 // t18
+ vsadd.h vr31, vr26, vr19 // t19
+ vsadd.h vr10, vr27, vr28 // t20
+ vssub.h vr30, vr27, vr28 // t21
+ vssub.h vr19, vr24, vr25 // t22
+ vsadd.h vr26, vr24, vr25 // t23
+ vsadd.h vr27, vr6, vr5 // t24
+ vssub.h vr28, vr6, vr5 // t25
+ vssub.h vr24, vr3, vr2 // t26
+ vsadd.h vr25, vr3, vr2 // t27
+ vsadd.h vr5, vr4, vr7 // t28
+ vssub.h vr6, vr4, vr7 // t29
+ vssub.h vr2, vr9, vr0 // t30
+ vsadd.h vr3, vr9, vr0 // t31
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
+ vssrarni.h.w vr7, vr4, 12 // t30a
+ vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0
+ vssrarni.h.w vr0, vr4, 12 // t17a
+ vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
+ vneg.w vr4, vr4
+ vneg.w vr9, vr9
+ vssrarni.h.w vr9, vr4, 12 // t18a
+ vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2
+ vssrarni.h.w vr2, vr4, 12 // t29a
+
+ vldrepl.w vr20, t0, 24 // 3406 -> 1703
+ vldrepl.w vr21, t0, 28 // 2276 -> 1138
+ vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
+ vssrarni.h.w vr29, vr4, 12 // t26a
+ vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6
+ vssrarni.h.w vr6, vr4, 12 // t21a
+
+ vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
+ vneg.w vr4, vr4
+ vneg.w vr8, vr8
+ vssrarni.h.w vr8, vr4, 12 // t22a
+ vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24
+ vssrarni.h.w vr24, vr4, 12 // t25a
+
+ vsadd.h vr4, vr1, vr31 // t16a
+ vssub.h vr30, vr1, vr31 // t19a
+ vsadd.h vr19, vr0, vr9 // t17
+ vssub.h vr28, vr0, vr9 // t18
+ vssub.h vr1, vr26, vr10 // t20a
+ vsadd.h vr31, vr26, vr10 // t23a
+ vssub.h vr0, vr8, vr6 // t21
+ vsadd.h vr9, vr8, vr6 // t22
+ vsadd.h vr10, vr27, vr25 // t24a
+ vssub.h vr26, vr27, vr25 // t27a
+ vsadd.h vr6, vr24, vr29 // t25
+ vssub.h vr8, vr24, vr29 // t26
+ vssub.h vr25, vr3, vr5 // t28a
+ vsadd.h vr27, vr3, vr5 // t31a
+ vssub.h vr24, vr7, vr2 // t29
+ vsadd.h vr29, vr7, vr2 // t30
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
+ vssrarni.h.w vr5, vr3, 12 // t29a
+ vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2
+    vssrarni.h.w vr2, vr3, 12 // t18a
+
+ vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
+ vssrarni.h.w vr7, vr3, 12 // t28
+ vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24
+ vssrarni.h.w vr24, vr3, 12 // t19
+
+ vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
+ vneg.w vr3, vr3
+ vneg.w vr28, vr28
+ vssrarni.h.w vr28, vr3, 12 // t20
+ vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25
+ vssrarni.h.w vr25, vr3, 12 // t27
+
+ vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
+ vneg.w vr3, vr3
+ vneg.w vr30, vr30
+ vssrarni.h.w vr30, vr3, 12 // t21a
+ vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1
+ vssrarni.h.w vr1, vr3, 12 // t26a
+
+ vsadd.h vr3, vr4, vr31 // t16
+ vssub.h vr26, vr4, vr31 // t23
+ vsadd.h vr0, vr19, vr9 // t17a
+ vssub.h vr8, vr19, vr9 // t22a
+ vsadd.h vr4, vr2, vr30 // t18
+ vssub.h vr31, vr2, vr30 // t21
+ vsadd.h vr9, vr24, vr28 // t19a
+ vssub.h vr19, vr24, vr28 // t20a
+ vssub.h vr2, vr27, vr10 // t24
+ vsadd.h vr30, vr27, vr10 // t31
+ vssub.h vr24, vr29, vr6 // t25a
+ vsadd.h vr28, vr29, vr6 // t30a
+ vssub.h vr10, vr5, vr1 // t26
+ vsadd.h vr27, vr5, vr1 // t29
+ vssub.h vr6, vr7, vr25 // t27a
+ vsadd.h vr29, vr7, vr25 // t28a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
+ vssrarni.h.w vr5, vr1, 12 // t20
+ vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7
+ vssrarni.h.w vr7, vr1, 12 // t27
+
+ vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
+ vssrarni.h.w vr25, vr1, 12 // t21a
+ vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6
+ vssrarni.h.w vr6, vr1, 12 // t26a
+
+ vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
+ vssrarni.h.w vr19, vr1, 12 // t22
+ vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10
+ vssrarni.h.w vr10, vr1, 12 // t25
+
+ vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
+ vssrarni.h.w vr31, vr1, 12 // t23a
+ vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8
+ vssrarni.h.w vr8, vr1, 12 // t24a
+
+ // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
+ // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
+
+ vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr30 // c[0]
+ vssub.h vr2, vr11, vr30 // c[31]
+ vsadd.h vr24, vr12, vr28 // c[1]
+ vssub.h vr26, vr12, vr28 // c[30]
+ vsadd.h vr11, vr13, vr27 // c[2]
+ vssub.h vr30, vr13, vr27 // c[29]
+ vsadd.h vr12, vr14, vr29 // c[3]
+ vssub.h vr28, vr14, vr29 // c[28]
+ vsadd.h vr13, vr15, vr7 // c[4]
+ vssub.h vr27, vr15, vr7 // c[27]
+ vsadd.h vr14, vr16, vr6 // c[5]
+ vssub.h vr29, vr16, vr6 // c[26]
+ vsadd.h vr7, vr17, vr10 // c[6]
+ vssub.h vr15, vr17, vr10 // c[25]
+ vsadd.h vr6, vr18, vr8 // c[7]
+ vssub.h vr16, vr18, vr8 // c[24]
+
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, 4
+.endr
+
+ vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ vld_x8 t3, 256, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr31 // c[8]
+ vssub.h vr2, vr11, vr31 // c[23]
+ vsadd.h vr24, vr12, vr19 // c[9]
+ vssub.h vr26, vr12, vr19 // c[22]
+ vsadd.h vr11, vr13, vr25 // c[10]
+ vssub.h vr30, vr13, vr25 // c[21]
+ vsadd.h vr12, vr14, vr5 // c[11]
+ vssub.h vr28, vr14, vr5 // c[20]
+ vsadd.h vr13, vr15, vr9 // c[12]
+ vssub.h vr27, vr15, vr9 // c[19]
+ vsadd.h vr14, vr16, vr4 // c[13]
+ vssub.h vr29, vr16, vr4 // c[18]
+ vsadd.h vr7, vr17, vr0 // c[14]
+ vssub.h vr15, vr17, vr0 // c[17]
+ vsadd.h vr6, vr18, vr3 // c[15]
+ vssub.h vr16, vr18, vr3 // c[16]
+
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, 4
+.endr
+
+ vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ alsl.d t2, a1, a0, 1
+ addi.d t3, sp, 64
+
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+256
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, t3, 64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+384
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, t3, 64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+128
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, t3, 64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ free_space 512
+.DCT_DCT_8X32_END:
+endfunc
+
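+// Odd half (t16..t31) of a 32-point DCT. The even-half results (t0..t15 from
+// dct_8x16_core_lsx) are expected at \in2; the combined outputs are stored to \in1
+// at \vst_start0..3, with an optional 8x8 transpose and rounding \shift per store.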
+.macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \
+ vst_start3, transpose8x8, shift
+
+ // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ // in1 in3 in5 in7 in9 in11 in13 in15
+ // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ // in17 in19 in21 in23 in25 in27 in29 in31
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 64 // 201
+ vldrepl.w vr21, t0, 68 // 4091
+
+ vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
+ vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
+ vssrarni.h.w vr9, vr8, 12 // t31a
+ vssrarni.h.w vr10, vr11, 12 // t16a
+
+ vldrepl.w vr20, t0, 72 // 3035
+ vldrepl.w vr21, t0, 76 // 2751
+ vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
+ vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
+ vssrarni.h.w vr0, vr8, 12 // t30a
+ vssrarni.h.w vr30, vr11, 12 // t17a
+
+ vldrepl.w vr20, t0, 80 // 1751
+ vldrepl.w vr21, t0, 84 // 3703
+ vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
+ vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
+ vssrarni.h.w vr7, vr8, 12 // t29a
+ vssrarni.h.w vr19, vr11, 12 // t18a
+
+ vldrepl.w vr20, t0, 88 // 3857
+ vldrepl.w vr21, t0, 92 // 1380
+ vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
+ vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
+ vssrarni.h.w vr4, vr8, 12 // t28a
+ vssrarni.h.w vr26, vr11, 12 // t19a
+
+ vldrepl.w vr20, t0, 96 // 995
+ vldrepl.w vr21, t0, 100 // 3973
+ vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
+ vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27
+ vssrarni.h.w vr3, vr8, 12 // t27a
+ vssrarni.h.w vr27, vr11, 12 // t20a
+
+ vldrepl.w vr20, t0, 104 // 3513
+ vldrepl.w vr21, t0, 108 // 2106
+ vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
+ vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28
+ vssrarni.h.w vr2, vr8, 12 // t26a
+ vssrarni.h.w vr28, vr11, 12 // t21a
+
+ vldrepl.w vr20, t0, 112 // 2440 -> 1220
+ vldrepl.w vr21, t0, 116 // 3290 -> 1645
+ vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
+ vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25
+ vssrarni.h.w vr5, vr8, 12 // t25a
+ vssrarni.h.w vr25, vr11, 12 // t22a
+
+ vldrepl.w vr20, t0, 120 // 4052
+ vldrepl.w vr21, t0, 124 // 601
+ vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
+ vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr6, vr8, 12 // t24a
+ vssrarni.h.w vr24, vr11, 12 // t23a
+
+ vsadd.h vr1, vr10, vr30 // t16
+ vssub.h vr29, vr10, vr30 // t17
+ vssub.h vr8, vr26, vr19 // t18
+ vsadd.h vr31, vr26, vr19 // t19
+ vsadd.h vr10, vr27, vr28 // t20
+ vssub.h vr30, vr27, vr28 // t21
+ vssub.h vr19, vr24, vr25 // t22
+ vsadd.h vr26, vr24, vr25 // t23
+ vsadd.h vr27, vr6, vr5 // t24
+ vssub.h vr28, vr6, vr5 // t25
+ vssub.h vr24, vr3, vr2 // t26
+ vsadd.h vr25, vr3, vr2 // t27
+ vsadd.h vr5, vr4, vr7 // t28
+ vssub.h vr6, vr4, vr7 // t29
+ vssub.h vr2, vr9, vr0 // t30
+ vsadd.h vr3, vr9, vr0 // t31
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
+ vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
+ vssrarni.h.w vr7, vr4, 12 // t30a
+ vssrarni.h.w vr0, vr11, 12 // t17a
+ vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
+ vneg.w vr4, vr4
+ vneg.w vr9, vr9
+ vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr9, vr4, 12 // t18a
+ vssrarni.h.w vr2, vr11, 12 // t29a
+
+ vldrepl.w vr20, t0, 24 // 3406 -> 1703
+ vldrepl.w vr21, t0, 28 // 2276 -> 1138
+ vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
+ vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
+ vssrarni.h.w vr29, vr4, 12 // t26a
+ vssrarni.h.w vr6, vr11, 12 // t21a
+
+ vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
+ vneg.w vr4, vr4
+ vneg.w vr8, vr8
+ vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr8, vr4, 12 // t22a
+ vssrarni.h.w vr24, vr11, 12 // t25a
+
+ vsadd.h vr4, vr1, vr31 // t16a
+ vssub.h vr30, vr1, vr31 // t19a
+ vsadd.h vr19, vr0, vr9 // t17
+ vssub.h vr28, vr0, vr9 // t18
+ vssub.h vr1, vr26, vr10 // t20a
+ vsadd.h vr31, vr26, vr10 // t23a
+ vssub.h vr0, vr8, vr6 // t21
+ vsadd.h vr9, vr8, vr6 // t22
+ vsadd.h vr10, vr27, vr25 // t24a
+ vssub.h vr26, vr27, vr25 // t27a
+ vsadd.h vr6, vr24, vr29 // t25
+ vssub.h vr8, vr24, vr29 // t26
+ vssub.h vr25, vr3, vr5 // t28a
+ vsadd.h vr27, vr3, vr5 // t31a
+ vssub.h vr24, vr7, vr2 // t29
+ vsadd.h vr29, vr7, vr2 // t30
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
+ vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr5, vr3, 12 // t29a
+    vssrarni.h.w vr2, vr11, 12 // t18a
+
+ vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
+ vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr7, vr3, 12 // t28
+ vssrarni.h.w vr24, vr11, 12 // t19
+
+ vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
+ vneg.w vr3, vr3
+ vneg.w vr28, vr28
+ vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
+ vssrarni.h.w vr28, vr3, 12 // t20
+ vssrarni.h.w vr25, vr11, 12 // t27
+
+ vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
+ vneg.w vr3, vr3
+ vneg.w vr30, vr30
+ vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
+ vssrarni.h.w vr30, vr3, 12 // t21a
+ vssrarni.h.w vr1, vr11, 12 // t26a
+
+ vsadd.h vr3, vr4, vr31 // t16
+ vssub.h vr26, vr4, vr31 // t23
+ vsadd.h vr0, vr19, vr9 // t17a
+ vssub.h vr8, vr19, vr9 // t22a
+ vsadd.h vr4, vr2, vr30 // t18
+ vssub.h vr31, vr2, vr30 // t21
+ vsadd.h vr9, vr24, vr28 // t19a
+ vssub.h vr19, vr24, vr28 // t20a
+ vssub.h vr2, vr27, vr10 // t24
+ vsadd.h vr30, vr27, vr10 // t31
+ vssub.h vr24, vr29, vr6 // t25a
+ vsadd.h vr28, vr29, vr6 // t30a
+ vssub.h vr10, vr5, vr1 // t26
+ vsadd.h vr27, vr5, vr1 // t29
+ vssub.h vr6, vr7, vr25 // t27a
+ vsadd.h vr29, vr7, vr25 // t28a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
+ vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
+ vssrarni.h.w vr5, vr1, 12 // t20
+ vssrarni.h.w vr7, vr11, 12 // t27
+
+ vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
+ vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
+ vssrarni.h.w vr25, vr1, 12 // t21a
+ vssrarni.h.w vr6, vr11, 12 // t26a
+
+ vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
+ vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
+ vssrarni.h.w vr19, vr1, 12 // t22
+ vssrarni.h.w vr10, vr11, 12 // t25
+
+ vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
+ vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
+ vssrarni.h.w vr31, vr1, 12 // t23a
+ vssrarni.h.w vr8, vr11, 12 // t24a
+
+ // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
+ // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
+
+ vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr30 // c[0]
+ vssub.h vr2, vr11, vr30 // c[31]
+ vsadd.h vr24, vr12, vr28 // c[1]
+ vssub.h vr26, vr12, vr28 // c[30]
+ vsadd.h vr11, vr13, vr27 // c[2]
+ vssub.h vr30, vr13, vr27 // c[29]
+ vsadd.h vr12, vr14, vr29 // c[3]
+ vssub.h vr28, vr14, vr29 // c[28]
+ vsadd.h vr13, vr15, vr7 // c[4]
+ vssub.h vr27, vr15, vr7 // c[27]
+ vsadd.h vr14, vr16, vr6 // c[5]
+ vssub.h vr29, vr16, vr6 // c[26]
+ vsadd.h vr7, vr17, vr10 // c[6]
+ vssub.h vr15, vr17, vr10 // c[25]
+ vsadd.h vr6, vr18, vr8 // c[7]
+ vssub.h vr16, vr18, vr8 // c[24]
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr31 // c[8]
+ vssub.h vr2, vr11, vr31 // c[23]
+ vsadd.h vr24, vr12, vr19 // c[9]
+ vssub.h vr26, vr12, vr19 // c[22]
+ vsadd.h vr11, vr13, vr25 // c[10]
+ vssub.h vr30, vr13, vr25 // c[21]
+ vsadd.h vr12, vr14, vr5 // c[11]
+ vssub.h vr28, vr14, vr5 // c[20]
+ vsadd.h vr13, vr15, vr9 // c[12]
+ vssub.h vr27, vr15, vr9 // c[19]
+ vsadd.h vr14, vr16, vr4 // c[13]
+ vssub.h vr29, vr16, vr4 // c[18]
+ vsadd.h vr7, vr17, vr0 // c[14]
+ vssub.h vr15, vr17, vr0 // c[17]
+ vsadd.h vr6, vr18, vr3 // c[15]
+ vssub.h vr16, vr18, vr3 // c[16]
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+.endm
+
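+// 32x32: DCT+DCT. The DC-only path broadcast-adds the scaled dc to the whole block,
+// 32 pixels per row. The full path uses a 2560-byte stack buffer: the first pass
+// runs the 32-point DCT on four 8-lane strips (transpose, round by 2), the second
+// pass repeats it in the other direction (round by 4), and the .rept 16 loop adds
+// the result to dst.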
+function inv_txfm_add_dct_dct_32x32_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_32x32
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr20, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ add.d t0, a0, a1
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr3, t0, 16
+ vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift
+ vld vr1, a0, 16
+ vmadd.w vr20, vr2, vr0
+ vld vr2, t0, 0
+ vssrarni.h.w vr20, vr20, 12
+ vld vr0, a0, 0
+
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, t0, 0
+ vst vr15, t0, 16
+
+.rept 15
+ alsl.d a0, a1, a0, 1
+ add.d t0, a0, a1
+
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vld vr2, t0, 0
+ vld vr3, t0, 16
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, t0, 0
+ vst vr15, t0, 16
+.endr
+
+ b .DCT_DCT_32X32_END
+.NO_HAS_DCONLY_32x32:
+
+ malloc_space 2560 // 32*32*2+512
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+ addi.d t3, sp, 1024
+ addi.d t3, t3, 1024
+ addi.d t3, t3, 64
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2
+
+.rept 3
+ addi.d t2, t2, 16
+ addi.d t1, t1, 512
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2
+.endr
+
+ vreplgr2vr.h vr31, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+        464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, \
+        688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, \
+        912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, \
+        1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, \
+        1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, \
+        1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, \
+        1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, \
+        1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+ addi.d t1, sp, 64
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4
+
+.rept 3
+ addi.d t2, t2, 16
+ addi.d t1, t1, 16
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4
+.endr
+
+ addi.d t2, sp, 64
+
+.rept 16
+ add.d t0, a0, a1
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vld vr2, t0, 0
+ vld vr3, t0, 16
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+ vadd.h vr8, vr4, vr8
+ vadd.h vr9, vr0, vr9
+ vadd.h vr10, vr5, vr10
+ vadd.h vr11, vr1, vr11
+ vadd.h vr12, vr6, vr12
+ vadd.h vr13, vr2, vr13
+ vadd.h vr14, vr7, vr14
+ vadd.h vr15, vr3, vr15
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, t0, 0
+ vst vr15, t0, 16
+
+ alsl.d a0, a1, a0, 1
+ addi.d t2, t2, 128
+.endr
+
+ free_space 2560 // 32*32*2+512
+
+.DCT_DCT_32X32_END:
+endfunc
+
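+// 8-point DCT for the 64-point transforms: only \in0..\in3 carry data (the upper
+// inputs are treated as zero), so each butterfly collapses to a single widening
+// multiply by the corresponding cosine constant.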
+.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7
+
+ // in0 in1 in2 in3
+ // dct4 in0 in2
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vsllwil.w.h vr22, \in2, 0
+ vexth.w.h vr23, \in2
+ vmul.w vr8, vr22, vr20
+ vmul.w vr10, vr23, vr20
+ vmul.w \in2, vr22, vr21
+ vmul.w vr9, vr23, vr21
+ vssrarni.h.w vr10, vr8, 12 // t2
+ vssrarni.h.w vr9, \in2, 12 // t3
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w vr8, vr22, vr20
+ vmul.w \in2, vr23, vr20
+ vssrarni.h.w \in2, vr8, 12
+
+ vsadd.h vr8, \in2, vr9 // c[0]
+ vssub.h vr9, \in2, vr9 // c[3]
+ vsadd.h \in0, \in2, vr10 // c[1]
+ vssub.h vr10, \in2, vr10 // c[2]
+
+ // inv_dct8_1d_internal_c tx64
+ // in1 in3
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+
+ vsllwil.w.h vr22, \in1, 0
+ vexth.w.h vr23, \in1
+ vmul.w \in2, vr22, vr21
+ vmul.w \in4, vr23, vr21
+ vmul.w \in1, vr22, vr20
+ vmul.w \in6, vr23, vr20
+ vssrarni.h.w \in4, \in2, 12 // t7a
+ vssrarni.h.w \in6, \in1, 12 // t4a
+
+ vldrepl.w vr20, t0, 24 // 3406
+ vldrepl.w vr21, t0, 28 // 2276
+
+ vsllwil.w.h vr22, \in3, 0
+ vexth.w.h vr23, \in3
+ vneg.w vr21, vr21
+ vmul.w \in2, vr22, vr20
+ vmul.w \in1, vr23, vr20
+ vmul.w \in3, vr22, vr21
+ vmul.w \in7, vr23, vr21
+ vssrarni.h.w \in1, \in2, 12 // t6a
+ vssrarni.h.w \in7, \in3, 12 // t5a
+
+ vsadd.h \in3, \in6, \in7 // t4
+ vssub.h \in6, \in6, \in7 // t5a
+ vsadd.h \in5, \in4, \in1 // t7
+ vssub.h \in4, \in4, \in1 // t6a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1
+ vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7
+ vssrarni.h.w \in1, vr21, 12 // t6
+ vssrarni.h.w \in7, \in2, 12 // t5
+
+ vsadd.h \out0, vr8, \in5 // c[0]
+ vssub.h \out7, vr8, \in5 // c[7]
+ vsadd.h \out1, \in0, \in1 // c[1]
+ vssub.h \out6, \in0, \in1 // c[6]
+ vsadd.h \out2, vr10, \in7 // c[2]
+ vssub.h \out5, vr10, \in7 // c[5]
+ vsadd.h \out3, vr9, \in3 // c[3]
+ vssub.h \out4, vr9, \in3 // c[4]
+.endm
+
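+// 16-point DCT for the 64-point transforms, built on dct_8x8_tx64_core_lsx; the odd
+// inputs (vr1, vr3, vr5, vr7) are likewise scaled directly since their mirrored
+// counterparts are treated as zero.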
+.macro dct_8x16_tx64_core_lsx
+ dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
+ vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ // in1 in3 in5 in7 in9 in11 in13 in15
+ // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 32 // 401
+ vldrepl.w vr21, t0, 36 // 4076
+ vsllwil.w.h vr22, vr1, 0
+ vexth.w.h vr23, vr1
+ vmul.w vr0, vr22, vr21
+ vmul.w vr10, vr23, vr21
+ vmul.w vr1, vr22, vr20
+ vmul.w vr29, vr23, vr20
+ vssrarni.h.w vr10, vr0, 12 // t15a
+ vssrarni.h.w vr29, vr1, 12 // t8a
+
+ vldrepl.w vr20, t0, 40 // 3166 -> 1583
+ vldrepl.w vr21, t0, 44 // 2598 -> 1299
+ vsllwil.w.h vr22, vr7, 0
+ vexth.w.h vr23, vr7
+ vneg.w vr21, vr21
+ vmul.w vr0, vr22, vr20
+ vmul.w vr30, vr23, vr20
+ vmul.w vr7, vr22, vr21
+ vmul.w vr31, vr23, vr21
+ vssrarni.h.w vr30, vr0, 12 // t14a
+ vssrarni.h.w vr31, vr7, 12 // t9a
+
+ vldrepl.w vr20, t0, 48 // 1931
+ vldrepl.w vr21, t0, 52 // 3612
+ vsllwil.w.h vr22, vr5, 0
+ vexth.w.h vr23, vr5
+ vmul.w vr0, vr22, vr21
+ vmul.w vr24, vr23, vr21
+ vmul.w vr5, vr22, vr20
+ vmul.w vr25, vr23, vr20
+ vssrarni.h.w vr24, vr0, 12 // t13a
+ vssrarni.h.w vr25, vr5, 12 // t10a
+
+ vldrepl.w vr20, t0, 56 // 3920
+ vldrepl.w vr21, t0, 60 // 1189
+ vsllwil.w.h vr22, vr3, 0
+ vexth.w.h vr23, vr3
+ vneg.w vr21, vr21
+ vmul.w vr0, vr22, vr20
+ vmul.w vr26, vr23, vr20
+ vmul.w vr3, vr22, vr21
+ vmul.w vr27, vr23, vr21
+ vssrarni.h.w vr26, vr0, 12 // t12a
+ vssrarni.h.w vr27, vr3, 12 // t11a
+
+ // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
+ vsadd.h vr28, vr29, vr31 // t8
+ vssub.h vr19, vr29, vr31 // t9
+ vssub.h vr29, vr27, vr25 // t10
+ vsadd.h vr9, vr27, vr25 // t11
+ vsadd.h vr31, vr26, vr24 // t12
+ vssub.h vr25, vr26, vr24 // t13
+ vssub.h vr27, vr10, vr30 // t14
+ vsadd.h vr24, vr10, vr30 // t15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
+ vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30
+ vssrarni.h.w vr26, vr0, 12 // t14a
+ vssrarni.h.w vr30, vr1, 12 // t9a
+
+ vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
+ vneg.w vr0, vr0
+ vneg.w vr19, vr19
+ vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27
+ vssrarni.h.w vr19, vr0, 12 // t10a
+ vssrarni.h.w vr27, vr1, 12 // t13a
+
+ vsadd.h vr25, vr28, vr9 // t8a
+ vssub.h vr29, vr28, vr9 // t11a
+ vssub.h vr28, vr24, vr31 // t12a
+ vsadd.h vr10, vr24, vr31 // t15a
+ vsadd.h vr9, vr30, vr19 // t9
+ vssub.h vr31, vr30, vr19 // t10
+ vssub.h vr30, vr26, vr27 // t13
+ vsadd.h vr24, vr26, vr27 // t14
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
+ vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27
+ vssrarni.h.w vr26, vr0, 12 // t13a
+ vssrarni.h.w vr27, vr1, 12 // t10a
+
+ vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
+ vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30
+ vssrarni.h.w vr31, vr0, 12 // t12
+ vssrarni.h.w vr30, vr1, 12 // t11
+
+ // vr11 vr12 ... vr18
+ vsadd.h vr28, vr14, vr31 // c[3]
+ vssub.h vr29, vr14, vr31 // c[12]
+ vsadd.h vr20, vr15, vr30 // c[4]
+ vssub.h vr21, vr15, vr30 // c[11]
+ vsadd.h vr14, vr16, vr27 // c[5]
+ vssub.h vr23, vr16, vr27 // c[10]
+ vsadd.h vr15, vr17, vr9 // c[6]
+ vssub.h vr30, vr17, vr9 // c[9]
+ vsadd.h vr16, vr18, vr25 // c[7]
+ vssub.h vr27, vr18, vr25 // c[8]
+ vsadd.h vr17, vr13, vr26 // c[2]
+ vssub.h vr26, vr13, vr26 // c[13]
+ vsadd.h vr18, vr12, vr24 // c[1]
+ vssub.h vr25, vr12, vr24 // c[14]
+ vsadd.h vr22, vr11, vr10 // c[0]
+ vssub.h vr24, vr11, vr10 // c[15]
+.endm // dct_8x16_tx64_core_lsx
+
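+// Widening multiply of the halfword vector \in0 by two 32-bit factors:
+// \out0 = (\in0 * \in1 + 2048) >> 12, \out1 = (\in0 * \in2 + 2048) >> 12,
+// narrowed back to halfwords with saturation.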
+.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w \tmp0, vr22, \in1
+ vmul.w \out0, vr23, \in1
+ vmul.w \tmp1, vr22, \in2
+ vmul.w \out1, vr23, \in2
+ vssrarni.h.w \out0, \tmp0, 12
+ vssrarni.h.w \out1, \tmp1, 12
+.endm
+
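+// Cosine constants for the first stage of the 64-point inverse DCT; each 12-word
+// group covers one dct64_step1_lsx invocation (see the input mapping below).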
+const idct64_coeffs, align=4
+ .word 101, 4095, 2967, -2824
+ .word 1660, 3745, 3822, -1474
+ .word 4076, 401, 4017, 799
+
+ .word 4036, -700, 2359, 3349
+ .word 3461, -2191, 897, 3996
+ .word -3166, -2598, -799, -4017
+
+ .word 501, 4065, 3229, -2520
+ .word 2019, 3564, 3948, -1092
+ .word 3612, 1931, 2276, 3406
+
+ .word 4085, -301, 2675, 3102
+ .word 3659, -1842, 1285, 3889
+ .word -3920, -1189, -3406, -2276
+endconst
+
+// in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+// in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+// in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+// in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
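+// Stage 1 of the 64-point odd half: turns four input vectors (vr0..vr3) into eight
+// values (one quarter of t32..t63) and stores them at t6.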
+.macro dct64_step1_lsx
+
+ vldrepl.w vr20, t0, 0 // 101
+ vldrepl.w vr21, t0, 4 // 4095
+ vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a
+
+ vldrepl.w vr20, t0, 8 // 2967
+ vldrepl.w vr21, t0, 12 // -2824
+ vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a
+
+ vldrepl.w vr20, t0, 16 // 1660
+ vldrepl.w vr21, t0, 20 // 3745
+ vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a
+
+ vldrepl.w vr20, t0, 24 // 3822
+ vldrepl.w vr21, t0, 28 // -1474
+ vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a
+
+ vsadd.h vr0, vr8, vr11 // t32
+ vssub.h vr1, vr8, vr11 // t33
+ vssub.h vr2, vr15, vr12 // t34
+ vsadd.h vr3, vr15, vr12 // t35
+ vsadd.h vr4, vr14, vr13 // t60
+ vssub.h vr5, vr14, vr13 // t61
+ vssub.h vr6, vr9, vr10 // t62
+ vsadd.h vr7, vr9, vr10 // t63
+
+ vldrepl.w vr20, t0, 32 // 4076
+ vldrepl.w vr21, t0, 36 // 401
+ vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10
+ vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11
+ vssrarni.h.w vr10, vr9, 12 // t62a
+ vssrarni.h.w vr11, vr13, 12 // t33a
+
+ vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1
+ vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6
+ vneg.w vr9, vr9
+ vneg.w vr1, vr1
+ vssrarni.h.w vr6, vr13, 12 // t61a
+ vssrarni.h.w vr1, vr9, 12 // t34a
+
+ vsadd.h vr2, vr0, vr3 // t32a
+ vssub.h vr5, vr0, vr3 // t35a
+ vsadd.h vr9, vr11, vr1 // t33
+ vssub.h vr13, vr11, vr1 // t34
+ vssub.h vr0, vr7, vr4 // t60a
+ vsadd.h vr3, vr7, vr4 // t63a
+ vssub.h vr1, vr10, vr6 // t61
+ vsadd.h vr11, vr10, vr6 // t62
+
+ vldrepl.w vr20, t0, 40 // 4017
+ vldrepl.w vr21, t0, 44 // 799
+
+ vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4
+ vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7
+ vssrarni.h.w vr4, vr8, 12 // t61a
+ vssrarni.h.w vr7, vr12, 12 // t34a
+
+ vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6
+ vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10
+ vssrarni.h.w vr6, vr8, 12 // t60
+ vssrarni.h.w vr10, vr12, 12 // t35
+
+ vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3
+.endm // dct64_step1_lsx
+
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
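+// Stage 2: cross-combines the partial t32..t63 values written by dct64_step1_lsx
+// (addressed through t4/t5) and stores them back in place.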
+.macro dct64_step2_lsx
+ vld vr0, t5, 0 // t32a
+ vld vr2, t4, 0 // t63a
+ vld vr3, t5, 16*8 // t56a
+ vld vr1, t4, 16*8 // t39a
+ vld vr4, t5, 16*16 // t40a
+ vld vr6, t4, 16*16 // t55a
+ vld vr7, t5, 16*24 // t48a
+ vld vr5, t4, 16*24 // t47a
+
+ vsadd.h vr8, vr0, vr1 // t32
+ vssub.h vr9, vr0, vr1 // t39
+ vsadd.h vr10, vr2, vr3 // t63
+ vssub.h vr11, vr2, vr3 // t56
+ vssub.h vr12, vr5, vr4 // t40
+ vsadd.h vr13, vr5, vr4 // t47
+ vsadd.h vr14, vr7, vr6 // t48
+ vssub.h vr15, vr7, vr6 // t55
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2
+ vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3
+ vssrarni.h.w vr2, vr0, 12 // t56a
+ vssrarni.h.w vr3, vr1, 12 // t39a
+
+ vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4
+ vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5
+ vneg.w vr0, vr0
+ vneg.w vr4, vr4
+ vssrarni.h.w vr5, vr1, 12 // t55a
+ vssrarni.h.w vr4, vr0, 12 // t40a
+
+ vsadd.h vr9, vr8, vr13 // t32a
+ vssub.h vr11, vr8, vr13 // t47a
+ vsadd.h vr6, vr3, vr4 // t39
+ vssub.h vr7, vr3, vr4 // t40
+ vssub.h vr12, vr10, vr14 // t48a
+ vsadd.h vr15, vr10, vr14 // t63a
+ vssub.h vr0, vr2, vr5 // t55
+ vsadd.h vr1, vr2, vr5 // t56
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13
+ vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4
+ vssrarni.h.w vr13, vr8, 12 // t40a
+ vssrarni.h.w vr4, vr3, 12 // t55a
+ vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10
+ vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14
+ vssrarni.h.w vr10, vr8, 12 // t47
+ vssrarni.h.w vr14, vr3, 12 // t48
+
+ // t32a t39 t40a t47 t48 t55a t56 t63a
+ // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15
+ vst vr9, t5, 0 // t32a
+ vst vr6, t4, 0 // t39
+ vst vr13, t5, 16*8 // t40a
+ vst vr10, t4, 16*8 // t47
+ vst vr14, t5, 16*16 // t48
+ vst vr4, t4, 16*16 // t55a
+ vst vr1, t5, 16*24 // t56
+ vst vr15, t4, 16*24 // t63a
+.endm // dct64_step2_lsx
+
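+// Stage 3: loads the even-half outputs t0..t7 from t3 and the last eight odd values
+// (t56..t63a), and forms the output rows c[0..7] / c[56..63] in registers.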
+.macro dct64_step3_lsx
+ // t0 t1 t2 t3 t4 t5 t6 t7
+ vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17
+
+ vld vr9, t5, 16*24 // t56
+ vld vr6, t5, 16*24+16 // t57a
+ vld vr13, t5, 16*24+32 // t58
+ vld vr10, t5, 16*24+48 // t59a
+ vld vr14, t4, 16*24-48 // t60
+ vld vr4, t4, 16*24-32 // t61a
+ vld vr1, t4, 16*24-16 // t62
+ vld vr15, t4, 16*24 // t63a
+
+ vsadd.h vr20, vr2, vr15 // c[0]
+ vssub.h vr21, vr2, vr15 // c[63]
+ vsadd.h vr22, vr3, vr1 // c[1]
+ vssub.h vr23, vr3, vr1 // c[62]
+ vsadd.h vr24, vr7, vr4 // c[2]
+ vssub.h vr25, vr7, vr4 // c[61]
+ vsadd.h vr26, vr8, vr14 // c[3]
+ vssub.h vr27, vr8, vr14 // c[60]
+
+ vsadd.h vr28, vr11, vr10 // c[4]
+ vssub.h vr29, vr11, vr10 // c[59]
+ vsadd.h vr30, vr12, vr13 // c[5]
+ vssub.h vr31, vr12, vr13 // c[58]
+ vsadd.h vr2, vr16, vr6 // c[6]
+ vssub.h vr15, vr16, vr6 // c[57]
+ vsadd.h vr1, vr17, vr9 // c[7]
+ vssub.h vr3, vr17, vr9 // c[56]
+.endm // dct64_step3_lsx
+
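+// Stage 4: runs dct64_step3_lsx, optionally transposes the two 8x8 groups and
+// applies a rounding \shift, then stores both groups to t7.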
+.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1
+
+ dct64_step3_lsx
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
+ vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
+ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
+
+ LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
+ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
+ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
+.endif
+
+.ifnb \shift
+.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
+ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+.endm // dct64_step4_lsx
+
+.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7
+
+ fld.d f4, t0, 0
+ fldx.d f5, t0, a1
+ fld.d f6, t6, 0
+ fldx.d f7, t6, a1
+ alsl.d t0, a1, t0, 2
+ alsl.d t6, a1, t6, 2
+ fld.d f8, t0, 0
+ fldx.d f9, t0, a1
+ fld.d f10, t6, 0
+ fldx.d f11, t6, a1
+
+.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
+ vsllwil.hu.bu \i, \i, 0
+.endr
+
+ vsrari.h vr20, \in0, 4
+ vsrari.h vr22, \in1, 4
+ vsrari.h vr24, \in2, 4
+ vsrari.h vr26, \in3, 4
+ vsrari.h vr28, \in4, 4
+ vsrari.h vr30, \in5, 4
+ vsrari.h vr2, \in6, 4
+ vsrari.h vr1, \in7, 4
+
+ vadd.h vr4, vr4, vr20
+ vadd.h vr5, vr5, vr22
+ vadd.h vr6, vr6, vr24
+ vadd.h vr7, vr7, vr26
+ vadd.h vr8, vr8, vr28
+ vadd.h vr9, vr9, vr30
+ vadd.h vr10, vr10, vr2
+ vadd.h vr11, vr11, vr1
+
+ vssrani.bu.h vr5, vr4, 0
+ vssrani.bu.h vr7, vr6, 0
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+
+ vstelm.d vr5, t1, 0, 0
+ vstelm.d vr5, t2, 0, 1
+
+ alsl.d t1, a1, t1, 1
+ alsl.d t2, a1, t2, 1
+ vstelm.d vr7, t1, 0, 0
+ vstelm.d vr7, t2, 0, 1
+
+ alsl.d t1, a1, t1, 1
+ alsl.d t2, a1, t2, 1
+ vstelm.d vr9, t1, 0, 0
+ vstelm.d vr9, t2, 0, 1
+
+ alsl.d t1, a1, t1, 1
+ alsl.d t2, a1, t2, 1
+ vstelm.d vr11, t1, 0, 0
+ vstelm.d vr11, t2, 0, 1
+.endm // dct64_step5_lsx
+
+.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1
+ vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x16_tx64_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 64 // 201
+ vldrepl.w vr21, t0, 68 // 4091
+ vsllwil.w.h vr22, vr0, 0
+ vexth.w.h vr23, vr0
+ vmul.w vr8, vr22, vr21
+ vmul.w vr9, vr23, vr21
+ vmul.w vr0, vr22, vr20
+ vmul.w vr10, vr23, vr20
+ vssrarni.h.w vr9, vr8, 12 // t31a
+ vssrarni.h.w vr10, vr0, 12 // t16a
+
+ vldrepl.w vr20, t0, 72 // 3035
+ vldrepl.w vr21, t0, 76 // 2751
+ vsllwil.w.h vr22, vr7, 0
+ vexth.w.h vr23, vr7
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr0, vr23, vr20
+ vmul.w vr7, vr22, vr21
+ vmul.w vr30, vr23, vr21
+ vssrarni.h.w vr0, vr8, 12 // t30a
+ vssrarni.h.w vr30, vr7, 12 // t17a
+
+ vldrepl.w vr20, t0, 80 // 1751
+ vldrepl.w vr21, t0, 84 // 3703
+ vsllwil.w.h vr22, vr4, 0
+ vexth.w.h vr23, vr4
+ vmul.w vr8, vr22, vr21
+ vmul.w vr7, vr23, vr21
+ vmul.w vr4, vr22, vr20
+ vmul.w vr19, vr23, vr20
+ vssrarni.h.w vr7, vr8, 12 // t29a
+ vssrarni.h.w vr19, vr4, 12 // t18a
+
+ vldrepl.w vr20, t0, 88 // 3857
+ vldrepl.w vr21, t0, 92 // 1380
+ vsllwil.w.h vr22, vr3, 0
+ vexth.w.h vr23, vr3
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr4, vr23, vr20
+ vmul.w vr3, vr22, vr21
+ vmul.w vr26, vr23, vr21
+ vssrarni.h.w vr4, vr8, 12 // t28a
+ vssrarni.h.w vr26, vr3, 12 // t19a
+
+ vldrepl.w vr20, t0, 96 // 995
+ vldrepl.w vr21, t0, 100 // 3973
+ vsllwil.w.h vr22, vr2, 0
+ vexth.w.h vr23, vr2
+ vmul.w vr8, vr22, vr21
+ vmul.w vr3, vr23, vr21
+ vmul.w vr2, vr22, vr20
+ vmul.w vr27, vr23, vr20
+ vssrarni.h.w vr3, vr8, 12 // t27a
+ vssrarni.h.w vr27, vr2, 12 // t20a
+
+ vldrepl.w vr20, t0, 104 // 3513
+ vldrepl.w vr21, t0, 108 // 2106
+ vsllwil.w.h vr22, vr5, 0
+ vexth.w.h vr23, vr5
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr2, vr23, vr20
+ vmul.w vr5, vr22, vr21
+ vmul.w vr28, vr23, vr21
+ vssrarni.h.w vr2, vr8, 12 // t26a
+ vssrarni.h.w vr28, vr5, 12 // t21a
+
+ vldrepl.w vr20, t0, 112 // 2440 -> 1220
+ vldrepl.w vr21, t0, 116 // 3290 -> 1645
+ vsllwil.w.h vr22, vr6, 0
+ vexth.w.h vr23, vr6
+ vmul.w vr8, vr22, vr21
+ vmul.w vr5, vr23, vr21
+ vmul.w vr6, vr22, vr20
+ vmul.w vr25, vr23, vr20
+ vssrarni.h.w vr5, vr8, 12 // t25a
+ vssrarni.h.w vr25, vr6, 12 // t22a
+
+ vldrepl.w vr20, t0, 120 // 4052
+ vldrepl.w vr21, t0, 124 // 601
+ vsllwil.w.h vr22, vr1, 0
+ vexth.w.h vr23, vr1
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr6, vr23, vr20
+ vmul.w vr1, vr22, vr21
+ vmul.w vr24, vr23, vr21
+ vssrarni.h.w vr6, vr8, 12 // t24a
+ vssrarni.h.w vr24, vr1, 12 // t23a
+
+ vsadd.h vr1, vr10, vr30 // t16
+ vssub.h vr29, vr10, vr30 // t17
+ vssub.h vr8, vr26, vr19 // t18
+ vsadd.h vr31, vr26, vr19 // t19
+ vsadd.h vr10, vr27, vr28 // t20
+ vssub.h vr30, vr27, vr28 // t21
+ vssub.h vr19, vr24, vr25 // t22
+ vsadd.h vr26, vr24, vr25 // t23
+ vsadd.h vr27, vr6, vr5 // t24
+ vssub.h vr28, vr6, vr5 // t25
+ vssub.h vr24, vr3, vr2 // t26
+ vsadd.h vr25, vr3, vr2 // t27
+ vsadd.h vr5, vr4, vr7 // t28
+ vssub.h vr6, vr4, vr7 // t29
+ vssub.h vr2, vr9, vr0 // t30
+ vsadd.h vr3, vr9, vr0 // t31
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
+ vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
+ vssrarni.h.w vr7, vr4, 12 // t30a
+ vssrarni.h.w vr0, vr11, 12 // t17a
+ vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
+ vneg.w vr4, vr4
+ vneg.w vr9, vr9
+ vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr9, vr4, 12 // t18a
+ vssrarni.h.w vr2, vr11, 12 // t29a
+
+ vldrepl.w vr20, t0, 24 // 3406 -> 1703
+ vldrepl.w vr21, t0, 28 // 2276 -> 1138
+ vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
+ vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
+ vssrarni.h.w vr29, vr4, 12 // t26a
+ vssrarni.h.w vr6, vr11, 12 // t21a
+
+ vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
+ vneg.w vr4, vr4
+ vneg.w vr8, vr8
+ vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr8, vr4, 12 // t22a
+ vssrarni.h.w vr24, vr11, 12 // t25a
+
+ vsadd.h vr4, vr1, vr31 // t16a
+ vssub.h vr30, vr1, vr31 // t19a
+ vsadd.h vr19, vr0, vr9 // t17
+ vssub.h vr28, vr0, vr9 // t18
+ vssub.h vr1, vr26, vr10 // t20a
+ vsadd.h vr31, vr26, vr10 // t23a
+ vssub.h vr0, vr8, vr6 // t21
+ vsadd.h vr9, vr8, vr6 // t22
+ vsadd.h vr10, vr27, vr25 // t24a
+ vssub.h vr26, vr27, vr25 // t27a
+ vsadd.h vr6, vr24, vr29 // t25
+ vssub.h vr8, vr24, vr29 // t26
+ vssub.h vr25, vr3, vr5 // t28a
+ vsadd.h vr27, vr3, vr5 // t31a
+ vssub.h vr24, vr7, vr2 // t29
+ vsadd.h vr29, vr7, vr2 // t30
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
+ vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr5, vr3, 12 // t29a
+    vssrarni.h.w   vr2,     vr11,   12   // t18a
+
+ vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
+ vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr7, vr3, 12 // t28
+ vssrarni.h.w vr24, vr11, 12 // t19
+
+ vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
+ vneg.w vr3, vr3
+ vneg.w vr28, vr28
+ vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
+ vssrarni.h.w vr28, vr3, 12 // t20
+ vssrarni.h.w vr25, vr11, 12 // t27
+
+ vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
+ vneg.w vr3, vr3
+ vneg.w vr30, vr30
+ vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
+ vssrarni.h.w vr30, vr3, 12 // t21a
+ vssrarni.h.w vr1, vr11, 12 // t26a
+
+ vsadd.h vr3, vr4, vr31 // t16
+ vssub.h vr26, vr4, vr31 // t23
+ vsadd.h vr0, vr19, vr9 // t17a
+ vssub.h vr8, vr19, vr9 // t22a
+ vsadd.h vr4, vr2, vr30 // t18
+ vssub.h vr31, vr2, vr30 // t21
+ vsadd.h vr9, vr24, vr28 // t19a
+ vssub.h vr19, vr24, vr28 // t20a
+ vssub.h vr2, vr27, vr10 // t24
+ vsadd.h vr30, vr27, vr10 // t31
+ vssub.h vr24, vr29, vr6 // t25a
+ vsadd.h vr28, vr29, vr6 // t30a
+ vssub.h vr10, vr5, vr1 // t26
+ vsadd.h vr27, vr5, vr1 // t29
+ vssub.h vr6, vr7, vr25 // t27a
+ vsadd.h vr29, vr7, vr25 // t28a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
+ vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
+ vssrarni.h.w vr5, vr1, 12 // t20
+ vssrarni.h.w vr7, vr11, 12 // t27
+
+ vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
+ vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
+ vssrarni.h.w vr25, vr1, 12 // t21a
+ vssrarni.h.w vr6, vr11, 12 // t26a
+
+ vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
+ vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
+ vssrarni.h.w vr19, vr1, 12 // t22
+ vssrarni.h.w vr10, vr11, 12 // t25
+
+ vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
+ vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
+ vssrarni.h.w vr31, vr1, 12 // t23a
+ vssrarni.h.w vr8, vr11, 12 // t24a
+
+ // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
+ // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
+
+ vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr30 // c[0]
+ vssub.h vr2, vr11, vr30 // c[31]
+ vsadd.h vr24, vr12, vr28 // c[1]
+ vssub.h vr26, vr12, vr28 // c[30]
+ vsadd.h vr11, vr13, vr27 // c[2]
+ vssub.h vr30, vr13, vr27 // c[29]
+ vsadd.h vr12, vr14, vr29 // c[3]
+ vssub.h vr28, vr14, vr29 // c[28]
+ vsadd.h vr13, vr15, vr7 // c[4]
+ vssub.h vr27, vr15, vr7 // c[27]
+ vsadd.h vr14, vr16, vr6 // c[5]
+ vssub.h vr29, vr16, vr6 // c[26]
+ vsadd.h vr7, vr17, vr10 // c[6]
+ vssub.h vr15, vr17, vr10 // c[25]
+ vsadd.h vr6, vr18, vr8 // c[7]
+ vssub.h vr16, vr18, vr8 // c[24]
+
+ vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr31 // c[8]
+ vssub.h vr2, vr11, vr31 // c[23]
+ vsadd.h vr24, vr12, vr19 // c[9]
+ vssub.h vr26, vr12, vr19 // c[22]
+ vsadd.h vr11, vr13, vr25 // c[10]
+ vssub.h vr30, vr13, vr25 // c[21]
+ vsadd.h vr12, vr14, vr5 // c[11]
+ vssub.h vr28, vr14, vr5 // c[20]
+ vsadd.h vr13, vr15, vr9 // c[12]
+ vssub.h vr27, vr15, vr9 // c[19]
+ vsadd.h vr14, vr16, vr4 // c[13]
+ vssub.h vr29, vr16, vr4 // c[18]
+ vsadd.h vr7, vr17, vr0 // c[14]
+ vssub.h vr15, vr17, vr0 // c[17]
+ vsadd.h vr6, vr18, vr3 // c[15]
+ vssub.h vr16, vr18, vr3 // c[16]
+
+ vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+.endm // dct_8x32_tx64_new_lsx
+
+function inv_txfm_add_dct_dct_64x64_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_64x64
+
+ ld.h t2, a2, 0
+ vldi vr0, 0x8b5
+ vreplgr2vr.w vr1, t2
+ vldi vr20, 0x880
+ vmul.w vr2, vr0, vr1
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8
+ vld vr3, a0, 48
+ vsrari.w vr2, vr2, 2
+ vld vr1, a0, 16
+ vmadd.w vr20, vr2, vr0
+ vld vr2, a0, 32
+ vssrarni.h.w vr20, vr20, 12
+ vld vr0, a0, 0
+
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, a0, 32
+ vst vr15, a0, 48
+
+.rept 63
+ add.d a0, a0, a1
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vld vr2, a0, 32
+ vld vr3, a0, 48
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, a0, 32
+ vst vr15, a0, 48
+.endr
+ b .DCT_DCT_64X64_END
+.NO_HAS_DCONLY_64x64:
+
+ malloc_space 64*32*2+512+512
+
+ addi.d t7, sp, 64
+
+.macro dct64x64_core1_lsx in0, in1, in2
+ addi.d t2, a2, \in0
+ addi.d t7, t7, \in1
+ li.w t4, 64*32*2+64
+ add.d t3, sp, t4
+ addi.d t6, t3, 512
+ add.d t5, t6, zero
+
+ dct_8x32_tx64_new_lsx 0, 256, 128, 256
+
+ la.local t0, idct64_coeffs
+
+ addi.d t2, a2, \in2 // 32 ...
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ vld vr0, t2, 128*0 // in1
+ vld vr1, t2, 128*15 // in31
+ vld vr2, t2, 128*8 // in17
+ vld vr3, t2, 128*7 // in15
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ vld vr0, t2, 128*3 // in7
+ vld vr1, t2, 128*12 // in25
+ vld vr2, t2, 128*11 // in23
+ vld vr3, t2, 128*4 // in9
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ vld vr0, t2, 128*2 // in5
+ vld vr1, t2, 128*13 // in27
+ vld vr2, t2, 128*10 // in21
+ vld vr3, t2, 128*5 // in11
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vld vr0, t2, 128*1 // in3
+ vld vr1, t2, 128*14 // in29
+ vld vr2, t2, 128*9 // in19
+ vld vr3, t2, 128*6 // in13
+ dct64_step1_lsx
+
+ la.local t0, idct_coeffs
+ addi.d t4, t5, 16*7
+ // t32a/t39/t40a/t47/t48/t55a/t56/t63a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t33/t38a/t41/t46a/t49a/t54/t57a/t62
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t34a/t37/t42a/t45/t50/t53a/t58/t61a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t35/t36a/t43/t44a/t51a/t52/t59a/t60
+ dct64_step2_lsx
+
+ li.w t4, 64*32*2+64+512
+ add.d t5, t4, sp
+ addi.d t4, t5, 16*7
+ dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128
+
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128
+
+ addi.d t5, t5, -16*8
+ addi.d t4, t4, -16*8
+ addi.d t3, t3, 128
+ dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128
+
+ addi.d t5, t5, -16*8
+ addi.d t4, t4, -16*8
+ addi.d t3, t3, 128
+ dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128
+.endm
+
+ dct64x64_core1_lsx 0, 0, 64
+
+ dct64x64_core1_lsx 16, 128*8, 64+16
+
+ dct64x64_core1_lsx 32, 128*8, 64+16*2
+
+ dct64x64_core1_lsx 48, 128*8, 64+16*3
+
+ vreplgr2vr.h vr31, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032
+ vst vr31, a2, \i
+.endr
+
+.macro dct64x64_core2_lsx in0, in1
+ addi.d t2, sp, 64+\in0
+ addi.d t7, sp, 64+\in0
+ li.w t4, 64*32*2+64
+ add.d t3, sp, t4
+ addi.d t6, t3, 512
+ add.d t5, t6, zero
+
+ addi.d t2, t2, 1024
+ addi.d t2, t2, 1024
+ dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512
+
+ la.local t0, idct64_coeffs
+
+ addi.d t2, sp, 64+64*2+\in0
+ addi.d t4, t2, 256*7
+ addi.d t4, t4, 256
+
+ vld vr0, t2, 256*0 // in1
+ vld vr1, t4, 256*7 // in31
+ vld vr2, t4, 256*0 // in17
+ vld vr3, t2, 256*7 // in15
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ vld vr0, t2, 256*3 // in7
+ vld vr1, t4, 256*4 // in25
+ vld vr2, t4, 256*3 // in23
+ vld vr3, t2, 256*4 // in9
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ vld vr0, t2, 256*2 // in5
+ vld vr1, t4, 256*5 // in27
+ vld vr2, t4, 256*2 // in21
+ vld vr3, t2, 256*5 // in11
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ vld vr0, t2, 256*1 // in3
+ vld vr1, t4, 256*6 // in29
+ vld vr2, t4, 256*1 // in19
+ vld vr3, t2, 256*6 // in13
+ dct64_step1_lsx
+
+ la.local t0, idct_coeffs
+ addi.d t4, t5, 16*7
+ // t32a/t39/t40a/t47/t48/t55a/t56/t63a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t33/t38a/t41/t46a/t49a/t54/t57a/t62
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t34a/t37/t42a/t45/t50/t53a/t58/t61a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t35/t36a/t43/t44a/t51a/t52/t59a/t60
+ dct64_step2_lsx
+
+ li.w t4, 64*32*2+64+512
+ add.d t5, t4, sp
+ addi.d t4, t5, 16*7
+ addi.d a0, a0, \in1
+ // 0 - 7, 56 -63
+ dct64_step3_lsx
+
+ li.w t8, 0
+ mul.w t0, t8, a1
+ add.d t0, a0, t0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 56
+ mul.w t0, t8, a1
+ add.d t0, a0, t0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+ // 8 - 15, 48 - 55
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step3_lsx
+
+ li.w t8, 8
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 48
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+ // 16 - 23, 40 - 47
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step3_lsx
+
+ li.w t8, 16
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 40
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+ // 24 - 31, 32 - 39
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step3_lsx
+
+ li.w t8, 24
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 32
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+.endm
+
+ dct64x64_core2_lsx 16*0, 0
+
+ dct64x64_core2_lsx 16*1, 8
+
+ dct64x64_core2_lsx 16*2, 8
+
+ dct64x64_core2_lsx 16*3, 8
+
+ dct64x64_core2_lsx 16*4, 8
+
+ dct64x64_core2_lsx 16*5, 8
+
+ dct64x64_core2_lsx 16*6, 8
+
+ dct64x64_core2_lsx 16*7, 8
+
+ free_space 64*32*2+512+512
+.DCT_DCT_64X64_END:
+endfunc
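
For reference, the DC-only shortcut at the top of inv_txfm_add_dct_dct_64x64_8bpc_lsx (taken when the eob argument in a3 is zero) boils down to the scalar arithmetic sketched below. This is a hedged reconstruction from the vector constants used in the function (181, 128, 2048 and the rounding shifts); the helper names are illustrative rather than dav1d API, and signed right shifts are assumed to be arithmetic, as vsrari/vssrarni are.

    #include <stddef.h>
    #include <stdint.h>

    static int iclip_u8_sketch(const int v) {    /* clamp to the 8-bit pixel range */
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Scalar sketch of the 64x64 DC-only path: scale the DC coefficient by
     * 181/256 (181 ~ 128 * sqrt(2)) with rounding, apply the inter-pass >> 2,
     * scale again with the final >> 12, then add the result to every pixel. */
    static void dc_only_64x64_sketch(uint8_t *dst, const ptrdiff_t stride,
                                     int16_t *const coeff)
    {
        int dc = coeff[0];
        coeff[0] = 0;
        dc = (dc * 181 + 128) >> 8;          /* vmul.w + vsrari.w ..., 8       */
        dc = (dc + 2) >> 2;                  /* vsrari.w ..., 2                */
        dc = (dc * 181 + 128 + 2048) >> 12;  /* vmadd.w + vssrarni.h.w ..., 12 */
        for (int y = 0; y < 64; y++, dst += stride)
            for (int x = 0; x < 64; x++)
                dst[x] = (uint8_t)iclip_u8_sketch(dst[x] + dc);
    }
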
diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h
new file mode 100644
index 0000000..3ad444f
--- /dev/null
+++ b/src/loongarch/itx.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_ITX_H
+#define DAV1D_SRC_LOONGARCH_ITX_H
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx));
+
+static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#if BITDEPTH == 8
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+    if (BITDEPTH != 8) return;
+
+ c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;
+
+ c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;
+
+ c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx;
+ c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx;
+ c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx;
+ c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx;
+
+ c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx;
+ c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx;
+
+ c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx;
+
+ c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx;
+
+ c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx;
+#endif
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_ITX_H */
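
The declarations above wrap the function names in BF(..., lsx) while the table assignments use the expanded _8bpc_lsx symbols; with BITDEPTH == 8 the bit-depth tag is pasted in between. A self-contained illustration of that naming scheme follows; BF_DEMO is a stand-in written for this note, not dav1d's own BF macro.

    #include <stdio.h>

    /* Stand-in for the name pasting used by the declarations above: under
     * BITDEPTH == 8, BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx) resolves to
     * dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx, the symbol defined in itx.S. */
    #define BF_DEMO(name, suffix) #name "_8bpc_" #suffix

    int main(void) {
        puts(BF_DEMO(dav1d_inv_txfm_add_dct_dct_64x64, lsx));
        return 0;
    }
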
diff --git a/src/loongarch/loongson_asm.S b/src/loongarch/loongson_asm.S
new file mode 100644
index 0000000..a22072b
--- /dev/null
+++ b/src/loongarch/loongson_asm.S
@@ -0,0 +1,776 @@
+/*********************************************************************
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ * Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *********************************************************************/
+
+/*
+ * This file is a LoongArch assembly helper file, available under the ISC
+ * license. It provides a large number of macros and aliases to simplify
+ * writing assembly code, especially for LSX and LASX optimizations.
+ *
+ * Anyone can modify it or add new features for their own purposes.
+ * Contributing a patch will be appreciated, as it might be useful for
+ * others as well. Send patches to the Loongson contributors mentioned above.
+ *
+ * MAJOR version: Usage changes, incompatible with previous version.
+ * MINOR version: Add new macros/functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 4
+#define LML_VERSION_MICRO 0
+
+#define DEFAULT_ALIGN 5
+
+/* Set prefix as needed. */
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
+#else
+#define ASM_PREF PRIVATE_PREFIX
+#endif
+
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+ jirl $r0, $r1, 0x0
+ .size ASM_PREF\name, . - ASM_PREF\name
+ .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
+
+.macro const name, align=DEFAULT_ALIGN
+ .macro endconst
+ .size \name, . - \name
+ .purgem endconst
+ .endm
+.section .rodata
+.align \align
+\name:
+.endm
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp $sp
+#define ra $ra
+
+#define fa0 $fa0
+#define fa1 $fa1
+#define fa2 $fa2
+#define fa3 $fa3
+#define fa4 $fa4
+#define fa5 $fa5
+#define fa6 $fa6
+#define fa7 $fa7
+#define ft0 $ft0
+#define ft1 $ft1
+#define ft2 $ft2
+#define ft3 $ft3
+#define ft4 $ft4
+#define ft5 $ft5
+#define ft6 $ft6
+#define ft7 $ft7
+#define ft8 $ft8
+#define ft9 $ft9
+#define ft10 $ft10
+#define ft11 $ft11
+#define ft12 $ft12
+#define ft13 $ft13
+#define ft14 $ft14
+#define ft15 $ft15
+#define fs0 $fs0
+#define fs1 $fs1
+#define fs2 $fs2
+#define fs3 $fs3
+#define fs4 $fs4
+#define fs5 $fs5
+#define fs6 $fs6
+#define fs7 $fs7
+
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of byte (or halfword) vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd
+ * Return Type - twice the width of the input elements
+ */
+.macro vdp2.h.bu vd, vj, vk
+ vmulwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2.h.bu.b vd, vj, vk
+ vmulwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+ vmulwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+ xvmulwev.h.bu \xd, \xj, \xk
+ xvmaddwod.h.bu \xd, \xj, \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+ xvmulwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+ xvmulwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
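
Each of the vdp2*/xvdp2* macros above synthesizes a pairwise dot product from an even-lane widening multiply followed by an odd-lane widening multiply-accumulate. A scalar model of vdp2.w.h, assuming one 128-bit vector of eight int16 lanes (the function name is illustrative):

    #include <stdint.h>

    /* Scalar model of vdp2.w.h: each output word sums the products of the two
     * halfword lanes that occupy that word slot in vj and vk. */
    static void vdp2_w_h_model(int32_t vd[4], const int16_t vj[8], const int16_t vk[8]) {
        for (int i = 0; i < 4; i++)
            vd[i] = (int32_t)vj[2 * i]     * vk[2 * i]       /* vmulwev.w.h  */
                  + (int32_t)vj[2 * i + 1] * vk[2 * i + 1];  /* vmaddwod.w.h */
    }

The vdp2add*/xvdp2add* variants below differ only in accumulating into vd instead of overwriting it.
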
+
+/*
+ * Description : Dot product & accumulate of byte (or halfword) vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd (accumulator)
+ * Return Type - twice the width of the input elements
+ */
+.macro vdp2add.h.bu vd, vj, vk
+ vmaddwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+ vmaddwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+ vmaddwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+ xvmaddwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+ xvmaddwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Clamp element vj[i] to the range vk[i] ~ va[i]
+ *               clip: vd = min(max(vj, vk), va)
+ */
+.macro vclip.h vd, vj, vk, va
+ vmax.h \vd, \vj, \vk
+ vmin.h \vd, \vd, \va
+.endm
+
+.macro vclip.w vd, vj, vk, va
+ vmax.w \vd, \vj, \vk
+ vmin.w \vd, \vd, \va
+.endm
+
+.macro xvclip.h xd, xj, xk, xa
+ xvmax.h \xd, \xj, \xk
+ xvmin.h \xd, \xd, \xa
+.endm
+
+.macro xvclip.w xd, xj, xk, xa
+ xvmax.w \xd, \xj, \xk
+ xvmin.w \xd, \xd, \xa
+.endm
+
+/*
+ * Description : Clamp element vj[i] to the range 0 ~ 255
+ *               clip255: vd = min(max(vj, 0), 255)
+ */
+.macro vclip255.h vd, vj
+ vmaxi.h \vd, \vj, 0
+ vsat.hu \vd, \vd, 7
+.endm
+
+.macro vclip255.w vd, vj
+ vmaxi.w \vd, \vj, 0
+ vsat.wu \vd, \vd, 7
+.endm
+
+.macro xvclip255.h xd, xj
+ xvmaxi.h \xd, \xj, 0
+ xvsat.hu \xd, \xd, 7
+.endm
+
+.macro xvclip255.w xd, xj
+ xvmaxi.w \xd, \xj, 0
+ xvsat.wu \xd, \xd, 7
+.endm
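
In scalar terms both clip families reduce to ordinary clamps: vclip is a max against the lower bound followed by a min against the upper bound, and vclip255 is a max with zero followed by an unsigned saturate to 8 bits. A minimal model (illustrative names):

    #include <stdint.h>

    static int16_t vclip_h_model(const int16_t vj, const int16_t vk, const int16_t va) {
        const int16_t lo = vj > vk ? vj : vk;  /* vmax.h */
        return lo < va ? lo : va;              /* vmin.h */
    }

    static int16_t vclip255_h_model(const int16_t vj) {
        const int16_t lo = vj > 0 ? vj : 0;    /* vmaxi.h \vd, \vj, 0 */
        return lo < 255 ? lo : 255;            /* vsat.hu \vd, \vd, 7 */
    }
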
+
+/*
+ * Description : Store elements of vector
+ * vd : Data vector to be stored
+ * rk : Address of data storage
+ * ra : Offset of address
+ * si : Index of data in vd
+ */
+.macro vstelmx.b vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.b \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.h \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.w \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.d vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.d \vd, \rk, 0, \si
+.endm
+
+.macro vmov xd, xj
+ vor.v \xd, \xj, \xj
+.endm
+
+.macro xmov xd, xj
+ xvor.v \xd, \xj, \xj
+.endm
+
+.macro xvstelmx.d xd, rk, ra, si
+ add.d \rk, \rk, \ra
+ xvstelm.d \xd, \rk, 0, \si
+.endm
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 float, double, 128-bit (vr) or 256-bit (xr) vector elements with stride.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.s \out0, \src, 0
+ fldx.s \out1, \src, \stride
+ fldx.s \out2, \src, \stride2
+ fldx.s \out3, \src, \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.d \out0, \src, 0
+ fldx.d \out1, \src, \stride
+ fldx.d \out2, \src, \stride2
+ fldx.d \out3, \src, \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ vld \out0, \src, 0
+ vldx \out1, \src, \stride
+ vldx \out2, \src, \stride2
+ vldx \out3, \src, \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ xvld \out0, \src, 0
+ xvldx \out1, \src, \stride
+ xvldx \out2, \src, \stride2
+ xvldx \out3, \src, \stride3
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ vilvl.h \tmp0, \in1, \in0
+ vilvl.h \tmp1, \in3, \in2
+ vilvl.w \out0, \tmp1, \tmp0
+ vilvh.w \out2, \tmp1, \tmp0
+ vilvh.d \out1, \out0, \out0
+ vilvh.d \out3, \out0, \out2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+
+ vilvl.w \tmp0, \in1, \in0
+ vilvh.w \out1, \in1, \in0
+ vilvl.w \tmp1, \in3, \in2
+ vilvh.w \out3, \in3, \in2
+
+ vilvl.d \out0, \tmp1, \tmp0
+ vilvl.d \out2, \out3, \out1
+ vilvh.d \out3, \out3, \out1
+ vilvh.d \out1, \tmp1, \tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+ tmp3, tmp4, tmp5, tmp6, tmp7
+ vilvl.h \tmp0, \in6, \in4
+ vilvl.h \tmp1, \in7, \in5
+ vilvl.h \tmp2, \in2, \in0
+ vilvl.h \tmp3, \in3, \in1
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vilvh.h \tmp0, \in6, \in4
+ vilvh.h \tmp1, \in7, \in5
+ vilvh.h \tmp2, \in2, \in0
+ vilvh.h \tmp3, \in3, \in1
+
+ vpickev.d \out0, \tmp4, \tmp6
+ vpickod.d \out1, \tmp4, \tmp6
+ vpickev.d \out2, \tmp5, \tmp7
+ vpickod.d \out3, \tmp5, \tmp7
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vpickev.d \out4, \tmp4, \tmp6
+ vpickod.d \out5, \tmp4, \tmp6
+ vpickev.d \out6, \tmp5, \tmp7
+ vpickod.d \out7, \tmp5, \tmp7
+.endm
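
The interleave/pick sequence in LSX_TRANSPOSE8x8_H is register-only data movement equivalent to a plain 8x8 transpose of 16-bit elements; a scalar reference for comparison (illustrative, not part of dav1d):

    #include <stdint.h>

    /* Plain 8x8 transpose of 16-bit elements: out[r][c] = in[c][r]. The macro
     * above performs the same permutation entirely in vector registers. */
    static void transpose8x8_h_model(int16_t out[8][8], const int16_t in[8][8]) {
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 8; c++)
                out[r][c] = in[c][r];
    }
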
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.b \tmp0, \in2, \in0
+ xvilvl.b \tmp1, \in3, \in1
+ xvilvl.b \tmp2, \in6, \in4
+ xvilvl.b \tmp3, \in7, \in5
+ xvilvl.b \tmp4, \in10, \in8
+ xvilvl.b \tmp5, \in11, \in9
+ xvilvl.b \tmp6, \in14, \in12
+ xvilvl.b \tmp7, \in15, \in13
+ xvilvl.b \out0, \tmp1, \tmp0
+ xvilvh.b \out1, \tmp1, \tmp0
+ xvilvl.b \out2, \tmp3, \tmp2
+ xvilvh.b \out3, \tmp3, \tmp2
+ xvilvl.b \out4, \tmp5, \tmp4
+ xvilvh.b \out5, \tmp5, \tmp4
+ xvilvl.b \out6, \tmp7, \tmp6
+ xvilvh.b \out7, \tmp7, \tmp6
+ xvilvl.w \tmp0, \out2, \out0
+ xvilvh.w \tmp2, \out2, \out0
+ xvilvl.w \tmp4, \out3, \out1
+ xvilvh.w \tmp6, \out3, \out1
+ xvilvl.w \tmp1, \out6, \out4
+ xvilvh.w \tmp3, \out6, \out4
+ xvilvl.w \tmp5, \out7, \out5
+ xvilvh.w \tmp7, \out7, \out5
+ xvilvl.d \out0, \tmp1, \tmp0
+ xvilvh.d \out1, \tmp1, \tmp0
+ xvilvl.d \out2, \tmp3, \tmp2
+ xvilvh.d \out3, \tmp3, \tmp2
+ xvilvl.d \out4, \tmp5, \tmp4
+ xvilvh.d \out5, \tmp5, \tmp4
+ xvilvl.d \out6, \tmp7, \tmp6
+ xvilvh.d \out7, \tmp7, \tmp6
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in1, \in0
+ xvilvl.h \tmp1, \in3, \in2
+ xvilvl.w \out0, \tmp1, \tmp0
+ xvilvh.w \out2, \tmp1, \tmp0
+ xvilvh.d \out1, \out0, \out0
+ xvilvh.d \out3, \out0, \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in2, \in0
+ xvilvl.h \tmp1, \in3, \in1
+ xvilvl.h \out2, \tmp1, \tmp0
+ xvilvh.h \out3, \tmp1, \tmp0
+
+ xvilvl.d \out0, \out2, \out2
+ xvilvh.d \out1, \out2, \out2
+ xvilvl.d \out2, \out3, \out3
+ xvilvh.d \out3, \out3, \out3
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.h \tmp0, \in6, \in4
+ xvilvl.h \tmp1, \in7, \in5
+ xvilvl.h \tmp2, \in2, \in0
+ xvilvl.h \tmp3, \in3, \in1
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvilvh.h \tmp0, \in6, \in4
+ xvilvh.h \tmp1, \in7, \in5
+ xvilvh.h \tmp2, \in2, \in0
+ xvilvh.h \tmp3, \in3, \in1
+
+ xvpickev.d \out0, \tmp4, \tmp6
+ xvpickod.d \out1, \tmp4, \tmp6
+ xvpickev.d \out2, \tmp5, \tmp7
+ xvpickod.d \out3, \tmp5, \tmp7
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvpickev.d \out4, \tmp4, \tmp6
+ xvpickod.d \out5, \tmp4, \tmp6
+ xvpickev.d \out6, \tmp5, \tmp7
+ xvpickod.d \out7, \tmp5, \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1, tmp2
+ xvilvh.h \tmp1, \in0, \in1
+ xvilvl.h \out1, \in0, \in1
+ xvilvh.h \tmp0, \in2, \in3
+ xvilvl.h \out3, \in2, \in3
+
+ xvilvh.w \tmp2, \out3, \out1
+ xvilvl.w \out3, \out3, \out1
+
+ xvilvl.w \out2, \tmp0, \tmp1
+ xvilvh.w \tmp1, \tmp0, \tmp1
+
+ xvilvh.d \out0, \out2, \out3
+ xvilvl.d \out2, \out2, \out3
+ xvilvh.d \out1, \tmp1, \tmp2
+ xvilvl.d \out3, \tmp1, \tmp2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
+ * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
+ * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+
+ xvilvl.w \tmp0, \in1, \in0
+ xvilvh.w \out1, \in1, \in0
+ xvilvl.w \tmp1, \in3, \in2
+ xvilvh.w \out3, \in3, \in2
+
+ xvilvl.d \out0, \tmp1, \tmp0
+ xvilvl.d \out2, \out3, \out1
+ xvilvh.d \out3, \out3, \out1
+ xvilvh.d \out1, \tmp1, \tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * in0 : 1,2,3,4,5,6,7,8
+ * in1 : 2,2,3,4,5,6,7,8
+ * in2 : 3,2,3,4,5,6,7,8
+ * in3 : 4,2,3,4,5,6,7,8
+ * in4 : 5,2,3,4,5,6,7,8
+ * in5 : 6,2,3,4,5,6,7,8
+ * in6 : 7,2,3,4,5,6,7,8
+ * in7 : 8,2,3,4,5,6,7,8
+ *
+ * out0 : 1,2,3,4,5,6,7,8
+ * out1 : 2,2,2,2,2,2,2,2
+ * out2 : 3,3,3,3,3,3,3,3
+ * out3 : 4,4,4,4,4,4,4,4
+ * out4 : 5,5,5,5,5,5,5,5
+ * out5 : 6,6,6,6,6,6,6,6
+ * out6 : 7,7,7,7,7,7,7,7
+ * out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3
+ xvilvl.w \tmp0, \in2, \in0
+ xvilvl.w \tmp1, \in3, \in1
+ xvilvh.w \tmp2, \in2, \in0
+ xvilvh.w \tmp3, \in3, \in1
+ xvilvl.w \out0, \tmp1, \tmp0
+ xvilvh.w \out1, \tmp1, \tmp0
+ xvilvl.w \out2, \tmp3, \tmp2
+ xvilvh.w \out3, \tmp3, \tmp2
+
+ xvilvl.w \tmp0, \in6, \in4
+ xvilvl.w \tmp1, \in7, \in5
+ xvilvh.w \tmp2, \in6, \in4
+ xvilvh.w \tmp3, \in7, \in5
+ xvilvl.w \out4, \tmp1, \tmp0
+ xvilvh.w \out5, \tmp1, \tmp0
+ xvilvl.w \out6, \tmp3, \tmp2
+ xvilvh.w \out7, \tmp3, \tmp2
+
+ xmov \tmp0, \out0
+ xmov \tmp1, \out1
+ xmov \tmp2, \out2
+ xmov \tmp3, \out3
+ xvpermi.q \out0, \out4, 0x02
+ xvpermi.q \out1, \out5, 0x02
+ xvpermi.q \out2, \out6, 0x02
+ xvpermi.q \out3, \out7, 0x02
+ xvpermi.q \out4, \tmp0, 0x31
+ xvpermi.q \out5, \tmp1, 0x31
+ xvpermi.q \out6, \tmp2, 0x31
+ xvpermi.q \out7, \tmp3, 0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * in0 : 1,2,3,4
+ * in1 : 1,2,3,4
+ * in2 : 1,2,3,4
+ * in3 : 1,2,3,4
+ *
+ * out0 : 1,1,1,1
+ * out1 : 2,2,2,2
+ * out2 : 3,3,3,3
+ * out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.d \tmp0, \in1, \in0
+ xvilvh.d \out1, \in1, \in0
+ xvilvh.d \tmp1, \in3, \in2
+ xvilvl.d \out2, \in3, \in2
+
+ xvor.v \out0, \tmp0, \tmp0
+ xvor.v \out3, \tmp1, \tmp1
+
+ xvpermi.q \out0, \out2, 0x02
+ xvpermi.q \out2, \tmp0, 0x31
+ xvpermi.q \out3, \out1, 0x31
+ xvpermi.q \out1, \tmp1, 0x02
+.endm
diff --git a/src/loongarch/loopfilter.S b/src/loongarch/loopfilter.S
new file mode 100644
index 0000000..e71d5a7
--- /dev/null
+++ b/src/loongarch/loopfilter.S
@@ -0,0 +1,1108 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+.macro FILTER_W4 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -2
+ fld.s f6, t5, 0 //p1 p0 q0 q1
+ fldx.s f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.s f8, t5, 0
+ fldx.s f9, t5, a1
+
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr7, vr9, vr8
+ vilvl.h vr6, vr7, vr6 //p1p1p1p1
+ vbsrl.v vr7, vr6, 4 //p0p0p0p0
+ vbsrl.v vr8, vr7, 4 //q0q0q0q0
+ vbsrl.v vr9, vr8, 4 //q1q1q1q1
+.else
+ sub.d t5, a0, a1
+ fld.s f7, t5, 0
+ sub.d t5, t5, a1
+ fld.s f6, t5, 0
+ fld.s f8, a0, 0
+ fldx.s f9, a0, a1
+.endif
+
+ vabsd.bu vr10, vr6, vr7 // (p1 - p0)
+ vabsd.bu vr11, vr9, vr8 // (q1 - q0)
+ vabsd.bu vr12, vr7, vr8 // (p0 - q0)
+ vabsd.bu vr13, vr6, vr9 // (p1 - q1)
+
+ vmax.bu vr14, vr10, vr11
+ vsle.bu vr15, vr14, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I
+ vsadd.bu vr16, vr12, vr12
+ vsrli.b vr17, vr13, 1
+ vsadd.bu vr16, vr16, vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vsle.bu vr16, vr16, vr3
+ vand.v vr20, vr15, vr16 //fm
+
+ vpickve2gr.wu t5, vr20, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W4
+
+ vslt.bu vr16, vr2, vr14 //hev
+
+ vsllwil.h.b vr30, vr20, 0 //expand fm to w
+ vsllwil.w.h vr30, vr30, 0
+
+ vsllwil.hu.bu vr17, vr6, 0
+ vsllwil.hu.bu vr18, vr9, 0
+ vsub.h vr17, vr17, vr18
+ vssrarni.b.h vr17, vr17, 0 //f = iclip_diff(p1 - q1)
+
+ vand.v vr17, vr17, vr16
+ vsllwil.h.b vr18, vr17, 0
+
+ vsllwil.hu.bu vr10, vr8, 0
+ vsllwil.hu.bu vr11, vr7, 0
+ vsub.h vr10, vr10, vr11
+
+ vsadd.h vr11, vr10, vr10
+ vsadd.h vr10, vr10, vr11 //3 * (q0 - p0)
+ vsadd.h vr10, vr10, vr18 //f = iclip_diff(3 * (q0 - p0) + f);
+ vssrani.b.h vr10, vr10, 0
+ vsllwil.h.b vr10, vr10, 0
+
+ vaddi.hu vr11, vr10, 4
+ vaddi.hu vr12, vr10, 3
+ li.w t5, 127
+ vreplgr2vr.h vr13, t5
+ vmin.h vr11, vr11, vr13
+ vmin.h vr12, vr12, vr13
+ vsrai.h vr11, vr11, 3 //f1
+ vsrai.h vr12, vr12, 3 //f2
+
+ vsllwil.hu.bu vr13, vr7, 0 //p0
+ vsllwil.hu.bu vr14, vr8, 0 //q0
+ vsadd.h vr13, vr13, vr12
+ vssub.h vr14, vr14, vr11
+ vssrani.bu.h vr13, vr13, 0 //dst-1
+ vssrani.bu.h vr14, vr14, 0 //dst+0
+
+ vsrari.h vr15, vr11, 1 //f
+ vsllwil.hu.bu vr18, vr6, 0 //p1
+ vsllwil.hu.bu vr19, vr9, 0 //q1
+ vsadd.h vr18, vr18, vr15
+ vssub.h vr19, vr19, vr15
+ vssrani.bu.h vr18, vr18, 0 //dst-2
+ vssrani.bu.h vr19, vr19, 0 //dst+1
+ vbitsel.v vr26, vr18, vr6, vr16
+ vbitsel.v vr29, vr19, vr9, vr16
+
+ vbitsel.v vr6, vr6, vr26, vr20
+ vbitsel.v vr7, vr7, vr13, vr20
+ vbitsel.v vr8, vr8, vr14, vr20
+ vbitsel.v vr9, vr9, vr29, vr20
+
+.ifc \DIR, h
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr9, vr9, vr8
+ vilvl.h vr6, vr9, vr6
+
+ addi.d t5, a0, -2
+ vstelm.w vr6, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.w vr6, t5, 0, 1
+ add.d t5, t5, a1
+ vstelm.w vr6, t5, 0, 2
+ add.d t5, t5, a1
+ vstelm.w vr6, t5, 0, 3
+.else
+ fst.s f8, a0, 0
+ fstx.s f9, a0, a1
+ sub.d t5, a0, a1
+ fst.s f7, t5, 0
+ sub.d t5, t5, a1
+ fst.s f6, t5, 0
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W4:
+.endm
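
The comments inside FILTER_W4 track the AV1 narrow (4-tap) deblocking filter. Reconstructed as scalar C for a single column it reads roughly as follows; this is a hedged sketch in which the helper and parameter names are illustrative, and the E/I/hev inputs are set up by the callers of the macro outside this hunk.

    #include <stdint.h>
    #include <stdlib.h>

    static int clamp_int(const int v, const int lo, const int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* One column of the wd == 4 path: px[-2..1] holds p1, p0, q0, q1; E and I
     * are the edge/inner thresholds; hev is the precomputed high-edge-variance
     * flag (|p1 - p0| > H or |q1 - q0| > H). */
    static void filter_w4_sketch(uint8_t *const px, const int E, const int I,
                                 const int hev)
    {
        const int p1 = px[-2], p0 = px[-1], q0 = px[0], q1 = px[1];
        const int fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
                       abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
        if (!fm) return;

        int f = hev ? clamp_int(p1 - q1, -128, 127) : 0;    /* f masked by hev  */
        f = clamp_int(3 * (q0 - p0) + f, -128, 127);        /* iclip_diff       */
        const int f1 = (f + 4 < 127 ? f + 4 : 127) >> 3;    /* vmin + vsrai 3   */
        const int f2 = (f + 3 < 127 ? f + 3 : 127) >> 3;
        px[-1] = (uint8_t)clamp_int(p0 + f2, 0, 255);
        px[ 0] = (uint8_t)clamp_int(q0 - f1, 0, 255);
        if (!hev) {
            const int f3 = (f1 + 1) >> 1;                   /* vsrari.h ..., 1  */
            px[-2] = (uint8_t)clamp_int(p1 + f3, 0, 255);
            px[ 1] = (uint8_t)clamp_int(q1 - f3, 0, 255);
        }
    }
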
+
+.macro FILTER_W6 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -3
+ fld.d f6, t5, 0 //p2 p1 p0 q0 q1 q2
+ fldx.d f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f8, t5, 0
+ fldx.d f9, t5, a1
+
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr7, vr9, vr8
+ vilvh.h vr10, vr7, vr6
+ vilvl.h vr6, vr7, vr6
+
+ vbsrl.v vr7, vr6, 4 //p1
+ vbsrl.v vr8, vr7, 4 //p0
+ vbsrl.v vr9, vr8, 4 //q0
+ vbsrl.v vr11, vr10, 4 //q2
+.else
+ alsl.d t5, a1, a1, 1
+ sub.d t5, a0, t5
+ fld.d f6, t5, 0
+ fldx.d f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f8, t5, 0
+ fldx.d f9, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f10, t5, 0
+ fldx.d f11, t5, a1
+.endif
+
+ vabsd.bu vr12, vr7, vr8 //abs(p1-p0)
+ vabsd.bu vr13, vr10, vr9 //abs(q1-q0)
+ vmax.bu vr14, vr12, vr13
+ vslt.bu vr2, vr2, vr14 //hev
+ vabsd.bu vr12, vr6, vr7 //abs(p2-p1)
+ vmax.bu vr12, vr12, vr14
+ vabsd.bu vr13, vr11, vr10 //abs(q2-q1)
+ vmax.bu vr12, vr12, vr13
+ vsle.bu vr0, vr12, vr4 // <=I
+
+ vabsd.bu vr13, vr8, vr9 //abs(p0-q0)
+ vsadd.bu vr13, vr13, vr13
+ vabsd.bu vr15, vr7, vr10
+ vsrli.b vr15, vr15, 1
+ vsadd.bu vr13, vr13, vr15
+ vsle.bu vr13, vr13, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ vand.v vr0, vr0, vr13 //fm
+
+ vpickve2gr.wu t5, vr0, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W6
+
+ vabsd.bu vr12, vr6, vr8 //abs(p2-p0)
+ vabsd.bu vr13, vr11, vr9 //abs(q2-q0)
+ vmax.bu vr12, vr12, vr14
+ vmax.bu vr12, vr12, vr13
+ vxor.v vr13, vr13, vr13
+ vaddi.bu vr13, vr13, 1
+ vsle.bu vr1, vr12, vr13 //flat8in
+
+ //6789 10 11 --expand to h
+ vsllwil.hu.bu vr12, vr6, 0
+ vsllwil.hu.bu vr13, vr7, 0
+ vsllwil.hu.bu vr14, vr8, 0
+ vsllwil.hu.bu vr15, vr9, 0
+ vsllwil.hu.bu vr16, vr10, 0
+ vsllwil.hu.bu vr17, vr11, 0
+
+ //dst-2
+ vsadd.hu vr18, vr12, vr12
+ vsadd.hu vr18, vr18, vr12
+ vsadd.hu vr18, vr18, vr13
+ vsadd.hu vr18, vr18, vr13
+ vsadd.hu vr18, vr18, vr14
+ vsadd.hu vr18, vr18, vr14
+ vsadd.hu vr18, vr18, vr15
+
+ //dst-1
+ vsadd.hu vr19, vr18, vr15
+ vsadd.hu vr19, vr19, vr16
+ vssub.hu vr19, vr19, vr12
+ vssub.hu vr19, vr19, vr12
+
+ //dst+0
+ vsadd.hu vr20, vr19, vr17
+ vsadd.hu vr20, vr20, vr16
+ vssub.hu vr20, vr20, vr12
+ vssub.hu vr20, vr20, vr13
+
+ //dst+1
+ vsadd.hu vr21, vr20, vr17
+ vsadd.hu vr21, vr21, vr17
+ vssub.hu vr21, vr21, vr13
+ vssub.hu vr21, vr21, vr14
+
+ vsrari.h vr18, vr18, 3
+ vsrari.h vr19, vr19, 3
+ vsrari.h vr20, vr20, 3
+ vsrari.h vr21, vr21, 3
+
+ vsub.h vr22, vr13, vr16
+ vssrani.b.h vr22, vr22, 0
+ vand.v vr22, vr22, vr2
+ vsllwil.h.b vr22, vr22, 0 //f = iclip_diff(p1 - q1);
+
+ vsub.h vr23, vr15, vr14
+ vsadd.h vr24, vr23, vr23
+ vsadd.h vr23, vr23, vr24
+ vsadd.h vr23, vr23, vr22
+ vssrani.b.h vr23, vr23, 0
+ vsllwil.h.b vr23, vr23, 0 //f = iclip_diff(3 * (q0 - p0) + f);
+
+ vaddi.hu vr24, vr23, 4
+ vaddi.hu vr25, vr23, 3
+ li.w t5, 127
+ vreplgr2vr.h vr3, t5
+ vmin.h vr24, vr24, vr3
+ vmin.h vr25, vr25, vr3
+ vsrai.h vr24, vr24, 3 //f1
+ vsrai.h vr25, vr25, 3 //f2
+
+ vsadd.h vr26, vr14, vr25 //dst-1
+ vssub.h vr27, vr15, vr24 //dst+0
+
+ vsrari.h vr24, vr24, 1
+ vsadd.h vr28, vr13, vr24
+ vssub.h vr29, vr16, vr24
+ vsllwil.h.b vr2, vr2, 0
+ vbitsel.v vr28, vr28, vr13, vr2 //dst-2
+ vbitsel.v vr29, vr29, vr16, vr2 //dst+1
+
+ //flat8in
+ vsllwil.h.b vr1, vr1, 0
+ vbitsel.v vr18, vr28, vr18, vr1
+ vbitsel.v vr19, vr26, vr19, vr1
+ vbitsel.v vr20, vr27, vr20, vr1
+ vbitsel.v vr21, vr29, vr21, vr1
+
+ vssrani.bu.h vr18, vr18, 0
+ vssrani.bu.h vr19, vr19, 0
+ vssrani.bu.h vr20, vr20, 0
+ vssrani.bu.h vr21, vr21, 0
+
+ vbitsel.v vr7, vr7, vr18, vr0 //p1
+ vbitsel.v vr8, vr8, vr19, vr0 //p0
+ vbitsel.v vr9, vr9, vr20, vr0 //q0
+ vbitsel.v vr10, vr10, vr21, vr0 //q1
+
+.ifc \DIR, h
+ vilvl.b vr7, vr8, vr7
+ vilvl.b vr9, vr10, vr9
+ vilvl.h vr7, vr9, vr7
+
+ addi.d t5, a0, -2
+ vstelm.w vr7, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.w vr7, t5, 0, 1
+ add.d t5, t5, a1
+ vstelm.w vr7, t5, 0, 2
+ add.d t5, t5, a1
+ vstelm.w vr7, t5, 0, 3
+.else
+ fst.s f9, a0, 0
+ fstx.s f10, a0, a1
+ sub.d t5, a0, a1
+ fst.s f8, t5, 0
+ sub.d t5, t5, a1
+ fst.s f7, t5, 0
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W6:
+.endm
+
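+// FILTER_W8: wd == 8 luma edge filter. Loads p3..q3, checks fm and the
+// flat8in condition, then selects per lane between the flat averaging over
+// p3..q3, the narrow filter and the unfiltered pixels.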
+.macro FILTER_W8 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -4
+ fld.d f6, t5, 0 //p3 p2 p1 p0 q0 q1 q2 q3
+ fldx.d f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f8, t5, 0
+ fldx.d f9, t5, a1
+
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr7, vr9, vr8
+ vilvh.h vr10, vr7, vr6 //q0
+ vilvl.h vr6, vr7, vr6 //p3
+ vbsrl.v vr7, vr6, 4 //p2
+ vbsrl.v vr8, vr6, 8 //p1
+ vbsrl.v vr9, vr6, 12 //p0
+ vbsrl.v vr11, vr10, 4 //q1
+ vbsrl.v vr12, vr10, 8 //q2
+ vbsrl.v vr13, vr10, 12 //q3
+.else
+ fld.s f10, a0, 0
+ fldx.s f11, a0, a1
+ add.d t5, a0, a1
+ fldx.s f12, t5, a1
+ add.d t5, t5, a1
+ fldx.s f13, t5, a1
+ sub.d t5, a0, a1
+ fld.s f9, t5, 0
+ sub.d t5, t5, a1
+ fld.s f8, t5, 0
+ sub.d t5, t5, a1
+ fld.s f7, t5, 0
+ sub.d t5, t5, a1
+ fld.s f6, t5, 0
+.endif
+
+ vabsd.bu vr14, vr8, vr9 //p1-p0
+ vabsd.bu vr15, vr11, vr10 //q1-q0
+ vabsd.bu vr16, vr9, vr10 //p0-q0
+ vabsd.bu vr17, vr8, vr11 //p1-q1
+ vabsd.bu vr18, vr7, vr8 //p2-p1
+ vabsd.bu vr19, vr12, vr11 //q2-q1
+ vabsd.bu vr20, vr6, vr7 //p3-p2
+ vabsd.bu vr21, vr13, vr12 //q3-q2
+
+ vmax.bu vr22, vr14, vr15
+ vsle.bu vr23, vr22, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I
+ vsadd.bu vr16, vr16, vr16
+ vsrli.b vr17, vr17, 1
+ vsadd.bu vr16, vr16, vr17
+ vsle.bu vr16, vr16, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ vand.v vr16, vr16, vr23 //fm
+
+ vpickve2gr.wu t5, vr16, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W8
+
+ vmax.bu vr23, vr18, vr19
+ vmax.bu vr23, vr23, vr20
+ vmax.bu vr23, vr23, vr21
+ vsle.bu vr23, vr23, vr4
+ vand.v vr16, vr16, vr23 //fm
+
+ vabsd.bu vr17, vr7, vr9 //abs(p2-p0)
+ vabsd.bu vr18, vr12, vr10 //abs(q2-q0)
+ vmax.bu vr17, vr17, vr14
+ vmax.bu vr17, vr17, vr15
+ vmax.bu vr17, vr17, vr18
+ vabsd.bu vr18, vr6, vr9 //abs(p3 - p0)
+ vabsd.bu vr19, vr13, vr10 //abs(q3 - q0)
+ vmax.bu vr17, vr17, vr18
+ vmax.bu vr17, vr17, vr19
+
+ vxor.v vr5, vr5, vr5
+ vaddi.bu vr5, vr5, 1 //F
+ vsle.bu vr17, vr17, vr5 //flat8in
+
+ vsllwil.hu.bu vr0, vr6, 0 //p3
+ vsllwil.hu.bu vr1, vr7, 0 //p2
+ vsllwil.hu.bu vr27, vr8, 0 //p1
+ vsllwil.hu.bu vr3, vr9, 0 //p0
+ vsllwil.hu.bu vr4, vr10, 0 //q0
+ vsllwil.hu.bu vr5, vr11, 0 //q1
+ vsllwil.hu.bu vr14, vr12, 0 //q2
+ vsllwil.hu.bu vr15, vr13, 0 //q3
+
+ vsadd.hu vr18, vr0, vr0 //p3+p3
+ vsadd.hu vr19, vr15, vr15 //q3+q3
+ vsadd.hu vr20, vr0, vr1 //p3+p2
+ vsadd.hu vr21, vr1, vr27 //p2+p1
+ vsadd.hu vr28, vr27, vr3 //p1+p0
+ vsadd.hu vr23, vr3, vr4 //p0+q0
+ vsadd.hu vr24, vr4, vr5 //q0+q1
+ vsadd.hu vr25, vr5, vr14 //q1+q2
+ vsadd.hu vr26, vr14, vr15 //q2+q3
+
+ // dst-3
+ vsadd.hu vr29, vr18, vr20
+ vsadd.hu vr29, vr29, vr21
+ vsadd.hu vr29, vr29, vr23
+
+ // dst-2
+ vsadd.hu vr30, vr18, vr21
+ vsadd.hu vr30, vr30, vr28
+ vsadd.hu vr30, vr30, vr24
+
+ // dst-1
+ vsadd.hu vr31, vr20, vr28
+ vsadd.hu vr31, vr31, vr23
+ vsadd.hu vr31, vr31, vr25
+
+ // dst+0
+ vsadd.hu vr18, vr21, vr23
+ vsadd.hu vr18, vr18, vr24
+ vsadd.hu vr18, vr18, vr26
+
+ //dst+1
+ vsadd.hu vr20, vr28, vr24
+ vsadd.hu vr20, vr20, vr25
+ vsadd.hu vr20, vr20, vr19
+
+ //dst+2
+ vsadd.hu vr21, vr23, vr25
+ vsadd.hu vr21, vr21, vr26
+ vsadd.hu vr21, vr21, vr19
+
+ vssrarni.bu.h vr23, vr29, 3
+ vssrarni.bu.h vr24, vr30, 3
+ vssrarni.bu.h vr25, vr31, 3
+ vssrarni.bu.h vr19, vr18, 3
+ vssrarni.bu.h vr20, vr20, 3
+ vssrarni.bu.h vr21, vr21, 3
+
+ // !flat8in
+ vslt.bu vr2, vr2, vr22 //hev
+
+ vsub.h vr30, vr27, vr5 //p1-q1
+ vssrani.b.h vr30, vr30, 0
+ vand.v vr30, vr30, vr2
+ vsllwil.h.b vr30, vr30, 0
+
+ vsub.h vr31, vr4, vr3
+ vsadd.h vr0, vr31, vr31
+ vsadd.h vr31, vr31, vr0
+ vsadd.h vr31, vr31, vr30
+ vssrani.b.h vr31, vr31, 0
+ vsllwil.h.b vr31, vr31, 0 //f = iclip_diff(3 * (q0 - p0) + f);
+
+ vaddi.hu vr14, vr31, 4
+ vaddi.hu vr15, vr31, 3
+ li.w t5, 127
+ vreplgr2vr.h vr18, t5
+ vmin.h vr14, vr14, vr18
+ vmin.h vr15, vr15, vr18
+ vsrai.h vr14, vr14, 3 //f1
+ vsrai.h vr15, vr15, 3 //f2
+
+ vsadd.h vr3, vr3, vr15
+ vssub.h vr4, vr4, vr14
+ vssrani.bu.h vr3, vr3, 0 //dst-1
+ vssrani.bu.h vr4, vr4, 0 //dst+0
+
+ vsrari.h vr14, vr14, 1
+ vsadd.h vr18, vr27, vr14
+ vssub.h vr26, vr5, vr14
+ vssrani.bu.h vr18, vr18, 0 //dst-2
+ vssrani.bu.h vr26, vr26, 0 //dst+1
+
+ vbitsel.v vr27, vr18, vr8, vr2 //dst-2
+ vbitsel.v vr28, vr26, vr11, vr2 //dst+1
+
+ vbitsel.v vr23, vr7, vr23, vr17 //dst-3 (p2)
+ vbitsel.v vr24, vr27, vr24, vr17 //dst-2
+ vbitsel.v vr25, vr3, vr25, vr17 //dst-1
+ vbitsel.v vr19, vr4, vr19, vr17 //dst+0
+ vbitsel.v vr20, vr28, vr20, vr17 //dst+1
+ vbitsel.v vr21, vr12, vr21, vr17 //dst+2
+
+ vbitsel.v vr7, vr7, vr23, vr16 //-3
+ vbitsel.v vr8, vr8, vr24, vr16 //-2
+ vbitsel.v vr9, vr9, vr25, vr16 //-1
+ vbitsel.v vr10, vr10, vr19, vr16 //+0
+ vbitsel.v vr11, vr11, vr20, vr16 //+1
+ vbitsel.v vr12, vr12, vr21, vr16 //+2
+
+.ifc \DIR, h
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr8, vr9, vr8
+ vilvl.b vr10, vr11, vr10
+ vilvl.b vr12, vr13, vr12
+ vilvl.h vr6, vr8, vr6 //p3p2p1p0 -- -- --
+ vilvl.h vr10, vr12, vr10 //q0q1q2q3 -- -- --
+ vilvl.w vr0, vr10, vr6 //p3p2p1p0q0q1q2q3 --
+ vilvh.w vr1, vr10, vr6 //--
+
+ addi.d t5, a0, -4
+ vstelm.d vr0, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.d vr0, t5, 0, 1
+ add.d t5, t5, a1
+ vstelm.d vr1, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.d vr1, t5, 0, 1
+.else
+ alsl.d t5, a1, a1, 1
+ sub.d t5, a0, t5
+ fst.s f7, t5, 0
+ fstx.s f8, t5, a1
+ add.d t5, t5, a1
+ fstx.s f9, t5, a1
+
+ fst.s f10, a0, 0
+ add.d t5, a0, a1
+ fst.s f11, t5, 0
+ fstx.s f12, t5, a1
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W8:
+.endm
+
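+// FILTER_W16: wd == 16 luma edge filter. Adds the flat8out test on p6..q6;
+// where flat8out && flat8in holds it applies the wide smoothing over
+// p6..q6, otherwise the flat8in averaging over p3..q3 or the narrow filter.
+// The v variant spills p5..q5 to the stack and reloads them for the final
+// conditional stores.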
+.macro FILTER_W16 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -7
+ vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
+ vldx vr7, t5, a1
+ add.d t5, t5, a1
+ vldx vr8, t5, a1
+ add.d t5, t5, a1
+ vldx vr9, t5, a1
+
+ vilvl.b vr10, vr7, vr6
+ vilvh.b vr11, vr7, vr6
+ vilvl.b vr12, vr9, vr8
+ vilvh.b vr13, vr9, vr8
+ vilvl.h vr6, vr12, vr10
+ vilvh.h vr10, vr12, vr10 //p2---
+ vilvl.h vr15, vr13, vr11 //q1---
+ vilvh.h vr19, vr13, vr11
+
+ vbsrl.v vr7, vr6, 4 //p5---
+ vbsrl.v vr8, vr6, 8 //p4---
+ vbsrl.v vr9, vr6, 12 //p3---
+ vbsrl.v vr12, vr10, 4 //p1---
+ vbsrl.v vr13, vr10, 8 //p0---
+ vbsrl.v vr14, vr10, 12 //q0---
+ vbsrl.v vr16, vr15, 4 //q2---
+ vbsrl.v vr17, vr15, 8 //q3---
+ vbsrl.v vr18, vr15, 12 //q4---
+ vbsrl.v vr20, vr19, 4 //q6---
+.else
+ slli.d t5, a1, 3
+ sub.d t5, a0, t5
+ fldx.s f6, t5, a1 //p6
+ alsl.d t5, a1, t5, 1
+ fld.s f7, t5, 0 //p5
+ fldx.s f8, t5, a1 //p4
+ alsl.d t5, a1, t5, 1
+ fld.s f9, t5, 0 //p3
+ fldx.s f10, t5, a1 //p2
+ alsl.d t5, a1, t5, 1
+ fld.s f12, t5, 0 //p1
+ fldx.s f13, t5, a1 //p0
+ alsl.d t5, a1, t5, 1
+ fld.s f14, t5, 0 //q0
+ fldx.s f15, t5, a1 //q1
+ alsl.d t5, a1, t5, 1
+ fld.s f16, t5, 0 //q2
+ fldx.s f17, t5, a1 //q3
+ alsl.d t5, a1, t5, 1
+ fld.s f18, t5, 0 //q4
+ fldx.s f19, t5, a1 //q5
+ add.d t5, t5, a1
+ fldx.s f20, t5, a1 //q6
+
+ //temp store
+ addi.d sp, sp, -96
+ fst.d f7, sp, 0
+ fst.d f8, sp, 8
+ fst.d f9, sp, 16
+ fst.d f10, sp, 24
+ fst.d f12, sp, 32
+ fst.d f13, sp, 40
+ fst.d f14, sp, 48
+ fst.d f15, sp, 56
+ fst.d f16, sp, 64
+ fst.d f17, sp, 72
+ fst.d f18, sp, 80
+ fst.d f19, sp, 88
+.endif
+
+ vabsd.bu vr21, vr12, vr13 //abs(p1-p0)
+ vabsd.bu vr22, vr15, vr14 //abs(q1-q0)
+ vmax.bu vr0, vr21, vr22
+ vslt.bu vr2, vr2, vr0 //hev
+ vabsd.bu vr1, vr10, vr12 //abs(p2-p1)
+ vmax.bu vr0, vr0, vr1
+ vabsd.bu vr1, vr16, vr15 //abs(q2-q1)
+ vmax.bu vr0, vr0, vr1
+ vabsd.bu vr1, vr9, vr10 //abs(p3-p2)
+ vmax.bu vr0, vr0, vr1
+ vabsd.bu vr1, vr17, vr16 //abs(q3-q2)
+ vmax.bu vr0, vr0, vr1
+ vsle.bu vr0, vr0, vr4 //vr4 released I
+ vabsd.bu vr1, vr13, vr14 //abs(p0-q0)
+ vsadd.bu vr1, vr1, vr1
+ vabsd.bu vr4, vr12, vr15 //abs(p1-q1)
+ vsrli.b vr4, vr4, 1
+ vsadd.bu vr1, vr1, vr4 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vsle.bu vr1, vr1, vr3 //vr3 released E
+ vand.v vr0, vr0, vr1 //fm
+
+ vpickve2gr.wu t5, vr0, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W16
+
+ vabsd.bu vr1, vr6, vr13 //abs(p6-p0)
+ vabsd.bu vr4, vr7, vr13 //abs(p5-p0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr8, vr13 //abs(p4-p0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr18, vr14 //abs(q4-q0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr19, vr14 //abs(q5-q0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr20, vr14
+ vmax.bu vr1, vr1, vr4
+ vxor.v vr5, vr5, vr5
+ vaddi.bu vr5, vr5, 1 //F
+ vsle.bu vr1, vr1, vr5 //flat8out
+
+ vabsd.bu vr3, vr10, vr13 //abs(p2-p0)
+ vmax.bu vr3, vr3, vr21
+ vmax.bu vr3, vr3, vr22
+ vabsd.bu vr4, vr16, vr14 //abs(q2-q0)
+ vmax.bu vr3, vr3, vr4
+ vabsd.bu vr4, vr9, vr13 //abs(p3-p0)
+ vmax.bu vr3, vr3, vr4
+ vabsd.bu vr4, vr17, vr14 //abs(q3-q0)
+ vmax.bu vr3, vr3, vr4
+ vsle.bu vr3, vr3, vr5 //flat8in, vr5 released
+
+ vsllwil.hu.bu vr6, vr6, 0 //p6
+ vsllwil.hu.bu vr7, vr7, 0 //p5
+ vsllwil.hu.bu vr8, vr8, 0 //p4
+ vsllwil.hu.bu vr9, vr9, 0 //p3
+ vsllwil.hu.bu vr10, vr10, 0 //p2
+ vsllwil.hu.bu vr12, vr12, 0 //p1
+ vsllwil.hu.bu vr13, vr13, 0 //p0
+ vsllwil.hu.bu vr14, vr14, 0 //q0
+ vsllwil.hu.bu vr15, vr15, 0 //q1
+ vsllwil.hu.bu vr16, vr16, 0 //q2
+ vsllwil.hu.bu vr17, vr17, 0 //q3
+ vsllwil.hu.bu vr18, vr18, 0 //q4
+ vsllwil.hu.bu vr19, vr19, 0 //q5
+ vsllwil.hu.bu vr20, vr20, 0 //q6
+
+ //dst-6
+ vslli.w vr21, vr6, 3
+ vssub.hu vr21, vr21, vr6
+ vsadd.hu vr21, vr21, vr7
+ vsadd.hu vr21, vr21, vr7
+ vsadd.hu vr21, vr21, vr8
+ vsadd.hu vr21, vr21, vr8
+ vsadd.hu vr21, vr21, vr9
+ vsadd.hu vr21, vr21, vr10
+ vsadd.hu vr21, vr21, vr12
+ vsadd.hu vr21, vr21, vr13
+ vsadd.hu vr21, vr21, vr14
+
+ //dst-5
+ vsadd.hu vr22, vr21, vr15
+ vsadd.hu vr22, vr22, vr9
+ vssub.hu vr22, vr22, vr6
+ vssub.hu vr22, vr22, vr6
+
+ //dst-4
+ vsadd.hu vr23, vr22, vr16
+ vsadd.hu vr23, vr23, vr10
+ vssub.hu vr23, vr23, vr7
+ vssub.hu vr23, vr23, vr6
+
+ //dst-3
+ vsadd.hu vr24, vr23, vr12
+ vsadd.hu vr24, vr24, vr17
+ vssub.hu vr24, vr24, vr6
+ vssub.hu vr24, vr24, vr8
+
+ //dst-2
+ vsadd.hu vr25, vr24, vr18
+ vsadd.hu vr25, vr25, vr13
+ vssub.hu vr25, vr25, vr6
+ vssub.hu vr25, vr25, vr9
+
+ //dst-1
+ vsadd.hu vr26, vr25, vr19
+ vsadd.hu vr26, vr26, vr14
+ vssub.hu vr26, vr26, vr6
+ vssub.hu vr26, vr26, vr10
+
+ //dst+0
+ vsadd.hu vr27, vr26, vr20
+ vsadd.hu vr27, vr27, vr15
+ vssub.hu vr27, vr27, vr6
+ vssub.hu vr27, vr27, vr12
+
+ //dst+1
+ vsadd.hu vr28, vr27, vr20
+ vsadd.hu vr28, vr28, vr16
+ vssub.hu vr28, vr28, vr7
+ vssub.hu vr28, vr28, vr13
+
+ //dst+2
+ vsadd.hu vr29, vr28, vr20
+ vsadd.hu vr29, vr29, vr17
+ vssub.hu vr29, vr29, vr8
+ vssub.hu vr29, vr29, vr14
+
+ //dst+3
+ vsadd.hu vr30, vr29, vr20
+ vsadd.hu vr30, vr30, vr18
+ vssub.hu vr30, vr30, vr9
+ vssub.hu vr30, vr30, vr15
+
+ //dst+4
+ vsadd.hu vr31, vr30, vr20
+ vsadd.hu vr31, vr31, vr19
+ vssub.hu vr31, vr31, vr10
+ vssub.hu vr31, vr31, vr16
+
+ //dst+5
+ vsadd.hu vr11, vr31, vr20
+ vsadd.hu vr11, vr11, vr20
+ vssub.hu vr11, vr11, vr12
+ vssub.hu vr11, vr11, vr17
+
+ vsrari.h vr21, vr21, 4
+ vsrari.h vr22, vr22, 4
+ vsrari.h vr23, vr23, 4
+ vsrari.h vr24, vr24, 4
+ vsrari.h vr25, vr25, 4
+ vsrari.h vr26, vr26, 4
+ vsrari.h vr27, vr27, 4
+ vsrari.h vr28, vr28, 4
+ vsrari.h vr29, vr29, 4
+ vsrari.h vr30, vr30, 4
+ vsrari.h vr31, vr31, 4
+ vsrari.h vr11, vr11, 4
+
+ vand.v vr1, vr1, vr3
+ vsllwil.h.b vr1, vr1, 0 //expand to h
+ //(flat8out & flat8in)
+ vbitsel.v vr21, vr7, vr21, vr1 //dst-6
+ vbitsel.v vr22, vr8, vr22, vr1 //dst-5
+ vbitsel.v vr23, vr9, vr23, vr1 //dst-4
+ vbitsel.v vr30, vr17, vr30, vr1 //dst+3
+ vbitsel.v vr31, vr18, vr31, vr1 //dst+4
+ vbitsel.v vr11, vr19, vr11, vr1 //dst+5
+
+ //flat8in
+ //dst-3
+ vslli.h vr4, vr9, 1
+ vsadd.hu vr4, vr4, vr9 //p3*3
+ vsadd.hu vr4, vr4, vr10
+ vsadd.hu vr4, vr4, vr10
+ vsadd.hu vr4, vr4, vr12
+ vsadd.hu vr4, vr4, vr13
+ vsadd.hu vr4, vr4, vr14
+
+ //dst-2
+ vsadd.hu vr5, vr4, vr12
+ vsadd.hu vr5, vr5, vr15
+ vssub.hu vr5, vr5, vr9
+ vssub.hu vr5, vr5, vr10
+
+ //dst-1
+ vsadd.hu vr18, vr5, vr13
+ vsadd.hu vr18, vr18, vr16
+ vssub.hu vr18, vr18, vr9
+ vssub.hu vr18, vr18, vr12
+
+ //dst+0
+ vsadd.hu vr7, vr18, vr14
+ vsadd.hu vr7, vr7, vr17
+ vssub.hu vr7, vr7, vr9
+ vssub.hu vr7, vr7, vr13
+
+ //dst+1
+ vsadd.hu vr8, vr7, vr15
+ vsadd.hu vr8, vr8, vr17
+ vssub.hu vr8, vr8, vr10
+ vssub.hu vr8, vr8, vr14
+
+ //dst+2
+ vsadd.hu vr9, vr8, vr16
+ vsadd.hu vr9, vr9, vr17
+ vssub.hu vr9, vr9, vr12
+ vssub.hu vr9, vr9, vr15
+
+ vsrari.h vr4, vr4, 3
+ vsrari.h vr5, vr5, 3
+ vsrari.h vr18, vr18, 3
+ vsrari.h vr7, vr7, 3
+ vsrari.h vr8, vr8, 3
+ vsrari.h vr9, vr9, 3
+
+ //flat8out & flat8in
+ vbitsel.v vr24, vr4, vr24, vr1 //dst-3
+ vbitsel.v vr25, vr5, vr25, vr1 //dst-2
+ vbitsel.v vr26, vr18, vr26, vr1 //dst-1
+ vbitsel.v vr27, vr7, vr27, vr1 //dst+0
+ vbitsel.v vr28, vr8, vr28, vr1 //dst+1
+ vbitsel.v vr29, vr9, vr29, vr1 //dst+2
+
+ //!flat8in
+ vsub.h vr17, vr12, vr15 //p1-q1
+ vsllwil.h.b vr2, vr2, 0
+ vand.v vr17, vr17, vr2 //&hev
+ vssrani.b.h vr17, vr17, 0
+ vsllwil.h.b vr17, vr17, 0
+
+ vsub.h vr7, vr14, vr13
+ vsadd.h vr8, vr7, vr7
+ vsadd.h vr7, vr7, vr8
+ vsadd.h vr7, vr7, vr17
+ vssrani.b.h vr7, vr7, 0
+ vsllwil.h.b vr17, vr7, 0 //f = iclip_diff(3 * (q0 - p0) + f);
+
+ vaddi.hu vr7, vr17, 4
+ vaddi.hu vr8, vr17, 3
+ li.w t5, 127
+ vreplgr2vr.h vr9, t5
+ vmin.h vr7, vr7, vr9
+ vmin.h vr8, vr8, vr9
+ vsrai.h vr7, vr7, 3 //f1
+ vsrai.h vr8, vr8, 3 //f2
+
+ vsadd.h vr4, vr13, vr8 //dst-1
+ vssub.h vr5, vr14, vr7 //dst+0
+
+ vsrari.h vr7, vr7, 1
+ vsadd.h vr17, vr12, vr7
+ vssub.h vr7, vr15, vr7
+ vbitsel.v vr17, vr17, vr12, vr2 //dst-2
+ vbitsel.v vr7, vr7, vr15, vr2 //dst+1
+
+ //flat8in or !flat8in
+ vsllwil.h.b vr3, vr3, 0
+ vbitsel.v vr24, vr10, vr24, vr3 //dst-3
+ vbitsel.v vr25, vr17, vr25, vr3 //dst-2
+ vbitsel.v vr26, vr4, vr26, vr3 //dst-1
+ vbitsel.v vr27, vr5, vr27, vr3 //dst+0
+ vbitsel.v vr28, vr7, vr28, vr3 //dst+1
+ vbitsel.v vr29, vr16, vr29, vr3 //dst+2
+
+.ifc \DIR, h
+ //dst-6,dst-2,dst-5,dst-1
+ vssrani.bu.h vr25, vr21, 0
+ vssrani.bu.h vr26, vr22, 0
+ vpermi.w vr25, vr25, 0xd8
+ vpermi.w vr26, vr26, 0xd8
+ vilvl.b vr6, vr26, vr25 //65656565 21212121
+
+ //dst-4,dst+0,dst-3,dst+1
+ vssrani.bu.h vr27, vr23, 0
+ vssrani.bu.h vr28, vr24, 0
+ vpermi.w vr27, vr27, 0xd8
+ vpermi.w vr28, vr28, 0xd8
+ vilvl.b vr26, vr28, vr27 //43434343 01010101
+
+ vilvl.h vr21, vr26, vr6 //6543 -- -- --
+ vilvh.h vr22, vr26, vr6 //2101 -- -- --
+ vilvl.w vr20, vr22, vr21 //65432101 --
+ vilvh.w vr22, vr22, vr21 //65432101 --
+ vreplvei.d vr21, vr20, 1
+ vreplvei.d vr23, vr22, 1
+
+ //dst+2,dst+4,dst+3,dst+5
+ vssrani.bu.h vr31, vr29, 0
+ vssrani.bu.h vr11, vr30, 0
+ vpermi.w vr31, vr31, 0xd8
+ vpermi.w vr11, vr11, 0xd8
+ vilvl.b vr11, vr11, vr31 //23232323 45454545
+ vshuf4i.w vr11, vr11, 0xd8
+ vshuf4i.h vr11, vr11, 0xd8 //2345 -- -- --
+
+ vextrins.w vr20, vr11, 0x20
+ vextrins.w vr21, vr11, 0x21
+ vextrins.w vr22, vr11, 0x22
+ vextrins.w vr23, vr11, 0x23
+
+ addi.d t5, a0, -6
+ vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
+ vldx vr7, t5, a1
+ add.d t5, t5, a1
+ vldx vr8, t5, a1
+ add.d t5, t5, a1
+ vldx vr9, t5, a1
+
+ //expand fm to 128
+ vreplvei.b vr10, vr0, 0
+ vreplvei.b vr11, vr0, 1
+ vreplvei.b vr12, vr0, 2
+ vreplvei.b vr13, vr0, 3
+
+ vbitsel.v vr20, vr6, vr20, vr10
+ vbitsel.v vr21, vr7, vr21, vr11
+ vbitsel.v vr22, vr8, vr22, vr12
+ vbitsel.v vr23, vr9, vr23, vr13
+
+ addi.d t5, a0, -6
+ vstelm.d vr20, t5, 0, 0
+ vstelm.w vr20, t5, 8, 2
+ add.d t5, t5, a1
+ vstelm.d vr21, t5, 0, 0
+ vstelm.w vr21, t5, 8, 2
+ add.d t5, t5, a1
+ vstelm.d vr22, t5, 0, 0
+ vstelm.w vr22, t5, 8, 2
+ add.d t5, t5, a1
+ vstelm.d vr23, t5, 0, 0
+ vstelm.w vr23, t5, 8, 2
+.else
+ //reload
+ fld.d f7, sp, 0
+ fld.d f8, sp, 8
+ fld.d f9, sp, 16
+ fld.d f10, sp, 24
+ fld.d f12, sp, 32
+ fld.d f13, sp, 40
+ fld.d f14, sp, 48
+ fld.d f15, sp, 56
+ fld.d f16, sp, 64
+ fld.d f17, sp, 72
+ fld.d f18, sp, 80
+ fld.d f19, sp, 88
+
+ vssrarni.bu.h vr21, vr21, 0
+ vssrarni.bu.h vr22, vr22, 0
+ vssrarni.bu.h vr23, vr23, 0
+ vssrarni.bu.h vr24, vr24, 0
+ vssrarni.bu.h vr25, vr25, 0
+ vssrarni.bu.h vr26, vr26, 0
+ vssrarni.bu.h vr27, vr27, 0
+ vssrarni.bu.h vr28, vr28, 0
+ vssrarni.bu.h vr29, vr29, 0
+ vssrarni.bu.h vr30, vr30, 0
+ vssrarni.bu.h vr31, vr31, 0
+ vssrarni.bu.h vr11, vr11, 0
+
+ vbitsel.v vr7, vr7, vr21, vr0 //p5
+ vbitsel.v vr8, vr8, vr22, vr0 //p4
+ vbitsel.v vr9, vr9, vr23, vr0 //p3
+ vbitsel.v vr10, vr10, vr24, vr0 //p2
+ vbitsel.v vr12, vr12, vr25, vr0 //p1
+ vbitsel.v vr13, vr13, vr26, vr0 //p0
+ vbitsel.v vr14, vr14, vr27, vr0 //q0
+ vbitsel.v vr15, vr15, vr28, vr0 //q1
+ vbitsel.v vr16, vr16, vr29, vr0 //q2
+ vbitsel.v vr17, vr17, vr30, vr0 //q3
+ vbitsel.v vr18, vr18, vr31, vr0 //q4
+ vbitsel.v vr19, vr19, vr11, vr0 //q5
+
+ fst.s f14, a0, 0
+ fstx.s f15, a0, a1
+ alsl.d t5, a1, a0, 1
+ fst.s f16, t5, 0
+ fstx.s f17, t5, a1
+ alsl.d t5, a1, t5, 1
+ fst.s f18, t5, 0
+ fstx.s f19, t5, a1
+
+ slli.w t5, a1, 2
+ alsl.d t5, a1, t5, 1
+ sub.d t5, a0, t5
+ fst.s f7, t5, 0
+ fstx.s f8, t5, a1
+ alsl.d t5, a1, t5, 1
+ fst.s f9, t5, 0
+ fstx.s f10, t5, a1
+ alsl.d t5, a1, t5, 1
+ fst.s f12, t5, 0
+ fstx.s f13, t5, a1
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W16:
+.ifc \DIR, v
+ addi.d sp, sp, 96
+.endif
+.endm
+
+.macro PUSH_REG
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+.endm
+.macro POP_REG
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.endm
+
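+// LPF_FUNC: superblock edge filter entry point for one direction/plane.
+// Walks the 4-pixel units selected by the vmask words in a2, derives the
+// filter level L from the level buffer a3 (falling back to l[-1][0] when
+// l[0][0] is zero), looks up E and I in the limit LUT at a5, takes
+// H = L >> 4 and dispatches to the widest filter enabled for that unit.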
+.macro LPF_FUNC DIR, TYPE
+function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
+ PUSH_REG
+ vld vr0, a2, 0 //vmask
+ vpickve2gr.wu t0, vr0, 0
+ vpickve2gr.wu t1, vr0, 1
+ vpickve2gr.wu t2, vr0, 2
+ li.w t3, 1 //y
+ or t0, t0, t1
+.ifc \TYPE, y
+ or t0, t0, t2 //vm
+.endif
+ addi.w t8, t3, -1
+ andn t8, t0, t8
+ beqz t0, .\DIR\()\TYPE\()_END
+.\DIR\()\TYPE\()_LOOP:
+ and t4, t0, t3 //vm & y
+ beqz t4, .\DIR\()\TYPE\()_LOOP_NEXT
+ vldrepl.b vr1, a3, 0 //l[0][0]
+.ifc \DIR, h
+ addi.d t5, a3, -4
+.else
+ slli.d t5, a4, 2
+ sub.d t5, a3, t5
+.endif
+ vldrepl.b vr2, t5, 0 //l[-1][0]
+ vseqi.b vr3, vr1, 0
+ vbitsel.v vr1, vr1, vr2, vr3 //L
+ vpickve2gr.b t5, vr1, 0
+ beqz t5, .\DIR\()\TYPE\()_LOOP_NEXT
+ vsrai.b vr2, vr1, 4 //H
+ add.d t6, a5, t5
+ vldrepl.b vr3, t6, 0 //E
+ addi.d t6, t6, 64
+ vldrepl.b vr4, t6, 0 //I
+.ifc \TYPE, y
+ and t5, t2, t3
+ bnez t5, .FILTER_\DIR\()\TYPE\()_16
+.endif
+ and t5, t1, t3
+.ifc \TYPE, y
+ bnez t5, .FILTER_\DIR\()\TYPE\()_8
+.else
+ bnez t5, .FILTER_\DIR\()\TYPE\()_6
+.endif
+ FILTER_W4 \DIR, \TYPE
+ b .\DIR\()\TYPE\()_LOOP_NEXT
+.ifc \TYPE, uv
+.FILTER_\DIR\()\TYPE\()_6:
+ FILTER_W6 \DIR, \TYPE
+.endif
+.ifc \TYPE, y
+.FILTER_\DIR\()\TYPE\()_8:
+ FILTER_W8 \DIR, \TYPE
+ b .\DIR\()\TYPE\()_LOOP_NEXT
+.FILTER_\DIR\()\TYPE\()_16:
+ FILTER_W16 \DIR, \TYPE
+.endif
+.\DIR\()\TYPE\()_LOOP_NEXT:
+ slli.w t3, t3, 1
+.ifc \DIR, h
+ alsl.d a0, a1, a0, 2
+ slli.w t8, a4, 2
+ add.d a3, a3, t8
+.else
+ addi.d a0, a0, 4
+ addi.d a3, a3, 4
+.endif
+ addi.w t8, t3, -1
+ andn t8, t0, t8
+ bnez t8, .\DIR\()\TYPE\()_LOOP
+.\DIR\()\TYPE\()_END:
+ POP_REG
+endfunc
+.endm
+
+LPF_FUNC h, y
+LPF_FUNC v, y
+LPF_FUNC h, uv
+LPF_FUNC v, uv
diff --git a/src/loongarch/loopfilter.h b/src/loongarch/loopfilter.h
new file mode 100644
index 0000000..844faf0
--- /dev/null
+++ b/src/loongarch/loopfilter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H
+#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx));
+
+static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+#if BITDEPTH == 8
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx);
+#endif
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */
diff --git a/src/loongarch/looprestoration.S b/src/loongarch/looprestoration.S
new file mode 100644
index 0000000..ab512d1
--- /dev/null
+++ b/src/loongarch/looprestoration.S
@@ -0,0 +1,1407 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+#define REST_UNIT_STRIDE (400)
+
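+// MADD_HU_BU: widen the unsigned bytes of \in0 to halfwords (low half via
+// vsllwil, high half via vexth) and multiply-accumulate them with the
+// halfword tap vector \in1 into the accumulators \out0 / \out1.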
+.macro MADD_HU_BU in0, in1, out0, out1
+ vsllwil.hu.bu vr12, \in0, 0
+ vexth.hu.bu vr13, \in0
+ vmadd.h \out0, vr12, \in1
+ vmadd.h \out1, vr13, \in1
+.endm
+
+const wiener_shuf
+.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+endconst
+
+/*
+void wiener_filter_h_lsx(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h)
+*/
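+/*
+Reference sketch (not part of the build) of the per-pixel sum the vector
+loop below computes; the << 7 term supplies the implicit +128 of the
+center tap on top of filterh[3]:
+    sum = (1 << 14) + (tmp_ptr[i + 3] << 7);
+    for (int k = 0; k < 7; k++)
+        sum += tmp_ptr[i + k] * filterh[k];
+    hor_ptr[i] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1);
+*/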
+function wiener_filter_h_8bpc_lsx
+ addi.d sp, sp, -40
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ li.w t7, 1<<14 // clip_limit
+
+ la.local t1, wiener_shuf
+ vld vr4, t1, 0
+ vld vr14, a2, 0 // filter[0][k]
+ vreplvei.h vr21, vr14, 0
+ vreplvei.h vr22, vr14, 1
+ vreplvei.h vr23, vr14, 2
+ vreplvei.h vr24, vr14, 3
+ vreplvei.h vr25, vr14, 4
+ vreplvei.h vr26, vr14, 5
+ vreplvei.h vr27, vr14, 6
+ vreplgr2vr.w vr0, t7
+
+.WIENER_FILTER_H_H:
+ addi.w a4, a4, -1 // h
+ addi.w t0, a3, 0 // w
+ addi.d t1, a1, 0 // tmp_ptr
+ addi.d t2, a0, 0 // hor_ptr
+
+.WIENER_FILTER_H_W:
+ addi.w t0, t0, -16
+ vld vr5, t1, 0
+ vld vr13, t1, 16
+
+ vsubi.bu vr14, vr4, 2
+ vsubi.bu vr15, vr4, 1
+ vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16
+ vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17
+ vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18
+ vaddi.bu vr14, vr4, 1
+ vaddi.bu vr15, vr4, 2
+ vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19
+ vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20
+ vaddi.bu vr14, vr4, 3
+ vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21
+
+ vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10
+ vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18
+ vsllwil.wu.hu vr17, vr15, 0 // 3 4 5 6
+ vexth.wu.hu vr18, vr15 // 7 8 9 10
+ vsllwil.wu.hu vr19, vr16, 0 // 11 12 13 14
+ vexth.wu.hu vr20, vr16 // 15 16 17 18
+ vslli.w vr17, vr17, 7
+ vslli.w vr18, vr18, 7
+ vslli.w vr19, vr19, 7
+ vslli.w vr20, vr20, 7
+ vxor.v vr15, vr15, vr15
+ vxor.v vr14, vr14, vr14
+
+ MADD_HU_BU vr5, vr21, vr14, vr15
+ MADD_HU_BU vr6, vr22, vr14, vr15
+ MADD_HU_BU vr7, vr23, vr14, vr15
+ MADD_HU_BU vr8, vr24, vr14, vr15
+ MADD_HU_BU vr9, vr25, vr14, vr15
+ MADD_HU_BU vr10, vr26, vr14, vr15
+ MADD_HU_BU vr11, vr27, vr14, vr15
+
+ vsllwil.w.h vr5, vr14, 0 // 0 1 2 3
+ vexth.w.h vr6, vr14 // 4 5 6 7
+ vsllwil.w.h vr7, vr15, 0 // 8 9 10 11
+ vexth.w.h vr8, vr15 // 12 13 14 15
+ vadd.w vr17, vr17, vr5
+ vadd.w vr18, vr18, vr6
+ vadd.w vr19, vr19, vr7
+ vadd.w vr20, vr20, vr8
+ vadd.w vr17, vr17, vr0
+ vadd.w vr18, vr18, vr0
+ vadd.w vr19, vr19, vr0
+ vadd.w vr20, vr20, vr0
+
+ vsrli.w vr1, vr0, 1
+ vsubi.wu vr1, vr1, 1
+ vxor.v vr3, vr3, vr3
+ vsrari.w vr17, vr17, 3
+ vsrari.w vr18, vr18, 3
+ vsrari.w vr19, vr19, 3
+ vsrari.w vr20, vr20, 3
+ vclip.w vr17, vr17, vr3, vr1
+ vclip.w vr18, vr18, vr3, vr1
+ vclip.w vr19, vr19, vr3, vr1
+ vclip.w vr20, vr20, vr3, vr1
+
+ vst vr17, t2, 0
+ vst vr18, t2, 16
+ vst vr19, t2, 32
+ vst vr20, t2, 48
+ addi.d t1, t1, 16
+ addi.d t2, t2, 64
+ blt zero, t0, .WIENER_FILTER_H_W
+
+ addi.d a1, a1, REST_UNIT_STRIDE
+ addi.d a0, a0, (REST_UNIT_STRIDE << 2)
+ bnez a4, .WIENER_FILTER_H_H
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ addi.d sp, sp, 40
+endfunc
+
+.macro APPLY_FILTER in0, in1, in2
+ alsl.d t7, \in0, \in1, 2
+ vld vr10, t7, 0
+ vld vr11, t7, 16
+ vld vr12, t7, 32
+ vld vr13, t7, 48
+ vmadd.w vr14, vr10, \in2
+ vmadd.w vr15, vr11, \in2
+ vmadd.w vr16, vr12, \in2
+ vmadd.w vr17, vr13, \in2
+.endm
+
+.macro wiener_filter_v_8bpc_core_lsx
+ vreplgr2vr.w vr14, t6
+ vreplgr2vr.w vr15, t6
+ vreplgr2vr.w vr16, t6
+ vreplgr2vr.w vr17, t6
+
+ addi.w t7, t2, 0 // j + index k
+ mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
+ add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i
+
+ APPLY_FILTER t7, a2, vr2
+ APPLY_FILTER t8, t7, vr3
+ APPLY_FILTER t8, t7, vr4
+ APPLY_FILTER t8, t7, vr5
+ APPLY_FILTER t8, t7, vr6
+ APPLY_FILTER t8, t7, vr7
+ APPLY_FILTER t8, t7, vr8
+ vssrarni.hu.w vr15, vr14, 11
+ vssrarni.hu.w vr17, vr16, 11
+ vssrlni.bu.h vr17, vr15, 0
+.endm
+
+/*
+void wiener_filter_v_lsx(uint8_t *p,
+ const ptrdiff_t p_stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h)
+*/
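+/*
+Reference sketch (not part of the build) of what the core macro below
+accumulates per pixel before the saturating narrow to bytes:
+    sum = -(1 << 18);
+    for (int k = 0; k < 7; k++)
+        sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+    p[j * p_stride + i] = iclip_u8((sum + 1024) >> 11);
+*/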
+function wiener_filter_v_8bpc_lsx
+ li.w t6, -(1 << 18)
+
+ li.w t8, REST_UNIT_STRIDE
+ ld.h t0, a3, 0
+ ld.h t1, a3, 2
+ vreplgr2vr.w vr2, t0
+ vreplgr2vr.w vr3, t1
+ ld.h t0, a3, 4
+ ld.h t1, a3, 6
+ vreplgr2vr.w vr4, t0
+ vreplgr2vr.w vr5, t1
+ ld.h t0, a3, 8
+ ld.h t1, a3, 10
+ vreplgr2vr.w vr6, t0
+ vreplgr2vr.w vr7, t1
+ ld.h t0, a3, 12
+ vreplgr2vr.w vr8, t0
+
+ andi t1, a4, 0xf
+ sub.w t0, a4, t1 // w-w%16
+ or t2, zero, zero // j
+ or t4, zero, zero
+ beqz t0, .WIENER_FILTER_V_W_LT16
+
+.WIENER_FILTER_V_H:
+ andi t1, a4, 0xf
+ add.d t3, zero, a0 // p
+ or t4, zero, zero // i
+
+.WIENER_FILTER_V_W:
+
+ wiener_filter_v_8bpc_core_lsx
+
+ mul.w t5, t2, a1 // j * stride
+ add.w t5, t5, t4 // j * stride + i
+ add.d t3, a0, t5
+ addi.w t4, t4, 16
+ vst vr17, t3, 0
+ bne t0, t4, .WIENER_FILTER_V_W
+
+ beqz t1, .WIENER_FILTER_V_W_EQ16
+
+ wiener_filter_v_8bpc_core_lsx
+
+ addi.d t3, t3, 16
+ andi t1, a4, 0xf
+
+.WIENER_FILTER_V_ST_REM:
+ vstelm.b vr17, t3, 0, 0
+ vbsrl.v vr17, vr17, 1
+ addi.d t3, t3, 1
+ addi.w t1, t1, -1
+ bnez t1, .WIENER_FILTER_V_ST_REM
+.WIENER_FILTER_V_W_EQ16:
+ addi.w t2, t2, 1
+ blt t2, a5, .WIENER_FILTER_V_H
+ b .WIENER_FILTER_V_END
+
+.WIENER_FILTER_V_W_LT16:
+ andi t1, a4, 0xf
+ add.d t3, zero, a0
+
+ wiener_filter_v_8bpc_core_lsx
+
+ mul.w t5, t2, a1 // j * stride
+ add.d t3, a0, t5
+
+.WIENER_FILTER_V_ST_REM_1:
+ vstelm.b vr17, t3, 0, 0
+ vbsrl.v vr17, vr17, 1
+ addi.d t3, t3, 1
+ addi.w t1, t1, -1
+ bnez t1, .WIENER_FILTER_V_ST_REM_1
+
+ addi.w t2, t2, 1
+ blt t2, a5, .WIENER_FILTER_V_W_LT16
+
+.WIENER_FILTER_V_END:
+endfunc
+
+/*
+void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
+ const int w, const int h)
+*/
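+// First pass of the 3x3 box sums: for each 16-pixel batch, accumulate three
+// vertically adjacent src rows into sum[] (a + b + c) and sumsq[]
+// (a*a + b*b + c*c).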
+function boxsum3_h_8bpc_lsx
+ addi.d a2, a2, REST_UNIT_STRIDE
+ li.w t0, 1
+ addi.w a3, a3, -2
+ addi.w a4, a4, -4
+
+.LBS3_H_H:
+ alsl.d t1, t0, a1, 1 // coef *sum_v = sum + x
+ alsl.d t2, t0, a0, 2 // int32_t *sumsq_v = sumsq + x
+ add.d t3, t0, a2 // s
+ addi.w t5, a3, 0
+.LBS3_H_W:
+ vld vr0, t3, 0
+ vld vr1, t3, REST_UNIT_STRIDE
+ vld vr2, t3, (REST_UNIT_STRIDE<<1)
+
+ vilvl.b vr3, vr1, vr0
+ vhaddw.hu.bu vr4, vr3, vr3
+ vilvh.b vr5, vr1, vr0
+ vhaddw.hu.bu vr6, vr5, vr5
+ vsllwil.hu.bu vr7, vr2, 0
+ vexth.hu.bu vr8, vr2
+ // sum_v
+ vadd.h vr4, vr4, vr7
+ vadd.h vr6, vr6, vr8
+ vst vr4, t1, REST_UNIT_STRIDE<<1
+ vst vr6, t1, (REST_UNIT_STRIDE<<1)+16
+ addi.d t1, t1, 32
+ // sumsq
+ vmulwev.h.bu vr9, vr3, vr3
+ vmulwod.h.bu vr10, vr3, vr3
+ vmulwev.h.bu vr11, vr5, vr5
+ vmulwod.h.bu vr12, vr5, vr5
+ vmul.h vr7, vr7, vr7
+ vmul.h vr8, vr8, vr8
+ vaddwev.w.hu vr13, vr10, vr9
+ vaddwod.w.hu vr14, vr10, vr9
+ vilvl.w vr3, vr14, vr13
+ vilvh.w vr4, vr14, vr13
+ vaddwev.w.hu vr13, vr12, vr11
+ vaddwod.w.hu vr14, vr12, vr11
+ vilvl.w vr15, vr14, vr13
+ vilvh.w vr16, vr14, vr13
+ vsllwil.wu.hu vr9, vr7, 0
+ vexth.wu.hu vr10, vr7
+ vsllwil.wu.hu vr11, vr8, 0
+ vexth.wu.hu vr12, vr8
+ vadd.w vr9, vr9, vr3
+ vadd.w vr10, vr10, vr4
+ vadd.w vr11, vr11, vr15
+ vadd.w vr12, vr12, vr16
+ vst vr9, t2, REST_UNIT_STRIDE<<2
+ vst vr10, t2, (REST_UNIT_STRIDE<<2)+16
+ vst vr11, t2, (REST_UNIT_STRIDE<<2)+32
+ vst vr12, t2, (REST_UNIT_STRIDE<<2)+48
+ addi.d t2, t2, 64
+
+ addi.w t5, t5, -16
+ addi.d t3, t3, 16
+ blt zero, t5, .LBS3_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a2, a2, REST_UNIT_STRIDE
+ addi.d a4, a4, -1
+ blt zero, a4, .LBS3_H_H
+
+.LBS3_H_END:
+endfunc
+
+/*
+void boxsum3_v(int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
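+// Second pass of the 3x3 box sums: add three horizontally adjacent columns
+// of sum[]/sumsq[], carrying the last elements across each 8-wide batch.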
+function boxsum3_v_8bpc_lsx
+ addi.d a0, a0, (REST_UNIT_STRIDE<<2)
+ addi.d a1, a1, (REST_UNIT_STRIDE<<1)
+ addi.w a3, a3, -4
+ addi.w a2, a2, -4
+
+.LBS3_V_H:
+ sub.w t3, a2, zero
+ addi.d t0, a0, 4
+ addi.d t1, a1, 2
+ addi.d t5, a0, 8
+ addi.d t6, a1, 4
+
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
+ vld vr3, t0, 0 // a2 0 1 2 3
+ vld vr4, t0, 4 // b2 1 2 3 4
+ vld vr5, t0, 8 // c2 2 3 4 5
+ vld vr6, t0, 16 // 3 4 5 6
+ vld vr7, t0, 20 // 4 5 6 7
+ vld vr8, t0, 24 // 5 6 7 8
+ vadd.h vr9, vr0, vr1
+ vadd.h vr9, vr9, vr2
+ vadd.w vr10, vr3, vr4
+ vadd.w vr10, vr10, vr5
+ vadd.w vr11, vr6, vr7
+ vadd.w vr11, vr11, vr8
+ vpickve2gr.h t7, vr2, 6
+ vpickve2gr.w t8, vr8, 2
+ vst vr9, t6, 0
+ vst vr10, t5, 0
+ vst vr11, t5, 16
+
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 16
+ addi.d t3, t3, -8
+ ble t3, zero, .LBS3_V_H0
+
+.LBS3_V_W8:
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
+ vld vr3, t0, 0 // a2 0 1 2 3
+ vld vr4, t0, 4 // b2 1 2 3 4
+ vld vr5, t0, 8 // c2 2 3 4 5
+ vld vr6, t0, 16 // 3 4 5 6
+ vld vr7, t0, 20 // 4 5 6 7
+ vld vr8, t0, 24 // 5 6 7 8
+ vinsgr2vr.h vr0, t7, 0
+ vinsgr2vr.w vr3, t8, 0
+ vpickve2gr.h t7, vr2, 6
+ vpickve2gr.w t8, vr8, 2
+ vadd.h vr9, vr0, vr1
+ vadd.w vr10, vr3, vr4
+ vadd.w vr11, vr6, vr7
+ vadd.h vr9, vr9, vr2
+ vadd.w vr10, vr10, vr5
+ vadd.w vr11, vr11, vr8
+ vst vr9, t6, 0
+ vst vr10, t5, 0
+ vst vr11, t5, 16
+ addi.d t3, t3, -8
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 16
+ blt zero, t3, .LBS3_V_W8
+
+.LBS3_V_H0:
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.w a3, a3, -1
+ bnez a3, .LBS3_V_H
+
+.LBS3_V_END:
+endfunc
+
+/*
+boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
+ const int w, const int h,
+ const unsigned s)
+*/
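+/*
+Reference sketch (not part of the build) of the per-element a/b update
+below; n = 9 and sgr_one_by_x = 455 for the 3x3 case, x is read from
+dav1d_sgr_x_by_x:
+    p = imax(AA[i] * n - BB[i] * BB[i], 0);
+    z = (p * s + (1 << 19)) >> 20;
+    x = dav1d_sgr_x_by_x[umin(z, 255)];
+    AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
+    BB[i] = 256 - x;
+*/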
+function boxsum3_sgf_h_8bpc_lsx
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a0, a0, 12 // AA
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a1, a1, 6 // BB
+ la.local t8, dav1d_sgr_x_by_x
+ li.w t6, 455
+ vreplgr2vr.w vr20, t6
+ li.w t6, 255
+ vreplgr2vr.w vr22, t6
+ vaddi.wu vr21, vr22, 1 // 256
+ vreplgr2vr.w vr6, a4
+ vldi vr19, 0x809
+ addi.w a2, a2, 2 // w + 2
+ addi.w a3, a3, 2 // h + 2
+
+.LBS3SGF_H_H:
+ addi.w t2, a2, 0
+ addi.d t0, a0, -4
+ addi.d t1, a1, -2
+
+.LBS3SGF_H_W:
+ addi.w t2, t2, -8
+ vld vr0, t0, 0 // AA[i]
+ vld vr1, t0, 16
+ vld vr2, t1, 0 // BB[i]
+
+ vmul.w vr4, vr0, vr19 // a * n
+ vmul.w vr5, vr1, vr19 // a * n
+ vsllwil.w.h vr9, vr2, 0
+ vexth.w.h vr10, vr2
+ vmsub.w vr4, vr9, vr9 // p
+ vmsub.w vr5, vr10, vr10 // p
+ vmaxi.w vr4, vr4, 0
+ vmaxi.w vr5, vr5, 0 // p
+ vmul.w vr4, vr4, vr6 // p * s
+ vmul.w vr5, vr5, vr6 // p * s
+ vsrlri.w vr4, vr4, 20
+ vsrlri.w vr5, vr5, 20 // z
+ vmin.w vr4, vr4, vr22
+ vmin.w vr5, vr5, vr22
+
+ vpickve2gr.w t6, vr4, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 0
+ vpickve2gr.w t6, vr4, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 1
+ vpickve2gr.w t6, vr4, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 2
+ vpickve2gr.w t6, vr4, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 3
+
+ vpickve2gr.w t6, vr5, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 0
+ vpickve2gr.w t6, vr5, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 1
+ vpickve2gr.w t6, vr5, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 2
+ vpickve2gr.w t6, vr5, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 3 // x
+
+ vmul.w vr9, vr7, vr9 // x * BB[i]
+ vmul.w vr10, vr8, vr10
+ vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
+ vmul.w vr10, vr10, vr20
+ vsrlri.w vr9, vr9, 12
+ vsrlri.w vr10, vr10, 12
+ vsub.w vr7, vr21, vr7
+ vsub.w vr8, vr21, vr8
+ vpickev.h vr8, vr8, vr7
+
+ vst vr9, t0, 0
+ vst vr10, t0, 16
+ vst vr8, t1, 0
+ addi.d t0, t0, 32
+ addi.d t1, t1, 16
+ blt zero, t2, .LBS3SGF_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.w a3, a3, -1
+ bnez a3, .LBS3SGF_H_H
+endfunc
+
+/*
+boxsum3_selfguided_filter(coef *dst, pixel *src,
+ int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
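+/*
+Reference sketch (not part of the build) of the weighted 3x3 neighbourhood
+sum below, where a comes from sum[] (256 - x) and b from sumsq[]:
+    A = 4 * (a[0] + a[-1] + a[+1] + a[-stride] + a[+stride])
+      + 3 * (a[-1-stride] + a[-1+stride] + a[+1-stride] + a[+1+stride]);
+    B = the same weighting applied to b[];
+    dst[i] = (A * src[i] + B + (1 << 8)) >> 9;
+*/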
+function boxsum3_sgf_v_8bpc_lsx
+ addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
+ addi.d a2, a2, REST_UNIT_STRIDE<<2
+ addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
+ addi.d a3, a3, REST_UNIT_STRIDE<<2
+ addi.d a3, a3, 6
+.LBS3SGF_V_H:
+ // A int32_t *sumsq
+ addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
+ addi.d t1, a2, 0 // sumsq
+ addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
+ addi.d t6, a1, 0
+ addi.w t7, a4, 0
+ addi.d t8, a0, 0
+ // B coef *sum
+ addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
+ addi.d t4, a3, 0
+ addi.d t5, a3, REST_UNIT_STRIDE<<1
+
+.LBS3SGF_V_W:
+ vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE]
+ vld vr1, t0, 16
+ vld vr2, t1, -4 // P[i-1]
+ vld vr3, t1, 12
+ vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE]
+ vld vr5, t2, 16
+ vld vr6, t1, 0 // p[i]
+ vld vr7, t1, 16
+ vld vr8, t1, 4 // p[i+1]
+ vld vr9, t1, 20
+
+ vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
+ vld vr11, t0, 12
+ vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
+ vld vr13, t2, 12
+ vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
+ vld vr15, t0, 20
+ vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
+ vld vr17, t2, 20
+
+ vadd.w vr0, vr2, vr0
+ vadd.w vr4, vr6, vr4
+ vadd.w vr0, vr0, vr8
+ vadd.w vr20, vr0, vr4
+ vslli.w vr20, vr20, 2 // 0 1 2 3
+ vadd.w vr0, vr1, vr3
+ vadd.w vr4, vr5, vr7
+ vadd.w vr0, vr0, vr9
+ vadd.w vr21, vr0, vr4
+ vslli.w vr21, vr21, 2 // 4 5 6 7
+ vadd.w vr12, vr10, vr12
+ vadd.w vr16, vr14, vr16
+ vadd.w vr22, vr12, vr16
+ vslli.w vr23, vr22, 1
+ vadd.w vr22, vr23, vr22
+ vadd.w vr11, vr11, vr13
+ vadd.w vr15, vr15, vr17
+ vadd.w vr0, vr11, vr15
+ vslli.w vr23, vr0, 1
+ vadd.w vr23, vr23, vr0
+ vadd.w vr20, vr20, vr22 // b
+ vadd.w vr21, vr21, vr23
+
+ // B coef *sum
+ vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE]
+ vld vr1, t4, -2 // p[i - 1]
+ vld vr2, t4, 0 // p[i]
+ vld vr3, t4, 2 // p[i + 1]
+ vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE]
+ vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
+ vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
+ vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
+ vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]
+ vaddwev.w.h vr9, vr0, vr1
+ vaddwod.w.h vr10, vr0, vr1
+ vaddwev.w.h vr11, vr2, vr3
+ vaddwod.w.h vr12, vr2, vr3
+ vadd.w vr9, vr11, vr9
+ vadd.w vr10, vr12, vr10
+ vilvl.w vr11, vr10, vr9 // 0 1 2 3
+ vilvh.w vr12, vr10, vr9 // 4 5 6 7
+ vsllwil.w.h vr0, vr4, 0
+ vexth.w.h vr1, vr4
+ vadd.w vr0, vr11, vr0
+ vadd.w vr1, vr12, vr1
+ vslli.w vr0, vr0, 2
+ vslli.w vr1, vr1, 2
+ vaddwev.w.h vr9, vr5, vr6
+ vaddwod.w.h vr10, vr5, vr6
+ vaddwev.w.h vr11, vr7, vr8
+ vaddwod.w.h vr12, vr7, vr8
+ vadd.w vr9, vr11, vr9
+ vadd.w vr10, vr12, vr10
+ vilvl.w vr13, vr10, vr9
+ vilvh.w vr14, vr10, vr9
+ vslli.w vr15, vr13, 1
+ vslli.w vr16, vr14, 1
+ vadd.w vr15, vr13, vr15 // a
+ vadd.w vr16, vr14, vr16
+ vadd.w vr22, vr0, vr15
+ vadd.w vr23, vr1, vr16
+ vld vr0, t6, 0 // src
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.wu.hu vr1, vr0, 0
+ vexth.wu.hu vr2, vr0
+ vmadd.w vr20, vr22, vr1
+ vmadd.w vr21, vr23, vr2
+ vssrlrni.h.w vr21, vr20, 9
+ vst vr21, t8, 0
+ addi.d t8, t8, 16
+
+ addi.d t0, t0, 32
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t3, t3, 16
+ addi.d t4, t4, 16
+ addi.d t5, t5, 16
+ addi.d t6, t6, 8
+ addi.w t7, t7, -8
+ blt zero, t7, .LBS3SGF_V_W
+
+ addi.w a5, a5, -1
+ addi.d a0, a0, 384*2
+ addi.d a1, a1, REST_UNIT_STRIDE
+ addi.d a3, a3, REST_UNIT_STRIDE<<1
+ addi.d a2, a2, REST_UNIT_STRIDE<<2
+ bnez a5, .LBS3SGF_V_H
+endfunc
+
+#define FILTER_OUT_STRIDE (384)
+
+/*
+sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
+ const int16_t *dst, const int w1,
+ const int w, const int h);
+*/
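+/*
+Reference sketch (not part of the build) of the per-pixel blend below:
+    u = p[i] << 4;
+    v = (u << 7) + w1 * (dst[i] - u);
+    p[i] = iclip_u8((v + (1 << 10)) >> 11);
+*/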
+function sgr_3x3_finish_8bpc_lsx
+ vreplgr2vr.w vr3, a3 // w1
+ andi t4, a4, 0x7
+ sub.w t5, a4, t4
+
+ beq zero, t5, .LSGR3X3_REM
+
+.LSGR3X3_H:
+ addi.d t0, a0, 0
+ addi.d t1, a2, 0
+ addi.w t2, t5, 0
+ andi t4, a4, 0x7
+.LSGR3X3_W:
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+ vstelm.d vr7, t0, 0, 0
+ addi.d t0, t0, 8
+ addi.d t1, t1, 16
+ addi.d t2, t2, -8
+ bne zero, t2, .LSGR3X3_W
+
+ beq t4, zero, .LSGR3X3_NOREM
+
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+
+.LSGR3X3_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGR3X3_ST
+
+.LSGR3X3_NOREM:
+ addi.w a5, a5, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ bnez a5, .LSGR3X3_H
+ b .LSGR3X3_END
+
+.LSGR3X3_REM:
+ andi t4, a4, 0x7
+ addi.d t0, a0, 0
+ vld vr0, t0, 0
+ vld vr1, a2, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+
+.LSGR3X3_REM_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGR3X3_REM_ST
+ addi.w a5, a5, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ bnez a5, .LSGR3X3_REM
+
+.LSGR3X3_END:
+endfunc
+
+/*
+void boxsum5(int32_t *sumsq, coef *sum,
+ const pixel *const src,
+ const int w, const int h)
+*/
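+// First pass of the 5x5 box sums: per 16-pixel batch, accumulate five
+// vertically adjacent src rows into sum[] and sumsq[].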
+function boxsum5_h_8bpc_lsx
+ addi.w a4, a4, -4
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ li.w t6, 1
+.LBOXSUM5_H_H:
+ addi.w t3, a3, 0
+ addi.d t2, a2, 0
+ addi.d t0, a0, 0
+ addi.d t1, a1, 0
+
+.LBOXSUM5_H_W:
+ vld vr0, t2, 0 // a
+ vld vr1, t2, REST_UNIT_STRIDE // b
+ vld vr2, t2, REST_UNIT_STRIDE<<1 // c
+ vld vr3, t2, REST_UNIT_STRIDE*3 // d
+ vld vr4, t2, REST_UNIT_STRIDE<<2 // e
+
+ vilvl.b vr5, vr1, vr0
+ vilvh.b vr6, vr1, vr0
+ vilvl.b vr7, vr3, vr2
+ vilvh.b vr8, vr3, vr2
+ //sum_v
+ vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7
+ vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b
+ vhaddw.hu.bu vr11, vr7, vr7
+ vhaddw.hu.bu vr12, vr8, vr8
+ vadd.h vr9, vr9, vr11
+ vadd.h vr10, vr10, vr12 // a + b + c + d
+ vsllwil.hu.bu vr11, vr4, 0
+ vexth.hu.bu vr12, vr4
+ vadd.h vr9, vr9, vr11
+ vadd.h vr10, vr10, vr12
+ vst vr9, t1, 0
+ vst vr10, t1, 16
+ addi.d t1, t1, 32
+
+ // sumsq
+ vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7
+ vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15
+ vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7
+ vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15
+ vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7
+ vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15
+ vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7
+ vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15
+ vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6
+ vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7
+ vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14
+ vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b
+ vaddwev.w.hu vr19, vr15, vr17 // 0 2 4 6
+ vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7
+ vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14
+ vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d
+ vadd.w vr5, vr5, vr19
+ vadd.w vr6, vr6, vr20
+ vadd.w vr7, vr7, vr21
+ vadd.w vr8, vr8, vr22
+ vilvl.w vr19, vr6, vr5
+ vilvh.w vr20, vr6, vr5
+ vilvl.w vr21, vr8, vr7
+ vilvh.w vr22, vr8, vr7
+ vmul.h vr11, vr11, vr11
+ vmul.h vr12, vr12, vr12
+ vsllwil.wu.hu vr0, vr11, 0
+ vexth.wu.hu vr1, vr11
+ vsllwil.wu.hu vr2, vr12, 0
+ vexth.wu.hu vr3, vr12
+ vadd.w vr19, vr19, vr0
+ vadd.w vr20, vr20, vr1
+ vadd.w vr21, vr21, vr2
+ vadd.w vr22, vr22, vr3
+ vst vr19, t0, 0
+ vst vr20, t0, 16
+ vst vr21, t0, 32
+ vst vr22, t0, 48
+ addi.d t0, t0, 64
+ addi.d t2, t2, 16
+ addi.w t3, t3, -16
+ blt zero, t3, .LBOXSUM5_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a2, a2, REST_UNIT_STRIDE
+ addi.d a4, a4, -1
+ bnez a4, .LBOXSUM5_H_H
+endfunc
+
+/*
+void boxsum5_v(int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
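+// Second pass of the 5x5 box sums: add five horizontally adjacent columns
+// of sum[]/sumsq[], carrying elements across each 8-wide batch.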
+function boxsum5_v_8bpc_lsx
+ addi.d a0, a0, (REST_UNIT_STRIDE<<2)
+ addi.d a1, a1, (REST_UNIT_STRIDE<<1)
+ addi.w a3, a3, -4
+ addi.w a2, a2, -4
+
+.LBOXSUM5_V_H:
+ addi.w t3, a2, 0
+ addi.d t0, a0, 0
+ addi.d t1, a1, 0
+ addi.d t2, a0, 8
+ addi.d t3, a1, 4
+ addi.d t4, a2, 0
+
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2
+ vld vr3, t1, 6 // d 3
+ vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
+ vadd.h vr5, vr0, vr1
+ vadd.h vr6, vr2, vr3
+ vpickve2gr.w t5, vr4, 2
+ vadd.h vr5, vr5, vr6
+ vadd.h vr5, vr5, vr4
+ vst vr5, t3, 0
+
+ vld vr0, t0, 0 // 0 1 2 3 a
+ vld vr1, t0, 4 // 1 2 3 4 b
+ vld vr2, t0, 8 // 2 3 4 5 c
+ vld vr3, t0, 12 // 3 4 5 6 d
+ vld vr4, t0, 16 // 4 5 6 7 e a
+ vld vr5, t0, 20 // 5 6 7 8 b
+ vld vr6, t0, 24 // 6 7 8 9 c
+ vld vr7, t0, 28 // 7 8 9 10 d
+ vld vr8, t0, 32 // 8 9 10 11 e
+
+ vadd.w vr9, vr0, vr1
+ vadd.w vr10, vr2, vr3
+ vadd.w vr9, vr9, vr10
+ vadd.w vr9, vr9, vr4
+ vadd.w vr10, vr4, vr5
+ vadd.w vr11, vr6, vr7
+ vadd.w vr10, vr10, vr8
+ vadd.w vr10, vr10, vr11
+ vst vr9, t2, 0
+ vst vr10, t2, 16
+
+ addi.d t3, t3, 16
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t2, t2, 32
+ addi.w t4, t4, -8
+ ble t4, zero, .LBOXSUM5_V_H1
+
+.LBOXSUM5_V_W:
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2
+ vld vr3, t1, 6 // d 3
+ vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
+ vinsgr2vr.w vr0, t5, 0
+ vpickve2gr.w t5, vr4, 2
+ vextrins.h vr1, vr0, 0x01
+ vadd.h vr5, vr0, vr1
+ vadd.h vr6, vr2, vr3
+ vadd.h vr5, vr5, vr6
+ vadd.h vr5, vr5, vr4
+ vst vr5, t3, 0
+
+ vaddi.hu vr0, vr8, 0 // 8 9 10 11 a
+ vld vr1, t0, 4 // 9 10 11 12 b
+ vld vr2, t0, 8 // 10 11 12 13 c
+ vld vr3, t0, 12 // 11 12 13 14 d
+ vld vr4, t0, 16 // 12 13 14 15 e a
+ vld vr5, t0, 20 // 13 14 15 16 b
+ vld vr6, t0, 24 // 14 15 16 17 c
+ vld vr7, t0, 28 // 15 16 17 18 d
+ vld vr8, t0, 32 // 16 17 18 19 e
+ vextrins.w vr1, vr0, 0x01
+ vadd.w vr9, vr0, vr1
+ vadd.w vr10, vr2, vr3
+ vadd.w vr9, vr9, vr10
+ vadd.w vr9, vr9, vr4
+ vadd.w vr10, vr4, vr5
+ vadd.w vr11, vr6, vr7
+ vadd.w vr10, vr10, vr8
+ vadd.w vr10, vr10, vr11
+ vst vr9, t2, 0
+ vst vr10, t2, 16
+
+ addi.d t3, t3, 16
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t2, t2, 32
+ addi.w t4, t4, -8
+ blt zero, t4, .LBOXSUM5_V_W
+
+.LBOXSUM5_V_H1:
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.w a3, a3, -1
+ bnez a3, .LBOXSUM5_V_H
+endfunc
+
+/*
+selfguided_filter(int32_t *sumsq, coef *sum,
+ const int w, const int h,
+ const unsigned s)
+*/
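+// Same per-element a/b update as the 3x3 variant above, but with n = 25 and
+// sgr_one_by_x = 164, stepping two rows per iteration since the 5x5 filter
+// only needs every other row.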
+function boxsum5_sgf_h_8bpc_lsx
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a0, a0, 12 // AA
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a1, a1, 6 // BB
+ la.local t8, dav1d_sgr_x_by_x
+ li.w t6, 164
+ vreplgr2vr.w vr20, t6
+ li.w t6, 255
+ vreplgr2vr.w vr22, t6
+ vaddi.wu vr21, vr22, 1 // 256
+ vreplgr2vr.w vr6, a4
+ vldi vr19, 0x819
+ addi.w a2, a2, 2 // w + 2
+ addi.w a3, a3, 2 // h + 2
+
+.LBS5SGF_H_H:
+ addi.w t2, a2, 0
+ addi.d t0, a0, -4
+ addi.d t1, a1, -2
+
+.LBS5SGF_H_W:
+ vld vr0, t0, 0 // AA[i]
+ vld vr1, t0, 16
+ vld vr2, t1, 0 // BB[i]
+
+ vmul.w vr4, vr0, vr19 // a * n
+ vmul.w vr5, vr1, vr19 // a * n
+ vsllwil.w.h vr9, vr2, 0
+ vexth.w.h vr10, vr2
+ vmsub.w vr4, vr9, vr9 // p
+ vmsub.w vr5, vr10, vr10 // p
+ vmaxi.w vr4, vr4, 0
+ vmaxi.w vr5, vr5, 0 // p
+ vmul.w vr4, vr4, vr6 // p * s
+ vmul.w vr5, vr5, vr6 // p * s
+ vsrlri.w vr4, vr4, 20
+ vsrlri.w vr5, vr5, 20 // z
+ vmin.w vr4, vr4, vr22
+ vmin.w vr5, vr5, vr22
+
+ // load table data
+ vpickve2gr.w t6, vr4, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 0
+ vpickve2gr.w t6, vr4, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 1
+ vpickve2gr.w t6, vr4, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 2
+ vpickve2gr.w t6, vr4, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 3
+
+ vpickve2gr.w t6, vr5, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 0
+ vpickve2gr.w t6, vr5, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 1
+ vpickve2gr.w t6, vr5, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 2
+ vpickve2gr.w t6, vr5, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 3 // x
+
+ vmul.w vr9, vr7, vr9 // x * BB[i]
+ vmul.w vr10, vr8, vr10
+ vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
+ vmul.w vr10, vr10, vr20
+ vsrlri.w vr9, vr9, 12
+ vsrlri.w vr10, vr10, 12
+ vsub.w vr7, vr21, vr7
+ vsub.w vr8, vr21, vr8
+ vpickev.h vr8, vr8, vr7
+ vst vr9, t0, 0
+ vst vr10, t0, 16
+ vst vr8, t1, 0
+ addi.d t0, t0, 32
+ addi.d t1, t1, 16
+ addi.w t2, t2, -8
+ blt zero, t2, .LBS5SGF_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<2
+ addi.w a3, a3, -2
+ blt zero, a3, .LBS5SGF_H_H
+endfunc
+
+/*
+selfguided_filter(coef *dst, pixel *src,
+ int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
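+// 5x5 neighbourhood step: even rows weight the vertical neighbours by 6 and
+// the diagonals by 5 with a >> 9; odd rows weight the centre by 6 and the
+// horizontal neighbours by 5 with a >> 8, producing two output rows per
+// iteration, with a final single row handled by the .LBS5SGF_V_W1 tail when
+// needed.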
+function boxsum5_sgf_v_8bpc_lsx
+ addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src
+ addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A
+ addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1
+ addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B
+ addi.w a5, a5, -1
+ vldi vr10, 0x806
+ vldi vr11, 0x805
+ vldi vr22, 0x406
+
+.LBS5SGF_V_H:
+ addi.d t0, a0, 0
+ addi.d t1, a1, 0
+ addi.d t2, a2, 0
+ addi.d t3, a3, 0
+ addi.w t4, a4, 0
+
+ addi.d t5, a0, 384*2
+ addi.d t6, a1, REST_UNIT_STRIDE
+ addi.d t7, a2, REST_UNIT_STRIDE<<2
+ addi.d t8, a3, REST_UNIT_STRIDE<<1 // B
+.LBS5SGF_V_W:
+ // a
+ vld vr0, t3, -REST_UNIT_STRIDE*2
+ vld vr1, t3, REST_UNIT_STRIDE*2
+ vld vr2, t3, (-REST_UNIT_STRIDE-1)*2
+ vld vr3, t3, (REST_UNIT_STRIDE-1)*2
+ vld vr4, t3, (1-REST_UNIT_STRIDE)*2
+ vld vr5, t3, (1+REST_UNIT_STRIDE)*2
+ vaddwev.w.h vr6, vr0, vr1
+ vaddwod.w.h vr7, vr0, vr1
+ vmul.w vr6, vr6, vr10
+ vmul.w vr7, vr7, vr10
+ vaddwev.w.h vr8, vr2, vr3
+ vaddwod.w.h vr9, vr2, vr3
+ vaddwev.w.h vr12, vr4, vr5
+ vaddwod.w.h vr13, vr4, vr5
+ vadd.w vr8, vr8, vr12
+ vadd.w vr9, vr9, vr13
+ vmadd.w vr6, vr8, vr11
+ vmadd.w vr7, vr9, vr11
+ vilvl.w vr18, vr7, vr6
+ vilvh.w vr19, vr7, vr6
+ // b
+ vld vr0, t2, -REST_UNIT_STRIDE*4
+ vld vr1, t2, -REST_UNIT_STRIDE*4+16
+ vld vr2, t2, REST_UNIT_STRIDE*4
+ vld vr3, t2, REST_UNIT_STRIDE*4+16
+ vld vr4, t2, (-REST_UNIT_STRIDE-1)*4
+ vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16
+ vld vr8, t2, (REST_UNIT_STRIDE-1)*4
+ vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16
+ vld vr12, t2, (1-REST_UNIT_STRIDE)*4
+ vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16
+ vld vr14, t2, (1+REST_UNIT_STRIDE)*4
+ vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16
+ vadd.w vr0, vr0, vr2 // 0 1 2 3
+ vadd.w vr1, vr1, vr3 // 4 5 6 7
+ vmul.w vr20, vr0, vr10
+ vmul.w vr21, vr1, vr10
+ vadd.w vr4, vr4, vr8 // 0 1 2 3
+ vadd.w vr5, vr5, vr9 // 4 5 6 7
+ vadd.w vr12, vr12, vr14
+ vadd.w vr13, vr13, vr15
+ vadd.w vr12, vr12, vr4
+ vadd.w vr13, vr13, vr5
+ vmadd.w vr20, vr12, vr11
+ vmadd.w vr21, vr13, vr11
+ vld vr2, t1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.wu.hu vr3, vr2, 0
+ vexth.wu.hu vr4, vr2
+ vmadd.w vr20, vr18, vr3
+ vmadd.w vr21, vr19, vr4
+ vssrlrni.h.w vr21, vr20, 9
+ vst vr21, t0, 0
+
+ addi.d t1, t1, 8
+ addi.d t2, t2, 32
+ addi.d t3, t3, 16
+
+ // a
+ vld vr0, t8, 0
+ vld vr1, t8, -2
+ vld vr2, t8, 2
+ vmulwev.w.h vr3, vr0, vr22
+ vmulwod.w.h vr4, vr0, vr22
+ vaddwev.w.h vr5, vr1, vr2
+ vaddwod.w.h vr6, vr1, vr2
+ vmadd.w vr3, vr5, vr11
+ vmadd.w vr4, vr6, vr11
+ vilvl.w vr19, vr4, vr3
+ vilvh.w vr20, vr4, vr3
+ // b
+ vld vr0, t7, 0
+ vld vr1, t7, -4
+ vld vr2, t7, 4
+ vld vr5, t7, 16
+ vld vr6, t7, 12
+ vld vr7, t7, 20
+ vmul.w vr8, vr0, vr10
+ vmul.w vr9, vr5, vr10
+ vadd.w vr12, vr1, vr2
+ vadd.w vr13, vr6, vr7
+ vmadd.w vr8, vr12, vr11
+ vmadd.w vr9, vr13, vr11
+ vld vr2, t6, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.wu.hu vr3, vr2, 0
+ vexth.wu.hu vr4, vr2
+ vmadd.w vr8, vr19, vr3
+ vmadd.w vr9, vr20, vr4
+ vssrlrni.h.w vr9, vr8, 8
+ vst vr9, t0, 384*2
+
+ addi.d t0, t0, 16
+ addi.d t8, t8, 16
+ addi.d t7, t7, 32
+ addi.d t6, t6, 8
+ addi.w t4, t4, -8
+ blt zero, t4, .LBS5SGF_V_W
+
+ addi.w a5, a5, -2
+ addi.d a0, a0, 384*4 // dst
+ addi.d a1, a1, REST_UNIT_STRIDE<<1 // src
+ addi.d a2, a2, REST_UNIT_STRIDE<<2 //
+ addi.d a2, a2, REST_UNIT_STRIDE<<2
+ addi.d a3, a3, REST_UNIT_STRIDE<<2 //
+ blt zero, a5, .LBS5SGF_V_H
+ bnez a5, .LBS5SGF_END
+.LBS5SGF_V_W1:
+ // a
+ vld vr0, a3, -REST_UNIT_STRIDE*2
+ vld vr1, a3, REST_UNIT_STRIDE*2
+ vld vr2, a3, (-REST_UNIT_STRIDE-1)*2
+ vld vr3, a3, (REST_UNIT_STRIDE-1)*2
+ vld vr4, a3, (1-REST_UNIT_STRIDE)*2
+ vld vr5, a3, (1+REST_UNIT_STRIDE)*2
+ vaddwev.w.h vr6, vr0, vr1
+ vaddwod.w.h vr7, vr0, vr1
+ vmul.w vr6, vr6, vr10
+ vmul.w vr7, vr7, vr10
+ vaddwev.w.h vr8, vr2, vr3
+ vaddwod.w.h vr9, vr2, vr3
+ vaddwev.w.h vr12, vr4, vr5
+ vaddwod.w.h vr13, vr4, vr5
+ vadd.w vr8, vr8, vr12
+ vadd.w vr9, vr9, vr13
+ vmadd.w vr6, vr8, vr11
+ vmadd.w vr7, vr9, vr11
+ vilvl.w vr18, vr7, vr6
+ vilvh.w vr19, vr7, vr6
+ // b
+ vld vr0, a2, -REST_UNIT_STRIDE*4
+ vld vr1, a2, -REST_UNIT_STRIDE*4+16
+ vld vr2, a2, REST_UNIT_STRIDE*4
+ vld vr3, a2, REST_UNIT_STRIDE*4+16
+ vld vr4, a2, (-REST_UNIT_STRIDE-1)*4
+ vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16
+ vld vr8, a2, (REST_UNIT_STRIDE-1)*4
+ vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16
+ vld vr12, a2, (1-REST_UNIT_STRIDE)*4
+ vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16
+ vld vr14, a2, (1+REST_UNIT_STRIDE)*4
+ vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16
+ vadd.w vr0, vr0, vr2 // 0 1 2 3
+ vadd.w vr1, vr1, vr3 // 4 5 6 7
+ vmul.w vr20, vr0, vr10
+ vmul.w vr21, vr1, vr10
+ vadd.w vr4, vr4, vr8 // 0 1 2 3
+ vadd.w vr5, vr5, vr9 // 4 5 6 7
+ vadd.w vr12, vr12, vr14
+ vadd.w vr13, vr13, vr15
+ vadd.w vr12, vr12, vr4
+ vadd.w vr13, vr13, vr5
+ vmadd.w vr20, vr12, vr11
+ vmadd.w vr21, vr13, vr11
+ vld vr2, a1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.wu.hu vr3, vr2, 0
+ vexth.wu.hu vr4, vr2
+ vmadd.w vr20, vr18, vr3
+ vmadd.w vr21, vr19, vr4
+ vssrlrni.h.w vr21, vr20, 9
+ vst vr21, a0, 0
+ addi.d a3, a3, 16
+ addi.d a2, a2, 32
+ addi.d a1, a1, 8
+ addi.d a0, a0, 16
+ addi.w a4, a4, -8
+ blt zero, a4, .LBS5SGF_V_W1
+.LBS5SGF_END:
+endfunc
+
+/*
+void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
+ const int16_t *dst0, const int16_t *dst1,
+ const int w0, const int w1,
+ const int w, const int h);
+*/
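+/*
+Reference sketch (not part of the build) of the per-pixel blend below:
+    u = p[i] << 4;
+    v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
+    p[i] = iclip_u8((v + (1 << 10)) >> 11);
+*/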
+function sgr_mix_finish_8bpc_lsx
+ vreplgr2vr.w vr3, a4 // w0
+ vreplgr2vr.w vr13, a5 // w1
+ andi t4, a6, 0x7
+ sub.w t5, a6, t4
+
+ beq zero, t5, .LSGRMIX_REM
+
+.LSGRMIX_H:
+ addi.d t0, a0, 0
+ addi.d t1, a2, 0 // dst0
+ addi.d t3, a3, 0 // dst1
+ addi.w t2, t5, 0
+ andi t4, a6, 0x7
+.LSGRMIX_W:
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vld vr10, t3, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3
+ vexth.wu.hu vr5, vr2 // u 4 5 6 7
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst0
+ vexth.w.h vr9, vr1 // dst0
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+
+ vsllwil.w.h vr11, vr10, 0 // dst1
+ vexth.w.h vr12, vr10 // dst1
+ vsub.w vr11, vr11, vr4
+ vsub.w vr12, vr12, vr5
+ vmadd.w vr6, vr11, vr13
+ vmadd.w vr7, vr12, vr13
+
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+ vstelm.d vr7, t0, 0, 0
+ addi.d t0, t0, 8
+ addi.d t1, t1, 16
+ addi.d t3, t3, 16
+ addi.d t2, t2, -8
+ bne zero, t2, .LSGRMIX_W
+
+ beq t4, zero, .LSGRMIX_W8
+
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vld vr10, t3, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+
+ vsllwil.w.h vr11, vr10, 0 // dst1
+ vexth.w.h vr12, vr10 // dst1
+ vsub.w vr11, vr11, vr4
+ vsub.w vr12, vr12, vr5
+ vmadd.w vr6, vr11, vr13
+ vmadd.w vr7, vr12, vr13
+
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+
+.LSGRMIX_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGRMIX_ST
+
+.LSGRMIX_W8:
+ addi.w a7, a7, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
+ bnez a7, .LSGRMIX_H
+ b .LSGR_MIX_END
+
+.LSGRMIX_REM:
+ andi t4, a6, 0x7
+ vld vr0, a0, 0
+ vld vr1, a2, 0
+ vld vr10, a3, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+
+ vsllwil.w.h vr11, vr10, 0 // dst1
+ vexth.w.h vr12, vr10 // dst1
+ vsub.w vr11, vr11, vr4
+ vsub.w vr12, vr12, vr5
+ vmadd.w vr6, vr11, vr13
+ vmadd.w vr7, vr12, vr13
+
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+ addi.d t0, a0, 0
+.LSGRMIX_REM_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGRMIX_REM_ST
+
+ addi.w a7, a7, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
+ bnez a7, .LSGRMIX_REM
+
+.LSGR_MIX_END:
+endfunc
diff --git a/src/loongarch/looprestoration.h b/src/loongarch/looprestoration.h
new file mode 100644
index 0000000..ac0cb06
--- /dev/null
+++ b/src/loongarch/looprestoration.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
+#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
+
+#include "common/intops.h"
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc)
+{
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+#if BITDEPTH == 8
+ c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx;
+
+ c->sgr[0] = dav1d_sgr_filter_5x5_lsx;
+ c->sgr[1] = dav1d_sgr_filter_3x3_lsx;
+ c->sgr[2] = dav1d_sgr_filter_mix_lsx;
+#endif
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */
diff --git a/src/loongarch/looprestoration_tmpl.c b/src/loongarch/looprestoration_tmpl.c
new file mode 100644
index 0000000..66d0d63
--- /dev/null
+++ b/src/loongarch/looprestoration_tmpl.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/looprestoration.h"
+
+#if BITDEPTH == 8
+
+#define REST_UNIT_STRIDE (400)
+
+void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h);
+
+void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
+ const ptrdiff_t p_stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h);
+
+// This function mirrors the corresponding helper in ppc/looprestoration_init_tmpl.c.
+static inline void padding(uint8_t *dst, const uint8_t *p,
+ const ptrdiff_t stride, const uint8_t (*left)[4],
+ const uint8_t *lpf, int unit_w, const int stripe_h,
+ const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ uint8_t *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const uint8_t *const above_1 = lpf;
+ const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ uint8_t *pad = dst_l + unit_w;
+ uint8_t *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// This function mirrors the corresponding function in ppc/looprestoration_init_tmpl.c.
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+
+ // Wiener filtering is applied to a maximum stripe height of 64, with 3 rows
+ // of padding above and below
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
+
+ BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
+ BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
+}
+
+void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
+ const int w, const int h);
+void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+
+void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h, const int w1);
+void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
+ int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
+ int16_t *dst, int w1,
+ const int w, const int h);
+
+
+static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
+ const int w, const int h)
+{
+ BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
+ BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
+}
+
+void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ coef dst[64 * 384];
+
+ ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
+ ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
+
+ boxsum3_lsx(sumsq, sum, tmp, w, h);
+ BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
+ BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
+ BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
+}
+
+void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
+ const uint8_t *const src,
+ const int w, const int h);
+
+void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+
+void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const unsigned s);
+
+void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
+ int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+
+void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
+ const int16_t *dst0, const int16_t *dst1,
+ const int w0, const int w1,
+ const int w, const int h);
+
+static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
+ const int w, const int h)
+{
+ BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
+ BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
+}
+
+void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ coef dst[64 * 384];
+
+ ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
+ ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
+
+ boxsum5_lsx(sumsq, sum, tmp, w, h);
+ BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
+ BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
+ BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
+}
+
+void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ coef dst0[64 * 384];
+ coef dst1[64 * 384];
+
+ ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
+ ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );
+
+ boxsum5_lsx(sumsq0, sum0, tmp, w, h);
+ BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
+ BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);
+
+ boxsum3_lsx(sumsq0, sum0, tmp, w, h);
+ BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
+ BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);
+
+ BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
+ params->sgr.w1, w, h);
+}
+#endif
diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S
new file mode 100644
index 0000000..97887de
--- /dev/null
+++ b/src/loongarch/mc.S
@@ -0,0 +1,4758 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+*/
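+//
+// Rough outline of the macros below (8bpc): a horizontal 8-tap pass produces
+// 15 intermediate rows, then a vertical 8-tap pass yields the 8x8 output.
+// For each tap position the filter is chosen as
+//     dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)]
+// with tmx stepped by abcd[0] per column and mx by abcd[1] per row (tmy/my
+// use abcd[2]/abcd[3] in the vertical pass). Horizontal results are rounded
+// right by 3 bits; the vertical pass rounds by 11 for the pixel variant and
+// by 7 for the int16 "t" variant (see the warp_lsx/warp_lasx arguments).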
+.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3
+ vbsrl.v vr2, \in0, \in1
+ vbsrl.v vr20, \in0, \in2
+ addi.w t4, \in3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr1, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr29, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ vilvl.d vr2, vr20, vr2
+ vilvl.d vr1, vr29, vr1
+ vmulwev.h.bu.b vr3, vr2, vr1
+ vmulwod.h.bu.b vr20, vr2, vr1
+ vilvl.d vr2, vr20, vr3
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.d.w vr2, vr2, vr2
+ vhaddw.q.d vr2, vr2, vr2
+ vilvh.d vr3, vr20, vr3
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.d.w vr3, vr3, vr3
+ vhaddw.q.d vr3, vr3, vr3
+ vextrins.w \out0, vr2, \out1
+ vextrins.w \out2, vr3, \out3
+.endm
+
+.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1
+ add.w \in0, \in0, \in1
+ addi.w t6, \in0, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f1, t5, t6
+ vsllwil.h.b vr1, vr1, 0
+ vmulwev.w.h vr3, \in2, vr1
+ vmaddwod.w.h vr3, \in2, vr1
+ vhaddw.d.w vr3, vr3, vr3
+ vhaddw.q.d vr3, vr3, vr3
+ vextrins.w \out0, vr3, \out1
+.endm
+
+const warp_sh
+.rept 2
+.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+.endr
+.rept 2
+.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.endr
+endconst
+
+.macro warp_lsx t, shift
+function warp_affine_8x8\t\()_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ la.local t4, warp_sh
+ ld.h t0, a4, 0 // abcd[0]
+ ld.h t1, a4, 2 // abcd[1]
+
+ alsl.w t2, a3, a3, 1
+ addi.w t3, a5, 0
+ la.local t5, dav1d_mc_warp_filter
+ sub.d a2, a2, t2
+ addi.d a2, a2, -3
+ vld vr0, a2, 0
+ vld vr30, t4, 0
+ vld vr31, t4, 32
+
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30
+
+ add.w a5, t1, a5
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30
+
+ vsrarni.h.w vr12, vr4, 3
+ vsrarni.h.w vr13, vr5, 3
+ vsrarni.h.w vr14, vr6, 3
+ vsrarni.h.w vr15, vr7, 3
+ vsrarni.h.w vr16, vr8, 3
+ vsrarni.h.w vr17, vr9, 3
+ vsrarni.h.w vr18, vr10, 3
+ vsrarni.h.w vr19, vr11, 3
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20
+
+ vsrarni.h.w vr21, vr4, 3
+ vsrarni.h.w vr22, vr5, 3
+ vsrarni.h.w vr23, vr6, 3
+ vsrarni.h.w vr24, vr7, 3
+ vsrarni.h.w vr25, vr8, 3
+ vsrarni.h.w vr26, vr9, 3
+ vsrarni.h.w vr27, vr10, 3
+ vsrarni.h.w vr28, vr11, 3
+
+ addi.w t2, a6, 0 // my
+ ld.h t7, a4, 4 // abcd[2]
+ ld.h t8, a4, 6 // abcd[3]
+
+.ifnb \t
+ slli.d a1, a1, 1
+.endif
+
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+.endm
+
+warp_lsx , 11
+warp_lsx t, 7
+
+.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
+ xvshuf.b xr2, \in0, \in0, \in2
+
+ addi.w t4, \in1, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr3, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr4, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr5, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr6, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ xvinsve0.d xr3, xr5, 1
+ xvinsve0.d xr3, xr4, 2
+ xvinsve0.d xr3, xr6, 3
+
+ xvmulwev.h.bu.b xr4, xr2, xr3
+ xvmulwod.h.bu.b xr5, xr2, xr3
+ xvilvl.d xr2, xr5, xr4
+ xvilvh.d xr3, xr5, xr4
+ xvhaddw.w.h xr2, xr2, xr2
+ xvhaddw.w.h xr3, xr3, xr3
+ xvhaddw.d.w xr2, xr2, xr2
+ xvhaddw.d.w xr3, xr3, xr3
+ xvhaddw.q.d xr2, xr2, xr2
+ xvhaddw.q.d xr3, xr3, xr3
+
+ xvextrins.w \out0, xr2, \out1
+ xvextrins.w \out2, xr3, \out3
+.endm
+
+.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
+ add.w \in0, \in0, \in1
+ addi.w t6, \in0, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f1, t5, t6
+
+ add.w t2, t2, t7
+ addi.w t6, t2, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f2, t5, t6
+
+ vilvl.d vr0, vr2, vr1
+ vext2xv.h.b xr0, xr0
+ xvmulwev.w.h xr3, \in2, xr0
+ xvmaddwod.w.h xr3, \in2, xr0
+ xvhaddw.d.w xr3, xr3, xr3
+ xvhaddw.q.d xr3, xr3, xr3
+ xvextrins.w \out0, xr3, \out1
+.endm
+
+const shuf0
+.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
+endconst
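+// shuf0 provides per-128-bit-lane byte gathers for the LASX horizontal pass:
+// the low lane picks the 8-byte source windows at offsets 0 and 2, the high
+// lane at offsets 1 and 3; shuf0 + 4 (xr9 below) covers offsets 4..7.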
+
+.macro warp_lasx t, shift
+function warp_affine_8x8\t\()_8bpc_lasx
+ addi.d sp, sp, -16
+ ld.h t0, a4, 0 // abcd[0]
+ ld.h t1, a4, 2 // abcd[1]
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+
+ alsl.w t2, a3, a3, 1
+ addi.w t3, a5, 0
+ la.local t4, warp_sh
+ la.local t5, dav1d_mc_warp_filter
+ sub.d a2, a2, t2
+ addi.d a2, a2, -3
+ vld vr0, a2, 0
+ xvld xr24, t4, 0
+ xvld xr25, t4, 32
+ la.local t2, shuf0
+ xvld xr1, t2, 0
+ xvpermi.q xr0, xr0, 0x00
+ xvaddi.bu xr9, xr1, 4
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30
+
+ xvsrarni.h.w xr12, xr7, 3
+ xvsrarni.h.w xr13, xr8, 3
+ xvsrarni.h.w xr14, xr10, 3
+ xvsrarni.h.w xr15, xr11, 3
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20
+
+ xvsrarni.h.w xr16, xr7, 3
+ xvsrarni.h.w xr17, xr8, 3
+ xvsrarni.h.w xr18, xr10, 3
+ xvsrarni.h.w xr19, xr11, 3
+
+ addi.w t2, a6, 0 // my
+ ld.h t7, a4, 4 // abcd[2]
+ ld.h t8, a4, 6 // abcd[3]
+
+.ifnb \t
+ slli.d a1, a1, 1
+.endif
+
+ // y = 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, \shift
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, 11
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, 11
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, 11
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ addi.d sp, sp, 16
+endfunc
+.endm
+
+warp_lasx , 11
+warp_lasx t, 7
+
+/*
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2,
+ const int w, int h,
+ const int weight HIGHBD_DECL_SUFFIX)
+*/
+
+#define bpc8_sh 5 // sh = intermediate_bits + 1
+#define bpcw8_sh 8 // sh = intermediate_bits + 4
+
+#define bpc_sh bpc8_sh
+#define bpcw_sh bpcw8_sh
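+
+// Scalar sketch of the 8bpc rounding implemented below (rounding constants
+// follow from the rounding shifts; tmp1/tmp2 are the int16 intermediates):
+//     avg:   dst[x] = clip_u8((tmp1[x] + tmp2[x] + 16) >> bpc_sh)
+//     w_avg: dst[x] = clip_u8((tmp1[x] * weight
+//                              + tmp2[x] * (16 - weight) + 128) >> bpcw_sh)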
+
+function avg_8bpc_lsx
+ addi.d t8, a0, 0
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
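+ // table index = clz(w) - 24: w = 128, 64, 32, 16, 8, 4 -> 0 .. 5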
+ la.local t1, .AVG_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE
+ add.d t1, t1, t2 // Get absolute address
+ jirl $r0, t1, 0
+
+ .align 3
+.AVG_LSX_JRTABLE:
+ .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE
+
+.AVG_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vadd.h vr2, vr0, vr1
+ vssrarni.bu.h vr3, vr2, bpc_sh
+ vstelm.w vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W4_LSX
+ b .AVG_END_LSX
+
+.AVG_W8_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vssrarni.bu.h vr5, vr4, bpc_sh
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ vstelm.d vr5, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr5, a0, 0, 1
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W8_LSX
+ b .AVG_END_LSX
+
+.AVG_W16_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vssrarni.bu.h vr5, vr4, bpc_sh
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ vst vr5, a0, 0
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W16_LSX
+ b .AVG_END_LSX
+
+.AVG_W32_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr4, a2, 32
+ vld vr6, a2, 48
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vld vr5, a3, 32
+ vld vr7, a3, 48
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vadd.h vr4, vr4, vr5
+ vadd.h vr6, vr6, vr7
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ vssrarni.bu.h vr6, vr4, bpc_sh
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ vst vr2, a0, 0
+ vst vr6, a0, 16
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W32_LSX
+ b .AVG_END_LSX
+
+.AVG_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ vst vr2, a0, 0
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .AVG_W64_LSX
+ b .AVG_END_LSX
+
+.AVG_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ vst vr2, a0, 0
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .AVG_W128_LSX
+.AVG_END_LSX:
+endfunc
+
+function avg_8bpc_lasx
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .AVG_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.AVG_LASX_JRTABLE:
+ .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE
+
+.AVG_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vadd.h vr0, vr0, vr1
+ vssrarni.bu.h vr1, vr0, bpc_sh
+ vstelm.w vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr1, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W4_LASX
+ b .AVG_END_LASX
+.AVG_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvadd.h xr2, xr0, xr1
+ xvssrarni.bu.h xr1, xr2, bpc_sh
+ xvstelm.d xr1, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr1, a0, 0, 2
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a1, a0
+ blt zero, a5, .AVG_W8_LASX
+ b .AVG_END_LASX
+.AVG_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvadd.h xr4, xr0, xr1
+ xvadd.h xr5, xr2, xr3
+ xvssrarni.bu.h xr5, xr4, bpc_sh
+ xvpermi.d xr2, xr5, 0xd8
+ xvpermi.d xr3, xr5, 0x8d
+ vst vr2, a0, 0
+ vstx vr3, a0, a1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ alsl.d a0, a1, a0, 1
+ blt zero, a5, .AVG_W16_LASX
+ b .AVG_END_LASX
+.AVG_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvadd.h xr4, xr0, xr1
+ xvadd.h xr5, xr2, xr3
+ xvssrarni.bu.h xr5, xr4, bpc_sh
+ xvpermi.d xr6, xr5, 0xd8
+ xvst xr6, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W32_LASX
+ b .AVG_END_LASX
+.AVG_W64_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr4, a2, 64
+ xvld xr6, a2, 96
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvld xr5, a3, 64
+ xvld xr7, a3, 96
+ xvadd.h xr0, xr0, xr1
+ xvadd.h xr2, xr2, xr3
+ xvadd.h xr4, xr4, xr5
+ xvadd.h xr6, xr6, xr7
+ xvssrarni.bu.h xr2, xr0, bpc_sh
+ xvssrarni.bu.h xr6, xr4, bpc_sh
+ xvpermi.d xr1, xr2, 0xd8
+ xvpermi.d xr3, xr6, 0xd8
+ xvst xr1, a0, 0
+ xvst xr3, a0, 32
+ addi.w a5, a5, -1
+ addi.d a2, a2, 128
+ addi.d a3, a3, 128
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W64_LASX
+ b .AVG_END_LASX
+.AVG_W128_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr4, a2, 64
+ xvld xr6, a2, 96
+ xvld xr8, a2, 128
+ xvld xr10, a2, 160
+ xvld xr12, a2, 192
+ xvld xr14, a2, 224
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvld xr5, a3, 64
+ xvld xr7, a3, 96
+ xvld xr9, a3, 128
+ xvld xr11, a3, 160
+ xvld xr13, a3, 192
+ xvld xr15, a3, 224
+ xvadd.h xr0, xr0, xr1
+ xvadd.h xr2, xr2, xr3
+ xvadd.h xr4, xr4, xr5
+ xvadd.h xr6, xr6, xr7
+ xvadd.h xr8, xr8, xr9
+ xvadd.h xr10, xr10, xr11
+ xvadd.h xr12, xr12, xr13
+ xvadd.h xr14, xr14, xr15
+ xvssrarni.bu.h xr2, xr0, bpc_sh
+ xvssrarni.bu.h xr6, xr4, bpc_sh
+ xvssrarni.bu.h xr10, xr8, bpc_sh
+ xvssrarni.bu.h xr14, xr12, bpc_sh
+ xvpermi.d xr1, xr2, 0xd8
+ xvpermi.d xr3, xr6, 0xd8
+ xvpermi.d xr5, xr10, 0xd8
+ xvpermi.d xr7, xr14, 0xd8
+ xvst xr1, a0, 0
+ xvst xr3, a0, 32
+ xvst xr5, a0, 64
+ xvst xr7, a0, 96
+ addi.w a5, a5, -1
+ addi.d a2, a2, 256
+ addi.d a3, a3, 256
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W128_LASX
+.AVG_END_LASX:
+endfunc
+
+function w_avg_8bpc_lsx
+ addi.d t8, a0, 0
+ li.w t2, 16
+ sub.w t2, t2, a6 // 16 - weight
+ vreplgr2vr.h vr21, a6
+ vreplgr2vr.h vr22, t2
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .W_AVG_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.W_AVG_LSX_JRTABLE:
+ .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE
+
+.W_AVG_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vmulwev.w.h vr2, vr0, vr21
+ vmulwod.w.h vr3, vr0, vr21
+ vmaddwev.w.h vr2, vr1, vr22
+ vmaddwod.w.h vr3, vr1, vr22
+ vssrarni.hu.w vr3, vr2, bpcw_sh
+ vssrlni.bu.h vr1, vr3, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a1, a0
+ blt zero, a5, .W_AVG_W4_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W8_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vmulwev.w.h vr2, vr0, vr21
+ vmulwod.w.h vr3, vr0, vr21
+ vmaddwev.w.h vr2, vr1, vr22
+ vmaddwod.w.h vr3, vr1, vr22
+ vssrarni.hu.w vr3, vr2, bpcw_sh
+ vssrlni.bu.h vr1, vr3, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.d f0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W8_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W16_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W16_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W32_LSX:
+.rept 2
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W32_LSX
+ b .W_AVG_END_LSX
+
+.W_AVG_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W64_LSX
+ b .W_AVG_END_LSX
+
+.W_AVG_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W128_LSX
+.W_AVG_END_LSX:
+endfunc
+
+function w_avg_8bpc_lasx
+ addi.d t8, a0, 0
+ li.w t2, 16
+ sub.w t2, t2, a6 // 16 - weight
+ xvreplgr2vr.h xr21, a6
+ xvreplgr2vr.h xr22, t2
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .W_AVG_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.W_AVG_LASX_JRTABLE:
+ .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE
+
+.W_AVG_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ xvpermi.d xr2, xr0, 0xD8
+ xvpermi.d xr3, xr1, 0xD8
+ xvilvl.h xr4, xr3, xr2
+ xvmulwev.w.h xr0, xr4, xr21
+ xvmaddwod.w.h xr0, xr4, xr22
+ xvssrarni.hu.w xr1, xr0, bpcw_sh
+ xvssrlni.bu.h xr0, xr1, 0
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ xvstelm.w xr0, a0, 0, 4
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a1, a0
+ blt zero, a5, .W_AVG_W4_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvmulwev.w.h xr2, xr0, xr21
+ xvmulwod.w.h xr3, xr0, xr21
+ xvmaddwev.w.h xr2, xr1, xr22
+ xvmaddwod.w.h xr3, xr1, xr22
+ xvssrarni.hu.w xr3, xr2, bpcw_sh
+ xvssrlni.bu.h xr1, xr3, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvstelm.d xr0, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr0, a0, 0, 2
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W8_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvmulwev.w.h xr2, xr0, xr21
+ xvmulwod.w.h xr3, xr0, xr21
+ xvmaddwev.w.h xr2, xr1, xr22
+ xvmaddwod.w.h xr3, xr1, xr22
+ xvssrarni.hu.w xr3, xr2, bpcw_sh
+ xvssrlni.bu.h xr1, xr3, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvpermi.d xr1, xr0, 0xD8
+ vst vr1, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W16_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W32_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W64_LASX:
+.rept 2
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a0, a0, 32
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W64_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W128_LASX:
+.rept 4
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a0, a0, 32
+.endr
+
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W128_LASX
+.W_AVG_END_LASX:
+endfunc
+
+#undef bpc_sh
+#undef bpcw_sh
+
+#define mask_sh 10
+/*
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+*/
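+// Per-pixel sketch of the masked blend below (8bpc, m = mask[x] in 0..64):
+//     dst[x] = clip_u8((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> mask_sh)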
+function mask_8bpc_lsx
+ vldi vr21, 0x440 // 64
+ vxor.v vr19, vr19, vr19
+ addi.d t8, a0, 0
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .MASK_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.MASK_LSX_JRTABLE:
+ .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE
+
+.MASK_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ fld.d f22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vsub.h vr3, vr21, vr2
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vssrarni.hu.w vr5, vr4, mask_sh
+ vssrlrni.bu.h vr1, vr5, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ addi.d a6, a6, 8
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W4_LSX
+ b .MASK_END_LSX
+.MASK_W8_LSX:
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ fst.d f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr0, a0, 0, 1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W8_LSX
+ b .MASK_END_LSX
+
+.MASK_W16_LSX:
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W16_LSX
+ b .MASK_END_LSX
+.MASK_W32_LSX:
+.rept 2
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W32_LSX
+ b .MASK_END_LSX
+.MASK_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W64_LSX
+ b .MASK_END_LSX
+.MASK_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W128_LSX
+.MASK_END_LSX:
+endfunc
+
+function mask_8bpc_lasx
+ xvldi xr21, 0x440 // 64
+ xvxor.v xr19, xr19, xr19
+ addi.d t8, a0, 0
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .MASK_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.MASK_LASX_JRTABLE:
+ .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE
+
+.MASK_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ fld.d f22, a6, 0
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr14, vr1, vr0
+ vilvl.b vr2, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ xvpermi.q xr14, xr4, 0x20
+ vilvl.h vr5, vr3, vr2
+ vilvh.h vr15, vr3, vr2
+ xvpermi.q xr15, xr5, 0x20
+ xvmulwev.w.h xr0, xr14, xr15
+ xvmaddwod.w.h xr0, xr14, xr15
+ xvssrarni.hu.w xr1, xr0, mask_sh
+ xvssrlni.bu.h xr2, xr1, 0
+ fst.s f2, a0, 0
+ add.d a0, a0, a1
+ xvstelm.w xr2, a0, 0, 4
+
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ addi.d a6, a6, 8
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W4_LASX
+ b .MASK_END_LASX
+
+.MASK_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ vld vr22, a6, 0
+
+ vext2xv.hu.bu xr2, xr22
+ xvsub.h xr3, xr21, xr2
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvssrarni.hu.w xr5, xr4, mask_sh
+ xvssrlni.bu.h xr1, xr5, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ fst.d f0, a0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr0, a0, 0, 2
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W8_LASX
+ b .MASK_END_LASX
+
+.MASK_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ vld vr22, a6, 0
+
+ vext2xv.hu.bu xr2, xr22
+ xvsub.h xr3, xr21, xr2
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvssrarni.hu.w xr5, xr4, mask_sh
+ xvssrlni.bu.h xr1, xr5, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvpermi.d xr1, xr0, 0xD8
+ vst vr1, a0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W16_LASX
+ b .MASK_END_LASX
+.MASK_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W32_LASX
+ b .MASK_END_LASX
+
+.MASK_W64_LASX:
+.rept 2
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ addi.d a0, a0, 32
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W64_LASX
+ b .MASK_END_LASX
+
+.MASK_W128_LASX:
+.rept 4
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ addi.d a0, a0, 32
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W128_LASX
+.MASK_END_LASX:
+endfunc
+
+/*
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ uint8_t *mask, const int sign,
+ const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
+*/
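+// Sketch of the 8bpc math implemented below. Constants: vr20/xr20 = 64 and
+// vr22/xr22 = 38 (vldi 0x440 / 0x426), vr21/xr21 broadcasts the sign argument
+// (a7). Per pixel,
+//     m      = min(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64)
+//     dst[x] = clip_pixel((tmp1[x]*m + tmp2[x]*(64 - m) + 512) >> 10)
+// and for 4:2:0 each 2x2 group of m values is stored as
+//     mask[x] = (m0 + m1 + m2 + m3 + 2 - sign) >> 2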
+function w_mask_420_8bpc_lsx
+ addi.d sp, sp, -24
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ vldi vr20, 0x440
+ vreplgr2vr.h vr21, a7
+ vldi vr22, 0x426
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .WMASK420_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t8, t0, 0
+ add.d t1, t1, t8
+ jirl $r0, t1, 0
+
+ .align 3
+.WMASK420_LSX_JRTABLE:
+ .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE
+
+.WMASK420_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a3, 0
+ vld vr3, a3, 16
+ addi.w a5, a5, -4
+
+ vabsd.h vr4, vr0, vr2
+ vabsd.h vr5, vr1, vr3
+ vaddi.hu vr4, vr4, 8
+ vaddi.hu vr5, vr5, 8
+ vsrli.h vr4, vr4, 8
+ vsrli.h vr5, vr5, 8
+ vadd.h vr4, vr4, vr22
+ vadd.h vr5, vr5, vr22
+ vmin.hu vr6, vr4, vr20
+ vmin.hu vr7, vr5, vr20
+ vsub.h vr8, vr20, vr6
+ vsub.h vr9, vr20, vr7
+ vmulwev.w.h vr4, vr6, vr0
+ vmulwod.w.h vr5, vr6, vr0
+ vmulwev.w.h vr10, vr7, vr1
+ vmulwod.w.h vr11, vr7, vr1
+ vmaddwev.w.h vr4, vr8, vr2
+ vmaddwod.w.h vr5, vr8, vr2
+ vmaddwev.w.h vr10, vr9, vr3
+ vmaddwod.w.h vr11, vr9, vr3
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvl.w vr2, vr11, vr10
+ vilvh.w vr3, vr11, vr10
+ vssrarni.hu.w vr1, vr0, 10
+ vssrarni.hu.w vr3, vr2, 10
+ vssrlni.bu.h vr3, vr1, 0
+ vstelm.w vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 1
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 2
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 3
+ add.d a0, a0, a1
+ vpickev.h vr0, vr7, vr6
+ vpickod.h vr1, vr7, vr6
+ vadd.h vr0, vr0, vr1
+ vshuf4i.h vr0, vr0, 0xd8
+ vhaddw.w.h vr2, vr0, vr0
+ vpickev.h vr2, vr2, vr2
+ vsub.h vr2, vr2, vr21
+ vaddi.hu vr2, vr2, 2
+ vssrani.bu.h vr2, vr2, 2
+ vstelm.w vr2, a6, 0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W4_LSX
+ b .END_W420
+
+.WMASK420_W8_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a3, 0
+ vld vr3, a3, 16
+ addi.w a5, a5, -2
+
+ vabsd.h vr4, vr0, vr2
+ vabsd.h vr5, vr1, vr3
+ vaddi.hu vr4, vr4, 8
+ vaddi.hu vr5, vr5, 8
+ vsrli.h vr4, vr4, 8
+ vsrli.h vr5, vr5, 8
+ vadd.h vr4, vr4, vr22
+ vadd.h vr5, vr5, vr22
+ vmin.hu vr6, vr4, vr20
+ vmin.hu vr7, vr5, vr20
+ vsub.h vr8, vr20, vr6
+ vsub.h vr9, vr20, vr7
+ vmulwev.w.h vr4, vr6, vr0
+ vmulwod.w.h vr5, vr6, vr0
+ vmulwev.w.h vr10, vr7, vr1
+ vmulwod.w.h vr11, vr7, vr1
+ vmaddwev.w.h vr4, vr8, vr2
+ vmaddwod.w.h vr5, vr8, vr2
+ vmaddwev.w.h vr10, vr9, vr3
+ vmaddwod.w.h vr11, vr9, vr3
+ vssrarni.hu.w vr10, vr4, 10
+ vssrarni.hu.w vr11, vr5, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vshuf4i.w vr0, vr11, 0x4E
+ vilvl.b vr3, vr0, vr11
+ vstelm.d vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr3, a0, 0, 1
+ add.d a0, a0, a1
+ vpickev.h vr0, vr7, vr6
+ vpickod.h vr1, vr7, vr6
+ vadd.h vr0, vr0, vr1
+ vilvh.d vr2, vr0, vr0
+ vadd.h vr2, vr2, vr0
+ vsub.h vr2, vr2, vr21
+ vaddi.hu vr2, vr2, 2
+ vssrani.bu.h vr2, vr2, 2
+ vstelm.w vr2, a6, 0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W8_LSX
+ b .END_W420
+
+.WMASK420_W16_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ alsl.d a2, a4, a2, 1
+ vld vr2, a2, 0
+ vld vr3, a2, 16
+ vld vr4, a3, 0
+ vld vr5, a3, 16
+ alsl.d a3, a4, a3, 1
+ vld vr6, a3, 0
+ vld vr7, a3, 16
+
+ vabsd.h vr8, vr0, vr4
+ vabsd.h vr9, vr1, vr5
+ vabsd.h vr10, vr2, vr6
+ vabsd.h vr11, vr3, vr7
+ vaddi.hu vr8, vr8, 8
+ vaddi.hu vr9, vr9, 8
+ vaddi.hu vr10, vr10, 8
+ vaddi.hu vr11, vr11, 8
+ vsrli.h vr8, vr8, 8
+ vsrli.h vr9, vr9, 8
+ vsrli.h vr10, vr10, 8
+ vsrli.h vr11, vr11, 8
+ vadd.h vr8, vr8, vr22
+ vadd.h vr9, vr9, vr22
+ vadd.h vr10, vr10, vr22
+ vadd.h vr11, vr11, vr22
+ vmin.hu vr12, vr8, vr20
+ vmin.hu vr13, vr9, vr20
+ vmin.hu vr14, vr10, vr20
+ vmin.hu vr15, vr11, vr20
+ vsub.h vr16, vr20, vr12
+ vsub.h vr17, vr20, vr13
+ vsub.h vr18, vr20, vr14
+ vsub.h vr19, vr20, vr15
+ vmulwev.w.h vr8, vr12, vr0
+ vmulwod.w.h vr9, vr12, vr0
+ vmulwev.w.h vr10, vr13, vr1
+ vmulwod.w.h vr11, vr13, vr1
+ vmulwev.w.h vr23, vr14, vr2
+ vmulwod.w.h vr24, vr14, vr2
+ vmulwev.w.h vr25, vr15, vr3
+ vmulwod.w.h vr26, vr15, vr3
+ vmaddwev.w.h vr8, vr16, vr4
+ vmaddwod.w.h vr9, vr16, vr4
+ vmaddwev.w.h vr10, vr17, vr5
+ vmaddwod.w.h vr11, vr17, vr5
+ vmaddwev.w.h vr23, vr18, vr6
+ vmaddwod.w.h vr24, vr18, vr6
+ vmaddwev.w.h vr25, vr19, vr7
+ vmaddwod.w.h vr26, vr19, vr7
+ vssrarni.hu.w vr10, vr8, 10
+ vssrarni.hu.w vr11, vr9, 10
+ vssrarni.hu.w vr25, vr23, 10
+ vssrarni.hu.w vr26, vr24, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vssrlni.bu.h vr26, vr25, 0
+ vshuf4i.w vr0, vr11, 0x4E
+ vshuf4i.w vr1, vr26, 0x4E
+ vilvl.b vr3, vr0, vr11
+ vilvl.b vr7, vr1, vr26
+ vst vr3, a0, 0
+ vstx vr7, a0, a1
+ vpickev.h vr0, vr13, vr12
+ vpickod.h vr1, vr13, vr12
+ vpickev.h vr2, vr15, vr14
+ vpickod.h vr3, vr15, vr14
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vadd.h vr4, vr4, vr5
+ vsub.h vr4, vr4, vr21
+ vssrarni.bu.h vr4, vr4, 2
+ vstelm.d vr4, a6, 0, 0
+
+ alsl.d a2, a4, a2, 1
+ alsl.d a3, a4, a3, 1
+ alsl.d a0, a1, a0, 1
+ addi.d a6, a6, 8
+ addi.w a5, a5, -2
+ blt zero, a5, .WMASK420_W16_LSX
+ b .END_W420
+
+.WMASK420_W32_LSX:
+.WMASK420_W64_LSX:
+.WMASK420_W128_LSX:
+
+.LOOP_W32_420_LSX:
+    add.d         t1,     a2,     zero   // t1 = tmp1 row pointer
+    add.d         t2,     a3,     zero   // t2 = tmp2 row pointer
+    add.d         t3,     a0,     zero   // t3 = dst row pointer
+    add.d         t4,     a6,     zero   // t4 = mask row pointer
+    alsl.d        t5,     a4,     t1,     1  // t5 = tmp1 + w (next row)
+    alsl.d        t6,     a4,     t2,     1  // t6 = tmp2 + w (next row)
+    or            t7,     a4,     a4     // t7 = columns left in this row pair
+
+.W32_420_LSX:
+ vld vr0, t1, 0
+ vld vr1, t1, 16
+ vld vr2, t2, 0
+ vld vr3, t2, 16
+ vld vr4, t5, 0
+ vld vr5, t5, 16
+ vld vr6, t6, 0
+ vld vr7, t6, 16
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 32
+ addi.w t7, t7, -16
+ vabsd.h vr8, vr0, vr2
+ vabsd.h vr9, vr1, vr3
+ vabsd.h vr10, vr4, vr6
+ vabsd.h vr11, vr5, vr7
+ vaddi.hu vr8, vr8, 8
+ vaddi.hu vr9, vr9, 8
+ vaddi.hu vr10, vr10, 8
+ vaddi.hu vr11, vr11, 8
+ vsrli.h vr8, vr8, 8
+ vsrli.h vr9, vr9, 8
+ vsrli.h vr10, vr10, 8
+ vsrli.h vr11, vr11, 8
+ vadd.h vr8, vr8, vr22
+ vadd.h vr9, vr9, vr22
+ vadd.h vr10, vr10, vr22
+ vadd.h vr11, vr11, vr22
+ vmin.hu vr12, vr8, vr20
+ vmin.hu vr13, vr9, vr20
+ vmin.hu vr14, vr10, vr20
+ vmin.hu vr15, vr11, vr20
+ vsub.h vr16, vr20, vr12
+ vsub.h vr17, vr20, vr13
+ vsub.h vr18, vr20, vr14
+ vsub.h vr19, vr20, vr15
+ vmulwev.w.h vr8, vr12, vr0
+ vmulwod.w.h vr9, vr12, vr0
+ vmulwev.w.h vr10, vr13, vr1
+ vmulwod.w.h vr11, vr13, vr1
+ vmulwev.w.h vr23, vr14, vr4
+ vmulwod.w.h vr24, vr14, vr4
+ vmulwev.w.h vr25, vr15, vr5
+ vmulwod.w.h vr26, vr15, vr5
+ vmaddwev.w.h vr8, vr16, vr2
+ vmaddwod.w.h vr9, vr16, vr2
+ vmaddwev.w.h vr10, vr17, vr3
+ vmaddwod.w.h vr11, vr17, vr3
+ vmaddwev.w.h vr23, vr18, vr6
+ vmaddwod.w.h vr24, vr18, vr6
+ vmaddwev.w.h vr25, vr19, vr7
+ vmaddwod.w.h vr26, vr19, vr7
+ vssrarni.hu.w vr10, vr8, 10
+ vssrarni.hu.w vr11, vr9, 10
+ vssrarni.hu.w vr25, vr23, 10
+ vssrarni.hu.w vr26, vr24, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vssrlni.bu.h vr26, vr25, 0
+ vshuf4i.w vr8, vr11, 0x4E
+ vshuf4i.w vr9, vr26, 0x4E
+ vilvl.b vr3, vr8, vr11
+ vilvl.b vr7, vr9, vr26
+ vst vr3, t3, 0
+ vstx vr7, a1, t3
+ addi.d t3, t3, 16
+ vpickev.h vr8, vr13, vr12
+ vpickod.h vr9, vr13, vr12
+ vpickev.h vr10, vr15, vr14
+ vpickod.h vr11, vr15, vr14
+ vadd.h vr8, vr8, vr9
+ vadd.h vr10, vr10, vr11
+ vadd.h vr12, vr8, vr10
+ vsub.h vr12, vr12, vr21
+ vssrarni.bu.h vr12, vr12, 2
+ vstelm.d vr12, t4, 0, 0
+ addi.d t4, t4, 8
+ bne t7, zero, .W32_420_LSX
+
+ alsl.d a2, a4, a2, 2
+ alsl.d a3, a4, a3, 2
+ alsl.d a0, a1, a0, 1
+ srai.w t8, a4, 1
+ add.d a6, a6, t8
+ addi.w a5, a5, -2
+ blt zero, a5, .LOOP_W32_420_LSX
+
+.END_W420:
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ addi.d sp, sp, 24
+endfunc
+
+function w_mask_420_8bpc_lasx
+ xvldi xr20, 0x440
+ xvreplgr2vr.h xr21, a7
+ xvldi xr22, 0x426
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .WMASK420_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t8, t0, 0
+ add.d t1, t1, t8
+ jirl $r0, t1, 0
+
+ .align 3
+.WMASK420_LASX_JRTABLE:
+ .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE
+
+.WMASK420_W4_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ addi.w a5, a5, -4
+
+ xvabsd.h xr2, xr0, xr1
+ xvaddi.hu xr2, xr2, 8
+ xvsrli.h xr2, xr2, 8
+ xvadd.h xr2, xr2, xr22
+ xvmin.hu xr3, xr2, xr20
+ xvsub.h xr4, xr20, xr3
+ xvmulwev.w.h xr5, xr3, xr0
+ xvmulwod.w.h xr6, xr3, xr0
+ xvmaddwev.w.h xr5, xr4, xr1
+ xvmaddwod.w.h xr6, xr4, xr1
+ xvilvl.w xr7, xr6, xr5
+ xvilvh.w xr8, xr6, xr5
+ xvssrarni.hu.w xr8, xr7, 10
+ xvssrlni.bu.h xr9, xr8, 0
+ vstelm.w vr9, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr9, a0, 0, 1
+ add.d a0, a0, a1
+ xvstelm.w xr9, a0, 0, 4
+ add.d a0, a0, a1
+ xvstelm.w xr9, a0, 0, 5
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr3, xr3, xr3
+ xvpermi.d xr4, xr3, 0xb1
+ xvadd.h xr3, xr3, xr4
+ xvpickev.h xr3, xr3, xr3
+ xvsub.h xr3, xr3, xr21
+ xvssrarni.bu.h xr3, xr3, 2
+ vstelm.h vr3, a6, 0, 0
+ xvstelm.h xr3, a6, 2, 8
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W4_LASX
+ b .END_W420_LASX
+
+.WMASK420_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ xvld xr2, a3, 0
+ xvld xr3, a3, 32
+ addi.w a5, a5, -4
+
+ xvabsd.h xr4, xr0, xr2
+ xvabsd.h xr5, xr1, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr6, xr4, xr20
+ xvmin.hu xr7, xr5, xr20
+ xvsub.h xr8, xr20, xr6
+ xvsub.h xr9, xr20, xr7
+ xvmulwev.w.h xr10, xr6, xr0
+ xvmulwod.w.h xr11, xr6, xr0
+ xvmulwev.w.h xr12, xr7, xr1
+ xvmulwod.w.h xr13, xr7, xr1
+ xvmaddwev.w.h xr10, xr8, xr2
+ xvmaddwod.w.h xr11, xr8, xr2
+ xvmaddwev.w.h xr12, xr9, xr3
+ xvmaddwod.w.h xr13, xr9, xr3
+ xvssrarni.hu.w xr12, xr10, 10
+ xvssrarni.hu.w xr13, xr11, 10
+ xvssrlni.bu.h xr13, xr12, 0
+ xvshuf4i.w xr1, xr13, 0x4E
+ xvilvl.b xr17, xr1, xr13
+ vstelm.d vr17, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 2
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 1
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 3
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr6, xr6, xr6
+ xvhaddw.w.h xr7, xr7, xr7
+ xvpickev.h xr8, xr7, xr6
+ xvpermi.q xr9, xr8, 0x01
+ vadd.h vr8, vr8, vr9
+ vsub.h vr8, vr8, vr21
+ vssrarni.bu.h vr8, vr8, 2
+ vstelm.d vr8, a6, 0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 8
+ blt zero, a5, .WMASK420_W8_LASX
+ b .END_W420_LASX
+
+.WMASK420_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ xvld xr2, a3, 0
+ xvld xr3, a3, 32
+ addi.w a5, a5, -2
+
+ xvabsd.h xr4, xr0, xr2
+ xvabsd.h xr5, xr1, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr4, xr4, xr20
+ xvmin.hu xr5, xr5, xr20
+ xvsub.h xr6, xr20, xr4
+ xvsub.h xr7, xr20, xr5
+ xvmulwev.w.h xr8, xr4, xr0
+ xvmulwod.w.h xr9, xr4, xr0
+ xvmulwev.w.h xr10, xr5, xr1
+ xvmulwod.w.h xr11, xr5, xr1
+ xvmaddwev.w.h xr8, xr6, xr2
+ xvmaddwod.w.h xr9, xr6, xr2
+ xvmaddwev.w.h xr10, xr7, xr3
+ xvmaddwod.w.h xr11, xr7, xr3
+ xvssrarni.hu.w xr10, xr8, 10
+ xvssrarni.hu.w xr11, xr9, 10
+ xvssrlni.bu.h xr11, xr10, 0
+ xvshuf4i.w xr8, xr11, 0x4E
+ xvilvl.b xr15, xr8, xr11
+ xvpermi.d xr16, xr15, 0xd8
+ vst vr16, a0, 0
+ add.d a0, a0, a1
+ xvpermi.q xr16, xr16, 0x01
+ vst vr16, a0, 0
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr4, xr4, xr4
+ xvhaddw.w.h xr5, xr5, xr5
+ xvadd.h xr4, xr5, xr4
+ xvpickev.h xr6, xr4, xr4
+ xvpermi.d xr7, xr6, 0x08
+ vsub.h vr7, vr7, vr21
+ vssrarni.bu.h vr7, vr7, 2
+ vstelm.d vr7, a6, 0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 8
+ blt zero, a5, .WMASK420_W16_LASX
+ b .END_W420_LASX
+
+.WMASK420_W32_LASX:
+.WMASK420_W64_LASX:
+.WMASK420_W128_LASX:
+
+.LOOP_W32_420_LASX:
+ add.d t1, a2, zero
+ add.d t2, a3, zero
+ add.d t3, a0, zero
+ add.d t4, a6, zero
+ alsl.d t5, a4, t1, 1
+ alsl.d t6, a4, t2, 1
+ or t7, a4, a4
+.W32_420_LASX:
+ xvld xr0, t1, 0
+ xvld xr1, t2, 0
+ xvld xr2, t5, 0
+ xvld xr3, t6, 0
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 32
+ addi.w t7, t7, -16
+ xvabsd.h xr4, xr0, xr1
+ xvabsd.h xr5, xr2, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr6, xr4, xr20
+ xvmin.hu xr7, xr5, xr20
+ xvsub.h xr8, xr20, xr6
+ xvsub.h xr9, xr20, xr7
+ xvmulwev.w.h xr10, xr6, xr0
+ xvmulwod.w.h xr11, xr6, xr0
+ xvmulwev.w.h xr12, xr7, xr2
+ xvmulwod.w.h xr13, xr7, xr2
+ xvmaddwev.w.h xr10, xr8, xr1
+ xvmaddwod.w.h xr11, xr8, xr1
+ xvmaddwev.w.h xr12, xr9, xr3
+ xvmaddwod.w.h xr13, xr9, xr3
+ xvssrarni.hu.w xr12, xr10, 10
+ xvssrarni.hu.w xr13, xr11, 10
+ xvssrlni.bu.h xr13, xr12, 0
+ xvshuf4i.w xr10, xr13, 0x4E
+ xvilvl.b xr17, xr10, xr13
+ xvpermi.d xr18, xr17, 0x08
+ xvpermi.d xr19, xr17, 0x0d
+ vst vr18, t3, 0
+ vstx vr19, t3, a1
+ addi.d t3, t3, 16
+
+ xvhaddw.w.h xr6, xr6, xr6
+ xvhaddw.w.h xr7, xr7, xr7
+ xvadd.h xr6, xr7, xr6
+ xvpickev.h xr7, xr6, xr6
+ xvpermi.d xr8, xr7, 0x08
+ vsub.h vr9, vr8, vr21
+ vssrarni.bu.h vr9, vr9, 2
+ vstelm.d vr9, t4, 0, 0
+ addi.d t4, t4, 8
+ bne t7, zero, .W32_420_LASX
+
+ alsl.d a2, a4, a2, 2
+ alsl.d a3, a4, a3, 2
+ alsl.d a0, a1, a0, 1
+ srai.w t8, a4, 1
+ add.d a6, a6, t8
+ addi.w a5, a5, -2
+ blt zero, a5, .LOOP_W32_420_LASX
+
+.END_W420_LASX:
+endfunc
+
+#undef bpc_sh
+#undef bpcw_sh
+
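+// Reduction helpers for the 8-tap filters below: vhaddw.d.h folds each group
+// of four 16-bit lanes into a 64-bit sum via two widening horizontal adds,
+// and vhaddw.q.w folds four 32-bit lanes into a single 128-bit sum, so a full
+// 8-tap dot product ends up in the low element of the vector.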
+.macro vhaddw.d.h in0
+ vhaddw.w.h \in0, \in0, \in0
+ vhaddw.d.w \in0, \in0, \in0
+.endm
+.macro vhaddw.q.w in0
+ vhaddw.d.w \in0, \in0, \in0
+ vhaddw.q.d \in0, \in0, \in0
+.endm
+.macro PUT_H_8W in0
+ vbsrl.v vr2, \in0, 1
+ vbsrl.v vr3, \in0, 2
+ vbsrl.v vr4, \in0, 3
+ vbsrl.v vr5, \in0, 4
+ vbsrl.v vr6, \in0, 5
+ vbsrl.v vr7, \in0, 6
+ vbsrl.v vr10, \in0, 7
+ vilvl.d vr2, vr2, \in0
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr4, vr6, vr5
+ vilvl.d vr5, vr10, vr7
+ vdp2.h.bu.b \in0, vr2, vr8
+ vdp2.h.bu.b vr2, vr3, vr8
+ vdp2.h.bu.b vr3, vr4, vr8
+ vdp2.h.bu.b vr4, vr5, vr8
+ vhaddw.d.h \in0
+ vhaddw.d.h vr2
+ vhaddw.d.h vr3
+ vhaddw.d.h vr4
+ vpickev.w \in0, vr2, \in0
+ vpickev.w vr2, vr4, vr3
+ vpickev.h \in0, vr2, \in0
+ vadd.h \in0, \in0, vr9
+.endm
+.macro FILTER_8TAP_4W in0
+ vbsrl.v vr10, \in0, 1
+ vbsrl.v vr11, \in0, 2
+ vbsrl.v vr12, \in0, 3
+ vilvl.d vr10, vr10, \in0
+ vilvl.d vr11, vr12, vr11
+ vdp2.h.bu.b vr7, vr10, vr8
+ vdp2.h.bu.b vr10, vr11, vr8
+ vhaddw.d.h vr7
+ vhaddw.d.h vr10
+ vpickev.w \in0, vr10, vr7
+.endm
+.macro FILTER_8TAP_8W in0
+ vbsrl.v vr10, \in0, 1
+ vbsrl.v vr11, \in0, 2
+ vbsrl.v vr12, \in0, 3
+ vbsrl.v vr13, \in0, 4
+ vbsrl.v vr14, \in0, 5
+ vbsrl.v vr15, \in0, 6
+ vbsrl.v vr16, \in0, 7
+ vilvl.d vr10, vr10, \in0
+ vilvl.d vr11, vr12, vr11
+ vilvl.d vr12, vr14, vr13
+ vilvl.d vr13, vr16, vr15
+ vdp2.h.bu.b vr14, vr10, vr8
+ vdp2.h.bu.b vr15, vr11, vr8
+ vdp2.h.bu.b vr16, vr12, vr8
+ vdp2.h.bu.b vr17, vr13, vr8
+ vhaddw.d.h vr14
+ vhaddw.d.h vr15
+ vhaddw.d.h vr16
+ vhaddw.d.h vr17
+ vpickev.w vr13, vr15, vr14
+ vpickev.w vr14, vr17, vr16
+ vpickev.h \in0, vr14, vr13 //x0 ... x7
+ vsrari.h \in0, \in0, 2
+.endm
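+// Vertical stage of the h+v path for an 8-wide column strip: vr0..vr7 hold
+// the 8-row history of each column (built via LSX_TRANSPOSE8x8_H) and vr9
+// holds the vertical taps. VEXTRINS_Hx8 appends a freshly h-filtered row into
+// slot 7 of every column register, FILTER_8TAP_8W_CLIP_STORE computes the
+// eight dot products, rounds by 10, saturates to 8 bits and stores one output
+// row, and VBSRL_Vx8 then slides every history down by one row.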
+.macro FILTER_8TAP_8W_CLIP_STORE
+ vdp2.w.h vr12, vr0, vr9
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr2, vr9
+ vdp2.w.h vr15, vr3, vr9
+ vdp2.w.h vr16, vr4, vr9
+ vdp2.w.h vr17, vr5, vr9
+ vdp2.w.h vr18, vr6, vr9
+ vdp2.w.h vr19, vr7, vr9
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vhaddw.q.w vr15
+ vhaddw.q.w vr16
+ vhaddw.q.w vr17
+ vhaddw.q.w vr18
+ vhaddw.q.w vr19
+ vpackev.w vr12, vr13, vr12
+ vpackev.w vr13, vr15, vr14
+ vpackev.d vr12, vr13, vr12
+ vpackev.w vr14, vr17, vr16
+ vpackev.w vr15, vr19, vr18
+ vpackev.d vr13, vr15, vr14
+ vssrarni.hu.w vr13, vr12, 10
+ vssrani.bu.h vr13, vr13, 0
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a1
+.endm
+.macro VEXTRINS_Hx8 in0
+ vextrins.h vr0, \in0, 0x70
+ vextrins.h vr1, \in0, 0x71
+ vextrins.h vr2, \in0, 0x72
+ vextrins.h vr3, \in0, 0x73
+ vextrins.h vr4, \in0, 0x74
+ vextrins.h vr5, \in0, 0x75
+ vextrins.h vr6, \in0, 0x76
+ vextrins.h vr7, \in0, 0x77
+.endm
+.macro VBSRL_Vx8
+ vbsrl.v vr0, vr0, 2
+ vbsrl.v vr1, vr1, 2
+ vbsrl.v vr2, vr2, 2
+ vbsrl.v vr3, vr3, 2
+ vbsrl.v vr4, vr4, 2
+ vbsrl.v vr5, vr5, 2
+ vbsrl.v vr6, vr6, 2
+ vbsrl.v vr7, vr7, 2
+.endm
+
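+// Shared put_8tap body, expanded by the nine put_8tap_*_8bpc_lsx wrappers
+// below. The wrapper spills its filter-type code to sp[0]; bits 0-1 pick the
+// horizontal filter family and bits 2-3 the vertical one, each family being a
+// 120-byte row (15 subpel positions x 8 taps) of dav1d_mc_subpel_filters, and
+// for block dimensions <= 4 the index is remapped (to 3 or 4), which appears
+// to select the reduced-tap small-block filters. a6/a7 carry mx/my, so the
+// code below branches to the copy, h-only, v-only or h+v paths accordingly.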
+.macro PUT_8TAP_8BPC_LSX lable
+ li.w t0, 4
+ la.local t6, dav1d_mc_subpel_filters
+ slli.d t2, a3, 1 //src_stride*2
+ add.d t3, t2, a3 //src_stride*3
+ slli.d t4, t2, 1 //src_stride*4
+
+ bnez a6, .l_\lable\()put_h //mx
+ bnez a7, .l_\lable\()put_v //my
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_hv0_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_hv0_jtable:
+ .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable
+
+.l_\lable\()put_hv0_2w:
+ vldrepl.h vr0, a2, 0
+ add.d a2, a2, a3
+ vldrepl.h vr1, a2, 0
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr1, a0, 0, 0
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_2w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_4w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fst.s f0, a0, 0
+ fstx.s f1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_4w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_8w:
+ fld.d f0, a2, 0
+ fldx.d f1, a2, a3
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_8w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_16w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vst vr0, a0, 0
+ vstx vr1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_16w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_32w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ add.d a2, a2, a3
+ vld vr2, a2, 0
+ vld vr3, a2, 16
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ add.d a0, a0, a1
+ vst vr2, a0, 0
+ vst vr3, a0, 16
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_32w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_64w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+ add.d a2, a2, a3
+ vld vr4, a2, 0
+ vld vr5, a2, 16
+ vld vr6, a2, 32
+ vld vr7, a2, 48
+ add.d a2, a2, a3
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a0, 32
+ vst vr3, a0, 48
+ add.d a0, a0, a1
+ vst vr4, a0, 0
+ vst vr5, a0, 16
+ vst vr6, a0, 32
+ vst vr7, a0, 48
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_64w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_128w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+ vld vr4, a2, 64
+ vld vr5, a2, 80
+ vld vr6, a2, 96
+ vld vr7, a2, 112
+ add.d a2, a2, a3
+ vld vr8, a2, 0
+ vld vr9, a2, 16
+ vld vr10, a2, 32
+ vld vr11, a2, 48
+ vld vr12, a2, 64
+ vld vr13, a2, 80
+ vld vr14, a2, 96
+ vld vr15, a2, 112
+ add.d a2, a2, a3
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a0, 32
+ vst vr3, a0, 48
+ vst vr4, a0, 64
+ vst vr5, a0, 80
+ vst vr6, a0, 96
+ vst vr7, a0, 112
+ add.d a0, a0, a1
+ vst vr8, a0, 0
+ vst vr9, a0, 16
+ vst vr10, a0, 32
+ vst vr11, a0, 48
+ vst vr12, a0, 64
+ vst vr13, a0, 80
+ vst vr14, a0, 96
+ vst vr15, a0, 112
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_128w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h:
+    bnez          a7,     .l_\lable\()put_hv // fh && fv: combined h+v path
+ ld.d t5, sp, 0 //filter_type
+ andi t1, t5, 3
+ blt t0, a4, .l_\lable\()put_h_idx_fh
+ andi t1, t5, 1
+ addi.w t1, t1, 3
+
+.l_\lable\()put_h_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ vldrepl.d vr8, t1, 0
+    addi.d        a2,     a2,     -3     // step back 3 columns for the 8-tap window
+    li.w          t1,     34             // rounding bias of the one-pass h filter (applied before >>6)
+    vreplgr2vr.h  vr9,    t1
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_h_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_h_jtable:
+ .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable
+
+.l_\lable\()put_h_2w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr2, vr0, 1
+ vilvl.d vr0, vr2, vr0
+ vdp2.h.bu.b vr2, vr0, vr8
+ vhaddw.w.h vr0, vr2, vr2
+ vhaddw.d.w vr0, vr0, vr0
+ vbsrl.v vr2, vr1, 1
+ vilvl.d vr1, vr2, vr1
+ vdp2.h.bu.b vr2, vr1, vr8
+ vhaddw.w.h vr1, vr2, vr2
+ vhaddw.d.w vr1, vr1, vr1
+ vpickev.w vr0, vr1, vr0
+ vpickev.h vr0, vr0, vr0
+ vadd.h vr0, vr0, vr9
+ vssrani.bu.h vr0, vr0, 6
+
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_h_2w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_4w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr0, 2
+ vbsrl.v vr4, vr0, 3
+ vilvl.d vr0, vr2, vr0 //x0 x1
+ vilvl.d vr2, vr4, vr3 //x2 x3
+ vdp2.h.bu.b vr3, vr0, vr8
+ vdp2.h.bu.b vr4, vr2, vr8
+ vhaddw.w.h vr0, vr3, vr3
+ vhaddw.d.w vr0, vr0, vr0
+ vhaddw.w.h vr2, vr4, vr4
+ vhaddw.d.w vr2, vr2, vr2
+ vpickev.w vr5, vr2, vr0
+ vbsrl.v vr2, vr1, 1
+ vbsrl.v vr3, vr1, 2
+ vbsrl.v vr4, vr1, 3
+ vilvl.d vr0, vr2, vr1 //x0 x1
+ vilvl.d vr2, vr4, vr3 //x2 x3
+ vdp2.h.bu.b vr3, vr0, vr8
+ vdp2.h.bu.b vr4, vr2, vr8
+ vhaddw.w.h vr0, vr3, vr3
+ vhaddw.d.w vr0, vr0, vr0
+ vhaddw.w.h vr2, vr4, vr4
+ vhaddw.d.w vr2, vr2, vr2
+ vpickev.w vr6, vr2, vr0
+ vpickev.h vr0, vr6, vr5
+ vadd.h vr0, vr0, vr9
+ vssrani.bu.h vr0, vr0, 6
+
+ vstelm.w vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_h_4w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_8w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+ PUT_H_8W vr0
+ PUT_H_8W vr1
+ vssrani.bu.h vr1, vr0, 6
+ vstelm.d vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr1, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_h_8w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_16w:
+.l_\lable\()put_h_32w:
+.l_\lable\()put_h_64w:
+.l_\lable\()put_h_128w:
+ addi.d t0, a2, 0 //src
+ addi.w t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_h_16w_loop:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+ PUT_H_8W vr0
+ PUT_H_8W vr1
+ vssrani.bu.h vr1, vr0, 6
+ vstelm.d vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr1, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_h_16w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.w a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_h_16w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v:
+ ld.d t1, sp, 0 //filter_type
+ srli.w t1, t1, 2
+ blt t0, a5, .l_\lable\()put_v_idx_fv
+ andi t1, t1, 1
+ addi.w t1, t1, 3
+
+.l_\lable\()put_v_idx_fv:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a7, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fv's offset
+ vldrepl.d vr8, t1, 0
+    sub.d         a2,     a2,     t3     // start 3 rows above (t3 = 3*src_stride)
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_v_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_v_jtable:
+ .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable
+
+.l_\lable\()put_v_2w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fldx.s f2, a2, t2
+ add.d a2, a2, t3
+ fld.s f3, a2, 0
+ fldx.s f4, a2, a3
+ fldx.s f5, a2, t2
+ fldx.s f6, a2, t3
+ add.d a2, a2, t4
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
+ vilvl.w vr0, vr1, vr0
+
+.l_\lable\()put_v_2w_loop:
+ fld.s f7, a2, 0 //h0
+ fldx.s f10, a2, a3 //h1
+ add.d a2, a2, t2
+
+ vextrins.b vr0, vr7, 0x70
+ vextrins.b vr0, vr7, 0xf1
+ vbsrl.v vr1, vr0, 1
+ vextrins.b vr1, vr10, 0x70
+ vextrins.b vr1, vr10, 0xf1
+ vdp2.h.bu.b vr10, vr0, vr8
+ vdp2.h.bu.b vr11, vr1, vr8
+ vbsrl.v vr0, vr1, 1
+ vhaddw.d.h vr10
+ vhaddw.d.h vr11
+ vpickev.w vr10, vr11, vr10
+ vssrarni.hu.w vr10, vr10, 6
+ vssrani.bu.h vr10, vr10, 0
+
+ vstelm.h vr10, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr10, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_2w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_4w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fldx.s f2, a2, t2
+ add.d a2, a2, t3
+ fld.s f3, a2, 0
+ fldx.s f4, a2, a3
+ fldx.s f5, a2, t2
+ fldx.s f6, a2, t3
+ add.d a2, a2, t4
+
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
+ vilvl.w vr2, vr1, vr0
+ vilvh.w vr3, vr1, vr0
+
+.l_\lable\()put_v_4w_loop:
+ fld.s f7, a2, 0
+ fldx.s f10, a2, a3
+ add.d a2, a2, t2
+
+ vextrins.b vr2, vr7, 0x70
+ vextrins.b vr2, vr7, 0xf1 //x0x1(h0)
+ vbsrl.v vr4, vr2, 1
+ vextrins.b vr4, vr10, 0x70
+ vextrins.b vr4, vr10, 0xf1 //x0x1(h1)
+ vdp2.h.bu.b vr11, vr2, vr8
+ vdp2.h.bu.b vr12, vr4, vr8
+ vbsrl.v vr2, vr4, 1
+
+ vextrins.b vr3, vr7, 0x72
+ vextrins.b vr3, vr7, 0xf3 //x2x3(h0)
+ vbsrl.v vr4, vr3, 1
+ vextrins.b vr4, vr10, 0x72
+ vextrins.b vr4, vr10, 0xf3 //x2x3(h1)
+ vdp2.h.bu.b vr13, vr3, vr8
+ vdp2.h.bu.b vr14, vr4, vr8
+ vbsrl.v vr3, vr4, 1
+
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+
+ vpickev.w vr11, vr13, vr11
+ vpickev.w vr12, vr14, vr12
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ vstelm.w vr11, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr11, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_4w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_8w:
+.l_\lable\()put_v_16w:
+.l_\lable\()put_v_32w:
+.l_\lable\()put_v_64w:
+.l_\lable\()put_v_128w:
+ addi.d t0, a2, 0 //src
+ addi.d t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_v_8w_loop0:
+ fld.d f0, a2, 0
+ fldx.d f1, a2, a3
+ fldx.d f2, a2, t2
+ add.d a2, a2, t3
+ fld.d f3, a2, 0
+ fldx.d f4, a2, a3
+ fldx.d f5, a2, t2
+ fldx.d f6, a2, t3
+ add.d a2, a2, t4
+
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr6, vr3, vr2
+ vilvh.h vr7, vr3, vr2
+ vilvl.w vr0, vr6, vr4 // x0x1
+ vilvh.w vr1, vr6, vr4 // x2x3
+ vilvl.w vr2, vr7, vr5 // x4x5
+ vilvh.w vr3, vr7, vr5 // x6x7
+.l_\lable\()put_v_8w_loop:
+ fld.d f7, a2, 0
+ fldx.d f10, a2, a3
+ add.d a2, a2, t2
+ //h0
+ vextrins.b vr0, vr7, 0x70
+ vextrins.b vr0, vr7, 0xf1
+ vextrins.b vr1, vr7, 0x72
+ vextrins.b vr1, vr7, 0xf3
+ vextrins.b vr2, vr7, 0x74
+ vextrins.b vr2, vr7, 0xf5
+ vextrins.b vr3, vr7, 0x76
+ vextrins.b vr3, vr7, 0xf7
+ vdp2.h.bu.b vr11, vr0, vr8
+ vdp2.h.bu.b vr12, vr1, vr8
+ vdp2.h.bu.b vr13, vr2, vr8
+ vdp2.h.bu.b vr14, vr3, vr8
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vpickev.w vr11, vr12, vr11
+ vpickev.w vr12, vr14, vr13
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ fst.d f11, a0, 0
+ add.d a0, a0, a1
+ //h1
+ vbsrl.v vr0, vr0, 1
+ vbsrl.v vr1, vr1, 1
+ vbsrl.v vr2, vr2, 1
+ vbsrl.v vr3, vr3, 1
+ vextrins.b vr0, vr10, 0x70
+ vextrins.b vr0, vr10, 0xf1
+ vextrins.b vr1, vr10, 0x72
+ vextrins.b vr1, vr10, 0xf3
+ vextrins.b vr2, vr10, 0x74
+ vextrins.b vr2, vr10, 0xf5
+ vextrins.b vr3, vr10, 0x76
+ vextrins.b vr3, vr10, 0xf7
+ vdp2.h.bu.b vr11, vr0, vr8
+ vdp2.h.bu.b vr12, vr1, vr8
+ vdp2.h.bu.b vr13, vr2, vr8
+ vdp2.h.bu.b vr14, vr3, vr8
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vpickev.w vr11, vr12, vr11
+ vpickev.w vr12, vr14, vr13
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ fst.d f11, a0, 0
+ add.d a0, a0, a1
+ vbsrl.v vr0, vr0, 1
+ vbsrl.v vr1, vr1, 1
+ vbsrl.v vr2, vr2, 1
+ vbsrl.v vr3, vr3, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_8w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_v_8w_loop0
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv:
+ ld.d t5, sp, 0 //filter_type
+ andi t1, t5, 3
+ blt t0, a4, .l_\lable\()put_hv_idx_fh
+ andi t1, t5, 1
+ addi.w t1, t1, 3
+.l_\lable\()put_hv_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ vldrepl.d vr8, t1, 0
+ ld.d t1, sp, 0 //filter_type
+ srli.w t1, t1, 2
+ blt t0, a5, .l_\lable\()put_hv_idx_fv
+ andi t1, t1, 1
+ addi.w t1, t1, 3
+.l_\lable\()put_hv_idx_fv:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a7, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fv's offset
+ vldrepl.d vr9, t1, 0
+ vexth.h.b vr9, vr9
+
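+    // back the source pointer up 3 rows and 3 columns so that the 8-tap
+    // window is centred on the first output pixel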
+ sub.d a2, a2, t3
+ addi.d a2, a2, -3
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_hv_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_hv_jtable:
+ .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable
+
+.l_\lable\()put_hv_2w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+
+ vbsrl.v vr10, vr0, 1
+ vbsrl.v vr11, vr1, 1
+ vbsrl.v vr12, vr2, 1
+ vbsrl.v vr13, vr3, 1
+ vbsrl.v vr14, vr4, 1
+ vbsrl.v vr15, vr5, 1
+ vbsrl.v vr16, vr6, 1
+ vilvl.d vr0, vr10, vr0
+ vilvl.d vr1, vr11, vr1
+ vilvl.d vr2, vr12, vr2
+ vilvl.d vr3, vr13, vr3
+ vilvl.d vr4, vr14, vr4
+ vilvl.d vr5, vr15, vr5
+ vilvl.d vr6, vr16, vr6
+ vdp2.h.bu.b vr10, vr0, vr8
+ vdp2.h.bu.b vr11, vr1, vr8
+ vdp2.h.bu.b vr12, vr2, vr8
+ vdp2.h.bu.b vr13, vr3, vr8
+ vdp2.h.bu.b vr14, vr4, vr8
+ vdp2.h.bu.b vr15, vr5, vr8
+ vdp2.h.bu.b vr16, vr6, vr8
+ vhaddw.d.h vr10
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vhaddw.d.h vr15
+ vhaddw.d.h vr16
+
+ vpackev.w vr10, vr11, vr10
+ vpackev.w vr12, vr13, vr12
+ vpackod.d vr11, vr12, vr10
+ vpackev.d vr10, vr12, vr10
+
+ vpackev.w vr12, vr15, vr14
+ vpackev.w vr16, vr17, vr16
+ vpackod.d vr13, vr16, vr12
+ vpackev.d vr12, vr16, vr12
+
+ vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0)
+ vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1)
+ vsrari.h vr10, vr10, 2
+ vsrari.h vr11, vr11, 2
+.l_\lable\()put_hv_2w_loop:
+ vld vr7, a2, 0
+ vldx vr12, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr1, vr7, 1
+ vbsrl.v vr2, vr12, 1
+ vilvl.d vr0, vr1, vr7
+ vilvl.d vr1, vr2, vr12
+ vdp2.h.bu.b vr2, vr0, vr8
+ vdp2.h.bu.b vr3, vr1, vr8
+ vhaddw.d.h vr2
+ vhaddw.d.h vr3
+ vpickev.w vr2, vr3, vr2
+ vpickev.h vr2, vr2, vr2
+ vsrari.h vr2, vr2, 2
+ vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7
+ vextrins.h vr11, vr2, 0x71
+ vbsrl.v vr12, vr10, 2
+ vbsrl.v vr13, vr11, 2
+ vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8
+ vextrins.h vr13, vr2, 0x73
+ vdp2.w.h vr0, vr10, vr9
+ vdp2.w.h vr1, vr11, vr9
+ vdp2.w.h vr2, vr12, vr9
+ vdp2.w.h vr3, vr13, vr9
+ vhaddw.q.w vr0
+ vhaddw.q.w vr1
+ vhaddw.q.w vr2
+ vhaddw.q.w vr3
+ vpackev.w vr0, vr1, vr0
+ vpackev.w vr1, vr3, vr2
+ vpackev.d vr0, vr1, vr0
+ vssrarni.hu.w vr0, vr0, 10
+ vssrani.bu.h vr0, vr0, 0
+ vbsrl.v vr10, vr12, 2
+ vbsrl.v vr11, vr13, 2
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_2w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_4w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+ FILTER_8TAP_4W vr0 //x0 x1 x2 x3
+ FILTER_8TAP_4W vr1
+ FILTER_8TAP_4W vr2
+ FILTER_8TAP_4W vr3
+ FILTER_8TAP_4W vr4
+ FILTER_8TAP_4W vr5
+ FILTER_8TAP_4W vr6
+ vpackev.h vr0, vr1, vr0
+ vpackev.h vr1, vr3, vr2
+ vpackev.h vr2, vr5, vr4
+ vpackev.h vr3, vr7, vr6
+ vilvl.w vr4, vr1, vr0
+ vilvh.w vr5, vr1, vr0
+ vilvl.w vr6, vr3, vr2
+ vilvh.w vr7, vr3, vr2
+ vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 *
+ vilvh.d vr1, vr6, vr4
+ vilvl.d vr2, vr7, vr5
+ vilvh.d vr3, vr7, vr5
+ vsrari.h vr0, vr0, 2
+ vsrari.h vr1, vr1, 2
+ vsrari.h vr2, vr2, 2
+ vsrari.h vr3, vr3, 2
+.l_\lable\()put_hv_4w_loop:
+ vld vr4, a2, 0
+ vldx vr5, a2, a3
+ add.d a2, a2, t2
+ FILTER_8TAP_4W vr4
+ FILTER_8TAP_4W vr5
+ vpickev.h vr4, vr5, vr4
+ vsrari.h vr4, vr4, 2
+ vextrins.h vr0, vr4, 0x70
+ vextrins.h vr1, vr4, 0x71
+ vextrins.h vr2, vr4, 0x72
+ vextrins.h vr3, vr4, 0x73
+ vbsrl.v vr5, vr0, 2
+ vbsrl.v vr6, vr1, 2
+ vbsrl.v vr7, vr2, 2
+ vbsrl.v vr10, vr3, 2
+ vextrins.h vr5, vr4, 0x74
+ vextrins.h vr6, vr4, 0x75
+ vextrins.h vr7, vr4, 0x76
+ vextrins.h vr10, vr4, 0x77
+ vdp2.w.h vr11, vr0, vr9
+ vdp2.w.h vr12, vr1, vr9
+ vdp2.w.h vr13, vr2, vr9
+ vdp2.w.h vr14, vr3, vr9
+ vhaddw.q.w vr11
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vpackev.w vr0, vr12, vr11
+ vpackev.w vr1, vr14, vr13
+ vpackev.d vr0, vr1, vr0
+ vdp2.w.h vr11, vr5, vr9
+ vdp2.w.h vr12, vr6, vr9
+ vdp2.w.h vr13, vr7, vr9
+ vdp2.w.h vr14, vr10, vr9
+ vhaddw.q.w vr11
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vpackev.w vr1, vr12, vr11
+ vpackev.w vr2, vr14, vr13
+ vpackev.d vr1, vr2, vr1
+ vssrarni.hu.w vr1, vr0, 10
+ vssrani.bu.h vr1, vr1, 0
+ vstelm.w vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr1, a0, 0, 1
+ add.d a0, a0, a1
+ vbsrl.v vr0, vr5, 2
+ vbsrl.v vr1, vr6, 2
+ vbsrl.v vr2, vr7, 2
+ vbsrl.v vr3, vr10, 2
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_4w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_8w:
+.l_\lable\()put_hv_16w:
+.l_\lable\()put_hv_32w:
+.l_\lable\()put_hv_64w:
+.l_\lable\()put_hv_128w:
+ addi.d t0, a2, 0 //src
+ addi.d t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_hv_8w_loop0:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+ FILTER_8TAP_8W vr0
+ FILTER_8TAP_8W vr1
+ FILTER_8TAP_8W vr2
+ FILTER_8TAP_8W vr3
+ FILTER_8TAP_8W vr4
+ FILTER_8TAP_8W vr5
+ FILTER_8TAP_8W vr6
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+ vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
+.l_\lable\()put_hv_8w_loop:
+ vld vr20, a2, 0
+ vldx vr21, a2, a3
+ add.d a2, a2, t2
+ FILTER_8TAP_8W vr20
+ FILTER_8TAP_8W vr21
+ VEXTRINS_Hx8 vr20
+ FILTER_8TAP_8W_CLIP_STORE
+ VBSRL_Vx8
+ VEXTRINS_Hx8 vr21
+ FILTER_8TAP_8W_CLIP_STORE
+ VBSRL_Vx8
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_8w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_hv_8w_loop0
+.l_\lable\()end_put_8tap:
+.endm
+
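+// Thin wrappers: each reserves 16 bytes of stack, stores its filter-type code
+// at sp[0] (0/1/2 = regular/smooth/sharp horizontally, +4/+8 = smooth/sharp
+// vertically) and expands PUT_8TAP_8BPC_LSX with a unique label id, so e.g.
+// put_8tap_regular_sharp (code 8) filters with regular taps horizontally and
+// sharp taps vertically.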
+function put_8tap_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ st.d zero, sp, 0
+ PUT_8TAP_8BPC_LSX 0
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 1
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 1
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 2
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 2
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_regular_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 4
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 4
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 5
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 5
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 6
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 6
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_regular_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 8
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 8
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 9
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 9
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 10
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 10
+ addi.d sp, sp, 16
+endfunc
+
+const shufb1
+.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
+endconst
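+// Per 128-bit lane, shufb1 holds the byte indices 0..7 followed by 1..8; the
+// SHUFB macro pairs it with an xvbsrl by 2 to turn one row of source pixels
+// into four overlapping 8-byte windows (source offsets 0..3), which feed the
+// xvdp2.h.bu.b dot products of the horizontal 8-tap filter.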
+
+.macro SHUFB in0, in1, tmp, out
+ xvbsrl.v \tmp, \in0, 2
+ xvpermi.q \tmp, \in0, 0x20
+ xvshuf.b \out, \tmp, \tmp, \in1
+.endm
+
+.macro HADDWDH in0
+ xvhaddw.w.h \in0, \in0, \in0
+ xvhaddw.d.w \in0, \in0, \in0
+.endm
+
+.macro HADDWQW in0
+ xvhaddw.d.w \in0, \in0, \in0
+ xvhaddw.q.d \in0, \in0, \in0
+.endm
+
+.macro PREP_W16_H in0
+ xvbsrl.v xr4, \in0, 4
+ xvbsrl.v xr5, \in0, 8
+ xvpermi.q xr9, \in0, 0x31
+ xvpackev.d xr5, xr9, xr5
+ xvbsrl.v xr6, xr5, 4
+ SHUFB \in0, xr23, xr9, \in0
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+ xvdp2.h.bu.b xr10, \in0, xr22
+ xvdp2.h.bu.b xr11, xr4, xr22
+ xvdp2.h.bu.b xr12, xr5, xr22
+ xvdp2.h.bu.b xr13, xr6, xr22
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h \in0, xr10, 2
+.endm
+
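+// Shared prep_8tap body (LASX): a0 = tmp, a1 = src, a2 = src_stride, a3 = w,
+// a4 = h, a5/a6 = mx/my and a7 carries the filter-type code, decoded the same
+// way as in the put kernels above. With no subpel offset the pixels are only
+// widened and scaled by 16 (the shifts by 4 below, i.e. 1 << intermediate_bits
+// for 8bpc); the filtered paths keep 2 fractional bits after the horizontal
+// stage (srari by 2) and the combined h+v path rounds its vertical sums by 6.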
+.macro PREP_8TAP_8BPC_LASX lable
+ li.w t0, 4
+ la.local t6, dav1d_mc_subpel_filters
+ la.local t7, shufb1
+ xvld xr23, t7, 0
+ slli.d t2, a2, 1 //src_stride*2
+ add.d t3, t2, a2 //src_stride*3
+    slli.d        t4,     t2,     1      //src_stride*4
+
+ bnez a5, .l_\lable\()h //mx
+    bnez          a6,     .l_\lable\()v  //my
+
+ clz.w t1, a3
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()prep_hv0_jtable
+ alsl.d t1, t1, t5, 1
+ ld.h t8, t1, 0
+ add.d t5, t5, t8
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()prep_hv0_jtable:
+ .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable
+
+.l_\lable\()hv0_4w:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ fldx.s f3, a1, t3
+ add.d a1, a1, t4
+ xvpackev.w xr0, xr1, xr0
+ xvpackev.w xr1, xr3, xr2
+ xvpermi.q xr0, xr1, 0x02
+ xvsllwil.hu.bu xr0, xr0, 4
+ xvst xr0, a0, 0
+ addi.d a0, a0, 32
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_4w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_8w:
+ fld.d f0, a1, 0
+ fldx.d f1, a1, a2
+ fldx.d f2, a1, t2
+ fldx.d f3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.q xr0, xr1, 0x02
+ xvpermi.q xr2, xr3, 0x02
+ xvsllwil.hu.bu xr0, xr0, 4
+ xvsllwil.hu.bu xr2, xr2, 4
+ xvst xr0, a0, 0
+ xvst xr2, a0, 32
+ addi.d a0, a0, 64
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_8w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_16w:
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t2
+ vldx vr3, a1, t3
+ add.d a1, a1, t4
+ vext2xv.hu.bu xr0, xr0
+ vext2xv.hu.bu xr1, xr1
+ vext2xv.hu.bu xr2, xr2
+ vext2xv.hu.bu xr3, xr3
+ xvslli.h xr0, xr0, 4
+ xvslli.h xr1, xr1, 4
+ xvslli.h xr2, xr2, 4
+ xvslli.h xr3, xr3, 4
+ xvst xr0, a0, 0
+ xvst xr1, a0, 32
+ xvst xr2, a0, 64
+ xvst xr3, a0, 96
+ addi.d a0, a0, 128
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_16w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_32w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.d xr4, xr0, 0xD8
+ xvpermi.d xr5, xr1, 0xD8
+ xvpermi.d xr6, xr2, 0xD8
+ xvpermi.d xr7, xr3, 0xD8
+ xvpermi.d xr10, xr0, 0x32
+ xvpermi.d xr11, xr1, 0x32
+ xvpermi.d xr12, xr2, 0x32
+ xvpermi.d xr13, xr3, 0x32
+ xvsllwil.hu.bu xr0, xr4, 4
+ xvsllwil.hu.bu xr1, xr5, 4
+ xvsllwil.hu.bu xr2, xr6, 4
+ xvsllwil.hu.bu xr3, xr7, 4
+ xvsllwil.hu.bu xr4, xr10, 4
+ xvsllwil.hu.bu xr5, xr11, 4
+ xvsllwil.hu.bu xr6, xr12, 4
+ xvsllwil.hu.bu xr7, xr13, 4
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ xvst xr1, a0, 64
+ xvst xr5, a0, 96
+ xvst xr2, a0, 128
+ xvst xr6, a0, 160
+ xvst xr3, a0, 192
+ xvst xr7, a0, 224
+ addi.d a0, a0, 256
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_32w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_64w:
+.l_\lable\()hv0_128w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 5
+ slli.w t7, t7, 6
+ addi.d t8, a0, 0
+.l_\lable\()hv0_32_loop:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.d xr4, xr0, 0xD8
+ xvpermi.d xr5, xr1, 0xD8
+ xvpermi.d xr6, xr2, 0xD8
+ xvpermi.d xr7, xr3, 0xD8
+ xvpermi.d xr10, xr0, 0x32
+ xvpermi.d xr11, xr1, 0x32
+ xvpermi.d xr12, xr2, 0x32
+ xvpermi.d xr13, xr3, 0x32
+ xvsllwil.hu.bu xr0, xr4, 4
+ xvsllwil.hu.bu xr1, xr5, 4
+ xvsllwil.hu.bu xr2, xr6, 4
+ xvsllwil.hu.bu xr3, xr7, 4
+ xvsllwil.hu.bu xr4, xr10, 4
+ xvsllwil.hu.bu xr5, xr11, 4
+ xvsllwil.hu.bu xr6, xr12, 4
+ xvsllwil.hu.bu xr7, xr13, 4
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ add.d t1, a0, t7
+ xvst xr1, t1, 0
+ xvst xr5, t1, 32
+ add.d t1, t1, t7
+ xvst xr2, t1, 0
+ xvst xr6, t1, 32
+ add.d t1, t1, t7
+ xvst xr3, t1, 0
+ xvst xr7, t1, 32
+ add.d a0, t1, t7
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_32_loop
+ addi.d a1, t0, 32
+ addi.d t0, t0, 32
+ addi.d a0, t8, 64
+ addi.d t8, t8, 64
+ addi.d a4, t5, 0
+ addi.d a3, a3, -32
+ bnez a3, .l_\lable\()hv0_32_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h:
+    bnez          a6,     .l_\lable\()hv // fh && fv: combined h+v path
+
+ andi t1, a7, 3
+ blt t0, a3, .l_\lable\()h_idx_fh
+ andi t1, a7, 1
+ addi.w t1, t1, 3
+.l_\lable\()h_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a5, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ xvldrepl.d xr22, t1, 0
+
+ addi.d a1, a1, -3
+ clz.w t1, a3
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()prep_h_jtable
+ alsl.d t1, t1, t5, 1
+ ld.h t8, t1, 0
+ add.d t5, t5, t8
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()prep_h_jtable:
+ .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable
+
+.l_\lable\()h_4w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr12, xr1, xr22
+ xvdp2.h.bu.b xr14, xr2, xr22
+ xvdp2.h.bu.b xr16, xr3, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr12 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr14 //h2
+ HADDWDH xr16 //h3
+
+ xvpickev.w xr10, xr12, xr10
+ xvpickev.w xr14, xr16, xr14
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr14, xr14, 0xd8
+ xvpickev.h xr10, xr14, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvst xr10, a0, 0
+ addi.d a0, a0, 32
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()h_4w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_8w:
+ xvld xr0, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr4, a1, t2
+ xvldx xr6, a1, t3
+ add.d a1, a1, t4
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr10
+ SHUFB xr1, xr23, xr9, xr11
+ SHUFB xr2, xr23, xr9, xr12
+ SHUFB xr3, xr23, xr9, xr13
+ SHUFB xr4, xr23, xr9, xr14
+ SHUFB xr5, xr23, xr9, xr15
+ SHUFB xr6, xr23, xr9, xr16
+ SHUFB xr7, xr23, xr9, xr17
+
+ xvdp2.h.bu.b xr0, xr10, xr22
+ xvdp2.h.bu.b xr1, xr11, xr22
+ xvdp2.h.bu.b xr2, xr12, xr22
+ xvdp2.h.bu.b xr3, xr13, xr22
+ xvdp2.h.bu.b xr4, xr14, xr22
+ xvdp2.h.bu.b xr5, xr15, xr22
+ xvdp2.h.bu.b xr6, xr16, xr22
+ xvdp2.h.bu.b xr7, xr17, xr22
+
+ HADDWDH xr0
+ HADDWDH xr1
+ HADDWDH xr2
+ HADDWDH xr3
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+
+ xvpickev.w xr0, xr1, xr0
+ xvpickev.w xr2, xr3, xr2
+ xvpermi.d xr0, xr0, 0xd8
+ xvpermi.d xr2, xr2, 0xd8
+ xvpickev.h xr0, xr2, xr0
+ xvpermi.d xr0, xr0, 0xd8
+ xvsrari.h xr0, xr0, 2
+
+ xvpickev.w xr4, xr5, xr4
+ xvpickev.w xr6, xr7, xr6
+ xvpermi.d xr4, xr4, 0xd8
+ xvpermi.d xr6, xr6, 0xd8
+ xvpickev.h xr4, xr6, xr4
+ xvpermi.d xr4, xr4, 0xd8
+ xvsrari.h xr4, xr4, 2
+
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ addi.d a0, a0, 64
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()h_8w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_16w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ PREP_W16_H xr0
+ PREP_W16_H xr1
+ PREP_W16_H xr2
+ PREP_W16_H xr3
+
+ xvst xr0, a0, 0
+ xvst xr1, a0, 32
+ xvst xr2, a0, 64
+ xvst xr3, a0, 96
+
+ addi.d a0, a0, 128
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()h_16w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_32w:
+.l_\lable\()h_64w:
+.l_\lable\()h_128w:
+ addi.d t0, a1, 0 //src
+ addi.d t5, a4, 0 //h
+ srli.w t7, a3, 4 //w
+ slli.w t7, t7, 5 //store offset
+ addi.d t8, a0, 0 //dst
+.l_\lable\()h_16_loop:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ PREP_W16_H xr0
+ PREP_W16_H xr1
+ PREP_W16_H xr2
+ PREP_W16_H xr3
+
+ xvst xr0, a0, 0
+ xvstx xr1, a0, t7
+ slli.w t1, t7, 1
+ xvstx xr2, a0, t1
+ add.w t1, t1, t7
+ xvstx xr3, a0, t1
+ slli.w t1, t7, 2
+ add.d a0, a0, t1
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()h_16_loop
+
+ addi.d a1, t0, 16
+ addi.d t0, t0, 16
+ addi.d a0, t8, 32
+ addi.d t8, t8, 32
+ addi.d a4, t5, 0
+ addi.d a3, a3, -16
+ bnez a3, .l_\lable\()h_16_loop
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv:
+ andi t1, a7, 3
+ blt t0, a3, .l_\lable\()hv_idx_fh
+ andi t1, a7, 1
+ addi.w t1, t1, 3
+.l_\lable\()hv_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a5, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ xvldrepl.d xr22, t1, 0
+ srli.w a7, a7, 2
+ blt t0, a4, .l_\lable\()hv_idx_fv
+ andi a7, a7, 1
+ addi.w a7, a7, 3
+.l_\lable\()hv_idx_fv:
+ addi.w t5, zero, 120
+ mul.w a7, a7, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w a7, a7, t5
+ add.d a7, t6, a7 //fv's offset
+ xvldrepl.d xr8, a7, 0
+ xvsllwil.h.b xr8, xr8, 0
+
+ sub.d a1, a1, t3
+ addi.d a1, a1, -3
+ beq a3, t0, .l_\lable\()hv_4w
+ b .l_\lable\()hv_8w
+.l_\lable\()hv_4w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvld xr4, a1, 0
+ xvldx xr5, a1, a2
+ xvldx xr6, a1, t2
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr11, xr1, xr22
+ xvdp2.h.bu.b xr12, xr2, xr22
+ xvdp2.h.bu.b xr13, xr3, xr22
+
+ xvdp2.h.bu.b xr14, xr4, xr22
+ xvdp2.h.bu.b xr15, xr5, xr22
+ xvdp2.h.bu.b xr16, xr6, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr11 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr12 //h2
+ HADDWDH xr13 //h3
+
+ xvpackev.w xr10, xr11, xr10
+ xvpackev.w xr12, xr13, xr12
+ xvpackev.d xr11, xr12, xr10
+ xvpackod.d xr10, xr12, xr10
+ xvpickev.h xr11, xr10, xr11
+ xvsrari.h xr11, xr11, 2
+
+ HADDWDH xr14 //h4
+ HADDWDH xr15 //h5
+ HADDWDH xr16 //h6
+
+ xvpackev.w xr14, xr15, xr14
+ xvpackev.w xr16, xr17, xr16
+ xvpackev.d xr17, xr16, xr14
+ xvpackod.d xr14, xr16, xr14
+ xvpickev.h xr13, xr14, xr17
+ xvsrari.h xr13, xr13, 2
+
+ xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 *
+ xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 *
+.l_\lable\()hv_w4_loop:
+ xvldx xr0, a1, t3
+ add.d a1, a1, t4
+ xvld xr1, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr3, a1, t2
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr12, xr1, xr22
+ xvdp2.h.bu.b xr14, xr2, xr22
+ xvdp2.h.bu.b xr16, xr3, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr12 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr14 //h2
+ HADDWDH xr16 //h3
+
+ xvpackev.w xr10, xr12, xr10
+ xvpackev.w xr14, xr16, xr14
+ xvpackev.d xr12, xr14, xr10
+ xvpackod.d xr10, xr14, xr10
+ xvpickev.h xr12, xr10, xr12
+ xvsrari.h xr12, xr12, 2
+
+ xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2)
+ xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3)
+
+ xvdp2.w.h xr0, xr18, xr8
+ xvdp2.w.h xr2, xr19, xr8
+ HADDWQW xr0
+ HADDWQW xr2
+ xvpackev.w xr0, xr2, xr0
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x71
+ xvextrins.h xr19, xr12, 0x75
+ xvdp2.w.h xr2, xr18, xr8
+ xvdp2.w.h xr4, xr19, xr8
+ HADDWQW xr2
+ HADDWQW xr4
+ xvpackev.w xr2, xr4, xr2
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x72
+ xvextrins.h xr19, xr12, 0x76
+ xvdp2.w.h xr4, xr18, xr8
+ xvdp2.w.h xr9, xr19, xr8
+ HADDWQW xr4
+ HADDWQW xr9
+ xvpackev.w xr4, xr9, xr4
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x73
+ xvextrins.h xr19, xr12, 0x77
+ xvdp2.w.h xr9, xr18, xr8
+ xvdp2.w.h xr11, xr19, xr8
+ HADDWQW xr9
+ HADDWQW xr11
+ xvpackev.w xr9, xr11, xr9
+
+ xvpackev.d xr0, xr2, xr0
+ xvpackev.d xr4, xr9, xr4
+ xvsrari.w xr0, xr0, 6
+ xvsrari.w xr4, xr4, 6
+ xvpermi.d xr0, xr0, 0xd8
+ xvpermi.d xr4, xr4, 0xd8
+ xvpickev.h xr0, xr4, xr0
+ xvpermi.d xr0, xr0, 0xd8
+ xvst xr0, a0, 0
+ addi.d a0, a0, 32
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv_w4_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()hv_8w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 3
+ slli.w t7, t7, 4 // store offset
+ addi.d t8, a0, 0
+.l_\lable\()hv_8w_loop0:
+ xvld xr0, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr4, a1, t2
+ xvldx xr6, a1, t3
+
+ add.d a1, a1, t4
+ xvld xr10, a1, 0
+ xvldx xr11, a1, a2
+ xvldx xr12, a1, t2
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr13
+ SHUFB xr1, xr23, xr9, xr14
+ SHUFB xr2, xr23, xr9, xr15
+ SHUFB xr3, xr23, xr9, xr16
+ SHUFB xr4, xr23, xr9, xr17
+ SHUFB xr5, xr23, xr9, xr18
+ SHUFB xr6, xr23, xr9, xr19
+ SHUFB xr7, xr23, xr9, xr20
+
+ xvdp2.h.bu.b xr0, xr13, xr22
+ xvdp2.h.bu.b xr1, xr14, xr22
+ xvdp2.h.bu.b xr2, xr15, xr22
+ xvdp2.h.bu.b xr3, xr16, xr22
+ xvdp2.h.bu.b xr4, xr17, xr22
+ xvdp2.h.bu.b xr5, xr18, xr22
+ xvdp2.h.bu.b xr6, xr19, xr22
+ xvdp2.h.bu.b xr7, xr20, xr22
+
+ HADDWDH xr0
+ HADDWDH xr1
+ HADDWDH xr2
+ HADDWDH xr3
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+
+ xvpackev.w xr0, xr2, xr0
+ xvpackev.w xr2, xr6, xr4
+ xvpackev.d xr16, xr2, xr0
+ xvpackod.d xr0, xr2, xr0
+ xvpickev.h xr0, xr0, xr16
+ xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27
+
+ xvpackev.w xr1, xr3, xr1
+ xvpackev.w xr3, xr7, xr5
+ xvpackev.d xr16, xr3, xr1
+ xvpackod.d xr1, xr3, xr1
+ xvpickev.h xr1, xr1, xr16
+ xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31
+
+ xvbsrl.v xr13, xr10, 4
+ xvbsrl.v xr14, xr11, 4
+ xvbsrl.v xr15, xr12, 4
+
+ SHUFB xr10, xr23, xr9, xr10
+ SHUFB xr13, xr23, xr9, xr13
+ SHUFB xr11, xr23, xr9, xr11
+ SHUFB xr14, xr23, xr9, xr14
+ SHUFB xr12, xr23, xr9, xr12
+ SHUFB xr15, xr23, xr9, xr15
+
+ xvdp2.h.bu.b xr4, xr10, xr22
+ xvdp2.h.bu.b xr5, xr13, xr22
+ xvdp2.h.bu.b xr6, xr11, xr22
+ xvdp2.h.bu.b xr7, xr14, xr22
+ xvdp2.h.bu.b xr9, xr12, xr22
+ xvdp2.h.bu.b xr10, xr15, xr22
+
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+ HADDWDH xr9
+ HADDWDH xr10
+
+ xvpackev.w xr4, xr6, xr4
+ xvpackev.w xr9, xr12, xr9
+ xvpackev.d xr16, xr9, xr4
+ xvpackod.d xr11, xr9, xr4
+ xvpickev.h xr2, xr11, xr16
+ xvsrari.h xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 *
+
+ xvpackev.w xr5, xr7, xr5
+ xvpackev.w xr10, xr12, xr10
+ xvpackev.d xr16, xr10, xr5
+ xvpackod.d xr11, xr10, xr5
+ xvpickev.h xr3, xr11, xr16
+ xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 55 *
+
+ xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 *
+ xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 *
+ xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 *
+ xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 *
+
+.l_\lable\()hv_8w_loop:
+ xvldx xr0, a1, t3
+ add.d a1, a1, t4
+ xvld xr2, a1, 0
+ xvldx xr4, a1, a2
+ xvldx xr6, a1, t2
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+ SHUFB xr7, xr23, xr9, xr7
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr11, xr1, xr22
+ xvdp2.h.bu.b xr12, xr2, xr22
+ xvdp2.h.bu.b xr13, xr3, xr22
+ xvdp2.h.bu.b xr14, xr4, xr22
+ xvdp2.h.bu.b xr15, xr5, xr22
+ xvdp2.h.bu.b xr16, xr6, xr22
+ xvdp2.h.bu.b xr17, xr7, xr22
+
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ HADDWDH xr14
+ HADDWDH xr15
+ HADDWDH xr16
+ HADDWDH xr17
+
+ xvpackev.w xr0, xr12, xr10
+ xvpackev.w xr2, xr16, xr14
+ xvpackev.d xr9, xr2, xr0
+ xvpackod.d xr0, xr2, xr0
+ xvpickev.h xr0, xr0, xr9
+ xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83
+
+ xvpackev.w xr1, xr13, xr11
+ xvpackev.w xr3, xr17, xr15
+ xvpackev.d xr9, xr3, xr1
+ xvpackod.d xr1, xr3, xr1
+ xvpickev.h xr1, xr1, xr9
+ xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87
+
+ xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58)
+ xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59)
+ xvextrins.h xr20, xr1, 0x70
+ xvextrins.h xr21, xr1, 0x74
+
+ //h - 1
+ xvdp2.w.h xr10, xr18, xr8
+ xvdp2.w.h xr11, xr19, xr8
+ xvdp2.w.h xr12, xr20, xr8
+ xvdp2.w.h xr13, xr21, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * *
+ xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * *
+ xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7
+ //h - 2
+ xvbsrl.v xr4, xr18, 2
+ xvbsrl.v xr5, xr19, 2
+ xvbsrl.v xr6, xr20, 2
+ xvbsrl.v xr7, xr21, 2
+ xvextrins.h xr4, xr0, 0x71
+ xvextrins.h xr5, xr0, 0x75
+ xvextrins.h xr6, xr1, 0x71
+ xvextrins.h xr7, xr1, 0x75
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr14, xr11, xr10
+ xvpackev.w xr15, xr13, xr12
+ xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15
+ //h - 3
+ xvbsrl.v xr4, xr4, 2
+ xvbsrl.v xr5, xr5, 2
+ xvbsrl.v xr6, xr6, 2
+ xvbsrl.v xr7, xr7, 2
+ xvextrins.h xr4, xr0, 0x72
+ xvextrins.h xr5, xr0, 0x76
+ xvextrins.h xr6, xr1, 0x72
+ xvextrins.h xr7, xr1, 0x76
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr15, xr11, xr10
+ xvpackev.w xr16, xr13, xr12
+ xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23
+ //h - 4
+ xvbsrl.v xr4, xr4, 2
+ xvbsrl.v xr5, xr5, 2
+ xvbsrl.v xr6, xr6, 2
+ xvbsrl.v xr7, xr7, 2
+ xvextrins.h xr4, xr0, 0x73
+ xvextrins.h xr5, xr0, 0x77
+ xvextrins.h xr6, xr1, 0x73
+ xvextrins.h xr7, xr1, 0x77
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr16, xr11, xr10
+ xvpackev.w xr17, xr13, xr12
+ xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31
+
+ xvsrari.w xr2, xr2, 6
+ xvsrari.w xr14, xr14, 6
+ xvsrari.w xr15, xr15, 6
+ xvsrari.w xr16, xr16, 6
+
+ xvpermi.d xr2, xr2, 0xd8
+ xvpermi.d xr14, xr14, 0xd8
+ xvpermi.d xr15, xr15, 0xd8
+ xvpermi.d xr16, xr16, 0xd8
+ xvpickev.h xr2, xr14, xr2
+ xvpickev.h xr3, xr16, xr15
+ xvpermi.d xr2, xr2, 0xd8
+ xvpermi.d xr3, xr3, 0xd8
+
+ xvpermi.q xr10, xr2, 0x31
+ xvpermi.q xr11, xr3, 0x31
+
+ vst vr2, a0, 0
+ vstx vr10, a0, t7 //32
+ slli.w t1, t7, 1 //64
+ vstx vr3, a0, t1
+ add.w t1, t1, t7 //96
+ vstx vr11, a0, t1
+ slli.w t1, t7, 2 //128
+ add.d a0, a0, t1
+
+ xvbsrl.v xr18, xr4, 2
+ xvbsrl.v xr19, xr5, 2
+ xvbsrl.v xr20, xr6, 2
+ xvbsrl.v xr21, xr7, 2
+
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv_8w_loop
+
+ addi.d a1, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 16
+ addi.d t8, t8, 16
+ addi.d a4, t5, 0
+ addi.d a3, a3, -8
+ bnez a3, .l_\lable\()hv_8w_loop0
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()v:
+
+ srli.w a7, a7, 2
+ blt t0, a4, .l_\lable\()v_idx_fv
+ andi a7, a7, 1
+ addi.w a7, a7, 3
+.l_\lable\()v_idx_fv:
+ addi.w t5, zero, 120
+ mul.w a7, a7, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w a7, a7, t5
+ add.d a7, t6, a7 //fv's offset
+ xvldrepl.d xr8, a7, 0
+
+ sub.d a1, a1, t3
+ beq a3, t0, .l_\lable\()v_4w
+ blt t0, a3, .l_\lable\()v_8w
+.l_\lable\()v_4w:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ add.d a1, a1, t3
+ fld.s f3, a1, 0
+ fldx.s f4, a1, a2
+ fldx.s f5, a1, t2
+ fldx.s f6, a1, t3
+
+ xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
+ xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
+ xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
+ xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
+ xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
+ xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ xvilvl.w xr2, xr1, xr0
+ xvilvh.w xr0, xr1, xr0
+ xvpermi.q xr0, xr2, 0x20
+
+.l_\lable\()v_4w_loop:
+ add.d a1, a1, t4
+ fld.s f7, a1, 0 //h0
+ fldx.s f10, a1, a2 //h1
+ fldx.s f11, a1, t2 //h2
+ fldx.s f12, a1, t3 //h3
+
+ xvbsrl.v xr9, xr7, 2
+ xvpermi.q xr9, xr7, 0x20
+ xvextrins.b xr0, xr9, 0x70
+ xvextrins.b xr0, xr9, 0xf1
+
+ xvbsrl.v xr1, xr0, 1
+ xvbsrl.v xr7, xr10, 2
+ xvpermi.q xr7, xr10, 0x20
+ xvextrins.b xr1, xr7, 0x70
+ xvextrins.b xr1, xr7, 0xf1
+
+ xvbsrl.v xr2, xr1, 1
+ xvbsrl.v xr7, xr11, 2
+ xvpermi.q xr7, xr11, 0x20
+ xvextrins.b xr2, xr7, 0x70
+ xvextrins.b xr2, xr7, 0xf1
+
+ xvbsrl.v xr3, xr2, 1
+ xvbsrl.v xr7, xr12, 2
+ xvpermi.q xr7, xr12, 0x20
+ xvextrins.b xr3, xr7, 0x70
+ xvextrins.b xr3, xr7, 0xf1
+ xvbsrl.v xr4, xr3, 1
+
+ xvdp2.h.bu.b xr10, xr0, xr8
+ xvdp2.h.bu.b xr11, xr1, xr8
+ xvdp2.h.bu.b xr12, xr2, xr8
+ xvdp2.h.bu.b xr13, xr3, xr8
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvst xr10, a0, 0
+ addi.d a0, a0, 32
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_4w_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()v_8w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 2
+ slli.w t7, t7, 3
+ addi.d t8, a0, 0
+.l_\lable\()v_8w_loop0:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ add.d a1, a1, t3
+ fld.s f3, a1, 0
+ fldx.s f4, a1, a2
+ fldx.s f5, a1, t2
+ fldx.s f6, a1, t3
+
+ xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
+ xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
+ xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
+ xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
+ xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
+ xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ xvilvl.w xr2, xr1, xr0
+ xvilvh.w xr0, xr1, xr0
+ xvpermi.q xr0, xr2, 0x20
+
+.l_\lable\()v_8w_loop:
+ add.d a1, a1, t4
+ fld.s f7, a1, 0 //h0
+ fldx.s f10, a1, a2 //h1
+ fldx.s f11, a1, t2 //h2
+ fldx.s f12, a1, t3 //h3
+
+ xvbsrl.v xr9, xr7, 2
+ xvpermi.q xr9, xr7, 0x20
+ xvextrins.b xr0, xr9, 0x70
+ xvextrins.b xr0, xr9, 0xf1
+
+ xvbsrl.v xr1, xr0, 1
+ xvbsrl.v xr7, xr10, 2
+ xvpermi.q xr7, xr10, 0x20
+ xvextrins.b xr1, xr7, 0x70
+ xvextrins.b xr1, xr7, 0xf1
+
+ xvbsrl.v xr2, xr1, 1
+ xvbsrl.v xr7, xr11, 2
+ xvpermi.q xr7, xr11, 0x20
+ xvextrins.b xr2, xr7, 0x70
+ xvextrins.b xr2, xr7, 0xf1
+
+ xvbsrl.v xr3, xr2, 1
+ xvbsrl.v xr7, xr12, 2
+ xvpermi.q xr7, xr12, 0x20
+ xvextrins.b xr3, xr7, 0x70
+ xvextrins.b xr3, xr7, 0xf1
+ xvbsrl.v xr4, xr3, 1
+
+ xvdp2.h.bu.b xr10, xr0, xr8
+ xvdp2.h.bu.b xr11, xr1, xr8
+ xvdp2.h.bu.b xr12, xr2, xr8
+ xvdp2.h.bu.b xr13, xr3, xr8
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvstelm.d xr10, a0, 0, 0
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 1
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 2
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 3
+ add.d a0, a0, t7
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_8w_loop
+
+ addi.d a1, t0, 4
+ addi.d t0, t0, 4
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a4, t5, 0
+ addi.d a3, a3, -4
+ bnez a3, .l_\lable\()v_8w_loop0
+
+.l_\lable\()end_pre_8tap:
+.endm
+
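+// Each entry point below loads a7 with the filter pair consumed by the shared
+// macro: bits 0-1 select the horizontal filter and bits 2-3 the vertical one
+// (0 = regular, 1 = smooth, 2 = sharp); the macro argument mirrors the value
+// so the local labels stay unique.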
+function prep_8tap_regular_8bpc_lasx
+ addi.w a7, zero, 0
+ PREP_8TAP_8BPC_LASX 0
+endfunc
+
+function prep_8tap_smooth_regular_8bpc_lasx
+ addi.w a7, zero, 1
+ PREP_8TAP_8BPC_LASX 1
+endfunc
+
+function prep_8tap_sharp_regular_8bpc_lasx
+ addi.w a7, zero, 2
+ PREP_8TAP_8BPC_LASX 2
+endfunc
+
+function prep_8tap_regular_smooth_8bpc_lasx
+ addi.w a7, zero, 4
+ PREP_8TAP_8BPC_LASX 4
+endfunc
+
+function prep_8tap_smooth_8bpc_lasx
+ addi.w a7, zero, 5
+ PREP_8TAP_8BPC_LASX 5
+endfunc
+
+function prep_8tap_sharp_smooth_8bpc_lasx
+ addi.w a7, zero, 6
+ PREP_8TAP_8BPC_LASX 6
+endfunc
+
+function prep_8tap_regular_sharp_8bpc_lasx
+ addi.w a7, zero, 8
+ PREP_8TAP_8BPC_LASX 8
+endfunc
+
+function prep_8tap_smooth_sharp_8bpc_lasx
+ addi.w a7, zero, 9
+ PREP_8TAP_8BPC_LASX 9
+endfunc
+
+function prep_8tap_sharp_8bpc_lasx
+ addi.w a7, zero, 10
+ PREP_8TAP_8BPC_LASX 10
+endfunc
diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h
new file mode 100644
index 0000000..c64b7ef
--- /dev/null
+++ b/src/loongarch/mc.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_MC_H
+#define DAV1D_SRC_LOONGARCH_MC_H
+
+#include "config.h"
+#include "src/mc.h"
+#include "src/cpu.h"
+
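+// Wire a put (c->mc) or prep (c->mct) table entry to the LSX/LASX
+// implementation declared below.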
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+
+decl_avg_fn(BF(dav1d_avg, lsx));
+decl_w_avg_fn(BF(dav1d_w_avg, lsx));
+decl_mask_fn(BF(dav1d_mask, lsx));
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
+decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx));
+
+decl_avg_fn(BF(dav1d_avg, lasx));
+decl_w_avg_fn(BF(dav1d_w_avg, lasx));
+decl_mask_fn(BF(dav1d_mask, lasx));
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
+decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx));
+
+static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
+#if BITDEPTH == 8
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+ c->avg = BF(dav1d_avg, lsx);
+ c->w_avg = BF(dav1d_w_avg, lsx);
+ c->mask = BF(dav1d_mask, lsx);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
+ c->w_mask[2] = BF(dav1d_w_mask_420, lsx);
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
+
+ c->avg = BF(dav1d_avg, lasx);
+ c->w_avg = BF(dav1d_w_avg, lasx);
+ c->mask = BF(dav1d_mask, lasx);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
+ c->w_mask[2] = BF(dav1d_w_mask_420, lasx);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx);
+#endif
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_MC_H */
diff --git a/src/loongarch/msac.S b/src/loongarch/msac.S
new file mode 100644
index 0000000..c371eba
--- /dev/null
+++ b/src/loongarch/msac.S
@@ -0,0 +1,368 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "loongson_asm.S"
+
+const min_prob
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+endconst
+
+.macro decode_symbol_adapt w
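+ // a0: MsacContext *s, a1: uint16_t *cdf, a2: size_t n_symbols
+ // (see dav1d_msac_decode_symbol_adapt*_lsx in src/loongarch/msac.h)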
+ addi.d sp, sp, -48
+ addi.d a4, a0, 24
+ vldrepl.h vr0, a4, 0 //rng
+ fst.s f0, sp, 0 //val==0
+ vld vr1, a1, 0 //cdf
+.if \w == 16
+ li.w t4, 16
+ vldx vr11, a1, t4
+.endif
+ addi.d a6, a0, 16
+ vldrepl.d vr2, a6, 0 //dif
+ addi.d t0, a0, 32
+ ld.w t1, t0, 0 //allow_update_cdf
+ la.local t2, min_prob
+ addi.d t2, t2, 32
+ addi.w t3, a2, 1
+ slli.w t3, t3, 1
+ sub.d t2, t2, t3
+ vld vr3, t2, 0 //min_prob
+.if \w == 16
+ vldx vr13, t2, t4
+.endif
+ vsrli.h vr4, vr0, 8 //r = s->rng >> 8
+ vslli.h vr4, vr4, 8 //r << 8
+ vsrli.h vr5, vr1, 6
+ vslli.h vr5, vr5, 7
+.if \w == 16
+ vsrli.h vr15, vr11, 6
+ vslli.h vr15, vr15, 7
+.endif
+ vmuh.hu vr5, vr4, vr5
+ vadd.h vr5, vr5, vr3 //v
+.if \w == 16
+ vmuh.hu vr15, vr4, vr15
+ vadd.h vr15, vr15, vr13
+.endif
+ addi.d t8, sp, 4
+ vst vr5, t8, 0 //store v
+.if \w == 16
+ vstx vr15, t8, t4
+.endif
+ vreplvei.h vr20, vr2, 3 //c
+ vssub.hu vr6, vr5, vr20 //c >=v
+ vseqi.h vr6, vr6, 0
+.if \w == 16
+ vssub.hu vr16, vr15, vr20 //c >=v
+ vseqi.h vr16, vr16, 0
+ vpickev.b vr21, vr16, vr6
+.endif
+.if \w <= 8
+ vmskltz.h vr10, vr6
+.else
+ vmskltz.b vr10, vr21
+.endif
+ beqz t1, .renorm\()\w
+
+ // update_cdf
+ alsl.d t1, a2, a1, 1
+ ld.h t2, t1, 0 //count
+ srli.w t3, t2, 4 //count >> 4
+ addi.w t3, t3, 4
+ li.w t5, 2
+ sltu t5, t5, a2
+ add.w t3, t3, t5 //rate
+ sltui t5, t2, 32
+ add.w t2, t2, t5 //count + (count < 32)
+ vreplgr2vr.h vr9, t3
+ vseq.h vr7, vr7, vr7
+ vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768
+ vsub.h vr5, vr5, vr1
+ vsub.h vr8, vr1, vr6
+.if \w == 16
+ vavgr.hu vr15, vr16, vr7
+ vsub.h vr15, vr15, vr11
+ vsub.h vr18, vr11, vr16
+.endif
+ vsra.h vr5, vr5, vr9
+ vadd.h vr8, vr8, vr5
+.if \w == 4
+ fst.d f8, a1, 0
+.else
+ vst vr8, a1, 0
+.endif
+.if \w == 16
+ vsra.h vr15, vr15, vr9
+ vadd.h vr18, vr18, vr15
+ vstx vr18, a1, t4
+.endif
+ st.h t2, t1, 0
+
+.renorm\()\w:
+ vpickve2gr.h t3, vr10, 0
+ ctz.w a7, t3 // ret
+ alsl.d t3, a7, t8, 1
+ ld.hu t4, t3, 0 // v
+ addi.d t3, t3, -2
+ ld.hu t5, t3, 0 // u
+ sub.w t5, t5, t4 // rng
+ slli.d t4, t4, 48
+ vpickve2gr.d t6, vr2, 0
+ sub.d t6, t6, t4 // dif
+ addi.d t6, t6, 1
+ clz.w t4, t5 // d
+ xori t4, t4, 16 // d
+ sll.d t6, t6, t4
+ addi.d t6, t6, -1 // dif
+ addi.d a5, a0, 28 // cnt
+ ld.w t7, a5, 0
+ sub.w t7, t7, t4 // cnt-d
+ sll.w t5, t5, t4
+ st.w t5, a4, 0 // store rng
+ bge t7, zero, 9f
+
+ // refill
+ ld.d t0, a0, 0 // buf_pos
+ addi.d t1, a0, 8
+ ld.d t1, t1, 0 // buf_end
+ addi.d t2, t0, 8
+ blt t1, t2, 1f
+
+ ld.d t0, t0, 0 // next_bits
+ addi.w t3, t7, 23 // shift_bits = cnt + 23
+ addi.w t7, t7, 16 // cnt += 16
+ revb.d t0, t0 // next_bits = bswap(next_bits)
+ srli.w t4, t3, 3
+ sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
+ st.d t2, a0, 0
+ andi t3, t3, 24 // shift_bits &= 24
+ srl.d t0, t0, t3 // next_bits >>= shift_bits
+ sub.w t3, t3, t7 // shift_bits -= 16 + cnt
+ sll.d t0, t0, t3 // next_bits <<= shift_bits
+ li.w t5, 48
+ sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
+ xor t6, t6, t0 // dif ^= next_bits
+ b 9f
+1:
+ li.w t4, 40
+ sub.w t5, t4, t7 // c = 40 - cnt
+2:
+ bge t0, t1, 3f
+ ld.bu t2, t0, 0
+ addi.d t0, t0, 1
+ sll.d t2, t2, t5
+ xor t6, t6, t2
+ addi.w t5, t5, -8
+ bge t5, zero, 2b
+ // refill_eob_end
+3:
+ st.d t0, a0, 0 // s->buf_pos = buf_pos
+ sub.w t7, t4, t5 // cnt = 40 - c
+9:
+ st.w t7, a5, 0 // store cnt
+ st.d t6, a6, 0 // store dif
+ move a0, a7
+ addi.d sp, sp, 48
+.endm
+
+function msac_decode_symbol_adapt4_lsx
+ decode_symbol_adapt 4
+endfunc
+
+function msac_decode_symbol_adapt8_lsx
+ decode_symbol_adapt 8
+endfunc
+
+function msac_decode_symbol_adapt16_lsx
+ decode_symbol_adapt 16
+endfunc
+
+function msac_decode_bool_lsx
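+ // a0: MsacContext *s, a1: unsigned f (see src/loongarch/msac.h)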
+ ld.w t0, a0, 24 // rng
+ srli.w a1, a1, 6
+ ld.d t1, a0, 16 // dif
+ srli.w t2, t0, 8 // r >> 8
+ mul.w t2, t2, a1
+ ld.w a5, a0, 28 // cnt
+ addi.d t1, t1, 1 // dif + 1
+ srli.w t2, t2, 1
+ addi.w t2, t2, 4 // v
+ slli.d t3, t2, 48 // vw
+ sltu t4, t1, t3
+ move t8, t4 // ret
+ xori t4, t4, 1
+ maskeqz t6, t3, t4 // if (ret) vw
+ sub.d t6, t1, t6 // dif
+ slli.w t5, t2, 1
+ sub.w t5, t0, t5 // r - 2v
+ maskeqz t7, t5, t4 // if (ret) r - 2v
+ add.w t5, t2, t7 // v(rng)
+
+ // renorm
+ clz.w t4, t5 // d
+ xori t4, t4, 16 // d
+ sll.d t6, t6, t4
+ addi.d t6, t6, -1 // dif
+ sub.w t7, a5, t4 // cnt-d
+ sll.w t5, t5, t4
+ st.w t5, a0, 24 // store rng
+ bge t7, zero, 9f
+
+ // refill
+ ld.d t0, a0, 0 // buf_pos
+ addi.d t1, a0, 8
+ ld.d t1, t1, 0 // buf_end
+ addi.d t2, t0, 8
+ blt t1, t2, 1f
+
+ ld.d t0, t0, 0 // next_bits
+ addi.w t3, t7, 23 // shift_bits = cnt + 23
+ addi.w t7, t7, 16 // cnt += 16
+ revb.d t0, t0 // next_bits = bswap(next_bits)
+ srli.w t4, t3, 3
+ sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
+ st.d t2, a0, 0
+ andi t3, t3, 24 // shift_bits &= 24
+ srl.d t0, t0, t3 // next_bits >>= shift_bits
+ sub.w t3, t3, t7 // shift_bits -= 16 + cnt
+ sll.d t0, t0, t3 // next_bits <<= shift_bits
+ li.w t5, 48
+ sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
+ xor t6, t6, t0 // dif ^= next_bits
+ b 9f
+1:
+ li.w t4, 40
+ sub.w t5, t4, t7 // c = 40 - cnt
+2:
+ bge t0, t1, 3f
+ ld.bu t2, t0, 0
+ addi.d t0, t0, 1
+ sll.d t2, t2, t5
+ xor t6, t6, t2
+ addi.w t5, t5, -8
+ bge t5, zero, 2b
+ // refill_eob_end
+3:
+ st.d t0, a0, 0 // s->buf_pos = buf_pos
+ sub.w t7, t4, t5 // cnt = 40 - c
+9:
+ st.w t7, a0, 28 // store cnt
+ st.d t6, a0, 16 // store dif
+ move a0, t8
+endfunc
+
+function msac_decode_bool_adapt_lsx
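+ // a0: MsacContext *s, a1: uint16_t *cdf (see src/loongarch/msac.h)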
+ ld.hu a3, a1, 0 // cdf[0] (f)
+ ld.w t0, a0, 24 // rng
+ ld.d t1, a0, 16 // dif
+ srli.w t2, t0, 8 // r >> 8
+ srli.w a7, a3, 6
+ mul.w t2, t2, a7
+ ld.w a4, a0, 32 // allow_update_cdf
+ ld.w a5, a0, 28 // cnt
+ srli.w t2, t2, 1
+ addi.w t2, t2, 4 // v
+ slli.d t3, t2, 48 // vw
+ sltu t4, t1, t3
+ move t8, t4 // bit
+ xori t4, t4, 1
+ maskeqz t6, t3, t4 // if (ret) vw
+ sub.d t6, t1, t6 // dif
+ slli.w t5, t2, 1
+ sub.w t5, t0, t5 // r - 2v
+ maskeqz t7, t5, t4 // if (ret) r - 2v
+ add.w t5, t2, t7 // v(rng)
+ beqz a4, .renorm
+
+ // update_cdf
+ ld.hu t0, a1, 2 // cdf[1]
+ srli.w t1, t0, 4
+ addi.w t1, t1, 4 // rate
+ sltui t2, t0, 32 // count < 32
+ add.w t0, t0, t2 // count + (count < 32)
+ sub.w a3, a3, t8 // cdf[0] -= bit
+ slli.w t4, t8, 15
+ sub.w t7, a3, t4 // cdf[0] - bit - 32768
+ sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate
+ sub.w t7, a3, t7 // cdf[0]
+ st.h t7, a1, 0
+ st.h t0, a1, 2
+
+.renorm:
+ // renorm
+ addi.d t6, t6, 1
+ clz.w t4, t5 // d
+ xori t4, t4, 16 // d
+ sll.d t6, t6, t4
+ addi.d t6, t6, -1 // dif
+ sub.w t7, a5, t4 // cnt-d
+ sll.w t5, t5, t4
+ st.w t5, a0, 24 // store rng
+ bge t7, zero, 9f
+
+ // refill
+ ld.d t0, a0, 0 // buf_pos
+ addi.d t1, a0, 8
+ ld.d t1, t1, 0 // buf_end
+ addi.d t2, t0, 8
+ blt t1, t2, 1f
+
+ ld.d t0, t0, 0 // next_bits
+ addi.w t3, t7, 23 // shift_bits = cnt + 23
+ addi.w t7, t7, 16 // cnt += 16
+ revb.d t0, t0 // next_bits = bswap(next_bits)
+ srli.w t4, t3, 3
+ sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
+ st.d t2, a0, 0
+ andi t3, t3, 24 // shift_bits &= 24
+ srl.d t0, t0, t3 // next_bits >>= shift_bits
+ sub.w t3, t3, t7 // shift_bits -= 16 + cnt
+ sll.d t0, t0, t3 // next_bits <<= shift_bits
+ li.w t5, 48
+ sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
+ xor t6, t6, t0 // dif ^= next_bits
+ b 9f
+1:
+ li.w t4, 40
+ sub.w t5, t4, t7 // c = 40 - cnt
+2:
+ bge t0, t1, 3f
+ ld.bu t2, t0, 0
+ addi.d t0, t0, 1
+ sll.d t2, t2, t5
+ xor t6, t6, t2
+ addi.w t5, t5, -8
+ bge t5, zero, 2b
+ // refill_eob_end
+3:
+ st.d t0, a0, 0 // s->buf_pos = buf_pos
+ sub.w t7, t4, t5 // cnt = 40 - c
+9:
+ st.w t7, a0, 28 // store cnt
+ st.d t6, a0, 16 // store dif
+ move a0, t8
+endfunc
diff --git a/src/loongarch/msac.h b/src/loongarch/msac.h
new file mode 100644
index 0000000..fdcff83
--- /dev/null
+++ b/src/loongarch/msac.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_MSAC_H
+#define DAV1D_SRC_LOONGARCH_MSAC_H
+
+unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f);
+
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx
+
+#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */
diff --git a/src/loongarch/refmvs.S b/src/loongarch/refmvs.S
new file mode 100644
index 0000000..63a83d3
--- /dev/null
+++ b/src/loongarch/refmvs.S
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
+ const int bx4, const int bw4, int bh4)
+*/
+
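+// Register mapping follows the C prototype above:
+// a0: rr, a1: rmv, a2: bx4, a3: bw4, a4: bh4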
+function splat_mv_lsx
+ vld vr0, a1, 0 // 0 1 ... 11 ...
+ clz.w t4, a3
+ vaddi.bu vr1, vr0, 0
+ addi.w t4, t4, -26
+ vextrins.w vr1, vr0, 0x30 // 0 1 2 ... 11 0 1 2 3
+ la.local t5, .SPLAT_LSX_JRTABLE
+ vbsrl.v vr2, vr1, 4 // 4 5 6 7...11 0 1 2 3 0 0 0 0
+ alsl.d t6, t4, t5, 1
+ vextrins.w vr2, vr0, 0x31 // 4 5 6 7...11 0 1 2 3 4 5 6 7
+ ld.h t7, t6, 0
+ vbsrl.v vr3, vr2, 4 // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
+ add.d t8, t5, t7
+ alsl.d a2, a2, a2, 1
+ vextrins.w vr3, vr0, 0x32 // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
+ slli.w a2, a2, 2
+ jirl $r0, t8, 0
+
+.SPLAT_LSX_JRTABLE:
+ .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE
+
+.SPLAT_W1_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ fst.d f1, t3, 0
+ fst.s f3, t3, 8
+ blt zero, a4, .SPLAT_W1_LSX
+ b .splat_end
+.SPLAT_W2_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ vst vr1, t3, 0
+ fst.d f2, t3, 16
+ blt zero, a4, .SPLAT_W2_LSX
+ b .splat_end
+
+.SPLAT_W4_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+ blt zero, a4, .SPLAT_W4_LSX
+ b .splat_end
+
+.SPLAT_W8_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+
+ vst vr1, t3, 48
+ vst vr2, t3, 64
+ vst vr3, t3, 80
+ blt zero, a4, .SPLAT_W8_LSX
+ b .splat_end
+
+.SPLAT_W16_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+.rept 2
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+
+ vst vr1, t3, 48
+ vst vr2, t3, 64
+ vst vr3, t3, 80
+
+ addi.d t3, t3, 96
+.endr
+
+ blt zero, a4, .SPLAT_W16_LSX
+ b .splat_end
+
+.SPLAT_W32_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+.rept 4
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+
+ vst vr1, t3, 48
+ vst vr2, t3, 64
+ vst vr3, t3, 80
+
+ addi.d t3, t3, 96
+.endr
+
+ blt zero, a4, .SPLAT_W32_LSX
+
+.splat_end:
+endfunc
diff --git a/src/loongarch/refmvs.h b/src/loongarch/refmvs.h
new file mode 100644
index 0000000..60ff435
--- /dev/null
+++ b/src/loongarch/refmvs.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H
+#define DAV1D_SRC_LOONGARCH_REFMVS_H
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_splat_mv_fn(dav1d_splat_mv_lsx);
+
+static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+ c->splat_mv = dav1d_splat_mv_lsx;
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */
diff --git a/src/loopfilter_tmpl.c b/src/loopfilter_tmpl.c
index cacf258..7cc8964 100644
--- a/src/loopfilter_tmpl.c
+++ b/src/loopfilter_tmpl.c
@@ -247,6 +247,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/loopfilter.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/loopfilter.h"
#elif ARCH_X86
#include "src/x86/loopfilter.h"
#endif
@@ -261,6 +263,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
loop_filter_dsp_init_arm(c);
+#elif ARCH_LOONGARCH64
+ loop_filter_dsp_init_loongarch(c);
#elif ARCH_X86
loop_filter_dsp_init_x86(c);
#endif
diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c
index d4d7867..9922908 100644
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -527,6 +527,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/looprestoration.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/looprestoration.h"
#elif ARCH_PPC64LE
#include "src/ppc/looprestoration.h"
#elif ARCH_X86
@@ -545,6 +547,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
loop_restoration_dsp_init_arm(c, bpc);
+#elif ARCH_LOONGARCH64
+ loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86
diff --git a/src/mc_tmpl.c b/src/mc_tmpl.c
index 20226d8..469fc5f 100644
--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -905,6 +905,8 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/mc.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/mc.h"
#elif ARCH_X86
#include "src/x86/mc.h"
#endif
@@ -946,6 +948,8 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
mc_dsp_init_arm(c);
+#elif ARCH_LOONGARCH64
+ mc_dsp_init_loongarch(c);
#elif ARCH_X86
mc_dsp_init_x86(c);
#endif
diff --git a/src/meson.build b/src/meson.build
index 3a34e76..dc4be5f 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -226,6 +226,24 @@ if is_asm_enabled
# Compile the ASM sources with NASM
libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
+ elif host_machine.cpu_family().startswith('loongarch')
+ libdav1d_sources += files(
+ 'loongarch/cpu.c',
+ )
+
+ libdav1d_arch_tmpl_sources += files(
+ 'loongarch/looprestoration_tmpl.c',
+ )
+
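+ # GAS-syntax sources, assembled directly by the compiler driver rather
+ # than going through NASM like the x86 sources above.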
+ libdav1d_sources_asm = files(
+ 'loongarch/mc.S',
+ 'loongarch/loopfilter.S',
+ 'loongarch/looprestoration.S',
+ 'loongarch/msac.S',
+ 'loongarch/refmvs.S',
+ 'loongarch/itx.S',
+ )
+ libdav1d_asm_objs += libdav1d_sources_asm
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
libdav1d_sources += files(
@@ -235,6 +253,15 @@ if is_asm_enabled
'ppc/cdef_tmpl.c',
'ppc/looprestoration_tmpl.c',
)
+ elif host_machine.cpu_family().startswith('riscv')
+ libdav1d_sources += files(
+ 'riscv/cpu.c',
+ )
+ if host_machine.cpu_family() == 'riscv64'
+ libdav1d_sources += files(
+ 'riscv/64/itx.S',
+ )
+ endif
endif
endif
diff --git a/src/msac.h b/src/msac.h
index eb04f58..c3e07e1 100644
--- a/src/msac.h
+++ b/src/msac.h
@@ -51,6 +51,8 @@ typedef struct MsacContext {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/msac.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/msac.h"
#elif ARCH_X86
#include "src/x86/msac.h"
#endif
diff --git a/src/picture.c b/src/picture.c
index f22f05f..94365bc 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -111,15 +111,15 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat
dav1d_free(itut_t35_ctx);
}
-static int picture_alloc_with_edges(Dav1dContext *const c,
- Dav1dPicture *const p,
- const int w, const int h,
- Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref,
- Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref,
- const int bpc,
- const Dav1dDataProps *const props,
- Dav1dPicAllocator *const p_allocator,
- void **const extra_ptr)
+static int picture_alloc(Dav1dContext *const c,
+ Dav1dPicture *const p,
+ const int w, const int h,
+ Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref,
+ Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref,
+ const int bpc,
+ const Dav1dDataProps *const props,
+ Dav1dPicAllocator *const p_allocator,
+ void **const extra_ptr)
{
if (p->data[0]) {
dav1d_log(c, "Picture already allocated!\n");
@@ -194,12 +194,11 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
{
Dav1dThreadPicture *const p = &f->sr_cur;
- const int res =
- picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
- f->seq_hdr, f->seq_hdr_ref,
- f->frame_hdr, f->frame_hdr_ref,
- bpc, &f->tile[0].data.m, &c->allocator,
- (void **) &p->progress);
+ const int res = picture_alloc(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
+ f->seq_hdr, f->seq_hdr_ref,
+ f->frame_hdr, f->frame_hdr_ref,
+ bpc, &f->tile[0].data.m, &c->allocator,
+ (void **) &p->progress);
if (res) return res;
dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
@@ -212,9 +211,10 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
c->itut_t35 = NULL;
c->n_itut_t35 = 0;
- // Don't clear these flags from c->frame_flags if the frame is not visible.
+ // Don't clear these flags from c->frame_flags if the frame is not going to be output.
// This way they will be added to the next visible frame too.
- const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames)
+ const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) &&
+ c->max_spatial_id == f->frame_hdr->spatial_id)
? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO);
p->flags = c->frame_flags;
c->frame_flags &= flags_mask;
@@ -233,11 +233,11 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
{
Dav1dMemPoolBuffer *const buf = (Dav1dMemPoolBuffer *)src->ref->const_data;
struct pic_ctx_context *const pic_ctx = buf->data;
- const int res = picture_alloc_with_edges(c, dst, w, src->p.h,
- src->seq_hdr, src->seq_hdr_ref,
- src->frame_hdr, src->frame_hdr_ref,
- src->p.bpc, &src->m, &pic_ctx->allocator,
- NULL);
+ const int res = picture_alloc(c, dst, w, src->p.h,
+ src->seq_hdr, src->seq_hdr_ref,
+ src->frame_hdr, src->frame_hdr_ref,
+ src->p.bpc, &src->m, &pic_ctx->allocator,
+ NULL);
if (res) return res;
dav1d_picture_copy_props(dst, src->content_light, src->content_light_ref,
diff --git a/src/qm.c b/src/qm.c
index e2e0d61..a523da5 100644
--- a/src/qm.c
+++ b/src/qm.c
@@ -33,1470 +33,6 @@
#include "src/qm.h"
-static const uint8_t qm_tbl_4x4_t[][2][10] = {
- {
- {
- 32,
- 43, 67,
- 73, 94, 137,
- 97, 110, 150, 200,
- }, {
- 35,
- 46, 60,
- 57, 69, 90,
- 66, 71, 90, 109,
- },
- }, {
- {
- 32,
- 41, 63,
- 69, 88, 127,
- 92, 103, 140, 184,
- }, {
- 33,
- 45, 58,
- 56, 66, 86,
- 64, 69, 87, 105,
- },
- }, {
- {
- 32,
- 38, 56,
- 63, 78, 113,
- 86, 97, 130, 169,
- }, {
- 32,
- 45, 55,
- 53, 62, 80,
- 63, 67, 84, 101,
- },
- }, {
- {
- 32,
- 37, 54,
- 58, 72, 102,
- 81, 91, 121, 156,
- }, {
- 32,
- 45, 54,
- 51, 59, 75,
- 61, 65, 81, 97,
- },
- }, {
- {
- 32,
- 34, 49,
- 53, 64, 91,
- 75, 81, 112, 140,
- }, {
- 32,
- 46, 53,
- 49, 55, 70,
- 58, 62, 78, 91,
- },
- }, {
- {
- 32,
- 34, 48,
- 49, 60, 82,
- 72, 79, 104, 134,
- }, {
- 32,
- 46, 53,
- 47, 54, 66,
- 57, 60, 75, 89,
- },
- }, {
- {
- 32,
- 33, 39,
- 45, 51, 71,
- 62, 64, 87, 108,
- }, {
- 31,
- 42, 48,
- 47, 50, 61,
- 53, 54, 67, 78,
- },
- }, {
- {
- 32,
- 33, 38,
- 42, 46, 63,
- 55, 57, 75, 92,
- }, {
- 31,
- 41, 48,
- 46, 48, 58,
- 51, 51, 62, 71,
- },
- }, {
- {
- 32,
- 32, 35,
- 38, 40, 54,
- 51, 49, 64, 81,
- }, {
- 31,
- 38, 47,
- 47, 46, 54,
- 49, 46, 57, 66,
- },
- }, {
- {
- 32,
- 32, 34,
- 35, 37, 48,
- 43, 43, 54, 65,
- }, {
- 31,
- 37, 44,
- 47, 47, 53,
- 47, 45, 53, 59,
- },
- }, {
- {
- 32,
- 32, 33,
- 34, 35, 39,
- 38, 39, 45, 54,
- }, {
- 31,
- 34, 39,
- 42, 45, 48,
- 47, 46, 49, 54,
- },
- }, {
- {
- 32,
- 32, 32,
- 32, 33, 35,
- 35, 35, 38, 46,
- }, {
- 31,
- 32, 34,
- 38, 41, 47,
- 46, 46, 47, 52,
- },
- }, {
- {
- 31,
- 32, 32,
- 32, 32, 33,
- 32, 33, 34, 35,
- }, {
- 31,
- 31, 32,
- 34, 35, 39,
- 38, 40, 43, 47,
- },
- }, {
- {
- 31,
- 31, 32,
- 31, 32, 32,
- 32, 32, 32, 33,
- }, {
- 31,
- 31, 31,
- 31, 31, 32,
- 34, 35, 35, 39,
- },
- }, {
- {
- 31,
- 31, 32,
- 31, 32, 32,
- 31, 32, 32, 32,
- }, {
- 31,
- 31, 31,
- 31, 31, 31,
- 31, 31, 31, 31,
- },
- },
-};
-
-static const uint8_t qm_tbl_8x4[][2][32] = {
- {
- {
- 32, 33, 37, 49, 65, 80, 91, 104,
- 42, 42, 58, 71, 84, 97, 100, 112,
- 75, 69, 84, 103, 125, 142, 145, 146,
- 91, 86, 91, 110, 128, 152, 178, 190,
- }, {
- 31, 40, 46, 48, 54, 61, 64, 68,
- 47, 45, 56, 61, 65, 69, 68, 71,
- 60, 54, 64, 75, 85, 92, 90, 87,
- 66, 61, 64, 73, 82, 92, 102, 105,
- },
- }, {
- {
- 32, 33, 36, 46, 60, 75, 86, 98,
- 42, 42, 56, 67, 79, 92, 95, 105,
- 69, 64, 77, 93, 112, 130, 136, 136,
- 88, 83, 88, 105, 122, 144, 167, 177,
- }, {
- 31, 40, 46, 47, 52, 59, 63, 66,
- 47, 45, 55, 60, 64, 68, 66, 69,
- 57, 52, 61, 70, 79, 87, 88, 85,
- 65, 61, 63, 72, 81, 90, 99, 102,
- },
- }, {
- {
- 32, 32, 34, 44, 54, 72, 82, 92,
- 38, 40, 51, 61, 69, 84, 89, 98,
- 62, 58, 68, 85, 98, 118, 129, 127,
- 86, 80, 85, 101, 117, 136, 157, 165,
- }, {
- 31, 38, 46, 46, 50, 57, 61, 65,
- 47, 46, 53, 56, 59, 64, 65, 67,
- 54, 50, 57, 66, 74, 82, 85, 82,
- 64, 60, 62, 71, 79, 88, 97, 99,
- },
- }, {
- {
- 32, 32, 34, 41, 51, 65, 75, 86,
- 35, 36, 47, 53, 61, 73, 81, 92,
- 59, 57, 65, 78, 92, 108, 117, 119,
- 83, 78, 82, 97, 111, 129, 148, 154,
- }, {
- 31, 36, 46, 45, 49, 54, 59, 63,
- 47, 47, 52, 53, 55, 58, 61, 65,
- 53, 50, 55, 63, 71, 77, 81, 80,
- 63, 59, 61, 70, 77, 86, 94, 95,
- },
- }, {
- {
- 32, 32, 34, 38, 48, 60, 72, 81,
- 35, 36, 42, 51, 59, 68, 79, 86,
- 51, 50, 54, 67, 80, 92, 104, 112,
- 77, 72, 75, 87, 103, 119, 135, 144,
- }, {
- 31, 36, 43, 45, 47, 52, 57, 61,
- 47, 47, 50, 53, 54, 56, 60, 63,
- 50, 47, 50, 58, 66, 70, 75, 77,
- 61, 57, 58, 65, 74, 82, 90, 93,
- },
- }, {
- {
- 32, 32, 34, 37, 45, 54, 65, 75,
- 35, 36, 42, 50, 56, 63, 73, 81,
- 51, 50, 54, 65, 76, 87, 97, 106,
- 75, 71, 73, 84, 96, 110, 125, 136,
- }, {
- 31, 36, 43, 46, 46, 50, 54, 59,
- 47, 47, 50, 53, 54, 55, 58, 61,
- 50, 47, 50, 57, 64, 68, 72, 75,
- 60, 56, 57, 64, 71, 78, 85, 90,
- },
- }, {
- {
- 32, 32, 33, 35, 41, 49, 57, 66,
- 34, 34, 37, 43, 48, 54, 60, 68,
- 43, 42, 44, 54, 64, 71, 78, 86,
- 62, 59, 58, 68, 79, 91, 101, 111,
- }, {
- 31, 33, 40, 47, 45, 48, 51, 55,
- 42, 44, 47, 50, 49, 50, 52, 55,
- 47, 45, 46, 54, 59, 61, 63, 66,
- 54, 51, 50, 57, 64, 70, 75, 79,
- },
- }, {
- {
- 32, 32, 32, 34, 38, 44, 50, 61,
- 32, 33, 35, 37, 40, 45, 50, 58,
- 42, 41, 42, 50, 58, 66, 71, 79,
- 56, 53, 52, 59, 68, 78, 86, 97,
- }, {
- 31, 32, 39, 44, 46, 47, 48, 53,
- 38, 40, 47, 47, 47, 46, 47, 50,
- 47, 45, 45, 51, 56, 59, 61, 64,
- 52, 49, 48, 53, 58, 64, 68, 73,
- },
- }, {
- {
- 32, 32, 32, 34, 35, 40, 46, 52,
- 32, 33, 34, 37, 38, 42, 46, 51,
- 37, 36, 38, 44, 49, 55, 59, 64,
- 52, 49, 49, 54, 60, 69, 76, 83,
- }, {
- 31, 31, 36, 42, 47, 46, 48, 50,
- 38, 40, 44, 47, 48, 46, 46, 48,
- 47, 46, 47, 50, 53, 54, 55, 56,
- 50, 48, 47, 50, 54, 60, 64, 67,
- },
- }, {
- {
- 31, 32, 32, 32, 34, 37, 42, 46,
- 32, 33, 34, 35, 37, 40, 43, 46,
- 35, 34, 36, 38, 43, 49, 53, 56,
- 43, 41, 42, 42, 49, 56, 63, 67,
- }, {
- 31, 31, 35, 39, 43, 47, 46, 48,
- 38, 40, 43, 47, 47, 47, 46, 46,
- 47, 46, 47, 47, 50, 53, 53, 54,
- 48, 45, 46, 45, 50, 55, 58, 59,
- },
- }, {
- {
- 31, 32, 32, 32, 33, 34, 37, 40,
- 32, 32, 33, 33, 34, 36, 38, 40,
- 34, 34, 34, 36, 38, 41, 44, 46,
- 39, 38, 38, 40, 42, 47, 52, 56,
- }, {
- 31, 31, 33, 36, 40, 45, 47, 47,
- 34, 35, 37, 41, 44, 46, 47, 46,
- 42, 42, 44, 46, 48, 49, 50, 49,
- 48, 46, 46, 46, 48, 51, 54, 55,
- },
- }, {
- {
- 31, 32, 32, 32, 32, 33, 34, 35,
- 31, 32, 32, 32, 33, 33, 34, 34,
- 32, 32, 33, 34, 35, 36, 37, 38,
- 35, 35, 34, 36, 38, 40, 42, 48,
- }, {
- 31, 31, 31, 34, 37, 39, 42, 48,
- 31, 31, 32, 36, 39, 41, 43, 46,
- 37, 38, 40, 43, 46, 47, 47, 48,
- 48, 47, 46, 47, 47, 48, 50, 53,
- },
- }, {
- {
- 31, 31, 32, 32, 32, 32, 32, 33,
- 31, 32, 32, 32, 32, 32, 33, 33,
- 32, 32, 32, 32, 33, 34, 34, 35,
- 32, 32, 32, 33, 34, 34, 35, 36,
- }, {
- 31, 31, 31, 31, 34, 35, 38, 41,
- 31, 31, 32, 32, 36, 37, 40, 42,
- 35, 36, 37, 37, 40, 42, 45, 45,
- 37, 38, 39, 40, 43, 44, 47, 47,
- },
- }, {
- {
- 31, 31, 31, 31, 31, 31, 32, 32,
- 31, 32, 32, 32, 32, 32, 32, 32,
- 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33,
- }, {
- 31, 31, 31, 31, 31, 31, 34, 34,
- 31, 31, 31, 32, 32, 33, 36, 36,
- 31, 31, 31, 32, 32, 33, 36, 36,
- 34, 35, 35, 36, 36, 37, 40, 40,
- },
- }, {
- {
- 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32,
- }, {
- 31, 31, 31, 31, 31, 31, 31, 30,
- 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 32,
- 31, 31, 31, 31, 31, 31, 32, 32,
- },
- },
-};
-
-static const uint8_t qm_tbl_8x8_t[][2][36] = {
- {
- {
- 32,
- 32, 35,
- 38, 40, 54,
- 51, 49, 65, 82,
- 68, 63, 78, 97, 117,
- 84, 76, 91, 111, 134, 152,
- 95, 89, 98, 113, 138, 159, 183,
- 109, 102, 106, 121, 142, 168, 199, 220,
- }, {
- 31,
- 38, 47,
- 47, 46, 54,
- 50, 47, 57, 66,
- 57, 52, 61, 72, 82,
- 63, 57, 66, 77, 88, 96,
- 67, 62, 67, 75, 86, 95, 104,
- 71, 67, 68, 75, 84, 95, 107, 113,
- },
- }, {
- {
- 32,
- 32, 35,
- 37, 39, 51,
- 47, 46, 60, 73,
- 62, 58, 71, 87, 105,
- 78, 72, 84, 100, 121, 140,
- 90, 84, 93, 106, 129, 148, 169,
- 102, 96, 100, 113, 132, 155, 183, 201,
- }, {
- 31,
- 38, 47,
- 47, 47, 53,
- 48, 46, 55, 62,
- 54, 50, 58, 67, 76,
- 61, 55, 63, 72, 83, 91,
- 66, 61, 65, 73, 84, 92, 101,
- 69, 65, 66, 73, 82, 92, 103, 109,
- },
- }, {
- {
- 32,
- 32, 34,
- 35, 37, 48,
- 46, 45, 56, 70,
- 57, 54, 64, 80, 93,
- 76, 70, 79, 96, 111, 134,
- 85, 79, 87, 100, 121, 138, 156,
- 96, 90, 93, 105, 122, 144, 168, 184,
- }, {
- 31,
- 36, 43,
- 47, 47, 53,
- 48, 46, 54, 61,
- 52, 49, 55, 65, 71,
- 60, 55, 60, 70, 78, 89,
- 64, 59, 63, 71, 81, 89, 97,
- 67, 63, 64, 71, 79, 89, 99, 104,
- },
- }, {
- {
- 32,
- 32, 33,
- 35, 36, 46,
- 42, 42, 52, 63,
- 53, 51, 60, 73, 86,
- 68, 64, 72, 84, 100, 117,
- 78, 74, 80, 92, 109, 128, 140,
- 90, 84, 87, 98, 114, 133, 155, 168,
- }, {
- 31,
- 34, 39,
- 46, 47, 52,
- 47, 45, 52, 58,
- 50, 48, 54, 62, 68,
- 57, 53, 58, 65, 73, 82,
- 61, 57, 61, 68, 77, 86, 91,
- 65, 61, 62, 68, 76, 86, 95, 100,
- },
- }, {
- {
- 32,
- 32, 33,
- 34, 35, 39,
- 39, 40, 46, 56,
- 50, 48, 53, 65, 78,
- 62, 59, 63, 75, 90, 105,
- 76, 71, 74, 86, 101, 118, 134,
- 84, 79, 81, 92, 106, 123, 142, 153,
- }, {
- 31,
- 34, 39,
- 42, 45, 48,
- 47, 46, 49, 55,
- 49, 47, 50, 58, 65,
- 54, 51, 53, 61, 69, 76,
- 60, 56, 57, 65, 73, 82, 89,
- 64, 59, 60, 66, 74, 83, 92, 96,
- },
- }, {
- {
- 32,
- 32, 33,
- 34, 35, 39,
- 38, 39, 45, 54,
- 46, 45, 51, 61, 71,
- 56, 54, 58, 69, 80, 92,
- 68, 64, 68, 78, 90, 103, 117,
- 78, 74, 76, 86, 99, 113, 128, 140,
- }, {
- 31,
- 34, 39,
- 42, 45, 48,
- 47, 46, 49, 54,
- 48, 46, 50, 56, 61,
- 52, 49, 52, 58, 65, 71,
- 57, 53, 55, 61, 68, 75, 82,
- 61, 57, 58, 64, 71, 79, 86, 91,
- },
- }, {
- {
- 31,
- 32, 32,
- 32, 33, 35,
- 35, 35, 38, 48,
- 42, 41, 43, 54, 63,
- 51, 49, 49, 59, 71, 81,
- 59, 56, 56, 66, 77, 89, 98,
- 69, 65, 64, 73, 85, 97, 108, 119,
- }, {
- 31,
- 32, 35,
- 38, 42, 47,
- 48, 47, 48, 53,
- 47, 45, 45, 53, 58,
- 50, 47, 47, 54, 61, 66,
- 53, 50, 49, 56, 63, 69, 73,
- 57, 54, 52, 58, 65, 72, 77, 82,
- },
- }, {
- {
- 31,
- 32, 32,
- 32, 32, 35,
- 34, 34, 37, 42,
- 38, 37, 40, 47, 54,
- 46, 44, 45, 52, 60, 69,
- 52, 49, 49, 56, 65, 75, 82,
- 63, 59, 58, 65, 73, 84, 92, 105,
- }, {
- 31,
- 31, 32,
- 38, 40, 47,
- 44, 44, 47, 50,
- 47, 45, 46, 51, 54,
- 48, 46, 46, 51, 56, 61,
- 50, 47, 47, 52, 57, 63, 66,
- 55, 52, 50, 54, 60, 66, 70, 76,
- },
- }, {
- {
- 31,
- 32, 32,
- 32, 32, 34,
- 34, 33, 35, 39,
- 35, 34, 37, 42, 48,
- 41, 40, 41, 47, 53, 60,
- 47, 44, 45, 51, 57, 65, 71,
- 53, 50, 51, 55, 61, 70, 77, 85,
- }, {
- 31,
- 31, 32,
- 35, 36, 41,
- 42, 42, 45, 48,
- 48, 46, 47, 50, 53,
- 47, 45, 45, 49, 53, 57,
- 49, 46, 46, 50, 54, 59, 61,
- 51, 48, 48, 51, 54, 60, 64, 68,
- },
- }, {
- {
- 31,
- 31, 32,
- 32, 32, 33,
- 32, 32, 34, 35,
- 34, 34, 35, 37, 41,
- 37, 36, 38, 39, 45, 51,
- 43, 41, 42, 42, 49, 56, 63,
- 47, 44, 45, 46, 52, 59, 67, 71,
- }, {
- 31,
- 31, 32,
- 34, 35, 39,
- 37, 40, 43, 47,
- 43, 43, 45, 47, 49,
- 48, 46, 46, 47, 50, 53,
- 47, 45, 45, 45, 50, 55, 58,
- 49, 46, 46, 46, 50, 55, 60, 61,
- },
- }, {
- {
- 31,
- 31, 32,
- 32, 32, 32,
- 32, 32, 33, 34,
- 33, 33, 34, 35, 37,
- 34, 34, 35, 36, 39, 43,
- 37, 36, 37, 38, 41, 46, 51,
- 41, 39, 40, 41, 44, 49, 54, 58,
- }, {
- 31,
- 31, 31,
- 32, 33, 35,
- 35, 37, 39, 43,
- 39, 41, 42, 45, 47,
- 45, 44, 45, 47, 48, 50,
- 48, 46, 46, 47, 48, 51, 53,
- 48, 46, 45, 46, 47, 51, 54, 56,
- },
- }, {
- {
- 31,
- 31, 32,
- 31, 32, 32,
- 32, 32, 32, 33,
- 32, 32, 32, 34, 35,
- 32, 33, 33, 34, 35, 36,
- 34, 34, 33, 35, 36, 38, 39,
- 35, 35, 34, 36, 38, 40, 42, 48,
- }, {
- 31,
- 31, 31,
- 30, 31, 32,
- 34, 34, 35, 39,
- 36, 37, 39, 42, 46,
- 39, 40, 41, 44, 47, 47,
- 42, 42, 42, 45, 47, 48, 48,
- 48, 47, 46, 47, 47, 49, 50, 53,
- },
- }, {
- {
- 31,
- 31, 32,
- 31, 32, 32,
- 31, 32, 32, 32,
- 32, 32, 32, 32, 33,
- 32, 32, 32, 32, 33, 34,
- 32, 32, 32, 32, 34, 34, 35,
- 33, 33, 33, 33, 35, 35, 36, 38,
- }, {
- 31,
- 31, 31,
- 31, 31, 31,
- 30, 31, 31, 32,
- 34, 34, 35, 35, 39,
- 35, 35, 36, 36, 40, 41,
- 37, 38, 39, 40, 43, 44, 47,
- 40, 41, 41, 42, 44, 45, 47, 48,
- },
- }, {
- {
- 31,
- 31, 32,
- 31, 32, 32,
- 31, 32, 32, 32,
- 31, 32, 32, 32, 32,
- 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33,
- 32, 32, 32, 32, 32, 32, 33, 33,
- }, {
- 31,
- 31, 31,
- 31, 31, 31,
- 31, 31, 31, 31,
- 30, 31, 31, 31, 32,
- 31, 32, 32, 32, 32, 33,
- 33, 34, 34, 35, 35, 36, 39,
- 33, 34, 34, 35, 35, 36, 39, 39,
- },
- }, {
- {
- 31,
- 31, 31,
- 31, 31, 31,
- 31, 31, 32, 32,
- 31, 31, 32, 32, 32,
- 31, 31, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32,
- }, {
- 31,
- 31, 31,
- 31, 31, 31,
- 31, 31, 31, 31,
- 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31,
- 30, 31, 31, 31, 31, 31, 31, 31,
- },
- },
-};
-
-static const uint8_t qm_tbl_16x4[][2][64] = {
- {
- {
- 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108,
- 44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115,
- 79, 72, 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151,
- 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197,
- }, {
- 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69,
- 49, 45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72,
- 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88,
- 69, 65, 62, 60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
- },
- }, {
- {
- 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101,
- 44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108,
- 73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141,
- 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183,
- }, {
- 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67,
- 49, 45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70,
- 61, 55, 54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86,
- 69, 64, 61, 59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
- },
- }, {
- {
- 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94,
- 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101,
- 65, 60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131,
- 90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170,
- }, {
- 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65,
- 48, 46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68,
- 57, 53, 51, 50, 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83,
- 68, 63, 60, 58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
- },
- }, {
- {
- 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88,
- 36, 35, 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94,
- 62, 58, 57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123,
- 88, 82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
- }, {
- 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63,
- 48, 46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66,
- 56, 52, 50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81,
- 67, 62, 60, 57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
- },
- }, {
- {
- 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83,
- 36, 35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88,
- 53, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114,
- 81, 76, 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
- }, {
- 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62,
- 48, 47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64,
- 52, 49, 48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78,
- 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
- },
- }, {
- {
- 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75,
- 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81,
- 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106,
- 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
- }, {
- 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59,
- 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61,
- 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75,
- 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
- },
- }, {
- {
- 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73,
- 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74,
- 44, 43, 41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91,
- 65, 62, 59, 59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
- }, {
- 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58,
- 42, 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58,
- 49, 47, 45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68,
- 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
- },
- }, {
- {
- 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61,
- 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58,
- 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79,
- 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
- }, {
- 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53,
- 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50,
- 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64,
- 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
- },
- }, {
- {
- 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55,
- 32, 32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54,
- 38, 37, 36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67,
- 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
- }, {
- 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51,
- 37, 38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49,
- 48, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57,
- 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
- },
- }, {
- {
- 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46,
- 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46,
- 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56,
- 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
- }, {
- 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48,
- 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46,
- 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54,
- 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
- },
- }, {
- {
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42,
- 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48,
- 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
- }, {
- 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47,
- 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45,
- 42, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49,
- 48, 47, 47, 45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
- },
- }, {
- {
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
- 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
- 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
- }, {
- 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48,
- 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46,
- 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
- 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
- },
- }, {
- {
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
- }, {
- 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42,
- 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43,
- 35, 35, 36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46,
- 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
- },
- }, {
- {
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- }, {
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
- 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
- },
- }, {
- {
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- }, {
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- },
- },
-};
-
-static const uint8_t qm_tbl_16x8[][2][128] = {
- {
- {
- 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118,
- 32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107,
- 36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105,
- 53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118,
- 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136,
- 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157,
- 93, 86, 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182,
- 99, 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203,
- }, {
- 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76,
- 37, 40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69,
- 48, 46, 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67,
- 52, 48, 47, 50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73,
- 57, 52, 51, 53, 57, 64, 67, 73, 76, 82, 83, 86, 83, 83, 84, 82,
- 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91,
- 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, 99, 101, 103, 101,
- 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, 104, 106, 109,
- },
- }, {
- {
- 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110,
- 32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100,
- 36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98,
- 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111,
- 65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129, 127,
- 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141, 151, 147,
- 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153, 157, 163, 171, 169,
- 96, 90, 87, 85, 87, 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188,
- }, {
- 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74,
- 35, 38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67,
- 48, 46, 47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65,
- 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71,
- 57, 52, 51, 51, 57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79,
- 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89,
- 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, 97, 98, 100, 98,
- 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, 101, 103, 105,
- },
- }, {
- {
- 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103,
- 32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94,
- 36, 35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92,
- 44, 41, 42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103,
- 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119,
- 79, 73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137,
- 88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160, 158,
- 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163, 169, 175,
- }, {
- 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72,
- 34, 36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65,
- 48, 46, 47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63,
- 49, 46, 46, 45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69,
- 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77,
- 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86,
- 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, 94, 96, 97, 95,
- 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, 100, 102,
- },
- }, {
- {
- 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97,
- 31, 32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88,
- 35, 34, 35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86,
- 44, 41, 42, 42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97,
- 53, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111,
- 65, 61, 59, 58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128,
- 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147,
- 90, 85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157, 163,
- }, {
- 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70,
- 33, 34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63,
- 45, 45, 46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61,
- 49, 46, 45, 45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67,
- 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75,
- 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83,
- 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, 91, 93, 94, 92,
- 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, 97, 98,
- },
- }, {
- {
- 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91,
- 31, 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82,
- 33, 33, 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80,
- 40, 39, 38, 40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90,
- 51, 49, 47, 48, 52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103,
- 65, 61, 59, 58, 62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119,
- 79, 74, 71, 69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137,
- 87, 82, 79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151,
- }, {
- 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68,
- 32, 33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61,
- 40, 41, 43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59,
- 49, 47, 46, 46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65,
- 51, 49, 47, 47, 49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72,
- 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81,
- 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, 86, 89, 91, 89,
- 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, 94, 95,
- },
- }, {
- {
- 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82,
- 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75,
- 32, 32, 33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72,
- 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81,
- 44, 42, 41, 42, 42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92,
- 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106,
- 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121,
- 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
- }, {
- 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64,
- 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59,
- 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56,
- 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61,
- 49, 47, 45, 46, 45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68,
- 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75,
- 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, 76, 79, 82, 83,
- 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
- },
- }, {
- {
- 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79,
- 31, 32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72,
- 32, 32, 33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70,
- 36, 35, 34, 36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79,
- 44, 42, 41, 42, 42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90,
- 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104,
- 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, 95, 102, 108, 115,
- 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127,
- }, {
- 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63,
- 31, 31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57,
- 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55,
- 48, 47, 46, 47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60,
- 49, 47, 45, 46, 45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67,
- 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75,
- 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, 72, 75, 77, 80,
- 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, 82, 86,
- },
- }, {
- {
- 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65,
- 31, 32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60,
- 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58,
- 35, 35, 34, 35, 36, 37, 41, 46, 47, 49, 51, 54, 57, 60, 63, 66,
- 39, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72,
- 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79,
- 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, 82, 86, 89, 92,
- 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, 100, 105,
- }, {
- 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57,
- 31, 31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52,
- 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50,
- 45, 45, 44, 46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55,
- 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59,
- 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64,
- 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, 66, 68, 69, 70,
- 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, 74, 76,
- },
- }, {
- {
- 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58,
- 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54,
- 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54,
- 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55,
- 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63,
- 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75,
- 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79,
- 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
- }, {
- 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54,
- 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50,
- 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49,
- 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50,
- 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55,
- 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62,
- 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65,
- 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48,
- 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45,
- 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45,
- 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46,
- 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54,
- 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56,
- 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
- 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70,
- }, {
- 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50,
- 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46,
- 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46,
- 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46,
- 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52,
- 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54,
- 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
- 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44,
- 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41,
- 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41,
- 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
- 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
- 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53,
- 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53,
- 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
- }, {
- 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49,
- 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45,
- 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45,
- 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
- 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
- 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53,
- 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53,
- 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
- 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36,
- 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
- 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
- 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46,
- 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
- }, {
- 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49,
- 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47,
- 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46,
- 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47,
- 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
- 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
- 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52,
- 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
- 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
- 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38,
- }, {
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42,
- 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42,
- 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43,
- 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43,
- 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44,
- 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
- 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
- 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
- }, {
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
- 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38,
- 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- }, {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- },
- },
-};
-
-static const uint8_t qm_tbl_32x8[][2][256] = {
- {
- {
- 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
- 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111,
- 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107,
- 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119,
- 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136,
- 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156,
- 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179,
- 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
- }, {
- 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
- 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70,
- 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67,
- 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73,
- 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
- 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90,
- 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99,
- 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108,
- },
- }, {
- {
- 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
- 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104,
- 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100,
- 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111,
- 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127,
- 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145,
- 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166,
- 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190,
- }, {
- 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
- 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68,
- 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65,
- 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71,
- 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
- 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87,
- 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96,
- 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105,
- },
- }, {
- {
- 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106,
- 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97,
- 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93,
- 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104,
- 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118,
- 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
- 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155,
- 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176,
- }, {
- 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73,
- 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66,
- 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63,
- 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69,
- 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
- 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84,
- 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93,
- 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99,
- 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91,
- 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87,
- 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97,
- 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110,
- 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
- 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144,
- 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163,
- }, {
- 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70,
- 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
- 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61,
- 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67,
- 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
- 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82,
- 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90,
- 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93,
- 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85,
- 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81,
- 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90,
- 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
- 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
- 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
- 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
- }, {
- 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68,
- 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62,
- 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59,
- 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65,
- 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
- 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79,
- 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87,
- 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87,
- 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79,
- 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76,
- 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84,
- 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
- 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109,
- 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
- 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
- }, {
- 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66,
- 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60,
- 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57,
- 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63,
- 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
- 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77,
- 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84,
- 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79,
- 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72,
- 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70,
- 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79,
- 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
- 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
- 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115,
- 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127,
- }, {
- 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63,
- 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57,
- 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55,
- 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60,
- 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
- 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75,
- 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80,
- 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64,
- 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62,
- 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70,
- 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
- 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83,
- 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96,
- 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109,
- }, {
- 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59,
- 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54,
- 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
- 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57,
- 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
- 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65,
- 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72,
- 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54,
- 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54,
- 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55,
- 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
- 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75,
- 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79,
- 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87,
- }, {
- 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54,
- 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50,
- 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49,
- 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50,
- 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
- 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62,
- 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65,
- 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47,
- 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48,
- 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
- 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58,
- 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
- 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73,
- }, {
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51,
- 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47,
- 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
- 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
- 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
- 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
- 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60,
- 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
- 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
- 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
- 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
- }, {
- 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
- 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
- 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
- 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
- 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
- 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
- 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46,
- 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
- }, {
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
- 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47,
- 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
- 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
- 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52,
- 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
- 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38,
- }, {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
- 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
- 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
- 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
- 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
- }, {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
- 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39,
- 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42,
- },
- }, {
- {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- }, {
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- },
- },
-};
-
static const uint8_t qm_tbl_32x16[][2][512] = {
{
{
@@ -3069,19 +1605,23 @@ const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
static uint8_t qm_tbl_4x4[15][2][16];
static uint8_t qm_tbl_4x8[15][2][32];
static uint8_t qm_tbl_4x16[15][2][64];
+static uint8_t qm_tbl_8x4[15][2][32];
static uint8_t qm_tbl_8x8[15][2][64];
static uint8_t qm_tbl_8x16[15][2][128];
static uint8_t qm_tbl_8x32[15][2][256];
+static uint8_t qm_tbl_16x4[15][2][64];
+static uint8_t qm_tbl_16x8[15][2][128];
static uint8_t qm_tbl_16x16[15][2][256];
static uint8_t qm_tbl_16x32[15][2][512];
+static uint8_t qm_tbl_32x8[15][2][256];
static uint8_t qm_tbl_32x32[15][2][1024];
-static void subsample(uint8_t *const dst, const uint8_t *const src,
- const int sz, const int step)
+static void subsample(uint8_t *dst, const uint8_t *const src,
+ const int h, const int hstep, const int vstep)
{
- for (int y = 0; y < sz; y++)
- for (int x = 0; x < sz; x++)
- dst[y * sz + x] = src[y * sz * step * step + x * step];
+ for (int y = 0; y < h; y += vstep)
+ for (int x = 0; x < 32; x += hstep)
+ *dst++ = src[y * 32 + x];
}
static void transpose(uint8_t *const dst, const uint8_t *const src,
@@ -3114,28 +1654,33 @@ COLD void dav1d_init_qm_tables(void) {
// because we store coefficients transposed
dav1d_qm_tbl[i][j][RTX_4X8 ] = qm_tbl_8x4[i][j];
dav1d_qm_tbl[i][j][RTX_8X4 ] = qm_tbl_4x8[i][j];
- transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4);
dav1d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j];
dav1d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j];
- transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4);
dav1d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j];
dav1d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j];
- transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8);
dav1d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j];
dav1d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j];
- transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8);
dav1d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j];
dav1d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j];
- transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16);
dav1d_qm_tbl[i][j][ TX_4X4 ] = qm_tbl_4x4[i][j];
dav1d_qm_tbl[i][j][ TX_8X8 ] = qm_tbl_8x8[i][j];
dav1d_qm_tbl[i][j][ TX_16X16] = qm_tbl_16x16[i][j];
dav1d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j];
- untriangle(qm_tbl_4x4[i][j], qm_tbl_4x4_t[i][j], 4);
- untriangle(qm_tbl_8x8[i][j], qm_tbl_8x8_t[i][j], 8);
+
untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32);
- subsample(qm_tbl_16x16[i][j], qm_tbl_32x32[i][j], 16, 2);
+ subsample(qm_tbl_4x4[i][j], &qm_tbl_32x32[i][j][32*3+3], 32, 8, 8);
+ subsample(qm_tbl_8x4[i][j], &qm_tbl_32x16[i][j][32*1+1], 16, 4, 4);
+ subsample(qm_tbl_8x8[i][j], &qm_tbl_32x32[i][j][32*1+1], 32, 4, 4);
+ subsample(qm_tbl_16x4[i][j], &qm_tbl_32x16[i][j][32*1+0], 16, 2, 4);
+ subsample(qm_tbl_16x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 2, 2);
+ subsample(qm_tbl_16x16[i][j], &qm_tbl_32x32[i][j][32*0+0], 32, 2, 2);
+ subsample(qm_tbl_32x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 1, 2);
+ transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4);
+ transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4);
+ transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8);
+ transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8);
+ transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16);
dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32];
dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32];
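
The new subsample() above derives every smaller quantizer-matrix table by striding over a 32-entry-wide master table (32x32 or 32x16) with independent row and column steps, so only the largest matrices need to be stored. A minimal standalone C sketch of that derivation, with illustrative names that are not part of the dav1d API:

    #include <stdint.h>

    /* Keep every hstep-th column of every vstep-th row of a source
     * table that is laid out 32 entries per row. */
    static void subsample_sketch(uint8_t *dst, const uint8_t *const src,
                                 const int h, const int hstep, const int vstep)
    {
        for (int y = 0; y < h; y += vstep)
            for (int x = 0; x < 32; x += hstep)
                *dst++ = src[y * 32 + x];
    }

    /* Example: a 16x16 matrix is the 32x32 master with every other
     * row and column dropped, matching the qm_tbl_16x16 call above. */
    void make_16x16(uint8_t dst[16 * 16], const uint8_t master[32 * 32])
    {
        subsample_sketch(dst, master, 32, 2, 2);
    }
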
diff --git a/src/refmvs.c b/src/refmvs.c
index 0b5ccd3..200afeb 100644
--- a/src/refmvs.c
+++ b/src/refmvs.c
@@ -919,6 +919,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/refmvs.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/refmvs.h"
#elif ARCH_X86
#include "src/x86/refmvs.h"
#endif
@@ -933,6 +935,8 @@ COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
refmvs_dsp_init_arm(c);
+#elif ARCH_LOONGARCH64
+ refmvs_dsp_init_loongarch(c);
#elif ARCH_X86
refmvs_dsp_init_x86(c);
#endif
diff --git a/src/refmvs.h b/src/refmvs.h
index 70dc967..d63874d 100644
--- a/src/refmvs.h
+++ b/src/refmvs.h
@@ -171,6 +171,7 @@ void dav1d_refmvs_find(const refmvs_tile *rt,
void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
+void dav1d_refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);
#endif /* DAV1D_SRC_REF_MVS_H */
diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
new file mode 100644
index 0000000..5677cf4
--- /dev/null
+++ b/src/riscv/64/itx.S
@@ -0,0 +1,1339 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/riscv/asm.S"
+
+function inv_txfm_add_4x4_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ addi t0, t0, 8
+ vle16.v v2, (t0)
+ addi t0, t0, 8
+ vle16.v v3, (t0)
+
+ jalr t0, a4
+
+ vmv.v.x v4, zero
+
+ vsseg4e16.v v0, (a2)
+ vle16.v v0, (a2)
+ vse16.v v4, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ vse16.v v4, (t0)
+ addi t0, t0, 8
+ vle16.v v2, (t0)
+ vse16.v v4, (t0)
+ addi t0, t0, 8
+ vle16.v v3, (t0)
+ vse16.v v4, (t0)
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+
+itx_4x4_end:
+ vsetvli zero, zero, e8, mf4, ta, ma
+ vle8.v v4, (a0)
+ add t0, a0, a1
+ vle8.v v5, (t0)
+ add t0, t0, a1
+ vle8.v v6, (t0)
+ add t0, t0, a1
+ vle8.v v7, (t0)
+
+ vwaddu.wv v0, v0, v4
+ vwaddu.wv v1, v1, v5
+ vwaddu.wv v2, v2, v6
+ vwaddu.wv v3, v3, v7
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vnclipu.wi v4, v0, 0
+ vnclipu.wi v5, v1, 0
+ vnclipu.wi v6, v2, 0
+ vnclipu.wi v7, v3, 0
+
+ vse8.v v4, (a0)
+ add a0, a0, a1
+ vse8.v v5, (a0)
+ add a0, a0, a1
+ vse8.v v6, (a0)
+ add a0, a0, a1
+ vse8.v v7, (a0)
+
+ ret
+endfunc
+
+function inv_identity_e16_x4_rvv, export=1, ext=v
+ li t1, (5793-4096)*8
+ vsmul.vx v4, v0, t1
+ vsmul.vx v5, v1, t1
+ vsmul.vx v6, v2, t1
+ vsmul.vx v7, v3, t1
+
+ vsadd.vv v0, v0, v4
+ vsadd.vv v1, v1, v5
+ vsadd.vv v2, v2, v6
+ vsadd.vv v3, v3, v7
+
+ jr t0
+endfunc
+
+.macro iwht_4
+ vadd.vv v0, v0, v1
+ vsub.vv v5, v2, v3
+ vsub.vv v4, v0, v5
+ vsra.vi v4, v4, 1
+ vsub.vv v2, v4, v1
+ vsub.vv v1, v4, v3
+ vadd.vv v3, v5, v2
+ vsub.vv v0, v0, v1
+.endm
+
+.macro idct_4 o0, o1, o2, o3
+ li t1, 2896
+ li t2, 1567
+ li t3, 3784
+
+ vwmul.vx v16, \o0, t1
+ vwmul.vx v18, \o0, t1
+ vwmacc.vx v16, t1, \o2
+ neg t1, t1
+ vwmacc.vx v18, t1, \o2
+
+ vwmul.vx v20, \o1, t3
+ neg t3, t3
+ vwmul.vx v22, \o1, t2
+ vwmacc.vx v20, t2, \o3
+ vwmacc.vx v22, t3, \o3
+
+ li t1, 2048
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+
+ vsadd.vv \o0, v16, v20
+ vsadd.vv \o1, v18, v22
+ vssub.vv \o2, v18, v22
+ vssub.vv \o3, v16, v20
+.endm
+
+.macro iadst_4 o0, o1, o2, o3
+ li t1, 1321
+ li t2, 3803
+ li t3, 2482
+
+ vwmul.vx v4, v0, t1
+ vwmul.vx v5, v0, t3
+ neg t1, t1
+ vwmacc.vx v4, t2, v2
+ vwmacc.vx v5, t1, v2
+ neg t2, t2
+ vwmacc.vx v4, t3, v3
+ vwmacc.vx v5, t2, v3
+
+ vwsub.vv v6, v0, v2
+ vwadd.wv v6, v6, v3
+
+ li t1, 3344
+ vwmul.vx v7, v1, t1
+
+ vsetvli zero, zero, e32, m1, ta, ma
+
+ vmul.vx v6, v6, t1
+
+ vadd.vv v8, v4, v5
+ vadd.vv v4, v4, v7
+ vadd.vv v5, v5, v7
+ vsub.vv v7, v8, v7
+
+ li t1, 2048
+
+ vadd.vx v4, v4, t1
+ vadd.vx v5, v5, t1
+ vadd.vx v6, v6, t1
+ vadd.vx v7, v7, t1
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+
+ vnsra.wi \o0, v4, 12
+ vnsra.wi \o1, v5, 12
+ vnsra.wi \o2, v6, 12
+ vnsra.wi \o3, v7, 12
+.endm
+
+function inv_dct_e16_x4_rvv, export=1, ext=v
+ idct_4 v0, v1, v2, v3
+ jr t0
+endfunc
+
+function inv_adst_e16_x4_rvv, export=1, ext=v
+ iadst_4 v0, v1, v2, v3
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x4_rvv, export=1, ext=v
+ iadst_4 v3, v2, v1, v0
+ jr t0
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ addi t0, t0, 8
+ vle16.v v2, (t0)
+ addi t0, t0, 8
+ vle16.v v3, (t0)
+
+ vsra.vi v0, v0, 2
+ vsra.vi v1, v1, 2
+ vsra.vi v2, v2, 2
+ vsra.vi v3, v3, 2
+
+ iwht_4
+
+ vmv.v.x v4, zero
+
+ vsseg4e16.v v0, (a2)
+ vle16.v v0, (a2)
+ vse16.v v4, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ vse16.v v4, (t0)
+ addi t0, t0, 8
+ vle16.v v2, (t0)
+ vse16.v v4, (t0)
+ addi t0, t0, 8
+ vle16.v v3, (t0)
+ vse16.v v4, (t0)
+
+ iwht_4
+
+ j itx_4x4_end
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
+.ifc \txfm1\()_\txfm2, dct_dct
+ beqz a3, 1f
+.endif
+ la a4, inv_\txfm1\()_e16_x4_rvv
+ la a5, inv_\txfm2\()_e16_x4_rvv
+ j inv_txfm_add_4x4_rvv
+.ifc \txfm1\()_\txfm2, dct_dct
+1:
+ csrw vxrm, zero
+ vsetivli zero, 4, e16, mf2, ta, ma
+ ld t2, (a2)
+ li t1, 2896*8
+ vmv.v.x v0, t2
+ vsmul.vx v0, v0, t1
+ sd x0, (a2)
+ vsmul.vx v0, v0, t1
+ vssra.vi v0, v0, 4
+ vmv.v.v v1, v0
+ vmv.v.v v2, v0
+ vmv.v.v v3, v0
+ j itx_4x4_end
+.endif
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+ addi t0, t0, 16
+ vle16.v v2, (t0)
+ addi t0, t0, 16
+ vle16.v v3, (t0)
+ addi t0, t0, 16
+ vle16.v v4, (t0)
+ addi t0, t0, 16
+ vle16.v v5, (t0)
+ addi t0, t0, 16
+ vle16.v v6, (t0)
+ addi t0, t0, 16
+ vle16.v v7, (t0)
+
+.ifc \variant, identity_
+ // The identity vsadd.vv and downshift vssra.vi 1 cancel out
+.else
+ jalr t0, a4
+
+ vssra.vi v0, v0, 1
+ vssra.vi v1, v1, 1
+ vssra.vi v2, v2, 1
+ vssra.vi v3, v3, 1
+ vssra.vi v4, v4, 1
+ vssra.vi v5, v5, 1
+ vssra.vi v6, v6, 1
+ vssra.vi v7, v7, 1
+.endif
+
+ vsseg8e16.v v0, (a2)
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+ addi t0, t0, 16
+ vle16.v v2, (t0)
+ addi t0, t0, 16
+ vle16.v v3, (t0)
+ addi t0, t0, 16
+ vle16.v v4, (t0)
+ addi t0, t0, 16
+ vle16.v v5, (t0)
+ addi t0, t0, 16
+ vle16.v v6, (t0)
+ addi t0, t0, 16
+ vle16.v v7, (t0)
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+ vssra.vi v4, v4, 4
+ vssra.vi v5, v5, 4
+ vssra.vi v6, v6, 4
+ vssra.vi v7, v7, 4
+
+ li t1, 64
+ vsetvli zero, t1, e16, m8, ta, ma
+ vmv.v.x v8, zero
+ vse16.v v8, (a2)
+
+.ifc \variant, identity_
+itx_8x8_end:
+.endif
+ vsetivli zero, 8, e8, mf2, ta, ma
+ vle8.v v8, (a0)
+ add t0, a0, a1
+ vle8.v v9, (t0)
+ add t0, t0, a1
+ vle8.v v10, (t0)
+ add t0, t0, a1
+ vle8.v v11, (t0)
+ add t0, t0, a1
+ vle8.v v12, (t0)
+ add t0, t0, a1
+ vle8.v v13, (t0)
+ add t0, t0, a1
+ vle8.v v14, (t0)
+ add t0, t0, a1
+ vle8.v v15, (t0)
+
+ vwaddu.wv v0, v0, v8
+ vwaddu.wv v1, v1, v9
+ vwaddu.wv v2, v2, v10
+ vwaddu.wv v3, v3, v11
+ vwaddu.wv v4, v4, v12
+ vwaddu.wv v5, v5, v13
+ vwaddu.wv v6, v6, v14
+ vwaddu.wv v7, v7, v15
+
+ vsetvli zero, zero, e16, m1
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+ vmax.vx v4, v4, zero
+ vmax.vx v5, v5, zero
+ vmax.vx v6, v6, zero
+ vmax.vx v7, v7, zero
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v8, v0, 0
+ vnclipu.wi v9, v1, 0
+ vnclipu.wi v10, v2, 0
+ vnclipu.wi v11, v3, 0
+ vnclipu.wi v12, v4, 0
+ vnclipu.wi v13, v5, 0
+ vnclipu.wi v14, v6, 0
+ vnclipu.wi v15, v7, 0
+
+ vse8.v v8, (a0)
+ add a0, a0, a1
+ vse8.v v9, (a0)
+ add a0, a0, a1
+ vse8.v v10, (a0)
+ add a0, a0, a1
+ vse8.v v11, (a0)
+ add a0, a0, a1
+ vse8.v v12, (a0)
+ add a0, a0, a1
+ vse8.v v13, (a0)
+ add a0, a0, a1
+ vse8.v v14, (a0)
+ add a0, a0, a1
+ vse8.v v15, (a0)
+
+ ret
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+function inv_identity_e16_x8_rvv, export=1, ext=v
+ vsadd.vv v0, v0, v0
+ vsadd.vv v1, v1, v1
+ vsadd.vv v2, v2, v2
+ vsadd.vv v3, v3, v3
+ vsadd.vv v4, v4, v4
+ vsadd.vv v5, v5, v5
+ vsadd.vv v6, v6, v6
+ vsadd.vv v7, v7, v7
+
+ jr t0
+endfunc
+
+.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
+ idct_4 \o0, \o2, \o4, \o6
+
+ li t1, 799
+ li t2, 4017
+ li t3, 3406
+ li t4, 2276
+
+ vwmul.vx v22, \o1, t2
+ neg t2, t2
+ vwmul.vx v16, \o1, t1
+ vwmacc.vx v22, t1, \o7
+ vwmacc.vx v16, t2, \o7
+
+ vwmul.vx v20, \o5, t4
+ neg t4, t4
+ vwmul.vx v18, \o5, t3
+ vwmacc.vx v20, t3, \o3
+ vwmacc.vx v18, t4, \o3
+
+ li t1, 2048
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+
+ vssub.vv \o7, v22, v20
+ vsadd.vv v22, v22, v20
+ vssub.vv \o1, v16, v18
+ vsadd.vv v16, v16, v18
+
+ li t2, 2896
+
+ vwmul.vx v18, \o7, t2
+ vwmul.vx v20, \o7, t2
+ vwmacc.vx v20, t2, \o1
+ neg t2, t2
+ vwmacc.vx v18, t2, \o1
+
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+
+ vssub.vv \o7, \o0, v22
+ vsadd.vv \o0, \o0, v22
+ vssub.vv v17, \o2, v20
+ vsadd.vv \o1, \o2, v20
+ vssub.vv \o5, \o4, v18
+ vsadd.vv \o2, \o4, v18
+ vssub.vv \o4, \o6, v16
+ vsadd.vv \o3, \o6, v16
+ vmv.v.v \o6, v17
+.endm
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+ li t1, 4076
+ li t2, 401
+ li t3, 3612
+ li t4, 1931
+ li t5, 2598
+ li t6, 3166
+
+ vwmul.vx v8, v7, t1
+ neg t1, t1
+ vwmul.vx v10, v7, t2
+ vwmacc.vx v8, t2, v0
+ vwmacc.vx v10, t1, v0
+
+ vwmul.vx v12, v5, t3
+ neg t3, t3
+ vwmul.vx v14, v5, t4
+ vwmacc.vx v12, t4, v2
+ vwmacc.vx v14, t3, v2
+
+ vwmul.vx v16, v3, t5
+ neg t5, t5
+ vwmul.vx v18, v3, t6
+ vwmacc.vx v16, t6, v4
+ vwmacc.vx v18, t5, v4
+
+ li t1, 2048
+ li t2, 1189
+ li t3, 3920
+ li t4, 1567
+ li t5, 3784
+ li t6, 2896
+
+ vwmul.vx v20, v1, t2
+ neg t2, t2
+ vwmul.vx v22, v1, t3
+ vwmacc.vx v20, t3, v6
+ vwmacc.vx v22, t2, v6
+
+ vwadd.wx v8, v8, t1
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+
+ vnsra.wi v8, v8, 12
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+ vnsra.wi v14, v14, 12
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+
+ vssub.vv v4, v8, v16
+ vsadd.vv v8, v8, v16
+ vsadd.vv v1, v10, v18
+ vsadd.vv v2, v12, v20
+ vsadd.vv v3, v14, v22
+ vssub.vv v5, v10, v18
+ vssub.vv v6, v12, v20
+ vssub.vv v22, v14, v22
+
+ vsadd.vv \o0, v8, v2
+ vsadd.vv \o7, v1, v3
+ vssub.vv v2, v8, v2
+ vssub.vv v3, v1, v3
+
+ vwmul.vx v8, v4, t5
+ vwmul.vx v10, v4, t4
+ vwmul.vx v12, v22, t5
+ vwmul.vx v14, v22, t4
+ vwmacc.vx v8, t4, v5
+ neg t4, t4
+ vwmacc.vx v14, t5, v6
+ neg t5, t5
+ vwmacc.vx v12, t4, v6
+ vwmacc.vx v10, t5, v5
+
+ vwadd.wx v8, v8, t1
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+
+ vnsra.wi v8, v8, 12
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+ vnsra.wi v14, v14, 12
+
+ vsadd.vv \o1, v8, v12
+ vsadd.vv \o6, v10, v14
+ vssub.vv v8, v8, v12
+ vssub.vv v9, v10, v14
+
+ vwmul.vx v10, v2, t6
+ vwmul.vx v12, v2, t6
+ vwmul.vx v14, v8, t6
+ vwmul.vx v16, v8, t6
+ vwmacc.vx v10, t6, v3
+ vwmacc.vx v14, t6, v9
+ neg t6, t6
+ vwmacc.vx v12, t6, v3
+ vwmacc.vx v16, t6, v9
+
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+ vwadd.wx v16, v16, t1
+
+ vnsra.wi \o3, v10, 12
+ vnsra.wi \o4, v12, 12
+ vnsra.wi \o2, v14, 12
+ vnsra.wi \o5, v16, 12
+
+ vmv.v.x v8, zero
+ vssub.vv \o1, v8, \o1
+ vssub.vv \o3, v8, \o3
+ vssub.vv \o5, v8, \o5
+ vssub.vv \o7, v8, \o7
+.endm
+
+function inv_dct_e16_x8_rvv, export=1, ext=v
+ idct_8 v0, v1, v2, v3, v4, v5, v6, v7
+ jr t0
+endfunc
+
+function inv_adst_e16_x8_rvv, export=1, ext=v
+ iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x8_rvv, export=1, ext=v
+ iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
+ jr t0
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
+.ifc \txfm1\()_\txfm2, dct_dct
+ beqz a3, 1f
+.endif
+ la a5, inv_\txfm2\()_e16_x8_rvv
+.ifc \txfm1, identity
+ j inv_txfm_identity_add_8x8_rvv
+.else
+ la a4, inv_\txfm1\()_e16_x8_rvv
+ j inv_txfm_add_8x8_rvv
+.endif
+.ifc \txfm1\()_\txfm2, dct_dct
+1:
+ csrw vxrm, zero
+ vsetivli zero, 8, e16, m1, ta, ma
+ ld t2, (a2)
+ li t1, 2896*8
+ vmv.v.x v0, t2
+ vsmul.vx v0, v0, t1
+ sd x0, (a2)
+ vssra.vi v0, v0, 1
+ vsmul.vx v0, v0, t1
+ vssra.vi v0, v0, 4
+ vmv.v.v v1, v0
+ vmv.v.v v2, v0
+ vmv.v.v v3, v0
+ vmv.v.v v4, v0
+ vmv.v.v v5, v0
+ vmv.v.v v6, v0
+ vmv.v.v v7, v0
+ j itx_8x8_end
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_identity_e16_x16_rvv, export=1, ext=v
+ li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v16, v\i, t1
+ vsadd.vv v\i, v\i, v\i
+ vsadd.vv v\i, v\i, v16
+.endr
+ jr t0
+endfunc
+
+function inv_dct_e16_x16_rvv, export=1, ext=v
+ idct_8 v0, v2, v4, v6, v8, v10, v12, v14
+
+ li t1, 401
+ li t2, 4076
+ li t3, 3166
+ li t4, 2598
+
+ vwmul.vx v30, v1, t2
+ neg t2, t2
+ vwmul.vx v16, v1, t1
+ vwmacc.vx v30, t1, v15
+ vwmacc.vx v16, t2, v15
+
+ vwmul.vx v28, v9, t4
+ neg t4, t4
+ vwmul.vx v18, v9, t3
+ vwmacc.vx v28, t3, v7
+ vwmacc.vx v18, t4, v7
+
+ li t1, 1931
+ li t2, 3612
+ li t3, 3920
+ li t4, 1189
+
+ vwmul.vx v26, v5, t2
+ neg t2, t2
+ vwmul.vx v20, v5, t1
+ vwmacc.vx v26, t1, v11
+ vwmacc.vx v20, t2, v11
+
+ vwmul.vx v24, v13, t4
+ neg t4, t4
+ vwmul.vx v22, v13, t3
+ vwmacc.vx v24, t3, v3
+ vwmacc.vx v22, t4, v3
+
+ li t1, 2048
+ li t2, 2896
+ li t3, 1567
+ li t4, 3784
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v28, v28, 12
+ vnsra.wi v30, v30, 12
+
+ vssub.vv v3, v16, v18
+ vsadd.vv v16, v16, v18
+ vssub.vv v5, v22, v20
+ vsadd.vv v22, v22, v20
+ vssub.vv v11, v24, v26
+ vsadd.vv v24, v24, v26
+ vssub.vv v13, v30, v28
+ vsadd.vv v30, v30, v28
+
+ vwmul.vx v28, v13, t4
+ neg t4, t4
+ vwmul.vx v18, v13, t3
+ vwmul.vx v26, v11, t3
+ vwmacc.vx v28, t3, v3
+ neg t3, t3
+ vwmul.vx v20, v11, t4
+ vwmacc.vx v18, t4, v3
+ vwmacc.vx v20, t3, v5
+ vwmacc.vx v26, t4, v5
+
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v28, v28, 12
+
+ vssub.vv v5, v18, v20
+ vsadd.vv v18, v18, v20
+ vssub.vv v11, v28, v26
+ vsadd.vv v28, v28, v26
+
+ vssub.vv v7, v16, v22
+ vsadd.vv v16, v16, v22
+ vssub.vv v9, v30, v24
+ vsadd.vv v30, v30, v24
+
+ vwmul.vx v20, v11, t2
+ vwmul.vx v22, v9, t2
+ vwmul.vx v24, v9, t2
+ vwmul.vx v26, v11, t2
+ vwmacc.vx v24, t2, v7
+ vwmacc.vx v26, t2, v5
+ neg t2, t2
+ vwmacc.vx v20, t2, v5
+ vwmacc.vx v22, t2, v7
+
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v26, v26, 12
+
+ vssub.vv v15, v0, v30
+ vsadd.vv v0, v0, v30
+ vssub.vv v17, v2, v28
+ vsadd.vv v1, v2, v28
+ vssub.vv v13, v4, v26
+ vsadd.vv v2, v4, v26
+ vssub.vv v19, v6, v24
+ vsadd.vv v3, v6, v24
+ vssub.vv v11, v8, v22
+ vsadd.vv v4, v8, v22
+ vsadd.vv v5, v10, v20
+ vssub.vv v10, v10, v20
+ vssub.vv v9, v12, v18
+ vsadd.vv v6, v12, v18
+ vssub.vv v8, v14, v16
+ vsadd.vv v7, v14, v16
+ vmv.v.v v14, v17
+ vmv.v.v v12, v19
+
+ jr t0
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ li t1, 4091
+ li t2, 201
+ li t3, 3973
+ li t4, 995
+
+ vwmul.vx v16, v15, t1
+ neg t1, t1
+ vwmul.vx v18, v15, t2
+ vwmacc.vx v16, t2, v0
+ vwmacc.vx v18, t1, v0
+
+ vwmul.vx v20, v13, t3
+ neg t3, t3
+ vwmul.vx v22, v13, t4
+ vwmacc.vx v20, t4, v2
+ vwmacc.vx v22, t3, v2
+
+ li t1, 3703
+ li t2, 1751
+ li t3, 3290
+ li t4, 2440
+
+ vwmul.vx v24, v11, t1
+ neg t1, t1
+ vwmul.vx v26, v11, t2
+ vwmacc.vx v24, t2, v4
+ vwmacc.vx v26, t1, v4
+
+ vwmul.vx v28, v9, t3
+ neg t3, t3
+ vwmul.vx v30, v9, t4
+ vwmacc.vx v28, t4, v6
+ vwmacc.vx v30, t3, v6
+
+ li t1, 2048
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
+
+ vnsra.wi v0, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v2, v20, 12
+ vnsra.wi v22, v22, 12
+ vnsra.wi v4, v24, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v6, v28, 12
+ vnsra.wi v30, v30, 12
+
+ li t1, 2751
+ li t2, 3035
+ li t3, 2106
+ li t4, 3513
+
+ vwmul.vx v16, v7, t1
+ neg t1, t1
+ vwmul.vx v20, v7, t2
+ vwmacc.vx v16, t2, v8
+ vwmacc.vx v20, t1, v8
+
+ vwmul.vx v24, v5, t3
+ neg t3, t3
+ vwmul.vx v28, v5, t4
+ vwmacc.vx v24, t4, v10
+ vwmacc.vx v28, t3, v10
+
+ li t1, 2048
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v28, v28, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v9, v20, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v11, v28, 12
+
+ vssub.vv v8, v0, v16
+ vsadd.vv v0, v0, v16
+ vssub.vv v10, v2, v24
+ vsadd.vv v2, v2, v24
+
+ li t1, 1380
+ li t2, 3857
+ li t3, 601
+ li t4, 4052
+
+ vwmul.vx v16, v3, t1
+ neg t1, t1
+ vwmul.vx v20, v3, t2
+ vwmacc.vx v16, t2, v12
+ vwmacc.vx v20, t1, v12
+
+ vwmul.vx v24, v1, t3
+ neg t3, t3
+ vwmul.vx v28, v1, t4
+ vwmacc.vx v24, t4, v14
+ vwmacc.vx v28, t3, v14
+
+ li t1, 2048
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v28, v28, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v13, v20, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v15, v28, 12
+
+ vssub.vv v12, v4, v16
+ vsadd.vv v16, v4, v16
+ vssub.vv v14, v6, v24
+ vsadd.vv v20, v6, v24
+
+ vsadd.vv v1, v18, v9
+ vssub.vv v9, v18, v9
+ vsadd.vv v3, v22, v11
+ vssub.vv v11, v22, v11
+ vsadd.vv v18, v26, v13
+ vssub.vv v13, v26, v13
+ vsadd.vv v22, v30, v15
+ vssub.vv v15, v30, v15
+
+ vssub.vv v4, v0, v16
+ vsadd.vv v0, v0, v16
+ vssub.vv v5, v1, v18
+ vsadd.vv v1, v1, v18
+ vssub.vv v6, v2, v20
+ vsadd.vv v2, v2, v20
+ vssub.vv v7, v3, v22
+ vsadd.vv v3, v3, v22
+
+ li t1, 799
+ li t2, 4017
+ li t3, 3406
+ li t4, 2276
+
+ vwmul.vx v16, v8, t2
+ vwmul.vx v18, v8, t1
+ vwmul.vx v20, v10, t4
+ vwmul.vx v22, v10, t3
+ vwmul.vx v24, v13, t2
+ vwmul.vx v26, v13, t1
+ vwmul.vx v28, v15, t4
+ vwmul.vx v30, v15, t3
+ vwmacc.vx v16, t1, v9
+ neg t1, t1
+ vwmacc.vx v20, t3, v11
+ neg t3, t3
+ vwmacc.vx v26, t2, v12
+ neg t2, t2
+ vwmacc.vx v30, t4, v14
+ neg t4, t4
+ vwmacc.vx v18, t2, v9
+ vwmacc.vx v22, t4, v11
+ vwmacc.vx v24, t1, v12
+ vwmacc.vx v28, t3, v14
+
+ li t1, 2048
+ li t2, 2896
+ li t3, 1567
+ li t4, 3784
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v28, v28, 12
+ vnsra.wi v30, v30, 12
+
+ vsadd.vv v8, v16, v24
+ vsadd.vv v9, v18, v26
+ vsadd.vv v10, v20, v28
+ vsadd.vv v11, v22, v30
+ vssub.vv v12, v16, v24
+ vssub.vv v13, v18, v26
+ vssub.vv v14, v20, v28
+ vssub.vv v15, v22, v30
+
+ vwmul.vx v16, v4, t4
+ vwmul.vx v18, v4, t3
+ vwmul.vx v20, v7, t4
+ vwmul.vx v22, v7, t3
+ vwmul.vx v24, v12, t4
+ vwmul.vx v26, v12, t3
+ vwmul.vx v28, v15, t4
+ vwmul.vx v30, v15, t3
+ vwmacc.vx v16, t3, v5
+ vwmacc.vx v22, t4, v6
+ vwmacc.vx v24, t3, v13
+ neg t3, t3
+ vwmacc.vx v30, t4, v14
+ neg t4, t4
+ vwmacc.vx v20, t3, v6
+ vwmacc.vx v28, t3, v14
+ vwmacc.vx v18, t4, v5
+ vwmacc.vx v26, t4, v13
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v28, v28, 12
+ vnsra.wi v30, v30, 12
+
+.ifc \o0, v0
+ vsadd.vv \o14, v9, v11
+ vssub.vv v11, v9, v11
+ vssub.vv v9, v1, v3
+ vsadd.vv \o15, v1, v3
+ vsadd.vv \o1, v8, v10
+ vssub.vv v10, v8, v10
+ vssub.vv v8, v0, v2
+ vsadd.vv \o0, v0, v2
+.else
+ vsadd.vv \o1, v8, v10
+ vssub.vv v10, v8, v10
+ vssub.vv v8, v0, v2
+ vsadd.vv \o0, v0, v2
+ vsadd.vv v2, v9, v11
+ vssub.vv v11, v9, v11
+ vssub.vv v9, v1, v3
+ vsadd.vv \o15, v1, v3
+ vmv.v.v \o14, v2
+.endif
+
+ vsadd.vv \o3, v16, v20
+ vssub.vv v6, v16, v20
+ vsadd.vv \o12, v18, v22
+ vssub.vv v7, v18, v22
+ vsadd.vv \o2, v24, v28
+ vssub.vv v24, v24, v28
+ vsadd.vv \o13, v26, v30
+ vssub.vv v26, v26, v30
+
+ neg t3, t2
+
+ vwmul.vx v28, v24, t2
+ vwmul.vx v30, v24, t2
+ vwmacc.vx v28, t2, v26
+ vwmacc.vx v30, t3, v26
+
+ vwmul.vx v24, v10, t2
+ vwmul.vx v26, v10, t2
+ vwmacc.vx v24, t2, v11
+ vwmacc.vx v26, t3, v11
+
+ vwmul.vx v20, v6, t2
+ vwmul.vx v22, v6, t2
+ vwmacc.vx v20, t2, v7
+ vwmacc.vx v22, t3, v7
+
+ vwmul.vx v16, v8, t2
+ vwmul.vx v18, v8, t2
+ vwmacc.vx v16, t2, v9
+ vwmacc.vx v18, t3, v9
+
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
+
+ vnsra.wi \o7, v16, 12
+ vnsra.wi \o8, v18, 12
+ vnsra.wi \o4, v20, 12
+ vnsra.wi \o11, v22, 12
+ vnsra.wi \o6, v24, 12
+ vnsra.wi \o9, v26, 12
+ vnsra.wi \o5, v28, 12
+ vnsra.wi \o10, v30, 12
+
+ vmv.v.x v16, zero
+ vssub.vv \o1, v16, \o1
+ vssub.vv \o3, v16, \o3
+ vssub.vv \o5, v16, \o5
+ vssub.vv \o7, v16, \o7
+ vssub.vv \o9, v16, \o9
+ vssub.vv \o11, v16, \o11
+ vssub.vv \o13, v16, \o13
+ vssub.vv \o15, v16, \o15
+.endm
+
+function inv_adst_e16_x16_rvv, export=1, ext=v
+ iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x16_rvv, export=1, ext=v
+ iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
+ jr t0
+endfunc
+
+.macro def_horz_16 variant
+function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
+ vmv.v.x v16, zero
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vle16.v v\i, (t4)
+ vse16.v v16, (t4)
+ add t4, t4, t6
+.endr
+.ifc \variant, _identity
+ li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v16, v\i, t1
+ vsra.vi v16, v16, 1
+ vaadd.vv v\i, v\i, v16
+.endr
+.else
+ jalr t0, a4
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 2
+.endr
+.endif
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsse16.v v\i, (t5), t6
+ addi t5, t5, 2
+.endr
+ jr a7
+endfunc
+.endm
+
+def_horz_16
+def_horz_16 _identity
+
+function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
+ vsetivli zero, 8, e16, m1, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vle16.v v\i, (t4)
+ add t4, t4, t6
+.endr
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetivli zero, 8, e8, mf2, ta, ma
+ mv t0, t5
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vle8.v v\i, (t0)
+ add t0, t0, a1
+.endr
+
+ vwaddu.wv v0, v0, v16
+ vwaddu.wv v1, v1, v17
+ vwaddu.wv v2, v2, v18
+ vwaddu.wv v3, v3, v19
+ vwaddu.wv v4, v4, v20
+ vwaddu.wv v5, v5, v21
+ vwaddu.wv v6, v6, v22
+ vwaddu.wv v7, v7, v23
+ vwaddu.wv v8, v8, v24
+ vwaddu.wv v9, v9, v25
+ vwaddu.wv v10, v10, v26
+ vwaddu.wv v11, v11, v27
+ vwaddu.wv v12, v12, v28
+ vwaddu.wv v13, v13, v29
+ vwaddu.wv v14, v14, v30
+ vwaddu.wv v15, v15, v31
+
+ vsetvli zero, zero, e16, m1
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ vnclipu.wi v16, v0, 0
+ vnclipu.wi v17, v1, 0
+ vnclipu.wi v18, v2, 0
+ vnclipu.wi v19, v3, 0
+ vnclipu.wi v20, v4, 0
+ vnclipu.wi v21, v5, 0
+ vnclipu.wi v22, v6, 0
+ vnclipu.wi v23, v7, 0
+ vnclipu.wi v24, v8, 0
+ vnclipu.wi v25, v9, 0
+ vnclipu.wi v26, v10, 0
+ vnclipu.wi v27, v11, 0
+ vnclipu.wi v28, v12, 0
+ vnclipu.wi v29, v13, 0
+ vnclipu.wi v30, v14, 0
+ vnclipu.wi v31, v15, 0
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vse8.v v\i, (t5)
+ add t5, t5, a1
+.endr
+
+ jr a7
+endfunc
+
+function inv_txfm_add_16x16_rvv, export=1, ext=v
+ csrw vxrm, zero
+ vsetivli zero, 8, e16, m1, ta, ma
+ addi sp, sp, -16*32
+.irp i, 0, 8
+ addi t4, a2, \i*2
+ addi t5, sp, \i*16*2
+ li t6, 16*2
+ jalr a7, a6
+.endr
+.irp i, 0, 8
+ addi t4, sp, \i*2
+ addi t5, a0, \i
+ li t6, 16*2
+ jal a7, inv_txfm_add_vert_8x16_rvv
+.endr
+ addi sp, sp, 16*32
+ ret
+endfunc
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
+.ifc \txfm1, identity
+ la a6, inv_txfm_horz_identity_16x8_rvv
+.else
+ la a6, inv_txfm_horz_16x8_rvv
+ la a4, inv_\txfm1\()_e16_x16_rvv
+.endif
+ la a5, inv_\txfm2\()_e16_x16_rvv
+ j inv_txfm_add_16x16_rvv
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
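
For readers following the vector code, the idct_4 macro above is the standard 4-point AV1 inverse DCT butterfly; a scalar C sketch of what it computes per lane (the saturating adds and vector widening are omitted, and the rounding offset 2048 with the 12-bit shift matches the vwadd.wx/vnsra.wi pairs):

    #include <stdio.h>

    /* 2896/4096 ~ sqrt(1/2), 1567/4096 ~ cos(3*pi/8), 3784/4096 ~ sin(3*pi/8) */
    static void idct4_sketch(int io[4])
    {
        const int in0 = io[0], in1 = io[1], in2 = io[2], in3 = io[3];
        const int t0 = ((in0 + in2) * 2896 + 2048) >> 12;
        const int t1 = ((in0 - in2) * 2896 + 2048) >> 12;
        const int t2 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
        const int t3 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
        io[0] = t0 + t2;
        io[1] = t1 + t3;
        io[2] = t1 - t3;
        io[3] = t0 - t2;
    }

    int main(void)
    {
        int c[4] = { 1024, 0, 0, 0 }; /* a lone DC coefficient */
        idct4_sketch(c);
        printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]); /* four equal samples */
        return 0;
    }
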
diff --git a/src/riscv/asm.S b/src/riscv/asm.S
new file mode 100644
index 0000000..2435170
--- /dev/null
+++ b/src/riscv/asm.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RISCV_ASM_S
+#define DAV1D_SRC_RISCV_ASM_S
+
+#include "config.h"
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, ext=
+ .macro endfunc
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .option pop
+ .purgem endfunc
+ .endm
+ .text
+ .option push
+ .ifnb \ext
+ .option arch, +\ext
+ .endif
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .type EXTERN\name, %function
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .else
+#ifdef __ELF__
+ .type \name, %function
+#endif
+ .endif
+\name:
+.endm
+
+.macro const name, export=0, align=2
+ .macro endconst
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .purgem endconst
+ .endm
+#if defined(_WIN32)
+ .section .rdata
+#elif !defined(__MACH__)
+ .section .rodata
+#else
+ .const_data
+#endif
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .endif
+\name:
+.endm
+
+.macro thread_local name, align=3, quads=1
+ .macro end_thread_local
+ .size \name, . - \name
+ .purgem end_thread_local
+ .endm
+ .section .tbss, "waT"
+ .align \align
+ .hidden \name
+\name:
+ .rept \quads
+ .quad 0
+ .endr
+ end_thread_local
+.endm
+
+#endif /* DAV1D_SRC_RISCV_ASM_S */
diff --git a/src/riscv/cpu.c b/src/riscv/cpu.c
new file mode 100644
index 0000000..1637710
--- /dev/null
+++ b/src/riscv/cpu.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2022, VideoLAN and dav1d authors
+ * Copyright © 2022, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/riscv/cpu.h"
+
+#if defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+
+#define HWCAP_RVV (1 << ('v' - 'a'))
+
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_riscv(void) {
+ unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL)
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
+#endif
+
+ return flags;
+}
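
The HWCAP_RVV constant above follows the Linux RISC-V convention of one AT_HWCAP bit per single-letter ISA extension, so 'v' lands on bit 21; a tiny sketch of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        /* 'v' - 'a' == 21, so the vector extension maps to bit 21. */
        const unsigned long hwcap_rvv = 1ul << ('v' - 'a');
        printf("HWCAP_RVV = 0x%lx\n", hwcap_rvv); /* prints 0x200000 */
        return 0;
    }
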
diff --git a/src/riscv/cpu.h b/src/riscv/cpu.h
new file mode 100644
index 0000000..8ab7f53
--- /dev/null
+++ b/src/riscv/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2022, VideoLAN and dav1d authors
+ * Copyright © 2022, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RISCV_CPU_H
+#define DAV1D_SRC_RISCV_CPU_H
+
+enum CpuFlags {
+ DAV1D_RISCV_CPU_FLAG_V = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_riscv(void);
+
+#endif /* DAV1D_SRC_RISCV_CPU_H */
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
new file mode 100644
index 0000000..28c5e54
--- /dev/null
+++ b/src/riscv/itx.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns(16, 16, ext)
+
+decl_itx_fns(rvv);
+
+static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
+
+#if BITDEPTH == 8
+ assign_itx17_fn( , 4, 4, rvv);
+ assign_itx16_fn( , 8, 8, rvv);
+ assign_itx12_fn( , 16, 16, rvv);
+#endif
+}
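
The assign_itx*_fn macros above resolve to plain table assignments; BF() pastes the bitdepth and ISA suffixes onto the base name, so at BITDEPTH == 8 the 4x4 DCT entry ends up pointing at dav1d_inv_txfm_add_dct_dct_4x4_8bpc_rvv. A small sketch of that token pasting using stand-in macros (the real BF() lives in the common bitdepth headers):

    #include <stdio.h>

    /* Stand-ins that mimic the suffixing done by dav1d's BF() at BITDEPTH == 8. */
    #define PASTE(a, b) a##b
    #define BF_SKETCH(name, isa) PASTE(PASTE(name, _8bpc_), isa)
    #define STRINGIFY(x) #x
    #define EXPAND_STR(x) STRINGIFY(x)

    int main(void)
    {
        /* Prints dav1d_inv_txfm_add_dct_dct_4x4_8bpc_rvv, the symbol that
         * assign_itx_fn(, 4, 4, dct_dct, DCT_DCT, rvv) installs in
         * c->itxfm_add[TX_4X4][DCT_DCT]. */
        puts(EXPAND_STR(BF_SKETCH(dav1d_inv_txfm_add_dct_dct_4x4, rvv)));
        return 0;
    }
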
diff --git a/src/x86/cpu.c b/src/x86/cpu.c
index 764d8be..f570fd7 100644
--- a/src/x86/cpu.c
+++ b/src/x86/cpu.c
@@ -57,7 +57,6 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
if (cpu.max_leaf >= 1) {
CpuidRegisters r;
dav1d_cpu_cpuid(&r, 1, 0);
- const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0);
const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
@@ -87,10 +86,8 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
}
#endif
if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
- if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 ||
- (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60)))))
- {
- /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */
+ if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && family <= 0x19) {
+ /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+, Zen 4 */
flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
}
}
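
The simplified AMD check above keeps only the family decode; as a worked example, a CPUID leaf-1 eax of 0x00a20f10 (as reported by Zen 3 desktop parts) gives base family 0xf plus extended family 0xa, i.e. 0x19, so such CPUs still receive the slow-gather flag without the per-model tests that were removed:

    #include <stdio.h>

    static unsigned decode_family(const unsigned eax)
    {
        /* Same decode as dav1d_get_cpu_flags_x86() above. */
        return ((eax >> 8) & 0x0f) + ((eax >> 20) & 0xff);
    }

    int main(void)
    {
        printf("family = 0x%02x\n", decode_family(0x00a20f10)); /* 0x19 */
        return 0;
    }
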
diff --git a/src/x86/filmgrain.h b/src/x86/filmgrain.h
index eeaa328..8f6ac8f 100644
--- a/src/x86/filmgrain.h
+++ b/src/x86/filmgrain.h
@@ -73,9 +73,11 @@ static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *cons
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
- c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
- c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
- c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
- c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+ if (BITDEPTH == 8 || !(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+ }
#endif
}
diff --git a/src/x86/filmgrain16_avx512.asm b/src/x86/filmgrain16_avx512.asm
index 00dd6af..5cbebce 100644
--- a/src/x86/filmgrain16_avx512.asm
+++ b/src/x86/filmgrain16_avx512.asm
@@ -29,11 +29,7 @@
%if ARCH_X86_64
-SECTION_RODATA 64
-pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
- db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+SECTION_RODATA 16
scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1
scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4
pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
@@ -53,6 +49,8 @@ uv_offset_mul: dd 256
dd 1024
pb_8_9_0_1: db 8, 9, 0, 1
+cextern pb_0to63
+
SECTION .text
INIT_ZMM avx512icl
@@ -382,7 +380,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling
packssdw m4, m5, m5
vpbroadcastd m21, [base+scale_shift+r9*8+4]
%if %2
- mova m12, [base+pb_0to63] ; pw_even
+ mova m12, [pb_0to63] ; pw_even
mov r13d, 0x0101
vpbroadcastq m10, [base+pw_23_22+r9*8]
kmovw k3, r13d
diff --git a/src/x86/ipred.h b/src/x86/ipred.h
index 29e1d96..57aff0f 100644
--- a/src/x86/ipred.h
+++ b/src/x86/ipred.h
@@ -137,11 +137,15 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
#endif
init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
c->pal_pred = BF(dav1d_pal_pred, avx512icl);
diff --git a/src/x86/ipred16_avx512.asm b/src/x86/ipred16_avx512.asm
index 60f08d7..6980261 100644
--- a/src/x86/ipred16_avx512.asm
+++ b/src/x86/ipred16_avx512.asm
@@ -1,5 +1,5 @@
-; Copyright © 2022, VideoLAN and dav1d authors
-; Copyright © 2022, Two Orioles, LLC
+; Copyright © 2022-2024, VideoLAN and dav1d authors
+; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,20 @@ pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
+pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+ dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+pw_1to32: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6
+ dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
+z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2
+ dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4
+z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1
+ dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168
+z_xpos_off1b: dw 30720, 30848, 30976, 31104, 31232, 31360, 31488, 31616
filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
times 4 db 10, 11, 12, 13, 2, 3, -1, -1
filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
@@ -57,8 +71,36 @@ filter_shift: times 2 dw 6
dd 0
times 2 dw 4
dd 9
-pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
- db 16, 24, 20, 28, 48, 56, 52, 60
+pd_65536: dd 65536
+pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
+ db 16, 24, 20, 28, 48, 56, 52, 60
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 79, 79, 79
+z_filter_k: dw 8, 8, 6, 6, 4, 4
+ dw 4, 4, 5, 5, 4, 4
+ dw 0, 0, 0, 0, 2, 2
+pb_90: times 4 db 90
+pw_15: times 2 dw 15
+pw_16: times 2 dw 16
+pw_17: times 2 dw 17
+pw_24: times 2 dw 24
+pw_31: times 2 dw 31
+pw_32: times 2 dw 32
+pw_63: times 2 dw 63
+pw_64: times 2 dw 64
+pw_512: times 2 dw 512
+pw_2048: times 2 dw 2048
+pw_31806: times 2 dw 31806
+pw_32640: times 2 dw 32640
+pw_32672: times 2 dw 32672
+pw_32704: times 2 dw 32704
+pw_32735: times 2 dw 32735
+pw_32736: times 2 dw 32736
+
+%define pw_2 (z_xpos_mul+4* 2)
+%define pw_3 (z_xpos_mul+4* 4)
+%define pw_7 (z_xpos_mul+4*12)
+%define pw_0to31 (pw_1to32-2)
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@@ -74,10 +116,14 @@ JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64
cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
+cextern dr_intra_derivative
cextern filter_intra_taps
SECTION .text
@@ -612,6 +658,1764 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
+%if WIN64
+ DECLARE_REG_TMP 4
+%else
+ DECLARE_REG_TMP 8
+%endif
+
+cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
+%define base r7-z_filter_t0
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative]
+ movsxd wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4]
+ add tlq, 2
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [t0+dxq]
+ lea wq, [base+ipred_z1_16bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m15, [base+pw_31806]
+ jmp wq
+.w4:
+ vpbroadcastw m5, [tlq+14]
+ vinserti32x4 m5, [tlq], 0
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ call .upsample_top
+ vpbroadcastq m0, [base+z_xpos_off1b]
+ jmp .w4_main2
+.w4_no_upsample:
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+3]
+ vpbroadcastb xm0, r3d
+ vpbroadcastb xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpcmpeqb k1, xm0, [base+z_filter_wh]
+ vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+ kmovw r5d, k1
+ test r5d, r5d
+ jz .w4_main
+ call .w16_filter
+ mov r2d, 9
+ cmp hd, 4
+ cmovne r3d, r2d
+ vpbroadcastw m6, r3d
+ pminuw m6, [base+pw_0to31]
+ vpermw m5, m6, m5
+.w4_main:
+ vpbroadcastq m0, [base+z_xpos_off1a]
+.w4_main2:
+ movsldup m3, [base+z_xpos_mul]
+ vpbroadcastw m4, dxd
+ lea r2, [strideq*3]
+ pmullw m3, m4
+ vshufi32x4 m6, m5, m5, q3321
+ psllw m4, 3 ; dx*8
+ paddsw m3, m0 ; xpos
+ palignr m6, m5, 2 ; top+1
+.w4_loop:
+ psrlw m1, m3, 6 ; base_x
+ pand m2, m15, m3 ; frac
+ vpermw m0, m1, m5 ; top[base_x]
+ vpermw m1, m1, m6 ; top[base_x+1]
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ sub hd, 8
+ jl .w4_end
+ vextracti32x4 xm1, m0, 2
+ paddsw m3, m4 ; xpos += dx
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+r2 ], xm0
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.upsample_top:
+ vinserti32x4 m5, [tlq-16], 3
+ mova m3, [base+z_upsample]
+ vpbroadcastd m4, [base+pd_65536]
+ add dxd, dxd
+ vpermw m0, m3, m5
+ paddw m3, m4
+ vpermw m1, m3, m5
+ paddw m3, m4
+ vpermw m2, m3, m5
+ paddw m3, m4
+ vpermw m3, m3, m5
+ vpbroadcastw m5, r9m ; pixel_max
+ paddw m1, m2 ; b+c
+ paddw m0, m3 ; a+d
+ psubw m0, m1, m0
+ psraw m0, 3
+ pxor m2, m2
+ paddw m0, m1
+ pmaxsw m0, m2
+ pavgw m0, m2
+ pminsw m5, m0
+ ret
+.w8:
+ lea r3d, [angleq+216]
+ movu ym5, [tlq]
+ mov r3b, hb
+ movu m10, [base+pw_0to31]
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ lea r3d, [hq+7]
+ vpbroadcastw m6, r3d
+ add r3d, r3d
+ pminuw m6, m10
+ vpermw m5, m6, m5
+ call .upsample_top
+ vbroadcasti32x4 m0, [base+z_xpos_off1b]
+ jmp .w8_main2
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ vpbroadcastb ym0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ vpbroadcastw m6, r3d
+ pminuw m6, m10
+ vpermw m5, m6, m5
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w8_main
+ call .w16_filter
+ cmp hd, r3d
+ jl .w8_filter_end
+ pminud m6, m10, [base+pw_17] {1to16}
+ add r3d, 2
+.w8_filter_end:
+ vpermw m5, m6, m5
+.w8_main:
+ vbroadcasti32x4 m0, [base+z_xpos_off1a]
+.w8_main2:
+ movshdup m3, [base+z_xpos_mul]
+ vpbroadcastw m4, dxd
+ shl r3d, 6
+ lea r2, [strideq*3]
+ pmullw m3, m4
+ vshufi32x4 m6, m5, m5, q3321
+ sub r3d, dxd
+ psllw m4, 2 ; dx*4
+ shl dxd, 2
+ paddsw m3, m0 ; xpos
+ palignr m6, m5, 2 ; top+1
+.w8_loop:
+ psrlw m1, m3, 6 ; base_x
+ pand m2, m15, m3 ; frac
+ vpermw m0, m1, m5 ; top[base_x]
+ vpermw m1, m1, m6 ; top[base_x+1]
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ sub hd, 4
+ jz .w8_end
+ paddsw m3, m4 ; xpos += dx
+ lea dstq, [dstq+strideq*4]
+ sub r3d, dxd
+ jg .w8_loop
+ vextracti32x4 xm5, m5, 3
+.w8_end_loop:
+ mova [dstq+strideq*0], xm5
+ mova [dstq+strideq*1], xm5
+ mova [dstq+strideq*2], xm5
+ mova [dstq+r2 ], xm5
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_filter:
+ vpbroadcastw m1, [tlq-2]
+ popcnt r5d, r5d
+ valignq m3, m6, m5, 2
+ vpbroadcastd m7, [base+z_filter_k+(r5-1)*4+12*0]
+ valignq m1, m5, m1, 6
+ vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
+ palignr m2, m3, m5, 2
+ vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
+ palignr m0, m5, m1, 14
+ pmullw m7, m5
+ palignr m3, m5, 4
+ paddw m0, m2
+ palignr m5, m1, 12
+ pmullw m0, m8
+ paddw m5, m3
+ pmullw m5, m9
+ pxor m1, m1
+ paddw m0, m7
+ paddw m5, m0
+ psrlw m5, 3
+ pavgw m5, m1
+ ret
+.w16:
+ lea r3d, [hq+15]
+ vpbroadcastb ym0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ vpbroadcastw m11, r3d
+ pminuw m10, m11, [base+pw_0to31]
+ vpbroadcastw m6, [tlq+r3*2]
+ vpermw m5, m10, [tlq]
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w16_main
+ call .w16_filter
+ cmp hd, 16
+ jg .w16_filter_h32
+ vpermw m6, m11, m5
+ vpermw m5, m10, m5
+ jmp .w16_main
+.w16_filter_h32:
+ movzx r3d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ lea r2d, [r2+r3*8+4]
+ sub r2d, r3d
+ mov r3d, 1
+ shr r2d, 3
+ kmovb k1, r3d
+ movd xm0, r2d
+ or r3d, 32
+ vmovdqu16 m6{k1}, m0
+.w16_main:
+ rorx r2d, dxd, 23
+ mov r7, rsp
+ and rsp, ~63
+ vpbroadcastw m3, r2d
+ sub rsp, 64*2
+ mov r2d, dxd
+ paddw m4, m3, m3
+ mova [rsp+64*0], m5
+ vinserti32x8 m3, ym4, 1
+ mova [rsp+64*1], m6
+ shl r3d, 6
+.w16_loop:
+ lea r5d, [r2+dxq]
+ shr r2d, 6
+ movu ym0, [rsp+r2*2]
+ movu ym1, [rsp+r2*2+2]
+ lea r2d, [r5+dxq]
+ shr r5d, 6
+ vinserti32x8 m0, [rsp+r5*2], 1
+ vinserti32x8 m1, [rsp+r5*2+2], 1
+ pand m2, m15, m3 ; frac << 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ paddw m3, m4
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r3d
+ jl .w16_loop
+ punpckhqdq ym6, ym6
+.w16_end_loop:
+ mova [dstq+strideq*0], ym6
+ mova [dstq+strideq*1], ym6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ mov rsp, r7
+ RET
+.w32:
+ lea r3d, [hq+31]
+ movu m7, [tlq+64*0]
+ and r3d, 31
+ vpbroadcastw m11, r3d
+ or r3d, 32 ; imin(h+31, 63)
+ pminuw m10, m11, [base+pw_0to31]
+ vpbroadcastw m9, [tlq+r3*2]
+ vpermw m8, m10, [tlq+64*1]
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m5, [base+pw_3]
+ mov r5d, ~1
+ movu m3, [tlq-2]
+ kmovd k1, r5d
+ valignq m2, m8, m7, 6
+ paddw m7, m3
+ vmovdqu16 m3{k1}, [tlq-4]
+ valignq m4, m9, m8, 2
+ paddw m3, m5
+ paddw m7, [tlq+2]
+ palignr m1, m8, m2, 14
+ pavgw m3, [tlq+4]
+ palignr m2, m8, m2, 12
+ paddw m7, m3
+ palignr m3, m4, m8, 2
+ psrlw m7, 2
+ palignr m4, m8, 4
+ paddw m8, m1
+ paddw m2, m5
+ paddw m8, m3
+ pavgw m2, m4
+ paddw m8, m2
+ psrlw m8, 2
+ cmp hd, 64
+ je .w32_filter_h64
+ vpermw m9, m11, m8
+ vpermw m8, m10, m8
+ jmp .w32_main
+.w32_filter_h64:
+ movzx r3d, word [tlq+126]
+ movzx r2d, word [tlq+124]
+ lea r2d, [r2+r3*8+4]
+ sub r2d, r3d
+ mov r3d, 65
+ shr r2d, 3
+ movd xm0, r2d
+ vpblendmw m9{k1}, m0, m9
+.w32_main:
+ rorx r2d, dxd, 23
+ mov r7, rsp
+ and rsp, ~63
+ vpbroadcastw m5, r2d
+ sub rsp, 64*4
+ mov r2d, dxd
+ mova [rsp+64*0], m7
+ shl r3d, 6
+ mova [rsp+64*1], m8
+ mova m6, m5
+ mova [rsp+64*2], m9
+ punpckhqdq m9, m9
+ mova [rsp+64*3], ym9
+.w32_loop:
+ lea r5d, [r2+dxq]
+ shr r2d, 6
+ movu m0, [rsp+r2*2]
+ movu m2, [rsp+r2*2+2]
+ lea r2d, [r5+dxq]
+ shr r5d, 6
+ movu m1, [rsp+r5*2]
+ movu m3, [rsp+r5*2+2]
+ pand m4, m15, m5
+ paddw m5, m6
+ psubw m2, m0
+ pmulhrsw m2, m4
+ pand m4, m15, m5
+ psubw m3, m1
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jz .w32_end
+ paddw m5, m6
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r3d
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+strideq*0], m9
+ mova [dstq+strideq*1], m9
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ mov rsp, r7
+ RET
+.w64_filter96:
+ vpbroadcastd m4, [base+pw_3]
+ mov r5d, ~1
+ movu m0, [tlq-2]
+ kmovd k1, r5d
+ paddw m7, m0
+ vmovdqu16 m0{k1}, [tlq-4]
+ paddw m0, m4
+ paddw m7, [tlq+2]
+ pavgw m0, [tlq+4]
+ valignq m1, m9, m8, 6
+ paddw m8, [tlq+62]
+ paddw m2, m4, [tlq+60]
+ valignq m3, m10, m9, 2
+ paddw m8, [tlq+66]
+ pavgw m2, [tlq+68]
+ paddw m7, m0
+ palignr m0, m9, m1, 14
+ paddw m8, m2
+ palignr m1, m9, m1, 12
+ psrlw m7, 2
+ palignr m2, m3, m9, 2
+ psrlw m8, 2
+ palignr m3, m9, 4
+ paddw m0, m9
+ paddw m1, m4
+ paddw m0, m2
+ pavgw m1, m3
+ paddw m0, m1
+ ret
+.w64:
+ movu m7, [tlq+64*0]
+ lea r3d, [hq-1]
+ movu m8, [tlq+64*1]
+ vpbroadcastw m11, [tlq+r3*2+128]
+ movu m9, [tlq+64*2]
+ cmp hd, 64
+ je .w64_h64
+ vpbroadcastw m13, r3d
+ or r3d, 64
+ pminuw m12, m13, [base+pw_0to31]
+ mova m10, m11
+ vpermw m9, m12, m9
+ test angled, 0x400
+ jnz .w64_main
+ call .w64_filter96
+ psrlw m0, 2
+ vpermw m9, m12, m0
+ vpermw m10, m13, m0
+ mova m11, m10
+ jmp .w64_main
+.w64_h64:
+ movu m10, [tlq+64*3]
+ or r3d, 64
+ test angled, 0x400
+ jnz .w64_main
+ call .w64_filter96
+ valignq m1, m10, m9, 6
+ valignq m3, m11, m10, 2
+ vpbroadcastd m11, [base+pw_63]
+ psrlw m9, m0, 2
+ palignr m0, m10, m1, 14
+ palignr m1, m10, m1, 12
+ palignr m2, m3, m10, 2
+ palignr m3, m10, 4
+ paddw m10, m0
+ paddw m1, m4
+ paddw m10, m2
+ pavgw m1, m3
+ paddw m10, m1
+ psrlw m10, 2
+ vpermw m11, m11, m10
+.w64_main:
+ rorx r2d, dxd, 23
+ mov r7, rsp
+ and rsp, ~63
+ vpbroadcastw m5, r2d
+ sub rsp, 64*6
+ mova [rsp+64*0], m7
+ mov r2d, dxd
+ mova [rsp+64*1], m8
+ lea r5, [rsp+r3*2]
+ mova [rsp+64*2], m9
+ shl r3d, 6
+ mova [rsp+64*3], m10
+ sub r2, r3
+ mova [rsp+64*4], m11
+ mova m6, m5
+ mova [rsp+64*5], m11
+.w64_loop:
+ mov r3, r2
+ sar r3, 6
+ movu m0, [r5+r3*2+64*0]
+ movu m2, [r5+r3*2+64*0+2]
+ movu m1, [r5+r3*2+64*1]
+ movu m3, [r5+r3*2+64*1+2]
+ pand m4, m15, m5
+ psubw m2, m0
+ pmulhrsw m2, m4
+ psubw m3, m1
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jz .w64_end
+ paddw m5, m6
+ add dstq, strideq
+ add r2, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+64*0], m11
+ mova [dstq+64*1], m11
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ mov rsp, r7
+ RET
+
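[Aside, not part of the patch] The ipred_z1_16bpc loops above all reduce to the same per-lane operation: xpos advances by dx, base_x = xpos >> 6 picks a pair of adjacent top-edge samples, and the low six bits weight them; the psubw/pmulhrsw/paddw sequence with frac << 9 is a rounded multiply by frac/64. A minimal scalar sketch in C, with a hypothetical function name, purely for illustration:

#include <stdint.h>

/* Illustrative scalar equivalent of the 16bpc z1 two-tap blend above.
 * pmulhrsw(diff, frac << 9) == (diff * frac + 32) >> 6, so each output is a
 * rounded linear interpolation between top[base] and top[base + 1]. */
static inline uint16_t z1_blend_16bpc(const uint16_t *top, int xpos)
{
    const int base = xpos >> 6; /* integer part of the x position  */
    const int frac = xpos & 63; /* 1/64th-pel fractional part      */
    return top[base] + (((top[base + 1] - top[base]) * frac + 32) >> 6);
}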
+cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea dxq, [dr_intra_derivative-90]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mov r7, dxq
+ sub dxq, dyq
+ movifnidn hd, hm
+ and dyd, ~1
+ vpbroadcastw m12, [tlq]
+ and dxq, ~1
+ movzx dyd, word [r7+dyq] ; angle - 90
+ lea r7, [z_filter_t0]
+ movzx dxd, word [dxq+270] ; 180 - angle
+ mova m0, [base+pw_31to0]
+ movsxd wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4]
+ movu m4, [tlq+2]
+ neg dyd
+ vpermw m7, m0, [tlq-64*1]
+ lea wq, [base+ipred_z2_16bpc_avx512icl_table+wq]
+ vpbroadcastd m14, [base+pw_31806]
+ vpbroadcastd m15, [base+pw_1]
+ jmp wq
+.w4:
+ movq xm3, [tlq]
+ vpbroadcastq m8, [base+pw_1to32]
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ pshuflw xm0, xm4, q3321
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ call .upsample_above
+ punpcklwd xm4, xm3, xm4
+ palignr xm3, xm4, xm12, 14
+ jmp .w4_main
+.w4_upsample_left:
+ call .upsample_left
+ movsldup m1, [base+z_xpos_mul]
+ paddw m1, m1
+ jmp .w4_main2
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ vpbroadcastd ym0, [base+pw_3]
+ sub angled, 1112 ; angle - 90
+ call .filter_above2
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ palignr xm3, xm4, xm12, 14
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ call .filter_left16
+.w4_main:
+ movsldup m1, [base+z_xpos_mul]
+ psllw m15, 3
+.w4_main2:
+ vpbroadcastq m0, [base+pw_1to32]
+ vpbroadcastw m11, dxd
+ movsldup m2, [base+z_xpos_mul]
+ vpbroadcastw m13, dyd
+ vpbroadcastd m5, [tlq-2]
+ psllw m10, m8, 6
+ valignq m5, m7, m5, 6
+ pmullw m2, m11
+ psubw m10, m2 ; xpos
+ pmullw m13, m0 ; ypos
+ palignr m5, m7, m5, 14
+ psrlw m12, m13, 6
+ psllw m13, 9
+ paddw m12, m1 ; base_y
+ pand m13, m14 ; frac_y << 9
+ psllw m11, 3
+ lea r5, [strideq*3]
+.w4_loop:
+ psrlw m1, m10, 6 ; base_x
+ pand m2, m14, m10 ; frac
+ vpermw m0, m1, m3 ; top[base_x]
+ vpermw m1, m1, m4 ; top[base_x+1]
+ vpmovw2m k1, m10 ; base_x < 0
+ psllw m2, 9
+ vpermw m0{k1}, m12, m5 ; left[base_y]
+ vpermw m1{k1}, m12, m7 ; left[base_y+1]
+ vmovdqu16 m2{k1}, m13
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r5 ], xm1
+ sub hd, 8
+ jl .w4_end
+ vextracti32x8 ym0, m0, 1
+ psubw m10, m11 ; base_x -= dx
+ lea dstq, [dstq+strideq*4]
+ paddw m12, m15 ; base_y++
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r5 ], xm1
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.upsample_above: ; w4/w8
+ mova ym9, [base+pw_1to32]
+ palignr xm1, xm4, xm12, 12
+ paddw xm3, xm4 ; b+c
+ xor angled, 0x7f ; 180 - angle
+ paddw xm0, xm1 ; a+d
+ vpbroadcastw xm1, r9m ; pixel_max
+ vpbroadcastb xm11, r3d
+ psubw xm0, xm3, xm0
+ vpbroadcastb xm2, angled
+ psraw xm0, 3
+ shr angled, 8
+ paddw xm3, xm0
+ pxor xm0, xm0
+ vpcmpeqb k2, xm11, [base+z_filter_wh]
+ pmaxsw xm3, xm0
+ add dxd, dxd
+ pavgw xm3, xm0
+ vpcmpgtb k2{k2}, xm2, [base+z_filter_t0+angleq*8]
+ pminsw xm3, xm1
+ paddw m8, m8
+ jmp .filter_left16b
+.upsample_left: ; h4/h8
+ lea r3d, [hq-1]
+ palignr xm2, xm7, xm12, 14
+ vpbroadcastw xm0, r3d
+ palignr xm1, xm7, xm12, 12
+ pminuw xm0, xm9
+ paddw xm2, xm7 ; b+c
+ vpermw xm0, xm0, xm7
+ add dyd, dyd
+ paddw xm0, xm1 ; a+d
+ vpbroadcastw xm1, r9m ; pixel_max
+ psubw xm0, xm2, xm0
+ psraw xm0, 3
+ paddw xm2, xm0
+ pxor xm0, xm0
+ pmaxsw xm2, xm0
+ pavgw xm2, xm0
+ pminsw xm2, xm1
+ punpckhwd xm0, xm2, xm7
+ punpcklwd xm7, xm2, xm7
+ vinserti32x4 ym7, xm0, 1
+ ret
+.filter_above:
+ sub angled, 90
+.filter_above2:
+ vpbroadcastb ym1, r3d
+ vpbroadcastb ym10, angled
+ mov r3d, angled
+ shr r3d, 8
+ vpcmpeqb k2, ym1, [base+z_filter_wh]
+ mova xm11, [base+z_filter_t0+r3*8]
+ vpcmpgtb k1{k2}, ym10, ym11
+ mova m9, [base+pw_1to32]
+ kmovd r3d, k1
+ test r3d, r3d
+ jz .filter_end
+ pminuw ym0, ym9
+ popcnt r3d, r3d
+ vpbroadcastd ym6, r7m ; max_w
+ kxnorw k1, k1, k1
+ vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0]
+ kaddw k1, k1, k1 ; ~1
+ vpbroadcastd ym13, [base+z_filter_k+(r3-1)*4+12*1]
+ vpermw ym2, ym0, ym4 ; +1
+ pmullw ym5, ym4
+ paddw ym1, ym2, ym3
+ vmovdqu16 m3{k1}, [tlq-2] ; -2
+ vpermw ym2, ym0, ym2 ; +2
+ vpbroadcastd ym0, [base+z_filter_k+(r3-1)*4+12*2]
+ pmullw ym1, ym13
+ movu m13, [base+pw_0to31]
+ paddw ym2, ym3
+ packssdw ym6, ym6
+ pmullw ym2, ym0
+ paddw ym1, ym5
+ vpcmpgtw k1, ym6, ym13
+ paddw ym1, ym2
+ pxor ym2, ym2
+ psrlw ym1, 3
+ pavgw ym4{k1}, ym1, ym2
+.filter_end:
+ ret
+.filter_left16:
+ vpbroadcastd ym1, [base+pb_90]
+ psubb ym1, ym10
+ vpcmpgtb k2{k2}, ym1, ym11
+.filter_left16b:
+ kmovd r3d, k2
+ test r3d, r3d
+ jz .filter_end
+ lea r5d, [hq-1]
+ vinserti32x4 ym0, ym12, xm7, 1
+ vpbroadcastw ym1, r5d
+ popcnt r3d, r3d
+ vpbroadcastd ym6, r8m ; max_h
+ pminuw ym9, ym1
+ vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0]
+ vpermw ym2, ym9, ym7 ; +1
+ vpbroadcastd ym10, [base+z_filter_k+(r3-1)*4+12*1]
+ palignr ym1, ym7, ym0, 14 ; -1
+ pmullw ym5, ym7
+ palignr ym0, ym7, ym0, 12 ; -2
+ paddw ym1, ym2
+ vpermw ym2, ym9, ym2 ; +2
+ vpbroadcastd ym9, [base+z_filter_k+(r3-1)*4+12*2]
+ pmullw ym1, ym10
+ paddw ym2, ym0
+ packssdw ym6, ym6
+ pmullw ym2, ym9
+ paddw ym1, ym5
+ vpcmpgtw k1, ym6, [base+pw_0to31]
+ paddw ym1, ym2
+ pxor ym2, ym2
+ psrlw ym1, 3
+ pavgw ym7{k1}, ym1, ym2
+ ret
+.filter_left:
+ cmp hd, 32
+ jl .filter_left16
+ vpbroadcastd m5, [base+pw_3]
+ pminud m0, m9, [base+pw_31] {1to16}
+.filter_left32:
+ vpbroadcastd m6, r8m ; max_h
+ valignq m2, m7, m12, 6
+ packssdw m6, m6
+ palignr m1, m7, m2, 14 ; -1
+ paddw m1, m7
+ palignr m2, m7, m2, 12 ; -2
+ vpcmpgtw k1, m6, m13
+ paddw m2, m5
+ cmp hd, 64
+ je .filter_left64
+ lea r3d, [hq-1]
+ vpbroadcastw m10, r3d
+ pminuw m0, m10
+ vpermw m10, m0, m7 ; +1
+ paddw m1, m10
+ vpermw m10, m0, m10 ; +2
+ pavgw m2, m10
+ paddw m1, m2
+ vpsrlw m7{k1}, m1, 2
+ ret
+.filter_left64:
+ valignq m10, m8, m7, 2
+ vpaddd m13, [base+pw_32] {1to16}
+ palignr m11, m10, m7, 2 ; +1
+ paddw m1, m11
+ palignr m11, m10, m7, 4 ; +2
+ valignq m10, m8, m7, 6
+ pavgw m11, m2
+ vpermw m2, m0, m8 ; 32+1
+ paddw m1, m11
+ vpsrlw m7{k1}, m1, 2
+ palignr m1, m8, m10, 14 ; 32-1
+ paddw m1, m8
+ palignr m10, m8, m10, 12 ; 32-2
+ paddw m1, m2
+ vpermw m2, m0, m2 ; 32+2
+ paddw m10, m5
+ vpcmpgtw k1, m6, m13
+ pavgw m2, m10
+ paddw m1, m2
+ vpsrlw m8{k1}, m1, 2
+ ret
+.w8:
+ mova xm3, [tlq]
+ vbroadcasti32x4 m8, [base+pw_1to32]
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ psrldq xm0, xm4, 2
+ sub angled, 53
+ pshufhw xm0, xm0, q2210
+ lea r3d, [hq+7]
+ call .upsample_above
+ punpcklwd xm0, xm3, xm4
+ punpckhwd xm4, xm3, xm4
+ vinserti32x4 ym3, ym12, xm0, 1
+ vinserti32x4 ym4, ym0, xm4, 1
+ palignr ym3, ym4, ym3, 14
+ jmp .w8_main
+.w8_upsample_left:
+ call .upsample_left
+ movshdup m1, [base+z_xpos_mul]
+ psllw m15, 3
+ paddw m1, m1
+ jmp .w8_main2
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ vpbroadcastd ym0, [base+pw_7]
+ call .filter_above
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ palignr xm3, xm4, xm12, 14
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ call .filter_left
+.w8_main:
+ movshdup m1, [base+z_xpos_mul]
+ psllw m15, 2
+.w8_main2:
+ vbroadcasti32x4 m0, [base+pw_1to32]
+ vpbroadcastw m11, dxd
+ movshdup m2, [base+z_xpos_mul]
+ vpbroadcastw m13, dyd
+ psllw m10, m8, 6
+ valignq m5, m7, m12, 6
+ pmullw m2, m11
+ psubw m10, m2 ; xpos
+ pmullw m13, m0 ; ypos
+ palignr m5, m7, m5, 14
+ psrlw m12, m13, 6
+ psllw m13, 9
+ mov r2d, 1<<6
+ paddw m12, m1 ; base_y
+ lea r3d, [dxq-(8<<6)] ; left-only threshold
+ pand m13, m14 ; frac_y << 9
+ shl dxd, 2
+ psllw m11, 2
+ lea r5, [strideq*3]
+.w8_loop:
+ psrlw m1, m10, 6
+ pand m2, m14, m10
+ vpermw m0, m1, m3
+ vpermw m1, m1, m4
+ psllw m2, 9
+ sub r2d, dxd
+ jge .w8_toponly
+ vpmovw2m k1, m10
+ vpermw m0{k1}, m12, m5
+ vpermw m1{k1}, m12, m7
+ vmovdqu16 m2{k1}, m13
+.w8_toponly:
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r5 ], m0, 3
+ sub hd, 4
+ jz .w8_end
+ psubw m10, m11 ; base_x -= dx
+ lea dstq, [dstq+strideq*4]
+ paddw m12, m15 ; base_y++
+ cmp r2d, r3d
+ jge .w8_loop
+.w8_leftonly_loop:
+ vpermw m0, m12, m5
+ vpermw m1, m12, m7
+ psubw m1, m0
+ pmulhrsw m1, m13
+ paddw m12, m15
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r5 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_leftonly_loop
+.w8_end:
+ RET
+.w16:
+ mova ym3, [tlq]
+ vpermw m8, m0, [tlq-64*2]
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ vpbroadcastd ym0, [base+pw_15]
+ call .filter_above
+ call .filter_left
+ vinserti32x4 ym3, ym12, xm4, 1
+ palignr ym3, ym4, ym3, 14
+.w16_main:
+ vbroadcasti32x8 m0, [base+pw_1to32]
+ vpbroadcastw m11, dxd
+ vpbroadcastw m13, dyd
+ kxnorw k2, k2, k2
+ psllw m10, m0, 6
+ valignq m5, m7, m12, 6
+ psubw m10, m11 ; xpos
+ valignq m6, m8, m7, 6
+ pmullw m13, m0 ; ypos
+ knotd k1, k2
+ palignr m5, m7, m5, 14
+ palignr m6, m8, m6, 14
+ vpsubw m10{k1}, m11
+ psrlw m12, m13, 6
+ psllw m13, 9
+ mov r2d, 1<<6
+ vpsubw m12{k2}, m15 ; base_y
+ pand m13, m14 ; frac_y << 9
+ lea r3d, [dxq-(16<<6)]
+ paddw m11, m11
+ add dxd, dxd
+ paddw m15, m15
+.w16_loop:
+ psrlw m1, m10, 6
+ pand m2, m14, m10
+ vpermw m0, m1, m3
+ vpermw m1, m1, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m12, m15 ; base_y++
+ paddw m0, m1
+ sub r2d, dxd
+ jge .w16_toponly
+ mova m1, m5
+ vpermt2w m1, m12, m6
+ mova m2, m7
+ vpermt2w m2, m12, m8
+ vpmovw2m k1, m10
+ psubw m2, m1
+ pmulhrsw m2, m13
+ vpaddw m0{k1}, m1, m2
+.w16_toponly:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ psubw m10, m11 ; base_x -= dx
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r3d
+ jge .w16_loop
+ paddw m12, m15
+ vpermt2w m5, m12, m6
+ mova m1, m7
+ vpermt2w m1, m12, m8
+ jmp .w16_leftonly_loop_start
+.w16_leftonly_loop:
+ mova m1, m7
+ vpermt2w m1, m12, m8
+ vshufi32x4 m5, m1, q1032
+.w16_leftonly_loop_start:
+ psubw m0, m1, m5
+ pmulhrsw m0, m13
+ paddw m12, m15
+ paddw m0, m5
+ mova m5, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_leftonly_loop
+.w16_end:
+ RET
+.w32:
+ mova m3, [tlq]
+ vpermw m8, m0, [tlq-64*2]
+ mova m9, [base+pw_1to32]
+ test angled, 0x400
+ jnz .w32_main
+ pminud m0, m9, [base+pw_31] {1to16}
+ mov r3d, ~1
+ kmovd k1, r3d
+ vpbroadcastd m5, [base+pw_3]
+ vpbroadcastd m6, r6m ; max_w
+ vpermw m2, m0, m4 ; +1
+ movu m13, [base+pw_0to31]
+ paddw m1, m4, m3
+ vmovdqu16 m3{k1}, [tlq-2] ; -2
+ packssdw m6, m6
+ paddw m1, m2
+ vpermw m2, m0, m2 ; +2
+ paddw m3, m5
+ vpcmpgtw k1, m6, m13
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m4{k1}, m1, 2
+ call .filter_left32
+.w32_main:
+ sub rsp, 64*2
+ call .w32_main1
+ add rsp, 64*2
+ RET
+.w32_main1:
+ vpbroadcastw m11, dxd
+ movu [rsp+64], m4
+ vpbroadcastw m4, dyd
+ movd [rsp+60], xm12
+ valignq m5, m7, m12, 6
+ psllw m3, m9, 6 ; xpos
+ valignq m6, m8, m7, 6
+ pmullw m9, m4 ; ypos
+ palignr m5, m7, m5, 14
+ mov r2d, 33<<6
+ palignr m6, m8, m6, 14
+ mova m10, m3
+.w32_main2:
+ psllw m13, m9, 9
+ sub r2d, dxd
+ psrlw m12, m9, 6 ; base_y
+ mov r8d, hd
+ pand m13, m14 ; frac_y << 9
+.w32_loop:
+ mov r3d, r2d
+ shr r3d, 6
+ psubw m10, m11 ; base_x -= dx
+ movu m0, [rsp+r3*2-2]
+ pand m2, m10, m14 ; frac_x
+ movu m1, [rsp+r3*2]
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m12, m15 ; base_y++
+ paddw m0, m1
+ cmp r2d, 32<<6
+ jge .w32_toponly
+ mova m1, m5
+ vpermt2w m1, m12, m6
+ mova m2, m7
+ vpermt2w m2, m12, m8
+ vpmovw2m k1, m10
+ psubw m2, m1
+ pmulhrsw m2, m13
+ vpaddw m0{k1}, m1, m2
+.w32_toponly:
+ mova [dstq], m0
+ dec r8d
+ jz .w32_end
+ add dstq, strideq
+ sub r2d, dxd
+ jge .w32_loop
+ paddw m12, m15
+ mova m2, m5
+ vpermt2w m2, m12, m6
+.w32_leftonly_loop:
+ mova m1, m7
+ vpermt2w m1, m12, m8
+ psubw m0, m1, m2
+ pmulhrsw m0, m13
+ paddw m12, m15
+ paddw m0, m2
+ mova m2, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec r8d
+ jg .w32_leftonly_loop
+.w32_end:
+ ret
+.w64:
+ movu m3, [tlq+66]
+ vpermw m8, m0, [tlq-64*2]
+ mova m9, [base+pw_1to32]
+ test angled, 0x400
+ jnz .w64_main
+ mova m2, [tlq] ; -1
+ mov r3d, ~1
+ vpbroadcastd m5, [base+pw_3]
+ kmovd k1, r3d
+ movu m13, [base+pw_0to31]
+ vpbroadcastd m6, r6m ; max_w
+ pminud m0, m9, [base+pw_31] {1to16}
+ paddw m1, m4, m2
+ vmovdqu16 m2{k1}, [tlq-2] ; -2
+ packssdw m6, m6
+ paddw m1, [tlq+4] ; +1
+ paddw m2, m5
+ vpcmpgtw k1, m6, m13
+ pavgw m2, [tlq+6] ; +2
+ paddw m1, m2
+ vpermw m2, m0, m3 ; 32+1
+ psrlw m4{k1}, m1, 2
+ paddw m1, m3, [tlq+64] ; 32-1
+ vpaddd m11, m13, [base+pw_32] {1to16}
+ paddw m1, m2
+ vpermw m2, m0, m2 ; 32+2
+ paddw m10, m5, [tlq+62] ; 32-2
+ vpcmpgtw k1, m6, m11
+ pavgw m2, m10
+ paddw m1, m2
+ psrlw m3{k1}, m1, 2
+ call .filter_left32
+.w64_main:
+ sub rsp, 64*3
+ movu [rsp+64*2-gprsize], m3
+ mov r5, dstq
+ call .w32_main1
+ psllw m4, 5
+ mov r2d, 65<<6
+ vpaddd m10, m3, [base+pw_2048] {1to16} ; xpos
+ lea dstq, [r5+64]
+ paddw m9, m4 ; ypos
+ call .w32_main2
+ add rsp, 64*3
+ RET
+
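[Aside, not part of the patch] ipred_z2_16bpc above blends both edges: lanes whose x position is still non-negative interpolate along the top edge, while lanes where base_x has gone negative (the vpmovw2m sign-bit mask) switch to the left edge with their own base_y/frac_y. A rough scalar sketch of that per-lane selection, assuming hypothetical names and ignoring the upsample/filter paths:

#include <stdint.h>

/* Rough scalar view of the per-lane selection in the z2 loops above:
 * the sign of xpos decides which edge supplies the sample pair. */
static inline uint16_t z2_blend_16bpc(const uint16_t *top, const uint16_t *left,
                                      int xpos, int ypos)
{
    int a, b, frac;
    if (xpos >= 0) {              /* base_x >= 0: top edge  */
        a = top[xpos >> 6];
        b = top[(xpos >> 6) + 1];
        frac = xpos & 63;
    } else {                      /* base_x < 0: left edge  */
        a = left[ypos >> 6];
        b = left[(ypos >> 6) + 1];
        frac = ypos & 63;
    }
    return (uint16_t)(a + (((b - a) * frac + 32) >> 6));
}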
+cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative+45*2-1]
+ movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ mova m0, [base+pw_31to0]
+ movzx dyd, word [t0+dyq]
+ lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ vpbroadcastd m14, [base+pw_31806]
+ vpbroadcastd m15, [base+pw_1]
+ jmp wq
+.w4:
+ lea r3d, [hq+3]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m7, r3d
+ pmaxuw m7, m0
+ vpermw m6, m7, [tlq-64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ cmp angleb, 40
+ jae .w4_filter
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_filter ; h > 8 || (h == 8 && is_sm)
+ call .upsample
+ movsldup m1, [base+z_ypos_mul]
+ paddw m1, m1
+ jmp .w4_main2
+.w4_filter:
+ lea r3d, [hq+3]
+ call .filter32
+.w4_main:
+ movsldup m1, [base+z_ypos_mul]
+.w4_main2:
+ vpbroadcastq m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ lea r2d, [hq+4]
+ shr r2d, 3
+ pmullw m4, m0 ; ypos
+ vpbroadcastw m0, r2d
+ imul r2, strideq ; stride * imax(height / 8, 1)
+ pmullw m1, m0
+ lea r3, [r2*3]
+ paddd m1, [base+pw_32736] {1to16}
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ vpermw m3, m2, m6 ; left[base+0]
+.w4_loop:
+ paddsw m2, m15 ; base+1
+ vpermw m1, m2, m6 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ movq [dstq+r2*0], xm0
+ movhps [dstq+r2*1], xm0
+ vextracti32x4 xm3, ym0, 1
+ movq [dstq+r2*2], xm3
+ movhps [dstq+r3 ], xm3
+ sub hd, 8
+ jl .w4_end
+ lea r5, [dstq+r2*4]
+ vextracti32x8 ym0, m0, 1
+ mova m3, m1
+ movq [r5+r2*0], xm0
+ movhps [r5+r2*1], xm0
+ vextracti32x4 xm1, ym0, 1
+ movq [r5+r2*2], xm1
+ movhps [r5+r3 ], xm1
+ add dstq, strideq
+ test hd, hd
+ jnz .w4_loop
+.w4_end:
+ RET
+.upsample:
+ vinserti32x4 m6, [tlq-14], 3
+ mova m3, [base+z_upsample]
+ vpbroadcastd m4, [base+pd_65536]
+ add dyd, dyd
+ vpermw m0, m3, m6
+ paddw m3, m4
+ vpermw m1, m3, m6
+ paddw m3, m4
+ vpermw m2, m3, m6
+ paddw m3, m4
+ vpermw m3, m3, m6
+ vpbroadcastw m6, r9m ; pixel_max
+ paddw m1, m2 ; b+c
+ paddw m0, m3 ; a+d
+ psubw m0, m1, m0
+ psraw m0, 3
+ pxor m2, m2
+ paddw m0, m1
+ pmaxsw m0, m2
+ pavgw m0, m2
+ pminsw m6, m0
+ ret
+.w8:
+ mova m6, [tlq-64*1]
+ cmp hd, 32
+ je .w8_h32
+ mov r3d, 8
+ cmp hd, 4
+ cmove r3d, hd
+ lea r3d, [r3+hq-1]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m1, r3d
+ vpermw m7, m1, m6
+ pmaxuw m1, m0
+ vpermw m6, m1, m6
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_filter ; is_sm || d >= 40 || h > 8
+ call .upsample
+ movshdup m1, [base+z_ypos_mul]
+ paddw m1, m1
+ call .w8_main_setup
+.w8_upsample_loop:
+ vpermw m3, m2, m6 ; left[base+0]
+ paddw m2, m15 ; base+1
+ vpermw m1, m2, m6 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m2, m15 ; base+2
+ paddw m0, m3
+ mova m3, m1
+ mova [dstq+r2*0], xm0
+ vextracti32x4 [dstq+r2*1], ym0, 1
+ vextracti32x4 [dstq+r2*2], m0, 2
+ vextracti32x4 [dstq+r3 ], m0, 3
+ add dstq, strideq
+ sub hd, 4
+ jg .w8_upsample_loop
+ RET
+.w8_main_setup:
+ vbroadcasti32x4 m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ rorx r2d, hd, 2
+ pmullw m4, m0 ; ypos
+ vpbroadcastw m0, r2d
+ imul r2, strideq ; stride * height / 4
+ lea r3, [r2*3]
+ pmullw m1, m0 ; 0 1 2 3
+ paddd m1, [base+pw_32704] {1to16}
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ ret
+.w8_h32:
+ pmaxud m7, m0, [base+pw_24] {1to16}
+ vpermw m6, m0, m6
+ vpermw m7, m7, [tlq-64*2]
+ test angled, 0x400
+ jnz .w8_main
+ call .filter64
+ vpbroadcastd m0, [base+pw_7]
+ pminuw m0, [base+pw_0to31]
+ vpermw m7, m0, m7
+ jmp .w8_main
+.w8_filter:
+ lea r3d, [hq+7]
+ call .filter32
+.w8_main:
+ movshdup m1, [base+z_ypos_mul]
+ call .w8_main_setup
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+0]
+.w8_loop:
+ paddsw m2, m15 ; base+1
+ mova m1, m6
+ vpermt2w m1, m2, m7 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ mova m3, m1
+ mova [dstq+r2*0], xm0
+ vextracti32x4 [dstq+r2*1], ym0, 1
+ vextracti32x4 [dstq+r2*2], m0, 2
+ vextracti32x4 [dstq+r3 ], m0, 3
+ add dstq, strideq
+ sub hd, 4
+ jg .w8_loop
+ RET
+.filter32:
+ vpbroadcastb ym10, r3d
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym10, [base+z_filter_wh]
+ mova xm2, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym2
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .filter32_end
+ vpbroadcastw m2, [tlq]
+ popcnt r5d, r5d
+ vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0]
+ valignq m2, m6, m2, 6
+ vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
+ valignq m4, m7, m6, 2
+ vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
+ palignr m1, m6, m2, 14
+ pmullw m5, m6
+ palignr m3, m4, m6, 2
+ paddw m1, m3
+ palignr m2, m6, m2, 12
+ pmullw m1, m8
+ palignr m4, m6, 4
+ paddw m2, m4
+ pmullw m2, m9
+ pmovzxbw m10, ym10
+ pxor m6, m6
+ paddw m5, m1
+ pminuw m1, m10, [base+pw_0to31]
+ paddw m5, m2
+ psrlw m5, 3
+ pavgw m6, m5
+ vpermw m7, m10, m6
+ vpermw m6, m1, m6
+.filter32_end:
+ ret
+.w16:
+ mova m6, [tlq-64*1]
+ cmp hd, 32
+ jl .w16_h16
+ pmaxud m8, m0, [base+pw_16] {1to16}
+ mova m7, [tlq-64*2]
+ vpermw m6, m0, m6
+ jg .w16_h64
+ vpermw m7, m8, m7
+ test angled, 0x400
+ jnz .w16_main
+ call .filter64
+ vpbroadcastd m0, [base+pw_15]
+ vinserti32x8 m0, [base+pw_0to31], 0
+ vpermw m7, m0, m7
+ jmp .w16_main
+.w16_h16:
+ lea r3d, [hq*2-1]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m1, r3d
+ vpermw m7, m1, m6
+ pmaxuw m1, m0
+ vpermw m6, m1, m6
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ call .filter32
+.w16_main:
+ vbroadcasti32x8 m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ rorx r2d, hd, 1
+ pmullw m4, m0 ; ypos
+ vpbroadcastw ym1, r2d
+ imul r2, strideq ; stride * height / 2
+ paddd m1, [base+pw_32704] {1to16}
+ lea r3, [r2+strideq]
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+0]
+.w16_loop:
+ paddsw m1, m2, m15 ; base+1
+ paddsw m2, m1, m15 ; base+2
+ vpermi2w m1, m6, m7 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+2]
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+r2 ], ym0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+r3 ], ym0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w16_h64:
+ vpermw m7, m0, m7
+ vpermw m8, m8, [tlq-64*3]
+ test angled, 0x400
+ jnz .w16_h64_main
+ valignq m11, m8, m7, 6
+ call .filter64
+ vshufi32x4 m2, m8, m8, q3321
+ vpbroadcastd m0, [base+pw_15]
+ palignr ym3, ym8, ym11, 12
+ vinserti32x8 m0, [base+pw_0to31], 0
+ palignr ym4, ym8, ym11, 14
+ palignr ym1, ym2, ym8, 4
+ paddw ym3, ym5
+ palignr ym2, ym8, 2
+ paddw ym8, ym4
+ pavgw ym3, ym1
+ paddw ym8, ym2
+ paddw ym8, ym3
+ psrlw ym8, 2
+ vpermw m8, m0, m8
+.w16_h64_main:
+ vbroadcasti32x8 m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ pmullw m4, m0 ; ypos
+ vpbroadcastd ym1, [base+pw_32]
+ paddd m1, [base+pw_32672] {1to16}
+ mov r2, strideq
+ shl r2, 5 ; stride*32
+ vpbroadcastd m9, [base+pw_32735]
+ lea r3, [r2+strideq]
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+0]
+.w16_h64_loop:
+ paddsw m2, m15 ; base+1
+ mova m1, m7
+ vpermt2w m1, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m1{k1}, m2, m8 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddsw m2, m15 ; base+2
+ paddw m0, m3
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+2]
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+r2 ], ym0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+r3 ], ym0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w16_h64_loop
+ RET
+.filter64:
+ vpbroadcastw m2, [tlq]
+ vpbroadcastd m5, [base+pw_3]
+ valignq m2, m6, m2, 6
+ valignq m4, m7, m6, 2
+ valignq m10, m7, m6, 6
+ palignr m1, m6, m2, 12
+ palignr m2, m6, m2, 14
+ palignr m3, m4, m6, 4
+ paddw m1, m5
+ palignr m4, m6, 2
+ paddw m6, m2
+ valignq m2, m8, m7, 2
+ pavgw m1, m3
+ palignr m3, m7, m10, 12
+ paddw m6, m4
+ palignr m4, m7, m10, 14
+ paddw m6, m1
+ palignr m1, m2, m7, 4
+ psrlw m6, 2
+ palignr m2, m7, 2
+ paddw m3, m5
+ paddw m7, m4
+ pavgw m3, m1
+ paddw m7, m2
+ paddw m7, m3
+ psrlw m7, 2
+ ret
+.w32:
+ mova m6, [tlq-64*1]
+ cmp hd, 32
+ jl .w32_h16
+ mova m8, [tlq-64*2]
+ vpermw m6, m0, m6
+ vpermw m7, m0, m8
+ jg .w32_h64
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastw xm8, xm8
+ jmp .w32_filter
+.w32_h16:
+ lea r3d, [hq*2-1]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m1, r3d
+ vpermw m7, m1, m6
+ pmaxuw m1, m0
+ vpermw m6, m1, m6
+ test angled, 0x400
+ jnz .w32_main
+ vextracti32x4 xm8, m7, 3
+.w32_filter:
+ call .filter64
+.w32_main:
+ vpbroadcastw m4, dyd
+ vpbroadcastd m1, [base+pw_32704]
+ pmullw m4, [base+pw_1to32] ; ypos
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+0]
+.w32_loop:
+ paddsw m1, m2, m15 ; base+1
+ paddsw m2, m1, m15 ; base+2
+ vpermi2w m1, m6, m7 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+2]
+ mova [dstq+strideq*0], m0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w32_h64:
+ mova m9, [tlq-64*3]
+ vpermw m8, m0, m9
+ test angled, 0x400
+ jnz .w32_h64_main
+ vpbroadcastw xm9, xm9
+ call .filter96
+.w32_h64_main:
+ vpbroadcastw m4, dyd
+ vpbroadcastd m1, [base+pw_32672]
+ pmullw m4, [base+pw_1to32] ; ypos
+ vpbroadcastd m9, [base+pw_32735]
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+0]
+.w32_h64_loop:
+ paddsw m2, m15 ; base+1
+ mova m1, m7
+ vpermt2w m1, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m1{k1}, m2, m8 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddsw m2, m15 ; base+2
+ paddw m0, m3
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+2]
+ mova [dstq+strideq*0], m0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_h64_loop
+ RET
+.filter96:
+ valignq m11, m8, m7, 6
+ call .filter64
+ valignq m2, m9, m8, 2
+ palignr m3, m8, m11, 12
+ palignr m4, m8, m11, 14
+ palignr m1, m2, m8, 4
+ paddw m3, m5
+ palignr m2, m8, 2
+ paddw m8, m4
+ pavgw m3, m1
+ paddw m8, m2
+ paddw m8, m3
+ psrlw m8, 2
+ ret
+.w64:
+ mova m7, [tlq-64*1]
+ vpermw m6, m0, m7
+ cmp hd, 32
+ jl .w64_h16
+ mova m8, [tlq-64*2]
+ vpermw m7, m0, m8
+ jg .w64_h64
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastw m8, xm8
+ mova m9, m8
+ call .filter96
+ vshufi32x4 m9, m8, m8, q3333
+ jmp .w64_h64_main
+.w64_h16:
+ vpbroadcastw m7, xm7
+ test angled, 0x400
+ jnz .w64_main
+ mova m8, m7
+ call .filter64
+.w64_main:
+ vpbroadcastw m11, dyd
+ vpbroadcastd m1, [base+pw_32704]
+ pmullw m10, m11, [base+pw_1to32] ; ypos
+ psllw m11, 5
+ psrlw m8, m10, 6
+ paddw m11, m10
+ psllw m10, 9
+ psrlw m9, m11, 6
+ psllw m11, 9
+ psubw m9, m8
+ paddsw m8, m1 ; base+0
+ vpandd m10, m14 ; frac << 9
+ vpandd m11, m14 ; frac << 9
+ mova m4, m6
+ vpermt2w m4, m8, m7 ; left[base+0] ( 0..31)
+ paddsw m5, m8, m9
+ vpermi2w m5, m6, m7 ; left[base+0] (32..63)
+.w64_loop:
+ paddsw m8, m15 ; base+1 ( 0..31)
+ mova m2, m6
+ vpermt2w m2, m8, m7 ; left[base+1] ( 0..31)
+ paddsw m3, m8, m9 ; base+1 (32..63)
+ vpermi2w m3, m6, m7 ; left[base+1] (32..63)
+ psubw m0, m2, m4
+ psubw m1, m3, m5
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ paddw m0, m4
+ paddw m1, m5
+ mova m4, m2
+ mova [dstq+64*0], m0
+ mova m5, m3
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w64_h64:
+ vpermw m8, m0, [tlq-64*3]
+ mova m13, [tlq-64*4]
+ vpermw m9, m0, m13
+ test angled, 0x400
+ jnz .w64_h64_main
+ valignq m12, m9, m8, 6
+ call .filter96
+ vpbroadcastw xm2, xm13
+ valignq m2, m9, 2
+ palignr m3, m9, m12, 12
+ palignr m4, m9, m12, 14
+ palignr m1, m2, m9, 4
+ paddw m3, m5
+ palignr m2, m9, 2
+ paddw m9, m4
+ pavgw m3, m1
+ paddw m9, m2
+ paddw m9, m3
+ psrlw m9, 2
+.w64_h64_main:
+ vpbroadcastw m11, dyd
+ vpbroadcastd m1, [base+pw_32640]
+ pmullw m10, m11, [base+pw_1to32] ; ypos
+ psllw m11, 5
+ psrlw m12, m10, 6
+ paddw m11, m10
+ psllw m10, 9
+ psrlw m13, m11, 6
+ psllw m11, 9
+ psubw m13, m12
+ paddsw m12, m1 ; base+0
+ vpandd m10, m14 ; frac << 9
+ vpandd m11, m14 ; frac << 9
+ vpbroadcastd m14, [base+pw_64]
+ mova m4, m6
+ vpermt2w m4, m12, m7
+ vptestmw k1, m12, m14
+ mova m0, m8
+ vpermt2w m0, m12, m9
+ paddsw m1, m12, m13
+ mova m5, m6
+ vpermt2w m5, m1, m7
+ vptestmw k2, m1, m14
+ vpermi2w m1, m8, m9
+ vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31)
+ vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63)
+.w64_h64_loop:
+ paddsw m12, m15 ; base+1
+ mova m2, m6
+ vpermt2w m2, m12, m7
+ vptestmw k1, m12, m14
+ mova m0, m8
+ vpermt2w m0, m12, m9
+ paddsw m1, m12, m13
+ mova m3, m6
+ vpermt2w m3, m1, m7
+ vptestmw k2, m1, m14
+ vpermi2w m1, m8, m9
+ vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31)
+ vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63)
+ psubw m0, m2, m4
+ psubw m1, m3, m5
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ paddw m0, m4
+ paddw m1, m5
+ mova m4, m2
+ mova [dstq+64*0], m0
+ mova m5, m3
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_h64_loop
+ RET
+
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_16bpc_avx512icl_table]
tzcnt wd, wm
diff --git a/src/x86/ipred_avx2.asm b/src/x86/ipred_avx2.asm
index 95802c7..58e4093 100644
--- a/src/x86/ipred_avx2.asm
+++ b/src/x86/ipred_avx2.asm
@@ -2275,14 +2275,14 @@ ALIGN function_align
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
punpckhqdq xm3, xm3 ; 34 44 44 44
pmaddubsw xm3, xm4
- movd xm4, r6m ; max_width
- pminsw xm4, xm15
- vpbroadcastb xm4, xm4
+ vpbroadcastd xm4, r6m ; max_width
+ packssdw xm4, xm4
paddw xm0, xm2
paddw xm0, xm3
pmulhrsw xm0, xm13
- psubb xm4, [base+pb_1to32]
+ packsswb xm4, xm4
psrlq xm1, 8
+ psubb xm4, [base+pb_1to32]
packuswb xm0, xm0
vpblendvb xm0, xm1, xm4
movd [rsp+65], xm0
@@ -2324,14 +2324,14 @@ ALIGN function_align
vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2]
pshufb m2, m4
pmaddubsw m2, m3
- movd xm4, r7m ; max_height
- pminsw xm4, xm15
- vpbroadcastb xm4, xm4
- psubb xm4, [base+pb_16to1]
+ vpbroadcastd xm4, r7m ; max_height
+ packssdw xm4, xm4
paddw m1, m0
paddw m1, m2
pmulhrsw m1, m13
+ packsswb xm4, xm4
vextracti128 xm0, m1, 1
+ psubb xm4, [base+pb_16to1]
packuswb xm0, xm1
vpblendvb xm0, [rsp+48], xm4
mova [rsp+48], xm0
@@ -2465,14 +2465,14 @@ ALIGN function_align
pmaddubsw xm2, xm4
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
pmaddubsw xm3, xm4
- movd xm4, r6m ; max_width
- pminuw xm4, xm15
- vpbroadcastb xm4, xm4
+ vpbroadcastd xm4, r6m ; max_width
+ packssdw xm4, xm4
paddw xm0, xm2
paddw xm0, xm3
pmulhrsw xm0, xm13
- psubb xm4, [base+pb_1to32]
+ packsswb xm4, xm4
psrldq xm1, 1
+ psubb xm4, [base+pb_1to32]
packuswb xm0, xm0
vpblendvb xm0, xm1, xm4
movq [rsp+65], xm0
@@ -2530,14 +2530,14 @@ ALIGN function_align
vinserti128 m2, [rsp+43], 1
pshufb m0, m2, m0
pmaddubsw m0, m7
- movd xm7, r7m ; max_height
+ vpbroadcastd m7, r7m ; max_height
pshufb m1, m2, m1
pmaddubsw m1, m8
pshufb m2, m4
pmaddubsw m2, m9
- pminsw xm7, xm15
+ packssdw m7, m7
paddw m1, m0
- vpbroadcastb m7, xm7
+ packsswb m7, m7
paddw m1, m2
pmulhrsw m1, m13
psubb m7, [base+pb_32to1]
@@ -2679,14 +2679,14 @@ ALIGN function_align
shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff
pmaddubsw m2, m4
pmaddubsw m1, m5
- movd xm4, r6m ; max_width
- pminsw xm4, xm15
- vpbroadcastb xm4, xm4
+ vpbroadcastd xm4, r6m ; max_width
+ packssdw xm4, xm4
paddw m0, m2
paddw m0, m1
pmulhrsw m0, m13
- psubb xm4, [base+pb_1to32]
+ packsswb xm4, xm4
vextracti128 xm2, m0, 1
+ psubb xm4, [base+pb_1to32]
packuswb xm0, xm2
vpblendvb xm0, xm6, xm4
movu [rsp+65], xm0
@@ -2703,9 +2703,9 @@ ALIGN function_align
vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
.w16_filter_left:
- movd xm6, r7m ; max_height
- pminsw xm6, xm15
- vpbroadcastb m6, xm6
+ vpbroadcastd m6, r7m ; max_height
+ packssdw m6, m6
+ packsswb m6, m6
cmp hd, 32
jl .w16_filter_left_h16
vpbroadcastd xm0, [base+pb_5]
@@ -2916,9 +2916,9 @@ ALIGN function_align
vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff
movu xm3, [tlq+ 6]
vinserti128 m3, [tlq+17], 1
- movd xm0, r6m ; max_width
- pminsw xm0, xm15
- vpbroadcastb m10, xm0
+ vpbroadcastd m10, r6m ; max_width
+ packssdw m10, m10
+ packsswb m10, m10
.w32_filter_above:
pshufb m0, m1, m5
shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de
@@ -2974,20 +2974,20 @@ ALIGN function_align
paddw m0, m3
movu xm2, [tlq+36]
vinserti128 m2, [tlq+49], 1
+ vpbroadcastd m10, r6m ; max_width
pshufb m4, m2, m4
pmaddubsw m4, m7
pshufb m3, m2, m6
pmaddubsw m3, m8
pshufb m2, m5
pmaddubsw m2, m9
- movd xm5, r6m ; max_width
- pminsw xm5, xm15
- vpbroadcastb m10, xm5
+ packssdw m10, m10
paddw m3, m4
paddw m2, m3
vpbroadcastd m3, [base+pb_32]
pmulhrsw m0, m13
pmulhrsw m2, m13
+ packsswb m10, m10
mova xm5, [base+z_filter_s]
vinserti128 m5, [base+z_filter_s+6], 1
psubb m3, m10, m3
diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm
index 4aeb14e..de953de 100644
--- a/src/x86/ipred_avx512.asm
+++ b/src/x86/ipred_avx512.asm
@@ -97,16 +97,97 @@ ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
+ db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
+ db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+ db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14
+ db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
+ db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
+ db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62
+z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6
+ db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
+ db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
+ db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
+z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
+ db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
+ db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
+ db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
+z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8
+z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9
+z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
+z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
+z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
+z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+ db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
+ db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
+ db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
+z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+ db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
+ db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
+ db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
+z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8
+ dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
+z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
+ db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
+ db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
+ db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
+z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0
+ db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1
+ db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2
+ db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3
+z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1
+ db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3
+ db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5
+ db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7
+z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24
+ dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
+z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
+ dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
+z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512
+ dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
+ dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
+ dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
+z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
+ dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
+ dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
+ dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+ db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 79, 79, 79
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
+ db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0
+ db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16
+pb_8_56_0_0: db 8, 56, 0, 0
+pb_m4_36: times 2 db -4, 36
pb_127_m127: times 2 db 127, -127
+pb_8: times 4 db 8
+pb_15: times 4 db 15
+pb_16: times 4 db 16
+pb_31: times 4 db 31
+pb_63: times 4 db 63
+pb_90: times 4 db 90
pb_128: times 4 db 128
pw_128: times 2 dw 128
pw_255: times 2 dw 255
+pw_512: times 2 dw 512
-%define pb_1 (ipred_h_shuf+24)
-%define pb_2 (ipred_h_shuf+20)
-%define pb_3 (ipred_h_shuf+16)
-%define pd_8 (filter_taps+128)
+%define pb_1 (ipred_h_shuf+24)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+16)
+%define pb_4 (smooth_shuf +48)
+%define pb_7 (ipred_h_shuf+ 0)
+%define pb_9 (z_xpos_bc + 8)
+%define pb_17 (z_xpos_bc + 0)
+%define pb_33 (z_xpos_bc + 4)
+%define pd_8 (filter_taps+128)
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@@ -125,10 +206,16 @@ JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
+cextern dr_intra_derivative
+cextern pb_0to63
+
SECTION .text
INIT_ZMM avx512icl
@@ -1200,6 +1287,1612 @@ cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
jg .w64
RET
+%if WIN64
+ DECLARE_REG_TMP 4
+%else
+ DECLARE_REG_TMP 8
+%endif
+
+cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
+%define base r7-z_filter_t0
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative]
+ movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
+ inc tlq
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [t0+dxq]
+ lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ xor angled, 0x4ff ; d = 90 - angle
+ mova m14, [base+z_frac_table]
+ vpbroadcastd m15, [base+pw_512]
+ jmp wq
+.w4:
+ mova m9, [pb_0to63]
+ pminud m8, m9, [base+pb_7] {1to16}
+ vpbroadcastq m7, [tlq]
+ pshufb m7, m8
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ pshufb xmm0, xm7, [base+z_filter_s4]
+ mova xmm1, [tlq-1]
+ pshufb xmm1, [base+z_xpos_off2a]
+ vpbroadcastd xmm2, [base+pb_m4_36]
+ vpbroadcastq m4, [pb_0to63]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm1, xmm2
+ add dxd, dxd
+ kxnorw k1, k1, k1
+ paddw xmm0, xmm1
+ pmulhrsw xm0, xmm0, xm15
+ packuswb xm0, xm0
+ punpcklbw ym7{k1}, ym0
+ jmp .w4_main2
+.w4_no_upsample:
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+3]
+ vpbroadcastb xm0, r3d
+ vpbroadcastb xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpcmpeqb k1, xm0, [base+z_filter_wh]
+ vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+ kmovw r5d, k1
+ test r5d, r5d
+ jz .w4_main
+ vbroadcasti32x4 ym0, [tlq-1]
+ pshufb ym0, [base+z_filter4_s1]
+ popcnt r5d, r5d ; filter_strength
+ pshufb ym1, ym7, [z_filter_s4]
+ pshufb ym7, [base+z_filter_s3]
+ vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
+ vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
+ pmaddubsw ym0, ym11
+ pmaddubsw ym1, ym11
+ pmaddubsw ym7, ym12
+ paddw ym0, ym1
+ paddw ym7, ym0
+ pmulhrsw ym7, ym15
+ cmp hd, 4
+ je .w4_filter_end
+ vpbroadcastd m8, [base+pb_9]
+ pminub m8, m9
+.w4_filter_end:
+ paddb m8, m8
+ vpermb m7, m8, m7
+.w4_main:
+ vpbroadcastq m4, [base+z_xpos_off1a]
+.w4_main2:
+ movsldup m2, [base+z_xpos_mul]
+ vpbroadcastw m5, dxd
+ vbroadcasti32x4 m3, [base+z_xpos_bc]
+ lea r2, [strideq*3]
+ pmullw m2, m5 ; xpos
+ psllw m5, 5 ; dx*8
+.w4_loop:
+ psrlw m1, m2, 3
+ pshufb m0, m2, m3
+ vpermw m1, m1, m14 ; 64-frac, frac
+ paddsb m0, m4 ; base, base+1
+ vpermb m0, m0, m7 ; top[base], top[base+1]
+ paddsw m2, m5 ; xpos += dx
+ pmaddubsw m0, m1 ; v
+ pmulhrsw m0, m15
+ packuswb m0, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+r2 ], xm1, 1
+ sub hd, 8
+ jl .w4_end
+ vextracti32x4 xm1, m0, 2 ; top[max_base_x]
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 xm0, m0, 3
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r2 ], xm0, 1
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.w8_filter:
+ mova ym0, [base+z_filter_s1]
+ popcnt r5d, r5d
+ vbroadcasti32x4 ym1, [base+z_filter_s2]
+ vbroadcasti32x4 ym3, [base+z_filter_s3]
+ vbroadcasti32x4 ym4, [base+z_filter_s4]
+ vpermi2b ym0, ym7, ym2 ; al bl
+ mova ym5, [base+z_filter_s5]
+ pshufb ym1, ym7, ym1 ; ah bh
+ vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
+ pshufb ym3, ym7, ym3 ; cl ch
+ vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
+ pshufb ym4, ym7, ym4 ; el dl
+ vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2]
+ vpermb ym5, ym5, ym7 ; eh dh
+ pmaddubsw ym0, ym11
+ pmaddubsw ym1, ym11
+ pmaddubsw ym2, ym3, ym12
+ pmaddubsw ym3, ym13
+ pmaddubsw ym4, ym11
+ pmaddubsw ym5, ym11
+ paddw ym0, ym2
+ paddw ym1, ym3
+ paddw ym0, ym4
+ paddw ym1, ym5
+ pmulhrsw ym0, ym15
+ pmulhrsw ym1, ym15
+ packuswb ym0, ym1
+ ret
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ lea r3d, [hq-1]
+ mova xm1, [base+z_filter_s4]
+ vpbroadcastb xm2, r3d
+ mova xm7, [tlq-1]
+ vinserti32x4 ym7, [tlq+7], 1
+ vbroadcasti32x4 ym0, [base+z_xpos_off1a]
+ vpbroadcastd ym3, [base+pb_m4_36]
+ pminub xm2, xm1
+ pshufb ym0, ym7, ym0
+ vinserti32x4 ym1, xm2, 1
+ psrldq ym7, 1
+ pshufb ym1, ym7, ym1
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym3
+ vbroadcasti32x4 m8, [pb_0to63]
+ add dxd, dxd
+ paddw ym0, ym1
+ pmulhrsw ym0, ym15
+ packuswb ym0, ym0
+ punpcklbw ym7, ym0
+ jmp .w8_main2
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ mova m9, [pb_0to63]
+ vpbroadcastb ym0, r3d
+ and r3d, 7
+ vbroadcasti32x4 m7, [tlq]
+ or r3d, 8 ; imin(h+7, 15)
+ vpbroadcastb m8, r3d
+ pminub m8, m9
+ pshufb m7, m8
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w8_main
+ vpbroadcastd ym2, [tlq-4]
+ call .w8_filter
+ cmp hd, 8
+ jle .w8_filter_end
+ vpbroadcastd m8, [base+pb_17]
+ add r3d, 2
+ pminub m8, m9
+.w8_filter_end:
+ vpermb m7, m8, m0
+.w8_main:
+ vbroadcasti32x4 m8, [base+z_xpos_off1a]
+.w8_main2:
+ movsldup m4, [base+z_xpos_mul]
+ vpbroadcastw m9, dxd
+ shl r3d, 6
+ vpbroadcastd m5, [base+z_xpos_bc+8*0]
+ pmullw m4, m9 ; xpos
+ vpbroadcastd m6, [base+z_xpos_bc+8*1]
+ sub r3d, dxd
+ shl dxd, 3
+ psllw m9, 5 ; dx*8
+ lea r2, [strideq*3]
+.w8_loop:
+ psrlw m3, m4, 3
+ pshufb m0, m4, m5
+ pshufb m1, m4, m6
+ vpermw m3, m3, m14
+ paddsb m0, m8
+ paddsb m1, m8
+ vpermb m0, m0, m7
+ vpermb m1, m1, m7
+ paddsw m4, m9
+ punpcklqdq m2, m3, m3
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ sub hd, 8
+ jl .w8_end
+ vextracti32x8 ym0, m0, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ jz .w8_end
+ lea dstq, [dstq+strideq*4]
+ sub r3d, dxd
+ jg .w8_loop
+ vextracti32x4 xm7, m7, 3
+.w8_end_loop:
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ movq [dstq+strideq*2], xm7
+ movq [dstq+r2 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_filter:
+ mova m0, [base+z_filter_s1]
+ popcnt r5d, r5d
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpermi2b m0, m7, m2 ; al bl
+ mova m5, [base+z_filter_s5]
+ pshufb m1, m7, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0]
+ pshufb m3, m7, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1]
+ pshufb m4, m7, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2]
+ vpermb m5, m5, m7 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m2, m3, m12
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ ret
+.w16:
+ lea r3d, [hq+15]
+ mova m9, [pb_0to63]
+ vpbroadcastb ym0, r3d
+ and r3d, 15
+ movu ym7, [tlq]
+ or r3d, 16 ; imin(h+15, 31)
+ vpbroadcastb m8, r3d
+ pminub m8, m9
+ vpermb m7, m8, m7
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w16_main
+ vpbroadcastd m2, [tlq-4]
+ call .w16_filter
+ cmp hd, 16
+ jle .w16_filter_end
+ vpbroadcastd m8, [base+pb_33]
+ add r3d, 2
+ pminub m8, m9
+.w16_filter_end:
+ vpermb m7, m8, m0
+.w16_main:
+ movshdup m3, [base+z_xpos_mul]
+ vpbroadcastw m8, dxd
+ shl r3d, 6
+ vpbroadcastd m4, [base+z_xpos_bc]
+ pmullw m3, m8 ; xpos
+ vbroadcasti32x4 m5, [base+z_xpos_off1a]
+ sub r3d, dxd
+ shl dxd, 2
+ vbroadcasti32x4 m6, [base+z_xpos_off1b]
+ psllw m8, 4 ; dx*4
+ lea r2, [strideq*3]
+.w16_loop:
+ pshufb m1, m3, m4
+ psrlw m2, m3, 3
+ paddsb m0, m1, m5
+ vpermw m2, m2, m14
+ paddsb m1, m6
+ vpermb m0, m0, m7
+ vpermb m1, m1, m7
+ paddsw m3, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ sub hd, 4
+ jz .w16_end
+ lea dstq, [dstq+strideq*4]
+ sub r3d, dxd
+ jg .w16_loop
+ vextracti32x4 xm7, m7, 3
+.w16_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ mova [dstq+strideq*2], xm7
+ mova [dstq+r2 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32_filter:
+ mova m0, [base+z_filter_s1]
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpermi2b m0, m7, m2 ; al bl
+ mova m5, [base+z_filter_s5]
+ pshufb m1, m7, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
+ pshufb m3, m7, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
+ pshufb m4, m7, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
+ vpermi2b m5, m7, m8 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m2, m3, m12
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m7, m0, m1
+ ret
+.w32:
+ lea r3d, [hq+31]
+ vpbroadcastb m9, r3d
+ and r3d, 31
+ pminub m10, m9, [pb_0to63]
+ or r3d, 32 ; imin(h+31, 63)
+ vpermb m7, m10, [tlq]
+ vpbroadcastb m8, [tlq+r3]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vpbroadcastd m2, [tlq-4]
+ call .w32_filter
+ cmp hd, 64
+ je .w32_h64_filter_end
+ vpermb m8, m9, m7
+ vpermb m7, m10, m7
+ jmp .w32_main
+.w32_h64_filter_end: ; edge case for 32x64
+ movd xmm0, [tlq+r3-1]
+ movd xmm1, [base+pb_8_56_0_0]
+ add r3d, 2
+ pmaddubsw xmm0, xmm1
+ vptestmw k1, xmm1, xmm1 ; 0x01
+ pmulhrsw xm0, xmm0, xm15
+ vmovdqu8 m8{k1}, m0
+.w32_main:
+ rorx r2d, dxd, 30
+ vpbroadcastd m4, [base+z_xpos_bc]
+ vpbroadcastw m3, r2d
+ vbroadcasti32x8 m5, [base+z_xpos_off2a]
+ shl r3d, 6
+ vbroadcasti32x8 m6, [base+z_xpos_off2b]
+ sub r3d, dxd
+ paddw m9, m3, m3
+ add dxd, dxd
+ vinserti32x8 m3, ym9, 1
+.w32_loop:
+ pshufb m1, m3, m4
+ psrlw m2, m3, 3
+ paddsb m0, m1, m5
+ vpermw m2, m2, m14
+ paddsb m1, m6
+ vpermi2b m0, m7, m8
+ vpermi2b m1, m7, m8
+ paddsw m3, m9
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w32_end
+ lea dstq, [dstq+strideq*2]
+ sub r3d, dxd
+ jg .w32_loop
+ punpckhqdq ym8, ym8
+.w32_end_loop:
+ mova [dstq+strideq*0], ym8
+ mova [dstq+strideq*1], ym8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64_filter:
+ vbroadcasti32x4 m3, [base+z_filter_s2]
+ mova m1, [base+z_filter_s1]
+ pshufb m0, m3 ; al bl
+ vpermi2b m1, m7, m2
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ pshufb m6, m8, m4 ; el dl
+ pshufb m9, m7, m4
+ pminub m10, m13, [base+z_filter_s5]
+ pshufb m2, m8, m3 ; ah bh
+ pshufb m3, m7, m3
+ vbroadcasti32x4 m5, [base+z_filter_s3]
+ vpermb m10, m10, m8 ; eh dh
+ pshufb m11, m4
+ vpbroadcastd m4, [base+z_filter_k+4*2+12*0]
+ pshufb m8, m5 ; cl ch
+ pshufb m7, m5
+ vpbroadcastd m5, [base+z_filter_k+4*2+12*1]
+ REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
+ pmaddubsw m4, m8, m5
+ pmaddubsw m5, m7, m5
+ paddw m0, m6
+ vpbroadcastd m6, [base+z_filter_k+4*2+12*2]
+ paddw m1, m9
+ pmaddubsw m7, m6
+ pmaddubsw m8, m6
+ paddw m2, m10
+ paddw m3, m11
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m8
+ paddw m3, m7
+ REPX {pmulhrsw x, m15}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m7, m1, m3
+ vpermb m8, m12, m0
+ ret
+.w64:
+ lea r3d, [hq-1]
+ movu m7, [tlq+64*0]
+ vpbroadcastb m13, r3d
+ pminub m12, m13, [pb_0to63]
+ or r3d, 64
+ vpermb m8, m12, [tlq+64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ movu m0, [tlq+56]
+ vpbroadcastd m2, [tlq-4]
+ movu m11, [tlq+8]
+ call .w64_filter
+.w64_main:
+ rorx r2d, dxd, 30
+ vpbroadcastd m4, [base+z_xpos_bc]
+ vpbroadcastw m3, r2d
+ mova m5, [base+z_xpos_off2a]
+ shl r3d, 6
+ mova m6, [base+z_xpos_off2b]
+ sub r3d, dxd
+ mova m9, m3
+.w64_loop:
+ pshufb m1, m3, m4
+ psrlw m2, m3, 3
+ paddsb m0, m1, m5
+ vpermw m2, m2, m14
+ paddsb m1, m6
+ vpermi2b m0, m7, m8
+ vpermi2b m1, m7, m8
+ paddsw m3, m9
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ mova [dstq], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ sub r3d, dxd
+ jg .w64_loop
+ vpermb m8, m13, m8
+.w64_end_loop:
+ mova [dstq], m8
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
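[Aside, not part of the patch] The 8bpc z1 path above avoids widening to words: z_frac_table supplies (64 - frac, frac) byte pairs, the (top[base], top[base+1]) bytes are gathered with vpermb, pmaddubsw forms the weighted sum, and pmulhrsw with 512 performs the (v + 32) >> 6 rounding before packuswb. Conceptually, per pixel (hypothetical name, scaling details of the xpos bookkeeping glossed over):

#include <stdint.h>

/* Illustrative scalar equivalent of the 8bpc z1 inner loop above:
 * pmaddubsw(top pair, weight pair) == top[base]*(64-frac) + top[base+1]*frac,
 * and pmulhrsw with 512 is (v + 32) >> 6. */
static inline uint8_t z1_blend_8bpc(const uint8_t *top, int xpos)
{
    const int base = xpos >> 6;
    const int frac = xpos & 63;
    const int v = top[base] * (64 - frac) + top[base + 1] * frac;
    return (uint8_t)((v + 32) >> 6);
}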
+cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea dxq, [dr_intra_derivative-90]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mov r7, dxq
+ sub dxq, dyq
+ movifnidn hd, hm
+ and dyd, ~1
+ and dxq, ~1
+ movzx dyd, word [r7+dyq] ; angle - 90
+ lea r7, [z_filter_t0]
+ movzx dxd, word [dxq+270] ; 180 - angle
+ movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
+ mova m8, [base+pb_63to0]
+ neg dyd
+ vpermb m8, m8, [tlq-64] ; left
+ lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
+ mova m14, [base+z_frac_table]
+ inc tlq
+ vpbroadcastd m15, [base+pw_512]
+ neg dxd
+ jmp wq
+.w4:
+ movd xm7, [tlq]
+ vpbroadcastq m10, [base+z_xpos_off2a]
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm2, [base+pb_4]
+ sub angled, 1075 ; angle - 53
+ call .upsample_above
+ lea r3d, [hq+3]
+ vpbroadcastq m10, [pb_0to63+1]
+ punpcklbw xm7, xm0, xm7
+ call .filter_strength
+ jmp .w4_filter_left
+.w4_upsample_left:
+ call .upsample_left
+ movsldup m16, [base+z_ypos_off3]
+ vpbroadcastd m9, [base+pb_16]
+ punpcklbw xm8, xm0, xm8
+ jmp .w4_main2
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ vpbroadcastd xm5, [base+pb_3]
+ call .filter_top_w16
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd ym0, [base+pb_90]
+ psubb ym0, ym17
+ vpcmpgtb k2{k2}, ym0, ym16
+ kmovd r3d, k2
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ call .filter_left_h16
+.w4_main:
+ movsldup m16, [base+z_ypos_off1]
+ vpbroadcastd m9, [base+pb_8]
+.w4_main2:
+ vpbroadcastq m3, [base+z_ypos_mul1a]
+ vpbroadcastw m0, dyd
+ movsldup m1, [base+z_xpos_mul]
+ vpbroadcastw m5, dxd
+ vinserti32x4 m7, [tlq-16], 3
+ vinserti32x4 m8, [tlq-16], 3
+ pmullw m3, m0
+ vbroadcasti32x4 m2, [base+z_xpos_bc]
+ pmullw m1, m5 ; xpos0..3
+ psllw m5, 5 ; dx*8
+ psraw m4, m3, 6
+ psrlw m3, 1
+ packsswb m4, m4
+ vpermw m3, m3, m14 ; 64-frac, frac
+ punpcklbw m4, m4
+ lea r2, [strideq*3]
+ paddb m4, m16 ; base, base+1
+.w4_loop:
+ pshufb m16, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m10
+ vpermw m0, m0, m14
+ vpmovw2m k1, m16 ; base_x < 0
+ vpermb m16, m16, m7
+ pmaddubsw m16, m0
+ vpermb m0, m4, m8
+ pmaddubsw m16{k1}, m0, m3
+ pmulhrsw m16, m15
+ vpmovwb ym16, m16
+ movd [dstq+strideq*0], xm16
+ pextrd [dstq+strideq*1], xm16, 1
+ pextrd [dstq+strideq*2], xm16, 2
+ pextrd [dstq+r2 ], xm16, 3
+ sub hd, 8
+ jl .w4_end
+ paddsw m1, m5
+ vextracti128 xm16, ym16, 1
+ lea dstq, [dstq+strideq*4]
+ paddb m4, m9
+ movd [dstq+strideq*0], xm16
+ pextrd [dstq+strideq*1], xm16, 1
+ pextrd [dstq+strideq*2], xm16, 2
+ pextrd [dstq+r2 ], xm16, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.upsample_above: ; w4/w8
+ mova xm0, [tlq-1]
+ xor angled, 0x7f ; 180 - angle
+ add dxd, dxd
+ jmp .upsample
+.upsample_left: ; h4/h8
+ palignr xm0, xm8, [tlq-16], 15
+ vpbroadcastb xm2, hd
+ add dyd, dyd
+.upsample:
+ pshufb xm1, xm0, [base+z_filter4_s1]
+ pminub xm2, [base+z_filter_s4]
+ vpbroadcastd xm3, [base+pb_m4_36]
+ pshufb xm0, xm2
+ pmaddubsw xm1, xm3
+ pmaddubsw xm0, xm3
+ paddw xm0, xm1
+ pmulhrsw xm0, xm15
+ packuswb xm0, xm0
+ ret
+.filter_strength:
+ vpbroadcastb ym16, r3d
+ mov r3d, angled
+ vpbroadcastd m2, [tlq-4]
+ vpbroadcastb ym17, angled
+ shr r3d, 8
+ vpcmpeqb k2, ym16, [base+z_filter_wh]
+ mova xm16, [base+z_filter_t0+r3*8]
+ vpcmpgtb k1{k2}, ym17, ym16
+ mova m9, [pb_0to63]
+ kmovd r3d, k1
+ ret
+.w8:
+ movq xm7, [tlq]
+ vbroadcasti32x4 m10, [base+z_xpos_off2a]
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ vpbroadcastd xm2, [base+pb_8]
+ sub angled, 53 ; angle - 53
+ call .upsample_above
+ lea r3d, [hq+7]
+ vbroadcasti32x4 m10, [pb_0to63+1]
+ punpcklbw xm7, xm0, xm7
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_upsample_left:
+ call .upsample_left
+ movshdup m16, [base+z_ypos_off3]
+ vpbroadcastd m9, [base+pb_8]
+ punpcklbw xm8, xm0, xm8
+ jmp .w8_main2
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ vpbroadcastd xm5, [base+pb_7]
+ call .filter_top_w16
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd ym0, [base+pb_90]
+ psubb ym0, ym17
+ vpcmpgtb k2{k2}, ym0, ym16
+ kmovd r3d, k2
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ cmp hd, 32
+ je .w8_filter_left_h32
+ popcnt r3d, r3d
+ call .filter_left_h16
+ jmp .w8_main
+.w8_filter_left_h32:
+ call .filter_left_h64
+.w8_main:
+ movshdup m16, [base+z_ypos_off2]
+ vpbroadcastd m9, [base+pb_4]
+.w8_main2:
+ vbroadcasti32x4 m3, [base+z_ypos_mul1a]
+ vpbroadcastw m0, dyd
+ movshdup m1, [base+z_xpos_mul]
+ vpbroadcastw m5, dxd
+ vinserti32x4 m7, [tlq-16], 3
+ vinserti32x4 m8, [tlq-16], 3
+ pmullw m3, m0
+ vpbroadcastd m2, [base+pb_1]
+ pmullw m1, m5 ; xpos0..3
+ psllw m5, 4 ; dx*4
+ psraw m4, m3, 6
+ psrlw m3, 1
+ packsswb m4, m4
+ vpermw m3, m3, m14 ; 64-frac, frac
+ lea r3d, [dxq+(8<<6)]
+ paddsb m4, m16
+ shl dxd, 2
+ paddsb m0, m4, m2
+ lea r2, [strideq*3]
+ punpcklbw m4, m0 ; base, base+1
+.w8_loop:
+ pshufb m16, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m10
+ vpermw m0, m0, m14
+ vpmovw2m k1, m16 ; base_x < 0
+ vpermb m16, m16, m7
+ pmaddubsw m16, m0
+ vpermb m0, m4, m8
+ pmaddubsw m16{k1}, m0, m3
+ pmulhrsw m16, m15
+ vpmovwb ym16, m16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ sub hd, 4
+ jz .w8_end
+ paddw m1, m5
+ lea dstq, [dstq+strideq*4]
+ paddb m4, m9
+ add r3d, dxd
+ jge .w8_loop
+.w8_leftonly_loop:
+ vpermb m16, m4, m8
+ pmaddubsw m16, m3
+ paddb m4, m9
+ pmulhrsw m16, m15
+ vpmovwb ym16, m16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_leftonly_loop
+.w8_end:
+ RET
+.filter_top_w16:
+ mova xm0, [base+z_filter_s1]
+ popcnt r3d, r3d
+ pminub xm4, xm5, [base+z_filter_s4]
+ vpermi2b xm0, xm7, xm2
+ pminub xm5, [base+z_filter_s5]
+ pshufb xm1, xm7, [base+z_filter_s2]
+ vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
+ pshufb xm3, xm7, [base+z_filter_s3]
+ vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
+ pshufb xm4, xm7, xm4
+ vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
+ pshufb xm5, xm7, xm5
+ pmaddubsw xm0, xm11
+ pmaddubsw xm1, xm11
+ pmaddubsw xm6, xm3, xm12
+ vpbroadcastd xm12, r7m ; max_width
+ pmaddubsw xm3, xm13
+ pmaddubsw xm4, xm11
+ pmaddubsw xm5, xm11
+ packssdw xm12, xm12
+ paddw xm0, xm6
+ paddw xm1, xm3
+ paddw xm0, xm4
+ paddw xm1, xm5
+ packsswb xm12, xm12
+ pmulhrsw xm0, xm15
+ pmulhrsw xm1, xm15
+ vpcmpgtb k1, xm12, xm9 ; x < max_width
+ packuswb xm7{k1}, xm0, xm1
+ ret
+.filter_left_h16:
+ lea r5d, [hq-1]
+ mova xm0, [base+z_filter_s1]
+ vpbroadcastb xm5, r5d
+ vpermi2b xm0, xm8, xm2
+ pminub xm4, xm5, [base+z_filter_s4]
+ pshufb xm1, xm8, [base+z_filter_s2]
+ pminub xm5, [base+z_filter_s5]
+ pshufb xm3, xm8, [base+z_filter_s3]
+ vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
+ pshufb xm4, xm8, xm4
+ vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
+ pshufb xm5, xm8, xm5
+ vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
+ pmaddubsw xm0, xm11
+ pmaddubsw xm1, xm11
+ pmaddubsw xm6, xm3, xm12
+ vpbroadcastd xm12, r8m ; max_height
+ pmaddubsw xm3, xm13
+ pmaddubsw xm4, xm11
+ pmaddubsw xm5, xm11
+ packssdw xm12, xm12
+ paddw xm0, xm6
+ paddw xm1, xm3
+ paddw xm0, xm4
+ paddw xm1, xm5
+ packsswb xm12, xm12
+ pmulhrsw xm0, xm15
+ pmulhrsw xm1, xm15
+ vpcmpgtb k1, xm12, xm9 ; y < max_height
+ packuswb xm8{k1}, xm0, xm1
+ ret
+.w16:
+ movu xm7, [tlq] ; top
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ vpbroadcastd xm5, [base+pb_15]
+ call .filter_top_w16
+.w16_no_filter_above:
+ cmp hd, 16
+ jg .w16_filter_left_h64
+ vpbroadcastd ym0, [base+pb_90]
+ psubb ym0, ym17
+ vpcmpgtb k2{k2}, ym0, ym16
+ kmovd r3d, k2
+ test r3d, r3d
+ jz .w16_main
+ popcnt r3d, r3d
+ call .filter_left_h16
+ jmp .w16_main
+.w16_filter_left_h64:
+ call .filter_left_h64
+.w16_main:
+ vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8
+ vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15
+ vpbroadcastw m0, dyd
+ vinserti32x4 m7, [tlq-16], 3
+ vpbroadcastd m2, [base+pb_1]
+ vpbroadcastw m12, dxd
+ movshdup m1, [base+z_xpos_mul]
+ pmullw m6, m0
+ vbroadcasti32x4 m3, [base+z_xpos_off2a]
+ pmullw m5, m0
+ vbroadcasti32x4 m4, [base+z_xpos_off2b]
+ pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3
+ vpbroadcastd m9, [base+pb_4]
+ psllw m12, 4 ; dx*4
+ movshdup m16, [base+z_ypos_off2]
+ psrlw m10, m6, 1
+ psrlw m11, m5, 1
+ vpermw m10, m10, m14 ; 64-frac, frac
+ psraw m6, 6
+ vpermw m11, m11, m14
+ psraw m5, 6
+ mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft
+ packsswb m6, m5
+ mov r3d, 1<<6
+ paddsb m6, m16
+ sub r5d, dxd ; left-only threshold
+ paddsb m0, m6, m2
+ shl dxd, 2
+ punpcklbw m5, m6, m0 ; base, base+1
+ lea r2, [strideq*3]
+ punpckhbw m6, m0
+.w16_loop:
+ pshufb m17, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m3, m17
+ vpermw m0, m0, m14
+ paddb m17, m4
+ vpmovw2m k1, m16
+ vpermb m16, m16, m7
+ vpmovw2m k2, m17
+ vpermb m17, m17, m7
+ pmaddubsw m16, m0
+ pmaddubsw m17, m0
+ add r3d, dxd
+ jge .w16_toponly
+ mova m0, m8
+ vpermt2b m0, m5, m7
+ pmaddubsw m16{k1}, m0, m10
+ mova m0, m8
+ vpermt2b m0, m6, m7
+ pmaddubsw m17{k2}, m0, m11
+.w16_toponly:
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ sub hd, 4
+ jz .w16_end
+ paddw m1, m12
+ lea dstq, [dstq+strideq*4]
+ paddb m5, m9
+ paddb m6, m9
+ cmp r3d, r5d
+ jge .w16_loop
+.w16_leftonly_loop:
+ vpermb m16, m5, m8
+ vpermb m17, m6, m8
+ pmaddubsw m16, m10
+ pmaddubsw m17, m11
+ paddb m5, m9
+ paddb m6, m9
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_leftonly_loop
+.w16_end:
+ RET
+.w32:
+ movu ym7, [tlq]
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m2, [tlq-4]
+ mova ym0, [base+z_filter_s1]
+ vbroadcasti32x4 ym1, [base+z_filter_s2]
+ vbroadcasti32x4 ym3, [base+z_filter_s3]
+ vbroadcasti32x4 ym4, [base+z_filter_s4]
+ vpermi2b ym0, ym7, ym2 ; al bl
+ vpbroadcastd ym5, [base+pb_31]
+ pminub ym5, [base+z_filter_s5]
+ pshufb ym1, ym7, ym1 ; ah bh
+ vpbroadcastd ym11, [base+z_filter_k+4*2+12*0]
+ pshufb ym3, ym7, ym3 ; cl ch
+ vpbroadcastd ym12, [base+z_filter_k+4*2+12*1]
+ pshufb ym4, ym7, ym4 ; el dl
+ vpbroadcastd ym13, [base+z_filter_k+4*2+12*2]
+ vpermb ym5, ym5, ym7 ; eh dh
+ pmaddubsw ym0, ym11
+ pmaddubsw ym1, ym11
+ pmaddubsw ym6, ym3, ym12
+ vpbroadcastd ym12, r6m
+ pmaddubsw ym3, ym13
+ pmaddubsw ym4, ym11
+ pmaddubsw ym5, ym11
+ mova m9, [pb_0to63]
+ packssdw ym12, ym12
+ paddw ym0, ym6
+ paddw ym1, ym3
+ paddw ym0, ym4
+ paddw ym1, ym5
+ packsswb ym12, ym12
+ pmulhrsw ym0, ym15
+ pmulhrsw ym1, ym15
+ vpcmpgtb k1, ym12, ym9 ; x < max_width
+ packuswb ym7{k1}, ym0, ym1
+ cmp hd, 16
+ jg .w32_filter_h64
+ mov r3d, 3
+ call .filter_left_h16
+ jmp .w32_main
+.w32_filter_h64:
+ call .filter_left_h64
+.w32_main:
+ vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 8
+ vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15
+ vpbroadcastw m0, dyd
+ vinserti32x4 m7, [tlq-16], 3
+ rorx r2q, dxq, 62 ; dx << 2
+ vpbroadcastd m2, [base+pb_1]
+ vpbroadcastw m1, r2d
+ pmullw m6, m0
+ vbroadcasti32x8 m3, [base+z_xpos_off2a]
+ pmullw m5, m0
+ vbroadcasti32x8 m4, [base+z_xpos_off2b]
+ mova ym0, ym1
+ paddw m12, m1, m1
+ vpbroadcastd m9, [base+pb_2]
+ paddw m1, m0 ; xpos1 xpos0
+ mova ym0, ym2
+ psrlw m10, m6, 1
+ psrlw m11, m5, 1
+ vpermw m10, m10, m14 ; 64-frac, frac
+ psraw m6, 6
+ vpermw m11, m11, m14
+ psraw m5, 6
+ mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft
+ packsswb m6, m5
+ mov r3d, 1<<6
+ paddsb m6, m0
+ sub r5d, dxd ; left-only threshold
+ paddsb m0, m6, m2
+ add dxd, dxd
+ punpcklbw m5, m6, m0 ; base, base+1
+ punpckhbw m6, m0
+.w32_loop:
+ pshufb m17, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m3, m17
+ vpermw m0, m0, m14
+ paddb m17, m4
+ vpmovw2m k1, m16
+ vpermb m16, m16, m7
+ vpmovw2m k2, m17
+ vpermb m17, m17, m7
+ pmaddubsw m16, m0
+ pmaddubsw m17, m0
+ add r3d, dxd
+ jge .w32_toponly
+ mova m0, m8
+ vpermt2b m0, m5, m7
+ pmaddubsw m16{k1}, m0, m10
+ mova m0, m8
+ vpermt2b m0, m6, m7
+ pmaddubsw m17{k2}, m0, m11
+.w32_toponly:
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ vextracti32x8 [dstq+strideq*0], m16, 1
+ mova [dstq+strideq*1], ym16
+ sub hd, 2
+ jz .w32_end
+ paddw m1, m12
+ lea dstq, [dstq+strideq*2]
+ paddb m5, m9
+ paddb m6, m9
+ cmp r3d, r5d
+ jge .w32_loop
+.w32_leftonly_loop:
+ vpermb m16, m5, m8
+ vpermb m17, m6, m8
+ pmaddubsw m16, m10
+ pmaddubsw m17, m11
+ paddb m5, m9
+ paddb m6, m9
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ vextracti32x8 [dstq+strideq*0], m16, 1
+ mova [dstq+strideq*1], ym16
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_leftonly_loop
+.w32_end:
+ RET
+.filter_left_h64:
+ mova m0, [base+z_filter_s1]
+ lea r3d, [hq-1]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpbroadcastb m5, r3d
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vpermi2b m0, m8, m2 ; al bl
+ pminub m5, [base+z_filter_s5]
+ pshufb m1, m8, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
+ pshufb m3, m8, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
+ pshufb m4, m8, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
+ vpermb m5, m5, m8 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m6, m3, m12
+ vpbroadcastd m12, r8m ; max_height
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ packssdw m12, m12
+ paddw m0, m6
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m12, m12
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ vpcmpgtb k1, m12, m9 ; y < max_height
+ packuswb m8{k1}, m0, m1
+ ret
+.w64:
+ movu m7, [tlq]
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastd m2, [tlq-4]
+ mova m0, [base+z_filter_s1]
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpermi2b m0, m7, m2 ; al bl
+ vpbroadcastd m5, [base+pb_63]
+ pminub m5, [base+z_filter_s5]
+ pshufb m1, m7, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
+ pshufb m3, m7, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
+ pshufb m4, m7, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
+ vpermb m5, m5, m7 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m6, m3, m12
+ vpbroadcastd m12, r6m
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ mova m9, [pb_0to63]
+ packssdw m12, m12
+ paddw m0, m6
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m12, m12
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ vpcmpgtb k1, m12, m9 ; x < max_width
+ packuswb m7{k1}, m0, m1
+ call .filter_left_h64 ; always filter the full 64 pixels for simplicity
+.w64_main:
+ vpbroadcastw m5, dyd
+ vpbroadcastd m9, [tlq-4]
+ rorx r2q, dxq, 62 ; dx << 2
+ pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such
+ pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge
+ vpbroadcastw m1, r2d ; xpos
+ mova m3, [base+z_xpos_off2a]
+ mova m4, [base+z_xpos_off2b]
+ mova m12, m1
+ vpbroadcastd m2, [base+pb_1]
+ psrlw m10, m6, 1
+ psrlw m11, m5, 1
+ vpermw m10, m10, m14 ; 64-frac, frac
+ psraw m6, 6
+ vpermw m11, m11, m14
+ psraw m5, 6
+ mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
+ packsswb m6, m5
+ mov r3d, 1<<6
+ paddsb m0, m6, m2
+ sub r5d, dxd ; left-only threshold
+ punpcklbw m5, m6, m0 ; base, base+1
+ punpckhbw m6, m0
+.w64_loop:
+ pshufb m17, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m3, m17
+ vpermw m0, m0, m14
+ paddb m17, m4
+ vpmovw2m k1, m16 ; base_x < 0
+ vpermi2b m16, m7, m9
+ vpmovw2m k2, m17
+ vpermi2b m17, m7, m9
+ pmaddubsw m16, m0
+ pmaddubsw m17, m0
+ add r3d, dxd
+ jge .w64_toponly
+ mova m0, m8
+ vpermt2b m0, m5, m9
+ pmaddubsw m16{k1}, m0, m10
+ mova m0, m8
+ vpermt2b m0, m6, m9
+ pmaddubsw m17{k2}, m0, m11
+.w64_toponly:
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq], m16
+ dec hd
+ jz .w64_end
+ paddw m1, m12
+ add dstq, strideq
+ paddb m5, m2
+ paddb m6, m2
+ cmp r3d, r5d
+ jge .w64_loop
+.w64_leftonly_loop:
+ vpermb m16, m5, m8
+ vpermb m17, m6, m8
+ pmaddubsw m16, m10
+ pmaddubsw m17, m11
+ paddb m5, m2
+ paddb m6, m2
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq], m16
+ add dstq, strideq
+ dec hd
+ jg .w64_leftonly_loop
+.w64_end:
+ RET
+
+cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative+45*2-1]
+ movsxd wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ mova m0, [base+pb_63to0]
+ movzx dyd, word [t0+dyq]
+ lea wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ mova m14, [base+z_frac_table]
+ shl dyd, 6
+ vpbroadcastd m15, [base+pw_512]
+ jmp wq
+.w4:
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ lea r3d, [hq+4]
+ call .upsample
+ movshdup m1, [base+z_ypos_off1]
+ vpbroadcastd m6, [base+pb_16]
+ jmp .w4_main2
+.w4_no_upsample:
+ lea r3d, [hq+3]
+ vpbroadcastb m9, r3d
+ vpxord m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
+ pmaxub m1, m0
+ vpermb m7, m1, [tlq-64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ vpbroadcastb xm1, angled
+ shr angled, 8
+ vpcmpeqb k1, xm9, [base+z_filter_wh]
+ vpbroadcastd m2, [tlq-3]
+ vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+ kmovw r5d, k1
+ test r5d, r5d
+ jz .w4_main
+ pminub m9, [pb_0to63]
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
+ vpermb m7, m9, m0
+.w4_main:
+ movsldup m1, [base+z_ypos_off1]
+ vpbroadcastd m6, [base+pb_8]
+.w4_main2:
+ vpbroadcastw m0, dyd
+ vpbroadcastq m2, [base+z_ypos_mul2a] ; 1..4
+ pmulhuw m2, m0 ; ypos >> 1
+ lea r2, [strideq*3]
+ vpermw m3, m2, m14 ; 64-frac, frac
+ psrlw m2, 5
+ packsswb m2, m2
+ punpcklbw m2, m2
+ paddsb m2, m1 ; base, base+1
+.w4_loop:
+ vpermb m0, m2, m7
+ pmaddubsw m0, m3
+ paddsb m2, m6
+ pmulhrsw m0, m15
+ vpmovwb ym0, m0
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ sub hd, 8
+ jl .w4_end
+ vextracti32x4 xm0, ym0, 1
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.upsample:
+ xor r3d, 31 ; 31 - (h + imin(w, h))
+ vbroadcasti32x4 ym0, [base+z_xpos_off2a]
+ vpbroadcastb ym7, r3d
+ pmaxub ym7, [base+z3_upsample]
+ vbroadcasti32x4 ym1, [base+z_filter_s4]
+ vpermb ym7, ym7, [tlq-31]
+ vpbroadcastd ym2, [base+pb_m4_36]
+ pshufb ym0, ym7, ym0
+ psrldq ym7, 1
+ pshufb ym1, ym7, ym1
+ pmaddubsw ym0, ym2
+ pmaddubsw ym1, ym2
+ add dyd, dyd
+ paddw ym0, ym1
+ pmulhrsw ym0, ym15
+ packuswb ym0, ym0
+ punpcklbw ym7, ym0
+ ret
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ lea r3d, [hq*2]
+ call .upsample
+ pshufd m1, [base+z_ypos_off1], q0000
+ vpbroadcastd m6, [base+pb_8]
+ jmp .w8_main2
+.w8_no_upsample:
+ mov r3d, 8
+ cmp hd, 4
+ cmove r3d, hd
+ lea r3d, [r3+hq-1]
+ xor r3d, 63 ; 63 - (h + imin(w, h))
+ vpbroadcastb m1, wd
+ pmaxub m1, m0
+ vpermb m7, m1, [tlq-64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w8_main
+ lea r3d, [hq+7]
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
+ vpermb m7, m10, m0
+.w8_main:
+ movsldup m1, [base+z_ypos_off2]
+ vpbroadcastd m6, [base+pb_4]
+.w8_main2:
+ vpbroadcastw m0, dyd
+ vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8
+ pmulhuw m2, m0 ; ypos >> 1
+ lea r2, [strideq*3]
+ vpermw m3, m2, m14 ; 64-frac, frac
+ psrlw m2, 5
+ packsswb m2, m2
+ punpcklbw m2, m2
+ paddsb m2, m1 ; base, base+1
+.w8_loop:
+ vpermb m0, m2, m7
+ pmaddubsw m0, m3
+ paddsb m2, m6
+ pmulhrsw m0, m15
+ vpmovwb ym0, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.filter_strength:
+ vpbroadcastd m2, [tlq-3]
+.filter_strength2:
+ vpbroadcastb m9, r3d
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym9, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ pminub m10, m9, [pb_0to63]
+ kmovd r5d, k1
+ ret
+.w16_load:
+ cmp r3d, hd
+ cmovae r3d, hd
+ add r3d, hd
+ mova m7, [tlq-64*1]
+ neg r3d ; -(h + imin(w, h))
+ and r3d, 63
+ vpbroadcastb m1, r3d
+ pmaxub m2, m0, m1
+ cmp hd, 64
+ je .w16_load_h64
+ vpermb m8, m1, m7
+ vpermb m7, m2, m7
+ ret
+.w16_load_h64:
+ vpermb m7, m0, m7
+ vpermb m8, m2, [tlq-64*2]
+ ret
+.w16:
+ mov r3d, 16
+ call .w16_load
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w16_main
+ vpbroadcastd m2, [tlq-3]
+ cmp hd, 64
+ je .w16_filter64
+ lea r3d, [hq+15]
+ call .filter_strength2
+ test r5d, r5d
+ jz .w16_main
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
+ pminub m10, m9, [pb_0to63]
+ vpermb m8, m9, m0
+ vpermb m7, m10, m0
+ jmp .w16_main
+.w16_filter64:
+ vpbroadcastd m13, [base+pb_15]
+ valignq m0, m8, m7, 7
+ pminub m12, m13, [pb_0to63]
+ valignq m11, m8, m7, 1
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w16_main:
+ vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 8
+ vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15
+ vpbroadcastw m0, dyd
+ vpbroadcastd m6, [base+pb_4]
+ pmulhuw m3, m0 ; ypos >> 1
+ pmulhuw m2, m0
+ movshdup m0, [base+z_ypos_off2]
+ lea r2, [strideq*3]
+ vpbroadcastd m1, [base+pb_1]
+ vpermw m4, m3, m14 ; 64-frac, frac
+ psrlw m3, 5
+ vpermw m5, m2, m14
+ psrlw m2, 5
+ packsswb m3, m2
+ paddsb m3, m0
+ paddsb m1, m3
+ punpcklbw m2, m3, m1 ; base, base+1
+ punpckhbw m3, m1
+.w16_loop:
+%macro Z3_PERM2 0
+ mova m0, m7
+ vpermt2b m0, m2, m8
+ mova m1, m7
+ vpermt2b m1, m3, m8
+ pmaddubsw m0, m4
+ pmaddubsw m1, m5
+ paddsb m2, m6
+ paddsb m3, m6
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+%endmacro
+ Z3_PERM2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ mov r3d, 32
+ call .w16_load
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vpbroadcastd m2, [tlq-3]
+ cmp hd, 64
+ je .w32_filter64
+ lea r3d, [hq+31]
+ vpbroadcastb m9, r3d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
+ vpermb m8, m9, m7
+ jmp .w32_main
+.w32_filter64:
+ vpbroadcastd m13, [base+pb_31]
+ valignq m0, m8, m7, 7
+ pminub m12, m13, [pb_0to63]
+ valignq m11, m8, m7, 1
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w32_main:
+ vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8
+ vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15
+ vpbroadcastw m0, dyd
+ vpbroadcastd m1, [base+pb_1]
+ pmulhuw m3, m0 ; ypos >> 1
+ pmulhuw m2, m0
+ vpbroadcastd m6, [base+pb_2]
+ mova ym0, ym1
+ vpermw m4, m3, m14 ; 64-frac, frac
+ psrlw m3, 5
+ vpermw m5, m2, m14
+ psrlw m2, 5
+ packsswb m3, m2
+ paddsb m3, m0
+ paddsb m1, m3
+ punpcklbw m2, m3, m1 ; base, base+1
+ punpckhbw m3, m1
+.w32_loop:
+ Z3_PERM2
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], ym0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ mova m7, [tlq-64*1]
+ cmp hd, 64
+ je .w64_h64
+ lea r3d, [hq*2-1]
+ xor r3d, 63 ; -(h + imin(w, h)) & 63
+ vpbroadcastb m1, r3d
+ pmaxub m0, m1
+ vpermb m8, m1, m7
+ jmp .w64_filter
+.w64_h64:
+ vpermb m8, m0, [tlq-64*2]
+.w64_filter:
+ vpermb m7, m0, m7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ lea r3d, [hq-1]
+ vpbroadcastd m2, [tlq-3]
+ vpbroadcastb m13, r3d
+ valignq m0, m8, m7, 7
+ pminub m12, m13, [pb_0to63]
+ valignq m11, m8, m7, 1
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w64_main:
+ vpbroadcastw m2, dyd
+ pmulhuw m3, m2, [base+z_ypos_mul2a]
+ pmulhuw m2, [base+z_ypos_mul2b]
+ vpbroadcastd m6, [base+pb_1]
+ vpermw m4, m3, m14 ; 64-frac, frac
+ psrlw m3, 5
+ vpermw m5, m2, m14
+ psrlw m2, 5
+ packsswb m3, m2
+ paddsb m1, m3, m6
+ punpcklbw m2, m3, m1 ; base, base+1
+ punpckhbw m3, m1
+.w64_loop:
+ Z3_PERM2
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
; The ipred_filter code processes 4x2 blocks in the following order
; which increases parallelism compared to doing things row by row.
; Some redundant blocks are calculated for w > 4.
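
The idea behind this ordering: a filter-intra 4x2 block is predicted from reconstructed pixels above and to its left, so blocks on the same anti-diagonal have no dependency on each other and can be computed together, unlike a strict row-by-row scan. A minimal C sketch of such a wavefront schedule (conceptual only, not the asm's exact table):

#include <stdio.h>

int main(void) {
    const int bw = 8, bh = 4; /* e.g. a 32x8 area split into 4x2 blocks */
    for (int d = 0; d < bw + bh - 1; d++) {   /* wavefront (anti-diagonal) index */
        printf("wave %d:", d);
        for (int by = 0; by < bh; by++) {
            const int bx = d - by;
            if (bx >= 0 && bx < bw)
                printf(" (%d,%d)", bx, by);   /* blocks in one wave are independent */
        }
        printf("\n");
    }
    return 0;
}
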
diff --git a/src/x86/loopfilter.h b/src/x86/loopfilter.h
index 33c842a..9535c75 100644
--- a/src/x86/loopfilter.h
+++ b/src/x86/loopfilter.h
@@ -58,9 +58,12 @@ static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *co
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
- c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
- c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
+ }
#endif
}
diff --git a/src/x86/loopfilter_avx512.asm b/src/x86/loopfilter_avx512.asm
index 0218b62..202a612 100644
--- a/src/x86/loopfilter_avx512.asm
+++ b/src/x86/loopfilter_avx512.asm
@@ -41,6 +41,10 @@ hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+shift1: dq 0x0204081020408000
+shift3: dq 0x0810204080000000
+shift4: dq 0x1020408000000000
+
pb_1: times 4 db 1
pb_2: times 4 db 2
pb_3: times 4 db 3
@@ -49,9 +53,6 @@ pb_16: times 4 db 16
pb_63: times 4 db 63
pb_64: times 4 db 64
pb_128: times 4 db 0x80
-pb_240: times 4 db 0xf0
-pb_248: times 4 db 0xf8
-pb_254: times 4 db 0xfe
pb_2_1: times 2 db 2, 1
pb_3_1: times 2 db 3, 1
pb_7_1: times 2 db 7, 1
@@ -482,8 +483,7 @@ SECTION .text
vpbroadcastb m1, [lutq+136]
pminub m2, m1
pmaxub m2, m15 ; I
- pand m1, m0, [pb_240]{bcstd}
- psrlq m1, 4 ; H
+ gf2p8affineqb m1, m0, [shift4]{bcstq}, 0 ; H
paddd m0, [pb_2]{bcstd}
paddb m0, m0
paddb m0, m2 ; E
@@ -534,8 +534,7 @@ SECTION .text
ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
paddusb m11, m11
- pand m10, [pb_254]{bcstd}
- psrlq m10, 1
+ gf2p8affineqb m10, m10, [shift1]{bcstq}, 0
paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
@@ -608,12 +607,8 @@ SECTION .text
paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
paddsb m8, m10, m15
paddsb m10, m0
- pand m8, [pb_248]{bcstd}
- pand m10, [pb_248]{bcstd}
- psrlq m8, 3
- psrlq m10, 3
- pxor m8, m12
- pxor m10, m12
+ gf2p8affineqb m8, m8, [shift3]{bcstq}, 16
+ gf2p8affineqb m10, m10, [shift3]{bcstq}, 16
psubb m8, m12 ; f2
psubb m10, m12 ; f1
paddsb m4, m8
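
Background on the hunks above: x86 has no per-byte shift, so the removed pand+psrlq pairs emulate one by first clearing the low bits of every byte (0xfe, 0xf8, 0xf0) so the 64-bit shift cannot drag bits across byte boundaries. The new shift1/shift3/shift4 constants appear to encode the equivalent bit matrices, letting a single gf2p8affineqb perform the per-byte shift, with the 8-bit immediate providing, where needed, the XOR that previously required a separate pxor. A small C check of the masking identity for the >>3 case (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void) {
    /* Clearing the low 3 bits of every byte first means the 64-bit shift
     * cannot leak bits across byte boundaries, so the mask+shift pair
     * behaves as an independent >>3 on each byte. */
    const uint64_t x = UINT64_C(0x8fa713c4d92b65e0);
    const uint64_t shifted = (x & UINT64_C(0xf8f8f8f8f8f8f8f8)) >> 3;
    for (int i = 0; i < 8; i++) {
        const uint8_t b = (uint8_t)(x >> (8 * i));
        assert((uint8_t)(shifted >> (8 * i)) == (uint8_t)(b >> 3));
    }
    return 0;
}
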
diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm
index ef25c28..4cf8b90 100644
--- a/src/x86/looprestoration16_avx2.asm
+++ b/src/x86/looprestoration16_avx2.asm
@@ -32,15 +32,15 @@ SECTION_RODATA 32
sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
-pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
@@ -62,6 +62,7 @@ pd_0xf00801c7: dd 0xf00801c7
%define pw_256 sgr_lshuf5
+cextern pb_0to63
cextern sgr_x_by_x_avx2
SECTION .text
@@ -182,7 +183,7 @@ cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
.extend_right:
movd xm1, r10d
vpbroadcastd m0, [pb_6_7]
- movu m2, [pb_0to31]
+ mova m2, [pb_0to63]
vpbroadcastb m1, xm1
psubb m0, m1
pminub m0, m2
@@ -406,9 +407,8 @@ cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
vpbroadcastd m0, [base+wiener_hshift+t3*4]
vpbroadcastd m9, [base+wiener_round+t3*4]
vpbroadcastd m10, [base+wiener_vshift+t3*4]
- movu xm15, [wiener_lshuf5]
+ mova m15, [wiener_lshuf5]
pmullw m11, m0
- vinserti128 m15, [pb_0to31], 1
pmullw m12, m0
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
@@ -486,7 +486,7 @@ cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
vpbroadcastb m2, xm2
psubb m0, m2
psubb m1, m2
- movu m2, [pb_0to31]
+ mova m2, [pb_0to63]
pminub m0, m2
pminub m1, m2
pshufb m3, m0
diff --git a/src/x86/looprestoration_avx2.asm b/src/x86/looprestoration_avx2.asm
index a73cb21..7787997 100644
--- a/src/x86/looprestoration_avx2.asm
+++ b/src/x86/looprestoration_avx2.asm
@@ -31,11 +31,11 @@
SECTION_RODATA 32
wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_r_ext: times 16 db 1
times 16 db 9
@@ -64,7 +64,6 @@ pb_m5: times 4 db -5
pb_3: times 4 db 3
pw_5_6: dw 5, 6
-sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
db 9, -1, 10, -1, 11, -1, 12, -1
@@ -77,6 +76,8 @@ pd_m4096: dd -4096
pd_0xf00801c7: dd 0xf00801c7
pd_0xf00800a4: dd 0xf00800a4
+cextern pb_0to63
+
SECTION .text
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
@@ -192,7 +193,7 @@ cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
vpbroadcastd m0, [pb_3]
vpbroadcastd m1, [pb_m5]
vpbroadcastb m2, xm2
- movu m3, [pb_0to31]
+ mova m3, [pb_0to63]
psubb m0, m2
psubb m1, m2
pminub m0, m3
@@ -826,7 +827,7 @@ cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \
mova m0, [sgr_r_ext]
vpbroadcastb m2, xm2
psubb m0, m2
- pminub m0, [pb_0to31]
+ pminub m0, [pb_0to63]
pshufb m5, m0
ret
.h: ; horizontal boxsum
diff --git a/src/x86/mc.h b/src/x86/mc.h
index 65c607e..b142361 100644
--- a/src/x86/mc.h
+++ b/src/x86/mc.h
@@ -292,8 +292,11 @@ static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
c->blend = BF(dav1d_blend, avx512icl);
c->blend_v = BF(dav1d_blend_v, avx512icl);
c->blend_h = BF(dav1d_blend_h, avx512icl);
- c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
- c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
- c->resize = BF(dav1d_resize, avx512icl);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->resize = BF(dav1d_resize, avx512icl);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ }
#endif
}
diff --git a/src/x86/pal.asm b/src/x86/pal.asm
index 27187d1..92075b9 100644
--- a/src/x86/pal.asm
+++ b/src/x86/pal.asm
@@ -28,7 +28,7 @@
SECTION_RODATA 64
-pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+const pb_0to63, db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%if ARCH_X86_64
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
diff --git a/tests/checkasm/cdef.c b/tests/checkasm/cdef.c
index 9a90e31..b96339a 100644
--- a/tests/checkasm/cdef.c
+++ b/tests/checkasm/cdef.c
@@ -106,7 +106,7 @@ static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
static void check_cdef_direction(const cdef_dir_fn fn) {
ALIGN_STK_64(pixel, src, 8 * 8,);
- declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
+ declare_func(int, const pixel *src, ptrdiff_t dst_stride, unsigned *var
HIGHBD_DECL_SUFFIX);
if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) {
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 682cc43..844ae44 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -26,7 +26,9 @@
*/
#include "tests/checkasm/checkasm.h"
+#include <errno.h>
#include <math.h>
+#include <signal.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
@@ -34,13 +36,15 @@
#include "src/cpu.h"
#ifdef _WIN32
-#include <windows.h>
-#define COLOR_RED FOREGROUND_RED
-#define COLOR_GREEN FOREGROUND_GREEN
-#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN)
+#ifndef SIGBUS
+/* non-standard, use the same value as mingw-w64 */
+#define SIGBUS 10
+#endif
+#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
+#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04
+#endif
#else
#include <unistd.h>
-#include <signal.h>
#include <time.h>
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
@@ -49,11 +53,12 @@
#ifdef __APPLE__
#include <mach/mach_time.h>
#endif
-#define COLOR_RED 1
-#define COLOR_GREEN 2
-#define COLOR_YELLOW 3
#endif
+#define COLOR_RED 31
+#define COLOR_GREEN 32
+#define COLOR_YELLOW 33
+
/* List of tests to invoke */
static const struct {
const char *name;
@@ -97,8 +102,13 @@ static const struct {
{ "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
{ "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
+#elif ARCH_LOONGARCH
+ { "LSX", "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX },
+ { "LASX", "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX },
#elif ARCH_PPC64LE
{ "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
+#elif ARCH_RISCV
+ { "RVV", "rvv", DAV1D_RISCV_CPU_FLAG_V },
#endif
{ 0 }
};
@@ -137,7 +147,7 @@ static struct {
int bench;
int verbose;
int function_listing;
- int catch_signals;
+ volatile sig_atomic_t catch_signals;
int suffix_length;
int max_function_name_length;
#if ARCH_X86_64
@@ -241,48 +251,19 @@ int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
}
/* Print colored text to stderr if the terminal supports it */
+static int use_printf_color;
static void color_printf(const int color, const char *const fmt, ...) {
- static int8_t use_color = -1;
va_list arg;
-#ifdef _WIN32
- static HANDLE con;
- static WORD org_attributes;
-
- if (use_color < 0) {
- CONSOLE_SCREEN_BUFFER_INFO con_info;
- con = GetStdHandle(STD_ERROR_HANDLE);
- if (con && con != INVALID_HANDLE_VALUE &&
- GetConsoleScreenBufferInfo(con, &con_info))
- {
- org_attributes = con_info.wAttributes;
- use_color = 1;
- } else
- use_color = 0;
- }
- if (use_color)
- SetConsoleTextAttribute(con, (org_attributes & 0xfff0) |
- (color & 0x0f));
-#else
- if (use_color < 0) {
- const char *const term = getenv("TERM");
- use_color = term && strcmp(term, "dumb") && isatty(2);
- }
- if (use_color)
- fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07);
-#endif
+ if (use_printf_color)
+ fprintf(stderr, "\x1b[0;%dm", color);
va_start(arg, fmt);
vfprintf(stderr, fmt, arg);
va_end(arg);
- if (use_color) {
-#ifdef _WIN32
- SetConsoleTextAttribute(con, org_attributes);
-#else
+ if (use_printf_color)
fprintf(stderr, "\x1b[0m");
-#endif
- }
}
/* Deallocate a tree */
@@ -462,48 +443,51 @@ checkasm_context checkasm_context_buf;
/* Crash handling: attempt to catch crashes and handle them
* gracefully instead of just aborting abruptly. */
#ifdef _WIN32
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
if (!state.catch_signals)
return EXCEPTION_CONTINUE_SEARCH;
- const char *err;
+ int s;
switch (e->ExceptionRecord->ExceptionCode) {
case EXCEPTION_FLT_DIVIDE_BY_ZERO:
case EXCEPTION_INT_DIVIDE_BY_ZERO:
- err = "fatal arithmetic error";
+ s = SIGFPE;
break;
case EXCEPTION_ILLEGAL_INSTRUCTION:
case EXCEPTION_PRIV_INSTRUCTION:
- err = "illegal instruction";
+ s = SIGILL;
break;
case EXCEPTION_ACCESS_VIOLATION:
case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
case EXCEPTION_DATATYPE_MISALIGNMENT:
- case EXCEPTION_IN_PAGE_ERROR:
case EXCEPTION_STACK_OVERFLOW:
- err = "segmentation fault";
+ s = SIGSEGV;
+ break;
+ case EXCEPTION_IN_PAGE_ERROR:
+ s = SIGBUS;
break;
default:
return EXCEPTION_CONTINUE_SEARCH;
}
state.catch_signals = 0;
- checkasm_fail_func(err);
- checkasm_load_context();
+ checkasm_load_context(s);
return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */
}
+#endif
#else
+static void signal_handler(int s);
+
+static const struct sigaction signal_handler_act = {
+ .sa_handler = signal_handler,
+ .sa_flags = SA_RESETHAND,
+};
+
static void signal_handler(const int s) {
if (state.catch_signals) {
state.catch_signals = 0;
- checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
- s == SIGILL ? "illegal instruction" :
- "segmentation fault");
- checkasm_load_context();
- } else {
- /* fall back to the default signal handler */
- static const struct sigaction default_sa = { .sa_handler = SIG_DFL };
- sigaction(s, &default_sa, NULL);
- raise(s);
+ sigaction(s, &signal_handler_act, NULL);
+ checkasm_load_context(s);
}
}
#endif
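
The pattern above in isolation: the harness records a context with sigsetjmp, the handler re-arms itself and jumps back with the signal number, and the caller treats a non-zero return as a failed test rather than a fatal crash. A self-contained POSIX sketch (run_guarded, on_signal and faulty_test are illustrative names, not dav1d's API):

#define _POSIX_C_SOURCE 200809L
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf ctx;
static volatile sig_atomic_t catching;

static void on_signal(int s);
static const struct sigaction handler_act = {
    .sa_handler = on_signal,
    .sa_flags   = SA_RESETHAND, /* disposition resets when the handler runs... */
};

static void on_signal(int s) {
    if (catching) {
        catching = 0;
        sigaction(s, &handler_act, NULL); /* ...so re-arm it for the next test */
        siglongjmp(ctx, s);               /* hand the signal number back to sigsetjmp() */
    }
}

static int run_guarded(void (*test)(void)) {
    const int s = sigsetjmp(ctx, 1); /* 0 on the initial call, the signal number after a crash */
    if (s) {
        fprintf(stderr, "test crashed with signal %d\n", s);
        return 1;                    /* report a failure instead of aborting the whole run */
    }
    catching = 1;
    test();                          /* the code under test */
    catching = 0;
    return 0;
}

static void faulty_test(void) {
    raise(SIGSEGV);                  /* stand-in for a buggy asm function */
}

int main(void) {
    sigaction(SIGSEGV, &handler_act, NULL);
    sigaction(SIGILL,  &handler_act, NULL);
    sigaction(SIGFPE,  &handler_act, NULL);
    sigaction(SIGBUS,  &handler_act, NULL);
    return run_guarded(faulty_test);
}

Passing the signal number through sigsetjmp (rather than a fixed 1) is what lets the caller report which signal was caught, as checkasm_handle_signal() does above, and sigsetjmp(..., 1) saves the signal mask so that jumping out of the handler also unblocks the caught signal, which plain setjmp/longjmp would not guarantee.
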
@@ -567,6 +551,13 @@ static unsigned get_seed(void) {
#endif
}
+static int checkasm_strtoul(unsigned long *const dst, const char *const str, const int base) {
+ char *end;
+ errno = 0;
+ *dst = strtoul(str, &end, base);
+ return errno || end == str || *end;
+}
+
int main(int argc, char *argv[]) {
state.seed = get_seed();
@@ -612,15 +603,23 @@ int main(int argc, char *argv[]) {
} else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
state.verbose = 1;
} else if (!strncmp(argv[1], "--affinity=", 11)) {
- unsigned long affinity = strtoul(argv[1] + 11, NULL, 16);
+ const char *const s = argv[1] + 11;
+ unsigned long affinity;
+ if (checkasm_strtoul(&affinity, s, 16)) {
+ fprintf(stderr, "checkasm: invalid cpu affinity (%s)\n", s);
+ return 1;
+ }
#ifdef _WIN32
+ int affinity_err;
+ HANDLE process = GetCurrentProcess();
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) =
(void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets");
- HANDLE process = GetCurrentProcess();
- int affinity_err;
- if (spdcs) {
+ if (spdcs)
affinity_err = !spdcs(process, (ULONG[]){ affinity + 256 }, 1);
- } else {
+ else
+#endif
+ {
if (affinity < sizeof(DWORD_PTR) * 8)
affinity_err = !SetProcessAffinityMask(process, (DWORD_PTR)1 << affinity);
else
@@ -649,7 +648,12 @@ int main(int argc, char *argv[]) {
return 1;
#endif
} else {
- state.seed = (unsigned) strtoul(argv[1], NULL, 10);
+ unsigned long seed;
+ if (checkasm_strtoul(&seed, argv[1], 10)) {
+ fprintf(stderr, "checkasm: unknown option (%s)\n", argv[1]);
+ return 1;
+ }
+ state.seed = (unsigned)seed;
}
argc--;
@@ -657,7 +661,7 @@ int main(int argc, char *argv[]) {
}
#if TRIM_DSP_FUNCTIONS
- fprintf(stderr, "checkasm: reference functions unavailable\n");
+ fprintf(stderr, "checkasm: reference functions unavailable, reconfigure using '-Dtrim_dsp=false'\n");
return 0;
#endif
@@ -666,25 +670,27 @@ int main(int argc, char *argv[]) {
#ifdef _WIN32
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
AddVectoredExceptionHandler(0, signal_handler);
+
+ HANDLE con = GetStdHandle(STD_ERROR_HANDLE);
+ DWORD con_mode = 0;
+ use_printf_color = con && con != INVALID_HANDLE_VALUE &&
+ GetConsoleMode(con, &con_mode) &&
+ SetConsoleMode(con, con_mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
#endif
#else
- const struct sigaction sa = {
- .sa_handler = signal_handler,
- .sa_flags = SA_NODEFER,
- };
- sigaction(SIGBUS, &sa, NULL);
- sigaction(SIGFPE, &sa, NULL);
- sigaction(SIGILL, &sa, NULL);
- sigaction(SIGSEGV, &sa, NULL);
+ sigaction(SIGBUS, &signal_handler_act, NULL);
+ sigaction(SIGFPE, &signal_handler_act, NULL);
+ sigaction(SIGILL, &signal_handler_act, NULL);
+ sigaction(SIGSEGV, &signal_handler_act, NULL);
+
+ const char *const term = getenv("TERM");
+ use_printf_color = term && strcmp(term, "dumb") && isatty(2);
#endif
#ifdef readtime
if (state.bench) {
- static int testing = 0;
- checkasm_save_context();
- if (!testing) {
+ if (!checkasm_save_context()) {
checkasm_set_signal_handler_state(1);
- testing = 1;
readtime();
checkasm_set_signal_handler_state(0);
} else {
@@ -883,6 +889,16 @@ void checkasm_set_signal_handler_state(const int enabled) {
state.catch_signals = enabled;
}
+int checkasm_handle_signal(const int s) {
+ if (s) {
+ checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
+ s == SIGILL ? "illegal instruction" :
+ s == SIGBUS ? "bus error" :
+ "segmentation fault");
+ }
+ return s;
+}
+
static int check_err(const char *const file, const int line,
const char *const name, const int w, const int h,
int *const err)
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 67a2e42..eeda5df 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -33,18 +33,35 @@
#include <stdint.h>
#include <stdlib.h>
-#if ARCH_X86_64 && defined(_WIN32)
-/* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack,
- * which doesn't work for assembly functions without unwind information. */
+#ifdef _WIN32
#include <windows.h>
-#define checkasm_context CONTEXT
-#define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf)
-#define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL)
+#if ARCH_X86_32
+#include <setjmp.h>
+typedef jmp_buf checkasm_context;
+#define checkasm_save_context() checkasm_handle_signal(setjmp(checkasm_context_buf))
+#define checkasm_load_context(s) longjmp(checkasm_context_buf, s)
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+/* setjmp/longjmp on Windows on architectures using SEH (all except x86_32)
+ * will try to use SEH to unwind the stack, which doesn't work for assembly
+ * functions without unwind information. */
+typedef struct { CONTEXT c; int status; } checkasm_context;
+#define checkasm_save_context() \
+ (checkasm_context_buf.status = 0, \
+ RtlCaptureContext(&checkasm_context_buf.c), \
+ checkasm_handle_signal(checkasm_context_buf.status))
+#define checkasm_load_context(s) \
+ (checkasm_context_buf.status = s, \
+ RtlRestoreContext(&checkasm_context_buf.c, NULL))
+#else
+typedef void* checkasm_context;
+#define checkasm_save_context() 0
+#define checkasm_load_context() do {} while (0)
+#endif
#else
#include <setjmp.h>
-#define checkasm_context jmp_buf
-#define checkasm_save_context() setjmp(checkasm_context_buf)
-#define checkasm_load_context() longjmp(checkasm_context_buf, 1)
+typedef sigjmp_buf checkasm_context;
+#define checkasm_save_context() checkasm_handle_signal(sigsetjmp(checkasm_context_buf, 1))
+#define checkasm_load_context(s) siglongjmp(checkasm_context_buf, s)
#endif
#include "include/common/attributes.h"
@@ -75,6 +92,7 @@ int checkasm_fail_func(const char *msg, ...);
void checkasm_update_bench(int iterations, uint64_t cycles);
void checkasm_report(const char *name, ...);
void checkasm_set_signal_handler_state(int enabled);
+int checkasm_handle_signal(int s);
extern checkasm_context checkasm_context_buf;
/* float compare utilities */
@@ -179,6 +197,31 @@ static inline uint64_t readtime(void) {
return (((uint64_t)tbu) << 32) | (uint64_t)tbl;
}
#define readtime readtime
+#elif ARCH_RISCV
+#include <time.h>
+static inline uint64_t clock_gettime_nsec(void) {
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+ return ((uint64_t)ts.tv_sec*1000000000u) + (uint64_t)ts.tv_nsec;
+}
+#define readtime clock_gettime_nsec
+#elif ARCH_LOONGARCH
+static inline uint64_t readtime(void) {
+#if ARCH_LOONGARCH64
+ uint64_t a, id;
+ __asm__ __volatile__("rdtime.d %0, %1"
+ : "=r"(a), "=r"(id)
+ :: );
+ return a;
+#else
+ uint32_t a, id;
+ __asm__ __volatile__("rdtimel.w %0, %1"
+ : "=r"(a), "=r"(id)
+ :: );
+ return (uint64_t)a;
+#endif
+}
+#define readtime readtime
#endif
/* Verifies that clobbered callee-saved registers
@@ -283,6 +326,17 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
checkasm_set_signal_handler_state(0)
+#elif ARCH_RISCV
+#define declare_new(ret, ...)\
+ ret (*checked_call)(void *, int, int, int, int, int, int, int,\
+ __VA_ARGS__, int, int, int, int, int, int, int, int,\
+ int, int, int, int, int, int, int) =\
+ (void *)checkasm_checked_call;
+#define call_new(...)\
+ (checkasm_set_signal_handler_state(1),\
+ checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
+ 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
+ checkasm_set_signal_handler_state(0)
#else
#define declare_new(ret, ...)
#define call_new(...)\
diff --git a/tests/checkasm/ipred.c b/tests/checkasm/ipred.c
index 946ce73..ad54f1b 100644
--- a/tests/checkasm/ipred.c
+++ b/tests/checkasm/ipred.c
@@ -65,6 +65,16 @@ static const uint8_t z_angles[27] = {
81, 84, 87
};
+/* Generate max_width/max_height values that cover all edge cases */
+static int gen_z2_max_wh(const int sz) {
+ const int n = rnd();
+ if (n & (1 << 17)) /* edge block */
+ return (n & (sz - 1)) + 1;
+ if (n & (1 << 16)) /* max size, exceeds uint16_t */
+ return 65536;
+ return (n & 65535) + 1;
+}
+
static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
PIXEL_RECT(c_dst, 64, 64);
PIXEL_RECT(a_dst, 64, 64);
@@ -98,9 +108,8 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
(rnd() & 0x600);
if (mode == Z2_PRED) {
- maxw = rnd(), maxh = rnd();
- maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
- maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+ maxw = gen_z2_max_wh(w);
+ maxh = gen_z2_max_wh(h);
}
} else if (mode == FILTER_PRED) /* filter_idx */
a = (rnd() % 5) | (rnd() & ~511);
diff --git a/tests/checkasm/msac.c b/tests/checkasm/msac.c
index b9c89b4..81fd593 100644
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -266,6 +266,14 @@ void checkasm_check_msac(void) {
c.decode_bool = dav1d_msac_decode_bool_neon;
c.decode_hi_tok = dav1d_msac_decode_hi_tok_neon;
}
+#elif ARCH_LOONGARCH64 && HAVE_ASM
+ if (dav1d_get_cpu_flags() & DAV1D_LOONGARCH_CPU_FLAG_LSX) {
+ c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_lsx;
+ c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_lsx;
+ c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_lsx;
+ c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_lsx;
+ c.decode_bool = dav1d_msac_decode_bool_lsx;
+ }
#elif ARCH_X86 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
diff --git a/tests/checkasm/riscv/checkasm_64.S b/tests/checkasm/riscv/checkasm_64.S
new file mode 100644
index 0000000..0d02e5f
--- /dev/null
+++ b/tests/checkasm/riscv/checkasm_64.S
@@ -0,0 +1,252 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#define PRIVATE_PREFIX checkasm_
+
+#include "src/riscv/asm.S"
+
+// max number of args used by any asm function.
+#define MAX_ARGS 15
+
+// + 16 for stack canary reference
+#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16)
+
+const register_init, align=4
+ .quad 0x68909d060f4a7fdd
+ .quad 0x924f739e310218a1
+ .quad 0xb988385a8254174c
+ .quad 0x4c1110430bf09fd7
+ .quad 0x2b310edf6a5d7ecf
+ .quad 0xda8112e98ddbb559
+ .quad 0x6da5854aa2f84b62
+ .quad 0x72b761199e9b1f38
+ .quad 0x13f27aa74ae5dcdf
+ .quad 0x36a6c12a7380e827
+ .quad 0x5c452889aefc8548
+ .quad 0x6a9ea1ddb236235f
+ .quad 0x0449854bdfc94b1e
+ .quad 0x4f849b7076a156f5
+ .quad 0x1baa4275e734930e
+ .quad 0x77df3503ba3e073d
+ .quad 0x6060e073705a4bf2
+ .quad 0xa7b482508471e44b
+ .quad 0xd296a3158d6da2b9
+ .quad 0x1c0ed711a93d970b
+ .quad 0x9359537fdd79569d
+ .quad 0x2b1dc95c1e232d62
+ .quad 0xab06cd578e2bb5a0
+ .quad 0x4100b4987a0af30f
+ .quad 0x2523e36f9bb1e36f
+ .quad 0xfb0b815930c6d25c
+ .quad 0x89acc810c2902fcf
+ .quad 0xa65854b4c2b381f1
+ .quad 0x78150d69a1accedf
+ .quad 0x057e24868e022de1
+ .quad 0x88f6e79ed4b8d362
+ .quad 0x1f4a420e262c9035
+endconst
+
+const error_message_register
+error_message_rsvd:
+ .asciz "unallocatable register clobbered"
+error_message_sreg:
+ .asciz "callee-saved integer register s%i modified"
+error_message_fsreg:
+ .asciz "callee-saved floating-point register fs%i modified"
+error_message_stack:
+ .asciz "stack clobbered"
+endconst
+
+thread_local saved_regs, quads=29 # 5 + 12 + 12
+
+function checked_call, export=1, ext=v
+ /* Save the function ptr, RA, SP, unallocatable and callee-saved registers */
+ la.tls.ie t0, saved_regs
+ add t0, tp, t0
+ sd a0, (t0)
+ sd ra, 8(t0)
+ sd sp, 16(t0)
+ sd gp, 24(t0)
+ sd tp, 32(t0)
+.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ sd s\n, 40 + 16*\n(t0)
+#ifdef __riscv_float_abi_double
+ fsd fs\n, 48 + 16*\n(t0)
+#endif
+.endr
+
+ /* Check for vector extension */
+ call dav1d_get_cpu_flags_riscv
+ and a0, a0, 1 # DAV1D_RISCV_CPU_FLAG_RVV
+ beqz a0, 0f
+
+ /* Clobber vector configuration */
+ vsetvli t0, zero, e32, m8, ta, ma
+ lla t0, register_init
+ ld t0, (t0)
+.irp n, 0, 8, 16, 24
+ vmv.v.x v0, t0
+.endr
+ li t0, -1 << 31
+ vsetvl zero, zero, t0
+ csrwi vxrm, 3
+ csrwi vxsat, 1
+
+0:
+ /* Load the register arguments */
+.irp n, 0, 1, 2, 3, 4, 5, 6, 7
+ ld a\n, 8*\n(sp)
+.endr
+
+ /* Load the stack arguments */
+.irp n, 8, 9, 10, 11, 12, 13, 14, 15
+ ld t0, 8*\n(sp)
+ sd t0, 8*(\n - 8) - ARG_STACK(sp)
+.endr
+
+ /* Setup the stack canary */
+ ld t0, MAX_ARGS*8(sp)
+ addi sp, sp, -ARG_STACK
+ slli t0, t0, 3
+ add t0, t0, sp
+ ld t0, (t0)
+ not t0, t0
+ sd t0, ARG_STACK - 8(sp)
+
+ /* Clobber the stack space right below SP */
+ lla t0, register_init
+ ld t1, (t0)
+.rept 16
+ addi sp, sp, -16
+ sd t1, (sp)
+ sd t1, 8(sp)
+.endr
+ addi sp, sp, 16*16
+
+ /* Clobber the callee-saved and temporary registers */
+.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+.if (\n > 0 && \n < 7)
+ ld t\n, 16*\n(t0)
+.endif
+ ld s\n, 8 + 8*\n(t0)
+#ifdef __riscv_float_abi_double
+ fld ft\n, 16 + 16*\n(t0)
+ fld fs\n, 24 + 8*\n(t0)
+#endif
+.endr
+
+ /* Call the checked function */
+ la.tls.ie t0, saved_regs
+ add t0, tp, t0
+ ld t0, (t0)
+ jalr t0
+
+ /* Check the value of callee-saved registers */
+ lla t0, register_init
+.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ ld t1, 8 + 8*\n(t0)
+ li a1, \n
+ bne t1, s\n, 2f
+#ifdef __riscv_float_abi_double
+ ld t1, 24 + 8*\n(t0)
+ fmv.x.d t2, fs\n
+ bne t1, t2, 3f
+#endif
+.endr
+
+ /* Check unallocatable register values */
+ la.tls.ie t0, saved_regs
+ add t0, tp, t0
+ ld t1, 16(t0)
+ addi t1, t1, -ARG_STACK
+ bne t1, sp, 4f
+ ld t1, 24(t0)
+ bne t1, gp, 4f
+ ld t1, 32(t0)
+ bne t1, tp, 4f
+
+ /* Check the stack canary */
+ ld t0, ARG_STACK + MAX_ARGS*8(sp)
+ slli t0, t0, 3
+ add t0, t0, sp
+ ld t0, (t0)
+ not t0, t0
+ ld t1, ARG_STACK - 8(sp)
+ bne t0, t1, 5f
+
+1:
+ /* Restore RA, SP and callee-saved registers from thread local storage */
+ la.tls.ie t0, saved_regs
+ add t0, tp, t0
+ ld ra, 8(t0)
+ ld sp, 16(t0)
+.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ ld s\n, 40 + 16*\n(t0)
+#ifdef __riscv_float_abi_double
+ fld fs\n, 48 + 16*\n(t0)
+#endif
+.endr
+ ret
+
+2:
+ lla a0, error_message_sreg
+#ifdef PREFIX
+ call _checkasm_fail_func
+#else
+ call checkasm_fail_func
+#endif
+ j 1b
+
+#ifdef __riscv_float_abi_double
+3:
+ lla a0, error_message_fsreg
+#ifdef PREFIX
+ call _checkasm_fail_func
+#else
+ call checkasm_fail_func
+#endif
+ j 1b
+#endif
+
+4:
+ lla a0, error_message_rsvd
+#ifdef PREFIX
+ call _checkasm_fail_func
+#else
+ call checkasm_fail_func
+#endif
+ j 1b
+
+5:
+ lla a0, error_message_stack
+#ifdef PREFIX
+ call _checkasm_fail_func
+#else
+ call checkasm_fail_func
+#endif
+ j 1b
+endfunc
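
The stack canary mentioned near the top of this file works as follows: checked_call stores the bitwise NOT of a reference word just past the copied stack arguments, calls the function under test, then re-derives and compares the word, reporting "stack clobbered" on a mismatch. A rough C analogue of the idea (call_with_canary and ok_fn are hypothetical; the real check is done relative to the stack pointer in asm):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Give the callee a scratch area, plant an inverted reference word just past
 * the space it is allowed to use, and verify the word after the call. */
static int call_with_canary(void (*fn)(uint8_t *scratch, size_t len)) {
    uint8_t area[256 + 8];
    const uint64_t canary = ~UINT64_C(0x68909d060f4a7fdd); /* inverted reference value */
    memcpy(area + 256, &canary, sizeof(canary));            /* plant the canary */

    fn(area, 256);                                          /* function under test */

    uint64_t check;
    memcpy(&check, area + 256, sizeof(check));
    if (check != canary) {
        fputs("stack clobbered\n", stderr);                 /* same diagnostic the asm reports */
        return 1;
    }
    return 0;
}

static void ok_fn(uint8_t *scratch, size_t len) {
    memset(scratch, 0, len); /* stays within bounds, so the canary survives */
}

int main(void) {
    return call_with_canary(ok_fn);
}
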
diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash
index 0c35663..27a8d61 100755
--- a/tests/dav1d_argon.bash
+++ b/tests/dav1d_argon.bash
@@ -132,7 +132,8 @@ for d in "${dirs[@]}"; do
fi
done
-if [ ${#files[@]} -eq 0 ]; then
+num_files="${#files[@]}"
+if [ "$num_files" -eq 0 ]; then
error "Error! No files found at ${dirs[*]}"
fi
@@ -148,17 +149,17 @@ for i in "${!files[@]}"; do
md5=$(<"${md5/%obu/md5}") || error "Error! Can't read md5 ${md5} for file ${f}"
md5=${md5/ */}
- printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "$f"
+ printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
if [ "$JOBS" -gt 1 ]; then
"${cmd[@]}" 2>/dev/null &
p=$!
pids+=("$p")
- declare "file$p=$f"
+ declare "file$p=${f#"$ARGON_DIR"/}"
block_pids
else
if ! "${cmd[@]}" 2>/dev/null; then
- fail "$f"
+ fail "${f#"$ARGON_DIR"/}"
fi
fi
done
@@ -166,9 +167,9 @@ done
wait_all_pids
if [ "$failed" -ne 0 ]; then
- printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "${#files[@]}"
+ printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "$num_files"
else
- printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "${#files[@]}"
+ printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "$num_files"
fi
printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info"
diff --git a/tests/meson.build b/tests/meson.build
index ef8c21e..11db0a5 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -69,6 +69,8 @@ if is_asm_enabled
checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
elif host_machine.cpu_family().startswith('arm')
checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
+ elif host_machine.cpu_family() == 'riscv64'
+ checkasm_asm_sources += files('checkasm/riscv/checkasm_64.S')
elif host_machine.cpu_family().startswith('x86')
checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
endif
@@ -128,7 +130,7 @@ endforeach
subdir('libfuzzer')
# seek stress test binary, depends on dav1d cli tool
-if get_option('enable_tools')
+if (get_option('enable_tools') and get_option('enable_seek_stress'))
seek_stress_sources = files('seek_stress.c')
seek_stress = executable('seek_stress',
seek_stress_sources, rev_target,
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
index 4d747c0..5fdbab3 100644
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -101,8 +101,12 @@ static const struct option long_opts[] = {
#if ARCH_AARCH64 || ARCH_ARM
#define ALLOWED_CPU_MASKS " or 'neon'"
+#elif ARCH_LOONGARCH
+#define ALLOWED_CPU_MASKS ", 'lsx' or 'lasx'"
#elif ARCH_PPC64LE
#define ALLOWED_CPU_MASKS " or 'vsx'"
+#elif ARCH_RISCV
+#define ALLOWED_CPU_MASKS " or 'rvv'"
#elif ARCH_X86
#define ALLOWED_CPU_MASKS \
", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512icl'"
@@ -216,8 +220,13 @@ enum CpuMask {
static const EnumParseTable cpu_mask_tbl[] = {
#if ARCH_AARCH64 || ARCH_ARM
{ "neon", DAV1D_ARM_CPU_FLAG_NEON },
+#elif ARCH_LOONGARCH
+ { "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX },
+ { "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX },
#elif ARCH_PPC64LE
{ "vsx", DAV1D_PPC_CPU_FLAG_VSX },
+#elif ARCH_RISCV
+ { "rvv", DAV1D_RISCV_CPU_FLAG_V },
#elif ARCH_X86
{ "sse2", X86_CPU_MASK_SSE2 },
{ "ssse3", X86_CPU_MASK_SSSE3 },