author     Harish Mahendrakar <hmahendrakar@google.com>  2024-03-23 01:03:18 +0000
committer  Harish Mahendrakar <hmahendrakar@google.com>  2024-03-23 01:15:42 +0000
commit     a95116abf726e72cdd76e54abb68aeecfa95d777 (patch)
tree       4727ae093428809b63c0fd37e24dd77752d9ec05
parent     147f3dbab39940634865e3c66a5b81c02f5d43fd (diff)
parent     872e470ebf3e65b0b956f3a70329e885a2df1c2a (diff)
download   libdav1d-a95116abf726e72cdd76e54abb68aeecfa95d777.tar.gz
Upgrade libdav1d to 1.4.1
This project was upgraded with external_updater.
Usage: tools/external_updater/updater.sh update external/libdav1d
For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md

Bug: 330952417
Test: atest CtsMediaV2TestCases -- --module-arg \
      CtsMediaV2TestCases:instrumentation-arg:codec-prefix:=c2.android.av1
Change-Id: I140e0975b742e45ec1a03d3b1c2644c9eb684f6d
-rw-r--r--  .gitlab-ci.yml | 72
-rw-r--r--  METADATA | 6
-rw-r--r--  NEWS | 22
-rw-r--r--  THANKS.md | 33
-rw-r--r--  gcovr.cfg | 2
-rw-r--r--  meson.build | 77
-rw-r--r--  package/crossfiles/aarch64-linux-clang.meson | 16
-rw-r--r--  package/crossfiles/aarch64-linux.meson | 12
-rw-r--r--  package/crossfiles/riscv64-linux-clang.meson | 16
-rw-r--r--  src/arm/32/itx.S | 79
-rw-r--r--  src/arm/32/itx16.S | 19
-rw-r--r--  src/arm/32/msac.S | 167
-rw-r--r--  src/arm/64/itx.S | 99
-rw-r--r--  src/arm/64/itx16.S | 21
-rw-r--r--  src/arm/64/mc.S | 411
-rw-r--r--  src/arm/64/mc16.S | 373
-rw-r--r--  src/arm/64/msac.S | 167
-rw-r--r--  src/arm/64/util.S | 41
-rw-r--r--  src/arm/asm.S | 44
-rw-r--r--  src/arm/cpu.c | 137
-rw-r--r--  src/arm/cpu.h | 4
-rw-r--r--  src/arm/itx.h | 4
-rw-r--r--  src/arm/msac.h | 2
-rw-r--r--  src/cpu.h | 14
-rw-r--r--  src/ext/x86/x86inc.asm | 72
-rw-r--r--  src/itx_1d.c | 5
-rw-r--r--  src/itx_tmpl.c | 10
-rw-r--r--  src/loongarch/msac.S | 216
-rw-r--r--  src/msac.c | 58
-rw-r--r--  src/ppc/cdef_tmpl.c | 399
-rw-r--r--  src/riscv/64/itx.S | 1065
-rw-r--r--  src/riscv/asm.S | 2
-rw-r--r--  src/riscv/itx.h | 12
-rw-r--r--  src/x86/looprestoration_sse.asm | 8
-rw-r--r--  src/x86/msac.asm | 172
-rw-r--r--  tests/checkasm/arm/checkasm_64.S | 10
-rw-r--r--  tests/checkasm/checkasm.c | 90
-rw-r--r--  tests/checkasm/mc.c | 2
-rw-r--r--  tests/checkasm/msac.c | 27
-rw-r--r--  tests/checkasm/riscv/checkasm_64.S | 5
-rwxr-xr-x  tests/dav1d_argon.bash | 16
-rw-r--r--  tools/dav1d_cli_parse.c | 8
42 files changed, 2939 insertions, 1076 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 702f284..a3cf425 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,7 +4,7 @@ stages:
- test
.debian-amd64-common:
- image: registry.videolan.org/dav1d-debian-unstable:20240113214804
+ image: registry.videolan.org/dav1d-debian-unstable:20240226203953
stage: build
tags:
- docker
@@ -220,7 +220,7 @@ build-debian-avx512:
- cd build
- time meson test -v --suite checkasm
-build-debian-clang14:
+build-debian-clang:
extends: .debian-amd64-common
variables:
CC: clang
@@ -355,6 +355,18 @@ build-debian-aarch64-clang-5:
- ninja -C build
- cd build && meson test -v
+build-debian-aarch64-clang-17:
+ extends: .debian-amd64-common
+ variables:
+ QEMU_LD_PREFIX: /usr/aarch64-linux-gnu/
+ script:
+ - meson setup build --buildtype release
+ -Dtrim_dsp=false
+ --werror
+ --cross-file package/crossfiles/aarch64-linux-clang.meson
+ - ninja -C build
+ - cd build && meson test -v
+
build-macos:
stage: build
tags:
@@ -427,9 +439,12 @@ build-debian-riscv64:
- meson setup build --buildtype release
-Dtrim_dsp=false
--werror
- --cross-file package/crossfiles/riscv64-linux.meson
+ --cross-file package/crossfiles/${CROSSFILE}.meson
- ninja -C build
- cd build && meson test -v
+ parallel:
+ matrix:
+ - CROSSFILE: [riscv64-linux, riscv64-linux-clang]
build-debian-loongarch64:
extends: .debian-amd64-common
@@ -734,6 +749,33 @@ test-debian-riscv64:
"rv64,v=true,vext_spec=v1.0,vlen=512,elen=64",
"rv64,v=true,vext_spec=v1.0,vlen=1024,elen=64" ]
+test-debian-aarch64-qemu:
+ extends:
+ - .debian-amd64-common
+ - .test-common
+ needs: ["build-debian-aarch64"]
+ script:
+ - meson setup build --buildtype release
+ -Dtestdata_tests=true
+ -Dlogging=false
+ -Dtrim_dsp=false
+ --cross-file package/crossfiles/aarch64-linux.meson
+ - ninja -C build
+ - cd build && time meson test -v --timeout-multiplier 2
+ variables:
+ QEMU_LD_PREFIX: /usr/aarch64-linux-gnu/
+ parallel:
+ matrix:
+ # sve-default-vector-length sets the max vector length in bytes;
+ # the default is 64, allowing up to 512 bit vectors. Testing 1024
+ # and 2048 bit vectors requires raising this limit. The sve<n>
+ # option sets the active vector length in bits.
+ - QEMU_CPU: [ "max,sve-default-vector-length=256,sve128=on",
+ "max,sve-default-vector-length=256,sve256=on",
+ "max,sve-default-vector-length=256,sve512=on",
+ "max,sve-default-vector-length=256,sve1024=on",
+ "max,sve-default-vector-length=256,sve2048=on" ]
+
test-debian-armv7-clang-5:
extends:
- .debian-armv7-common
@@ -785,11 +827,11 @@ test-debian-argon:
- avx2
script:
- *test-argon-script
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c 0 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse2 -g 0 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c ssse3 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse41 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c avx2 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 1 -c 0 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c sse2 -g 0 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 3 -c ssse3 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 4 -c sse41 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 5 -c avx2 || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-debian32-argon:
@@ -804,9 +846,9 @@ test-debian32-argon:
--cross-file package/crossfiles/i686-linux32.meson
- cd build && ninja
- exit_code=0
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse2 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c ssse3 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse41 -g 0 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c sse2 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c ssse3 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c sse41 -g 0 || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-debian-argon-avx512:
@@ -819,7 +861,7 @@ test-debian-argon-avx512:
- amd64-avx512
script:
- *test-argon-script
- - ../tests/dav1d_argon.bash -t 2 -j 1 -c avx512icl || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c avx512icl || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
.test-debian-arm-argon:
@@ -831,9 +873,9 @@ test-debian-argon-avx512:
-Dtrim_dsp=false
- cd build && ninja
- exit_code=0
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c 0 || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c neon || exit_code=$((exit_code + $?))
- - ../tests/dav1d_argon.bash -t 2 -j 4 -c neon -g 0 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c 0 || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c neon || exit_code=$((exit_code + $?))
+ - ../tests/dav1d_argon.bash -t 2 -c neon -g 0 || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-debian-armv7-argon:
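
Note: the new test-debian-aarch64-qemu job above runs the aarch64 build under QEMU with SVE
enabled at several vector lengths. A rough local equivalent of one matrix entry is sketched
below (an illustration only; it assumes an aarch64-linux-gnu cross toolchain, qemu-aarch64
and the dav1d test data are available, as in the CI image):

    # Cross-build exactly as the CI job does, using the new cross file.
    meson setup build --buildtype release \
        -Dtestdata_tests=true -Dlogging=false -Dtrim_dsp=false \
        --cross-file package/crossfiles/aarch64-linux.meson
    ninja -C build

    # Run the test suite under QEMU with 512-bit SVE vectors enabled,
    # mirroring one entry of the QEMU_CPU matrix above.
    cd build
    QEMU_LD_PREFIX=/usr/aarch64-linux-gnu/ \
    QEMU_CPU="max,sve-default-vector-length=256,sve512=on" \
        meson test -v --timeout-multiplier 2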
diff --git a/METADATA b/METADATA
index 8298a7f..fbf04db 100644
--- a/METADATA
+++ b/METADATA
@@ -8,13 +8,13 @@ third_party {
license_type: NOTICE
last_upgrade_date {
year: 2024
- month: 2
- day: 14
+ month: 3
+ day: 23
}
homepage: "https://code.videolan.org/videolan/dav1d/"
identifier {
type: "Git"
value: "https://code.videolan.org/videolan/dav1d.git"
- version: "1.4.0"
+ version: "1.4.1"
}
}
diff --git a/NEWS b/NEWS
index f74af58..88b1eea 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,17 @@
+Changes for 1.4.1 'Road Runner':
+--------------------------------
+
+1.4.1 is a small release of dav1d, improving notably ARM and RISC-V speed
+
+- Optimizations for 6tap filters for NEON (ARM)
+- More RISC-V optimizations for itx (4x8, 8x4, 4x16, 16x4, 8x16, 16x8)
+- Reduction of binary size on ARM64, ARM32 and RISC-V
+- Fix out-of-bounds read in 8bpc SSE2/SSSE3 wiener_filter
+- Msac optimizations
+
+
Changes for 1.4.0 'Road Runner':
-------------------------------------------------------
+--------------------------------
1.4.0 is a medium release of dav1d, focusing on new architecture support and optimizations
@@ -9,7 +21,7 @@ Changes for 1.4.0 'Road Runner':
- New architecture supported: RISC-V
- RISC-V optimizations for itx
- Misc improvements in threading and in reducing binary size
-- Fix potential integer overflow with extremely large frame sizes
+- Fix potential integer overflow with extremely large frame sizes (CVE-2024-1580)
Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)':
@@ -26,7 +38,7 @@ Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)':
Changes for 1.2.1 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
1.2.1 is a small release of dav1d, adding more SIMD and fixes
@@ -42,7 +54,7 @@ Changes for 1.2.1 'Arctic Peregrine Falcon':
Changes for 1.2.0 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
1.2.0 is a small release of dav1d, adding more SIMD and fixes
@@ -55,7 +67,7 @@ Changes for 1.2.0 'Arctic Peregrine Falcon':
Changes for 1.1.0 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
1.1.0 is an important release of dav1d, fixing numerous bugs, and adding SIMD
diff --git a/THANKS.md b/THANKS.md
index 4fc8d27..b7aa200 100644
--- a/THANKS.md
+++ b/THANKS.md
@@ -16,19 +16,20 @@ The Alliance for Open Media (AOM) for partially funding this project.
And all the dav1d Authors (git shortlog -sn), including:
-Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer,
-Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz,
-Jean-Baptiste Kempf, Luc Trudeau, Hugo Beauzée-Luyssen, Konstantin Pavlov,
-Niklas Haas, David Michael Barr, Steve Lhomme, Nathan E. Egge, Wan-Teh Chang,
-Kyle Siefring, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Luca Barbato,
-David Conrad, Derek Buitenhuis, Jan Beich, Michael Bradshaw, Raphaël Zumer,
-Xuefeng Jiang, Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis,
-Emmanuel Gil Peyrot, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
-Thomas Daede, Colin Lee, Jonathan Wright, Lynne, Michail Alvanos, Nico Weber,
-Salome Thirot, SmilingWolf, Tristan Laurent, Vittorio Giovara, Yannis Guyon,
-André Kempe, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
-Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, Mark Shuttleworth,
-Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, Pablo Stebler, Rostislav
-Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
-Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens,
-Xu Guangxin, kossh1 and skal.
+Henrik Gramner, Martin Storsjö, Ronald S. Bultje, Janne Grunau, James Almer,
+Victorien Le Couviour--Tuffet, Matthias Dressel, Nathan E. Egge,
+Jean-Baptiste Kempf, Marvin Scholz, Luc Trudeau, Niklas Haas,
+Hugo Beauzée-Luyssen, Konstantin Pavlov, David Michael Barr, Steve Lhomme,
+yuanhecai, Luca Barbato, Wan-Teh Chang, Kyle Siefring, B Krishnan Iyer,
+Francois Cartegnie, Liwei Wang, David Conrad, Derek Buitenhuis, Jan Beich,
+Michael Bradshaw, Raphaël Zumer, Xuefeng Jiang, Arpad Panyik, Christophe Gisquet,
+Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, Raphael Zumer,
+Rupert Swarbrick, Thierry Foucu, Thomas Daede, jinbo, André Kempe, Colin Lee,
+Jonathan Wright, Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf,
+Tristan Laurent, Tristan Matthews, Vittorio Giovara, Yannis Guyon,
+Andrey Semashev, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
+Ewout ter Hoeven, Fred Barbier, Hao Chen, Jean-Yves Avenard, Joe Drago,
+Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli,
+Pablo Stebler, Rostislav Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen,
+Sylvain BERTRAND, Sylvestre Ledru, Timo Gurr, Vibhoothi,
+Vignesh Venkatasubramanian, Xavier Claessens, Xu Guangxin, kossh1 and skal.
diff --git a/gcovr.cfg b/gcovr.cfg
index d09a0ec..e02ae33 100644
--- a/gcovr.cfg
+++ b/gcovr.cfg
@@ -1,4 +1,4 @@
exclude = .*/tests/.*
exclude = .*/tools/.*
exclude = .*/include/common/dump.h
-gcov-ignore-parse-errors = yes
+gcov-ignore-parse-errors = negative_hits.warn
diff --git a/meson.build b/meson.build
index 6e49852..e371415 100644
--- a/meson.build
+++ b/meson.build
@@ -23,7 +23,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
- version: '1.4.0',
+ version: '1.4.1',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
@@ -309,6 +309,10 @@ if (host_machine.system() in ['darwin', 'ios', 'tvos'] and cc.get_id() == 'clang
optional_arguments += '-fno-stack-check'
endif
+if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm'))
+ optional_arguments += '-fno-align-functions'
+endif
+
add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c')
@@ -365,6 +369,66 @@ if (is_asm_enabled and
if cc.compiles(check_pic_code)
cdata.set('PIC', '3')
endif
+
+ if host_machine.cpu_family() == 'aarch64'
+ have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''')
+ cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
+ as_arch_str = ''
+ if have_as_arch
+ as_arch_level = 'armv8-a'
+ # Check what .arch levels are supported. In principle, we only
+ # want to detect up to armv8.2-a here (binutils requires that
+ # in order to enable i8mm). However, older Clang versions
+ # (before Clang 17, and Xcode versions up to and including 15.0)
+ # didn't support controlling dotprod/i8mm extensions via
+ # .arch_extension, therefore try to enable a high enough .arch
+ # level as well, to implicitly make them available via that.
+ foreach arch : ['armv8.2-a', 'armv8.4-a', 'armv8.6-a']
+ if cc.compiles('__asm__ (".arch ' + arch + '\\n");')
+ as_arch_level = arch
+ endif
+ endforeach
+ # Clang versions before 17 also had a bug
+ # (https://github.com/llvm/llvm-project/issues/32220)
+ # causing a plain ".arch <level>" to not have any effect unless it
+ # had an extra "+<feature>" included - but it was activated on the
+ # next ".arch_extension" directive instead. Check if we can include
+ # "+crc" as dummy feature to make the .arch directive behave as
+ # expected and take effect right away.
+ if cc.compiles('__asm__ (".arch ' + as_arch_level + '+crc\\n");')
+ as_arch_level = as_arch_level + '+crc'
+ endif
+ cdata.set('AS_ARCH_LEVEL', as_arch_level)
+ as_arch_str = '".arch ' + as_arch_level + '\\n"'
+ endif
+ extensions = {
+ 'dotprod': 'udot v0.4s, v0.16b, v0.16b',
+ 'i8mm': 'usdot v0.4s, v0.16b, v0.16b',
+ 'sve': 'whilelt p0.s, x0, x1',
+ 'sve2': 'sqrdmulh z0.s, z0.s, z0.s',
+ }
+ foreach name, instr : extensions
+ # Test for support for the various extensions. First test if
+ # the assembler supports the .arch_extension directive for
+ # enabling/disabling the extension, then separately check whether
+ # the instructions themselves are supported. Even if .arch_extension
+ # isn't supported, we may be able to assemble the instructions
+ # if the .arch level includes support for them.
+ code = '__asm__ (' + as_arch_str
+ code += '".arch_extension ' + name + '\\n"'
+ code += ');'
+ supports_archext = cc.compiles(code)
+ cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext)
+ code = '__asm__ (' + as_arch_str
+ if supports_archext
+ code += '".arch_extension ' + name + '\\n"'
+ endif
+ code += '"' + instr + '\\n"'
+ code += ');'
+ supports_instr = cc.compiles(code, name: name.to_upper())
+ cdata.set10('HAVE_' + name.to_upper(), supports_instr)
+ endforeach
+ endif
endif
cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
@@ -477,6 +541,17 @@ if (is_asm_enabled and
])
endif
+if is_asm_enabled and host_machine.cpu_family().startswith('riscv')
+ as_option_code = '''__asm__ (
+".option arch, +v\n"
+"vsetivli zero, 0, e8, m1, ta, ma"
+);
+'''
+ if not cc.compiles(as_option_code, name : 'RISC-V Vector')
+ error('Compiler doesn\'t support \'.option arch\' asm directive. Update to binutils>=2.38 or clang>=17 or use \'-Denable_asm=false\'.')
+ endif
+endif
+
# Generate config.h
config_h_target = configure_file(output: 'config.h', configuration: cdata)
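
Note: the new aarch64 block above detects assembler capabilities by compiling small __asm__
probes and records the results in config.h (HAVE_AS_ARCH_DIRECTIVE, HAVE_AS_ARCHEXT_*_DIRECTIVE,
HAVE_DOTPROD, HAVE_I8MM, HAVE_SVE, HAVE_SVE2). The probes can be reproduced by hand to see what a
given toolchain accepts; a sketch, assuming an aarch64-linux-gnu-gcc cross compiler (the exact
invocation meson issues may differ):

    # Does the assembler accept a plain .arch directive? (HAVE_AS_ARCH_DIRECTIVE)
    printf '%s\n' '__asm__ (".arch armv8-a");' \
        | aarch64-linux-gnu-gcc -x c -c - -o /dev/null

    # Can dotprod be enabled via .arch_extension and does the udot instruction
    # assemble? (roughly what sets HAVE_DOTPROD)
    printf '%s\n' '__asm__ (".arch armv8.2-a+crc\n" ".arch_extension dotprod\n" "udot v0.4s, v0.16b, v0.16b\n");' \
        | aarch64-linux-gnu-gcc -x c -c - -o /dev/null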
diff --git a/package/crossfiles/aarch64-linux-clang.meson b/package/crossfiles/aarch64-linux-clang.meson
new file mode 100644
index 0000000..2d218c7
--- /dev/null
+++ b/package/crossfiles/aarch64-linux-clang.meson
@@ -0,0 +1,16 @@
+[binaries]
+c = 'clang'
+cpp = 'clang++'
+ar = 'aarch64-linux-gnu-ar'
+strip = 'aarch64-linux-gnu-strip'
+exe_wrapper = 'qemu-aarch64'
+
+[properties]
+c_args = '-target aarch64-linux-gnu'
+c_link_args = '-target aarch64-linux-gnu'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'aarch64'
+cpu = 'aarch64'
+endian = 'little'
diff --git a/package/crossfiles/aarch64-linux.meson b/package/crossfiles/aarch64-linux.meson
new file mode 100644
index 0000000..7dae0fc
--- /dev/null
+++ b/package/crossfiles/aarch64-linux.meson
@@ -0,0 +1,12 @@
+[binaries]
+c = 'aarch64-linux-gnu-gcc'
+cpp = 'aarch64-linux-gnu-g++'
+ar = 'aarch64-linux-gnu-ar'
+strip = 'aarch64-linux-gnu-strip'
+exe_wrapper = 'qemu-aarch64'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'aarch64'
+cpu = 'aarch64'
+endian = 'little'
diff --git a/package/crossfiles/riscv64-linux-clang.meson b/package/crossfiles/riscv64-linux-clang.meson
new file mode 100644
index 0000000..c16d74d
--- /dev/null
+++ b/package/crossfiles/riscv64-linux-clang.meson
@@ -0,0 +1,16 @@
+[binaries]
+c = 'clang'
+cpp = 'clang++'
+ar = 'riscv64-linux-gnu-ar'
+strip = 'riscv64-linux-gnu-strip'
+exe_wrapper = 'qemu-riscv64'
+
+[properties]
+c_args = '-target riscv64-linux-gnu'
+c_link_args = '-target riscv64-linux-gnu'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'riscv64'
+cpu = 'riscv64'
+endian = 'little'
diff --git a/src/arm/32/itx.S b/src/arm/32/itx.S
index ceea025..9ba1df7 100644
--- a/src/arm/32/itx.S
+++ b/src/arm/32/itx.S
@@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blx r4
@@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon
vrshr.s16 q13, q13, #1
vrshr.s16 q14, q14, #1
vrshr.s16 q15, q15, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
blx r5
@@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon
load_add_store_8x8 r0, r7
vpop {q4-q7}
pop {r4-r5,r7,pc}
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon
.else
identity_4x16_shift1 d0[0]
.endif
+ b L(horz_16x4_epilog)
.else
blx r4
-.endif
-.if \shift > 0
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x4h q8, q9, d16, d17, d18, d19
transpose_4x4h q10, q11, d20, d21, d22, d23
transpose_4x4h q12, q13, d24, d25, d26, d27
@@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
pop {pc}
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
-def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
-def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
@@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_4x16_shift1 d0[0]
+
+ b L(itx_16x4_epilog)
.else
vmov.i16 q2, #0
vmov.i16 q3, #0
@@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon
vswp d19, d22
vswp d18, d20
vswp d19, d21
-.irp i, q8, q9, q10, q11
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #1
.endr
-.endif
+
+L(itx_16x4_epilog):
transpose_4x8h q8, q9, q10, q11
blx r5
mov r6, r0
load_add_store_8x4 r6, r7
-.ifc \variant, identity_
vmov q8, q12
vmov q9, q13
vmov q10, q14
vmov q11, q15
-.else
- vswp d25, d28
- vswp d27, d30
- vswp d26, d28
- vswp d27, d29
- vrshr.s16 q8, q12, #1
- vrshr.s16 q9, q13, #1
- vrshr.s16 q10, q14, #1
- vrshr.s16 q11, q15, #1
-.endif
+
transpose_4x8h q8, q9, q10, q11
blx r5
add r6, r0, #8
@@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon
movw r12, #(5793-4096)*8
vdup.16 d0, r12
identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+
+ b L(itx_4x16_epilog)
.else
blx r4
.irp i, q8, q9, q10, q11
vrshr.s16 \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h q8, q9, q10, q11
vswp d19, d21
vswp d18, d20
@@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 4
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
mov r10, #\eob_half
.else
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32
def_fns_416 4, 16
def_fns_416 16, 4
-.macro def_fn_816_base variant
-function inv_txfm_\variant\()add_16x8_neon
+function inv_txfm_add_16x8_neon
sub_sp_align 256
.irp i, 0, 4
@@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon
pop {r4-r11,pc}
endfunc
+.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_neon
sub_sp_align 256
@@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon
.endr
2:
+.ifc \variant, identity_
+ b L(itx_8x16_epilog)
+.else
+L(itx_8x16_epilog):
.irp i, 0, 4
add r6, r0, #(\i)
add r7, sp, #(\i*2)
@@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon
add_sp_align 256
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
+
+/* Define symbols used in .if statement */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 8
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x8_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x16_neon
.else
.ifc \txfm1, identity
@@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.else
mov r10, #\eob_4x4
.endif
-.ifc \txfm1, identity
+.if \w == 8 && \txfm1 == identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
diff --git a/src/arm/32/itx16.S b/src/arm/32/itx16.S
index aa6c272..7691272 100644
--- a/src/arm/32/itx16.S
+++ b/src/arm/32/itx16.S
@@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
vmov.i16 q15, #0
vld1.32 {q8, q9}, [r2, :128]
vst1.32 {q14, q15}, [r2, :128]!
- vshr.s16 q8, q8, #2
+ vshr.s32 q8, q8, #2
vld1.32 {q10, q11}, [r2, :128]
- vshr.s16 q9, q9, #2
- vshr.s16 q10, q10, #2
- vshr.s16 q11, q11, #2
+ vshr.s32 q9, q9, #2
+ vshr.s32 q10, q10, #2
+ vshr.s32 q11, q11, #2
iwht4
@@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon
vld1.16 {d3}, [r0, :64], r1
L(itx_4x4_end):
- vmvn.i16 q15, #0xfc00 // 0x3ff
+ // read bitdepth_max from the callers stack
+ ldr r4, [sp, #44]
+ vdup.i16 q15, r4
sub r0, r0, r1, lsl #2
vqadd.s16 q8, q8, q0
vqadd.s16 q9, q9, q1
@@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon
vqrshrn.s32 d21, q13, #\shift
vqrshrn.s32 d22, q14, #\shift
vqrshrn.s32 d23, q15, #\shift
+.if \scale
+ b L(horz_16x2_epilog)
+.else
+L(horz_16x2_epilog):
vuzp.16 q8, q9
vuzp.16 q10, q11
@@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon
.endr
pop {pc}
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
diff --git a/src/arm/32/msac.S b/src/arm/32/msac.S
index b06e109..b16957f 100644
--- a/src/arm/32/msac.S
+++ b/src/arm/32/msac.S
@@ -279,60 +279,67 @@ L(renorm):
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
L(renorm2):
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 4f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 8 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 6f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+2: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 8 - c
-9:
+3: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+4: // end
str r6, [r0, #CNT]
str r7, [r0, #DIF]
-
mov r0, lr
add sp, sp, #48
-
pop {r4-r10,pc}
+
+5: // pad_with_ones
+ add r8, r6, #-240
+ lsr r8, r8, r8
+ b 3b
+
+6: // refill_eob
+ cmp r3, r4
+ bhs 5b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 2b
endfunc
function msac_decode_symbol_adapt8_neon, export=1
@@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
vdup.16 d1, r4
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 5f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 40 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 7f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+3: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+5: // end
lsl lr, lr, #1
sub lr, lr, #5
lsr r12, r7, #16
@@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1
str r7, [r0, #DIF]
lsr r0, r2, #1
pop {r4-r10,pc}
+
+6: // pad_with_ones
+ add r8, r6, #-240
+ lsr r8, r8, r8
+ b 4b
+
+7: // refill_eob
+ cmp r3, r4
+ bhs 6b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1
cmp r10, #0
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index 53490cd..7063cbd 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -879,6 +879,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blr x4
@@ -890,19 +892,20 @@ function inv_txfm_\variant\()add_8x8_neon
srshr v21.8h, v21.8h, #1
srshr v22.8h, v22.8h, #1
srshr v23.8h, v23.8h, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
blr x5
load_add_store_8x8 x0, x7
ret x15
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1390,14 +1393,16 @@ function inv_txfm_horz\suffix\()_16x8_neon
.endif
.if \identity
identity_8x16_shift2 v0.h[0]
+ b L(horz_16x8_epilog)
.else
blr x4
-.endif
-.if \shift > 0
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
srshr \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x8_epilog)
+.else
+L(horz_16x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
@@ -1406,12 +1411,14 @@ function inv_txfm_horz\suffix\()_16x8_neon
.endr
ret x14
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_8x16_neon
mov x14, x30
@@ -1512,6 +1519,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_8x16_shift1 v0.h[0]
+
+ b L(itx_16x4_epilog)
.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
ld1 {\i}, [x2]
@@ -1527,33 +1536,29 @@ function inv_txfm_\variant\()add_16x4_neon
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
-.endif
- transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
- blr x5
- mov x6, x0
- load_add_store_8x4 x6, x7
-.ifc \variant, identity_
- mov v16.16b, v20.16b
- mov v17.16b, v21.16b
- mov v18.16b, v22.16b
- mov v19.16b, v23.16b
-.else
ins v24.d[1], v28.d[0]
ins v25.d[1], v29.d[0]
ins v26.d[1], v30.d[0]
ins v27.d[1], v31.d[0]
- srshr v16.8h, v24.8h, #1
- srshr v17.8h, v25.8h, #1
- srshr v18.8h, v26.8h, #1
- srshr v19.8h, v27.8h, #1
-.endif
+ srshr v20.8h, v24.8h, #1
+ srshr v21.8h, v25.8h, #1
+ srshr v22.8h, v26.8h, #1
+ srshr v23.8h, v27.8h, #1
+
+L(itx_16x4_epilog):
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19
+ blr x5
add x6, x0, #8
load_add_store_8x4 x6, x7
ret x15
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1605,12 +1610,14 @@ function inv_txfm_\variant\()add_4x16_neon
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+
+ b L(itx_4x16_epilog)
.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
ins v21.d[0], v17.d[1]
@@ -1622,11 +1629,12 @@ function inv_txfm_\variant\()add_4x16_neon
load_add_store_4x16 x0, x6
ret x15
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1634,11 +1642,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
idct_dc \w, \h, 1
.endif
.if \w == 4
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_4h_x\h\()_neon
mov w13, #\eob_half
.else
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1690,13 +1702,16 @@ function inv_txfm_\variant\()add_16x8_neon
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift1 v0.h[0]
+
+ b L(itx_16x8_epilog)
.else
blr x4
-.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
srshr \i, \i, #1
.endr
-.endif
+
+L(itx_16x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1704,27 +1719,7 @@ function inv_txfm_\variant\()add_16x8_neon
mov x6, x0
load_add_store_8x8 x6, x7
-.ifc \variant, identity_
- mov v16.16b, v24.16b
- mov v17.16b, v25.16b
- mov v18.16b, v26.16b
- mov v19.16b, v27.16b
- mov v20.16b, v28.16b
- mov v21.16b, v29.16b
- mov v22.16b, v30.16b
- mov v23.16b, v31.16b
-.else
- srshr v16.8h, v24.8h, #1
- srshr v17.8h, v25.8h, #1
- srshr v18.8h, v26.8h, #1
- srshr v19.8h, v27.8h, #1
- srshr v20.8h, v28.8h, #1
- srshr v21.8h, v29.8h, #1
- srshr v22.8h, v30.8h, #1
- srshr v23.8h, v31.8h, #1
-.endif
-
- transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23
blr x5
@@ -1732,6 +1727,7 @@ function inv_txfm_\variant\()add_16x8_neon
load_add_store_8x8 x0, x7
ret x15
+.endif
endfunc
function inv_txfm_\variant\()add_8x16_neon
@@ -1790,14 +1786,16 @@ function inv_txfm_\variant\()add_8x16_neon
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x16_epilog)
.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
-.endif
+L(itx_8x16_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1805,18 +1803,21 @@ function inv_txfm_\variant\()add_8x16_neon
load_add_store_8x16 x0, x6
ret x15
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 1
.endif
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
mov x13, #\eob_half
diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S
index eee3a96..31ee9be 100644
--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -514,13 +514,17 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
b L(itx_4x4_end)
endfunc
+// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers
+// x0-x4 external parameters
+// x5 function pointer to first transform
+// x6 function pointer to second transform
function inv_txfm_add_4x4_neon
movi v30.4s, #0
movi v31.4s, #0
ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
st1 {v30.4s, v31.4s}, [x2], #32
- blr x4
+ blr x5
st1 {v30.4s, v31.4s}, [x2], #32
sqxtn v16.4h, v16.4s
@@ -529,7 +533,7 @@ function inv_txfm_add_4x4_neon
sqxtn v19.4h, v19.4s
transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
- blr x5
+ blr x6
ld1 {v0.d}[0], [x0], x1
ld1 {v0.d}[1], [x0], x1
@@ -541,7 +545,7 @@ function inv_txfm_add_4x4_neon
srshr v18.8h, v18.8h, #4
L(itx_4x4_end):
- mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ dup v31.8h, w4
sub x0, x0, x1, lsl #2
usqadd v0.8h, v16.8h
usqadd v1.8h, v18.8h
@@ -579,8 +583,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
b L(itx_4x4_end)
1:
.endif
- adr x4, inv_\txfm1\()_4s_x4_neon
- movrel x5, X(inv_\txfm2\()_4h_x4_neon)
+ adr x5, inv_\txfm1\()_4s_x4_neon
+ movrel x6, X(inv_\txfm2\()_4h_x4_neon)
b inv_txfm_add_4x4_neon
endfunc
.endm
@@ -1381,6 +1385,10 @@ function inv_txfm_horz\suffix\()_16x4_neon
sqrshrn2 v21.8h, v29.4s, #\shift
sqrshrn2 v22.8h, v30.4s, #\shift
sqrshrn2 v23.8h, v31.4s, #\shift
+.if \scale
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
@@ -1389,11 +1397,12 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
ret x14
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_8x16_neon
mov x14, x30
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 9f7b4e7..3df0393 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1154,7 +1154,7 @@ endfunc
uxtl \r6\().8h, \r6\().8b
.endif
.endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
+.macro mul_mla_4tap d, s0, s1, s2, s3, wd
mul \d\wd, \s0\wd, v0.h[0]
mla \d\wd, \s1\wd, v0.h[1]
mla \d\wd, \s2\wd, v0.h[2]
@@ -1163,7 +1163,51 @@ endfunc
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+.endm
+.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+.endm
+.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().4h, \s0\().4h, v0.h[0]
mla \d0\().4h, \s1\().4h, v0.h[1]
mla \d0\().4h, \s2\().4h, v0.h[2]
@@ -1173,7 +1217,7 @@ endfunc
mla \d0\().4h, \s6\().4h, v0.h[6]
mla \d0\().4h, \s7\().4h, v0.h[7]
.endm
-.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1183,7 +1227,7 @@ endfunc
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
.endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1201,7 +1245,7 @@ endfunc
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7]
.endm
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1315,11 +1359,11 @@ endfunc
.endif
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1328,18 +1372,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
+function \type\()_\taps\()_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@@ -1354,12 +1388,12 @@ function \type\()_8tap_neon
tst \mx, #(0x7f << 14)
sub w8, w8, #24
movrel x10, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1368,9 +1402,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x10, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x9, L(\type\()_8tap_h_tbl)
+ adr x9, L(\type\()_\taps\()_h_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1471,6 +1505,18 @@ L(\type\()_8tap_h):
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
+.ifc \taps, 6tap
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v23.16b, v20.16b, v21.16b, #2
+ mul v18.8h, v19.8h, v0.h[1]
+ mul v22.8h, v23.8h, v0.h[1]
+.irpc i, 23456
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v18.8h, v16.8h, v0.h[0]
mul v22.8h, v20.8h, v0.h[0]
.irpc i, 1234567
@@ -1479,6 +1525,7 @@ L(\type\()_8tap_h):
mla v18.8h, v19.8h, v0.h[\i]
mla v22.8h, v23.8h, v0.h[\i]
.endr
+.endif
subs \h, \h, #2
srshr v18.8h, v18.8h, #2
srshr v22.8h, v22.8h, #2
@@ -1523,6 +1570,26 @@ L(\type\()_8tap_h):
uxtl v22.8h, v22.8b
16:
+.ifc \taps, 6tap
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v29.16b, v17.16b, v18.16b, #2
+ ext v30.16b, v20.16b, v21.16b, #2
+ ext v31.16b, v21.16b, v22.16b, #2
+ mul v24.8h, v28.8h, v0.h[1]
+ mul v25.8h, v29.8h, v0.h[1]
+ mul v26.8h, v30.8h, v0.h[1]
+ mul v27.8h, v31.8h, v0.h[1]
+.irpc i, 23456
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v16.8h, v0.h[0]
mul v25.8h, v17.8h, v0.h[0]
mul v26.8h, v20.8h, v0.h[0]
@@ -1537,6 +1604,7 @@ L(\type\()_8tap_h):
mla v26.8h, v30.8h, v0.h[\i]
mla v27.8h, v31.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
srshr v26.8h, v26.8h, #2
@@ -1575,18 +1643,18 @@ L(\type\()_8tap_h):
b.gt 161b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1595,7 +1663,7 @@ L(\type\()_8tap_v):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_v_tbl)
+ adr x9, L(\type\()_\taps\()_v_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1620,7 +1688,7 @@ L(\type\()_8tap_v):
interleave_1_h v1, v2, v3, v4, v5
b.gt 24f
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .4h
+ mul_mla_4tap v6, v1, v2, v3, v4, .4h
sqrshrun_b 6, v6
st_h \d_strd, v6, 2
ret
@@ -1630,7 +1698,7 @@ L(\type\()_8tap_v):
interleave_1_h v5, v6, v7
interleave_2_s v1, v2, v3, v4, v5, v6
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
sqrshrun_b 6, v6
st_h \d_strd, v6, 4
ret
@@ -1655,7 +1723,7 @@ L(\type\()_8tap_v):
interleave_1_h v7, v16, v17, v18, v19
interleave_2_s v5, v6, v7, v16, v17, v18
uxtl_b v5, v6, v7, v16
- mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 4
b.le 0f
@@ -1673,7 +1741,7 @@ L(\type\()_8tap_v):
load_h \sr2, \src, \s_strd, v16, v17
interleave_1_h v7, v16, v17
uxtl_b v5, v6, v7, v16
- mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 2
0:
@@ -1698,13 +1766,13 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
shift_store_4 \type, \d_strd, v6
b.le 0f
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
uxtl_b v5, v6
- mul_mla_4 v7, v3, v4, v5, v6, .8h
+ mul_mla_4tap v7, v3, v4, v5, v6, .8h
shift_store_4 \type, \d_strd, v7
0:
ret
@@ -1729,28 +1797,28 @@ L(\type\()_8tap_v):
load_s \sr2, \src, \s_strd, v23, v24, v25, v26
interleave_1_s v22, v23, v24, v25, v26
uxtl_b v22, v23, v24, v25
- mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
shift_store_4 \type, \d_strd, v1, v2
b.le 0f
load_s \sr2, \src, \s_strd, v27, v16
subs \h, \h, #2
interleave_1_s v26, v27, v16
uxtl_b v26, v27
- mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
shift_store_4 \type, \d_strd, v1
b.le 0f
load_s \sr2, \src, \s_strd, v17, v18
subs \h, \h, #2
interleave_1_s v16, v17, v18
uxtl_b v16, v17
- mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
shift_store_4 \type, \d_strd, v2
b.le 0f
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v19, v20, v21, v22
interleave_1_s v18, v19, v20, v21, v22
uxtl_b v18, v19, v20, v21
- mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
b.gt 48b
0:
@@ -1773,14 +1841,14 @@ L(\type\()_8tap_v):
load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4, v5
- mul_mla_4 v6, v1, v2, v3, v4, .8h
- mul_mla_4 v7, v2, v3, v4, v5, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v7, v2, v3, v4, v5, .8h
shift_store_8 \type, \d_strd, v6, v7
b.le 0f
load_8b \sr2, \src, \s_strd, v6, v7
uxtl_b v6, v7
- mul_mla_4 v1, v3, v4, v5, v6, .8h
- mul_mla_4 v2, v4, v5, v6, v7, .8h
+ mul_mla_4tap v1, v3, v4, v5, v6, .8h
+ mul_mla_4tap v2, v4, v5, v6, v7, .8h
shift_store_8 \type, \d_strd, v1, v2
0:
ret
@@ -1809,32 +1877,32 @@ L(\type\()_8tap_v):
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v23, v24
uxtl_b v23, v24
- mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v25, v26
uxtl_b v25, v26
- mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v27, v16
uxtl_b v27, v16
- mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v17, v18
uxtl_b v17, v18
- mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
uxtl_b v19, v20, v21, v22
- mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
- mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.gt 88b
9:
@@ -1882,10 +1950,10 @@ L(\type\()_8tap_v):
uxtl2 v25.8h, v3.16b
uxtl2 v26.8h, v4.16b
uxtl2 v27.8h, v5.16b
- mul_mla_4 v1, v16, v17, v18, v19, .8h
- mul_mla_4 v16, v17, v18, v19, v20, .8h
- mul_mla_4 v2, v23, v24, v25, v26, .8h
- mul_mla_4 v17, v24, v25, v26, v27, .8h
+ mul_mla_4tap v1, v16, v17, v18, v19, .8h
+ mul_mla_4tap v16, v17, v18, v19, v20, .8h
+ mul_mla_4tap v2, v23, v24, v25, v26, .8h
+ mul_mla_4tap v17, v24, v25, v26, v27, .8h
shift_store_16 \type, \d_strd, v1, v2, v16, v17
b.le 0f
load_16b \sr2, \src, \s_strd, v6, v7
@@ -1893,25 +1961,25 @@ L(\type\()_8tap_v):
uxtl v22.8h, v7.8b
uxtl2 v28.8h, v6.16b
uxtl2 v29.8h, v7.16b
- mul_mla_4 v1, v18, v19, v20, v21, .8h
- mul_mla_4 v3, v19, v20, v21, v22, .8h
- mul_mla_4 v2, v25, v26, v27, v28, .8h
- mul_mla_4 v4, v26, v27, v28, v29, .8h
+ mul_mla_4tap v1, v18, v19, v20, v21, .8h
+ mul_mla_4tap v3, v19, v20, v21, v22, .8h
+ mul_mla_4tap v2, v25, v26, v27, v28, .8h
+ mul_mla_4tap v4, v26, v27, v28, v29, .8h
shift_store_16 \type, \d_strd, v1, v2, v3, v4
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_hv_tbl)
+ adr x9, L(\type\()_\taps\()_hv_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv):
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv):
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v28.8b, #4
mov v21.8b, v28.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
+.ifc \taps, 6tap
+ smull v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv):
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v28.8b}, [\sr2], \s_strd
ld1 {v30.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
@@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v30.4h, v0.h[3]
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #1
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v28.4h, v0.h[1]
mla v31.4h, v29.4h, v0.h[2]
mla v31.4h, v30.4h, v0.h[3]
+.ifc \taps, 6tap
+ srshr v18.4h, v31.4h, #2
+.else
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+.endif
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v28.8b
mov v20.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v28.8b
mov v22.8b, v29.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal v2.4s, v28.4h, v1.h[6]
+ smull v3.4s, v19.4h, v1.h[1]
+ smlal v3.4s, v20.4h, v1.h[2]
+ smlal v3.4s, v21.4h, v1.h[3]
+ smlal v3.4s, v22.4h, v1.h[4]
+ smlal v3.4s, v28.4h, v1.h[5]
+ smlal v3.4s, v29.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2):
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
subs \h, \h, #2
@@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.4h}, [\ds2], \d_strd
.endif
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v26.8b}, [\sr2], \s_strd
ld1 {v27.8b}, [\src], \s_strd
uxtl v26.8h, v26.8b
@@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #3
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+.ifc \taps, 6tap
+ mov v18.16b, v16.16b
+.else
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v24.16b
mov v20.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v24.16b
mov v22.16b, v25.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal2 v5.4s, v24.8h, v1.h[5]
+ smlal v2.4s, v24.4h, v1.h[6]
+ smlal2 v3.4s, v24.8h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[6]
+ smlal2 v5.4s, v25.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v24.8h, v1.h[7]
smlal v4.4s, v25.4h, v1.h[7]
smlal2 v5.4s, v25.8h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn2 v2.8h, v3.4s, #\shift_hv
sqrshrn v4.4h, v4.4s, #\shift_hv
@@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4):
st1 {v4.8h}, [\ds2], \d_strd
.endif
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2399,14 +2538,32 @@ L(\type\()_8tap_filter_4):
.else
add \dst, \dst, #16
.endif
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8_first):
+L(\type\()_\taps\()_filter_8_first):
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
+.ifc \taps, 6tap
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mul v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+.else // 8tap
mul v16.8h, v28.8h, v0.h[0]
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
@@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first):
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
mla v16.8h, v26.8h, v0.h[7]
+.endif
srshr v16.8h, v16.8h, #2
ret
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
ld1 {v30.8b, v31.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
uxtl v30.8h, v30.8b
uxtl v31.8h, v31.8b
+.ifc \taps, 6tap
+ ext v26.16b, v28.16b, v29.16b, #2
+ ext v27.16b, v30.16b, v31.16b, #2
+ mul v24.8h, v26.8h, v0.h[1]
+ mul v25.8h, v27.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v28.8h, v0.h[0]
mul v25.8h, v30.8h, v0.h[0]
.irpc i, 1234567
@@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8):
mla v24.8h, v26.8h, v0.h[\i]
mla v25.8h, v27.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
function \type\()_bilin_8bpc_neon, export=1
dup v1.16b, \mx
dup v3.16b, \my
@@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
-filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap
+filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10
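The registrations above split the 8bpc put/prep entry points by tap count: every combination involving the sharp filter keeps the full 8-tap path, while the regular/smooth-only combinations branch to the new 6-tap code, whose outermost coefficients are zero. A minimal scalar sketch of that equivalence, assuming coef[0] == coef[7] == 0 for the 6-tap-eligible filters (illustrative C, not the NEON implementation):

    /* Illustrative sketch only: shows why dropping the two outer taps is
     * lossless for filters whose outer coefficients are zero. */
    #include <stdint.h>

    static int hfilter_8tap(const uint8_t *const src, const int16_t coef[8]) {
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += src[i] * coef[i];
        return sum;
    }

    static int hfilter_6tap(const uint8_t *const src, const int16_t coef[8]) {
        int sum = 0;                 /* assumes coef[0] == coef[7] == 0 */
        for (int i = 1; i < 7; i++)  /* two fewer multiply-accumulates */
            sum += src[i] * coef[i];
        return sum;
    }

The surrounding .ifc \taps, 6tap blocks in the diff adjust the source offsets and stride back-steps so the shorter window stays centred on the same position.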
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index 1bfb12e..576fab1 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -1374,19 +1374,35 @@ endfunc
sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
-.macro smull_smlal_4 d, s0, s1, s2, s3
+.macro smull_smlal_4tap d, s0, s1, s2, s3
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
-.macro smull2_smlal2_4 d, s0, s1, s2, s3
+.macro smull2_smlal2_4tap d, s0, s1, s2, s3
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
-.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+.endm
+.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+.endm
+.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
@@ -1396,7 +1412,7 @@ endfunc
smlal \d\().4s, \s6\().4h, v0.h[6]
smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
-.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
@@ -1499,11 +1515,11 @@ endfunc
st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
mov w9, \type_h
mov w10, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1512,18 +1528,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
+function \type\()_\taps\()_neon
.ifc \bdmax, w8
ldr w8, [sp]
.endif
@@ -1547,12 +1553,12 @@ function \type\()_8tap_neon
add w13, w12, \bdmax // 6 + intermediate_bits
sub w12, w12, \bdmax // 6 - intermediate_bits
movrel x11, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w10, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1561,9 +1567,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x11, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x10, L(\type\()_8tap_h_tbl)
+ adr x10, L(\type\()_\taps\()_h_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1682,6 +1688,22 @@ L(\type\()_8tap_h):
mov \mx, \w
8:
+.ifc \taps, 6tap
+ ext v24.16b, v16.16b, v17.16b, #2
+ ext v25.16b, v20.16b, v21.16b, #2
+ smull v18.4s, v24.4h, v0.h[1]
+ smull2 v19.4s, v24.8h, v0.h[1]
+ smull v22.4s, v25.4h, v0.h[1]
+ smull2 v23.4s, v25.8h, v0.h[1]
+.irpc i, 23456
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v18.4s, v16.4h, v0.h[0]
smull2 v19.4s, v16.8h, v0.h[0]
smull v22.4s, v20.4h, v0.h[0]
@@ -1694,6 +1716,7 @@ L(\type\()_8tap_h):
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
+.endif
subs \mx, \mx, #8
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
@@ -1734,18 +1757,18 @@ L(\type\()_8tap_h):
b.gt 81b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -1758,7 +1781,7 @@ L(\type\()_8tap_v):
dup v30.4s, w12 // 6 - intermediate_bits
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- adr x10, L(\type\()_8tap_v_tbl)
+ adr x10, L(\type\()_\taps\()_v_tbl)
ldrh w9, [x10, x9, lsl #1]
.ifc \type, prep
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1785,7 +1808,7 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
b.gt 24f
- smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4tap v6, v1, v2, v3, v4
sqrshrun_h 6, v6
umin_h v31, .8h, v6
st_s \d_strd, v6, 2
@@ -1794,8 +1817,8 @@ L(\type\()_8tap_v):
24: // 2x4 v
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
- smull_smlal_4 v16, v1, v2, v3, v4
- smull_smlal_4 v17, v3, v4, v5, v6
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull_smlal_4tap v17, v3, v4, v5, v6
sqrshrun_h 6, v16, v17
umin_h v31, .8h, v16
st_s \d_strd, v16, 4
@@ -1817,8 +1840,8 @@ L(\type\()_8tap_v):
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
interleave_1_s v7, v16, v17, v18, v19
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
- smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
sqrshrun_h 6, v24, v25
umin_h v31, .8h, v24
st_s \d_strd, v24, 4
@@ -1836,7 +1859,7 @@ L(\type\()_8tap_v):
26:
load_s \sr2, \src, \s_strd, v16, v17
interleave_1_s v7, v16, v17
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_h 6, v24
umin_h v31, .4h, v24
st_s \d_strd, v24, 2
@@ -1860,13 +1883,13 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v6, v1, v2, v3, v4
- smull_smlal_4 v7, v2, v3, v4, v5
+ smull_smlal_4tap v6, v1, v2, v3, v4
+ smull_smlal_4tap v7, v2, v3, v4, v5
shift_store_4 \type, \d_strd, v6, v7
b.le 0f
load_4h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v1, v3, v4, v5, v6
- smull_smlal_4 v2, v4, v5, v6, v7
+ smull_smlal_4tap v1, v3, v4, v5, v6
+ smull_smlal_4tap v2, v4, v5, v6, v7
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1885,10 +1908,10 @@ L(\type\()_8tap_v):
48:
subs \h, \h, #4
load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
- smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_4 \type, \d_strd, v1, v2, v3, v4
b.le 0f
cmp \h, #2
@@ -1903,8 +1926,8 @@ L(\type\()_8tap_v):
b 48b
46:
load_4h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1925,17 +1948,17 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v16, v1, v2, v3, v4
- smull2_smlal2_4 v17, v1, v2, v3, v4
- smull_smlal_4 v18, v2, v3, v4, v5
- smull2_smlal2_4 v19, v2, v3, v4, v5
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull2_smlal2_4tap v17, v1, v2, v3, v4
+ smull_smlal_4tap v18, v2, v3, v4, v5
+ smull2_smlal2_4tap v19, v2, v3, v4, v5
shift_store_8 \type, \d_strd, v16, v17, v18, v19
b.le 0f
load_8h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v16, v3, v4, v5, v6
- smull2_smlal2_4 v17, v3, v4, v5, v6
- smull_smlal_4 v18, v4, v5, v6, v7
- smull2_smlal2_4 v19, v4, v5, v6, v7
+ smull_smlal_4tap v16, v3, v4, v5, v6
+ smull2_smlal2_4tap v17, v3, v4, v5, v6
+ smull_smlal_4tap v18, v4, v5, v6, v7
+ smull2_smlal2_4tap v19, v4, v5, v6, v7
shift_store_8 \type, \d_strd, v16, v17, v18, v19
0:
ret
@@ -1962,18 +1985,18 @@ L(\type\()_8tap_v):
88:
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
- smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v25, v26
- smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
- smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
- smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
mov v16.16b, v20.16b
@@ -2013,10 +2036,10 @@ L(\type\()_8tap_v):
16:
load_16h \src, \src, \s_strd, v22, v23
subs \h, \h, #1
- smull_smlal_4 v1, v16, v18, v20, v22
- smull2_smlal2_4 v2, v16, v18, v20, v22
- smull_smlal_4 v3, v17, v19, v21, v23
- smull2_smlal2_4 v4, v17, v19, v21, v23
+ smull_smlal_4tap v1, v16, v18, v20, v22
+ smull2_smlal2_4tap v2, v16, v18, v20, v22
+ smull_smlal_4tap v3, v17, v19, v21, v23
+ smull2_smlal2_4tap v4, v17, v19, v21, v23
shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
b.le 0f
mov v16.16b, v18.16b
@@ -2029,17 +2052,17 @@ L(\type\()_8tap_v):
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x11, \my, uxtw #3
- adr x10, L(\type\()_8tap_hv_tbl)
+ adr x10, L(\type\()_\taps\()_hv_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv):
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
@@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv):
mov v17.8b, v24.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
mov v19.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v24.8b, #4
mov v21.8b, v24.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v24.8b, #4
+.ifc \taps, 6tap
+ smull v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv):
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
+.endif
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v25.8h}, [\sr2], \s_strd
ld1 {v27.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
@@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2):
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #2
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2):
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
+.ifc \taps, 6tap
+ xtn v18.4h, v16.4s
+.else
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+.endif
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v24.8b
mov v20.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v24.8b
mov v22.8b, v25.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v24.4h, v1.h[6]
+ smull v4.4s, v19.4h, v1.h[1]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal v4.4s, v25.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2):
smlal v4.4s, v22.4h, v1.h[5]
smlal v4.4s, v24.4h, v1.h[6]
smlal v4.4s, v25.4h, v1.h[7]
+.endif
.ifc \type, put
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
@@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.d}[0], [\dst], \d_strd
st1 {v3.d}[1], [\ds2], \d_strd
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v24.8h}, [\sr2], \s_strd
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v24.16b, v24.16b, #2
@@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4):
// and conserves register space (no need to clobber v8-v15).
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #6
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4):
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v26.16b, v27.16b, v28.16b, #2
+ smull v24.4s, v26.4h, v0.h[1]
+ smull2 v25.4s, v26.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
@@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4):
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
+.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
@@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
+.ifc \taps, 6tap
+ uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2
+.else
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v23.16b
mov v20.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v23.16b
mov v22.16b, v24.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v23.4h, v1.h[5]
+ smlal2 v5.4s, v23.8h, v1.h[5]
+ smlal v2.4s, v23.4h, v1.h[6]
+ smlal2 v3.4s, v23.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v23.8h, v1.h[7]
smlal v4.4s, v24.4h, v1.h[7]
smlal2 v5.4s, v24.8h, v1.h[7]
+.endif
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
@@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4):
st1 {v2.8h}, [\dst], \d_strd
st1 {v3.8h}, [\ds2], \d_strd
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4):
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
ld1 {v6.8h, v7.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v23.16b, v4.16b, v5.16b, #2
+ ext v24.16b, v6.16b, v7.16b, #2
+ smull v25.4s, v23.4h, v0.h[1]
+ smull2 v26.4s, v23.8h, v0.h[1]
+ smull v27.4s, v24.4h, v0.h[1]
+ smull2 v28.4s, v24.8h, v0.h[1]
+.irpc i, 23456
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v25.4s, v4.4h, v0.h[0]
smull2 v26.4s, v4.8h, v0.h[0]
smull v27.4s, v6.4h, v0.h[0]
@@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8):
smlal v27.4s, v24.4h, v0.h[\i]
smlal2 v28.4s, v24.8h, v0.h[\i]
.endr
+.endif
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
@@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8):
uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
ldr w8, [sp]
@@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
-filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
+filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10
diff --git a/src/arm/64/msac.S b/src/arm/64/msac.S
index 3a6cf90..7bef924 100644
--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -208,60 +208,66 @@ L(renorm):
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
- mvn x7, x7 // ~dif
- add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ sub x7, x7, x3, lsl #48 // dif - (v << 48)
L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
- lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
- mvn x7, x7 // ~dif
- b.hs 9f
+ b.hs 4f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
- cmp x5, x4
- b.gt 2f
-
- ldr x3, [x3] // next_bits
- add w8, w6, #23 // shift_bits = cnt + 23
- add w6, w6, #16 // cnt += 16
- rev x3, x3 // next_bits = bswap(next_bits)
- sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
- and w8, w8, #24 // shift_bits &= 24
- lsr x3, x3, x8 // next_bits >>= shift_bits
- sub w8, w8, w6 // shift_bits -= 16 + cnt
- str x5, [x0, #BUF_POS]
- lsl x3, x3, x8 // next_bits <<= shift_bits
- mov w4, #48
- sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
- eor x7, x7, x3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- mov w14, #40
- sub w5, w14, w6 // c = 40 - cnt
-3:
- cmp x3, x4
- b.ge 4f
- ldrb w8, [x3], #1
- lsl x8, x8, x5
- eor x7, x7, x8
- subs w5, w5, #8
- b.ge 3b
-
-4: // refill_eob_end
+ subs x5, x5, x4
+ b.hi 6f
+
+ ldr x8, [x3] // next_bits
+ add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
+ mvn x8, x8
+ neg w5, w4
+ rev x8, x8 // next_bits = bswap(next_bits)
+ lsr w5, w5, #3 // num_bytes_read
+ lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
+
+2: // refill_end
+ add x3, x3, x5
+ add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
- sub w6, w14, w5 // cnt = 40 - c
-9:
+3: // refill_end2
+ orr x7, x7, x8 // dif |= next_bits
+
+4: // end
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
add sp, sp, #48
ret
+
+5: // pad_with_ones
+ add w8, w6, #-16
+ ror x8, x8, x8
+ b 3b
+
+6: // refill_eob
+ cmp x3, x4
+ b.hs 5b
+
+ ldr x8, [x4, #-8]
+ lsl w5, w5, #3
+ lsr x8, x8, x5
+ add w5, w6, #-48
+ mvn x8, x8
+ sub w4, w4, w3 // num_bytes_left
+ rev x8, x8
+ lsr x8, x8, x5
+ neg w5, w5
+ lsr w5, w5, #3
+ cmp w5, w4
+ csel w5, w5, w4, lo // num_bytes_read
+ b 2b
endfunc
function msac_decode_symbol_adapt8_neon, export=1
@@ -334,54 +340,37 @@ function msac_decode_hi_tok_neon, export=1
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
- mvn x7, x7 // ~dif
- add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ sub x7, x7, x3, lsl #48 // dif - (v << 48)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
- lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
dup v3.4h, w4
- mvn x7, x7 // ~dif
- b.hs 9f
+ b.hs 5f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
- cmp x5, x4
- b.gt 2f
-
- ldr x3, [x3] // next_bits
- add w8, w6, #23 // shift_bits = cnt + 23
- add w6, w6, #16 // cnt += 16
- rev x3, x3 // next_bits = bswap(next_bits)
- sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
- and w8, w8, #24 // shift_bits &= 24
- lsr x3, x3, x8 // next_bits >>= shift_bits
- sub w8, w8, w6 // shift_bits -= 16 + cnt
- str x5, [x0, #BUF_POS]
- lsl x3, x3, x8 // next_bits <<= shift_bits
- mov w4, #48
- sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
- eor x7, x7, x3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- mov w14, #40
- sub w5, w14, w6 // c = 40 - cnt
-3:
- cmp x3, x4
- b.ge 4f
- ldrb w8, [x3], #1
- lsl x8, x8, x5
- eor x7, x7, x8
- subs w5, w5, #8
- b.ge 3b
-
-4: // refill_eob_end
+ subs x5, x5, x4
+ b.hi 7f
+
+ ldr x8, [x3] // next_bits
+ add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
+ mvn x8, x8
+ neg w5, w4
+ rev x8, x8 // next_bits = bswap(next_bits)
+ lsr w5, w5, #3 // num_bytes_read
+ lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
+
+3: // refill_end
+ add x3, x3, x5
+ add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
- sub w6, w14, w5 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr x7, x7, x8 // dif |= next_bits
+
+5: // end
lsl w15, w15, #1
sub w15, w15, #5
lsr x12, x7, #48
@@ -394,6 +383,29 @@ function msac_decode_hi_tok_neon, export=1
str x7, [x0, #DIF]
lsr w0, w13, #1
ret
+
+6: // pad_with_ones
+ add w8, w6, #-16
+ ror x8, x8, x8
+ b 4b
+
+7: // refill_eob
+ cmp x3, x4
+ b.hs 6b
+
+ ldr x8, [x4, #-8]
+ lsl w5, w5, #3
+ lsr x8, x8, x5
+ add w5, w6, #-48
+ mvn x8, x8
+ sub w4, w4, w3 // num_bytes_left
+ rev x8, x8
+ lsr x8, x8, x5
+ neg w5, w5
+ lsr w5, w5, #3
+ cmp w5, w4
+ csel w5, w5, w4, lo // num_bytes_read
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -410,7 +422,6 @@ function msac_decode_bool_equi_neon, export=1
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
@@ -431,7 +442,6 @@ function msac_decode_bool_neon, export=1
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
@@ -455,7 +465,6 @@ function msac_decode_bool_adapt_neon, export=1
ldr w10, [x0, #ALLOW_UPDATE_CDF]
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
cbz w10, L(renorm2)
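The rewritten refill above replaces the byte-at-a-time end-of-buffer loop with a single 64-bit load per refill: the common path loads eight bytes from buf_pos, and near the end it loads the buffer's final eight bytes from buf_end - 8, clamps how many of them count as read, and pads with one bits once the input is exhausted. A simplified C model of that strategy, assuming a plain MSB-aligned bit cache rather than dav1d's MsacContext state (all names below are illustrative):

    /* Simplified model of the bulk refill; not dav1d's msac state layout.
     * Assumes the input buffer is at least 8 bytes long. */
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        const uint8_t *pos, *end;
        uint64_t cache;   /* MSB-aligned; the top 'bits' bits are valid */
        int bits;
    } BitReader;

    static uint64_t load_be64(const uint8_t *const p) {
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)
            v = (v << 8) | p[i];
        return v;
    }

    static void refill(BitReader *const b) {
        if (b->bits > 56) return;             /* cache is already full */
        int nbytes = (64 - b->bits) >> 3;     /* whole bytes that fit: 1..8 */
        const ptrdiff_t left = b->end - b->pos;
        uint64_t next;
        if (left >= 8) {
            next = load_be64(b->pos);         /* common case: one wide load */
        } else if (left > 0) {
            /* End of buffer: load the final 8 bytes, drop the ones that were
             * already consumed, and clamp the advance to what is left. */
            next = load_be64(b->end - 8) << (8 * (8 - left));
            if (nbytes > left) nbytes = (int)left;
        } else {
            b->cache |= ~(uint64_t)0 >> b->bits;  /* exhausted: pad with ones */
            return;
        }
        next &= ~(uint64_t)0 << (64 - 8 * nbytes);  /* keep counted bytes only */
        b->cache |= next >> b->bits;
        b->bits  += 8 * nbytes;
        b->pos   += nbytes;
    }

Loading from buf_end - 8 is what lets the end-of-buffer case avoid the old per-byte loop; the clamp on the byte count keeps buf_pos from running past buf_end, mirroring the new refill_eob and pad_with_ones branches in the assembly.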
diff --git a/src/arm/64/util.S b/src/arm/64/util.S
index 9013fd4..64d73e3 100644
--- a/src/arm/64/util.S
+++ b/src/arm/64/util.S
@@ -149,6 +149,35 @@
trn2 \r7\().2d, \t9\().2d, \r7\().2d
.endm
+.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s
+
+ trn1 \o0\().2d, \r3\().2d, \r4\().2d
+ trn2 \o4\().2d, \r3\().2d, \r4\().2d
+ trn1 \o1\().2d, \r5\().2d, \r6\().2d
+ trn2 \o5\().2d, \r5\().2d, \r6\().2d
+ trn2 \o6\().2d, \t8\().2d, \r2\().2d
+ trn1 \o2\().2d, \t8\().2d, \r2\().2d
+ trn1 \o3\().2d, \t9\().2d, \r7\().2d
+ trn2 \o7\().2d, \t9\().2d, \r7\().2d
+.endm
+
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().16b, \r0\().16b, \r1\().16b
trn2 \t9\().16b, \r0\().16b, \r1\().16b
@@ -226,4 +255,16 @@
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
+.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \o0\().4s, \t4\().4s, \t6\().4s
+ trn2 \o2\().4s, \t4\().4s, \t6\().4s
+ trn1 \o1\().4s, \t5\().4s, \t7\().4s
+ trn2 \o3\().4s, \t5\().4s, \t7\().4s
+.endm
+
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/src/arm/asm.S b/src/arm/asm.S
index dc50415..fed73b3 100644
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -34,6 +34,50 @@
#define x18 do_not_use_x18
#define w18 do_not_use_w18
+#if HAVE_AS_ARCH_DIRECTIVE
+ .arch AS_ARCH_LEVEL
+#endif
+
+#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
+#define ENABLE_DOTPROD .arch_extension dotprod
+#define DISABLE_DOTPROD .arch_extension nodotprod
+#else
+#define ENABLE_DOTPROD
+#define DISABLE_DOTPROD
+#endif
+#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
+#define ENABLE_I8MM .arch_extension i8mm
+#define DISABLE_I8MM .arch_extension noi8mm
+#else
+#define ENABLE_I8MM
+#define DISABLE_I8MM
+#endif
+#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
+#define ENABLE_SVE .arch_extension sve
+#define DISABLE_SVE .arch_extension nosve
+#else
+#define ENABLE_SVE
+#define DISABLE_SVE
+#endif
+#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
+#define ENABLE_SVE2 .arch_extension sve2
+#define DISABLE_SVE2 .arch_extension nosve2
+#else
+#define ENABLE_SVE2
+#define DISABLE_SVE2
+#endif
+
+/* If we do support the .arch_extension directives, disable support for all
+ * the extensions that we may use, in case they were implicitly enabled by
+ * the .arch level. This makes it clear if we try to assemble an instruction
+ * from an unintended extension set; we only allow assembling such instructions
+ * within regions where we explicitly enable those extensions. */
+DISABLE_DOTPROD
+DISABLE_I8MM
+DISABLE_SVE
+DISABLE_SVE2
+
+
/* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
diff --git a/src/arm/cpu.c b/src/arm/cpu.c
index b7a0d3a..d9b1751 100644
--- a/src/arm/cpu.c
+++ b/src/arm/cpu.c
@@ -31,22 +31,95 @@
#include "src/arm/cpu.h"
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
-// NEON is always available; runtime tests are not needed.
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
#include <sys/auxv.h>
+#if ARCH_AARCH64
+
+#define HWCAP_AARCH64_ASIMDDP (1 << 20)
+#define HWCAP_AARCH64_SVE (1 << 22)
+#define HWCAP2_AARCH64_SVE2 (1 << 1)
+#define HWCAP2_AARCH64_I8MM (1 << 13)
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ unsigned long hw_cap2 = getauxval(AT_HWCAP2);
+#else
+ unsigned long hw_cap = 0;
+ unsigned long hw_cap2 = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+ elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
+#endif
+
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+ flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+ flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+ flags |= (hw_cap2 & HWCAP2_AARCH64_SVE2) ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+ return flags;
+}
+#else /* !ARCH_AARCH64 */
+
#ifndef HWCAP_ARM_NEON
-#define HWCAP_ARM_NEON (1 << 12)
+#define HWCAP_ARM_NEON (1 << 12)
#endif
-#define NEON_HWCAP HWCAP_ARM_NEON
+#define HWCAP_ARM_ASIMDDP (1 << 24)
+#define HWCAP_ARM_I8MM (1 << 27)
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
-#include <sys/auxv.h>
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+#else
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
+
+ unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+ return flags;
+}
+#endif /* ARCH_AARCH64 */
+
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+
+static int have_feature(const char *feature) {
+ int supported = 0;
+ size_t size = sizeof(supported);
+ if (sysctlbyname(feature, &supported, &size, NULL, 0) != 0) {
+ return 0;
+ }
+ return supported;
+}
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+ if (have_feature("hw.optional.arm.FEAT_DotProd"))
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+ if (have_feature("hw.optional.arm.FEAT_I8MM"))
+ flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+ /* No SVE and SVE2 feature detection available on Apple platforms. */
+ return flags;
+}
+
+#elif defined(_WIN32)
+#include <windows.h>
-#define NEON_HWCAP HWCAP_NEON
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+ if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+ /* No I8MM or SVE feature detection available on Windows at the time of
+ * writing. */
+ return flags;
+}
#elif defined(__ANDROID__)
+#include <ctype.h>
#include <stdio.h>
#include <string.h>
@@ -58,18 +131,25 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
char line_buffer[120];
const char *line;
+ size_t flaglen = strlen(flag);
while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
- if (strstr(line, flag)) {
- fclose(file);
- return 1;
+ // check all occurrences as whole words
+ const char *found = line;
+ while ((found = strstr(found, flag))) {
+ if ((found == line_buffer || !isgraph(found[-1])) &&
+ (isspace(found[flaglen]) || feof(file))) {
+ fclose(file);
+ return 1;
+ }
+ found += flaglen;
}
// if line is incomplete seek back to avoid splitting the search
// string into two buffers
- if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+ if (!strchr(line, '\n') && strlen(line) > flaglen) {
// use fseek since the 64 bit fseeko is only available since
// Android API level 24 and meson defines _FILE_OFFSET_BITS
// by default 64
- if (fseek(file, -strlen(flag), SEEK_CUR))
+ if (fseek(file, -flaglen, SEEK_CUR))
break;
}
}
@@ -78,22 +158,23 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
return 0;
}
-#endif
COLD unsigned dav1d_get_cpu_flags_arm(void) {
- unsigned flags = 0;
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
- flags |= DAV1D_ARM_CPU_FLAG_NEON;
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
- unsigned long hw_cap = getauxval(AT_HWCAP);
- flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
- unsigned long hw_cap = 0;
- elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
- flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(__ANDROID__)
- flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#endif
-
+ unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+#if ARCH_AARCH64
+ flags |= parse_proc_cpuinfo("sve") ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+ flags |= parse_proc_cpuinfo("sve2") ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+#endif /* ARCH_AARCH64 */
return flags;
}
+
+#else /* Unsupported OS */
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ return 0;
+}
+
+#endif
diff --git a/src/arm/cpu.h b/src/arm/cpu.h
index 8c10a1b..de9bde6 100644
--- a/src/arm/cpu.h
+++ b/src/arm/cpu.h
@@ -30,6 +30,10 @@
enum CpuFlags {
DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+ DAV1D_ARM_CPU_FLAG_DOTPROD = 1 << 1,
+ DAV1D_ARM_CPU_FLAG_I8MM = 1 << 2,
+ DAV1D_ARM_CPU_FLAG_SVE = 1 << 3,
+ DAV1D_ARM_CPU_FLAG_SVE2 = 1 << 4,
};
unsigned dav1d_get_cpu_flags_arm(void);
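The new flag bits are consumed the same way DAV1D_ARM_CPU_FLAG_NEON already is in the DSP init functions: query the combined runtime/compile-time flags once, then install progressively more specialised function pointers. A hedged sketch of that pattern (dav1d_get_cpu_flags() and the flag names are from the tree; the context type and function symbols below are hypothetical):

    /* Sketch of the intended gating pattern; ExampleDSPContext and the
     * example_fn_* symbols are hypothetical placeholders. */
    #include "src/cpu.h"

    typedef struct ExampleDSPContext {
        void (*example_fn)(void);
    } ExampleDSPContext;

    void example_fn_neon(void);     /* hypothetical implementations */
    void example_fn_dotprod(void);
    void example_fn_i8mm(void);

    static void example_dsp_init_arm(ExampleDSPContext *const c) {
        const unsigned flags = dav1d_get_cpu_flags();

        if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
        c->example_fn = example_fn_neon;

        if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;
        c->example_fn = example_fn_dotprod;   /* FEAT_DotProd detected */

        if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;
        c->example_fn = example_fn_i8mm;      /* FEAT_I8MM detected */
    }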
diff --git a/src/arm/itx.h b/src/arm/itx.h
index 2ecd086..17234e0 100644
--- a/src/arm/itx.h
+++ b/src/arm/itx.h
@@ -117,9 +117,11 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+ assign_itx_fn( , 4, 4, wht_wht, WHT_WHT, neon);
+
if (BITDEPTH == 16 && bpc != 10) return;
- assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
assign_itx16_fn(R, 8, 4, neon);
diff --git a/src/arm/msac.h b/src/arm/msac.h
index 9db0bf8..6eee0da 100644
--- a/src/arm/msac.h
+++ b/src/arm/msac.h
@@ -39,7 +39,7 @@ unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
-#if ARCH_AARCH64 || defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
diff --git a/src/cpu.h b/src/cpu.h
index c9009c7..d20c5f0 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -64,6 +64,20 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#endif
+#ifdef __ARM_FEATURE_DOTPROD
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+#ifdef __ARM_FEATURE_MATMUL_INT8
+ flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+#endif
+#if ARCH_AARCH64
+#ifdef __ARM_FEATURE_SVE
+ flags |= DAV1D_ARM_CPU_FLAG_SVE;
+#endif
+#ifdef __ARM_FEATURE_SVE2
+ flags |= DAV1D_ARM_CPU_FLAG_SVE2;
+#endif
+#endif /* ARCH_AARCH64 */
#elif ARCH_PPC64LE
#if defined(__VSX__)
flags |= DAV1D_PPC_CPU_FLAG_VSX;
diff --git a/src/ext/x86/x86inc.asm b/src/ext/x86/x86inc.asm
index 68b1f74..2282d9b 100644
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2022 x264 project
+;* Copyright (C) 2005-2024 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Henrik Gramner <henrik@gramner.com>
@@ -104,7 +104,7 @@
%endif
%define HAVE_PRIVATE_EXTERN 1
-%ifdef __NASM_VER__
+%ifdef __NASM_VERSION_ID__
%use smartalign
%if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
%define HAVE_PRIVATE_EXTERN 0
@@ -845,9 +845,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%1: %2
%endmacro
-; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
+ ; The GNU linker assumes the stack is executable by default.
[SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+
+ %ifdef __NASM_VERSION_ID__
+ %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
+ %if ARCH_X86_64
+ ; Control-flow Enforcement Technology (CET) properties.
+ [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
+ dd 0x00000004 ; n_namesz
+ dd gprsize + 8 ; n_descsz
+ dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0
+ db "GNU",0 ; n_name
+ dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
+ dd 0x00000004 ; pr_datasz
+ dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
+ dd 0x00000000 ; pr_padding
+ %endif
+ %endif
+ %endif
%endif
; Tell debuggers how large the function was.
@@ -883,21 +900,22 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
%assign cpuflags_aesni (1<<12) | cpuflags_sse42
-%assign cpuflags_gfni (1<<13) | cpuflags_sse42
-%assign cpuflags_avx (1<<14) | cpuflags_sse42
-%assign cpuflags_xop (1<<15) | cpuflags_avx
-%assign cpuflags_fma4 (1<<16) | cpuflags_avx
-%assign cpuflags_fma3 (1<<17) | cpuflags_avx
-%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
-%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
-%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
-
-%assign cpuflags_cache32 (1<<23)
-%assign cpuflags_cache64 (1<<24)
-%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<26)
+%assign cpuflags_clmul (1<<13) | cpuflags_sse42
+%assign cpuflags_gfni (1<<14) | cpuflags_aesni|cpuflags_clmul
+%assign cpuflags_avx (1<<15) | cpuflags_sse42
+%assign cpuflags_xop (1<<16) | cpuflags_avx
+%assign cpuflags_fma4 (1<<17) | cpuflags_avx
+%assign cpuflags_fma3 (1<<18) | cpuflags_avx
+%assign cpuflags_bmi1 (1<<19) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<20) | cpuflags_bmi1
+%assign cpuflags_avx2 (1<<21) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+
+%assign cpuflags_cache32 (1<<24)
+%assign cpuflags_cache64 (1<<25)
+%assign cpuflags_aligned (1<<26) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<27)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -939,13 +957,13 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%if ARCH_X86_64 || cpuflag(sse2)
- %ifdef __NASM_VER__
+ %ifdef __NASM_VERSION_ID__
ALIGNMODE p6
%else
CPU amdnop
%endif
%else
- %ifdef __NASM_VER__
+ %ifdef __NASM_VERSION_ID__
ALIGNMODE nop
%else
CPU basicnop
@@ -1035,6 +1053,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%if WIN64
AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
%endif
+ %xdefine bcstw 1to8
%xdefine bcstd 1to4
%xdefine bcstq 1to2
%endmacro
@@ -1050,6 +1069,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
INIT_CPUFLAGS %1
DEFINE_MMREGS ymm
AVX512_MM_PERMUTATION
+ %xdefine bcstw 1to16
%xdefine bcstd 1to8
%xdefine bcstq 1to4
%endmacro
@@ -1065,6 +1085,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
INIT_CPUFLAGS %1
DEFINE_MMREGS zmm
AVX512_MM_PERMUTATION
+ %xdefine bcstw 1to32
%xdefine bcstd 1to16
%xdefine bcstq 1to8
%endmacro
@@ -1607,11 +1628,11 @@ AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
@@ -1766,6 +1787,7 @@ GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
+GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
diff --git a/src/itx_1d.c b/src/itx_1d.c
index ca14fc8..8f75c65 100644
--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -1016,6 +1016,10 @@ void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
c[stride * i] *= 4;
}
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
assert(stride > 0);
const int in0 = c[0 * stride], in1 = c[1 * stride];
@@ -1032,3 +1036,4 @@ void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
c[2 * stride] = t1;
c[3 * stride] = t2 + t1;
}
+#endif
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
index 8ff245a..a226223 100644
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -159,6 +159,10 @@ inv_txfm_fn64(64, 16, 2)
inv_txfm_fn64(64, 32, 1)
inv_txfm_fn64(64, 64, 2)
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob
HIGHBD_DECL_SUFFIX)
@@ -179,6 +183,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
for (int x = 0; x < 4; x++)
dst[x] = iclip_pixel(dst[x] + *c++);
}
+#endif
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
@@ -236,7 +241,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
inv_txfm_add_identity_adst_##w##x##h##_c; \
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+#endif
assign_itx_all_fn84( 4, 4, );
assign_itx_all_fn84( 4, 8, R);
assign_itx_all_fn84( 4, 16, R);
diff --git a/src/loongarch/msac.S b/src/loongarch/msac.S
index c371eba..5bf1825 100644
--- a/src/loongarch/msac.S
+++ b/src/loongarch/msac.S
@@ -133,55 +133,58 @@ endconst
slli.d t4, t4, 48
vpickve2gr.d t6, vr2, 0
sub.d t6, t6, t4 // dif
- addi.d t6, t6, 1
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
- addi.d t6, t6, -1 // dif
addi.d a5, a0, 28 // cnt
- ld.w t7, a5, 0
- sub.w t7, t7, t4 // cnt-d
+ ld.w t0, a5, 0
sll.w t5, t5, t4
+ sub.w t7, t0, t4 // cnt-d
st.w t5, a4, 0 // store rng
- bge t7, zero, 9f
+ bgeu t0, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
- addi.d t1, a0, 8
- ld.d t1, t1, 0 // buf_end
+ ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
- blt t1, t2, 1f
+ bltu t1, t2, 2f
- ld.d t0, t0, 0 // next_bits
- addi.w t3, t7, 23 // shift_bits = cnt + 23
- addi.w t7, t7, 16 // cnt += 16
- revb.d t0, t0 // next_bits = bswap(next_bits)
- srli.w t4, t3, 3
- sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
- st.d t2, a0, 0
- andi t3, t3, 24 // shift_bits &= 24
- srl.d t0, t0, t3 // next_bits >>= shift_bits
- sub.w t3, t3, t7 // shift_bits -= 16 + cnt
- sll.d t0, t0, t3 // next_bits <<= shift_bits
- li.w t5, 48
- sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
- xor t6, t6, t0 // dif ^= next_bits
- b 9f
+ ld.d t3, t0, 0 // next_bits
+ addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
+ nor t3, t3, t3
+ sub.w t2, zero, t1
+ revb.d t3, t3 // next_bits = bswap(next_bits)
+ srli.w t2, t2, 3 // num_bytes_read
+ srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
+ b 3f
1:
- li.w t4, 40
- sub.w t5, t4, t7 // c = 40 - cnt
+ addi.w t3, t7, -48
+ srl.d t3, t3, t3 // pad with ones
+ b 4f
2:
- bge t0, t1, 3f
- ld.bu t2, t0, 0
- addi.d t0, t0, 1
- sll.d t2, t2, t5
- xor t6, t6, t2
- addi.w t5, t5, -8
- bge t5, zero, 2b
- // refill_eob_end
+ bgeu t0, t1, 1b
+ ld.d t3, t1, -8 // next_bits
+ sub.w t2, t2, t1
+ sub.w t1, t1, t0 // num_bytes_left
+ slli.w t2, t2, 3
+ srl.d t3, t3, t2
+ addi.w t2, t7, -48
+ nor t3, t3, t3
+ sub.w t4, zero, t2
+ revb.d t3, t3
+ srli.w t4, t4, 3
+ srl.d t3, t3, t2
+ sltu t2, t1, t4
+ maskeqz t1, t1, t2
+ masknez t2, t4, t2
+ or t2, t2, t1 // num_bytes_read
3:
- st.d t0, a0, 0 // s->buf_pos = buf_pos
- sub.w t7, t4, t5 // cnt = 40 - c
+ slli.w t1, t2, 3
+ add.d t0, t0, t2
+ add.w t7, t7, t1 // cnt += num_bits_read
+ st.d t0, a0, 0
+4:
+ or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a5, 0 // store cnt
st.d t6, a6, 0 // store dif
@@ -208,7 +211,6 @@ function msac_decode_bool_lsx
srli.w t2, t0, 8 // r >> 8
mul.w t2, t2, a1
ld.w a5, a0, 28 // cnt
- addi.d t1, t1, 1 // dif + 1
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
@@ -226,49 +228,53 @@ function msac_decode_bool_lsx
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
- addi.d t6, t6, -1 // dif
- sub.w t7, a5, t4 // cnt-d
sll.w t5, t5, t4
+ sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
- bge t7, zero, 9f
+ bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
- addi.d t1, a0, 8
- ld.d t1, t1, 0 // buf_end
+ ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
- blt t1, t2, 1f
+ bltu t1, t2, 2f
- ld.d t0, t0, 0 // next_bits
- addi.w t3, t7, 23 // shift_bits = cnt + 23
- addi.w t7, t7, 16 // cnt += 16
- revb.d t0, t0 // next_bits = bswap(next_bits)
- srli.w t4, t3, 3
- sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
- st.d t2, a0, 0
- andi t3, t3, 24 // shift_bits &= 24
- srl.d t0, t0, t3 // next_bits >>= shift_bits
- sub.w t3, t3, t7 // shift_bits -= 16 + cnt
- sll.d t0, t0, t3 // next_bits <<= shift_bits
- li.w t5, 48
- sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
- xor t6, t6, t0 // dif ^= next_bits
- b 9f
+ ld.d t3, t0, 0 // next_bits
+ addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
+ nor t3, t3, t3
+ sub.w t2, zero, t1
+ revb.d t3, t3 // next_bits = bswap(next_bits)
+ srli.w t2, t2, 3 // num_bytes_read
+ srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
+ b 3f
1:
- li.w t4, 40
- sub.w t5, t4, t7 // c = 40 - cnt
+ addi.w t3, t7, -48
+ srl.d t3, t3, t3 // pad with ones
+ b 4f
2:
- bge t0, t1, 3f
- ld.bu t2, t0, 0
- addi.d t0, t0, 1
- sll.d t2, t2, t5
- xor t6, t6, t2
- addi.w t5, t5, -8
- bge t5, zero, 2b
- // refill_eob_end
+ bgeu t0, t1, 1b
+ ld.d t3, t1, -8 // next_bits
+ sub.w t2, t2, t1
+ sub.w t1, t1, t0 // num_bytes_left
+ slli.w t2, t2, 3
+ srl.d t3, t3, t2
+ addi.w t2, t7, -48
+ nor t3, t3, t3
+ sub.w t4, zero, t2
+ revb.d t3, t3
+ srli.w t4, t4, 3
+ srl.d t3, t3, t2
+ sltu t2, t1, t4
+ maskeqz t1, t1, t2
+ masknez t2, t4, t2
+ or t2, t2, t1 // num_bytes_read
3:
- st.d t0, a0, 0 // s->buf_pos = buf_pos
- sub.w t7, t4, t5 // cnt = 40 - c
+ slli.w t1, t2, 3
+ add.d t0, t0, t2
+ add.w t7, t7, t1 // cnt += num_bits_read
+ st.d t0, a0, 0
+4:
+ or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
@@ -313,54 +319,56 @@ function msac_decode_bool_adapt_lsx
st.h t0, a1, 2
.renorm:
- // renorm
- addi.d t6, t6, 1
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
- addi.d t6, t6, -1 // dif
- sub.w t7, a5, t4 // cnt-d
sll.w t5, t5, t4
+ sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
- bge t7, zero, 9f
+ bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
- addi.d t1, a0, 8
- ld.d t1, t1, 0 // buf_end
+ ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
- blt t1, t2, 1f
+ bltu t1, t2, 2f
- ld.d t0, t0, 0 // next_bits
- addi.w t3, t7, 23 // shift_bits = cnt + 23
- addi.w t7, t7, 16 // cnt += 16
- revb.d t0, t0 // next_bits = bswap(next_bits)
- srli.w t4, t3, 3
- sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
- st.d t2, a0, 0
- andi t3, t3, 24 // shift_bits &= 24
- srl.d t0, t0, t3 // next_bits >>= shift_bits
- sub.w t3, t3, t7 // shift_bits -= 16 + cnt
- sll.d t0, t0, t3 // next_bits <<= shift_bits
- li.w t5, 48
- sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
- xor t6, t6, t0 // dif ^= next_bits
- b 9f
+ ld.d t3, t0, 0 // next_bits
+ addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
+ nor t3, t3, t3
+ sub.w t2, zero, t1
+ revb.d t3, t3 // next_bits = bswap(next_bits)
+ srli.w t2, t2, 3 // num_bytes_read
+ srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
+ b 3f
1:
- li.w t4, 40
- sub.w t5, t4, t7 // c = 40 - cnt
+ addi.w t3, t7, -48
+ srl.d t3, t3, t3 // pad with ones
+ b 4f
2:
- bge t0, t1, 3f
- ld.bu t2, t0, 0
- addi.d t0, t0, 1
- sll.d t2, t2, t5
- xor t6, t6, t2
- addi.w t5, t5, -8
- bge t5, zero, 2b
- // refill_eob_end
+ bgeu t0, t1, 1b
+ ld.d t3, t1, -8 // next_bits
+ sub.w t2, t2, t1
+ sub.w t1, t1, t0 // num_bytes_left
+ slli.w t2, t2, 3
+ srl.d t3, t3, t2
+ addi.w t2, t7, -48
+ nor t3, t3, t3
+ sub.w t4, zero, t2
+ revb.d t3, t3
+ srli.w t4, t4, 3
+ srl.d t3, t3, t2
+ sltu t2, t1, t4
+ maskeqz t1, t1, t2
+ masknez t2, t4, t2
+ or t2, t2, t1 // num_bytes_read
3:
- st.d t0, a0, 0 // s->buf_pos = buf_pos
- sub.w t7, t4, t5 // cnt = 40 - c
+ slli.w t1, t2, 3
+ add.d t0, t0, t2
+ add.w t7, t7, t1 // cnt += num_bits_read
+ st.d t0, a0, 0
+4:
+ or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
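
In the rewritten LoongArch refill above, the end-of-buffer path clamps how many bytes are consumed without branching: the sltu/maskeqz/masknez/or sequence computes min(num_bytes_left, bytes wanted). A hedged C equivalent of those four instructions (names are illustrative, not part of dav1d):

    #include <stdint.h>

    /* Branchless min(a, b), mirroring sltu t2, a, b; maskeqz a, a, t2;
     * masknez t2, b, t2; or. */
    static inline uint32_t branchless_min_u32(const uint32_t a, const uint32_t b) {
        const uint32_t take_a = a < b;          /* sltu: 1 if a < b        */
        const uint32_t keep_a = take_a ? a : 0; /* maskeqz: a when take_a  */
        const uint32_t keep_b = take_a ? 0 : b; /* masknez: b when !take_a */
        return keep_a | keep_b;                 /* or                      */
    }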
diff --git a/src/msac.c b/src/msac.c
index 43d8ae5..971ba85 100644
--- a/src/msac.c
+++ b/src/msac.c
@@ -43,15 +43,40 @@ static inline void ctx_refill(MsacContext *const s) {
const uint8_t *buf_end = s->buf_end;
int c = EC_WIN_SIZE - s->cnt - 24;
ec_win dif = s->dif;
- while (c >= 0 && buf_pos < buf_end) {
- dif ^= ((ec_win)*buf_pos++) << c;
+ do {
+ if (buf_pos >= buf_end) {
+ // set remaining bits to 1;
+ dif |= ~(~(ec_win)0xff << c);
+ break;
+ }
+ dif |= (ec_win)(*buf_pos++ ^ 0xff) << c;
c -= 8;
- }
+ } while (c >= 0);
s->dif = dif;
s->cnt = EC_WIN_SIZE - c - 24;
s->buf_pos = buf_pos;
}
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
+ const int n, unsigned k)
+{
+ assert(n >> k == 8);
+
+ unsigned a = 0;
+ if (dav1d_msac_decode_bool_equi(s)) {
+ if (dav1d_msac_decode_bool_equi(s))
+ k += dav1d_msac_decode_bool_equi(s) + 1;
+ a = 1 << k;
+ }
+ const unsigned v = dav1d_msac_decode_bools(s, k) + a;
+ return ref * 2 <= n ? inv_recenter(ref, v) :
+ n - 1 - inv_recenter(n - 1 - ref, v);
+}
+
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
/* Takes updated dif and range values, renormalizes them so that
* 32768 <= rng < 65536 (reading more bytes from the stream into dif if
* necessary), and stores them back in the decoder context.
@@ -61,11 +86,13 @@ static inline void ctx_norm(MsacContext *const s, const ec_win dif,
const unsigned rng)
{
const int d = 15 ^ (31 ^ clz(rng));
+ const int cnt = s->cnt;
assert(rng <= 65535U);
- s->cnt -= d;
- s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+ s->dif = dif << d;
s->rng = rng << d;
- if (s->cnt < 0)
+ s->cnt = cnt - d;
+ // unsigned compare avoids redundant refills at eob
+ if ((unsigned)cnt < (unsigned)d)
ctx_refill(s);
}
@@ -100,22 +127,6 @@ unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
return !ret;
}
-int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
- const int n, unsigned k)
-{
- assert(n >> k == 8);
-
- unsigned a = 0;
- if (dav1d_msac_decode_bool_equi(s)) {
- if (dav1d_msac_decode_bool_equi(s))
- k += dav1d_msac_decode_bool_equi(s) + 1;
- a = 1 << k;
- }
- const unsigned v = dav1d_msac_decode_bools(s, k) + a;
- return ref * 2 <= n ? inv_recenter(ref, v) :
- n - 1 - inv_recenter(n - 1 - ref, v);
-}
-
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
@@ -188,13 +199,14 @@ unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
}
return tok;
}
+#endif
void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
const size_t sz, const int disable_cdf_update_flag)
{
s->buf_pos = data;
s->buf_end = data + sz;
- s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+ s->dif = 0;
s->rng = 0x8000;
s->cnt = -15;
s->allow_update_cdf = !disable_cdf_update_flag;
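
The C refill above changes conventions: input bytes are complemented as they enter dif, and once the buffer is exhausted the remaining low bits are filled with ones, so decodes past the end of the buffer behave as if an all-ones tail followed and no per-byte loop or redundant refill is needed. A minimal standalone sketch (not dav1d API) showing what the padding expression does for one example position:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t ec_win;  /* the 64-bit window case */

    int main(void) {
        ec_win dif = 0;
        const int c = 24;              /* example insertion position */
        dif |= ~(~(ec_win)0xff << c);  /* sets bits [0, c + 8) to 1  */
        printf("dif = %016" PRIx64 "\n", dif);  /* prints 00000000ffffffff */
        return 0;
    }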
diff --git a/src/ppc/cdef_tmpl.c b/src/ppc/cdef_tmpl.c
index e2e7598..6ef87ad 100644
--- a/src/ppc/cdef_tmpl.c
+++ b/src/ppc/cdef_tmpl.c
@@ -29,11 +29,10 @@
#if BITDEPTH == 8
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
- const int damping)
+ const uint16_t shift)
{
const i16x8 zero = vec_splat_s16(0);
if (!threshold) return zero;
- const uint16_t shift = imax(0, damping - ulog2(threshold));
const i16x8 abs_diff = vec_abs(diff);
const b16x8 mask = vec_cmplt(diff, zero);
const i16x8 thr = vec_splats(threshold);
@@ -44,7 +43,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
return vec_sel(min, neg, mask);
}
-static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy4xN(uint16_t *tmp,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], const uint8_t *const top,
const uint8_t *const bottom, const int w, const int h,
@@ -114,7 +113,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
}
}
-static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy8xN(uint16_t *tmp,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], const uint8_t *const top,
const uint8_t *const bottom, const int w, const int h,
@@ -218,16 +217,12 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
#define LOAD_PIX(addr) \
const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
- i16x8 max = px; \
- i16x8 min = px; \
i16x8 sum = vec_splat_s16(0);
#define LOAD_PIX4(addr) \
const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
- const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+ const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \
const i16x8 px = vec_xxpermdi(a, b, 0); \
- i16x8 max = px; \
- i16x8 min = px; \
i16x8 sum = vec_splat_s16(0);
#define LOAD_DIR(p, addr, o0, o1) \
@@ -238,22 +233,26 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
#define LOAD_DIR4(p, addr, o0, o1) \
LOAD_DIR(p ## a, addr, o0, o1) \
- LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+ LOAD_DIR(p ## b, addr + 8, o0, o1) \
const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
-#define CONSTRAIN(p, strength) \
+#define CONSTRAIN(p, strength, shift) \
const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
- i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
- i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
- i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
- i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+ i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \
+ i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \
+ i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \
+ i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift);
+
+#define SETUP_MINMAX \
+ i16x8 max = px; \
+ i16x8 min = px; \
#define MIN_MAX(p) \
max = max_mask(p ## 0, max); \
@@ -265,19 +264,16 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
max = max_mask(p ## 3, max); \
min = vec_min(p ## 3, min);
-#define PRI_0(p) \
- p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
- p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+#define MAKE_TAPS \
+ const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \
+ const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \
+ const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd));
-#define PRI_1(p) \
- p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
- p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
-
-#define SEC_0(p) \
- p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
- p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
- p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
- p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+#define PRI_0_UPDATE_SUM(p) \
+ sum = vec_madd(tap0, p ## _c0, sum); \
+ sum = vec_madd(tap0, p ## _c1, sum); \
+ sum = vec_madd(tap1, p ## _c2, sum); \
+ sum = vec_madd(tap1, p ## _c3, sum);
#define UPDATE_SUM(p) \
const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
@@ -285,92 +281,198 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
sum = vec_add(sum, p ## sum0); \
sum = vec_add(sum, p ## sum1);
+#define SEC_0_UPDATE_SUM(p) \
+ sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \
+ sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \
+ sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \
+ sum = vec_madd(vec_splat_s16(2), p ## _c3, sum);
+
+#define BIAS \
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \
+ bias = vec_sub(vec_splat_s16(8), bias); \
+
+#define STORE4 \
+ dst[0] = vdst[0]; \
+ dst[1] = vdst[1]; \
+ dst[2] = vdst[2]; \
+ dst[3] = vdst[3]; \
+\
+ tmp += 8; \
+ dst += PXSTRIDE(dst_stride); \
+ dst[0] = vdst[4]; \
+ dst[1] = vdst[5]; \
+ dst[2] = vdst[6]; \
+ dst[3] = vdst[7]; \
+\
+ tmp += 8; \
+ dst += PXSTRIDE(dst_stride);
+
+#define STORE4_CLAMPED \
+ BIAS \
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+ STORE4
+
+#define STORE4_UNCLAMPED \
+ BIAS \
+ i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ STORE4
+
+#define STORE8 \
+ dst[0] = vdst[0]; \
+ dst[1] = vdst[1]; \
+ dst[2] = vdst[2]; \
+ dst[3] = vdst[3]; \
+ dst[4] = vdst[4]; \
+ dst[5] = vdst[5]; \
+ dst[6] = vdst[6]; \
+ dst[7] = vdst[7]; \
+\
+ tmp += 16; \
+ dst += PXSTRIDE(dst_stride);
+
+#define STORE8_CLAMPED \
+ BIAS \
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+ STORE8
+
+#define STORE8_UNCLAMPED \
+ BIAS \
+ i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ STORE8
+
+#define DIRECTIONS(w, tmp_stride) \
+ static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, \
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, \
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, \
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, \
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, \
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } \
+ };
+
+DIRECTIONS(4, 8)
+DIRECTIONS(8, 16)
+
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], const pixel *const top,
const pixel *const bottom, const int w, const int h,
const int pri_strength, const int sec_strength, const int dir,
- const int damping, const enum CdefEdgeFlags edges,
- const ptrdiff_t tmp_stride, uint16_t *tmp)
+ const int pri_shift, const int sec_shift,
+ const enum CdefEdgeFlags edges, uint16_t *tmp)
{
- const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
- { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
- { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
- { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
- };
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
- const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
- const int off1 = cdef_directions[dir][0];
- const int off1_1 = cdef_directions[dir][1];
+ const int off1 = cdef_directions4[dir][0];
+ const int off1_1 = cdef_directions4[dir][1];
- const int off2 = cdef_directions[(dir + 2) & 7][0];
- const int off3 = cdef_directions[(dir + 6) & 7][0];
+ const int off2 = cdef_directions4[(dir + 2) & 7][0];
+ const int off3 = cdef_directions4[(dir + 6) & 7][0];
- const int off2_1 = cdef_directions[(dir + 2) & 7][1];
- const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+ const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
- copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+ MAKE_TAPS
for (int y = 0; y < h / 2; y++) {
LOAD_PIX4(tmp)
+ SETUP_MINMAX
+
// Primary pass
LOAD_DIR4(p, tmp, off1, off1_1)
- CONSTRAIN(p, pri_strength)
+ CONSTRAIN(p, pri_strength, pri_shift)
MIN_MAX(p)
- PRI_0(p)
- PRI_1(p)
-
- UPDATE_SUM(p)
+ PRI_0_UPDATE_SUM(p)
// Secondary pass 1
LOAD_DIR4(s, tmp, off2, off3)
- CONSTRAIN(s, sec_strength)
+ CONSTRAIN(s, sec_strength, sec_shift)
MIN_MAX(s)
- SEC_0(s)
-
- UPDATE_SUM(s)
+ SEC_0_UPDATE_SUM(s)
// Secondary pass 2
LOAD_DIR4(s2, tmp, off2_1, off3_1)
- CONSTRAIN(s2, sec_strength)
+ CONSTRAIN(s2, sec_strength, sec_shift)
MIN_MAX(s2)
UPDATE_SUM(s2)
// Store
- i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
- bias = vec_sub(vec_splat_s16(8), bias);
- i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
- i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
- dst[0] = vdst[0];
- dst[1] = vdst[1];
- dst[2] = vdst[2];
- dst[3] = vdst[3];
-
- tmp += tmp_stride;
- dst += PXSTRIDE(dst_stride);
- dst[0] = vdst[4];
- dst[1] = vdst[5];
- dst[2] = vdst[6];
- dst[3] = vdst[7];
-
- tmp += tmp_stride;
- dst += PXSTRIDE(dst_stride);
+ STORE4_CLAMPED
+ }
+}
+
+static inline void
+filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int dir,
+ const int pri_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int off1 = cdef_directions4[dir][0];
+ const int off1_1 = cdef_directions4[dir][1];
+
+ MAKE_TAPS
+
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+
+ // Primary pass
+ LOAD_DIR4(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength, pri_shift)
+
+ PRI_0_UPDATE_SUM(p)
+
+ STORE4_UNCLAMPED
+ }
+}
+
+static inline void
+filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int sec_strength, const int dir,
+ const int sec_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int off2 = cdef_directions4[(dir + 2) & 7][0];
+ const int off3 = cdef_directions4[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
+
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+ // Secondary pass 1
+ LOAD_DIR4(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength, sec_shift)
+
+ SEC_0_UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength, sec_shift)
+
+ UPDATE_SUM(s2)
+
+ STORE4_UNCLAMPED
}
}
@@ -379,88 +481,121 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], const pixel *const top,
const pixel *const bottom, const int w, const int h,
const int pri_strength, const int sec_strength, const int dir,
- const int damping, const enum CdefEdgeFlags edges,
- const ptrdiff_t tmp_stride, uint16_t *tmp)
+ const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
{
- const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
- { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
- { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
- { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
- };
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int off1 = cdef_directions8[dir][0];
+ const int off1_1 = cdef_directions8[dir][1];
- const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
- const int off1 = cdef_directions[dir][0];
- const int off1_1 = cdef_directions[dir][1];
+ const int off2 = cdef_directions8[(dir + 2) & 7][0];
+ const int off3 = cdef_directions8[(dir + 6) & 7][0];
- const int off2 = cdef_directions[(dir + 2) & 7][0];
- const int off3 = cdef_directions[(dir + 6) & 7][0];
+ const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
- const int off2_1 = cdef_directions[(dir + 2) & 7][1];
- const int off3_1 = cdef_directions[(dir + 6) & 7][1];
-
- copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+ MAKE_TAPS
for (int y = 0; y < h; y++) {
LOAD_PIX(tmp)
+ SETUP_MINMAX
+
// Primary pass
LOAD_DIR(p, tmp, off1, off1_1)
- CONSTRAIN(p, pri_strength)
+ CONSTRAIN(p, pri_strength, pri_shift)
MIN_MAX(p)
- PRI_0(p)
- PRI_1(p)
-
- UPDATE_SUM(p)
+ PRI_0_UPDATE_SUM(p)
// Secondary pass 1
LOAD_DIR(s, tmp, off2, off3)
- CONSTRAIN(s, sec_strength)
+ CONSTRAIN(s, sec_strength, sec_shift)
MIN_MAX(s)
- SEC_0(s)
-
- UPDATE_SUM(s)
+ SEC_0_UPDATE_SUM(s)
// Secondary pass 2
LOAD_DIR(s2, tmp, off2_1, off3_1)
- CONSTRAIN(s2, sec_strength)
+ CONSTRAIN(s2, sec_strength, sec_shift)
MIN_MAX(s2)
UPDATE_SUM(s2)
// Store
- i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
- bias = vec_sub(vec_splat_s16(8), bias);
- i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
- i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
- dst[0] = vdst[0];
- dst[1] = vdst[1];
- dst[2] = vdst[2];
- dst[3] = vdst[3];
- dst[4] = vdst[4];
- dst[5] = vdst[5];
- dst[6] = vdst[6];
- dst[7] = vdst[7];
-
- tmp += tmp_stride;
- dst += PXSTRIDE(dst_stride);
+ STORE8_CLAMPED
+ }
+
+}
+
+static inline void
+filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int dir,
+ const int pri_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int off1 = cdef_directions8[dir][0];
+ const int off1_1 = cdef_directions8[dir][1];
+
+ MAKE_TAPS
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Primary pass
+ LOAD_DIR(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength, pri_shift)
+
+ PRI_0_UPDATE_SUM(p)
+
+ STORE8_UNCLAMPED
}
+}
+
+static inline void
+filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int sec_strength, const int dir,
+ const int sec_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int off2 = cdef_directions8[(dir + 2) & 7][0];
+ const int off3 = cdef_directions8[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Secondary pass 1
+ LOAD_DIR(s, tmp, off2, off3)
+ CONSTRAIN(s, sec_strength, sec_shift)
+
+ SEC_0_UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength, sec_shift)
+
+ UPDATE_SUM(s2)
+
+ STORE8_UNCLAMPED
+ }
}
#define cdef_fn(w, h, tmp_stride) \
@@ -477,8 +612,22 @@ void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
- filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
- sec_strength, dir, damping, edges, tmp_stride, tmp); \
+ copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \
+ if (pri_strength) { \
+ const int pri_shift = imax(0, damping - ulog2(pri_strength)); \
+ if (sec_strength) { \
+ const int sec_shift = damping - ulog2(sec_strength); \
+ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ sec_strength, dir, pri_shift, sec_shift, edges, tmp); \
+ } else { \
+ filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ dir, pri_shift, edges, tmp); \
+ } \
+ } else { \
+ const int sec_shift = damping - ulog2(sec_strength); \
+ filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \
+ dir, sec_shift, edges, tmp); \
+ } \
}
cdef_fn(4, 4, 8);
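
The VSX CDEF rewrite above hoists the damping-to-shift conversion out of vconstrain and into the cdef_fn wrapper, and adds dedicated primary-only and secondary-only paths so a zero strength skips an entire pass along with the min/max clamping it would have needed. A scalar sketch of the constrain step with the precomputed shift; the names are illustrative and this is not the vector code itself:

    #include <stdlib.h>

    /* Scalar CDEF constrain, with shift = max(0, damping - ulog2(strength))
     * computed once by the caller, as in the refactored cdef_fn above. */
    static inline int constrain_sketch(const int diff, const int strength,
                                       const int shift)
    {
        if (!strength) return 0;
        const int adiff = abs(diff);
        int clamp = strength - (adiff >> shift);
        if (clamp < 0) clamp = 0;
        const int v = adiff < clamp ? adiff : clamp;
        return diff < 0 ? -v : v;  /* reapply the sign of diff */
    }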
diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index 5677cf4..dfec548 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -163,48 +163,48 @@ endfunc
vssub.vv \o3, v16, v20
.endm
-.macro iadst_4 o0, o1, o2, o3
+.macro iadst_4 o0, o1, o2, o3, lm2, lm
li t1, 1321
li t2, 3803
li t3, 2482
- vwmul.vx v4, v0, t1
- vwmul.vx v5, v0, t3
+ vwmul.vx v16, v0, t1
+ vwmul.vx v18, v0, t3
neg t1, t1
- vwmacc.vx v4, t2, v2
- vwmacc.vx v5, t1, v2
+ vwmacc.vx v16, t2, v2
+ vwmacc.vx v18, t1, v2
neg t2, t2
- vwmacc.vx v4, t3, v3
- vwmacc.vx v5, t2, v3
+ vwmacc.vx v16, t3, v3
+ vwmacc.vx v18, t2, v3
- vwsub.vv v6, v0, v2
- vwadd.wv v6, v6, v3
+ vwsub.vv v20, v0, v2
+ vwadd.wv v20, v20, v3
li t1, 3344
- vwmul.vx v7, v1, t1
+ vwmul.vx v22, v1, t1
- vsetvli zero, zero, e32, m1, ta, ma
+ vsetvli zero, zero, e32, \lm2, ta, ma
- vmul.vx v6, v6, t1
+ vmul.vx v20, v20, t1
- vadd.vv v8, v4, v5
- vadd.vv v4, v4, v7
- vadd.vv v5, v5, v7
- vsub.vv v7, v8, v7
+ vadd.vv v24, v16, v18
+ vadd.vv v16, v16, v22
+ vadd.vv v18, v18, v22
+ vsub.vv v22, v24, v22
li t1, 2048
- vadd.vx v4, v4, t1
- vadd.vx v5, v5, t1
- vadd.vx v6, v6, t1
- vadd.vx v7, v7, t1
+ vadd.vx v16, v16, t1
+ vadd.vx v18, v18, t1
+ vadd.vx v20, v20, t1
+ vadd.vx v22, v22, t1
- vsetvli zero, zero, e16, mf2, ta, ma
+ vsetvli zero, zero, e16, \lm, ta, ma
- vnsra.wi \o0, v4, 12
- vnsra.wi \o1, v5, 12
- vnsra.wi \o2, v6, 12
- vnsra.wi \o3, v7, 12
+ vnsra.wi \o0, v16, 12
+ vnsra.wi \o1, v18, 12
+ vnsra.wi \o2, v20, 12
+ vnsra.wi \o3, v22, 12
.endm
function inv_dct_e16_x4_rvv, export=1, ext=v
@@ -213,12 +213,22 @@ function inv_dct_e16_x4_rvv, export=1, ext=v
endfunc
function inv_adst_e16_x4_rvv, export=1, ext=v
- iadst_4 v0, v1, v2, v3
+ iadst_4 v0, v1, v2, v3, m1, mf2
jr t0
endfunc
function inv_flipadst_e16_x4_rvv, export=1, ext=v
- iadst_4 v3, v2, v1, v0
+ iadst_4 v3, v2, v1, v0, m1, mf2
+ jr t0
+endfunc
+
+function inv_adst_e16_x4w_rvv, export=1, ext=v
+ iadst_4 v0, v1, v2, v3, m2, m1
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x4w_rvv, export=1, ext=v
+ iadst_4 v3, v2, v1, v0, m2, m1
jr t0
endfunc
@@ -328,6 +338,8 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
.ifc \variant, identity_
// The identity vsadd.vv and downshift vssra.vi 1 cancel out
+
+ j L(itx_8x8_epilog)
.else
jalr t0, a4
@@ -339,8 +351,8 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
vssra.vi v5, v5, 1
vssra.vi v6, v6, 1
vssra.vi v7, v7, 1
-.endif
+L(itx_8x8_epilog):
vsseg8e16.v v0, (a2)
vle16.v v0, (a2)
addi t0, a2, 16
@@ -374,9 +386,7 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
vmv.v.x v8, zero
vse16.v v8, (a2)
-.ifc \variant, identity_
itx_8x8_end:
-.endif
vsetivli zero, 8, e8, mf2, ta, ma
vle8.v v8, (a0)
add t0, a0, a1
@@ -403,7 +413,7 @@ itx_8x8_end:
vwaddu.wv v6, v6, v14
vwaddu.wv v7, v7, v15
- vsetvli zero, zero, e16, m1
+ vsetvli zero, zero, e16, m1, ta, ma
vmax.vx v0, v0, zero
vmax.vx v1, v1, zero
vmax.vx v2, v2, zero
@@ -441,11 +451,12 @@ itx_8x8_end:
vse8.v v15, (a0)
ret
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
function inv_identity_e16_x8_rvv, export=1, ext=v
vsadd.vv v0, v0, v0
@@ -530,23 +541,23 @@ endfunc
li t5, 2598
li t6, 3166
- vwmul.vx v8, v7, t1
+ vwmul.vx v16, v7, t1
neg t1, t1
- vwmul.vx v10, v7, t2
- vwmacc.vx v8, t2, v0
- vwmacc.vx v10, t1, v0
+ vwmul.vx v18, v7, t2
+ vwmacc.vx v16, t2, v0
+ vwmacc.vx v18, t1, v0
- vwmul.vx v12, v5, t3
+ vwmul.vx v20, v5, t3
neg t3, t3
- vwmul.vx v14, v5, t4
- vwmacc.vx v12, t4, v2
- vwmacc.vx v14, t3, v2
+ vwmul.vx v22, v5, t4
+ vwmacc.vx v20, t4, v2
+ vwmacc.vx v22, t3, v2
- vwmul.vx v16, v3, t5
+ vwmul.vx v24, v3, t5
neg t5, t5
- vwmul.vx v18, v3, t6
- vwmacc.vx v16, t6, v4
- vwmacc.vx v18, t5, v4
+ vwmul.vx v26, v3, t6
+ vwmacc.vx v24, t6, v4
+ vwmacc.vx v26, t5, v4
li t1, 2048
li t2, 1189
@@ -555,95 +566,95 @@ endfunc
li t5, 3784
li t6, 2896
- vwmul.vx v20, v1, t2
+ vwmul.vx v28, v1, t2
neg t2, t2
- vwmul.vx v22, v1, t3
- vwmacc.vx v20, t3, v6
- vwmacc.vx v22, t2, v6
-
- vwadd.wx v8, v8, t1
- vwadd.wx v10, v10, t1
- vwadd.wx v12, v12, t1
- vwadd.wx v14, v14, t1
+ vwmul.vx v30, v1, t3
+ vwmacc.vx v28, t3, v6
+ vwmacc.vx v30, t2, v6
+
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
- vnsra.wi v8, v8, 12
- vnsra.wi v10, v10, 12
- vnsra.wi v12, v12, 12
- vnsra.wi v14, v14, 12
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v28, v28, 12
+ vnsra.wi v30, v30, 12
- vssub.vv v4, v8, v16
- vsadd.vv v8, v8, v16
- vsadd.vv v1, v10, v18
- vsadd.vv v2, v12, v20
- vsadd.vv v3, v14, v22
- vssub.vv v5, v10, v18
- vssub.vv v6, v12, v20
- vssub.vv v22, v14, v22
-
- vsadd.vv \o0, v8, v2
- vsadd.vv \o7, v1, v3
- vssub.vv v2, v8, v2
- vssub.vv v3, v1, v3
-
- vwmul.vx v8, v4, t5
- vwmul.vx v10, v4, t4
- vwmul.vx v12, v22, t5
- vwmul.vx v14, v22, t4
- vwmacc.vx v8, t4, v5
+ vssub.vv v4, v16, v24
+ vsadd.vv v16, v16, v24
+ vsadd.vv v1, v18, v26
+ vsadd.vv v2, v20, v28
+ vsadd.vv v3, v22, v30
+ vssub.vv v5, v18, v26
+ vssub.vv v6, v20, v28
+ vssub.vv v30, v22, v30
+
+ vsadd.vv \o0, v16, v2
+ vsadd.vv \o7, v1, v3
+ vssub.vv v2, v16, v2
+ vssub.vv v3, v1, v3
+
+ vwmul.vx v16, v4, t5
+ vwmul.vx v18, v4, t4
+ vwmul.vx v20, v30, t5
+ vwmul.vx v22, v30, t4
+ vwmacc.vx v16, t4, v5
neg t4, t4
- vwmacc.vx v14, t5, v6
+ vwmacc.vx v22, t5, v6
neg t5, t5
- vwmacc.vx v12, t4, v6
- vwmacc.vx v10, t5, v5
-
- vwadd.wx v8, v8, t1
- vwadd.wx v10, v10, t1
- vwadd.wx v12, v12, t1
- vwadd.wx v14, v14, t1
-
- vnsra.wi v8, v8, 12
- vnsra.wi v10, v10, 12
- vnsra.wi v12, v12, 12
- vnsra.wi v14, v14, 12
-
- vsadd.vv \o1, v8, v12
- vsadd.vv \o6, v10, v14
- vssub.vv v8, v8, v12
- vssub.vv v9, v10, v14
-
- vwmul.vx v10, v2, t6
- vwmul.vx v12, v2, t6
- vwmul.vx v14, v8, t6
- vwmul.vx v16, v8, t6
- vwmacc.vx v10, t6, v3
- vwmacc.vx v14, t6, v9
- neg t6, t6
- vwmacc.vx v12, t6, v3
- vwmacc.vx v16, t6, v9
+ vwmacc.vx v20, t4, v6
+ vwmacc.vx v18, t5, v5
- vwadd.wx v10, v10, t1
- vwadd.wx v12, v12, t1
- vwadd.wx v14, v14, t1
vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
- vnsra.wi \o3, v10, 12
- vnsra.wi \o4, v12, 12
- vnsra.wi \o2, v14, 12
- vnsra.wi \o5, v16, 12
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
- vmv.v.x v8, zero
- vssub.vv \o1, v8, \o1
- vssub.vv \o3, v8, \o3
- vssub.vv \o5, v8, \o5
- vssub.vv \o7, v8, \o7
+ vsadd.vv \o1, v16, v20
+ vsadd.vv \o6, v18, v22
+ vssub.vv v16, v16, v20
+ vssub.vv v17, v18, v22
+
+ vwmul.vx v18, v2, t6
+ vwmul.vx v20, v2, t6
+ vwmul.vx v22, v16, t6
+ vwmul.vx v24, v16, t6
+ vwmacc.vx v18, t6, v3
+ vwmacc.vx v22, t6, v17
+ neg t6, t6
+ vwmacc.vx v20, t6, v3
+ vwmacc.vx v24, t6, v17
+
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+
+ vnsra.wi \o3, v18, 12
+ vnsra.wi \o4, v20, 12
+ vnsra.wi \o2, v22, 12
+ vnsra.wi \o5, v24, 12
+
+ vmv.v.x v16, zero
+ vssub.vv \o1, v16, \o1
+ vssub.vv \o3, v16, \o3
+ vssub.vv \o5, v16, \o5
+ vssub.vv \o7, v16, \o7
.endm
function inv_dct_e16_x8_rvv, export=1, ext=v
@@ -714,6 +725,206 @@ def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst
+function inv_txfm_add_4x8_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+ addi t0, t0, 16
+ vle16.v v2, (t0)
+ addi t0, t0, 16
+ vle16.v v3, (t0)
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3
+ vsmul.vx v\i, v\i, t1
+.endr
+
+ jalr t0, a4
+
+ vsseg4e16.v v0, (a2)
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vmv.v.x v8, zero
+ vle16.v v0, (a2)
+ vse16.v v8, (a2)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+ addi a2, a2, 8
+ vle16.v v\i, (a2)
+ vse16.v v8, (a2)
+.endr
+
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+ vle8.v v8, (a0)
+ add t0, a0, a1
+ vle8.v v9, (t0)
+.irp i, 10, 11, 12, 13, 14, 15
+ add t0, t0, a1
+ vle8.v v\i, (t0)
+.endr
+
+ vwaddu.wv v0, v0, v8
+ vwaddu.wv v1, v1, v9
+ vwaddu.wv v2, v2, v10
+ vwaddu.wv v3, v3, v11
+ vwaddu.wv v4, v4, v12
+ vwaddu.wv v5, v5, v13
+ vwaddu.wv v6, v6, v14
+ vwaddu.wv v7, v7, v15
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vnclipu.wi v8, v0, 0
+ vnclipu.wi v9, v1, 0
+ vnclipu.wi v10, v2, 0
+ vnclipu.wi v11, v3, 0
+ vnclipu.wi v12, v4, 0
+ vnclipu.wi v13, v5, 0
+ vnclipu.wi v14, v6, 0
+ vnclipu.wi v15, v7, 0
+
+ vse8.v v8, (a0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+ add a0, a0, a1
+ vse8.v v\i, (a0)
+.endr
+
+ ret
+endfunc
+
+function inv_txfm_add_8x4_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 8
+ vle16.v v\i, (t0)
+.endr
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vsmul.vx v\i, v\i, t1
+.endr
+
+ jalr t0, a4
+
+ vsseg8e16.v v0, (a2)
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vmv.v.x v4, zero
+ vle16.v v0, (a2)
+ vse16.v v4, (a2)
+.irp i, 1, 2, 3
+ addi a2, a2, 16
+ vle16.v v\i, (a2)
+ vse16.v v4, (a2)
+.endr
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ vle8.v v4, (a0)
+ add t0, a0, a1
+ vle8.v v5, (t0)
+ add t0, t0, a1
+ vle8.v v6, (t0)
+ add t0, t0, a1
+ vle8.v v7, (t0)
+
+ vwaddu.wv v0, v0, v4
+ vwaddu.wv v1, v1, v5
+ vwaddu.wv v2, v2, v6
+ vwaddu.wv v3, v3, v7
+
+ vsetvli zero, zero, e16, m1, ta, ma
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v4, v0, 0
+ vnclipu.wi v5, v1, 0
+ vnclipu.wi v6, v2, 0
+ vnclipu.wi v7, v3, 0
+
+ vse8.v v4, (a0)
+ add a0, a0, a1
+ vse8.v v5, (a0)
+ add a0, a0, a1
+ vse8.v v6, (a0)
+ add a0, a0, a1
+ vse8.v v7, (a0)
+
+ ret
+endfunc
+
+/* Define symbols added in .if statement */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
+ la a4, inv_\txfm1\()_e16_x\w\()w_rvv
+.else
+ la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
+ la a5, inv_\txfm2\()_e16_x\h\()w_rvv
+.else
+ la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.endif
+ j inv_txfm_add_\w\()x\h\()_rvv
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
function inv_identity_e16_x16_rvv, export=1, ext=v
li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -1196,10 +1407,12 @@ endfunc
.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
vmv.v.x v16, zero
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- vle16.v v\i, (t4)
+ vle16.v v0, (t4)
vse16.v v16, (t4)
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
add t4, t4, t6
+ vle16.v v\i, (t4)
+ vse16.v v16, (t4)
.endr
.ifc \variant, _identity
li t1, 2*(5793-4096)*8
@@ -1208,29 +1421,35 @@ function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
vsra.vi v16, v16, 1
vaadd.vv v\i, v\i, v16
.endr
+ j L(horz_16x8_epilog)
.else
jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
vssra.vi v\i, v\i, 2
.endr
-.endif
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- vsse16.v v\i, (t5), t6
+L(horz_16x8_epilog):
+ vsse16.v v0, (t5), t6
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
addi t5, t5, 2
+ vsse16.v v\i, (t5), t6
.endr
jr a7
+.endif
endfunc
.endm
-def_horz_16
def_horz_16 _identity
+def_horz_16
function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
vsetivli zero, 8, e16, m1, ta, ma
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- vle16.v v\i, (t4)
+
+ vle16.v v0, (t4)
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
add t4, t4, t6
+ vle16.v v\i, (t4)
.endr
+
jalr t0, a5
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -1238,10 +1457,13 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
.endr
vsetivli zero, 8, e8, mf2, ta, ma
- mv t0, t5
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- vle8.v v\i, (t0)
+
+ vle8.v v16, (t5)
+ add t0, t5, a1
+ vle8.v v17, (t0)
+.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
add t0, t0, a1
+ vle8.v v\i, (t0)
.endr
vwaddu.wv v0, v0, v16
@@ -1261,7 +1483,7 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
vwaddu.wv v14, v14, v30
vwaddu.wv v15, v15, v31
- vsetvli zero, zero, e16, m1
+ vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
vmax.vx v\i, v\i, zero
.endr
@@ -1284,9 +1506,10 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
vnclipu.wi v30, v14, 0
vnclipu.wi v31, v15, 0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- vse8.v v\i, (t5)
+ vse8.v v16, (t5)
+.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
add t5, t5, a1
+ vse8.v v\i, (t5)
.endr
jr a7
@@ -1296,11 +1519,26 @@ function inv_txfm_add_16x16_rvv, export=1, ext=v
csrw vxrm, zero
vsetivli zero, 8, e16, m1, ta, ma
addi sp, sp, -16*32
-.irp i, 0, 8
+.irp i, 8, 0
addi t4, a2, \i*2
addi t5, sp, \i*16*2
+.if \i == 8
+ blt a3, a7, 1f
+.endif
li t6, 16*2
jalr a7, a6
+.if \i == 8
+ j 2f
+1:
+ li t1, 64
+ vsetvli zero, t1, e16, m8, ta, ma
+ vmv.v.x v0, zero
+ vse16.v v0, (t5)
+ addi t5, t5, 128
+ vse16.v v0, (t5)
+ vsetivli zero, 8, e16, m1, ta, ma
+2:
+.endif
.endr
.irp i, 0, 8
addi t4, sp, \i*2
@@ -1312,7 +1550,7 @@ function inv_txfm_add_16x16_rvv, export=1, ext=v
ret
endfunc
-.macro def_fn_16x16 txfm1, txfm2
+.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1, identity
la a6, inv_txfm_horz_identity_16x8_rvv
@@ -1321,19 +1559,558 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
la a4, inv_\txfm1\()_e16_x16_rvv
.endif
la a5, inv_\txfm2\()_e16_x16_rvv
+ li a7, \eob_half
j inv_txfm_add_16x16_rvv
endfunc
.endm
-def_fn_16x16 dct, dct
-def_fn_16x16 identity, identity
-def_fn_16x16 dct, adst
-def_fn_16x16 dct, flipadst
-def_fn_16x16 dct, identity
-def_fn_16x16 adst, dct
-def_fn_16x16 adst, adst
-def_fn_16x16 adst, flipadst
-def_fn_16x16 flipadst, dct
-def_fn_16x16 flipadst, adst
-def_fn_16x16 flipadst, flipadst
-def_fn_16x16 identity, dct
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ blt a3, a6, 1f
+
+ addi t0, a2, 16
+ vle16.v v0, (t0)
+ addi t0, t0, 32
+ vle16.v v1, (t0)
+ addi t0, t0, 32
+ vle16.v v2, (t0)
+ addi t0, t0, 32
+ vle16.v v3, (t0)
+
+.ifc \variant, identity_
+ li t1, (5793-4096)*8
+ vsmul.vx v8, v0, t1
+ vaadd.vv v4, v0, v8
+ vsmul.vx v8, v1, t1
+ vaadd.vv v5, v1, v8
+ vsmul.vx v8, v2, t1
+ vaadd.vv v6, v2, v8
+ vsmul.vx v8, v3, t1
+ vaadd.vv v7, v3, v8
+.else
+ jalr t0, a4
+
+ vssra.vi v4, v0, 1
+ vssra.vi v5, v1, 1
+ vssra.vi v6, v2, 1
+ vssra.vi v7, v3, 1
+.endif
+
+ j 2f
+
+1:
+.irp i, 4, 5, 6, 7
+ vmv.v.x v\i, zero
+.endr
+
+2:
+ vle16.v v0, (a2)
+ addi t0, a2, 32
+ vle16.v v1, (t0)
+ addi t0, t0, 32
+ vle16.v v2, (t0)
+ addi t0, t0, 32
+ vle16.v v3, (t0)
+
+.ifc \variant, identity_
+ li t1, (5793-4096)*8
+.irp i, 0, 1, 2, 3
+ vsmul.vx v8, v\i, t1
+ vaadd.vv v\i, v\i, v8
+.endr
+
+ j L(itx_4x16_epilog)
+.else
+ jalr t0, a4
+
+ vssra.vi v0, v0, 1
+ vssra.vi v1, v1, 1
+ vssra.vi v2, v2, 1
+ vssra.vi v3, v3, 1
+
+L(itx_4x16_epilog):
+ vsseg4e16.v v0, (a2)
+ addi t0, a2, 64
+ vsseg4e16.v v4, (t0)
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+
+ vmv.v.x v16, zero
+ vle16.v v0, (a2)
+ vse16.v v16, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ vse16.v v16, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ addi t0, t0, 8
+ vle16.v v\i, (t0)
+ vse16.v v16, (t0)
+.endr
+
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vle8.v v16, (a0)
+ add t0, a0, a1
+ vle8.v v17, (t0)
+.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ add t0, t0, a1
+ vle8.v v\i, (t0)
+.endr
+
+ vwaddu.wv v0, v0, v16
+ vwaddu.wv v1, v1, v17
+ vwaddu.wv v2, v2, v18
+ vwaddu.wv v3, v3, v19
+ vwaddu.wv v4, v4, v20
+ vwaddu.wv v5, v5, v21
+ vwaddu.wv v6, v6, v22
+ vwaddu.wv v7, v7, v23
+ vwaddu.wv v8, v8, v24
+ vwaddu.wv v9, v9, v25
+ vwaddu.wv v10, v10, v26
+ vwaddu.wv v11, v11, v27
+ vwaddu.wv v12, v12, v28
+ vwaddu.wv v13, v13, v29
+ vwaddu.wv v14, v14, v30
+ vwaddu.wv v15, v15, v31
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vnclipu.wi v16, v0, 0
+ vnclipu.wi v17, v1, 0
+ vnclipu.wi v18, v2, 0
+ vnclipu.wi v19, v3, 0
+ vnclipu.wi v20, v4, 0
+ vnclipu.wi v21, v5, 0
+ vnclipu.wi v22, v6, 0
+ vnclipu.wi v23, v7, 0
+ vnclipu.wi v24, v8, 0
+ vnclipu.wi v25, v9, 0
+ vnclipu.wi v26, v10, 0
+ vnclipu.wi v27, v11, 0
+ vnclipu.wi v28, v12, 0
+ vnclipu.wi v29, v13, 0
+ vnclipu.wi v30, v14, 0
+ vnclipu.wi v31, v15, 0
+
+ vse8.v v16, (a0)
+.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ add a0, a0, a1
+ vse8.v v\i, (a0)
+.endr
+
+ ret
+.endif
+endfunc
+
+function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ addi t0, t0, 8
+ vle16.v v\i, (t0)
+.endr
+
+.ifc \variant, identity_
+ li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v16, v\i, t1
+ vssra.vi v16, v16, 1
+ vsadd.vv v\i, v\i, v16
+.endr
+
+ j L(itx_16x4_epilog)
+.else
+ jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_16x4_epilog):
+ li t0, 32
+ vssseg8e16.v v0, (a2), t0
+ addi t1, a2, 16
+ vssseg8e16.v v8, (t1), t0
+
+.irp j, 0, 8
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ vmv.v.x v4, zero
+ addi t0, a2, \j*2
+ vle16.v v0, (t0)
+ vse16.v v4, (t0)
+.irp i, 1, 2, 3
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v4, (t0)
+.endr
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ addi t0, a0, \j
+ vle8.v v4, (t0)
+ add t0, t0, a1
+ vle8.v v5, (t0)
+ add t0, t0, a1
+ vle8.v v6, (t0)
+ add t0, t0, a1
+ vle8.v v7, (t0)
+
+ vwaddu.wv v0, v0, v4
+ vwaddu.wv v1, v1, v5
+ vwaddu.wv v2, v2, v6
+ vwaddu.wv v3, v3, v7
+
+ vsetvli zero, zero, e16, m1, ta, ma
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v4, v0, 0
+ vnclipu.wi v5, v1, 0
+ vnclipu.wi v6, v2, 0
+ vnclipu.wi v7, v3, 0
+
+ addi t0, a0, \j
+ vse8.v v4, (t0)
+ add t0, t0, a1
+ vse8.v v5, (t0)
+ add t0, t0, a1
+ vse8.v v6, (t0)
+ add t0, t0, a1
+ vse8.v v7, (t0)
+.endr
+
+ ret
+.endif
+endfunc
+.endm
+
+def_fn_416_base identity_
+def_fn_416_base
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
+ la a4, inv_\txfm1\()_e16_x\w\()w_rvv
+.elseif \txfm1 != identity
+ la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
+ la a5, inv_\txfm2\()_e16_x\h\()w_rvv
+.else
+ la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.endif
+.if \w == 4
+ li a6, \eob_half
+.endif
+.ifc \txfm1, identity
+ j inv_txfm_identity_add_\w\()x\h\()_rvv
+.else
+ j inv_txfm_add_\w\()x\h\()_rvv
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ blt a3, a6, 1f
+
+ vmv.v.x v16, zero
+ addi t0, a2, 16
+ vle16.v v0, (t0)
+ vse16.v v16, (t0)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v16, (t0)
+.endr
+
+ li t1, 2896*8
+.ifc \variant, identity_
+ vsmul.vx v8, v0, t1
+ vsmul.vx v9, v1, t1
+ vsmul.vx v10, v2, t1
+ vsmul.vx v11, v3, t1
+ vsmul.vx v12, v4, t1
+ vsmul.vx v13, v5, t1
+ vsmul.vx v14, v6, t1
+ vsmul.vx v15, v7, t1
+.else
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vsmul.vx v\i, v\i, t1
+.endr
+
+ jalr t0, a4
+
+ vssra.vi v8, v0, 1
+ vssra.vi v9, v1, 1
+ vssra.vi v10, v2, 1
+ vssra.vi v11, v3, 1
+ vssra.vi v12, v4, 1
+ vssra.vi v13, v5, 1
+ vssra.vi v14, v6, 1
+ vssra.vi v15, v7, 1
+.endif
+
+ j 2f
+
+1:
+.irp i, 8, 9, 10, 11, 12, 13, 14, 15
+ vmv.v.x v\i, zero
+.endr
+
+2:
+ vmv.v.x v16, zero
+ vle16.v v0, (a2)
+ vse16.v v16, (a2)
+ addi t0, a2, 32
+ vle16.v v1, (t0)
+ vse16.v v16, (t0)
+.irp i, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v16, (t0)
+.endr
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vsmul.vx v\i, v\i, t1
+.endr
+
+.ifc \variant, identity_
+ j L(itx_8x16_epilog)
+.else
+ jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_8x16_epilog):
+ addi t4, sp, -8*32
+ vsseg8e16.v v0, (t4)
+ addi t0, t4, 8*16
+ vsseg8e16.v v8, (t0)
+
+ mv t5, a0
+ li t6, 16
+ jal a7, inv_txfm_add_vert_8x16_rvv
+
+ ret
+.endif
+endfunc
+
+function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ addi t0, t0, 16
+ vle16.v v\i, (t0)
+.endr
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v\i, v\i, t1
+.endr
+
+.ifc \variant, identity_
+ li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v16, v\i, t1
+ vssra.vi v16, v16, 1
+ vsadd.vv v\i, v\i, v16
+.endr
+
+ j L(itx_16x8_epilog)
+.else
+ jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_16x8_epilog):
+ li t0, 32
+ vssseg8e16.v v0, (a2), t0
+ addi t1, a2, 16
+ vssseg8e16.v v8, (t1), t0
+
+.irp j, 0, 8
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ vmv.v.x v8, zero
+ addi t0, a2, \j*2
+ vle16.v v0, (t0)
+ vse16.v v8, (t0)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v8, (t0)
+.endr
+
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ addi t0, a0, \j
+ vle8.v v8, (t0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+ add t0, t0, a1
+ vle8.v v\i, (t0)
+.endr
+
+ vwaddu.wv v0, v0, v8
+ vwaddu.wv v1, v1, v9
+ vwaddu.wv v2, v2, v10
+ vwaddu.wv v3, v3, v11
+ vwaddu.wv v4, v4, v12
+ vwaddu.wv v5, v5, v13
+ vwaddu.wv v6, v6, v14
+ vwaddu.wv v7, v7, v15
+
+ vsetvli zero, zero, e16, m1, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v8, v0, 0
+ vnclipu.wi v9, v1, 0
+ vnclipu.wi v10, v2, 0
+ vnclipu.wi v11, v3, 0
+ vnclipu.wi v12, v4, 0
+ vnclipu.wi v13, v5, 0
+ vnclipu.wi v14, v6, 0
+ vnclipu.wi v15, v7, 0
+
+ addi t0, a0, \j
+ vse8.v v8, (t0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+ add t0, t0, a1
+ vse8.v v\i, (t0)
+.endr
+.endr
+
+ ret
+.endif
+endfunc
+.endm
+
+def_fn_816_base identity_
+def_fn_816_base
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.ifnc \txfm1, identity
+ la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+ la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.if \w == 8
+ li a6, \eob_half
+.endif
+.ifc \txfm1, identity
+ j inv_txfm_identity_add_\w\()x\h\()_rvv
+.else
+ j inv_txfm_add_\w\()x\h\()_rvv
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
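
Beyond the new 4x8, 8x4, 4x16, 16x4, 8x16 and 16x8 entry points, the 16x16 path above now takes a per-transform-pair eob_half threshold (the extra argument to def_fn_16x16): when the end-of-block position stays below the threshold, the second half of the first horizontal pass is skipped and its 8x16 slice of the scratch buffer is zeroed instead. A hedged C sketch of that gate, with the horizontal helper passed as a pointer since only the control flow is being illustrated:

    #include <stdint.h>
    #include <string.h>

    typedef void (*horz_16x8_fn)(int16_t *dst, const int16_t *coef);

    /* Mirrors the .irp i, 8, 0 loop in inv_txfm_add_16x16_rvv: the i == 8 half
     * only runs when eob >= eob_half, otherwise its slice of the scratch
     * buffer is cleared. */
    static void first_pass_16x16_sketch(int16_t *const tmp, const int16_t *const coef,
                                        const int eob, const int eob_half,
                                        const horz_16x8_fn horz)
    {
        if (eob >= eob_half)
            horz(tmp + 8 * 16, coef + 8);                    /* second half    */
        else
            memset(tmp + 8 * 16, 0, 8 * 16 * sizeof(*tmp));  /* known all-zero */
        horz(tmp, coef);                                     /* first half always runs */
    }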
diff --git a/src/riscv/asm.S b/src/riscv/asm.S
index 2435170..eed4d67 100644
--- a/src/riscv/asm.S
+++ b/src/riscv/asm.S
@@ -123,4 +123,6 @@ EXTERN\name:
end_thread_local
.endm
+#define L(x) .L ## x
+
#endif /* DAV1D_SRC_RISCV_ASM_S */
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index 28c5e54..d3f9a03 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -58,7 +58,13 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
#define decl_itx_fns(ext) \
decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
decl_itx16_fns(16, 16, ext)
decl_itx_fns(rvv);
@@ -105,7 +111,13 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
#if BITDEPTH == 8
assign_itx17_fn( , 4, 4, rvv);
+ assign_itx16_fn(R, 4, 8, rvv);
+ assign_itx16_fn(R, 4, 16, rvv);
+ assign_itx16_fn(R, 8, 4, rvv);
assign_itx16_fn( , 8, 8, rvv);
+ assign_itx16_fn(R, 8, 16, rvv);
+ assign_itx16_fn(R, 16, 4, rvv);
+ assign_itx16_fn(R, 16, 8, rvv);
assign_itx12_fn( , 16, 16, rvv);
#endif
}
diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm
index 01eb6fa..b5c73a5 100644
--- a/src/x86/looprestoration_sse.asm
+++ b/src/x86/looprestoration_sse.asm
@@ -42,7 +42,6 @@ pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_right_ext_mask: times 24 db 0xff
times 8 db 0
pb_1: times 16 db 1
-pb_3: times 16 db 3
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056
pw_m16380: times 8 dw -16380
@@ -290,7 +289,7 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
.extend_right:
- movd m2, [lpfq-4]
+ movd m2, [lpfq-1]
%if ARCH_X86_64
push r0
lea r0, [pb_right_ext_mask+21]
@@ -302,10 +301,11 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid
movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
- pshufb m2, [base+pb_3]
+ pxor m3, m3
+ pshufb m2, m3
%else
punpcklbw m2, m2
- pshuflw m2, m2, q3333
+ pshuflw m2, m2, q0000
punpcklqdq m2, m2
%endif
pand m4, m0
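
The .extend_right change above loads the last valid pixel directly from [lpfq-1] and splats byte 0 of the register (pshufb with an all-zero mask, or punpcklbw plus pshuflw q0000 without SSSE3), instead of loading a dword at [lpfq-4] and shuffling its byte 3 with the now-removed pb_3 constant. In scalar terms the intended edge handling is simply:

    #include <stdint.h>

    /* Scalar equivalent of the right-edge extension vectorized above:
     * replicate the last in-bounds pixel across the padding region. */
    static void extend_right_sketch(uint8_t *const row, const int valid_w,
                                    const int padded_w)
    {
        const uint8_t last = row[valid_w - 1];
        for (int x = valid_w; x < padded_w; x++)
            row[x] = last;
    }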
diff --git a/src/x86/msac.asm b/src/x86/msac.asm
index 9f05c92..4156efe 100644
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -143,10 +143,9 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov esp, [esp]
%endif
%endif
- not t4
sub t2d, t1d ; rng
shl t1, gprsize*8-16
- add t4, t1 ; ~dif
+ sub t4, t1 ; dif - v
.renorm3:
mov t1d, [t0+msac.cnt]
movifnidn t7, t0
@@ -157,33 +156,31 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
shl t2d, cl
shl t4, cl
mov [t7+msac.rng], t2d
- not t4
sub t1d, ecx
jae .end ; no refill required
; refill:
- mov t2, [t7+msac.buf]
- mov rcx, [t7+msac.end]
%if ARCH_X86_64 == 0
push t5
%endif
- lea t5, [t2+gprsize]
- cmp t5, rcx
+ mov t2, [t7+msac.buf]
+ mov t5, [t7+msac.end]
+ lea rcx, [t2+gprsize]
+ sub rcx, t5
ja .refill_eob
- mov t2, [t2]
- lea ecx, [t1+23]
- add t1d, 16
- shr ecx, 3 ; shift_bytes
- bswap t2
- sub t5, rcx
- shl ecx, 3 ; shift_bits
- shr t2, cl
- sub ecx, t1d ; shift_bits - 16 - cnt
- mov t1d, gprsize*8-16
- shl t2, cl
- mov [t7+msac.buf], t5
- sub t1d, ecx ; cnt + gprsize*8 - shift_bits
- xor t4, t2
+ mov t5, [t2]
+ lea ecx, [t1+16-gprsize*8]
+ not t5
+ bswap t5
+ shr t5, cl
+ neg ecx
+ shr ecx, 3 ; num_bytes_read
+ or t4, t5
+.refill_end:
+ add t2, rcx
+ lea t1d, [t1+rcx*8] ; cnt += num_bits_read
+ mov [t7+msac.buf], t2
+.refill_end2:
%if ARCH_X86_64 == 0
pop t5
%endif
@@ -191,29 +188,35 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov [t7+msac.cnt], t1d
mov [t7+msac.dif], t4
RET
+.pad_with_ones:
+ lea ecx, [t1-16]
+%if ARCH_X86_64
+ ror rcx, cl
+%else
+ shr ecx, cl
+%endif
+ or t4, rcx
+ jmp .refill_end2
.refill_eob: ; avoid overreading the input buffer
- mov t5, rcx
- mov ecx, gprsize*8-24
- sub ecx, t1d ; c
-.refill_eob_loop:
cmp t2, t5
- jae .refill_eob_end ; eob reached
- movzx t1d, byte [t2]
- inc t2
- shl t1, cl
- xor t4, t1
- sub ecx, 8
- jge .refill_eob_loop
-.refill_eob_end:
- mov t1d, gprsize*8-24
-%if ARCH_X86_64 == 0
- pop t5
-%endif
- sub t1d, ecx
- mov [t7+msac.buf], t2
- mov [t7+msac.dif], t4
- mov [t7+msac.cnt], t1d
- RET
+ jae .pad_with_ones ; eob reached
+ ; We can safely do a register-sized load of the last bytes of the buffer
+ ; as this code is only reached if the msac buffer size is >= gprsize.
+ mov t5, [t5-gprsize]
+ shl ecx, 3
+ shr t5, cl
+ lea ecx, [t1+16-gprsize*8]
+ not t5
+ bswap t5
+ shr t5, cl
+ neg ecx
+ or t4, t5
+ mov t5d, [t7+msac.end]
+ shr ecx, 3
+ sub t5d, t2d ; num_bytes_left
+ cmp ecx, t5d
+ cmovae ecx, t5d ; num_bytes_read
+ jmp .refill_end
cglobal msac_decode_symbol_adapt8, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
@@ -366,7 +369,6 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%if ARCH_X86_64 == 0
movzx eax, al
%endif
- not t4
test t3d, t3d
jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
@@ -420,7 +422,6 @@ cglobal msac_decode_bool_equi, 0, 6, 0
mov ecx, 0xbfff
setb al ; the upper 32 bits contains garbage but that's OK
sub ecx, t2d
- not t4
; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
; i.e. (0 <= d <= 2) and v < (3 << 14)
shr ecx, 14 ; d
@@ -447,7 +448,6 @@ cglobal msac_decode_bool, 0, 6, 0
cmovb t2d, t1d
cmovb t4, t3
setb al
- not t4
%if ARCH_X86_64 == 0
movzx eax, al
%endif
@@ -497,48 +497,45 @@ cglobal msac_decode_bool, 0, 6, 0
tzcnt eax, eax
movzx ecx, word [buf+rax+16]
movzx t2d, word [buf+rax+14]
- not t4
%if ARCH_X86_64
add t6d, 5
%endif
sub eax, 5 ; setup for merging the tok_br and tok branches
sub t2d, ecx
shl rcx, gprsize*8-16
- add t4, rcx
+ sub t4, rcx
bsr ecx, t2d
xor ecx, 15
shl t2d, cl
shl t4, cl
movd m2, t2d
mov [t7+msac.rng], t2d
- not t4
sub t5d, ecx
jae %%end
- mov t2, [t7+msac.buf]
- mov rcx, [t7+msac.end]
%if UNIX64 == 0
push t8
%endif
- lea t8, [t2+gprsize]
- cmp t8, rcx
+ mov t2, [t7+msac.buf]
+ mov t8, [t7+msac.end]
+ lea rcx, [t2+gprsize]
+ sub rcx, t8
ja %%refill_eob
- mov t2, [t2]
- lea ecx, [t5+23]
- add t5d, 16
+ mov t8, [t2]
+ lea ecx, [t5+16-gprsize*8]
+ not t8
+ bswap t8
+ shr t8, cl
+ neg ecx
shr ecx, 3
- bswap t2
- sub t8, rcx
- shl ecx, 3
- shr t2, cl
- sub ecx, t5d
- mov t5d, gprsize*8-16
- shl t2, cl
- mov [t7+msac.buf], t8
+ or t4, t8
+%%refill_end:
+ add t2, rcx
+ lea t5d, [t5+rcx*8]
+ mov [t7+msac.buf], t2
+%%refill_end2:
%if UNIX64 == 0
pop t8
%endif
- sub t5d, ecx
- xor t4, t2
%%end:
movp m3, t4
%if ARCH_X86_64
@@ -559,27 +556,34 @@ cglobal msac_decode_bool, 0, 6, 0
shr eax, 1
mov [t7+msac.cnt], t5d
RET
+%%pad_with_ones:
+ ; ensure that dif is padded with at least 15 bits of ones at the end
+ lea ecx, [t5-16]
+%if ARCH_X86_64
+ ror rcx, cl
+%else
+ shr ecx, cl
+%endif
+ or t4, rcx
+ jmp %%refill_end2
%%refill_eob:
- mov t8, rcx
- mov ecx, gprsize*8-24
- sub ecx, t5d
-%%refill_eob_loop:
cmp t2, t8
- jae %%refill_eob_end
- movzx t5d, byte [t2]
- inc t2
- shl t5, cl
- xor t4, t5
- sub ecx, 8
- jge %%refill_eob_loop
-%%refill_eob_end:
-%if UNIX64 == 0
- pop t8
-%endif
- mov t5d, gprsize*8-24
- mov [t7+msac.buf], t2
- sub t5d, ecx
- jmp %%end
+ jae %%pad_with_ones
+ mov t8, [t8-gprsize]
+ shl ecx, 3
+ shr t8, cl
+ lea ecx, [t5+16-gprsize*8]
+ not t8
+ bswap t8
+ shr t8, cl
+ neg ecx
+ or t4, t8
+ mov t8d, [t7+msac.end]
+ shr ecx, 3
+ sub t8d, t2d
+ cmp ecx, t8d
+ cmovae ecx, t8d
+ jmp %%refill_end
%endmacro
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
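The reworked refill above replaces the old byte-at-a-time loop over an inverted dif with a single register-sized load. The end-of-buffer handling is the interesting part: when fewer than gprsize bytes remain, the code reloads the final word of the buffer (safe because, per the comment, this path is only reached when the buffer holds at least gprsize bytes) and clamps the number of bytes counted as read to what is actually left. A rough C sketch of that tail-load trick, assuming a 64-bit little-endian ec_win; the names are hypothetical and this is an illustration of the idea, not the code in src/msac.c:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef uint64_t ec_win;

/* Load up to sizeof(ec_win) new bytes without reading past buf_end. Near the
 * end of the buffer the load is anchored at buf_end - sizeof(ec_win) and the
 * already-consumed leading bytes are shifted out, mirroring the
 * "mov t5, [t5-gprsize]" + "shr t5, cl" sequence in the .refill_eob path. */
static ec_win load_tail_word(const uint8_t *const buf_pos,
                             const uint8_t *const buf_end,
                             int *const num_bytes_read)
{
    ec_win v;
    const ptrdiff_t left = buf_end - buf_pos;
    if (left >= (ptrdiff_t)sizeof(v)) {
        memcpy(&v, buf_pos, sizeof(v));             /* fast path: full word */
        *num_bytes_read = (int)sizeof(v);
    } else if (left > 0) {
        memcpy(&v, buf_end - sizeof(v), sizeof(v)); /* last word of the buffer */
        v >>= 8 * ((int)sizeof(v) - (int)left);     /* drop already-read bytes */
        *num_bytes_read = (int)left;
    } else {
        v = 0;                  /* exhausted: caller pads dif with one bits */
        *num_bytes_read = 0;
    }
    return v;
}

Once the buffer is fully consumed, the new .pad_with_ones label instead ORs a run of one bits into the low end of dif (at least 15 of them, per its comment), so renormalization can continue past the end of the input without any further refills.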
diff --git a/tests/checkasm/arm/checkasm_64.S b/tests/checkasm/arm/checkasm_64.S
index 2574914..d0d7ec4 100644
--- a/tests/checkasm/arm/checkasm_64.S
+++ b/tests/checkasm/arm/checkasm_64.S
@@ -209,3 +209,13 @@ function checked_call, export=1
ldp x29, x30, [sp], #16
ret
endfunc
+
+#if HAVE_SVE
+ENABLE_SVE
+function sve_length, export=1
+ cntb x0
+ lsl x0, x0, #3
+ ret
+endfunc
+DISABLE_SVE
+#endif
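For reference, the new sve_length helper reports the SVE vector width in bits: cntb places the per-register byte count in x0 and the lsl #3 multiplies it by 8, so for example a cntb result of 32 bytes is reported as 32 << 3 = 256 bits. This is the value printed by the "SVE %d bits" line added to checkasm.c below.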
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 844ae44..9a01da7 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -102,6 +102,12 @@ static const struct {
{ "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
{ "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
+ { "DOTPROD", "dotprod", DAV1D_ARM_CPU_FLAG_DOTPROD },
+ { "I8MM", "i8mm", DAV1D_ARM_CPU_FLAG_I8MM },
+#if ARCH_AARCH64
+ { "SVE", "sve", DAV1D_ARM_CPU_FLAG_SVE },
+ { "SVE2", "sve2", DAV1D_ARM_CPU_FLAG_SVE2 },
+#endif /* ARCH_AARCH64 */
#elif ARCH_LOONGARCH
{ "LSX", "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX },
{ "LASX", "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX },
@@ -113,6 +119,12 @@ static const struct {
{ 0 }
};
+#if ARCH_AARCH64 && HAVE_SVE
+int checkasm_sve_length(void);
+#elif ARCH_RISCV
+int checkasm_get_vlenb(void);
+#endif
+
typedef struct CheckasmFuncVersion {
struct CheckasmFuncVersion *next;
void *func;
@@ -130,6 +142,13 @@ typedef struct CheckasmFunc {
char name[];
} CheckasmFunc;
+typedef enum {
+ RUN_NORMAL = 0,
+ RUN_BENCHMARK,
+ RUN_CPUFLAG_LISTING,
+ RUN_FUNCTION_LISTING,
+} CheckasmRunMode;
+
/* Internal state */
static struct {
CheckasmFunc *funcs;
@@ -144,9 +163,8 @@ static struct {
const char *test_pattern;
const char *function_pattern;
unsigned seed;
- int bench;
+ CheckasmRunMode run_mode;
int verbose;
- int function_listing;
volatile sig_atomic_t catch_signals;
int suffix_length;
int max_function_name_length;
@@ -252,18 +270,18 @@ int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
/* Print colored text to stderr if the terminal supports it */
static int use_printf_color;
-static void color_printf(const int color, const char *const fmt, ...) {
+static void color_fprintf(FILE *const f, const int color, const char *const fmt, ...) {
va_list arg;
if (use_printf_color)
- fprintf(stderr, "\x1b[0;%dm", color);
+ fprintf(f, "\x1b[0;%dm", color);
va_start(arg, fmt);
- vfprintf(stderr, fmt, arg);
+ vfprintf(f, fmt, arg);
va_end(arg);
if (use_printf_color)
- fprintf(stderr, "\x1b[0m");
+ fprintf(f, "\x1b[0m");
}
/* Deallocate a tree */
@@ -532,7 +550,7 @@ static void check_cpu_flag(const char *const name, unsigned flag) {
/* Print the name of the current CPU flag, but only do it once */
static void print_cpu_name(void) {
if (state.cpu_flag_name) {
- color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
+ color_fprintf(stderr, COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
state.cpu_flag_name = NULL;
}
}
@@ -571,6 +589,7 @@ int main(int argc, char *argv[]) {
" --test=<pattern> -t Test only <pattern>\n"
" --function=<pattern> -f Test only the functions matching <pattern>\n"
" --bench -b Benchmark the tested functions\n"
+ " --list-cpuflags List available cpu flags\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --verbose -v Print verbose output\n");
@@ -581,7 +600,7 @@ int main(int argc, char *argv[]) {
"checkasm: --bench is not supported on your system\n");
return 1;
#endif
- state.bench = 1;
+ state.run_mode = RUN_BENCHMARK;
} else if (!strncmp(argv[1], "--test=", 7)) {
state.test_pattern = argv[1] + 7;
} else if (!strcmp(argv[1], "-t")) {
@@ -594,8 +613,11 @@ int main(int argc, char *argv[]) {
state.function_pattern = argc > 1 ? argv[2] : "";
argc--;
argv++;
+ } else if (!strcmp(argv[1], "--list-cpuflags")) {
+ state.run_mode = RUN_CPUFLAG_LISTING;
+ break;
} else if (!strcmp(argv[1], "--list-functions")) {
- state.function_listing = 1;
+ state.run_mode = RUN_FUNCTION_LISTING;
} else if (!strcmp(argv[1], "--list-tests")) {
for (int i = 0; tests[i].name; i++)
printf("%s\n", tests[i].name);
@@ -671,7 +693,8 @@ int main(int argc, char *argv[]) {
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
AddVectoredExceptionHandler(0, signal_handler);
- HANDLE con = GetStdHandle(STD_ERROR_HANDLE);
+ HANDLE con = GetStdHandle(state.run_mode >= RUN_CPUFLAG_LISTING ?
+ STD_OUTPUT_HANDLE : STD_ERROR_HANDLE);
DWORD con_mode = 0;
use_printf_color = con && con != INVALID_HANDLE_VALUE &&
GetConsoleMode(con, &con_mode) &&
@@ -683,12 +706,14 @@ int main(int argc, char *argv[]) {
sigaction(SIGILL, &signal_handler_act, NULL);
sigaction(SIGSEGV, &signal_handler_act, NULL);
- const char *const term = getenv("TERM");
- use_printf_color = term && strcmp(term, "dumb") && isatty(2);
+ if (isatty(state.run_mode >= RUN_CPUFLAG_LISTING ? 1 : 2)) {
+ const char *const term = getenv("TERM");
+ use_printf_color = term && strcmp(term, "dumb");
+ }
#endif
#ifdef readtime
- if (state.bench) {
+ if (state.run_mode == RUN_BENCHMARK) {
if (!checkasm_save_context()) {
checkasm_set_signal_handler_state(1);
readtime();
@@ -702,11 +727,22 @@ int main(int argc, char *argv[]) {
int ret = 0;
- if (!state.function_listing) {
+ if (state.run_mode != RUN_FUNCTION_LISTING) {
+ const unsigned cpu_flags = dav1d_get_cpu_flags();
+ if (state.run_mode == RUN_CPUFLAG_LISTING) {
+ const int last_i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2;
+ for (int i = 0; i <= last_i ; i++) {
+ if (cpus[i].flag & cpu_flags)
+ color_fprintf(stdout, COLOR_GREEN, "%s", cpus[i].suffix);
+ else
+ color_fprintf(stdout, COLOR_RED, "~%s", cpus[i].suffix);
+ printf(i == last_i ? "\n" : ", ");
+ }
+ return 0;
+ }
#if ARCH_X86_64
void checkasm_warmup_avx2(void);
void checkasm_warmup_avx512(void);
- const unsigned cpu_flags = dav1d_get_cpu_flags();
if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
state.simd_warmup = checkasm_warmup_avx512;
else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
@@ -720,6 +756,18 @@ int main(int argc, char *argv[]) {
for (size_t len = strlen(name); len && name[len-1] == ' '; len--)
name[len-1] = '\0'; /* trim trailing whitespace */
fprintf(stderr, "checkasm: %s (%08X) using random seed %u\n", name, cpuid, state.seed);
+#elif ARCH_RISCV
+ char buf[32] = "";
+ if (cpu_flags & DAV1D_RISCV_CPU_FLAG_V) {
+ const int vlen = 8*checkasm_get_vlenb();
+ snprintf(buf, sizeof(buf), "VLEN=%i bits, ", vlen);
+ }
+ fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
+#elif ARCH_AARCH64 && HAVE_SVE
+ char buf[48] = "";
+ if (cpu_flags & DAV1D_ARM_CPU_FLAG_SVE)
+ snprintf(buf, sizeof(buf), "SVE %d bits, ", checkasm_sve_length());
+ fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
#else
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
#endif
@@ -729,7 +777,7 @@ int main(int argc, char *argv[]) {
for (int i = 0; cpus[i].flag; i++)
check_cpu_flag(cpus[i].name, cpus[i].flag);
- if (state.function_listing) {
+ if (state.run_mode == RUN_FUNCTION_LISTING) {
print_functions(state.funcs);
} else if (state.num_failed) {
fprintf(stderr, "checkasm: %d of %d tests failed\n",
@@ -741,7 +789,7 @@ int main(int argc, char *argv[]) {
else
fprintf(stderr, "checkasm: no tests to perform\n");
#ifdef readtime
- if (state.bench && state.max_function_name_length) {
+ if (state.run_mode == RUN_BENCHMARK && state.max_function_name_length) {
state.nop_time = measure_nop_time();
if (state.verbose)
printf("nop:%*.1f\n", state.max_function_name_length + 6, state.nop_time);
@@ -801,7 +849,7 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
v->ok = 1;
v->cpu = state.cpu_flag;
state.current_func_ver = v;
- if (state.function_listing) /* Save function names without running tests */
+ if (state.run_mode == RUN_FUNCTION_LISTING) /* Save function names without running tests */
return NULL;
xor128_srand(state.seed);
@@ -814,7 +862,7 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
/* Decide whether or not the current function needs to be benchmarked */
int checkasm_bench_func(void) {
- return !state.num_failed && state.bench;
+ return !state.num_failed && state.run_mode == RUN_BENCHMARK;
}
/* Indicate that the current test has failed, return whether verbose printing
@@ -863,9 +911,9 @@ void checkasm_report(const char *const name, ...) {
fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
if (state.num_failed == prev_failed)
- color_printf(COLOR_GREEN, "OK");
+ color_fprintf(stderr, COLOR_GREEN, "OK");
else
- color_printf(COLOR_RED, "FAILED");
+ color_fprintf(stderr, COLOR_RED, "FAILED");
fprintf(stderr, "]\n");
prev_checked = state.num_checked;
diff --git a/tests/checkasm/mc.c b/tests/checkasm/mc.c
index 047ef7b..f1f5dc3 100644
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -98,6 +98,7 @@ static void check_mc(Dav1dMCDSPContext *const c) {
w, h, "dst");
if (filter == FILTER_2D_8TAP_REGULAR ||
+ filter == FILTER_2D_8TAP_SHARP ||
filter == FILTER_2D_BILINEAR)
{
bench_new(a_dst, a_dst_stride, src, src_stride, w, h,
@@ -155,6 +156,7 @@ static void check_mct(Dav1dMCDSPContext *const c) {
w, h, "tmp");
if (filter == FILTER_2D_8TAP_REGULAR ||
+ filter == FILTER_2D_8TAP_SHARP ||
filter == FILTER_2D_BILINEAR)
{
bench_new(a_tmp, src, src_stride, w, h,
diff --git a/tests/checkasm/msac.c b/tests/checkasm/msac.c
index 81fd593..26d4a56 100644
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -33,7 +33,7 @@
#include <stdio.h>
#include <string.h>
-#define BUF_SIZE 8192
+#define BUF_SIZE 128
/* The normal code doesn't use function pointers */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
@@ -64,9 +64,16 @@ static void randomize_cdf(uint16_t *const cdf, const int n) {
/* memcmp() on structs can have weird behavior due to padding etc. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
- return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
- a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
- a->allow_update_cdf != b->allow_update_cdf;
+ if (a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
+ a->rng != b->rng || a->cnt != b->cnt ||
+ a->allow_update_cdf != b->allow_update_cdf)
+ {
+ return 1;
+ }
+
+ /* Only check valid dif bits, ignoring partial bytes at the end */
+ const ec_win dif_mask = ~((~(ec_win)0) >> (imax(a->cnt, 0) + 16));
+ return !!((a->dif ^ b->dif) & dif_mask);
}
static void msac_dump(unsigned c_res, unsigned a_res,
@@ -86,7 +93,7 @@ static void msac_dump(unsigned c_res, unsigned a_res,
fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng);
if (a->cnt != b->cnt)
fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt);
- if (a->allow_update_cdf)
+ if (a->allow_update_cdf != b->allow_update_cdf)
fprintf(stderr, "allow_update_cdf %d vs %d\n",
a->allow_update_cdf, b->allow_update_cdf);
if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
@@ -113,7 +120,7 @@ static void msac_dump(unsigned c_res, unsigned a_res,
s_a = s_c; \
randomize_cdf(cdf[0], ns); \
memcpy(cdf[1], cdf[0], sizeof(*cdf)); \
- for (int i = 0; i < 64; i++) { \
+ while (s_c.cnt >= 0) { \
unsigned c_res = call_ref(&s_c, cdf[0], ns); \
unsigned a_res = call_new(&s_a, cdf[1], ns); \
if (c_res != a_res || msac_cmp(&s_c, &s_a) || \
@@ -154,7 +161,7 @@ static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf)
s_a = s_c;
cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1;
cdf[0][1] = cdf[1][1] = 0;
- for (int i = 0; i < 64; i++) {
+ while (s_c.cnt >= 0) {
unsigned c_res = call_ref(&s_c, cdf[0]);
unsigned a_res = call_new(&s_a, cdf[1]);
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
@@ -177,7 +184,7 @@ static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf)
if (check_func(c->decode_bool_equi, "msac_decode_bool_equi")) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
s_a = s_c;
- for (int i = 0; i < 64; i++) {
+ while (s_c.cnt >= 0) {
unsigned c_res = call_ref(&s_c);
unsigned a_res = call_new(&s_a);
if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
@@ -196,7 +203,7 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
if (check_func(c->decode_bool, "msac_decode_bool")) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
s_a = s_c;
- for (int i = 0; i < 64; i++) {
+ while (s_c.cnt >= 0) {
const unsigned f = rnd() & 0x7fff;
unsigned c_res = call_ref(&s_c, f);
unsigned a_res = call_new(&s_a, f);
@@ -228,7 +235,7 @@ static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
s_a = s_c;
randomize_cdf(cdf[0], 3);
memcpy(cdf[1], cdf[0], sizeof(*cdf));
- for (int i = 0; i < 64; i++) {
+ while (s_c.cnt >= 0) {
unsigned c_res = call_ref(&s_c, cdf[0]);
unsigned a_res = call_new(&s_a, cdf[1]);
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
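The dif_mask in msac_cmp above keeps only the top cnt + 16 bits of dif, i.e. the valid bits; the C and assembly implementations are allowed to differ in whatever padding they leave in the remaining low bits. A quick worked example, assuming a 64-bit ec_win (standalone snippet, not part of the test itself):

#include <inttypes.h>
#include <stdio.h>

typedef uint64_t ec_win;

int main(void) {
    const int cnt = 32;                                     /* example value */
    const ec_win dif_mask = ~((~(ec_win)0) >> (cnt + 16));  /* top 48 bits */
    /* prints ffffffffffff0000: the low 16 bits of dif (the partial bytes at
     * the end) are ignored by the comparison */
    printf("%016" PRIx64 "\n", dif_mask);
    return 0;
}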
diff --git a/tests/checkasm/riscv/checkasm_64.S b/tests/checkasm/riscv/checkasm_64.S
index 0d02e5f..8557eab 100644
--- a/tests/checkasm/riscv/checkasm_64.S
+++ b/tests/checkasm/riscv/checkasm_64.S
@@ -83,6 +83,11 @@ endconst
thread_local saved_regs, quads=29 # 5 + 12 + 12
+function get_vlenb, export=1
+ csrr a0, vlenb
+ ret
+endfunc
+
function checked_call, export=1, ext=v
/* Save the function ptr, RA, SP, unallocatable and callee-saved registers */
la.tls.ie t0, saved_regs
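Similarly to the SVE helper above, get_vlenb exposes the vlenb CSR, which holds the vector register length in bytes; checkasm.c multiplies it by 8 for the "VLEN=%i bits" report, so e.g. vlenb = 16 is reported as VLEN = 128 bits.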
diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash
index 27a8d61..954dad8 100755
--- a/tests/dav1d_argon.bash
+++ b/tests/dav1d_argon.bash
@@ -4,8 +4,8 @@ DAV1D="tools/dav1d"
ARGON_DIR='.'
FILMGRAIN=1
CPUMASK=-1
-THREADS=0
-JOBS=1
+THREADS=1
+JOBS=0
usage() {
NAME=$(basename "$0")
@@ -19,8 +19,8 @@ usage() {
printf " -a dir path to argon dir (default: 'tests/argon' if found; '.' otherwise)\n"
printf " -g \$num enable filmgrain (default: 1)\n"
printf " -c \$mask use restricted cpumask (default: -1)\n"
- printf " -t \$num number of threads per dav1d (default: 0)\n"
- printf " -j \$num number of parallel dav1d processes (default: 1)\n\n"
+ printf " -t \$num number of threads per dav1d (default: 1)\n"
+ printf " -j \$num number of parallel dav1d processes (default: 0)\n\n"
} >&2
exit 1
}
@@ -110,6 +110,14 @@ while getopts ":d:a:g:c:t:j:" opt; do
done
shift $((OPTIND-1))
+if [ "$JOBS" -eq 0 ]; then
+ if [ "$THREADS" -gt 0 ]; then
+ JOBS="$((($( (nproc || sysctl -n hw.logicalcpu || getconf _NPROCESSORS_ONLN || echo 1) 2>/dev/null)+THREADS-1)/THREADS))"
+ else
+ JOBS=1
+ fi
+fi
+
if [ "$#" -eq 0 ]; then
# Everything except large scale tiles and stress files.
dirs=("$ARGON_DIR/profile0_core" "$ARGON_DIR/profile0_core_special"
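The new auto-detection rounds up: JOBS = (nproc + THREADS - 1) / THREADS, where the CPU count comes from the first of nproc, sysctl -n hw.logicalcpu or getconf _NPROCESSORS_ONLN that succeeds (falling back to 1). For example, with 16 logical CPUs the default -t 1 gives (16 + 0) / 1 = 16 parallel dav1d processes, while -t 4 gives (16 + 3) / 4 = 4.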
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
index 5fdbab3..39fe54d 100644
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -219,7 +219,13 @@ enum CpuMask {
static const EnumParseTable cpu_mask_tbl[] = {
#if ARCH_AARCH64 || ARCH_ARM
- { "neon", DAV1D_ARM_CPU_FLAG_NEON },
+ { "neon", DAV1D_ARM_CPU_FLAG_NEON },
+ { "dotprod", DAV1D_ARM_CPU_FLAG_DOTPROD },
+ { "i8mm", DAV1D_ARM_CPU_FLAG_I8MM },
+#if ARCH_AARCH64
+ { "sve", DAV1D_ARM_CPU_FLAG_SVE },
+ { "sve2", DAV1D_ARM_CPU_FLAG_SVE2 },
+#endif /* ARCH_AARCH64 */
#elif ARCH_LOONGARCH
{ "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX },
{ "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX },