Snap for 8346638 from 84737ef36d3bd9c929b801fdaa22c8e27899648e to mainline-go-art-release

Change-Id: I69722bb50c4812c0499665a153d6b227fb0879eb
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2022-03-23 20:51:30 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2022-03-23 20:51:30 +0000
commit: 5896e12e155aeb30682fcb6473a57a8f83f7de35 (patch)
tree: 7d9f824e8baa7032c8f8bf51f965345a27a21cef
parent: fcbcefbbc5486d3fa1bf9ac6ce6bfd66fbbf750e (diff)
parent: 84737ef36d3bd9c929b801fdaa22c8e27899648e (diff)
download: ppv-lite86-5896e12e155aeb30682fcb6473a57a8f83f7de35.tar.gz
13 files changed, 237 insertions, 533 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index e6ee0e5..adb1fc4 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,6 +1,5 @@
 {
   "git": {
-    "sha1": "4b1e1d655d05c9da29aa833ce705feedb3da760b"
-  },
-  "path_in_vcs": "utils-simd/ppv-lite86"
-}
-\ No newline at end of file
+    "sha1": "3012849c2d9c50228a780031e7c200b193a6b4fa"
+  }
+}
diff --git a/Android.bp b/Android.bp
index 123567a..fe8ffb5 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,5 +1,4 @@
-// This file is generated by cargo2android.py --config cargo2android.json.
-// Do not modify this file as changes will be overridden on upgrade.
+// This file is generated by cargo2android.py --device --run --dependencies --tests.
 
 package {
     default_applicable_licenses: ["external_rust_crates_ppv-lite86_license"],
@@ -41,35 +40,33 @@ rust_library {
     name: "libppv_lite86",
     host_supported: true,
     crate_name: "ppv_lite86",
-    cargo_env_compat: true,
-    cargo_pkg_version: "0.2.16",
     srcs: ["src/lib.rs"],
     edition: "2018",
     features: [
         "default",
         "std",
     ],
-    apex_available: [
-        "//apex_available:platform",
-        "com.android.virt",
-    ],
 }
 
-rust_test {
-    name: "ppv-lite86_test_src_lib",
-    host_supported: true,
+rust_defaults {
+    name: "ppv-lite86_defaults",
     crate_name: "ppv_lite86",
-    cargo_env_compat: true,
-    cargo_pkg_version: "0.2.16",
     srcs: ["src/lib.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
-    test_options: {
-        unit_test: true,
-    },
     edition: "2018",
     features: [
         "default",
         "std",
     ],
 }
+
+rust_test_host {
+    name: "ppv-lite86_host_test_src_lib",
+    defaults: ["ppv-lite86_defaults"],
+}
+
+rust_test {
+    name: "ppv-lite86_device_test_src_lib",
+    defaults: ["ppv-lite86_defaults"],
+}
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index 6e34be3..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Changelog
-All notable changes to this project will be documented in this file.
-
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-
-## [0.2.16]
-### Added
-- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86.
-- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays.
diff --git a/Cargo.toml b/Cargo.toml
index 927ecfe..6ffa7ba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,16 +3,17 @@
 # When uploading crates to the registry Cargo will automatically
 # "normalize" Cargo.toml files for maximal compatibility
 # with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies.
+# to registry (e.g., crates.io) dependencies
 #
-# If you are reading this file be aware that the original Cargo.toml
-# will likely look very different (and much more reasonable).
-# See Cargo.toml.orig for the original contents.
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
 
 [package]
 edition = "2018"
 name = "ppv-lite86"
-version = "0.2.16"
+version = "0.2.10"
 authors = ["The CryptoCorrosion Contributors"]
 description = "Implementation of the crypto-simd API for x86"
 keywords = ["crypto", "simd", "x86"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index b457f54..8f3fb52 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
 [package]
 name = "ppv-lite86"
-version = "0.2.16"
+version = "0.2.10"
 authors = ["The CryptoCorrosion Contributors"]
 edition = "2018"
 license = "MIT/Apache-2.0"
diff --git a/METADATA b/METADATA
index 95596fd..1ac3aef 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
   }
   url {
     type: ARCHIVE
-    value: "https://static.crates.io/crates/ppv-lite86/ppv-lite86-0.2.16.crate"
+    value: "https://static.crates.io/crates/ppv-lite86/ppv-lite86-0.2.10.crate"
   }
-  version: "0.2.16"
+  version: "0.2.10"
   license_type: NOTICE
   last_upgrade_date {
-    year: 2022
-    month: 3
-    day: 1
+    year: 2020
+    month: 11
+    day: 2
   }
 }
diff --git a/TEST_MAPPING b/TEST_MAPPING
index f0d135a..a45c687 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -1,102 +1,8 @@
-// Generated by update_crate_tests.py for tests that depend on this crate.
+// Generated by cargo2android.py for tests in Android.bp
 {
-  "imports": [
-    {
-      "path": "external/rust/crates/base64"
-    },
-    {
-      "path": "external/rust/crates/cast"
-    },
-    {
-      "path": "external/rust/crates/crc32fast"
-    },
-    {
-      "path": "external/rust/crates/crossbeam-deque"
-    },
-    {
-      "path": "external/rust/crates/crossbeam-epoch"
-    },
-    {
-      "path": "external/rust/crates/crossbeam-queue"
-    },
-    {
-      "path": "external/rust/crates/crossbeam-utils"
-    },
-    {
-      "path": "external/rust/crates/mio"
-    },
-    {
-      "path": "external/rust/crates/quickcheck"
-    },
-    {
-      "path": "external/rust/crates/rand_chacha"
-    },
-    {
-      "path": "external/rust/crates/regex"
-    },
-    {
-      "path": "external/rust/crates/ryu"
-    },
-    {
-      "path": "external/rust/crates/tokio"
-    }
-  ],
   "presubmit": [
     {
-      "name": "ZipFuseTest"
-    },
-    {
-      "name": "apkdmverity.test"
-    },
-    {
-      "name": "authfs_device_test_src_lib"
-    },
-    {
-      "name": "keystore2_test"
-    },
-    {
-      "name": "keystore2_test_utils_test"
-    },
-    {
-      "name": "legacykeystore_test"
-    },
-    {
-      "name": "microdroid_manager_test"
-    },
-    {
-      "name": "ppv-lite86_test_src_lib"
-    },
-    {
-      "name": "virtualizationservice_device_test"
-    }
-  ],
-  "presubmit-rust": [
-    {
-      "name": "ZipFuseTest"
-    },
-    {
-      "name": "apkdmverity.test"
-    },
-    {
-      "name": "authfs_device_test_src_lib"
-    },
-    {
-      "name": "keystore2_test"
-    },
-    {
-      "name": "keystore2_test_utils_test"
-    },
-    {
-      "name": "legacykeystore_test"
-    },
-    {
-      "name": "microdroid_manager_test"
-    },
-    {
-      "name": "ppv-lite86_test_src_lib"
-    },
-    {
-      "name": "virtualizationservice_device_test"
+      "name": "ppv-lite86_device_test_src_lib"
     }
   ]
 }
diff --git a/cargo2android.json b/cargo2android.json
deleted file mode 100644
index ac56e26..0000000
--- a/cargo2android.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "apex-available": [
-    "//apex_available:platform",
-    "com.android.virt"
-  ],
-  "dependencies": true,
-  "device": true,
-  "run": true,
-  "tests": true
-}
-\ No newline at end of file
diff --git a/src/generic.rs b/src/generic.rs
index add6c48..f0e83d9 100644
--- a/src/generic.rs
+++ b/src/generic.rs
@@ -11,38 +11,38 @@ pub union vec128_storage {
     q: [u64; 2],
 }
 impl From<[u32; 4]> for vec128_storage {
-    #[inline(always)]
+    #[inline]
     fn from(d: [u32; 4]) -> Self {
         Self { d }
     }
 }
 impl From<vec128_storage> for [u32; 4] {
-    #[inline(always)]
+    #[inline]
     fn from(d: vec128_storage) -> Self {
         unsafe { d.d }
     }
 }
 impl From<[u64; 2]> for vec128_storage {
-    #[inline(always)]
+    #[inline]
     fn from(q: [u64; 2]) -> Self {
         Self { q }
     }
 }
 impl From<vec128_storage> for [u64; 2] {
-    #[inline(always)]
+    #[inline]
     fn from(q: vec128_storage) -> Self {
         unsafe { q.q }
     }
 }
 impl Default for vec128_storage {
-    #[inline(always)]
+    #[inline]
     fn default() -> Self {
         Self { q: [0, 0] }
     }
 }
 impl Eq for vec128_storage {}
 impl PartialEq<vec128_storage> for vec128_storage {
-    #[inline(always)]
+    #[inline]
     fn eq(&self, rhs: &Self) -> bool {
         unsafe { self.q == rhs.q }
     }
@@ -62,21 +62,13 @@ impl vec256_storage {
     }
 }
 impl From<vec256_storage> for [u64; 4] {
-    #[inline(always)]
+    #[inline]
     fn from(q: vec256_storage) -> Self {
         let [a, b]: [u64; 2] = q.v128[0].into();
         let [c, d]: [u64; 2] = q.v128[1].into();
         [a, b, c, d]
     }
 }
-impl From<[u64; 4]> for vec256_storage {
-    #[inline(always)]
-    fn from([a, b, c, d]: [u64; 4]) -> Self {
-        Self {
-            v128: [[a, b].into(), [c, d].into()],
-        }
-    }
-}
 #[derive(Clone, Copy, PartialEq, Eq, Default)]
 pub struct vec512_storage {
     v128: [vec128_storage; 4],
@@ -92,7 +84,6 @@ impl vec512_storage {
     }
 }
 
-#[inline(always)]
 fn dmap<T, F>(t: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -126,7 +117,6 @@ where
     unsafe { T::unpack(d) }
 }
 
-#[inline(always)]
 fn qmap<T, F>(t: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -140,7 +130,6 @@ where
     unsafe { T::unpack(q) }
 }
 
-#[inline(always)]
 fn qmap2<T, F>(a: T, b: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -156,17 +145,14 @@ where
     unsafe { T::unpack(q) }
 }
 
-#[inline(always)]
 fn o_of_q(q: [u64; 2]) -> u128 {
     u128::from(q[0]) | (u128::from(q[1]) << 64)
 }
 
-#[inline(always)]
 fn q_of_o(o: u128) -> [u64; 2] {
     [o as u64, (o >> 64) as u64]
 }
 
-#[inline(always)]
 fn omap<T, F>(a: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -178,7 +164,6 @@ where
     unsafe { T::unpack(o) }
 }
 
-#[inline(always)]
 fn omap2<T, F>(a: T, b: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -262,39 +247,39 @@ macro_rules! impl_bitops {
         }
 
         impl Swap64 for $vec {
-            #[inline(always)]
+            #[inline]
             fn swap1(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1)
                 })
             }
-            #[inline(always)]
+            #[inline]
             fn swap2(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2)
                 })
             }
-            #[inline(always)]
+            #[inline]
             fn swap4(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4)
                 })
             }
-            #[inline(always)]
+            #[inline]
             fn swap8(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8)
                 })
             }
-            #[inline(always)]
+            #[inline]
             fn swap16(self) -> Self {
                 dmap(self, |x| x.rotate_left(16))
             }
-            #[inline(always)]
+            #[inline]
             fn swap32(self) -> Self {
                 qmap(self, |x| x.rotate_left(32))
             }
-            #[inline(always)]
+            #[inline]
             fn swap64(self) -> Self {
                 omap(self, |x| (x << 64) | (x >> 64))
             }
@@ -306,83 +291,82 @@ impl_bitops!(u64x2_generic);
 impl_bitops!(u128x1_generic);
 
 impl RotateEachWord32 for u32x4_generic {
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right7(self) -> Self {
         dmap(self, |x| x.rotate_right(7))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right8(self) -> Self {
         dmap(self, |x| x.rotate_right(8))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right11(self) -> Self {
         dmap(self, |x| x.rotate_right(11))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right12(self) -> Self {
         dmap(self, |x| x.rotate_right(12))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right16(self) -> Self {
         dmap(self, |x| x.rotate_right(16))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right20(self) -> Self {
         dmap(self, |x| x.rotate_right(20))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right24(self) -> Self {
         dmap(self, |x| x.rotate_right(24))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right25(self) -> Self {
         dmap(self, |x| x.rotate_right(25))
     }
 }
 
 impl RotateEachWord32 for u64x2_generic {
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right7(self) -> Self {
         qmap(self, |x| x.rotate_right(7))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right8(self) -> Self {
         qmap(self, |x| x.rotate_right(8))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right11(self) -> Self {
         qmap(self, |x| x.rotate_right(11))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right12(self) -> Self {
         qmap(self, |x| x.rotate_right(12))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right16(self) -> Self {
         qmap(self, |x| x.rotate_right(16))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right20(self) -> Self {
         qmap(self, |x| x.rotate_right(20))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right24(self) -> Self {
         qmap(self, |x| x.rotate_right(24))
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right25(self) -> Self {
         qmap(self, |x| x.rotate_right(25))
     }
 }
 impl RotateEachWord64 for u64x2_generic {
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right32(self) -> Self {
         qmap(self, |x| x.rotate_right(32))
     }
 }
 
 // workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web)
-#[inline(always)]
 fn rotate_u128_right(x: u128, i: u32) -> u128 {
     (x >> i) | (x << (128 - i))
 }
@@ -393,41 +377,41 @@ fn test_rotate_u128() {
 }
 
 impl RotateEachWord32 for u128x1_generic {
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right7(self) -> Self {
         Self([rotate_u128_right(self.0[0], 7)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right8(self) -> Self {
         Self([rotate_u128_right(self.0[0], 8)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right11(self) -> Self {
         Self([rotate_u128_right(self.0[0], 11)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right12(self) -> Self {
         Self([rotate_u128_right(self.0[0], 12)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right16(self) -> Self {
         Self([rotate_u128_right(self.0[0], 16)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right20(self) -> Self {
         Self([rotate_u128_right(self.0[0], 20)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right24(self) -> Self {
         Self([rotate_u128_right(self.0[0], 24)])
     }
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right25(self) -> Self {
         Self([rotate_u128_right(self.0[0], 25)])
     }
 }
 impl RotateEachWord64 for u128x1_generic {
-    #[inline(always)]
+    #[inline]
     fn rotate_each_word_right32(self) -> Self {
         Self([rotate_u128_right(self.0[0], 32)])
     }
@@ -446,7 +430,7 @@ impl Machine for GenericMachine {
     type u32x4x4 = u32x4x4_generic;
     type u64x2x4 = u64x2x4_generic;
     type u128x4 = u128x4_generic;
-    #[inline(always)]
+    #[inline]
     unsafe fn instance() -> Self {
         Self
     }
@@ -623,22 +607,6 @@ pub type u32x4x4_generic = x4<u32x4_generic>;
 pub type u64x2x4_generic = x4<u64x2_generic>;
 pub type u128x4_generic = x4<u128x1_generic>;
 
-impl Vector<[u32; 16]> for u32x4x4_generic {
-    fn to_scalars(self) -> [u32; 16] {
-        let [a, b, c, d] = self.0;
-        let a = a.0;
-        let b = b.0;
-        let c = c.0;
-        let d = d.0;
-        [
-            a[0], a[1], a[2], a[3], //
-            b[0], b[1], b[2], b[3], //
-            c[0], c[1], c[2], c[3], //
-            d[0], d[1], d[2], d[3], //
-        ]
-    }
-}
-
 impl MultiLane<[u32; 4]> for u32x4_generic {
     #[inline(always)]
     fn to_lanes(self) -> [u32; 4] {
@@ -779,7 +747,7 @@ impl u128x4<GenericMachine> for u128x4_generic {}
 #[macro_export]
 macro_rules! dispatch {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline(always)]
+        #[inline]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
@@ -796,7 +764,7 @@ macro_rules! dispatch {
 #[macro_export]
 macro_rules! dispatch_light128 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline(always)]
+        #[inline]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
@@ -813,7 +781,7 @@ macro_rules! dispatch_light128 {
 #[macro_export]
 macro_rules! dispatch_light256 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline(always)]
+        #[inline]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
@@ -830,7 +798,7 @@ macro_rules! dispatch_light256 {
 #[macro_export]
 macro_rules! dispatch_light512 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline(always)]
+        #[inline]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
diff --git a/src/soft.rs b/src/soft.rs
index 0ae390c..8976c48 100644
--- a/src/soft.rs
+++ b/src/soft.rs
@@ -175,50 +175,26 @@ impl<W: BSwap + Copy, G> BSwap for x2<W, G> {
 impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> {
     #[inline(always)]
     unsafe fn unsafe_read_le(input: &[u8]) -> Self {
-        let input = input.split_at(input.len() / 2);
+        let input = input.split_at(16);
         x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)])
     }
     #[inline(always)]
     unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-        let input = input.split_at(input.len() / 2);
-        x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)])
+        x2::unsafe_read_le(input).bswap()
     }
     #[inline(always)]
     fn write_le(self, out: &mut [u8]) {
-        let out = out.split_at_mut(out.len() / 2);
+        let out = out.split_at_mut(16);
         self.0[0].write_le(out.0);
         self.0[1].write_le(out.1);
     }
     #[inline(always)]
     fn write_be(self, out: &mut [u8]) {
-        let out = out.split_at_mut(out.len() / 2);
+        let out = out.split_at_mut(16);
         self.0[0].write_be(out.0);
         self.0[1].write_be(out.1);
     }
 }
-impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> {
-    #[inline(always)]
-    fn shuffle_lane_words2301(self) -> Self {
-        Self::new([
-            self.0[0].shuffle_lane_words2301(),
-            self.0[1].shuffle_lane_words2301(),
-        ])
-    }
-    #[inline(always)]
-    fn shuffle_lane_words1230(self) -> Self {
-        Self::new([
-            self.0[0].shuffle_lane_words1230(),
-            self.0[1].shuffle_lane_words1230(),
-        ])
-    }
-    #[inline(always)]
-    fn shuffle_lane_words3012(self) -> Self {
-        Self::new([
-            self.0[0].shuffle_lane_words3012(),
-            self.0[1].shuffle_lane_words3012(),
-        ])
-    }
-}
 
 #[derive(Copy, Clone, Default)]
 #[allow(non_camel_case_types)]
@@ -334,20 +310,6 @@ impl<W: Copy> Vec4<W> for x4<W> {
         self
     }
 }
-impl<W: Copy> Vec4Ext<W> for x4<W> {
-    #[inline(always)]
-    fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
-    where
-        Self: Sized,
-    {
-        (
-            x4([a.0[0], b.0[0], c.0[0], d.0[0]]),
-            x4([a.0[1], b.0[1], c.0[1], d.0[1]]),
-            x4([a.0[2], b.0[2], c.0[2], d.0[2]]),
-            x4([a.0[3], b.0[3], c.0[3], d.0[3]]),
-        )
-    }
-}
 impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> {
     #[inline(always)]
     unsafe fn unpack(p: vec512_storage) -> Self {
@@ -406,39 +368,30 @@ impl<W: BSwap + Copy> BSwap for x4<W> {
 impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> {
     #[inline(always)]
     unsafe fn unsafe_read_le(input: &[u8]) -> Self {
-        let n = input.len() / 4;
         x4([
-            W::unsafe_read_le(&input[..n]),
-            W::unsafe_read_le(&input[n..n * 2]),
-            W::unsafe_read_le(&input[n * 2..n * 3]),
-            W::unsafe_read_le(&input[n * 3..]),
+            W::unsafe_read_le(&input[0..16]),
+            W::unsafe_read_le(&input[16..32]),
+            W::unsafe_read_le(&input[32..48]),
+            W::unsafe_read_le(&input[48..64]),
         ])
     }
     #[inline(always)]
     unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-        let n = input.len() / 4;
-        x4([
-            W::unsafe_read_be(&input[..n]),
-            W::unsafe_read_be(&input[n..n * 2]),
-            W::unsafe_read_be(&input[n * 2..n * 3]),
-            W::unsafe_read_be(&input[n * 3..]),
-        ])
+        x4::unsafe_read_le(input).bswap()
     }
     #[inline(always)]
     fn write_le(self, out: &mut [u8]) {
-        let n = out.len() / 4;
-        self.0[0].write_le(&mut out[..n]);
-        self.0[1].write_le(&mut out[n..n * 2]);
-        self.0[2].write_le(&mut out[n * 2..n * 3]);
-        self.0[3].write_le(&mut out[n * 3..]);
+        self.0[0].write_le(&mut out[0..16]);
+        self.0[1].write_le(&mut out[16..32]);
+        self.0[2].write_le(&mut out[32..48]);
+        self.0[3].write_le(&mut out[48..64]);
     }
     #[inline(always)]
     fn write_be(self, out: &mut [u8]) {
-        let n = out.len() / 4;
-        self.0[0].write_be(&mut out[..n]);
-        self.0[1].write_be(&mut out[n..n * 2]);
-        self.0[2].write_be(&mut out[n * 2..n * 3]);
-        self.0[3].write_be(&mut out[n * 3..]);
+        self.0[0].write_be(&mut out[0..16]);
+        self.0[1].write_be(&mut out[16..32]);
+        self.0[2].write_be(&mut out[32..48]);
+        self.0[3].write_be(&mut out[48..64]);
     }
 }
 impl<W: Copy + LaneWords4> LaneWords4 for x4<W> {
diff --git a/src/types.rs b/src/types.rs
index f9f3bf1..a282670 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -71,17 +71,6 @@ pub trait Vec4<W> {
     fn extract(self, i: u32) -> W;
     fn insert(self, w: W, i: u32) -> Self;
 }
-/// Vec4 functions which may not be implemented yet for all Vec4 types.
-/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage,
-/// import Vec4Ext only together with Vec4, and don't qualify its methods.
-pub trait Vec4Ext<W> {
-    fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
-    where
-        Self: Sized;
-}
-pub trait Vector<T> {
-    fn to_scalars(self) -> T;
-}
 
 // TODO: multiples of 4 should inherit this
 /// A vector composed of four words; depending on their size, operations may cross lanes.
@@ -123,7 +112,12 @@ pub trait u32x4<M: Machine>:
 {
 }
 pub trait u64x2<M: Machine>:
-    BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage>
+    BitOps64
+    + Store<vec128_storage>
+    + ArithOps
+    + Vec2<u64>
+    + MultiLane<[u64; 2]>
+    + Into<vec128_storage>
 {
 }
 pub trait u128x1<M: Machine>:
@@ -138,7 +132,6 @@ pub trait u32x4x2<M: Machine>:
     + MultiLane<[M::u32x4; 2]>
     + ArithOps
     + Into<vec256_storage>
-    + StoreBytes
 {
 }
 pub trait u64x2x2<M: Machine>:
@@ -176,13 +169,10 @@ pub trait u32x4x4<M: Machine>:
     BitOps32
     + Store<vec512_storage>
     + Vec4<M::u32x4>
-    + Vec4Ext<M::u32x4>
-    + Vector<[u32; 16]>
     + MultiLane<[M::u32x4; 4]>
     + ArithOps
     + LaneWords4
     + Into<vec512_storage>
-    + StoreBytes
 {
 }
 pub trait u64x2x4<M: Machine>:
diff --git a/src/x86_64/mod.rs b/src/x86_64/mod.rs
index 937732d..d7455d0 100644
--- a/src/x86_64/mod.rs
+++ b/src/x86_64/mod.rs
@@ -79,7 +79,7 @@ where
     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
 
-    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
+    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
@@ -119,16 +119,16 @@ impl Store<vec128_storage> for vec128_storage {
         p
     }
 }
-impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
+impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
     #[inline(always)]
-    fn from(x: &'a vec128_storage) -> Self {
-        unsafe { &x.u32x4 }
+    fn into(self) -> &'a [u32; 4] {
+        unsafe { &self.u32x4 }
     }
 }
-impl From<[u32; 4]> for vec128_storage {
+impl Into<vec128_storage> for [u32; 4] {
     #[inline(always)]
-    fn from(u32x4: [u32; 4]) -> Self {
-        vec128_storage { u32x4 }
+    fn into(self) -> vec128_storage {
+        vec128_storage { u32x4: self }
     }
 }
 impl Default for vec128_storage {
@@ -154,10 +154,10 @@ pub union vec256_storage {
     sse2: [vec128_storage; 2],
     avx: __m256i,
 }
-impl From<[u64; 4]> for vec256_storage {
+impl Into<vec256_storage> for [u64; 4] {
     #[inline(always)]
-    fn from(u64x4: [u64; 4]) -> Self {
-        vec256_storage { u64x4 }
+    fn into(self) -> vec256_storage {
+        vec256_storage { u64x4: self }
     }
 }
 impl Default for vec256_storage {
@@ -167,11 +167,9 @@ impl Default for vec256_storage {
     }
 }
 impl vec256_storage {
-    #[inline(always)]
     pub fn new128(xs: [vec128_storage; 2]) -> Self {
         Self { sse2: xs }
     }
-    #[inline(always)]
     pub fn split128(self) -> [vec128_storage; 2] {
         unsafe { self.sse2 }
     }
@@ -202,11 +200,9 @@ impl Default for vec512_storage {
     }
 }
 impl vec512_storage {
-    #[inline(always)]
     pub fn new128(xs: [vec128_storage; 4]) -> Self {
         Self { sse2: xs }
     }
-    #[inline(always)]
     pub fn split128(self) -> [vec128_storage; 4] {
         unsafe { self.sse2 }
     }
@@ -221,10 +217,10 @@ impl PartialEq for vec512_storage {
 
 macro_rules! impl_into {
     ($storage:ident, $array:ty, $name:ident) => {
-        impl From<$storage> for $array {
+        impl Into<$array> for $storage {
             #[inline(always)]
-            fn from(vec: $storage) -> Self {
-                unsafe { vec.$name }
+            fn into(self) -> $array {
+                unsafe { self.$name }
             }
         }
     };
diff --git a/src/x86_64/sse2.rs b/src/x86_64/sse2.rs
index 97197a4..bf0063f 100644
--- a/src/x86_64/sse2.rs
+++ b/src/x86_64/sse2.rs
@@ -189,21 +189,21 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
     rotr_32!(rotate_each_word_right7, 7);
     rotr_32_s3!(
         rotate_each_word_right8,
-        0x0c0f_0e0d_080b_0a09,
-        0x0407_0605_0003_0201
+        0x0c0f0e0d_080b0a09,
+        0x04070605_00030201
     );
     rotr_32!(rotate_each_word_right11, 11);
     rotr_32!(rotate_each_word_right12, 12);
     rotr_32_s3!(
         rotate_each_word_right16,
-        0x0d0c_0f0e_0908_0b0a,
-        0x0504_0706_0100_0302
+        0x0d0c0f0e_09080b0a,
+        0x05040706_01000302
     );
     rotr_32!(rotate_each_word_right20, 20);
     rotr_32_s3!(
         rotate_each_word_right24,
-        0x0e0d_0c0f_0a09_080b,
-        0x0605_0407_0201_0003
+        0x0e0d0c0f_0a09080b,
+        0x06050407_02010003
     );
     rotr_32!(rotate_each_word_right25, 25);
 }
@@ -880,13 +880,6 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
 #[allow(non_camel_case_types)]
 pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
 
-impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
-    #[inline(always)]
-    fn to_scalars(self) -> [u32; 16] {
-        unsafe { core::mem::transmute(self) }
-    }
-}
-
 impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
 where
     u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
@@ -990,8 +983,6 @@ where
     Machine86<S3, S4, NI>: Machine,
     u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
     u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
-    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
-    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
 {
 }
 impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
@@ -1013,6 +1004,14 @@ where
 {
 }
 
+impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
+where
+    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
+    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
+{
+}
 impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
 where
     u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
@@ -1375,78 +1374,65 @@ mod test {
 
 pub mod avx2 {
     #![allow(non_camel_case_types)]
-    use crate::soft::{x2, x4};
+    use crate::soft::x4;
     use crate::types::*;
-    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
+    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
     use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
     use core::arch::x86_64::*;
     use core::marker::PhantomData;
     use core::ops::*;
 
     #[derive(Copy, Clone)]
-    pub struct u32x4x2_avx2<NI> {
-        x: __m256i,
+    pub struct u32x4x4_avx2<NI> {
+        x: [__m256i; 2],
         ni: PhantomData<NI>,
     }
 
-    impl<NI> u32x4x2_avx2<NI> {
+    impl<NI> u32x4x4_avx2<NI> {
         #[inline(always)]
-        fn new(x: __m256i) -> Self {
+        fn new(x: [__m256i; 2]) -> Self {
             Self { x, ni: PhantomData }
         }
     }
 
-    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
-    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
-        #[inline(always)]
-        unsafe fn unpack(p: vec256_storage) -> Self {
-            Self::new(p.avx)
-        }
-    }
-    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
-        #[inline(always)]
-        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
-            assert_eq!(input.len(), 32);
-            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
-        }
+    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
         #[inline(always)]
-        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-            Self::unsafe_read_le(input).bswap()
-        }
-        #[inline(always)]
-        fn write_le(self, out: &mut [u8]) {
-            unsafe {
-                assert_eq!(out.len(), 32);
-                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
-            }
-        }
-        #[inline(always)]
-        fn write_be(self, out: &mut [u8]) {
-            self.bswap().write_le(out)
+        unsafe fn unpack(p: vec512_storage) -> Self {
+            Self::new([p.avx[0].avx, p.avx[1].avx])
         }
     }
-    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
+    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
         #[inline(always)]
-        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
             unsafe {
                 [
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                 ]
             }
         }
         #[inline(always)]
-        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
-            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_setr_m128i(x[0].x, x[1].x),
+                    _mm256_setr_m128i(x[2].x, x[3].x),
+                ]
+            })
         }
     }
-    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
+    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
         #[inline(always)]
         fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
             unsafe {
                 match i {
-                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
-                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
+                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
+                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
+                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
+                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                     _ => panic!(),
                 }
             }
@@ -1455,21 +1441,55 @@ pub mod avx2 {
         fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
             Self::new(unsafe {
                 match i {
-                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
-                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
+                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
+                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
+                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
+                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                     _ => panic!(),
                 }
             })
         }
     }
-    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
-    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn shuffle_lane_words1230(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
+                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
+                ]
+            })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words2301(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
+                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
+                ]
+            })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words3012(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
+                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
+                ]
+            })
+        }
+    }
+    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
     macro_rules! shuf_lane_bytes {
         ($name:ident, $k0:expr, $k1:expr) => {
             #[inline(always)]
             fn $name(self) -> Self {
                 Self::new(unsafe {
-                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
+                    [
+                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
+                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
+                    ]
                 })
             }
         };
@@ -1479,41 +1499,52 @@ pub mod avx2 {
             #[inline(always)]
             fn $name(self) -> Self {
                 Self::new(unsafe {
-                    _mm256_or_si256(
-                        _mm256_srli_epi32(self.x, $i as i32),
-                        _mm256_slli_epi32(self.x, 32 - $i as i32),
-                    )
+                    [
+                        _mm256_or_si256(
+                            _mm256_srli_epi32(self.x[0], $i as i32),
+                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
+                        ),
+                        _mm256_or_si256(
+                            _mm256_srli_epi32(self.x[1], $i as i32),
+                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
+                        ),
+                    ]
                 })
             }
         };
     }
-    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
+    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
         rotr_32!(rotate_each_word_right7, 7);
         shuf_lane_bytes!(
             rotate_each_word_right8,
-            0x0c0f_0e0d_080b_0a09,
-            0x0407_0605_0003_0201
+            0x0c0f0e0d_080b0a09,
+            0x04070605_00030201
         );
         rotr_32!(rotate_each_word_right11, 11);
         rotr_32!(rotate_each_word_right12, 12);
         shuf_lane_bytes!(
             rotate_each_word_right16,
-            0x0d0c_0f0e_0908_0b0a,
-            0x0504_0706_0100_0302
+            0x0d0c0f0e_09080b0a,
+            0x05040706_01000302
         );
         rotr_32!(rotate_each_word_right20, 20);
         shuf_lane_bytes!(
             rotate_each_word_right24,
-            0x0e0d_0c0f_0a09_080b,
-            0x0605_0407_0201_0003
+            0x0e0d0c0f_0a09080b,
+            0x06050407_02010003
         );
         rotr_32!(rotate_each_word_right25, 25);
     }
-    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
-    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
+    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
         #[inline(always)]
-        fn from(x: u32x4x2_avx2<NI>) -> Self {
-            Self { avx: x.x }
+        fn from(x: u32x4x4_avx2<NI>) -> Self {
+            Self {
+                avx: [
+                    vec256_storage { avx: x.x[0] },
+                    vec256_storage { avx: x.x[1] },
+                ],
+            }
         }
     }
 
@@ -1530,172 +1561,55 @@ pub mod avx2 {
             }
         };
     }
-    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
-    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
-    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
-    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
+    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
+    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
+    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
+    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
 
-    macro_rules! impl_bitop {
+    macro_rules! impl_bitop_x2 {
         ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
             impl<NI> $Op for $vec<NI> {
                 type Output = Self;
                 #[inline(always)]
                 fn $op_fn(self, rhs: Self) -> Self::Output {
-                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
+                    Self::new(unsafe {
+                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
+                    })
                 }
             }
         };
     }
-    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
-    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
-    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
-    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
-    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
+    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
+    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
+    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
+    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
+    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
 
-    impl<NI> Not for u32x4x2_avx2<NI> {
+    impl<NI> Not for u32x4x4_avx2<NI> {
         type Output = Self;
         #[inline(always)]
         fn not(self) -> Self::Output {
             unsafe {
                 let f = _mm256_set1_epi8(-0x7f);
-                Self::new(f) ^ self
+                Self::new([f, f]) ^ self
             }
         }
     }
 
-    impl<NI> BSwap for u32x4x2_avx2<NI> {
+    impl<NI> BSwap for u32x4x4_avx2<NI> {
         shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
     }
 
-    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
+    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
     where
         NI: Copy,
     {
         #[inline(always)]
-        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
-            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
-        }
-    }
-
-    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
-        #[inline(always)]
-        fn shuffle_lane_words1230(self) -> Self {
-            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
-        }
-        #[inline(always)]
-        fn shuffle_lane_words2301(self) -> Self {
-            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
-        }
-        #[inline(always)]
-        fn shuffle_lane_words3012(self) -> Self {
-            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////////////////
-
-    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
-    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
-
-    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        unsafe fn unpack(p: vec512_storage) -> Self {
-            Self::new([
-                u32x4x2_avx2::unpack(p.avx[0]),
-                u32x4x2_avx2::unpack(p.avx[1]),
-            ])
-        }
-    }
-    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
-            let [a, b] = self.0[0].to_lanes();
-            let [c, d] = self.0[1].to_lanes();
-            [a, b, c, d]
-        }
-        #[inline(always)]
-        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
-            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
-            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
-            Self::new([ab, cd])
-        }
-    }
-    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
-            match i {
-                0 => self.0[0].extract(0),
-                1 => self.0[0].extract(1),
-                2 => self.0[1].extract(0),
-                3 => self.0[1].extract(1),
-                _ => panic!(),
-            }
-        }
-        #[inline(always)]
-        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
-            Self::new(match i {
-                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
-                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
-                _ => panic!(),
-            })
-        }
-    }
-    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
-            /*
-             * a00:a01 a10:a11
-             * b00:b01 b10:b11
-             * c00:c01 c10:c11
-             * d00:d01 d10:d11
-             *       =>
-             * a00:b00 c00:d00
-             * a01:b01 c01:d01
-             * a10:b10 c10:d10
-             * a11:b11 c11:d11
-             */
-            unsafe {
-                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
-                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
-                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
-                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
-                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
-                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
-                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
-                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
-                (
-                    Self::new([ab00, cd00]),
-                    Self::new([ab01, cd01]),
-                    Self::new([ab10, cd10]),
-                    Self::new([ab11, cd11]),
-                )
-            }
-        }
-    }
-    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        fn to_scalars(self) -> [u32; 16] {
-            unsafe { core::mem::transmute(self) }
-        }
-    }
-    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
-        #[inline(always)]
-        fn from(x: u32x4x4_avx2<NI>) -> Self {
-            Self {
-                avx: [
-                    vec256_storage { avx: x.0[0].x },
-                    vec256_storage { avx: x.0[1].x },
-                ],
-            }
-        }
-    }
-    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
-        #[inline(always)]
         fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
             Self::new(unsafe {
                 [
-                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
-                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
+                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
+                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                 ]
             })
         }
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2022-03-23 20:51:30 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2022-03-23 20:51:30 +0000
commit	5896e12e155aeb30682fcb6473a57a8f83f7de35 (patch)
tree	7d9f824e8baa7032c8f8bf51f965345a27a21cef
parent	fcbcefbbc5486d3fa1bf9ac6ce6bfd66fbbf750e (diff)
parent	84737ef36d3bd9c929b801fdaa22c8e27899648e (diff)
download	ppv-lite86-5896e12e155aeb30682fcb6473a57a8f83f7de35.tar.gz