diff options
Diffstat (limited to 'src/reader/indexset.rs')
-rw-r--r-- | src/reader/indexset.rs | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/src/reader/indexset.rs b/src/reader/indexset.rs new file mode 100644 index 0000000..3d683a2 --- /dev/null +++ b/src/reader/indexset.rs @@ -0,0 +1,116 @@ +use crate::attribute::OwnedAttribute; +use crate::name::OwnedName; + +use std::collections::hash_map::RandomState; +use std::collections::HashSet; +use std::hash::BuildHasher; +use std::hash::Hash; +use std::hash::Hasher; + +/// An ordered set +pub(crate) struct AttributesSet { + vec: Vec<OwnedAttribute>, + /// Uses a no-op hasher, because these u64s are hashes already + may_contain: HashSet<u64, U64HasherBuilder>, + /// This is real hasher for the `OwnedName` + hasher: RandomState, +} + +/// Use linear search and don't allocate `HashSet` if there are few attributes, +/// because allocation costs more than a few comparisons. +const HASH_THRESHOLD: usize = 8; + +impl AttributesSet { + pub fn new() -> Self { + Self { + vec: Vec::new(), + hasher: RandomState::new(), + may_contain: HashSet::default(), + } + } + + fn hash(&self, val: &OwnedName) -> u64 { + let mut h = self.hasher.build_hasher(); + val.hash(&mut h); + h.finish() + } + + pub fn len(&self) -> usize { + self.vec.len() + } + + pub fn contains(&self, name: &OwnedName) -> bool { + // fall back to linear search only on duplicate or hash collision + (self.vec.len() < HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) && + self.vec.iter().any(move |a| &a.name == name) + } + + pub fn push(&mut self, attr: OwnedAttribute) { + if self.vec.len() >= HASH_THRESHOLD { + if self.vec.len() == HASH_THRESHOLD { + self.may_contain.reserve(HASH_THRESHOLD * 2); + for attr in &self.vec { + self.may_contain.insert(self.hash(&attr.name)); + } + } + self.may_contain.insert(self.hash(&attr.name)); + } + self.vec.push(attr); + } + + pub fn into_vec(self) -> Vec<OwnedAttribute> { + self.vec + } +} + +#[test] +fn indexset() { + let mut s = AttributesSet::new(); + let not_here = OwnedName { + local_name: "attr1000".into(), + namespace: Some("test".into()), + prefix: None, + }; + + // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic + for i in 0..50000 { + let name = OwnedName { + local_name: format!("attr{i}"), namespace: None, prefix: None, + }; + assert!(!s.contains(&name)); + + s.push(OwnedAttribute { name, value: String::new() }); + assert!(!s.contains(¬_here)); + } + + assert!(s.contains(&OwnedName { + local_name: "attr1234".into(), namespace: None, prefix: None, + })); + assert!(s.contains(&OwnedName { + local_name: "attr0".into(), namespace: None, prefix: None, + })); + assert!(s.contains(&OwnedName { + local_name: "attr49999".into(), namespace: None, prefix: None, + })); +} + +/// Hashser that does nothing except passing u64 through +struct U64Hasher(u64); + +impl Hasher for U64Hasher { + fn finish(&self) -> u64 { self.0 } + fn write(&mut self, slice: &[u8]) { + for &v in slice { self.0 ^= u64::from(v) } // unused in practice + } + fn write_u64(&mut self, i: u64) { + self.0 ^= i; + } +} + +#[derive(Default)] +struct U64HasherBuilder; + +impl BuildHasher for U64HasherBuilder { + type Hasher = U64Hasher; + fn build_hasher(&self) -> U64Hasher { U64Hasher(0) } +} |