aboutsummaryrefslogtreecommitdiff
path: root/src/reader/indexset.rs
blob: 3d683a28045bddd0c75fd3c46c4a4750b241aff5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
use crate::attribute::OwnedAttribute;
use crate::name::OwnedName;

use std::collections::hash_map::RandomState;
use std::collections::HashSet;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;

/// An ordered set
///
/// Attributes are kept in a `Vec` in the order they were pushed. A set of
/// name hashes sits next to it so that `contains` can usually answer
/// "not present" without scanning the whole vector.
pub(crate) struct AttributesSet {
    /// The attributes themselves, in insertion order
    vec: Vec<OwnedAttribute>,
    /// Hashes of the attribute names; stays empty while the set is small
    /// (see `HASH_THRESHOLD`).
    /// Uses a no-op hasher, because these u64s are hashes already
    may_contain: HashSet<u64, U64HasherBuilder>,
    /// This is the real hasher for the `OwnedName`
    hasher: RandomState,
}

/// Use linear search and don't allocate `HashSet` if there are few attributes,
/// because allocation costs more than a few comparisons.
///
/// `AttributesSet::push` starts recording name hashes only once the vector
/// already holds this many attributes.
const HASH_THRESHOLD: usize = 8;

impl AttributesSet {
    /// Creates an empty set.
    ///
    /// The hash set starts out empty; `push` fills it once the number of
    /// attributes goes beyond `HASH_THRESHOLD`.
    pub fn new() -> Self {
        Self {
            vec: Vec::new(),
            hasher: RandomState::new(),
            may_contain: HashSet::default(),
        }
    }

    /// Hashes an attribute name with this set's own randomly seeded hasher.
    fn hash(&self, val: &OwnedName) -> u64 {
        let mut h = self.hasher.build_hasher();
        val.hash(&mut h);
        h.finish()
    }

    /// Returns the number of attributes pushed so far.
    pub fn len(&self) -> usize {
        self.vec.len()
    }

    /// Returns `true` if an attribute with exactly this `name` has been pushed.
    pub fn contains(&self, name: &OwnedName) -> bool {
        // `push` fills `may_contain` only when the vector already holds
        // `HASH_THRESHOLD` attributes, so the hash filter is complete only
        // once the length exceeds the threshold. Up to and including the
        // threshold the filter is still empty and must not be consulted,
        // otherwise every lookup at exactly `HASH_THRESHOLD` attributes
        // would wrongly report "absent".
        let is_small = self.vec.len() <= HASH_THRESHOLD;

        // fall back to linear search only on duplicate or hash collision
        (is_small || self.may_contain.contains(&self.hash(name)))
            && self.vec.iter().any(|a| &a.name == name)
    }

    /// Appends `attr` to the end of the set.
    ///
    /// This does not check for duplicates; it only keeps the hash filter in
    /// sync with the vector once the set has grown past the threshold.
    pub fn push(&mut self, attr: OwnedAttribute) {
        if self.vec.len() >= HASH_THRESHOLD {
            if self.vec.len() == HASH_THRESHOLD {
                // First push beyond the threshold: allocate the hash set and
                // back-fill it with the names that were stored without hashing.
                self.may_contain.reserve(HASH_THRESHOLD * 2);
                for attr in &self.vec {
                    self.may_contain.insert(self.hash(&attr.name));
                }
            }
            self.may_contain.insert(self.hash(&attr.name));
        }
        self.vec.push(attr);
    }

    /// Consumes the set and returns the attributes in insertion order.
    pub fn into_vec(self) -> Vec<OwnedAttribute> {
        self.vec
    }
}

#[test]
fn indexset() {
    // Builds a name that has neither a namespace nor a prefix.
    let bare = |local_name: String| OwnedName { local_name, namespace: None, prefix: None };

    // Same local name as one of the pushed attributes, but namespaced,
    // so it must never be reported as present.
    let absent = OwnedName {
        prefix: None,
        namespace: Some("test".into()),
        local_name: "attr1000".into(),
    };
    let mut set = AttributesSet::new();

    // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
    for i in 0..50000 {
        let fresh = bare(format!("attr{i}"));
        assert!(!set.contains(&fresh));

        set.push(OwnedAttribute { name: fresh, value: String::new() });
        assert!(!set.contains(&absent));
    }

    // A middle, the first and the last pushed names must all be found.
    for present in ["attr1234", "attr0", "attr49999"] {
        assert!(set.contains(&bare(present.into())));
    }
}

/// Hasher that does nothing except pass a `u64` through.
///
/// Starting from a zero state, a single `write_u64` leaves exactly that
/// value as the result of `finish`.
struct U64Hasher(u64);

impl Hasher for U64Hasher {
    /// Returns the accumulated state as-is.
    fn finish(&self) -> u64 {
        self.0
    }

    /// Byte-wise fallback required by the trait: XORs every byte into the
    /// state. Unused in practice.
    fn write(&mut self, slice: &[u8]) {
        self.0 = slice
            .iter()
            .fold(self.0, |state, &byte| state ^ u64::from(byte));
    }

    /// XORs `i` into the state.
    fn write_u64(&mut self, i: u64) {
        self.0 ^= i;
    }
}

/// Stateless builder that hands out fresh pass-through `U64Hasher`s.
#[derive(Default)]
struct U64HasherBuilder;

impl BuildHasher for U64HasherBuilder {
    type Hasher = U64Hasher;

    /// Every hasher starts from a zero state.
    fn build_hasher(&self) -> Self::Hasher {
        U64Hasher(0)
    }
}