diff options
author | Chih-Hung Hsieh <chh@google.com> | 2020-10-29 10:23:47 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2020-10-29 10:23:47 +0000 |
commit | 8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b (patch) | |
tree | 7d52089b3c3b807c9284314462ae0d08bea4a7eb | |
parent | b75048b81341b2b1f6150b40d7ed3004ce15b391 (diff) | |
parent | f1854b7bbcc06c1eb66cb3e1750d86a274a48122 (diff) | |
download | aho-corasick-8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b.tar.gz |
Upgrade rust/crates/aho-corasick to 0.7.14 am: 2db690b09c am: 6176f34d45 am: 0e24f4c691 am: f1854b7bbc
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/aho-corasick/+/1474960
Change-Id: Icdfe80978584c6a1e270f18d6c3f7095a75e8e1d
-rw-r--r-- | .cargo_vcs_info.json | 2 | ||||
-rw-r--r-- | Cargo.toml | 2 | ||||
-rw-r--r-- | Cargo.toml.orig | 2 | ||||
-rw-r--r-- | METADATA | 8 | ||||
-rw-r--r-- | src/ahocorasick.rs | 28 | ||||
-rw-r--r-- | src/buffer.rs | 4 | ||||
-rw-r--r-- | src/dfa.rs | 4 | ||||
-rw-r--r-- | src/prefilter.rs | 60 | ||||
-rw-r--r-- | src/tests.rs | 120 |
9 files changed, 198 insertions, 32 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index 725cd96..33e5654 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,5 @@ { "git": { - "sha1": "55a42968a26a1150aca116fab63537330782d56a" + "sha1": "63f0b5252345fa50e490de42b3d82b5159c0e5dd" } } @@ -12,7 +12,7 @@ [package] name = "aho-corasick" -version = "0.7.13" +version = "0.7.14" authors = ["Andrew Gallant <jamslam@gmail.com>"] exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"] autotests = false diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 00d71ef..0b64728 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "aho-corasick" -version = "0.7.13" #:version +version = "0.7.14" #:version authors = ["Andrew Gallant <jamslam@gmail.com>"] description = "Fast multiple substring searching." homepage = "https://github.com/BurntSushi/aho-corasick" @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.13.crate" + value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.14.crate" } - version: "0.7.13" + version: "0.7.14" license_type: NOTICE last_upgrade_date { year: 2020 - month: 7 - day: 10 + month: 10 + day: 26 } } diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs index 7880d13..9069396 100644 --- a/src/ahocorasick.rs +++ b/src/ahocorasick.rs @@ -6,7 +6,7 @@ use dfa::{self, DFA}; use error::Result; use nfa::{self, NFA}; use packed; -use prefilter::PrefilterState; +use prefilter::{Prefilter, PrefilterState}; use state_id::StateID; use Match; @@ -1075,6 +1075,24 @@ impl<S: StateID> Imp<S> { } } + /// Returns the prefilter object, if one exists, for the underlying + /// automaton. + fn prefilter(&self) -> Option<&dyn Prefilter> { + match *self { + Imp::NFA(ref nfa) => nfa.prefilter(), + Imp::DFA(ref dfa) => dfa.prefilter(), + } + } + + /// Returns true if and only if we should attempt to use a prefilter. + fn use_prefilter(&self) -> bool { + let p = match self.prefilter() { + None => return false, + Some(p) => p, + }; + !p.looks_for_non_start_of_match() + } + #[inline(always)] fn overlapping_find_at( &self, @@ -1363,7 +1381,11 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { "stream searching is only supported for Standard match semantics" ); - let prestate = PrefilterState::new(ac.max_pattern_len()); + let prestate = if ac.imp.use_prefilter() { + PrefilterState::new(ac.max_pattern_len()) + } else { + PrefilterState::disabled() + }; let buf = Buffer::new(ac.imp.max_pattern_len()); let state_id = ac.imp.start_state(); StreamChunkIter { @@ -1847,7 +1869,7 @@ impl AhoCorasickBuilder { /// finite automaton (NFA) is used instead. /// /// The main benefit to a DFA is that it can execute searches more quickly - /// than a DFA (perhaps 2-4 times as fast). The main drawback is that the + /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the /// DFA uses more space and can take much longer to build. /// /// Enabling this option does not change the time complexity for diff --git a/src/buffer.rs b/src/buffer.rs index 1008196..e7339eb 100644 --- a/src/buffer.rs +++ b/src/buffer.rs @@ -50,7 +50,9 @@ impl Buffer { // reasons, so we set a lower bound of `8 * min`. // // TODO: It would be good to find a way to test the streaming - // implementation with the minimal buffer size. + // implementation with the minimal buffer size. For now, we just + // uncomment out the next line and comment out the subsequent line. + // let capacity = 1 + min; let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); Buffer { buf: vec![0; capacity], min, end: 0 } } @@ -43,6 +43,10 @@ impl<S: StateID> DFA<S> { self.repr().pattern_count } + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + pub fn start_state(&self) -> S { self.repr().start_id } diff --git a/src/prefilter.rs b/src/prefilter.rs index bda215d..05fa46d 100644 --- a/src/prefilter.rs +++ b/src/prefilter.rs @@ -80,6 +80,17 @@ pub trait Prefilter: fn reports_false_positives(&self) -> bool { true } + + /// Returns true if and only if this prefilter may look for a non-starting + /// position of a match. + /// + /// This is useful in a streaming context where prefilters that don't look + /// for a starting position of a match can be quite difficult to deal with. + /// + /// This returns false by default. + fn looks_for_non_start_of_match(&self) -> bool { + false + } } impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { @@ -191,6 +202,17 @@ impl PrefilterState { } } + /// Create a prefilter state that always disables the prefilter. + pub fn disabled() -> PrefilterState { + PrefilterState { + skips: 0, + skipped: 0, + max_match_len: 0, + inert: true, + last_scan_at: 0, + } + } + /// Update this state with the number of bytes skipped on the last /// invocation of the prefilter. #[inline] @@ -285,6 +307,7 @@ impl Builder { /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. pub fn build(&self) -> Option<PrefilterObj> { + // match (self.start_bytes.build(), self.rare_bytes.build()) { match (self.start_bytes.build(), self.rare_bytes.build()) { // If we could build both start and rare prefilters, then there are // a few cases in which we'd want to use the start-byte prefilter @@ -663,6 +686,33 @@ impl Prefilter for RareBytesOne { fn heap_bytes(&self) -> usize { 0 } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: It should be possible to use a rare byte prefilter in a + // streaming context. The main problem is that we usually assume that + // if a prefilter has scanned some text and not found anything, then no + // match *starts* in that text. This doesn't matter in non-streaming + // contexts, but in a streaming context, if we're looking for a byte + // that doesn't start at the beginning of a match and don't find it, + // then it's still possible for a match to start at the end of the + // current buffer content. In order to fix this, the streaming searcher + // would need to become aware of prefilters that do this and use the + // appropriate offset in various places. It is quite a delicate change + // and probably shouldn't be attempted until streaming search has a + // better testing strategy. In particular, we'd really like to be able + // to vary the buffer size to force strange cases that occur at the + // edge of the buffer. If we make the buffer size minimal, then these + // cases occur more frequently and easier. + // + // This is also a bummer because this means that if the prefilter + // builder chose a rare byte prefilter, then a streaming search won't + // use any prefilter at all because the builder doesn't know how it's + // going to be used. Assuming we don't make streaming search aware of + // these special types of prefilters as described above, we could fix + // this by building a "backup" prefilter that could be used when the + // rare byte prefilter could not. But that's a bandaide. Sigh. + true + } } /// A prefilter for scanning for two "rare" bytes. @@ -697,6 +747,11 @@ impl Prefilter for RareBytesTwo { fn heap_bytes(&self) -> usize { 0 } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } } /// A prefilter for scanning for three "rare" bytes. @@ -732,6 +787,11 @@ impl Prefilter for RareBytesThree { fn heap_bytes(&self) -> usize { 0 } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } } /// A builder for constructing a starting byte prefilter. diff --git a/src/tests.rs b/src/tests.rs index 0ae31f0..29eba1d 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -998,6 +998,31 @@ testconfig!( } ); +fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>( + which: TestCollection, + mut f: F, +) { + let get_match_triples = + |matches: Vec<Match>| -> Vec<(usize, usize, usize)> { + matches + .into_iter() + .map(|m| (m.pattern(), m.start(), m.end())) + .collect() + }; + for &tests in which { + for test in tests { + assert_eq!( + test.matches, + get_match_triples(f(&test)).as_slice(), + "test: {}, patterns: {:?}, haystack: {:?}", + test.name, + test.patterns, + test.haystack + ); + } + } +} + #[test] fn search_tests_have_unique_names() { let assert = |constname, tests: &[SearchTest]| { @@ -1126,27 +1151,80 @@ fn regression_case_insensitive_prefilter() { } } -fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>( - which: TestCollection, - mut f: F, -) { - let get_match_triples = - |matches: Vec<Match>| -> Vec<(usize, usize, usize)> { - matches - .into_iter() - .map(|m| (m.pattern(), m.start(), m.end())) - .collect() - }; - for &tests in which { - for test in tests { - assert_eq!( - test.matches, - get_match_triples(f(&test)).as_slice(), - "test: {}, patterns: {:?}, haystack: {:?}", - test.name, - test.patterns, - test.haystack - ); +// See: https://github.com/BurntSushi/aho-corasick/issues/64 +// +// This occurs when the rare byte prefilter is active +#[test] +fn regression_stream_rare_byte_prefilter() { + use std::io::Read; + + // NOTE: The test only fails if this ends with j. + const MAGIC: [u8; 5] = *b"1234j"; + + // NOTE: The test fails for value in 8188..=8191 These value put the string + // to search accross two call to read because the buffer size is 8192 by + // default. + const BEGIN: usize = 8191; + + /// This is just a structure that implements Reader. The reader + /// implementation will simulate a file filled with 0, except for the MAGIC + /// string at offset BEGIN. + #[derive(Default)] + struct R { + read: usize, + } + + impl Read for R { + fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> { + //dbg!(buf.len()); + if self.read > 100000 { + return Ok(0); + } + let mut from = 0; + if self.read < BEGIN { + from = buf.len().min(BEGIN - self.read); + for x in 0..from { + buf[x] = 0; + } + self.read += from; + } + if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() { + let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from); + if to > from { + buf[from..to].copy_from_slice( + &MAGIC + [self.read - BEGIN..self.read - BEGIN + to - from], + ); + self.read += to - from; + from = to; + } + } + for x in from..buf.len() { + buf[x] = 0; + self.read += 1; + } + Ok(buf.len()) } } + + fn run() -> ::std::io::Result<()> { + let aut = AhoCorasickBuilder::new().build(&[&MAGIC]); + + // While reading from a vector, it works: + let mut buf = vec![]; + R::default().read_to_end(&mut buf)?; + let from_whole = aut.find_iter(&buf).next().unwrap().start(); + + //But using stream_find_iter fails! + let mut file = R::default(); + let begin = aut + .stream_find_iter(&mut file) + .next() + .expect("NOT FOUND!!!!")? // Panic here + .start(); + assert_eq!(from_whole, begin); + Ok(()) + } + + run().unwrap() } |