Upgrade rust/crates/aho-corasick to 0.7.14 am: 2db690b09c am: 6176f34d45 am: 0e24f4c691 am: f1854b7bbc

Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/aho-corasick/+/1474960 Change-Id: Icdfe80978584c6a1e270f18d6c3f7095a75e8e1d
author: Chih-Hung Hsieh <chh@google.com> 2020-10-29 10:23:47 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2020-10-29 10:23:47 +0000
commit: 8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b (patch)
tree: 7d52089b3c3b807c9284314462ae0d08bea4a7eb
parent: b75048b81341b2b1f6150b40d7ed3004ce15b391 (diff)
parent: f1854b7bbcc06c1eb66cb3e1750d86a274a48122 (diff)
download: aho-corasick-8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b.tar.gz
9 files changed, 198 insertions, 32 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 725cd96..33e5654 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,5 @@
 {
   "git": {
-    "sha1": "55a42968a26a1150aca116fab63537330782d56a"
+    "sha1": "63f0b5252345fa50e490de42b3d82b5159c0e5dd"
   }
 }
diff --git a/Cargo.toml b/Cargo.toml
index a0c306a..d381829 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@
 
 [package]
 name = "aho-corasick"
-version = "0.7.13"
+version = "0.7.14"
 authors = ["Andrew Gallant <jamslam@gmail.com>"]
 exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
 autotests = false
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 00d71ef..0b64728 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
 [package]
 name = "aho-corasick"
-version = "0.7.13"  #:version
+version = "0.7.14"  #:version
 authors = ["Andrew Gallant <jamslam@gmail.com>"]
 description = "Fast multiple substring searching."
 homepage = "https://github.com/BurntSushi/aho-corasick"
diff --git a/METADATA b/METADATA
index 2320c5f..8c98616 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
   }
   url {
     type: ARCHIVE
-    value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.13.crate"
+    value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.14.crate"
   }
-  version: "0.7.13"
+  version: "0.7.14"
   license_type: NOTICE
   last_upgrade_date {
     year: 2020
-    month: 7
-    day: 10
+    month: 10
+    day: 26
   }
 }
diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs
index 7880d13..9069396 100644
--- a/src/ahocorasick.rs
+++ b/src/ahocorasick.rs
@@ -6,7 +6,7 @@ use dfa::{self, DFA};
 use error::Result;
 use nfa::{self, NFA};
 use packed;
-use prefilter::PrefilterState;
+use prefilter::{Prefilter, PrefilterState};
 use state_id::StateID;
 use Match;
 
@@ -1075,6 +1075,24 @@ impl<S: StateID> Imp<S> {
         }
     }
 
+    /// Returns the prefilter object, if one exists, for the underlying
+    /// automaton.
+    fn prefilter(&self) -> Option<&dyn Prefilter> {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.prefilter(),
+            Imp::DFA(ref dfa) => dfa.prefilter(),
+        }
+    }
+
+    /// Returns true if and only if we should attempt to use a prefilter.
+    fn use_prefilter(&self) -> bool {
+        let p = match self.prefilter() {
+            None => return false,
+            Some(p) => p,
+        };
+        !p.looks_for_non_start_of_match()
+    }
+
     #[inline(always)]
     fn overlapping_find_at(
         &self,
@@ -1363,7 +1381,11 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
             "stream searching is only supported for Standard match semantics"
         );
 
-        let prestate = PrefilterState::new(ac.max_pattern_len());
+        let prestate = if ac.imp.use_prefilter() {
+            PrefilterState::new(ac.max_pattern_len())
+        } else {
+            PrefilterState::disabled()
+        };
         let buf = Buffer::new(ac.imp.max_pattern_len());
         let state_id = ac.imp.start_state();
         StreamChunkIter {
@@ -1847,7 +1869,7 @@ impl AhoCorasickBuilder {
     /// finite automaton (NFA) is used instead.
     ///
     /// The main benefit to a DFA is that it can execute searches more quickly
-    /// than a DFA (perhaps 2-4 times as fast). The main drawback is that the
+    /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
     /// DFA uses more space and can take much longer to build.
     ///
     /// Enabling this option does not change the time complexity for
diff --git a/src/buffer.rs b/src/buffer.rs
index 1008196..e7339eb 100644
--- a/src/buffer.rs
+++ b/src/buffer.rs
@@ -50,7 +50,9 @@ impl Buffer {
         // reasons, so we set a lower bound of `8 * min`.
         //
         // TODO: It would be good to find a way to test the streaming
-        // implementation with the minimal buffer size.
+        // implementation with the minimal buffer size. For now, we just
+        // uncomment out the next line and comment out the subsequent line.
+        // let capacity = 1 + min;
         let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
         Buffer { buf: vec![0; capacity], min, end: 0 }
     }
diff --git a/src/dfa.rs b/src/dfa.rs
index 1bf37d5..8a4e727 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -43,6 +43,10 @@ impl<S: StateID> DFA<S> {
         self.repr().pattern_count
     }
 
+    pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+        self.repr().prefilter.as_ref().map(|p| p.as_ref())
+    }
+
     pub fn start_state(&self) -> S {
         self.repr().start_id
     }
diff --git a/src/prefilter.rs b/src/prefilter.rs
index bda215d..05fa46d 100644
--- a/src/prefilter.rs
+++ b/src/prefilter.rs
@@ -80,6 +80,17 @@ pub trait Prefilter:
     fn reports_false_positives(&self) -> bool {
         true
     }
+
+    /// Returns true if and only if this prefilter may look for a non-starting
+    /// position of a match.
+    ///
+    /// This is useful in a streaming context where prefilters that don't look
+    /// for a starting position of a match can be quite difficult to deal with.
+    ///
+    /// This returns false by default.
+    fn looks_for_non_start_of_match(&self) -> bool {
+        false
+    }
 }
 
 impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
@@ -191,6 +202,17 @@ impl PrefilterState {
         }
     }
 
+    /// Create a prefilter state that always disables the prefilter.
+    pub fn disabled() -> PrefilterState {
+        PrefilterState {
+            skips: 0,
+            skipped: 0,
+            max_match_len: 0,
+            inert: true,
+            last_scan_at: 0,
+        }
+    }
+
     /// Update this state with the number of bytes skipped on the last
     /// invocation of the prefilter.
     #[inline]
@@ -285,6 +307,7 @@ impl Builder {
     /// All patterns added to an Aho-Corasick automaton should be added to this
     /// builder before attempting to construct the prefilter.
     pub fn build(&self) -> Option<PrefilterObj> {
+        // match (self.start_bytes.build(), self.rare_bytes.build()) {
         match (self.start_bytes.build(), self.rare_bytes.build()) {
             // If we could build both start and rare prefilters, then there are
             // a few cases in which we'd want to use the start-byte prefilter
@@ -663,6 +686,33 @@ impl Prefilter for RareBytesOne {
     fn heap_bytes(&self) -> usize {
         0
     }
+
+    fn looks_for_non_start_of_match(&self) -> bool {
+        // TODO: It should be possible to use a rare byte prefilter in a
+        // streaming context. The main problem is that we usually assume that
+        // if a prefilter has scanned some text and not found anything, then no
+        // match *starts* in that text. This doesn't matter in non-streaming
+        // contexts, but in a streaming context, if we're looking for a byte
+        // that doesn't start at the beginning of a match and don't find it,
+        // then it's still possible for a match to start at the end of the
+        // current buffer content. In order to fix this, the streaming searcher
+        // would need to become aware of prefilters that do this and use the
+        // appropriate offset in various places. It is quite a delicate change
+        // and probably shouldn't be attempted until streaming search has a
+        // better testing strategy. In particular, we'd really like to be able
+        // to vary the buffer size to force strange cases that occur at the
+        // edge of the buffer. If we make the buffer size minimal, then these
+        // cases occur more frequently and easier.
+        //
+        // This is also a bummer because this means that if the prefilter
+        // builder chose a rare byte prefilter, then a streaming search won't
+        // use any prefilter at all because the builder doesn't know how it's
+        // going to be used. Assuming we don't make streaming search aware of
+        // these special types of prefilters as described above, we could fix
+        // this by building a "backup" prefilter that could be used when the
+        // rare byte prefilter could not. But that's a bandaide. Sigh.
+        true
+    }
 }
 
 /// A prefilter for scanning for two "rare" bytes.
@@ -697,6 +747,11 @@ impl Prefilter for RareBytesTwo {
     fn heap_bytes(&self) -> usize {
         0
     }
+
+    fn looks_for_non_start_of_match(&self) -> bool {
+        // TODO: See Prefilter impl for RareBytesOne.
+        true
+    }
 }
 
 /// A prefilter for scanning for three "rare" bytes.
@@ -732,6 +787,11 @@ impl Prefilter for RareBytesThree {
     fn heap_bytes(&self) -> usize {
         0
     }
+
+    fn looks_for_non_start_of_match(&self) -> bool {
+        // TODO: See Prefilter impl for RareBytesOne.
+        true
+    }
 }
 
 /// A builder for constructing a starting byte prefilter.
diff --git a/src/tests.rs b/src/tests.rs
index 0ae31f0..29eba1d 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -998,6 +998,31 @@ testconfig!(
     }
 );
 
+fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+    which: TestCollection,
+    mut f: F,
+) {
+    let get_match_triples =
+        |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+            matches
+                .into_iter()
+                .map(|m| (m.pattern(), m.start(), m.end()))
+                .collect()
+        };
+    for &tests in which {
+        for test in tests {
+            assert_eq!(
+                test.matches,
+                get_match_triples(f(&test)).as_slice(),
+                "test: {}, patterns: {:?}, haystack: {:?}",
+                test.name,
+                test.patterns,
+                test.haystack
+            );
+        }
+    }
+}
+
 #[test]
 fn search_tests_have_unique_names() {
     let assert = |constname, tests: &[SearchTest]| {
@@ -1126,27 +1151,80 @@ fn regression_case_insensitive_prefilter() {
     }
 }
 
-fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
-    which: TestCollection,
-    mut f: F,
-) {
-    let get_match_triples =
-        |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
-            matches
-                .into_iter()
-                .map(|m| (m.pattern(), m.start(), m.end()))
-                .collect()
-        };
-    for &tests in which {
-        for test in tests {
-            assert_eq!(
-                test.matches,
-                get_match_triples(f(&test)).as_slice(),
-                "test: {}, patterns: {:?}, haystack: {:?}",
-                test.name,
-                test.patterns,
-                test.haystack
-            );
+// See: https://github.com/BurntSushi/aho-corasick/issues/64
+//
+// This occurs when the rare byte prefilter is active
+#[test]
+fn regression_stream_rare_byte_prefilter() {
+    use std::io::Read;
+
+    // NOTE: The test only fails if this ends with j.
+    const MAGIC: [u8; 5] = *b"1234j";
+
+    // NOTE: The test fails for value in 8188..=8191 These value put the string
+    // to search accross two call to read because the buffer size is 8192 by
+    // default.
+    const BEGIN: usize = 8191;
+
+    /// This is just a structure that implements Reader. The reader
+    /// implementation will simulate a file filled with 0, except for the MAGIC
+    /// string at offset BEGIN.
+    #[derive(Default)]
+    struct R {
+        read: usize,
+    }
+
+    impl Read for R {
+        fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
+            //dbg!(buf.len());
+            if self.read > 100000 {
+                return Ok(0);
+            }
+            let mut from = 0;
+            if self.read < BEGIN {
+                from = buf.len().min(BEGIN - self.read);
+                for x in 0..from {
+                    buf[x] = 0;
+                }
+                self.read += from;
+            }
+            if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
+                let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
+                if to > from {
+                    buf[from..to].copy_from_slice(
+                        &MAGIC
+                            [self.read - BEGIN..self.read - BEGIN + to - from],
+                    );
+                    self.read += to - from;
+                    from = to;
+                }
+            }
+            for x in from..buf.len() {
+                buf[x] = 0;
+                self.read += 1;
+            }
+            Ok(buf.len())
         }
     }
+
+    fn run() -> ::std::io::Result<()> {
+        let aut = AhoCorasickBuilder::new().build(&[&MAGIC]);
+
+        // While reading from a vector, it works:
+        let mut buf = vec![];
+        R::default().read_to_end(&mut buf)?;
+        let from_whole = aut.find_iter(&buf).next().unwrap().start();
+
+        //But using stream_find_iter fails!
+        let mut file = R::default();
+        let begin = aut
+            .stream_find_iter(&mut file)
+            .next()
+            .expect("NOT FOUND!!!!")? // Panic here
+            .start();
+        assert_eq!(from_whole, begin);
+        Ok(())
+    }
+
+    run().unwrap()
 }
author	Chih-Hung Hsieh <chh@google.com>	2020-10-29 10:23:47 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	2020-10-29 10:23:47 +0000
commit	8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b (patch)
tree	7d52089b3c3b807c9284314462ae0d08bea4a7eb
parent	b75048b81341b2b1f6150b40d7ed3004ce15b391 (diff)
parent	f1854b7bbcc06c1eb66cb3e1750d86a274a48122 (diff)
download	aho-corasick-8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b.tar.gz