aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChih-Hung Hsieh <chh@google.com>2020-10-29 10:23:47 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2020-10-29 10:23:47 +0000
commit8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b (patch)
tree7d52089b3c3b807c9284314462ae0d08bea4a7eb
parentb75048b81341b2b1f6150b40d7ed3004ce15b391 (diff)
parentf1854b7bbcc06c1eb66cb3e1750d86a274a48122 (diff)
downloadaho-corasick-8454fe7d746c34af0fc31a8f8c1a3baf4df48a7b.tar.gz
Upgrade rust/crates/aho-corasick to 0.7.14 am: 2db690b09c am: 6176f34d45 am: 0e24f4c691 am: f1854b7bbc
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/aho-corasick/+/1474960 Change-Id: Icdfe80978584c6a1e270f18d6c3f7095a75e8e1d
-rw-r--r--.cargo_vcs_info.json2
-rw-r--r--Cargo.toml2
-rw-r--r--Cargo.toml.orig2
-rw-r--r--METADATA8
-rw-r--r--src/ahocorasick.rs28
-rw-r--r--src/buffer.rs4
-rw-r--r--src/dfa.rs4
-rw-r--r--src/prefilter.rs60
-rw-r--r--src/tests.rs120
9 files changed, 198 insertions, 32 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 725cd96..33e5654 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,5 @@
{
"git": {
- "sha1": "55a42968a26a1150aca116fab63537330782d56a"
+ "sha1": "63f0b5252345fa50e490de42b3d82b5159c0e5dd"
}
}
diff --git a/Cargo.toml b/Cargo.toml
index a0c306a..d381829 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@
[package]
name = "aho-corasick"
-version = "0.7.13"
+version = "0.7.14"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
autotests = false
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 00d71ef..0b64728 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "aho-corasick"
-version = "0.7.13" #:version
+version = "0.7.14" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "Fast multiple substring searching."
homepage = "https://github.com/BurntSushi/aho-corasick"
diff --git a/METADATA b/METADATA
index 2320c5f..8c98616 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.13.crate"
+ value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.14.crate"
}
- version: "0.7.13"
+ version: "0.7.14"
license_type: NOTICE
last_upgrade_date {
year: 2020
- month: 7
- day: 10
+ month: 10
+ day: 26
}
}
diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs
index 7880d13..9069396 100644
--- a/src/ahocorasick.rs
+++ b/src/ahocorasick.rs
@@ -6,7 +6,7 @@ use dfa::{self, DFA};
use error::Result;
use nfa::{self, NFA};
use packed;
-use prefilter::PrefilterState;
+use prefilter::{Prefilter, PrefilterState};
use state_id::StateID;
use Match;
@@ -1075,6 +1075,24 @@ impl<S: StateID> Imp<S> {
}
}
+ /// Returns the prefilter object, if one exists, for the underlying
+ /// automaton.
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.prefilter(),
+ Imp::DFA(ref dfa) => dfa.prefilter(),
+ }
+ }
+
+ /// Returns true if and only if we should attempt to use a prefilter.
+ fn use_prefilter(&self) -> bool {
+ let p = match self.prefilter() {
+ None => return false,
+ Some(p) => p,
+ };
+ !p.looks_for_non_start_of_match()
+ }
+
#[inline(always)]
fn overlapping_find_at(
&self,
@@ -1363,7 +1381,11 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
"stream searching is only supported for Standard match semantics"
);
- let prestate = PrefilterState::new(ac.max_pattern_len());
+ let prestate = if ac.imp.use_prefilter() {
+ PrefilterState::new(ac.max_pattern_len())
+ } else {
+ PrefilterState::disabled()
+ };
let buf = Buffer::new(ac.imp.max_pattern_len());
let state_id = ac.imp.start_state();
StreamChunkIter {
@@ -1847,7 +1869,7 @@ impl AhoCorasickBuilder {
/// finite automaton (NFA) is used instead.
///
/// The main benefit to a DFA is that it can execute searches more quickly
- /// than a DFA (perhaps 2-4 times as fast). The main drawback is that the
+ /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
/// DFA uses more space and can take much longer to build.
///
/// Enabling this option does not change the time complexity for
diff --git a/src/buffer.rs b/src/buffer.rs
index 1008196..e7339eb 100644
--- a/src/buffer.rs
+++ b/src/buffer.rs
@@ -50,7 +50,9 @@ impl Buffer {
// reasons, so we set a lower bound of `8 * min`.
//
// TODO: It would be good to find a way to test the streaming
- // implementation with the minimal buffer size.
+ // implementation with the minimal buffer size. For now, we just
+ // uncomment the next line and comment out the subsequent line.
+ // let capacity = 1 + min;
let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
Buffer { buf: vec![0; capacity], min, end: 0 }
}
diff --git a/src/dfa.rs b/src/dfa.rs
index 1bf37d5..8a4e727 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -43,6 +43,10 @@ impl<S: StateID> DFA<S> {
self.repr().pattern_count
}
+ pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
pub fn start_state(&self) -> S {
self.repr().start_id
}
diff --git a/src/prefilter.rs b/src/prefilter.rs
index bda215d..05fa46d 100644
--- a/src/prefilter.rs
+++ b/src/prefilter.rs
@@ -80,6 +80,17 @@ pub trait Prefilter:
fn reports_false_positives(&self) -> bool {
true
}
+
+ /// Returns true if and only if this prefilter may look for a non-starting
+ /// position of a match.
+ ///
+ /// This is useful in a streaming context where prefilters that don't look
+ /// for a starting position of a match can be quite difficult to deal with.
+ ///
+ /// This returns false by default.
+ fn looks_for_non_start_of_match(&self) -> bool {
+ false
+ }
}
impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
@@ -191,6 +202,17 @@ impl PrefilterState {
}
}
+ /// Create a prefilter state that always disables the prefilter.
+ pub fn disabled() -> PrefilterState {
+ PrefilterState {
+ skips: 0,
+ skipped: 0,
+ max_match_len: 0,
+ inert: true,
+ last_scan_at: 0,
+ }
+ }
+
/// Update this state with the number of bytes skipped on the last
/// invocation of the prefilter.
#[inline]
@@ -285,6 +307,7 @@ impl Builder {
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
pub fn build(&self) -> Option<PrefilterObj> {
+ // match (self.start_bytes.build(), self.rare_bytes.build()) {
match (self.start_bytes.build(), self.rare_bytes.build()) {
// If we could build both start and rare prefilters, then there are
// a few cases in which we'd want to use the start-byte prefilter
@@ -663,6 +686,33 @@ impl Prefilter for RareBytesOne {
fn heap_bytes(&self) -> usize {
0
}
+
+ fn looks_for_non_start_of_match(&self) -> bool {
+ // TODO: It should be possible to use a rare byte prefilter in a
+ // streaming context. The main problem is that we usually assume that
+ // if a prefilter has scanned some text and not found anything, then no
+ // match *starts* in that text. This doesn't matter in non-streaming
+ // contexts, but in a streaming context, if we're looking for a byte
+ // that doesn't start at the beginning of a match and don't find it,
+ // then it's still possible for a match to start at the end of the
+ // current buffer content. In order to fix this, the streaming searcher
+ // would need to become aware of prefilters that do this and use the
+ // appropriate offset in various places. It is quite a delicate change
+ // and probably shouldn't be attempted until streaming search has a
+ // better testing strategy. In particular, we'd really like to be able
+ // to vary the buffer size to force strange cases that occur at the
+ // edge of the buffer. If we make the buffer size minimal, then these
+ // cases occur more frequently and are easier to reproduce.
+ //
+ // This is also a bummer because this means that if the prefilter
+ // builder chose a rare byte prefilter, then a streaming search won't
+ // use any prefilter at all because the builder doesn't know how it's
+ // going to be used. Assuming we don't make streaming search aware of
+ // these special types of prefilters as described above, we could fix
+ // this by building a "backup" prefilter that could be used when the
+ // rare byte prefilter could not. But that's a band-aid. Sigh.
+ true
+ }
}
/// A prefilter for scanning for two "rare" bytes.
@@ -697,6 +747,11 @@ impl Prefilter for RareBytesTwo {
fn heap_bytes(&self) -> usize {
0
}
+
+ fn looks_for_non_start_of_match(&self) -> bool {
+ // TODO: See Prefilter impl for RareBytesOne.
+ true
+ }
}
/// A prefilter for scanning for three "rare" bytes.
@@ -732,6 +787,11 @@ impl Prefilter for RareBytesThree {
fn heap_bytes(&self) -> usize {
0
}
+
+ fn looks_for_non_start_of_match(&self) -> bool {
+ // TODO: See Prefilter impl for RareBytesOne.
+ true
+ }
}
/// A builder for constructing a starting byte prefilter.
diff --git a/src/tests.rs b/src/tests.rs
index 0ae31f0..29eba1d 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -998,6 +998,31 @@ testconfig!(
}
);
+fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for test in tests {
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack
+ );
+ }
+ }
+}
+
#[test]
fn search_tests_have_unique_names() {
let assert = |constname, tests: &[SearchTest]| {
@@ -1126,27 +1151,80 @@ fn regression_case_insensitive_prefilter() {
}
}
-fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
- which: TestCollection,
- mut f: F,
-) {
- let get_match_triples =
- |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
- matches
- .into_iter()
- .map(|m| (m.pattern(), m.start(), m.end()))
- .collect()
- };
- for &tests in which {
- for test in tests {
- assert_eq!(
- test.matches,
- get_match_triples(f(&test)).as_slice(),
- "test: {}, patterns: {:?}, haystack: {:?}",
- test.name,
- test.patterns,
- test.haystack
- );
+// See: https://github.com/BurntSushi/aho-corasick/issues/64
+//
+// This occurs when the rare byte prefilter is active.
+#[test]
+fn regression_stream_rare_byte_prefilter() {
+ use std::io::Read;
+
+ // NOTE: The test only fails if this ends with j.
+ const MAGIC: [u8; 5] = *b"1234j";
+
+ // NOTE: The test fails for value in 8188..=8191 These value put the string
+ // to search accross two call to read because the buffer size is 8192 by
+ // default.
+ const BEGIN: usize = 8191;
+
+ /// This is just a structure that implements `Read`. The reader
+ /// implementation will simulate a file filled with 0, except for the MAGIC
+ /// string at offset BEGIN.
+ #[derive(Default)]
+ struct R {
+ read: usize,
+ }
+
+ impl Read for R {
+ fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
+ //dbg!(buf.len());
+ if self.read > 100000 {
+ return Ok(0);
+ }
+ let mut from = 0;
+ if self.read < BEGIN {
+ from = buf.len().min(BEGIN - self.read);
+ for x in 0..from {
+ buf[x] = 0;
+ }
+ self.read += from;
+ }
+ if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
+ let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
+ if to > from {
+ buf[from..to].copy_from_slice(
+ &MAGIC
+ [self.read - BEGIN..self.read - BEGIN + to - from],
+ );
+ self.read += to - from;
+ from = to;
+ }
+ }
+ for x in from..buf.len() {
+ buf[x] = 0;
+ self.read += 1;
+ }
+ Ok(buf.len())
}
}
+
+ fn run() -> ::std::io::Result<()> {
+ let aut = AhoCorasickBuilder::new().build(&[&MAGIC]);
+
+ // While reading from a vector, it works:
+ let mut buf = vec![];
+ R::default().read_to_end(&mut buf)?;
+ let from_whole = aut.find_iter(&buf).next().unwrap().start();
+
+ //But using stream_find_iter fails!
+ let mut file = R::default();
+ let begin = aut
+ .stream_find_iter(&mut file)
+ .next()
+ .expect("NOT FOUND!!!!")? // Panic here
+ .start();
+ assert_eq!(from_whole, begin);
+ Ok(())
+ }
+
+ run().unwrap()
}