aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2022-03-25 12:32:28 +0000
committerAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2022-03-25 12:32:28 +0000
commita55624750e44817ddcdde33749007356f9df3643 (patch)
treea1f8d4d3bb32c82cbfc9afbe6fe7949ab517d67f
parent12106570c7b85a4ce86f116be0fbb99c813936db (diff)
parent1faff9be927c85d1dfb151bc7975d02f697854df (diff)
downloadbstr-android13-mainline-go-extservices-release.tar.gz
Snap for 8358640 from 1faff9be927c85d1dfb151bc7975d02f697854df to mainline-go-extservices-releaseaml_go_ext_330912000android13-mainline-go-extservices-release
Change-Id: I4869a2663031ff8dd7559be4032f4a7fb6dad954
-rw-r--r--.cargo_vcs_info.json2
-rw-r--r--Android.bp12
-rw-r--r--Cargo.toml23
-rw-r--r--Cargo.toml.orig14
-rw-r--r--METADATA8
-rw-r--r--README.md31
-rw-r--r--TEST_MAPPING17
-rw-r--r--cargo2android.json4
-rw-r--r--examples/graphemes-std.rs5
-rw-r--r--examples/graphemes.rs2
-rw-r--r--examples/lines.rs2
-rw-r--r--examples/uppercase.rs2
-rw-r--r--examples/words-std.rs2
-rw-r--r--examples/words.rs2
-rwxr-xr-xscripts/generate-unicode-data2
-rw-r--r--src/bstring.rs2
-rw-r--r--src/byteset/mod.rs3
-rw-r--r--src/byteset/scalar.rs2
-rw-r--r--src/cow.rs84
-rw-r--r--src/ext_slice.rs150
-rw-r--r--src/ext_vec.rs13
-rw-r--r--src/impls.rs35
-rw-r--r--src/io.rs6
-rw-r--r--src/lib.rs73
-rw-r--r--src/search/byte_frequencies.rs258
-rw-r--r--src/search/mod.rs8
-rw-r--r--src/search/prefilter.rs424
-rw-r--r--src/search/tests.rs225
-rw-r--r--src/search/twoway.rs871
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.rs58
-rw-r--r--src/unicode/fsm/grapheme_break_rev.rs58
-rw-r--r--src/unicode/fsm/regional_indicator_rev.rs58
-rw-r--r--src/unicode/fsm/sentence_break_fwd.rs58
-rw-r--r--src/unicode/fsm/simple_word_fwd.rs58
-rw-r--r--src/unicode/fsm/whitespace_anchored_fwd.rs58
-rw-r--r--src/unicode/fsm/whitespace_anchored_rev.bigendian.dfabin598 -> 884 bytes
-rw-r--r--src/unicode/fsm/whitespace_anchored_rev.littleendian.dfabin598 -> 884 bytes
-rw-r--r--src/unicode/fsm/whitespace_anchored_rev.rs60
-rw-r--r--src/unicode/fsm/word_break_fwd.rs58
-rw-r--r--src/unicode/grapheme.rs14
-rw-r--r--src/unicode/sentence.rs8
-rw-r--r--src/unicode/whitespace.rs4
-rw-r--r--src/unicode/word.rs10
-rw-r--r--src/utf8.rs14
44 files changed, 448 insertions, 2350 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index f1ec46c..ef4fb69 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,5 @@
{
"git": {
- "sha1": "a444256ca7407fe180ee32534688549655b7a38e"
+ "sha1": "e38e7a7ca986f9499b30202f49d79e531d14d192"
}
}
diff --git a/Android.bp b/Android.bp
index 5bb6b68..7fb581b 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,4 +1,4 @@
-// This file is generated by cargo2android.py --run --device --dependencies.
+// This file is generated by cargo2android.py --config cargo2android.json.
// Do not modify this file as changes will be overridden on upgrade.
package {
@@ -43,8 +43,10 @@ rust_library {
name: "libbstr",
host_supported: true,
crate_name: "bstr",
+ cargo_env_compat: true,
+ cargo_pkg_version: "0.2.17",
srcs: ["src/lib.rs"],
- edition: "2015",
+ edition: "2018",
features: [
"default",
"lazy_static",
@@ -58,9 +60,3 @@ rust_library {
"libregex_automata",
],
}
-
-// dependent_library ["feature_list"]
-// byteorder-1.4.3
-// lazy_static-1.4.0
-// memchr-2.3.4 "std,use_std"
-// regex-automata-0.1.9
diff --git a/Cargo.toml b/Cargo.toml
index f02d695..0f206ba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,16 +3,16 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
+edition = "2018"
name = "bstr"
-version = "0.2.15"
+version = "0.2.17"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/.github"]
description = "A string type that is not required to be valid UTF-8."
@@ -29,11 +29,11 @@ debug = true
[lib]
bench = false
[dependencies.lazy_static]
-version = "1.2"
+version = "1.2.0"
optional = true
[dependencies.memchr]
-version = "2.1.2"
+version = "2.4.0"
default-features = false
[dependencies.regex-automata]
@@ -59,10 +59,5 @@ version = "1.2.1"
default = ["std", "unicode"]
serde1 = ["std", "serde1-nostd", "serde/std"]
serde1-nostd = ["serde"]
-std = ["memchr/use_std"]
+std = ["memchr/std"]
unicode = ["lazy_static", "regex-automata"]
-[badges.appveyor]
-repository = "BurntSushi/bstr"
-
-[badges.travis-ci]
-repository = "BurntSushi/bstr"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 585da1b..cbb6283 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "bstr"
-version = "0.2.15" #:version
+version = "0.2.17" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "A string type that is not required to be valid UTF-8."
documentation = "https://docs.rs/bstr"
@@ -11,24 +11,24 @@ keywords = ["string", "str", "byte", "bytes", "text"]
license = "MIT OR Apache-2.0"
categories = ["text-processing", "encoding"]
exclude = ["/.github"]
+edition = "2018"
-[badges]
-travis-ci = { repository = "BurntSushi/bstr" }
-appveyor = { repository = "BurntSushi/bstr" }
+[workspace]
+members = ["bench"]
[lib]
bench = false
[features]
default = ["std", "unicode"]
-std = ["memchr/use_std"]
+std = ["memchr/std"]
unicode = ["lazy_static", "regex-automata"]
serde1 = ["std", "serde1-nostd", "serde/std"]
serde1-nostd = ["serde"]
[dependencies]
-memchr = { version = "2.1.2", default-features = false }
-lazy_static = { version = "1.2", optional = true }
+memchr = { version = "2.4.0", default-features = false }
+lazy_static = { version = "1.2.0", optional = true }
regex-automata = { version = "0.1.5", default-features = false, optional = true }
serde = { version = "1.0.85", default-features = false, optional = true }
diff --git a/METADATA b/METADATA
index b51a02f..bfc1d19 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/bstr/bstr-0.2.15.crate"
+ value: "https://static.crates.io/crates/bstr/bstr-0.2.17.crate"
}
- version: "0.2.15"
+ version: "0.2.17"
license_type: NOTICE
last_upgrade_date {
year: 2021
- month: 4
- day: 1
+ month: 9
+ day: 22
}
}
diff --git a/README.md b/README.md
index 3e4ef8b..13bf0fc 100644
--- a/README.md
+++ b/README.md
@@ -158,7 +158,7 @@ and Unicode support.
### Minimum Rust version policy
-This crate's minimum supported `rustc` version (MSRV) is `1.28.0`.
+This crate's minimum supported `rustc` version (MSRV) is `1.41.1`.
In general, this crate will be conservative with respect to the minimum
supported version of Rust. MSRV may be bumped in minor version releases.
@@ -171,12 +171,12 @@ My hope is to move to `1.0` within the next year and commit to its API so that
`bstr` can be used as a public dependency.
A large part of the API surface area was taken from the standard library, so
-from an API design perspective, a good portion of this crate should be mature.
-The main differences from the standard library are in how the various substring
-search routines work. The standard library provides generic infrastructure for
-supporting different types of searches with a single method, where as this
-library prefers to define new methods for each type of search and drop the
-generic infrastructure.
+from an API design perspective, a good portion of this crate should be on solid
+ground already. The main differences from the standard library are in how the
+various substring search routines work. The standard library provides generic
+infrastructure for supporting different types of searches with a single method,
+where as this library prefers to define new methods for each type of search and
+drop the generic infrastructure.
Some _probable_ future considerations for APIs include, but are not limited to:
@@ -195,10 +195,10 @@ Here are some examples that are _probably_ out of scope for this crate:
The exact scope isn't quite clear, but I expect we can iterate on it.
-In general, as stated below, this crate is an experiment in bringing lots of
-related APIs together into a single crate while simultaneously attempting to
-keep the total number of dependencies low. Indeed, every dependency of `bstr`,
-except for `memchr`, is optional.
+In general, as stated below, this crate brings lots of related APIs together
+into a single crate while simultaneously attempting to keep the total number of
+dependencies low. Indeed, every dependency of `bstr`, except for `memchr`, is
+optional.
### High level motivation
@@ -229,13 +229,10 @@ Consider, for example, trimming and splitting, along with their different
variants.
In other words, `bstr` is partially a way of pushing back against the
-micro-crate ecosystem that appears to be evolving. It's not clear to me whether
-this experiment will be successful or not, but it is definitely a goal of
+micro-crate ecosystem that appears to be evolving. Namely, it is a goal of
`bstr` to keep its dependency list lightweight. For example, `serde` is an
-optional dependency because there is no feasible alternative, but `twoway` is
-not, where we instead prefer to implement our own substring search. In service
-of this philosophy, currently, the only required dependency of `bstr` is
-`memchr`.
+optional dependency because there is no feasible alternative. In service of
+this philosophy, currently, the only required dependency of `bstr` is `memchr`.
### License
diff --git a/TEST_MAPPING b/TEST_MAPPING
new file mode 100644
index 0000000..3cbd48d
--- /dev/null
+++ b/TEST_MAPPING
@@ -0,0 +1,17 @@
+// Generated by update_crate_tests.py for tests that depend on this crate.
+{
+ "imports": [
+ {
+ "path": "external/rust/crates/base64"
+ },
+ {
+ "path": "external/rust/crates/tinytemplate"
+ },
+ {
+ "path": "external/rust/crates/tinyvec"
+ },
+ {
+ "path": "external/rust/crates/unicode-xid"
+ }
+ ]
+}
diff --git a/cargo2android.json b/cargo2android.json
new file mode 100644
index 0000000..bf78496
--- /dev/null
+++ b/cargo2android.json
@@ -0,0 +1,4 @@
+{
+ "device": true,
+ "run": true
+} \ No newline at end of file
diff --git a/examples/graphemes-std.rs b/examples/graphemes-std.rs
index 3522736..647739d 100644
--- a/examples/graphemes-std.rs
+++ b/examples/graphemes-std.rs
@@ -1,5 +1,3 @@
-extern crate unicode_segmentation;
-
use std::error::Error;
use std::io::{self, BufRead, Write};
@@ -18,8 +16,7 @@ fn main() -> Result<(), Box<dyn Error>> {
.take(10)
.last()
.unwrap_or(line.len());
- #[allow(deprecated)] // for Rust 1.28.0
- stdout.write_all(line[..end].trim_right().as_bytes())?;
+ stdout.write_all(line[..end].trim_end().as_bytes())?;
stdout.write_all(b"\n")?;
line.clear();
diff --git a/examples/graphemes.rs b/examples/graphemes.rs
index 2372490..6adc701 100644
--- a/examples/graphemes.rs
+++ b/examples/graphemes.rs
@@ -1,5 +1,3 @@
-extern crate bstr;
-
use std::error::Error;
use std::io::{self, Write};
diff --git a/examples/lines.rs b/examples/lines.rs
index 4b1045f..c392a22 100644
--- a/examples/lines.rs
+++ b/examples/lines.rs
@@ -1,5 +1,3 @@
-extern crate bstr;
-
use std::error::Error;
use std::io::{self, Write};
diff --git a/examples/uppercase.rs b/examples/uppercase.rs
index f9771e0..1eb798a 100644
--- a/examples/uppercase.rs
+++ b/examples/uppercase.rs
@@ -1,5 +1,3 @@
-extern crate bstr;
-
use std::error::Error;
use std::io::{self, Write};
diff --git a/examples/words-std.rs b/examples/words-std.rs
index 7eae116..aeeeb26 100644
--- a/examples/words-std.rs
+++ b/examples/words-std.rs
@@ -1,5 +1,3 @@
-extern crate unicode_segmentation;
-
use std::error::Error;
use std::io::{self, BufRead};
diff --git a/examples/words.rs b/examples/words.rs
index eb20c0d..db305da 100644
--- a/examples/words.rs
+++ b/examples/words.rs
@@ -1,5 +1,3 @@
-extern crate bstr;
-
use std::error::Error;
use std::io;
diff --git a/scripts/generate-unicode-data b/scripts/generate-unicode-data
index 6b59fae..b8341c5 100755
--- a/scripts/generate-unicode-data
+++ b/scripts/generate-unicode-data
@@ -101,7 +101,7 @@ whitespace() {
ucd-generate dfa \
--name WHITESPACE_ANCHORED_REV \
--reverse \
- --anchored --classes --premultiply --minimize --state-size 1 \
+ --anchored --classes --premultiply --minimize --state-size 2 \
src/unicode/fsm/ \
"\s+"
}
diff --git a/src/bstring.rs b/src/bstring.rs
index f04c651..30093ba 100644
--- a/src/bstring.rs
+++ b/src/bstring.rs
@@ -1,4 +1,4 @@
-use bstr::BStr;
+use crate::bstr::BStr;
/// A wrapper for `Vec<u8>` that provides convenient string oriented trait
/// impls.
diff --git a/src/byteset/mod.rs b/src/byteset/mod.rs
index 969b0e3..043d309 100644
--- a/src/byteset/mod.rs
+++ b/src/byteset/mod.rs
@@ -81,8 +81,7 @@ pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
#[cfg(test)]
mod tests {
-
- quickcheck! {
+ quickcheck::quickcheck! {
fn qc_byteset_forward_matches_naive(
haystack: Vec<u8>,
needles: Vec<u8>
diff --git a/src/byteset/scalar.rs b/src/byteset/scalar.rs
index 3fe1f53..9bd34a8 100644
--- a/src/byteset/scalar.rs
+++ b/src/byteset/scalar.rs
@@ -238,7 +238,7 @@ mod tests {
#[test]
fn test_inv_memchr() {
- use {ByteSlice, B};
+ use crate::{ByteSlice, B};
for (search, byte, matching) in build_tests() {
assert_eq!(
inv_memchr(byte, &search),
diff --git a/src/cow.rs b/src/cow.rs
deleted file mode 100644
index 1556353..0000000
--- a/src/cow.rs
+++ /dev/null
@@ -1,84 +0,0 @@
-use core::ops;
-#[cfg(feature = "std")]
-use std::borrow::Cow;
-
-/// A specialized copy-on-write byte string.
-///
-/// The purpose of this type is to permit usage of a "borrowed or owned
-/// byte string" in a way that keeps std/no-std compatibility. That is, in
-/// no-std mode, this type devolves into a simple &[u8] with no owned variant
-/// availble.
-#[derive(Clone, Debug)]
-pub struct CowBytes<'a>(Imp<'a>);
-
-#[cfg(feature = "std")]
-#[derive(Clone, Debug)]
-struct Imp<'a>(Cow<'a, [u8]>);
-
-#[cfg(not(feature = "std"))]
-#[derive(Clone, Debug)]
-struct Imp<'a>(&'a [u8]);
-
-impl<'a> ops::Deref for CowBytes<'a> {
- type Target = [u8];
-
- fn deref(&self) -> &[u8] {
- self.as_slice()
- }
-}
-
-impl<'a> CowBytes<'a> {
- /// Create a new borrowed CowBytes.
- pub fn new<B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> CowBytes<'a> {
- CowBytes(Imp::new(bytes.as_ref()))
- }
-
- /// Create a new owned CowBytes.
- #[cfg(feature = "std")]
- pub fn new_owned(bytes: Vec<u8>) -> CowBytes<'static> {
- CowBytes(Imp(Cow::Owned(bytes)))
- }
-
- /// Return a borrowed byte string, regardless of whether this is an owned
- /// or borrowed byte string internally.
- pub fn as_slice(&self) -> &[u8] {
- self.0.as_slice()
- }
-
- /// Return an owned version of this copy-on-write byte string.
- ///
- /// If this is already an owned byte string internally, then this is a
- /// no-op. Otherwise, the internal byte string is copied.
- #[cfg(feature = "std")]
- pub fn into_owned(self) -> CowBytes<'static> {
- match (self.0).0 {
- Cow::Borrowed(b) => CowBytes::new_owned(b.to_vec()),
- Cow::Owned(b) => CowBytes::new_owned(b),
- }
- }
-}
-
-impl<'a> Imp<'a> {
- #[cfg(feature = "std")]
- pub fn new(bytes: &'a [u8]) -> Imp<'a> {
- Imp(Cow::Borrowed(bytes))
- }
-
- #[cfg(not(feature = "std"))]
- pub fn new(bytes: &'a [u8]) -> Imp<'a> {
- Imp(bytes)
- }
-
- #[cfg(feature = "std")]
- pub fn as_slice(&self) -> &[u8] {
- match self.0 {
- Cow::Owned(ref x) => x,
- Cow::Borrowed(x) => x,
- }
- }
-
- #[cfg(not(feature = "std"))]
- pub fn as_slice(&self) -> &[u8] {
- self.0
- }
-}
diff --git a/src/ext_slice.rs b/src/ext_slice.rs
index fa08190..0cc73af 100644
--- a/src/ext_slice.rs
+++ b/src/ext_slice.rs
@@ -5,22 +5,21 @@ use std::ffi::OsStr;
#[cfg(feature = "std")]
use std::path::Path;
-use core::{cmp, iter, ops, ptr, slice, str};
-use memchr::{memchr, memrchr};
+use core::{iter, ops, ptr, slice, str};
+use memchr::{memchr, memmem, memrchr};
-use ascii;
-use bstr::BStr;
-use byteset;
+use crate::ascii;
+use crate::bstr::BStr;
+use crate::byteset;
#[cfg(feature = "std")]
-use ext_vec::ByteVec;
-use search::{PrefilterState, TwoWay};
+use crate::ext_vec::ByteVec;
#[cfg(feature = "unicode")]
-use unicode::{
+use crate::unicode::{
whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes,
SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
WordsWithBreaks,
};
-use utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
+use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
/// A short-hand constructor for building a `&[u8]`.
///
@@ -344,7 +343,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "std")]
#[inline]
- fn to_str_lossy(&self) -> Cow<str> {
+ fn to_str_lossy(&self) -> Cow<'_, str> {
match utf8::validate(self.as_bytes()) {
Ok(()) => {
// SAFETY: This is safe because of the guarantees provided by
@@ -488,10 +487,10 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "std")]
#[inline]
- fn to_os_str_lossy(&self) -> Cow<OsStr> {
+ fn to_os_str_lossy(&self) -> Cow<'_, OsStr> {
#[cfg(unix)]
#[inline]
- fn imp(bytes: &[u8]) -> Cow<OsStr> {
+ fn imp(bytes: &[u8]) -> Cow<'_, OsStr> {
use std::os::unix::ffi::OsStrExt;
Cow::Borrowed(OsStr::from_bytes(bytes))
@@ -559,7 +558,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "std")]
#[inline]
- fn to_path_lossy(&self) -> Cow<Path> {
+ fn to_path_lossy(&self) -> Cow<'_, Path> {
use std::path::PathBuf;
match self.to_os_str_lossy() {
@@ -1067,7 +1066,7 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
/// ```
#[inline]
- fn fields(&self) -> Fields {
+ fn fields(&self) -> Fields<'_> {
Fields::new(self.as_bytes())
}
@@ -1099,7 +1098,7 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
/// ```
#[inline]
- fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<F> {
+ fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> {
FieldsWith::new(self.as_bytes(), f)
}
@@ -1619,7 +1618,7 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(bytes, bs);
/// ```
#[inline]
- fn bytes(&self) -> Bytes {
+ fn bytes(&self) -> Bytes<'_> {
Bytes { it: self.as_bytes().iter() }
}
@@ -1649,7 +1648,7 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
/// ```
#[inline]
- fn chars(&self) -> Chars {
+ fn chars(&self) -> Chars<'_> {
Chars::new(self.as_bytes())
}
@@ -1704,7 +1703,7 @@ pub trait ByteSlice: Sealed {
/// ]);
/// ```
#[inline]
- fn char_indices(&self) -> CharIndices {
+ fn char_indices(&self) -> CharIndices<'_> {
CharIndices::new(self.as_bytes())
}
@@ -1741,7 +1740,7 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
/// ```
#[inline]
- fn utf8_chunks(&self) -> Utf8Chunks {
+ fn utf8_chunks(&self) -> Utf8Chunks<'_> {
Utf8Chunks { bytes: self.as_bytes() }
}
@@ -1773,7 +1772,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn graphemes(&self) -> Graphemes {
+ fn graphemes(&self) -> Graphemes<'_> {
Graphemes::new(self.as_bytes())
}
@@ -1817,7 +1816,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn grapheme_indices(&self) -> GraphemeIndices {
+ fn grapheme_indices(&self) -> GraphemeIndices<'_> {
GraphemeIndices::new(self.as_bytes())
}
@@ -1853,7 +1852,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn words(&self) -> Words {
+ fn words(&self) -> Words<'_> {
Words::new(self.as_bytes())
}
@@ -1891,7 +1890,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn word_indices(&self) -> WordIndices {
+ fn word_indices(&self) -> WordIndices<'_> {
WordIndices::new(self.as_bytes())
}
@@ -1921,7 +1920,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn words_with_breaks(&self) -> WordsWithBreaks {
+ fn words_with_breaks(&self) -> WordsWithBreaks<'_> {
WordsWithBreaks::new(self.as_bytes())
}
@@ -1958,7 +1957,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn words_with_break_indices(&self) -> WordsWithBreakIndices {
+ fn words_with_break_indices(&self) -> WordsWithBreakIndices<'_> {
WordsWithBreakIndices::new(self.as_bytes())
}
@@ -1990,7 +1989,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn sentences(&self) -> Sentences {
+ fn sentences(&self) -> Sentences<'_> {
Sentences::new(self.as_bytes())
}
@@ -2024,7 +2023,7 @@ pub trait ByteSlice: Sealed {
/// ```
#[cfg(feature = "unicode")]
#[inline]
- fn sentence_indices(&self) -> SentenceIndices {
+ fn sentence_indices(&self) -> SentenceIndices<'_> {
SentenceIndices::new(self.as_bytes())
}
@@ -2055,7 +2054,7 @@ pub trait ByteSlice: Sealed {
/// ]);
/// ```
#[inline]
- fn lines(&self) -> Lines {
+ fn lines(&self) -> Lines<'_> {
Lines::new(self.as_bytes())
}
@@ -2099,7 +2098,7 @@ pub trait ByteSlice: Sealed {
/// ]);
/// ```
#[inline]
- fn lines_with_terminator(&self) -> LinesWithTerminator {
+ fn lines_with_terminator(&self) -> LinesWithTerminator<'_> {
LinesWithTerminator::new(self.as_bytes())
}
@@ -2789,7 +2788,7 @@ pub trait ByteSlice: Sealed {
#[cfg(feature = "unicode")]
#[inline]
fn reverse_graphemes(&mut self) {
- use unicode::decode_grapheme;
+ use crate::unicode::decode_grapheme;
let mut i = 0;
loop {
@@ -2986,15 +2985,13 @@ pub trait ByteSlice: Sealed {
/// version which permits building a `Finder` that is not connected to the
/// lifetime of its needle.
#[derive(Clone, Debug)]
-pub struct Finder<'a> {
- searcher: TwoWay<'a>,
-}
+pub struct Finder<'a>(memmem::Finder<'a>);
impl<'a> Finder<'a> {
/// Create a new finder for the given needle.
#[inline]
pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> {
- Finder { searcher: TwoWay::forward(needle.as_ref()) }
+ Finder(memmem::Finder::new(needle.as_ref()))
}
/// Convert this finder into its owned variant, such that it no longer
@@ -3007,7 +3004,7 @@ impl<'a> Finder<'a> {
#[cfg(feature = "std")]
#[inline]
pub fn into_owned(self) -> Finder<'static> {
- Finder { searcher: self.searcher.into_owned() }
+ Finder(self.0.into_owned())
}
/// Returns the needle that this finder searches for.
@@ -3018,7 +3015,7 @@ impl<'a> Finder<'a> {
/// needle returned must necessarily be the shorter of the two.
#[inline]
pub fn needle(&self) -> &[u8] {
- self.searcher.needle()
+ self.0.needle()
}
/// Returns the index of the first occurrence of this needle in the given
@@ -3050,7 +3047,7 @@ impl<'a> Finder<'a> {
/// ```
#[inline]
pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
- self.searcher.find(haystack.as_ref())
+ self.0.find(haystack.as_ref())
}
}
@@ -3071,15 +3068,13 @@ impl<'a> Finder<'a> {
/// version which permits building a `FinderReverse` that is not connected to
/// the lifetime of its needle.
#[derive(Clone, Debug)]
-pub struct FinderReverse<'a> {
- searcher: TwoWay<'a>,
-}
+pub struct FinderReverse<'a>(memmem::FinderRev<'a>);
impl<'a> FinderReverse<'a> {
/// Create a new reverse finder for the given needle.
#[inline]
pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
- FinderReverse { searcher: TwoWay::reverse(needle.as_ref()) }
+ FinderReverse(memmem::FinderRev::new(needle.as_ref()))
}
/// Convert this finder into its owned variant, such that it no longer
@@ -3092,7 +3087,7 @@ impl<'a> FinderReverse<'a> {
#[cfg(feature = "std")]
#[inline]
pub fn into_owned(self) -> FinderReverse<'static> {
- FinderReverse { searcher: self.searcher.into_owned() }
+ FinderReverse(self.0.into_owned())
}
/// Returns the needle that this finder searches for.
@@ -3103,7 +3098,7 @@ impl<'a> FinderReverse<'a> {
/// the needle returned must necessarily be the shorter of the two.
#[inline]
pub fn needle(&self) -> &[u8] {
- self.searcher.needle()
+ self.0.needle()
}
/// Returns the index of the last occurrence of this needle in the given
@@ -3135,7 +3130,7 @@ impl<'a> FinderReverse<'a> {
/// ```
#[inline]
pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
- self.searcher.rfind(haystack.as_ref())
+ self.0.rfind(haystack.as_ref())
}
}
@@ -3147,17 +3142,14 @@ impl<'a> FinderReverse<'a> {
/// byte string being looked for.
#[derive(Debug)]
pub struct Find<'a> {
+ it: memmem::FindIter<'a, 'a>,
haystack: &'a [u8],
- prestate: PrefilterState,
- searcher: TwoWay<'a>,
- pos: usize,
+ needle: &'a [u8],
}
impl<'a> Find<'a> {
fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
- let searcher = TwoWay::forward(needle);
- let prestate = searcher.prefilter_state();
- Find { haystack, prestate, searcher, pos: 0 }
+ Find { it: memmem::find_iter(haystack, needle), haystack, needle }
}
}
@@ -3166,20 +3158,7 @@ impl<'a> Iterator for Find<'a> {
#[inline]
fn next(&mut self) -> Option<usize> {
- if self.pos > self.haystack.len() {
- return None;
- }
- let result = self
- .searcher
- .find_with(&mut self.prestate, &self.haystack[self.pos..]);
- match result {
- None => None,
- Some(i) => {
- let pos = self.pos + i;
- self.pos = pos + cmp::max(1, self.searcher.needle().len());
- Some(pos)
- }
- }
+ self.it.next()
}
}
@@ -3191,20 +3170,18 @@ impl<'a> Iterator for Find<'a> {
/// byte string being looked for.
#[derive(Debug)]
pub struct FindReverse<'a> {
+ it: memmem::FindRevIter<'a, 'a>,
haystack: &'a [u8],
- prestate: PrefilterState,
- searcher: TwoWay<'a>,
- /// When searching with an empty needle, this gets set to `None` after
- /// we've yielded the last element at `0`.
- pos: Option<usize>,
+ needle: &'a [u8],
}
impl<'a> FindReverse<'a> {
fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
- let searcher = TwoWay::reverse(needle);
- let prestate = searcher.prefilter_state();
- let pos = Some(haystack.len());
- FindReverse { haystack, prestate, searcher, pos }
+ FindReverse {
+ it: memmem::rfind_iter(haystack, needle),
+ haystack,
+ needle,
+ }
}
fn haystack(&self) -> &'a [u8] {
@@ -3212,7 +3189,7 @@ impl<'a> FindReverse<'a> {
}
fn needle(&self) -> &[u8] {
- self.searcher.needle()
+ self.needle
}
}
@@ -3221,24 +3198,7 @@ impl<'a> Iterator for FindReverse<'a> {
#[inline]
fn next(&mut self) -> Option<usize> {
- let pos = match self.pos {
- None => return None,
- Some(pos) => pos,
- };
- let result = self
- .searcher
- .rfind_with(&mut self.prestate, &self.haystack[..pos]);
- match result {
- None => None,
- Some(i) => {
- if pos == i {
- self.pos = pos.checked_sub(1);
- } else {
- self.pos = Some(i);
- }
- Some(i)
- }
- }
+ self.it.next()
}
}
@@ -3398,7 +3358,7 @@ impl<'a> Iterator for Split<'a> {
match self.finder.next() {
Some(start) => {
let next = &haystack[self.last..start];
- self.last = start + self.finder.searcher.needle().len();
+ self.last = start + self.finder.needle.len();
Some(next)
}
None => {
@@ -3633,8 +3593,8 @@ impl<'a> Iterator for LinesWithTerminator<'a> {
#[cfg(test)]
mod tests {
- use ext_slice::{ByteSlice, B};
- use tests::LOSSY_TESTS;
+ use crate::ext_slice::{ByteSlice, B};
+ use crate::tests::LOSSY_TESTS;
#[test]
fn to_str_lossy() {
diff --git a/src/ext_vec.rs b/src/ext_vec.rs
index 6f6db56..5beb0e1 100644
--- a/src/ext_vec.rs
+++ b/src/ext_vec.rs
@@ -1,5 +1,3 @@
-#![allow(unused_imports)]
-
use std::borrow::Cow;
use std::error;
use std::ffi::{OsStr, OsString};
@@ -11,8 +9,8 @@ use std::ptr;
use std::str;
use std::vec;
-use ext_slice::ByteSlice;
-use utf8::{self, Utf8Error};
+use crate::ext_slice::ByteSlice;
+use crate::utf8::{self, Utf8Error};
/// Concatenate the elements given by the iterator together into a single
/// `Vec<u8>`.
@@ -877,7 +875,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(s, "foar".as_bytes());
/// ```
#[inline]
- fn drain_bytes<R>(&mut self, range: R) -> DrainBytes
+ fn drain_bytes<R>(&mut self, range: R) -> DrainBytes<'_>
where
R: ops::RangeBounds<usize>,
{
@@ -1040,15 +1038,14 @@ impl error::Error for FromUtf8Error {
impl fmt::Display for FromUtf8Error {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.err)
}
}
#[cfg(test)]
mod tests {
- use ext_slice::B;
- use ext_vec::ByteVec;
+ use crate::ext_vec::ByteVec;
#[test]
fn insert() {
diff --git a/src/impls.rs b/src/impls.rs
index 4cba48e..85a27ba 100644
--- a/src/impls.rs
+++ b/src/impls.rs
@@ -67,20 +67,20 @@ mod bstring {
use std::iter::FromIterator;
use std::ops;
- use bstr::BStr;
- use bstring::BString;
- use ext_vec::ByteVec;
+ use crate::bstr::BStr;
+ use crate::bstring::BString;
+ use crate::ext_vec::ByteVec;
impl fmt::Display for BString {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self.as_bstr(), f)
}
}
impl fmt::Debug for BString {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(self.as_bstr(), f)
}
}
@@ -308,15 +308,15 @@ mod bstr {
use core::fmt;
use core::ops;
- use bstr::BStr;
- use ext_slice::ByteSlice;
+ use crate::bstr::BStr;
+ use crate::ext_slice::ByteSlice;
impl fmt::Display for BStr {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
/// Write the given bstr (lossily) to the given formatter.
fn write_bstr(
- f: &mut fmt::Formatter,
+ f: &mut fmt::Formatter<'_>,
bstr: &BStr,
) -> Result<(), fmt::Error> {
for chunk in bstr.utf8_chunks() {
@@ -329,7 +329,10 @@ mod bstr {
}
/// Write 'num' fill characters to the given formatter.
- fn write_pads(f: &mut fmt::Formatter, num: usize) -> fmt::Result {
+ fn write_pads(
+ f: &mut fmt::Formatter<'_>,
+ num: usize,
+ ) -> fmt::Result {
let fill = f.fill();
for _ in 0..num {
f.write_fmt(format_args!("{}", fill))?;
@@ -372,7 +375,7 @@ mod bstr {
impl fmt::Debug for BStr {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?;
for (s, e, ch) in self.char_indices() {
match ch {
@@ -685,7 +688,7 @@ mod bstr_serde {
Serializer,
};
- use bstr::BStr;
+ use crate::bstr::BStr;
impl Serialize for BStr {
#[inline]
@@ -744,7 +747,7 @@ mod bstring_serde {
Serialize, Serializer,
};
- use bstring::BString;
+ use crate::bstring::BString;
impl Serialize for BString {
#[inline]
@@ -824,8 +827,8 @@ mod bstring_serde {
#[cfg(test)]
mod display {
+ use crate::bstring::BString;
use crate::ByteSlice;
- use bstring::BString;
#[test]
fn clean() {
@@ -923,7 +926,7 @@ mod display {
);
}
- quickcheck! {
+ quickcheck::quickcheck! {
fn total_length(bstr: BString) -> bool {
let size = bstr.chars().count();
format!("{:<1$}", bstr.as_bstr(), size).chars().count() >= size
@@ -933,7 +936,7 @@ mod display {
#[cfg(test)]
mod bstring_arbitrary {
- use bstring::BString;
+ use crate::bstring::BString;
use quickcheck::{Arbitrary, Gen};
diff --git a/src/io.rs b/src/io.rs
index f2b4452..ad6f3c1 100644
--- a/src/io.rs
+++ b/src/io.rs
@@ -9,8 +9,8 @@ More APIs may be added in the future.
use std::io;
-use ext_slice::ByteSlice;
-use ext_vec::ByteVec;
+use crate::ext_slice::ByteSlice;
+use crate::ext_vec::ByteVec;
/// An extention trait for
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
@@ -441,7 +441,7 @@ fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
#[cfg(test)]
mod tests {
use super::BufReadExt;
- use bstring::BString;
+ use crate::bstring::BString;
fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
let mut lines = vec![];
diff --git a/src/lib.rs b/src/lib.rs
index c240cd1..41142c9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
/*!
-An experimental byte string library.
+A byte string library.
Byte strings are just like standard Unicode strings with one very important
difference: byte strings are only *conventionally* UTF-8 while Rust's standard
@@ -98,10 +98,10 @@ method converts any `&[u8]` to a `&BStr`.
# When should I use byte strings?
-This library is somewhat of an experiment that reflects my hypothesis that
-UTF-8 by convention is a better trade off in some circumstances than guaranteed
-UTF-8. It's possible, perhaps even likely, that this is a niche concern for
-folks working closely with core text primitives.
+This library reflects my hypothesis that UTF-8 by convention is a better trade
+off in some circumstances than guaranteed UTF-8. It's possible, perhaps even
+likely, that this is a niche concern for folks working closely with core text
+primitives.
The first time this idea hit me was in the implementation of Rust's regex
engine. In particular, very little of the internal implementation cares at all
@@ -134,15 +134,16 @@ incremental way by only parsing chunks at a time, but this is often complex to
do or impractical. For example, many regex engines only accept one contiguous
sequence of bytes at a time with no way to perform incremental matching.
-In summary, the conventional UTF-8 byte strings provided by this library is an
-experiment. They are definitely useful in some limited circumstances, but how
-useful they are more broadly isn't clear yet.
+In summary, conventional UTF-8 byte strings provided by this library are
+definitely useful in some limited circumstances, but how useful they are more
+broadly isn't clear yet.
# `bstr` in public APIs
-Since this library is still experimental, you should not use it in the public
-API of your crates until it hits `1.0` (unless you're OK with with tracking
-breaking releases of `bstr`).
+Since this library is not yet `1.0`, you should not use it in the public API of
+your crates until it hits `1.0` (unless you're OK with with tracking breaking
+releases of `bstr`). It is expected that `bstr 1.0` will be released before
+2022.
In general, it should be possible to avoid putting anything in this crate into
your public APIs. Namely, you should never need to use the `ByteSlice` or
@@ -367,41 +368,23 @@ Windows.
*/
#![cfg_attr(not(feature = "std"), no_std)]
-#![allow(dead_code)]
+pub use crate::bstr::BStr;
#[cfg(feature = "std")]
-extern crate core;
-
-#[cfg(feature = "unicode")]
-#[macro_use]
-extern crate lazy_static;
-extern crate memchr;
-#[cfg(test)]
-#[macro_use]
-extern crate quickcheck;
-#[cfg(feature = "unicode")]
-extern crate regex_automata;
-#[cfg(feature = "serde1-nostd")]
-extern crate serde;
-#[cfg(test)]
-extern crate ucd_parse;
-
-pub use bstr::BStr;
-#[cfg(feature = "std")]
-pub use bstring::BString;
-pub use ext_slice::{
+pub use crate::bstring::BString;
+pub use crate::ext_slice::{
ByteSlice, Bytes, Fields, FieldsWith, Find, FindReverse, Finder,
FinderReverse, Lines, LinesWithTerminator, Split, SplitN, SplitNReverse,
SplitReverse, B,
};
#[cfg(feature = "std")]
-pub use ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error};
+pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error};
#[cfg(feature = "unicode")]
-pub use unicode::{
+pub use crate::unicode::{
GraphemeIndices, Graphemes, SentenceIndices, Sentences, WordIndices,
Words, WordsWithBreakIndices, WordsWithBreaks,
};
-pub use utf8::{
+pub use crate::utf8::{
decode as decode_utf8, decode_last as decode_last_utf8, CharIndices,
Chars, Utf8Chunk, Utf8Chunks, Utf8Error,
};
@@ -411,14 +394,12 @@ mod bstr;
#[cfg(feature = "std")]
mod bstring;
mod byteset;
-mod cow;
mod ext_slice;
#[cfg(feature = "std")]
mod ext_vec;
mod impls;
#[cfg(feature = "std")]
pub mod io;
-mod search;
#[cfg(test)]
mod tests;
#[cfg(feature = "unicode")]
@@ -427,9 +408,9 @@ mod utf8;
#[cfg(test)]
mod apitests {
- use bstr::BStr;
- use bstring::BString;
- use ext_slice::{Finder, FinderReverse};
+ use crate::bstr::BStr;
+ use crate::bstring::BString;
+ use crate::ext_slice::{Finder, FinderReverse};
#[test]
fn oibits() {
@@ -446,11 +427,11 @@ mod apitests {
assert_sync::<BString>();
assert_unwind_safe::<BString>();
- assert_send::<Finder>();
- assert_sync::<Finder>();
- assert_unwind_safe::<Finder>();
- assert_send::<FinderReverse>();
- assert_sync::<FinderReverse>();
- assert_unwind_safe::<FinderReverse>();
+ assert_send::<Finder<'_>>();
+ assert_sync::<Finder<'_>>();
+ assert_unwind_safe::<Finder<'_>>();
+ assert_send::<FinderReverse<'_>>();
+ assert_sync::<FinderReverse<'_>>();
+ assert_unwind_safe::<FinderReverse<'_>>();
}
}
diff --git a/src/search/byte_frequencies.rs b/src/search/byte_frequencies.rs
deleted file mode 100644
index c313b62..0000000
--- a/src/search/byte_frequencies.rs
+++ /dev/null
@@ -1,258 +0,0 @@
-pub const BYTE_FREQUENCIES: [u8; 256] = [
- 55, // '\x00'
- 52, // '\x01'
- 51, // '\x02'
- 50, // '\x03'
- 49, // '\x04'
- 48, // '\x05'
- 47, // '\x06'
- 46, // '\x07'
- 45, // '\x08'
- 103, // '\t'
- 242, // '\n'
- 66, // '\x0b'
- 67, // '\x0c'
- 229, // '\r'
- 44, // '\x0e'
- 43, // '\x0f'
- 42, // '\x10'
- 41, // '\x11'
- 40, // '\x12'
- 39, // '\x13'
- 38, // '\x14'
- 37, // '\x15'
- 36, // '\x16'
- 35, // '\x17'
- 34, // '\x18'
- 33, // '\x19'
- 56, // '\x1a'
- 32, // '\x1b'
- 31, // '\x1c'
- 30, // '\x1d'
- 29, // '\x1e'
- 28, // '\x1f'
- 255, // ' '
- 148, // '!'
- 164, // '"'
- 149, // '#'
- 136, // '$'
- 160, // '%'
- 155, // '&'
- 173, // "'"
- 221, // '('
- 222, // ')'
- 134, // '*'
- 122, // '+'
- 232, // ','
- 202, // '-'
- 215, // '.'
- 224, // '/'
- 208, // '0'
- 220, // '1'
- 204, // '2'
- 187, // '3'
- 183, // '4'
- 179, // '5'
- 177, // '6'
- 168, // '7'
- 178, // '8'
- 200, // '9'
- 226, // ':'
- 195, // ';'
- 154, // '<'
- 184, // '='
- 174, // '>'
- 126, // '?'
- 120, // '@'
- 191, // 'A'
- 157, // 'B'
- 194, // 'C'
- 170, // 'D'
- 189, // 'E'
- 162, // 'F'
- 161, // 'G'
- 150, // 'H'
- 193, // 'I'
- 142, // 'J'
- 137, // 'K'
- 171, // 'L'
- 176, // 'M'
- 185, // 'N'
- 167, // 'O'
- 186, // 'P'
- 112, // 'Q'
- 175, // 'R'
- 192, // 'S'
- 188, // 'T'
- 156, // 'U'
- 140, // 'V'
- 143, // 'W'
- 123, // 'X'
- 133, // 'Y'
- 128, // 'Z'
- 147, // '['
- 138, // '\\'
- 146, // ']'
- 114, // '^'
- 223, // '_'
- 151, // '`'
- 249, // 'a'
- 216, // 'b'
- 238, // 'c'
- 236, // 'd'
- 253, // 'e'
- 227, // 'f'
- 218, // 'g'
- 230, // 'h'
- 247, // 'i'
- 135, // 'j'
- 180, // 'k'
- 241, // 'l'
- 233, // 'm'
- 246, // 'n'
- 244, // 'o'
- 231, // 'p'
- 139, // 'q'
- 245, // 'r'
- 243, // 's'
- 251, // 't'
- 235, // 'u'
- 201, // 'v'
- 196, // 'w'
- 240, // 'x'
- 214, // 'y'
- 152, // 'z'
- 182, // '{'
- 205, // '|'
- 181, // '}'
- 127, // '~'
- 27, // '\x7f'
- 212, // '\x80'
- 211, // '\x81'
- 210, // '\x82'
- 213, // '\x83'
- 228, // '\x84'
- 197, // '\x85'
- 169, // '\x86'
- 159, // '\x87'
- 131, // '\x88'
- 172, // '\x89'
- 105, // '\x8a'
- 80, // '\x8b'
- 98, // '\x8c'
- 96, // '\x8d'
- 97, // '\x8e'
- 81, // '\x8f'
- 207, // '\x90'
- 145, // '\x91'
- 116, // '\x92'
- 115, // '\x93'
- 144, // '\x94'
- 130, // '\x95'
- 153, // '\x96'
- 121, // '\x97'
- 107, // '\x98'
- 132, // '\x99'
- 109, // '\x9a'
- 110, // '\x9b'
- 124, // '\x9c'
- 111, // '\x9d'
- 82, // '\x9e'
- 108, // '\x9f'
- 118, // '\xa0'
- 141, // '¡'
- 113, // '¢'
- 129, // '£'
- 119, // '¤'
- 125, // '¥'
- 165, // '¦'
- 117, // '§'
- 92, // '¨'
- 106, // '©'
- 83, // 'ª'
- 72, // '«'
- 99, // '¬'
- 93, // '\xad'
- 65, // '®'
- 79, // '¯'
- 166, // '°'
- 237, // '±'
- 163, // '²'
- 199, // '³'
- 190, // '´'
- 225, // 'µ'
- 209, // '¶'
- 203, // '·'
- 198, // '¸'
- 217, // '¹'
- 219, // 'º'
- 206, // '»'
- 234, // '¼'
- 248, // '½'
- 158, // '¾'
- 239, // '¿'
- 255, // 'À'
- 255, // 'Á'
- 255, // 'Â'
- 255, // 'Ã'
- 255, // 'Ä'
- 255, // 'Å'
- 255, // 'Æ'
- 255, // 'Ç'
- 255, // 'È'
- 255, // 'É'
- 255, // 'Ê'
- 255, // 'Ë'
- 255, // 'Ì'
- 255, // 'Í'
- 255, // 'Î'
- 255, // 'Ï'
- 255, // 'Ð'
- 255, // 'Ñ'
- 255, // 'Ò'
- 255, // 'Ó'
- 255, // 'Ô'
- 255, // 'Õ'
- 255, // 'Ö'
- 255, // '×'
- 255, // 'Ø'
- 255, // 'Ù'
- 255, // 'Ú'
- 255, // 'Û'
- 255, // 'Ü'
- 255, // 'Ý'
- 255, // 'Þ'
- 255, // 'ß'
- 255, // 'à'
- 255, // 'á'
- 255, // 'â'
- 255, // 'ã'
- 255, // 'ä'
- 255, // 'å'
- 255, // 'æ'
- 255, // 'ç'
- 255, // 'è'
- 255, // 'é'
- 255, // 'ê'
- 255, // 'ë'
- 255, // 'ì'
- 255, // 'í'
- 255, // 'î'
- 255, // 'ï'
- 255, // 'ð'
- 255, // 'ñ'
- 255, // 'ò'
- 255, // 'ó'
- 255, // 'ô'
- 255, // 'õ'
- 255, // 'ö'
- 255, // '÷'
- 255, // 'ø'
- 255, // 'ù'
- 255, // 'ú'
- 255, // 'û'
- 255, // 'ü'
- 255, // 'ý'
- 255, // 'þ'
- 255, // 'ÿ'
-];
diff --git a/src/search/mod.rs b/src/search/mod.rs
deleted file mode 100644
index a0d1b45..0000000
--- a/src/search/mod.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-pub use self::prefilter::PrefilterState;
-pub use self::twoway::TwoWay;
-
-mod byte_frequencies;
-mod prefilter;
-#[cfg(test)]
-mod tests;
-mod twoway;
diff --git a/src/search/prefilter.rs b/src/search/prefilter.rs
deleted file mode 100644
index 00e6acf..0000000
--- a/src/search/prefilter.rs
+++ /dev/null
@@ -1,424 +0,0 @@
-use core::mem;
-
-use ext_slice::ByteSlice;
-use search::byte_frequencies::BYTE_FREQUENCIES;
-
-/// PrefilterState tracks state associated with the effectiveness of a
-/// prefilter. It is used to track how many bytes, on average, are skipped by
-/// the prefilter. If this average dips below a certain threshold over time,
-/// then the state renders the prefilter inert and stops using it.
-///
-/// A prefilter state should be created for each search. (Where creating an
-/// iterator via, e.g., `find_iter`, is treated as a single search.)
-#[derive(Clone, Debug)]
-pub struct PrefilterState {
- /// The number of skips that has been executed.
- skips: usize,
- /// The total number of bytes that have been skipped.
- skipped: usize,
- /// The maximum length of a match. This is used to help determine how many
- /// bytes on average should be skipped in order for a prefilter to be
- /// effective.
- max_match_len: usize,
- /// Once this heuristic has been deemed ineffective, it will be inert
- /// throughout the rest of its lifetime. This serves as a cheap way to
- /// check inertness.
- inert: bool,
-}
-
-impl PrefilterState {
- /// The minimum number of skip attempts to try before considering whether
- /// a prefilter is effective or not.
- const MIN_SKIPS: usize = 50;
-
- /// The minimum amount of bytes that skipping must average.
- ///
- /// This value was chosen based on varying it and checking the bstr/find/
- /// microbenchmarks. In particular, this can impact the
- /// pathological/repeated-{huge,small} benchmarks quite a bit if it's
- /// set too low.
- const MIN_SKIP_BYTES: usize = 8;
-
- /// Create a fresh prefilter state.
- pub fn new(max_match_len: usize) -> PrefilterState {
- if max_match_len == 0 {
- return PrefilterState::inert();
- }
- PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
- }
-
- /// Create a fresh prefilter state that is always inert.
- fn inert() -> PrefilterState {
- PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true }
- }
-
- /// Update this state with the number of bytes skipped on the last
- /// invocation of the prefilter.
- #[inline]
- pub fn update(&mut self, skipped: usize) {
- self.skips += 1;
- self.skipped += skipped;
- }
-
- /// Return true if and only if this state indicates that a prefilter is
- /// still effective.
- #[inline]
- pub fn is_effective(&mut self) -> bool {
- if self.inert {
- return false;
- }
- if self.skips < PrefilterState::MIN_SKIPS {
- return true;
- }
- if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips {
- return true;
- }
-
- // We're inert.
- self.inert = true;
- false
- }
-}
-
-/// A heuristic frequency based prefilter for searching a single needle.
-///
-/// This prefilter attempts to pick out the byte in a needle that is predicted
-/// to occur least frequently, and search for that using fast vectorized
-/// routines. If a rare enough byte could not be found, then this prefilter's
-/// constructors will return `None`.
-///
-/// This can be combined with `PrefilterState` to dynamically render this
-/// prefilter inert if it proves to ineffective.
-#[derive(Clone, Debug)]
-pub struct Freqy {
- /// Whether this prefilter should be used or not.
- inert: bool,
- /// The length of the needle we're searching for.
- needle_len: usize,
- /// The rarest byte in the needle, according to pre-computed frequency
- /// analysis.
- rare1: u8,
- /// The leftmost offset of the rarest byte in the needle.
- rare1i: usize,
- /// The second rarest byte in the needle, according to pre-computed
- /// frequency analysis. (This may be equivalent to the rarest byte.)
- ///
- /// The second rarest byte is used as a type of guard for quickly detecting
- /// a mismatch after memchr locates an instance of the rarest byte. This
- /// is a hedge against pathological cases where the pre-computed frequency
- /// analysis may be off. (But of course, does not prevent *all*
- /// pathological cases.)
- rare2: u8,
- /// The leftmost offset of the second rarest byte in the needle.
- rare2i: usize,
-}
-
-impl Freqy {
- /// The maximum frequency rank permitted. If the rarest byte in the needle
- /// has a frequency rank above this value, then Freqy is not used.
- const MAX_RANK: usize = 200;
-
- /// Return a fresh prefilter state that can be used with this prefilter. A
- /// prefilter state is used to track the effectiveness of a prefilter for
- /// speeding up searches. Therefore, the prefilter state should generally
- /// be reused on subsequent searches (such as in an iterator). For searches
- /// on a different haystack, then a new prefilter state should be used.
- pub fn prefilter_state(&self) -> PrefilterState {
- if self.inert {
- PrefilterState::inert()
- } else {
- PrefilterState::new(self.needle_len)
- }
- }
-
- /// Returns a valid but inert prefilter. This is valid for both the forward
- /// and reverse direction.
- ///
- /// It is never correct to use an inert prefilter. The results of finding
- /// the next (or previous) candidate are unspecified.
- fn inert() -> Freqy {
- Freqy {
- inert: true,
- needle_len: 0,
- rare1: 0,
- rare1i: 0,
- rare2: 0,
- rare2i: 0,
- }
- }
-
- /// Return search info for the given needle in the forward direction.
- pub fn forward(needle: &[u8]) -> Freqy {
- if needle.is_empty() {
- return Freqy::inert();
- }
-
- // Find the rarest two bytes. Try to make them distinct (but it's not
- // required).
- let (mut rare1, mut rare1i) = (needle[0], 0);
- let (mut rare2, mut rare2i) = (needle[0], 0);
- if needle.len() >= 2 {
- rare2 = needle[1];
- rare2i = 1;
- }
- if Freqy::rank(rare2) < Freqy::rank(rare1) {
- mem::swap(&mut rare1, &mut rare2);
- mem::swap(&mut rare1i, &mut rare2i);
- }
- for (i, b) in needle.bytes().enumerate().skip(2) {
- if Freqy::rank(b) < Freqy::rank(rare1) {
- rare2 = rare1;
- rare2i = rare1i;
- rare1 = b;
- rare1i = i;
- } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
- rare2 = b;
- rare2i = i;
- }
- }
- if Freqy::rank(rare1) > Freqy::MAX_RANK {
- return Freqy::inert();
- }
- let needle_len = needle.len();
- Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
- }
-
- /// Return search info for the given needle in the reverse direction.
- pub fn reverse(needle: &[u8]) -> Freqy {
- if needle.is_empty() {
- return Freqy::inert();
- }
-
- // Find the rarest two bytes. Try to make them distinct (but it's not
- // required). In reverse, the offsets correspond to the number of bytes
- // from the end of the needle. So `0` is the last byte in the needle.
- let (mut rare1i, mut rare2i) = (0, 0);
- if needle.len() >= 2 {
- rare2i += 1;
- }
- let mut rare1 = needle[needle.len() - rare1i - 1];
- let mut rare2 = needle[needle.len() - rare2i - 1];
- if Freqy::rank(rare2) < Freqy::rank(rare1) {
- mem::swap(&mut rare1, &mut rare2);
- mem::swap(&mut rare1i, &mut rare2i);
- }
- for (i, b) in needle.bytes().rev().enumerate().skip(2) {
- if Freqy::rank(b) < Freqy::rank(rare1) {
- rare2 = rare1;
- rare2i = rare1i;
- rare1 = b;
- rare1i = i;
- } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
- rare2 = b;
- rare2i = i;
- }
- }
- if Freqy::rank(rare1) > Freqy::MAX_RANK {
- return Freqy::inert();
- }
- let needle_len = needle.len();
- Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
- }
-
- /// Look for a possible occurrence of needle. The position returned
- /// corresponds to the beginning of the occurrence, if one exists.
- ///
- /// Callers may assume that this never returns false negatives (i.e., it
- /// never misses an actual occurrence), but must check that the returned
- /// position corresponds to a match. That is, it can return false
- /// positives.
- ///
- /// This should only be used when Freqy is constructed for forward
- /// searching.
- pub fn find_candidate(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- ) -> Option<usize> {
- debug_assert!(!self.inert);
-
- let mut i = 0;
- while prestate.is_effective() {
- // Use a fast vectorized implementation to skip to the next
- // occurrence of the rarest byte (heuristically chosen) in the
- // needle.
- i += match haystack[i..].find_byte(self.rare1) {
- None => return None,
- Some(found) => {
- prestate.update(found);
- found
- }
- };
-
- // If we can't align our first match with the haystack, then a
- // match is impossible.
- if i < self.rare1i {
- i += 1;
- continue;
- }
-
- // Align our rare2 byte with the haystack. A mismatch means that
- // a match is impossible.
- let aligned_rare2i = i - self.rare1i + self.rare2i;
- if haystack.get(aligned_rare2i) != Some(&self.rare2) {
- i += 1;
- continue;
- }
-
- // We've done what we can. There might be a match here.
- return Some(i - self.rare1i);
- }
- // The only way we get here is if we believe our skipping heuristic
- // has become ineffective. We're allowed to return false positives,
- // so return the position at which we advanced to, aligned to the
- // haystack.
- Some(i.saturating_sub(self.rare1i))
- }
-
- /// Look for a possible occurrence of needle, in reverse, starting from the
- /// end of the given haystack. The position returned corresponds to the
- /// position immediately after the end of the occurrence, if one exists.
- ///
- /// Callers may assume that this never returns false negatives (i.e., it
- /// never misses an actual occurrence), but must check that the returned
- /// position corresponds to a match. That is, it can return false
- /// positives.
- ///
- /// This should only be used when Freqy is constructed for reverse
- /// searching.
- pub fn rfind_candidate(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- ) -> Option<usize> {
- debug_assert!(!self.inert);
-
- let mut i = haystack.len();
- while prestate.is_effective() {
- // Use a fast vectorized implementation to skip to the next
- // occurrence of the rarest byte (heuristically chosen) in the
- // needle.
- i = match haystack[..i].rfind_byte(self.rare1) {
- None => return None,
- Some(found) => {
- prestate.update(i - found);
- found
- }
- };
-
- // If we can't align our first match with the haystack, then a
- // match is impossible.
- if i + self.rare1i + 1 > haystack.len() {
- continue;
- }
-
- // Align our rare2 byte with the haystack. A mismatch means that
- // a match is impossible.
- let aligned = match (i + self.rare1i).checked_sub(self.rare2i) {
- None => continue,
- Some(aligned) => aligned,
- };
- if haystack.get(aligned) != Some(&self.rare2) {
- continue;
- }
-
- // We've done what we can. There might be a match here.
- return Some(i + self.rare1i + 1);
- }
- // The only way we get here is if we believe our skipping heuristic
- // has become ineffective. We're allowed to return false positives,
- // so return the position at which we advanced to, aligned to the
- // haystack.
- Some(i + self.rare1i + 1)
- }
-
- /// Return the heuristical frequency rank of the given byte. A lower rank
- /// means the byte is believed to occur less frequently.
- fn rank(b: u8) -> usize {
- BYTE_FREQUENCIES[b as usize] as usize
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use ext_slice::B;
-
- #[test]
- fn freqy_forward() {
- // N.B. We sometimes use uppercase here since that mostly ensures freqy
- // will be constructable. Lowercase letters may be too common for freqy
- // to work.
-
- let s = Freqy::forward(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO")));
-
- let s = Freqy::forward(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR")));
-
- let s = Freqy::forward(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz")));
-
- let s = Freqy::forward(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy")));
-
- let s = Freqy::forward(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.find_candidate(&mut pre, B("zazb")));
-
- let s = Freqy::forward(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy")));
-
- let s = Freqy::forward(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz")));
-
- let s = Freqy::forward(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.find_candidate(&mut pre, B("yayb")));
- }
-
- #[test]
- fn freqy_reverse() {
- // N.B. We sometimes use uppercase here since that mostly ensures freqy
- // will be constructable. Lowercase letters may be too common for freqy
- // to work.
-
- let s = Freqy::reverse(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO")));
-
- let s = Freqy::reverse(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR")));
-
- let s = Freqy::reverse(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz")));
-
- let s = Freqy::reverse(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy")));
-
- let s = Freqy::reverse(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb")));
-
- let s = Freqy::reverse(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy")));
-
- let s = Freqy::reverse(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz")));
-
- let s = Freqy::reverse(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb")));
- }
-}
diff --git a/src/search/tests.rs b/src/search/tests.rs
deleted file mode 100644
index 827df92..0000000
--- a/src/search/tests.rs
+++ /dev/null
@@ -1,225 +0,0 @@
-use search::twoway::TwoWay;
-
-/// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple.
-type SearchTest = (&'static str, &'static str, Option<usize>, Option<usize>);
-
-const SEARCH_TESTS: &'static [SearchTest] = &[
- ("", "", Some(0), Some(0)),
- ("", "a", Some(0), Some(1)),
- ("", "ab", Some(0), Some(2)),
- ("", "abc", Some(0), Some(3)),
- ("a", "", None, None),
- ("a", "a", Some(0), Some(0)),
- ("a", "aa", Some(0), Some(1)),
- ("a", "ba", Some(1), Some(1)),
- ("a", "bba", Some(2), Some(2)),
- ("a", "bbba", Some(3), Some(3)),
- ("a", "bbbab", Some(3), Some(3)),
- ("a", "bbbabb", Some(3), Some(3)),
- ("a", "bbbabbb", Some(3), Some(3)),
- ("a", "bbbbbb", None, None),
- ("ab", "", None, None),
- ("ab", "a", None, None),
- ("ab", "b", None, None),
- ("ab", "ab", Some(0), Some(0)),
- ("ab", "aab", Some(1), Some(1)),
- ("ab", "aaab", Some(2), Some(2)),
- ("ab", "abaab", Some(0), Some(3)),
- ("ab", "baaab", Some(3), Some(3)),
- ("ab", "acb", None, None),
- ("ab", "abba", Some(0), Some(0)),
- ("abc", "ab", None, None),
- ("abc", "abc", Some(0), Some(0)),
- ("abc", "abcz", Some(0), Some(0)),
- ("abc", "abczz", Some(0), Some(0)),
- ("abc", "zabc", Some(1), Some(1)),
- ("abc", "zzabc", Some(2), Some(2)),
- ("abc", "azbc", None, None),
- ("abc", "abzc", None, None),
- ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)),
- ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)),
- // Failures caught by quickcheck.
- ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)),
- ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None),
-];
-
-#[test]
-fn unit_twoway_fwd() {
- run_search_tests_fwd("TwoWay", |n, h| TwoWay::forward(n).find(h));
-}
-
-#[test]
-fn unit_twoway_rev() {
- run_search_tests_rev("TwoWay", |n, h| TwoWay::reverse(n).rfind(h));
-}
-
-/// Run the substring search tests. `name` should be the type of searcher used,
-/// for diagnostics. `search` should be a closure that accepts a needle and a
-/// haystack and returns the starting position of the first occurrence of
-/// needle in the haystack, or `None` if one doesn't exist.
-fn run_search_tests_fwd(
- name: &str,
- mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
-) {
- for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS {
- let (n, h) = (needle.as_bytes(), haystack.as_bytes());
- assert_eq!(
- expected_fwd,
- search(n, h),
- "{}: needle: {:?}, haystack: {:?}, expected: {:?}",
- name,
- n,
- h,
- expected_fwd
- );
- }
-}
-
-/// Run the substring search tests. `name` should be the type of searcher used,
-/// for diagnostics. `search` should be a closure that accepts a needle and a
-/// haystack and returns the starting position of the last occurrence of
-/// needle in the haystack, or `None` if one doesn't exist.
-fn run_search_tests_rev(
- name: &str,
- mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
-) {
- for &(needle, haystack, _, expected_rev) in SEARCH_TESTS {
- let (n, h) = (needle.as_bytes(), haystack.as_bytes());
- assert_eq!(
- expected_rev,
- search(n, h),
- "{}: needle: {:?}, haystack: {:?}, expected: {:?}",
- name,
- n,
- h,
- expected_rev
- );
- }
-}
-
-quickcheck! {
- fn qc_twoway_fwd_prefix_is_substring(bs: Vec<u8>) -> bool {
- prop_prefix_is_substring(false, &bs, |n, h| TwoWay::forward(n).find(h))
- }
-
- fn qc_twoway_fwd_suffix_is_substring(bs: Vec<u8>) -> bool {
- prop_suffix_is_substring(false, &bs, |n, h| TwoWay::forward(n).find(h))
- }
-
- fn qc_twoway_rev_prefix_is_substring(bs: Vec<u8>) -> bool {
- prop_prefix_is_substring(true, &bs, |n, h| TwoWay::reverse(n).rfind(h))
- }
-
- fn qc_twoway_rev_suffix_is_substring(bs: Vec<u8>) -> bool {
- prop_suffix_is_substring(true, &bs, |n, h| TwoWay::reverse(n).rfind(h))
- }
-
- fn qc_twoway_fwd_matches_naive(
- needle: Vec<u8>,
- haystack: Vec<u8>
- ) -> bool {
- prop_matches_naive(
- false,
- &needle,
- &haystack,
- |n, h| TwoWay::forward(n).find(h),
- )
- }
-
- fn qc_twoway_rev_matches_naive(
- needle: Vec<u8>,
- haystack: Vec<u8>
- ) -> bool {
- prop_matches_naive(
- true,
- &needle,
- &haystack,
- |n, h| TwoWay::reverse(n).rfind(h),
- )
- }
-}
-
-/// Check that every prefix of the given byte string is a substring.
-fn prop_prefix_is_substring(
- reverse: bool,
- bs: &[u8],
- mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
-) -> bool {
- if bs.is_empty() {
- return true;
- }
- for i in 0..(bs.len() - 1) {
- let prefix = &bs[..i];
- if reverse {
- assert_eq!(naive_rfind(prefix, bs), search(prefix, bs));
- } else {
- assert_eq!(naive_find(prefix, bs), search(prefix, bs));
- }
- }
- true
-}
-
-/// Check that every suffix of the given byte string is a substring.
-fn prop_suffix_is_substring(
- reverse: bool,
- bs: &[u8],
- mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
-) -> bool {
- if bs.is_empty() {
- return true;
- }
- for i in 0..(bs.len() - 1) {
- let suffix = &bs[i..];
- if reverse {
- assert_eq!(naive_rfind(suffix, bs), search(suffix, bs));
- } else {
- assert_eq!(naive_find(suffix, bs), search(suffix, bs));
- }
- }
- true
-}
-
-/// Check that naive substring search matches the result of the given search
-/// algorithm.
-fn prop_matches_naive(
- reverse: bool,
- needle: &[u8],
- haystack: &[u8],
- mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
-) -> bool {
- if reverse {
- naive_rfind(needle, haystack) == search(needle, haystack)
- } else {
- naive_find(needle, haystack) == search(needle, haystack)
- }
-}
-
-/// Naively search forwards for the given needle in the given haystack.
-fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
- if needle.is_empty() {
- return Some(0);
- } else if haystack.len() < needle.len() {
- return None;
- }
- for i in 0..(haystack.len() - needle.len() + 1) {
- if needle == &haystack[i..i + needle.len()] {
- return Some(i);
- }
- }
- None
-}
-
-/// Naively search in reverse for the given needle in the given haystack.
-fn naive_rfind(needle: &[u8], haystack: &[u8]) -> Option<usize> {
- if needle.is_empty() {
- return Some(haystack.len());
- } else if haystack.len() < needle.len() {
- return None;
- }
- for i in (0..(haystack.len() - needle.len() + 1)).rev() {
- if needle == &haystack[i..i + needle.len()] {
- return Some(i);
- }
- }
- None
-}
diff --git a/src/search/twoway.rs b/src/search/twoway.rs
deleted file mode 100644
index 5f1e8cf..0000000
--- a/src/search/twoway.rs
+++ /dev/null
@@ -1,871 +0,0 @@
-use core::cmp;
-
-use cow::CowBytes;
-use ext_slice::ByteSlice;
-use search::prefilter::{Freqy, PrefilterState};
-
-/// An implementation of the TwoWay substring search algorithm, with heuristics
-/// for accelerating search based on frequency analysis.
-///
-/// This searcher supports forward and reverse search, although not
-/// simultaneously. It runs in O(n + m) time and O(1) space, where
-/// `n ~ len(needle)` and `m ~ len(haystack)`.
-///
-/// The implementation here roughly matches that which was developed by
-/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
-/// only change in this implementation is the use of zero-based indices and
-/// the addition of heuristics for a fast skip loop. That is, this will detect
-/// bytes that are believed to be rare in the needle and use fast vectorized
-/// instructions to find their occurrences quickly. The Two-Way algorithm is
-/// then used to confirm whether a match at that location occurred.
-///
-/// The heuristic for fast skipping is automatically shut off if it's
-/// detected to be ineffective at search time. Generally, this only occurs in
-/// pathological cases. But this is generally necessary in order to preserve
-/// a `O(n + m)` time bound.
-///
-/// The code below is fairly complex and not obviously correct at all. It's
-/// likely necessary to read the Two-Way paper cited above in order to fully
-/// grok this code.
-#[derive(Clone, Debug)]
-pub struct TwoWay<'b> {
- /// The needle that we're looking for.
- needle: CowBytes<'b>,
- /// An implementation of a fast skip loop based on hard-coded frequency
- /// data. This is only used when conditions are deemed favorable.
- freqy: Freqy,
- /// A critical position in needle. Specifically, this position corresponds
- /// to beginning of either the minimal or maximal suffix in needle. (N.B.
- /// See SuffixType below for why "minimal" isn't quite the correct word
- /// here.)
- ///
- /// This is the position at which every search begins. Namely, search
- /// starts by scanning text to the right of this position, and only if
- /// there's a match does the text to the left of this position get scanned.
- critical_pos: usize,
- /// The amount we shift by in the Two-Way search algorithm. This
- /// corresponds to the "small period" and "large period" cases.
- shift: Shift,
-}
-
-impl<'b> TwoWay<'b> {
- /// Create a searcher that uses the Two-Way algorithm by searching forwards
- /// through any haystack.
- pub fn forward(needle: &'b [u8]) -> TwoWay<'b> {
- let freqy = Freqy::forward(needle);
- if needle.is_empty() {
- return TwoWay {
- needle: CowBytes::new(needle),
- freqy,
- critical_pos: 0,
- shift: Shift::Large { shift: 0 },
- };
- }
-
- let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
- let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
- let (period_lower_bound, critical_pos) =
- if min_suffix.pos > max_suffix.pos {
- (min_suffix.period, min_suffix.pos)
- } else {
- (max_suffix.period, max_suffix.pos)
- };
- let shift = Shift::forward(needle, period_lower_bound, critical_pos);
- let needle = CowBytes::new(needle);
- TwoWay { needle, freqy, critical_pos, shift }
- }
-
- /// Create a searcher that uses the Two-Way algorithm by searching in
- /// reverse through any haystack.
- pub fn reverse(needle: &'b [u8]) -> TwoWay<'b> {
- let freqy = Freqy::reverse(needle);
- if needle.is_empty() {
- return TwoWay {
- needle: CowBytes::new(needle),
- freqy,
- critical_pos: 0,
- shift: Shift::Large { shift: 0 },
- };
- }
-
- let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
- let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
- let (period_lower_bound, critical_pos) =
- if min_suffix.pos < max_suffix.pos {
- (min_suffix.period, min_suffix.pos)
- } else {
- (max_suffix.period, max_suffix.pos)
- };
- let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
- let needle = CowBytes::new(needle);
- TwoWay { needle, freqy, critical_pos, shift }
- }
-
- /// Return a fresh prefilter state that can be used with this searcher.
- /// A prefilter state is used to track the effectiveness of a searcher's
- /// prefilter for speeding up searches. Therefore, the prefilter state
- /// should generally be reused on subsequent searches (such as in an
- /// iterator). For searches on a different haystack, then a new prefilter
- /// state should be used.
- ///
- /// This always initializes a valid prefilter state even if this searcher
- /// does not have a prefilter enabled.
- pub fn prefilter_state(&self) -> PrefilterState {
- self.freqy.prefilter_state()
- }
-
- /// Return the needle used by this searcher.
- pub fn needle(&self) -> &[u8] {
- self.needle.as_slice()
- }
-
- /// Convert this searched into an owned version, where the needle is
- /// copied if it isn't already owned.
- #[cfg(feature = "std")]
- pub fn into_owned(self) -> TwoWay<'static> {
- TwoWay {
- needle: self.needle.into_owned(),
- freqy: self.freqy,
- critical_pos: self.critical_pos,
- shift: self.shift,
- }
- }
-
- /// Find the position of the first occurrence of this searcher's needle in
- /// the given haystack. If one does not exist, then return None.
- ///
- /// This will automatically initialize prefilter state. This should only
- /// be used for one-off searches.
- pub fn find(&self, haystack: &[u8]) -> Option<usize> {
- self.find_with(&mut self.prefilter_state(), haystack)
- }
-
- /// Find the position of the last occurrence of this searcher's needle
- /// in the given haystack. If one does not exist, then return None.
- ///
- /// This will automatically initialize prefilter state. This should only
- /// be used for one-off searches.
- pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
- self.rfind_with(&mut self.prefilter_state(), haystack)
- }
-
- /// Find the position of the first occurrence of this searcher's needle in
- /// the given haystack. If one does not exist, then return None.
- ///
- /// This accepts prefilter state that is useful when using the same
- /// searcher multiple times, such as in an iterator.
- pub fn find_with(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- ) -> Option<usize> {
- if self.needle.is_empty() {
- return Some(0);
- } else if haystack.len() < self.needle.len() {
- return None;
- } else if self.needle.len() == 1 {
- return haystack.find_byte(self.needle[0]);
- }
- match self.shift {
- Shift::Small { period } => {
- self.find_small(prestate, haystack, period)
- }
- Shift::Large { shift } => {
- self.find_large(prestate, haystack, shift)
- }
- }
- }
-
- /// Find the position of the last occurrence of this searcher's needle
- /// in the given haystack. If one does not exist, then return None.
- ///
- /// This accepts prefilter state that is useful when using the same
- /// searcher multiple times, such as in an iterator.
- pub fn rfind_with(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- ) -> Option<usize> {
- if self.needle.is_empty() {
- return Some(haystack.len());
- } else if haystack.len() < self.needle.len() {
- return None;
- } else if self.needle.len() == 1 {
- return haystack.rfind_byte(self.needle[0]);
- }
- match self.shift {
- Shift::Small { period } => {
- self.rfind_small(prestate, haystack, period)
- }
- Shift::Large { shift } => {
- self.rfind_large(prestate, haystack, shift)
- }
- }
- }
-
- // Below is the actual implementation of TwoWay searching, including both
- // forwards and backwards searching. Each forward and reverse search has
- // two fairly similar implementations, each handling the small and large
- // period cases, for a total 4 different search routines.
- //
- // On top of that, each search implementation can be accelerated by a
- // Freqy prefilter, but it is not always enabled. To avoid its overhead
- // when its disabled, we explicitly inline each search implementation based
- // on whether Freqy will be used or not. This brings us up to a total of
- // 8 monomorphized versions of the search code.
-
- #[inline(never)]
- fn find_small(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- period: usize,
- ) -> Option<usize> {
- if prestate.is_effective() {
- self.find_small_imp(prestate, true, haystack, period)
- } else {
- self.find_small_imp(prestate, false, haystack, period)
- }
- }
-
- #[inline(always)]
- fn find_small_imp(
- &self,
- prestate: &mut PrefilterState,
- prefilter: bool,
- haystack: &[u8],
- period: usize,
- ) -> Option<usize> {
- let needle = self.needle.as_slice();
- let mut pos = 0;
- let mut shift = 0;
- while pos + needle.len() <= haystack.len() {
- let mut i = cmp::max(self.critical_pos, shift);
- if prefilter && prestate.is_effective() {
- match self.freqy.find_candidate(prestate, &haystack[pos..]) {
- None => return None,
- Some(found) => {
- shift = 0;
- i = self.critical_pos;
- pos += found;
- if pos + needle.len() > haystack.len() {
- return None;
- }
- }
- }
- }
- while i < needle.len() && needle[i] == haystack[pos + i] {
- i += 1;
- }
- if i < needle.len() {
- pos += i - self.critical_pos + 1;
- shift = 0;
- } else {
- let mut j = self.critical_pos;
- while j > shift && needle[j] == haystack[pos + j] {
- j -= 1;
- }
- if j <= shift && needle[shift] == haystack[pos + shift] {
- return Some(pos);
- }
- pos += period;
- shift = needle.len() - period;
- }
- }
- None
- }
-
- #[inline(never)]
- fn find_large(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- shift: usize,
- ) -> Option<usize> {
- if prestate.is_effective() {
- self.find_large_imp(prestate, true, haystack, shift)
- } else {
- self.find_large_imp(prestate, false, haystack, shift)
- }
- }
-
- #[inline(always)]
- fn find_large_imp(
- &self,
- prestate: &mut PrefilterState,
- prefilter: bool,
- haystack: &[u8],
- shift: usize,
- ) -> Option<usize> {
- let needle = self.needle.as_slice();
- let mut pos = 0;
- while pos + needle.len() <= haystack.len() {
- let mut i = self.critical_pos;
- if prefilter && prestate.is_effective() {
- match self.freqy.find_candidate(prestate, &haystack[pos..]) {
- None => return None,
- Some(found) => {
- pos += found;
- if pos + needle.len() > haystack.len() {
- return None;
- }
- }
- }
- }
- while i < needle.len() && needle[i] == haystack[pos + i] {
- i += 1;
- }
- if i < needle.len() {
- pos += i - self.critical_pos + 1;
- } else {
- let mut j = self.critical_pos;
- while j > 0 && needle[j] == haystack[pos + j] {
- j -= 1;
- }
- if j == 0 && needle[0] == haystack[pos] {
- return Some(pos);
- }
- pos += shift;
- }
- }
- None
- }
-
- #[inline(never)]
- fn rfind_small(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- period: usize,
- ) -> Option<usize> {
- if prestate.is_effective() {
- self.rfind_small_imp(prestate, true, haystack, period)
- } else {
- self.rfind_small_imp(prestate, false, haystack, period)
- }
- }
-
- #[inline(always)]
- fn rfind_small_imp(
- &self,
- prestate: &mut PrefilterState,
- prefilter: bool,
- haystack: &[u8],
- period: usize,
- ) -> Option<usize> {
- let needle = &*self.needle;
- let nlen = needle.len();
- let mut pos = haystack.len();
- let mut shift = nlen;
- while pos >= nlen {
- let mut i = cmp::min(self.critical_pos, shift);
- if prefilter && prestate.is_effective() {
- match self.freqy.rfind_candidate(prestate, &haystack[..pos]) {
- None => return None,
- Some(found) => {
- shift = nlen;
- i = self.critical_pos;
- pos = found;
- if pos < nlen {
- return None;
- }
- }
- }
- }
- while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
- i -= 1;
- }
- if i > 0 || needle[0] != haystack[pos - nlen] {
- pos -= self.critical_pos - i + 1;
- shift = nlen;
- } else {
- let mut j = self.critical_pos;
- while j < shift && needle[j] == haystack[pos - nlen + j] {
- j += 1;
- }
- if j == shift {
- return Some(pos - nlen);
- }
- pos -= period;
- shift = period;
- }
- }
- None
- }
-
- #[inline(never)]
- fn rfind_large(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- shift: usize,
- ) -> Option<usize> {
- if prestate.is_effective() {
- self.rfind_large_imp(prestate, true, haystack, shift)
- } else {
- self.rfind_large_imp(prestate, false, haystack, shift)
- }
- }
-
- #[inline(always)]
- fn rfind_large_imp(
- &self,
- prestate: &mut PrefilterState,
- prefilter: bool,
- haystack: &[u8],
- shift: usize,
- ) -> Option<usize> {
- let needle = &*self.needle;
- let nlen = needle.len();
- let mut pos = haystack.len();
- while pos >= nlen {
- if prefilter && prestate.is_effective() {
- match self.freqy.rfind_candidate(prestate, &haystack[..pos]) {
- None => return None,
- Some(found) => {
- pos = found;
- if pos < nlen {
- return None;
- }
- }
- }
- }
-
- let mut i = self.critical_pos;
- while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
- i -= 1;
- }
- if i > 0 || needle[0] != haystack[pos - nlen] {
- pos -= self.critical_pos - i + 1;
- } else {
- let mut j = self.critical_pos;
- while j < nlen && needle[j] == haystack[pos - nlen + j] {
- j += 1;
- }
- if j == nlen {
- return Some(pos - nlen);
- }
- pos -= shift;
- }
- }
- None
- }
-}
-
-/// A representation of the amount we're allowed to shift by during Two-Way
-/// search.
-///
-/// When computing a critical factorization of the needle, we find the position
-/// of the critical factorization by finding the needle's maximal (or minimal)
-/// suffix, along with the period of that suffix. It turns out that the period
-/// of that suffix is a lower bound on the period of the needle itself.
-///
-/// This lower bound is equivalent to the actual period of the needle in
-/// some cases. To describe that case, we denote the needle as `x` where
-/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
-/// bound given here is always the period of `v`, which is `<= period(x)`. The
-/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
-/// where `u` is a suffix of `v[0..period(v)]`.
-///
-/// This case is important because the search algorithm for when the
-/// periods are equivalent is slightly different than the search algorithm
-/// for when the periods are not equivalent. In particular, when they aren't
-/// equivalent, we know that the period of the needle is no less than half its
-/// length. In this case, we shift by an amount less than or equal to the
-/// period of the needle (determined by the maximum length of the components
-/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
-///
-/// The above two cases are represented by the variants below. Each entails
-/// a different instantiation of the Two-Way search algorithm.
-///
-/// N.B. If we could find a way to compute the exact period in all cases,
-/// then we could collapse this case analysis and simplify the algorithm. The
-/// Two-Way paper suggests this is possible, but more reading is required to
-/// grok why the authors didn't pursue that path.
-#[derive(Clone, Debug)]
-enum Shift {
- Small { period: usize },
- Large { shift: usize },
-}
-
-impl Shift {
- /// Compute the shift for a given needle in the forward direction.
- ///
- /// This requires a lower bound on the period and a critical position.
- /// These can be computed by extracting both the minimal and maximal
- /// lexicographic suffixes, and choosing the right-most starting position.
- /// The lower bound on the period is then the period of the chosen suffix.
- fn forward(
- needle: &[u8],
- period_lower_bound: usize,
- critical_pos: usize,
- ) -> Shift {
- let large = cmp::max(critical_pos, needle.len() - critical_pos);
- if critical_pos * 2 >= needle.len() {
- return Shift::Large { shift: large };
- }
-
- let (u, v) = needle.split_at(critical_pos);
- if !v[..period_lower_bound].ends_with(u) {
- return Shift::Large { shift: large };
- }
- Shift::Small { period: period_lower_bound }
- }
-
- /// Compute the shift for a given needle in the reverse direction.
- ///
- /// This requires a lower bound on the period and a critical position.
- /// These can be computed by extracting both the minimal and maximal
- /// lexicographic suffixes, and choosing the left-most starting position.
- /// The lower bound on the period is then the period of the chosen suffix.
- fn reverse(
- needle: &[u8],
- period_lower_bound: usize,
- critical_pos: usize,
- ) -> Shift {
- let large = cmp::max(critical_pos, needle.len() - critical_pos);
- if (needle.len() - critical_pos) * 2 >= needle.len() {
- return Shift::Large { shift: large };
- }
-
- let (v, u) = needle.split_at(critical_pos);
- if !v[v.len() - period_lower_bound..].starts_with(u) {
- return Shift::Large { shift: large };
- }
- Shift::Small { period: period_lower_bound }
- }
-}
-
-/// A suffix extracted from a needle along with its period.
-#[derive(Debug)]
-struct Suffix {
- /// The starting position of this suffix.
- ///
- /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
- /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
- /// forward suffixes, this is an inclusive starting position, where as for
- /// reverse suffixes, this is an exclusive ending position.
- pos: usize,
- /// The period of this suffix.
- ///
- /// Note that this is NOT necessarily the period of the string from which
- /// this suffix comes from. (It is always less than or equal to the period
- /// of the original string.)
- period: usize,
-}
-
-impl Suffix {
- fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
- debug_assert!(!needle.is_empty());
-
- // suffix represents our maximal (or minimal) suffix, along with
- // its period.
- let mut suffix = Suffix { pos: 0, period: 1 };
- // The start of a suffix in `needle` that we are considering as a
- // more maximal (or minimal) suffix than what's in `suffix`.
- let mut candidate_start = 1;
- // The current offset of our suffixes that we're comparing.
- //
- // When the characters at this offset are the same, then we mush on
- // to the next position since no decision is possible. When the
- // candidate's character is greater (or lesser) than the corresponding
- // character than our current maximal (or minimal) suffix, then the
- // current suffix is changed over to the candidate and we restart our
- // search. Otherwise, the candidate suffix is no good and we restart
- // our search on the next candidate.
- //
- // The three cases above correspond to the three cases in the loop
- // below.
- let mut offset = 0;
-
- while candidate_start + offset < needle.len() {
- let current = needle[suffix.pos + offset];
- let candidate = needle[candidate_start + offset];
- match kind.cmp(current, candidate) {
- SuffixOrdering::Accept => {
- suffix = Suffix { pos: candidate_start, period: 1 };
- candidate_start += 1;
- offset = 0;
- }
- SuffixOrdering::Skip => {
- candidate_start += offset + 1;
- offset = 0;
- suffix.period = candidate_start - suffix.pos;
- }
- SuffixOrdering::Push => {
- if offset + 1 == suffix.period {
- candidate_start += suffix.period;
- offset = 0;
- } else {
- offset += 1;
- }
- }
- }
- }
- suffix
- }
-
- fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
- debug_assert!(!needle.is_empty());
-
- // See the comments in `forward` for how this works.
- let mut suffix = Suffix { pos: needle.len(), period: 1 };
- if needle.len() == 1 {
- return suffix;
- }
- let mut candidate_start = needle.len() - 1;
- let mut offset = 0;
-
- while offset < candidate_start {
- let current = needle[suffix.pos - offset - 1];
- let candidate = needle[candidate_start - offset - 1];
- match kind.cmp(current, candidate) {
- SuffixOrdering::Accept => {
- suffix = Suffix { pos: candidate_start, period: 1 };
- candidate_start -= 1;
- offset = 0;
- }
- SuffixOrdering::Skip => {
- candidate_start -= offset + 1;
- offset = 0;
- suffix.period = suffix.pos - candidate_start;
- }
- SuffixOrdering::Push => {
- if offset + 1 == suffix.period {
- candidate_start -= suffix.period;
- offset = 0;
- } else {
- offset += 1;
- }
- }
- }
- }
- suffix
- }
-}
-
-/// The kind of suffix to extract.
-#[derive(Clone, Copy, Debug)]
-enum SuffixKind {
- /// Extract the smallest lexicographic suffix from a string.
- ///
- /// Technically, this doesn't actually pick the smallest lexicographic
- /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
- /// the latter over the former, even though `a < aa`. The reasoning for
- /// this isn't clear from the paper, but it still smells like a minimal
- /// suffix.
- Minimal,
- /// Extract the largest lexicographic suffix from a string.
- ///
- /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
- /// the choice between `z` and `zz`, this will choose the latter over the
- /// former.
- Maximal,
-}
-
-/// The result of comparing corresponding bytes between two suffixes.
-#[derive(Clone, Copy, Debug)]
-enum SuffixOrdering {
- /// This occurs when the given candidate byte indicates that the candidate
- /// suffix is better than the current maximal (or minimal) suffix. That is,
- /// the current candidate suffix should supplant the current maximal (or
- /// minimal) suffix.
- Accept,
- /// This occurs when the given candidate byte excludes the candidate suffix
- /// from being better than the current maximal (or minimal) suffix. That
- /// is, the current candidate suffix should be dropped and the next one
- /// should be considered.
- Skip,
- /// This occurs when no decision to accept or skip the candidate suffix
- /// can be made, e.g., when corresponding bytes are equivalent. In this
- /// case, the next corresponding bytes should be compared.
- Push,
-}
-
-impl SuffixKind {
- /// Returns true if and only if the given candidate byte indicates that
- /// it should replace the current suffix as the maximal (or minimal)
- /// suffix.
- fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
- use self::SuffixOrdering::*;
-
- match self {
- SuffixKind::Minimal if candidate < current => Accept,
- SuffixKind::Minimal if candidate > current => Skip,
- SuffixKind::Minimal => Push,
- SuffixKind::Maximal if candidate > current => Accept,
- SuffixKind::Maximal if candidate < current => Skip,
- SuffixKind::Maximal => Push,
- }
- }
-}
-
-// N.B. There are more holistic tests in src/search/tests.rs.
-#[cfg(test)]
-mod tests {
- use super::*;
- use ext_slice::B;
-
- /// Convenience wrapper for computing the suffix as a byte string.
- fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
- let s = Suffix::forward(needle, kind);
- (&needle[s.pos..], s.period)
- }
-
- /// Convenience wrapper for computing the reverse suffix as a byte string.
- fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
- let s = Suffix::reverse(needle, kind);
- (&needle[..s.pos], s.period)
- }
-
- /// Return all of the non-empty suffixes in the given byte string.
- fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
- (0..bytes.len()).map(|i| &bytes[i..]).collect()
- }
-
- /// Return the lexicographically maximal suffix of the given byte string.
- fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
- let mut sufs = suffixes(needle);
- sufs.sort();
- sufs.pop().unwrap()
- }
-
- /// Return the lexicographically maximal suffix of the reverse of the given
- /// byte string.
- fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
- let mut reversed = needle.to_vec();
- reversed.reverse();
- let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
- got.reverse();
- got
- }
-
- #[test]
- fn suffix_forward() {
- macro_rules! assert_suffix_min {
- ($given:expr, $expected:expr, $period:expr) => {
- let (got_suffix, got_period) =
- get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
- assert_eq!((B($expected), $period), (got_suffix, got_period));
- };
- }
-
- macro_rules! assert_suffix_max {
- ($given:expr, $expected:expr, $period:expr) => {
- let (got_suffix, got_period) =
- get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
- assert_eq!((B($expected), $period), (got_suffix, got_period));
- };
- }
-
- assert_suffix_min!("a", "a", 1);
- assert_suffix_max!("a", "a", 1);
-
- assert_suffix_min!("ab", "ab", 2);
- assert_suffix_max!("ab", "b", 1);
-
- assert_suffix_min!("ba", "a", 1);
- assert_suffix_max!("ba", "ba", 2);
-
- assert_suffix_min!("abc", "abc", 3);
- assert_suffix_max!("abc", "c", 1);
-
- assert_suffix_min!("acb", "acb", 3);
- assert_suffix_max!("acb", "cb", 2);
-
- assert_suffix_min!("cba", "a", 1);
- assert_suffix_max!("cba", "cba", 3);
-
- assert_suffix_min!("abcabc", "abcabc", 3);
- assert_suffix_max!("abcabc", "cabc", 3);
-
- assert_suffix_min!("abcabcabc", "abcabcabc", 3);
- assert_suffix_max!("abcabcabc", "cabcabc", 3);
-
- assert_suffix_min!("abczz", "abczz", 5);
- assert_suffix_max!("abczz", "zz", 1);
-
- assert_suffix_min!("zzabc", "abc", 3);
- assert_suffix_max!("zzabc", "zzabc", 5);
-
- assert_suffix_min!("aaa", "aaa", 1);
- assert_suffix_max!("aaa", "aaa", 1);
-
- assert_suffix_min!("foobar", "ar", 2);
- assert_suffix_max!("foobar", "r", 1);
- }
-
- #[test]
- fn suffix_reverse() {
- macro_rules! assert_suffix_min {
- ($given:expr, $expected:expr, $period:expr) => {
- let (got_suffix, got_period) =
- get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
- assert_eq!((B($expected), $period), (got_suffix, got_period));
- };
- }
-
- macro_rules! assert_suffix_max {
- ($given:expr, $expected:expr, $period:expr) => {
- let (got_suffix, got_period) =
- get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
- assert_eq!((B($expected), $period), (got_suffix, got_period));
- };
- }
-
- assert_suffix_min!("a", "a", 1);
- assert_suffix_max!("a", "a", 1);
-
- assert_suffix_min!("ab", "a", 1);
- assert_suffix_max!("ab", "ab", 2);
-
- assert_suffix_min!("ba", "ba", 2);
- assert_suffix_max!("ba", "b", 1);
-
- assert_suffix_min!("abc", "a", 1);
- assert_suffix_max!("abc", "abc", 3);
-
- assert_suffix_min!("acb", "a", 1);
- assert_suffix_max!("acb", "ac", 2);
-
- assert_suffix_min!("cba", "cba", 3);
- assert_suffix_max!("cba", "c", 1);
-
- assert_suffix_min!("abcabc", "abca", 3);
- assert_suffix_max!("abcabc", "abcabc", 3);
-
- assert_suffix_min!("abcabcabc", "abcabca", 3);
- assert_suffix_max!("abcabcabc", "abcabcabc", 3);
-
- assert_suffix_min!("abczz", "a", 1);
- assert_suffix_max!("abczz", "abczz", 5);
-
- assert_suffix_min!("zzabc", "zza", 3);
- assert_suffix_max!("zzabc", "zz", 1);
-
- assert_suffix_min!("aaa", "aaa", 1);
- assert_suffix_max!("aaa", "aaa", 1);
- }
-
- quickcheck! {
- fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
- if bytes.is_empty() {
- return true;
- }
-
- let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
- let expected = naive_maximal_suffix_forward(&bytes);
- got == expected
- }
-
- fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
- if bytes.is_empty() {
- return true;
- }
-
- let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
- let expected = naive_maximal_suffix_reverse(&bytes);
- expected == got
- }
- }
-}
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
index 317ba96..b53b1d7 100644
--- a/src/unicode/fsm/grapheme_break_fwd.rs
+++ b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
index db6b6ee..93e888c 100644
--- a/src/unicode/fsm/grapheme_break_rev.rs
+++ b/src/unicode/fsm/grapheme_break_rev.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
index 3b6beff..2bf7e4c 100644
--- a/src/unicode/fsm/regional_indicator_rev.rs
+++ b/src/unicode/fsm/regional_indicator_rev.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator}
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
index 46ecfcf..cc937a4 100644
--- a/src/unicode/fsm/sentence_break_fwd.rs
+++ b/src/unicode/fsm/sentence_break_fwd.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
index c5fabe3..f1f3da5 100644
--- a/src/unicode/fsm/simple_word_fwd.rs
+++ b/src/unicode/fsm/simple_word_fwd.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
index ea68582..419b5d4 100644
--- a/src/unicode/fsm/whitespace_anchored_fwd.rs
+++ b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa b/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa
index bb217f1..427d3a9 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa
+++ b/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa b/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa
index a7cb5a7..7cc3a0a 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa
+++ b/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
index 72b444e..301b03c 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.rs
+++ b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -1,41 +1,45 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
-// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --minimize --state-size 1 src/unicode/fsm/ \s+
+// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u16; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u16; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
index 52e6bc2..fb041b7 100644
--- a/src/unicode/fsm/word_break_fwd.rs
+++ b/src/unicode/fsm/word_break_fwd.rs
@@ -2,40 +2,44 @@
//
// ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.8 is available on crates.io.
+// ucd-generate 0.2.9 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
#[cfg(target_endian = "little")]
-lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
- #[repr(C)]
- struct Aligned<B: ?Sized> {
- _align: [u8; 0],
- bytes: B,
- }
+lazy_static::lazy_static! {
+ pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+ #[repr(C)]
+ struct Aligned<B: ?Sized> {
+ _align: [u8; 0],
+ bytes: B,
+ }
- static ALIGNED: &'static Aligned<[u8]> = &Aligned {
- _align: [],
- bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
- };
-
- unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+ static ALIGNED: &'static Aligned<[u8]> = &Aligned {
+ _align: [],
+ bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
};
+
+ unsafe {
+ ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
+ }
+ };
}
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index e40a0de..ad31cf1 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,10 @@
use regex_automata::DFA;
-use ext_slice::ByteSlice;
-use unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use utf8;
+use crate::ext_slice::ByteSlice;
+use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
+use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
+use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
+use crate::utf8;
/// An iterator over grapheme clusters in a byte string.
///
@@ -262,8 +262,8 @@ mod tests {
use ucd_parse::GraphemeClusterBreakTest;
use super::*;
- use ext_slice::ByteSlice;
- use tests::LOSSY_TESTS;
+ use crate::ext_slice::ByteSlice;
+ use crate::tests::LOSSY_TESTS;
#[test]
fn forward_ucd() {
diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs
index 01f5473..063f342 100644
--- a/src/unicode/sentence.rs
+++ b/src/unicode/sentence.rs
@@ -1,8 +1,8 @@
use regex_automata::DFA;
-use ext_slice::ByteSlice;
-use unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
-use utf8;
+use crate::ext_slice::ByteSlice;
+use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
+use crate::utf8;
/// An iterator over sentences in a byte string.
///
@@ -160,7 +160,7 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) {
mod tests {
use ucd_parse::SentenceBreakTest;
- use ext_slice::ByteSlice;
+ use crate::ext_slice::ByteSlice;
#[test]
fn forward_ucd() {
diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs
index a8da144..949a83f 100644
--- a/src/unicode/whitespace.rs
+++ b/src/unicode/whitespace.rs
@@ -1,7 +1,7 @@
use regex_automata::DFA;
-use unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
-use unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
+use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
+use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
/// Return the first position of a non-whitespace character.
pub fn whitespace_len_fwd(slice: &[u8]) -> usize {
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index 1260e52..e0a5701 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,9 @@
use regex_automata::DFA;
-use ext_slice::ByteSlice;
-use unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use utf8;
+use crate::ext_slice::ByteSlice;
+use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
+use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
+use crate::utf8;
/// An iterator over words in a byte string.
///
@@ -320,7 +320,7 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
mod tests {
use ucd_parse::WordBreakTest;
- use ext_slice::ByteSlice;
+ use crate::ext_slice::ByteSlice;
#[test]
fn forward_ucd() {
diff --git a/src/utf8.rs b/src/utf8.rs
index 2d4035d..5c7de36 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -5,9 +5,9 @@ use core::str;
#[cfg(feature = "std")]
use std::error;
-use ascii;
-use bstr::BStr;
-use ext_slice::ByteSlice;
+use crate::ascii;
+use crate::bstr::BStr;
+use crate::ext_slice::ByteSlice;
// The UTF-8 decoder provided here is based on the one presented here:
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
@@ -459,7 +459,7 @@ impl error::Error for Utf8Error {
}
impl fmt::Display for Utf8Error {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
}
}
@@ -858,9 +858,9 @@ fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
mod tests {
use std::char;
- use ext_slice::{ByteSlice, B};
- use tests::LOSSY_TESTS;
- use utf8::{self, Utf8Error};
+ use crate::ext_slice::{ByteSlice, B};
+ use crate::tests::LOSSY_TESTS;
+ use crate::utf8::{self, Utf8Error};
fn utf8e(valid_up_to: usize) -> Utf8Error {
Utf8Error { valid_up_to, error_len: None }