diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-03-25 12:32:28 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-03-25 12:32:28 +0000 |
commit | a55624750e44817ddcdde33749007356f9df3643 (patch) | |
tree | a1f8d4d3bb32c82cbfc9afbe6fe7949ab517d67f | |
parent | 12106570c7b85a4ce86f116be0fbb99c813936db (diff) | |
parent | 1faff9be927c85d1dfb151bc7975d02f697854df (diff) | |
download | bstr-android13-mainline-go-extservices-release.tar.gz |
Snap for 8358640 from 1faff9be927c85d1dfb151bc7975d02f697854df to mainline-go-extservices-releaseaml_go_ext_330912000android13-mainline-go-extservices-release
Change-Id: I4869a2663031ff8dd7559be4032f4a7fb6dad954
44 files changed, 448 insertions, 2350 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index f1ec46c..ef4fb69 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,5 @@ { "git": { - "sha1": "a444256ca7407fe180ee32534688549655b7a38e" + "sha1": "e38e7a7ca986f9499b30202f49d79e531d14d192" } } @@ -1,4 +1,4 @@ -// This file is generated by cargo2android.py --run --device --dependencies. +// This file is generated by cargo2android.py --config cargo2android.json. // Do not modify this file as changes will be overridden on upgrade. package { @@ -43,8 +43,10 @@ rust_library { name: "libbstr", host_supported: true, crate_name: "bstr", + cargo_env_compat: true, + cargo_pkg_version: "0.2.17", srcs: ["src/lib.rs"], - edition: "2015", + edition: "2018", features: [ "default", "lazy_static", @@ -58,9 +60,3 @@ rust_library { "libregex_automata", ], } - -// dependent_library ["feature_list"] -// byteorder-1.4.3 -// lazy_static-1.4.0 -// memchr-2.3.4 "std,use_std" -// regex-automata-0.1.9 @@ -3,16 +3,16 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. [package] +edition = "2018" name = "bstr" -version = "0.2.15" +version = "0.2.17" authors = ["Andrew Gallant <jamslam@gmail.com>"] exclude = ["/.github"] description = "A string type that is not required to be valid UTF-8." @@ -29,11 +29,11 @@ debug = true [lib] bench = false [dependencies.lazy_static] -version = "1.2" +version = "1.2.0" optional = true [dependencies.memchr] -version = "2.1.2" +version = "2.4.0" default-features = false [dependencies.regex-automata] @@ -59,10 +59,5 @@ version = "1.2.1" default = ["std", "unicode"] serde1 = ["std", "serde1-nostd", "serde/std"] serde1-nostd = ["serde"] -std = ["memchr/use_std"] +std = ["memchr/std"] unicode = ["lazy_static", "regex-automata"] -[badges.appveyor] -repository = "BurntSushi/bstr" - -[badges.travis-ci] -repository = "BurntSushi/bstr" diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 585da1b..cbb6283 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "bstr" -version = "0.2.15" #:version +version = "0.2.17" #:version authors = ["Andrew Gallant <jamslam@gmail.com>"] description = "A string type that is not required to be valid UTF-8." documentation = "https://docs.rs/bstr" @@ -11,24 +11,24 @@ keywords = ["string", "str", "byte", "bytes", "text"] license = "MIT OR Apache-2.0" categories = ["text-processing", "encoding"] exclude = ["/.github"] +edition = "2018" -[badges] -travis-ci = { repository = "BurntSushi/bstr" } -appveyor = { repository = "BurntSushi/bstr" } +[workspace] +members = ["bench"] [lib] bench = false [features] default = ["std", "unicode"] -std = ["memchr/use_std"] +std = ["memchr/std"] unicode = ["lazy_static", "regex-automata"] serde1 = ["std", "serde1-nostd", "serde/std"] serde1-nostd = ["serde"] [dependencies] -memchr = { version = "2.1.2", default-features = false } -lazy_static = { version = "1.2", optional = true } +memchr = { version = "2.4.0", default-features = false } +lazy_static = { version = "1.2.0", optional = true } regex-automata = { version = "0.1.5", default-features = false, optional = true } serde = { version = "1.0.85", default-features = false, optional = true } @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/bstr/bstr-0.2.15.crate" + value: "https://static.crates.io/crates/bstr/bstr-0.2.17.crate" } - version: "0.2.15" + version: "0.2.17" license_type: NOTICE last_upgrade_date { year: 2021 - month: 4 - day: 1 + month: 9 + day: 22 } } @@ -158,7 +158,7 @@ and Unicode support. ### Minimum Rust version policy -This crate's minimum supported `rustc` version (MSRV) is `1.28.0`. +This crate's minimum supported `rustc` version (MSRV) is `1.41.1`. In general, this crate will be conservative with respect to the minimum supported version of Rust. MSRV may be bumped in minor version releases. @@ -171,12 +171,12 @@ My hope is to move to `1.0` within the next year and commit to its API so that `bstr` can be used as a public dependency. A large part of the API surface area was taken from the standard library, so -from an API design perspective, a good portion of this crate should be mature. -The main differences from the standard library are in how the various substring -search routines work. The standard library provides generic infrastructure for -supporting different types of searches with a single method, where as this -library prefers to define new methods for each type of search and drop the -generic infrastructure. +from an API design perspective, a good portion of this crate should be on solid +ground already. The main differences from the standard library are in how the +various substring search routines work. The standard library provides generic +infrastructure for supporting different types of searches with a single method, +where as this library prefers to define new methods for each type of search and +drop the generic infrastructure. Some _probable_ future considerations for APIs include, but are not limited to: @@ -195,10 +195,10 @@ Here are some examples that are _probably_ out of scope for this crate: The exact scope isn't quite clear, but I expect we can iterate on it. -In general, as stated below, this crate is an experiment in bringing lots of -related APIs together into a single crate while simultaneously attempting to -keep the total number of dependencies low. Indeed, every dependency of `bstr`, -except for `memchr`, is optional. +In general, as stated below, this crate brings lots of related APIs together +into a single crate while simultaneously attempting to keep the total number of +dependencies low. Indeed, every dependency of `bstr`, except for `memchr`, is +optional. ### High level motivation @@ -229,13 +229,10 @@ Consider, for example, trimming and splitting, along with their different variants. In other words, `bstr` is partially a way of pushing back against the -micro-crate ecosystem that appears to be evolving. It's not clear to me whether -this experiment will be successful or not, but it is definitely a goal of +micro-crate ecosystem that appears to be evolving. Namely, it is a goal of `bstr` to keep its dependency list lightweight. For example, `serde` is an -optional dependency because there is no feasible alternative, but `twoway` is -not, where we instead prefer to implement our own substring search. In service -of this philosophy, currently, the only required dependency of `bstr` is -`memchr`. +optional dependency because there is no feasible alternative. In service of +this philosophy, currently, the only required dependency of `bstr` is `memchr`. ### License diff --git a/TEST_MAPPING b/TEST_MAPPING new file mode 100644 index 0000000..3cbd48d --- /dev/null +++ b/TEST_MAPPING @@ -0,0 +1,17 @@ +// Generated by update_crate_tests.py for tests that depend on this crate. +{ + "imports": [ + { + "path": "external/rust/crates/base64" + }, + { + "path": "external/rust/crates/tinytemplate" + }, + { + "path": "external/rust/crates/tinyvec" + }, + { + "path": "external/rust/crates/unicode-xid" + } + ] +} diff --git a/cargo2android.json b/cargo2android.json new file mode 100644 index 0000000..bf78496 --- /dev/null +++ b/cargo2android.json @@ -0,0 +1,4 @@ +{ + "device": true, + "run": true +}
\ No newline at end of file diff --git a/examples/graphemes-std.rs b/examples/graphemes-std.rs index 3522736..647739d 100644 --- a/examples/graphemes-std.rs +++ b/examples/graphemes-std.rs @@ -1,5 +1,3 @@ -extern crate unicode_segmentation; - use std::error::Error; use std::io::{self, BufRead, Write}; @@ -18,8 +16,7 @@ fn main() -> Result<(), Box<dyn Error>> { .take(10) .last() .unwrap_or(line.len()); - #[allow(deprecated)] // for Rust 1.28.0 - stdout.write_all(line[..end].trim_right().as_bytes())?; + stdout.write_all(line[..end].trim_end().as_bytes())?; stdout.write_all(b"\n")?; line.clear(); diff --git a/examples/graphemes.rs b/examples/graphemes.rs index 2372490..6adc701 100644 --- a/examples/graphemes.rs +++ b/examples/graphemes.rs @@ -1,5 +1,3 @@ -extern crate bstr; - use std::error::Error; use std::io::{self, Write}; diff --git a/examples/lines.rs b/examples/lines.rs index 4b1045f..c392a22 100644 --- a/examples/lines.rs +++ b/examples/lines.rs @@ -1,5 +1,3 @@ -extern crate bstr; - use std::error::Error; use std::io::{self, Write}; diff --git a/examples/uppercase.rs b/examples/uppercase.rs index f9771e0..1eb798a 100644 --- a/examples/uppercase.rs +++ b/examples/uppercase.rs @@ -1,5 +1,3 @@ -extern crate bstr; - use std::error::Error; use std::io::{self, Write}; diff --git a/examples/words-std.rs b/examples/words-std.rs index 7eae116..aeeeb26 100644 --- a/examples/words-std.rs +++ b/examples/words-std.rs @@ -1,5 +1,3 @@ -extern crate unicode_segmentation; - use std::error::Error; use std::io::{self, BufRead}; diff --git a/examples/words.rs b/examples/words.rs index eb20c0d..db305da 100644 --- a/examples/words.rs +++ b/examples/words.rs @@ -1,5 +1,3 @@ -extern crate bstr; - use std::error::Error; use std::io; diff --git a/scripts/generate-unicode-data b/scripts/generate-unicode-data index 6b59fae..b8341c5 100755 --- a/scripts/generate-unicode-data +++ b/scripts/generate-unicode-data @@ -101,7 +101,7 @@ whitespace() { ucd-generate dfa \ --name WHITESPACE_ANCHORED_REV \ --reverse \ - --anchored --classes --premultiply --minimize --state-size 1 \ + --anchored --classes --premultiply --minimize --state-size 2 \ src/unicode/fsm/ \ "\s+" } diff --git a/src/bstring.rs b/src/bstring.rs index f04c651..30093ba 100644 --- a/src/bstring.rs +++ b/src/bstring.rs @@ -1,4 +1,4 @@ -use bstr::BStr; +use crate::bstr::BStr; /// A wrapper for `Vec<u8>` that provides convenient string oriented trait /// impls. diff --git a/src/byteset/mod.rs b/src/byteset/mod.rs index 969b0e3..043d309 100644 --- a/src/byteset/mod.rs +++ b/src/byteset/mod.rs @@ -81,8 +81,7 @@ pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> { #[cfg(test)] mod tests { - - quickcheck! { + quickcheck::quickcheck! { fn qc_byteset_forward_matches_naive( haystack: Vec<u8>, needles: Vec<u8> diff --git a/src/byteset/scalar.rs b/src/byteset/scalar.rs index 3fe1f53..9bd34a8 100644 --- a/src/byteset/scalar.rs +++ b/src/byteset/scalar.rs @@ -238,7 +238,7 @@ mod tests { #[test] fn test_inv_memchr() { - use {ByteSlice, B}; + use crate::{ByteSlice, B}; for (search, byte, matching) in build_tests() { assert_eq!( inv_memchr(byte, &search), diff --git a/src/cow.rs b/src/cow.rs deleted file mode 100644 index 1556353..0000000 --- a/src/cow.rs +++ /dev/null @@ -1,84 +0,0 @@ -use core::ops; -#[cfg(feature = "std")] -use std::borrow::Cow; - -/// A specialized copy-on-write byte string. -/// -/// The purpose of this type is to permit usage of a "borrowed or owned -/// byte string" in a way that keeps std/no-std compatibility. That is, in -/// no-std mode, this type devolves into a simple &[u8] with no owned variant -/// availble. -#[derive(Clone, Debug)] -pub struct CowBytes<'a>(Imp<'a>); - -#[cfg(feature = "std")] -#[derive(Clone, Debug)] -struct Imp<'a>(Cow<'a, [u8]>); - -#[cfg(not(feature = "std"))] -#[derive(Clone, Debug)] -struct Imp<'a>(&'a [u8]); - -impl<'a> ops::Deref for CowBytes<'a> { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - self.as_slice() - } -} - -impl<'a> CowBytes<'a> { - /// Create a new borrowed CowBytes. - pub fn new<B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> CowBytes<'a> { - CowBytes(Imp::new(bytes.as_ref())) - } - - /// Create a new owned CowBytes. - #[cfg(feature = "std")] - pub fn new_owned(bytes: Vec<u8>) -> CowBytes<'static> { - CowBytes(Imp(Cow::Owned(bytes))) - } - - /// Return a borrowed byte string, regardless of whether this is an owned - /// or borrowed byte string internally. - pub fn as_slice(&self) -> &[u8] { - self.0.as_slice() - } - - /// Return an owned version of this copy-on-write byte string. - /// - /// If this is already an owned byte string internally, then this is a - /// no-op. Otherwise, the internal byte string is copied. - #[cfg(feature = "std")] - pub fn into_owned(self) -> CowBytes<'static> { - match (self.0).0 { - Cow::Borrowed(b) => CowBytes::new_owned(b.to_vec()), - Cow::Owned(b) => CowBytes::new_owned(b), - } - } -} - -impl<'a> Imp<'a> { - #[cfg(feature = "std")] - pub fn new(bytes: &'a [u8]) -> Imp<'a> { - Imp(Cow::Borrowed(bytes)) - } - - #[cfg(not(feature = "std"))] - pub fn new(bytes: &'a [u8]) -> Imp<'a> { - Imp(bytes) - } - - #[cfg(feature = "std")] - pub fn as_slice(&self) -> &[u8] { - match self.0 { - Cow::Owned(ref x) => x, - Cow::Borrowed(x) => x, - } - } - - #[cfg(not(feature = "std"))] - pub fn as_slice(&self) -> &[u8] { - self.0 - } -} diff --git a/src/ext_slice.rs b/src/ext_slice.rs index fa08190..0cc73af 100644 --- a/src/ext_slice.rs +++ b/src/ext_slice.rs @@ -5,22 +5,21 @@ use std::ffi::OsStr; #[cfg(feature = "std")] use std::path::Path; -use core::{cmp, iter, ops, ptr, slice, str}; -use memchr::{memchr, memrchr}; +use core::{iter, ops, ptr, slice, str}; +use memchr::{memchr, memmem, memrchr}; -use ascii; -use bstr::BStr; -use byteset; +use crate::ascii; +use crate::bstr::BStr; +use crate::byteset; #[cfg(feature = "std")] -use ext_vec::ByteVec; -use search::{PrefilterState, TwoWay}; +use crate::ext_vec::ByteVec; #[cfg(feature = "unicode")] -use unicode::{ +use crate::unicode::{ whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes, SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks, }; -use utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error}; +use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error}; /// A short-hand constructor for building a `&[u8]`. /// @@ -344,7 +343,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "std")] #[inline] - fn to_str_lossy(&self) -> Cow<str> { + fn to_str_lossy(&self) -> Cow<'_, str> { match utf8::validate(self.as_bytes()) { Ok(()) => { // SAFETY: This is safe because of the guarantees provided by @@ -488,10 +487,10 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "std")] #[inline] - fn to_os_str_lossy(&self) -> Cow<OsStr> { + fn to_os_str_lossy(&self) -> Cow<'_, OsStr> { #[cfg(unix)] #[inline] - fn imp(bytes: &[u8]) -> Cow<OsStr> { + fn imp(bytes: &[u8]) -> Cow<'_, OsStr> { use std::os::unix::ffi::OsStrExt; Cow::Borrowed(OsStr::from_bytes(bytes)) @@ -559,7 +558,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "std")] #[inline] - fn to_path_lossy(&self) -> Cow<Path> { + fn to_path_lossy(&self) -> Cow<'_, Path> { use std::path::PathBuf; match self.to_os_str_lossy() { @@ -1067,7 +1066,7 @@ pub trait ByteSlice: Sealed { /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count()); /// ``` #[inline] - fn fields(&self) -> Fields { + fn fields(&self) -> Fields<'_> { Fields::new(self.as_bytes()) } @@ -1099,7 +1098,7 @@ pub trait ByteSlice: Sealed { /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count()); /// ``` #[inline] - fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<F> { + fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> { FieldsWith::new(self.as_bytes(), f) } @@ -1619,7 +1618,7 @@ pub trait ByteSlice: Sealed { /// assert_eq!(bytes, bs); /// ``` #[inline] - fn bytes(&self) -> Bytes { + fn bytes(&self) -> Bytes<'_> { Bytes { it: self.as_bytes().iter() } } @@ -1649,7 +1648,7 @@ pub trait ByteSlice: Sealed { /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); /// ``` #[inline] - fn chars(&self) -> Chars { + fn chars(&self) -> Chars<'_> { Chars::new(self.as_bytes()) } @@ -1704,7 +1703,7 @@ pub trait ByteSlice: Sealed { /// ]); /// ``` #[inline] - fn char_indices(&self) -> CharIndices { + fn char_indices(&self) -> CharIndices<'_> { CharIndices::new(self.as_bytes()) } @@ -1741,7 +1740,7 @@ pub trait ByteSlice: Sealed { /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]); /// ``` #[inline] - fn utf8_chunks(&self) -> Utf8Chunks { + fn utf8_chunks(&self) -> Utf8Chunks<'_> { Utf8Chunks { bytes: self.as_bytes() } } @@ -1773,7 +1772,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn graphemes(&self) -> Graphemes { + fn graphemes(&self) -> Graphemes<'_> { Graphemes::new(self.as_bytes()) } @@ -1817,7 +1816,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn grapheme_indices(&self) -> GraphemeIndices { + fn grapheme_indices(&self) -> GraphemeIndices<'_> { GraphemeIndices::new(self.as_bytes()) } @@ -1853,7 +1852,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn words(&self) -> Words { + fn words(&self) -> Words<'_> { Words::new(self.as_bytes()) } @@ -1891,7 +1890,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn word_indices(&self) -> WordIndices { + fn word_indices(&self) -> WordIndices<'_> { WordIndices::new(self.as_bytes()) } @@ -1921,7 +1920,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn words_with_breaks(&self) -> WordsWithBreaks { + fn words_with_breaks(&self) -> WordsWithBreaks<'_> { WordsWithBreaks::new(self.as_bytes()) } @@ -1958,7 +1957,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn words_with_break_indices(&self) -> WordsWithBreakIndices { + fn words_with_break_indices(&self) -> WordsWithBreakIndices<'_> { WordsWithBreakIndices::new(self.as_bytes()) } @@ -1990,7 +1989,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn sentences(&self) -> Sentences { + fn sentences(&self) -> Sentences<'_> { Sentences::new(self.as_bytes()) } @@ -2024,7 +2023,7 @@ pub trait ByteSlice: Sealed { /// ``` #[cfg(feature = "unicode")] #[inline] - fn sentence_indices(&self) -> SentenceIndices { + fn sentence_indices(&self) -> SentenceIndices<'_> { SentenceIndices::new(self.as_bytes()) } @@ -2055,7 +2054,7 @@ pub trait ByteSlice: Sealed { /// ]); /// ``` #[inline] - fn lines(&self) -> Lines { + fn lines(&self) -> Lines<'_> { Lines::new(self.as_bytes()) } @@ -2099,7 +2098,7 @@ pub trait ByteSlice: Sealed { /// ]); /// ``` #[inline] - fn lines_with_terminator(&self) -> LinesWithTerminator { + fn lines_with_terminator(&self) -> LinesWithTerminator<'_> { LinesWithTerminator::new(self.as_bytes()) } @@ -2789,7 +2788,7 @@ pub trait ByteSlice: Sealed { #[cfg(feature = "unicode")] #[inline] fn reverse_graphemes(&mut self) { - use unicode::decode_grapheme; + use crate::unicode::decode_grapheme; let mut i = 0; loop { @@ -2986,15 +2985,13 @@ pub trait ByteSlice: Sealed { /// version which permits building a `Finder` that is not connected to the /// lifetime of its needle. #[derive(Clone, Debug)] -pub struct Finder<'a> { - searcher: TwoWay<'a>, -} +pub struct Finder<'a>(memmem::Finder<'a>); impl<'a> Finder<'a> { /// Create a new finder for the given needle. #[inline] pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> { - Finder { searcher: TwoWay::forward(needle.as_ref()) } + Finder(memmem::Finder::new(needle.as_ref())) } /// Convert this finder into its owned variant, such that it no longer @@ -3007,7 +3004,7 @@ impl<'a> Finder<'a> { #[cfg(feature = "std")] #[inline] pub fn into_owned(self) -> Finder<'static> { - Finder { searcher: self.searcher.into_owned() } + Finder(self.0.into_owned()) } /// Returns the needle that this finder searches for. @@ -3018,7 +3015,7 @@ impl<'a> Finder<'a> { /// needle returned must necessarily be the shorter of the two. #[inline] pub fn needle(&self) -> &[u8] { - self.searcher.needle() + self.0.needle() } /// Returns the index of the first occurrence of this needle in the given @@ -3050,7 +3047,7 @@ impl<'a> Finder<'a> { /// ``` #[inline] pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { - self.searcher.find(haystack.as_ref()) + self.0.find(haystack.as_ref()) } } @@ -3071,15 +3068,13 @@ impl<'a> Finder<'a> { /// version which permits building a `FinderReverse` that is not connected to /// the lifetime of its needle. #[derive(Clone, Debug)] -pub struct FinderReverse<'a> { - searcher: TwoWay<'a>, -} +pub struct FinderReverse<'a>(memmem::FinderRev<'a>); impl<'a> FinderReverse<'a> { /// Create a new reverse finder for the given needle. #[inline] pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> { - FinderReverse { searcher: TwoWay::reverse(needle.as_ref()) } + FinderReverse(memmem::FinderRev::new(needle.as_ref())) } /// Convert this finder into its owned variant, such that it no longer @@ -3092,7 +3087,7 @@ impl<'a> FinderReverse<'a> { #[cfg(feature = "std")] #[inline] pub fn into_owned(self) -> FinderReverse<'static> { - FinderReverse { searcher: self.searcher.into_owned() } + FinderReverse(self.0.into_owned()) } /// Returns the needle that this finder searches for. @@ -3103,7 +3098,7 @@ impl<'a> FinderReverse<'a> { /// the needle returned must necessarily be the shorter of the two. #[inline] pub fn needle(&self) -> &[u8] { - self.searcher.needle() + self.0.needle() } /// Returns the index of the last occurrence of this needle in the given @@ -3135,7 +3130,7 @@ impl<'a> FinderReverse<'a> { /// ``` #[inline] pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { - self.searcher.rfind(haystack.as_ref()) + self.0.rfind(haystack.as_ref()) } } @@ -3147,17 +3142,14 @@ impl<'a> FinderReverse<'a> { /// byte string being looked for. #[derive(Debug)] pub struct Find<'a> { + it: memmem::FindIter<'a, 'a>, haystack: &'a [u8], - prestate: PrefilterState, - searcher: TwoWay<'a>, - pos: usize, + needle: &'a [u8], } impl<'a> Find<'a> { fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> { - let searcher = TwoWay::forward(needle); - let prestate = searcher.prefilter_state(); - Find { haystack, prestate, searcher, pos: 0 } + Find { it: memmem::find_iter(haystack, needle), haystack, needle } } } @@ -3166,20 +3158,7 @@ impl<'a> Iterator for Find<'a> { #[inline] fn next(&mut self) -> Option<usize> { - if self.pos > self.haystack.len() { - return None; - } - let result = self - .searcher - .find_with(&mut self.prestate, &self.haystack[self.pos..]); - match result { - None => None, - Some(i) => { - let pos = self.pos + i; - self.pos = pos + cmp::max(1, self.searcher.needle().len()); - Some(pos) - } - } + self.it.next() } } @@ -3191,20 +3170,18 @@ impl<'a> Iterator for Find<'a> { /// byte string being looked for. #[derive(Debug)] pub struct FindReverse<'a> { + it: memmem::FindRevIter<'a, 'a>, haystack: &'a [u8], - prestate: PrefilterState, - searcher: TwoWay<'a>, - /// When searching with an empty needle, this gets set to `None` after - /// we've yielded the last element at `0`. - pos: Option<usize>, + needle: &'a [u8], } impl<'a> FindReverse<'a> { fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> { - let searcher = TwoWay::reverse(needle); - let prestate = searcher.prefilter_state(); - let pos = Some(haystack.len()); - FindReverse { haystack, prestate, searcher, pos } + FindReverse { + it: memmem::rfind_iter(haystack, needle), + haystack, + needle, + } } fn haystack(&self) -> &'a [u8] { @@ -3212,7 +3189,7 @@ impl<'a> FindReverse<'a> { } fn needle(&self) -> &[u8] { - self.searcher.needle() + self.needle } } @@ -3221,24 +3198,7 @@ impl<'a> Iterator for FindReverse<'a> { #[inline] fn next(&mut self) -> Option<usize> { - let pos = match self.pos { - None => return None, - Some(pos) => pos, - }; - let result = self - .searcher - .rfind_with(&mut self.prestate, &self.haystack[..pos]); - match result { - None => None, - Some(i) => { - if pos == i { - self.pos = pos.checked_sub(1); - } else { - self.pos = Some(i); - } - Some(i) - } - } + self.it.next() } } @@ -3398,7 +3358,7 @@ impl<'a> Iterator for Split<'a> { match self.finder.next() { Some(start) => { let next = &haystack[self.last..start]; - self.last = start + self.finder.searcher.needle().len(); + self.last = start + self.finder.needle.len(); Some(next) } None => { @@ -3633,8 +3593,8 @@ impl<'a> Iterator for LinesWithTerminator<'a> { #[cfg(test)] mod tests { - use ext_slice::{ByteSlice, B}; - use tests::LOSSY_TESTS; + use crate::ext_slice::{ByteSlice, B}; + use crate::tests::LOSSY_TESTS; #[test] fn to_str_lossy() { diff --git a/src/ext_vec.rs b/src/ext_vec.rs index 6f6db56..5beb0e1 100644 --- a/src/ext_vec.rs +++ b/src/ext_vec.rs @@ -1,5 +1,3 @@ -#![allow(unused_imports)] - use std::borrow::Cow; use std::error; use std::ffi::{OsStr, OsString}; @@ -11,8 +9,8 @@ use std::ptr; use std::str; use std::vec; -use ext_slice::ByteSlice; -use utf8::{self, Utf8Error}; +use crate::ext_slice::ByteSlice; +use crate::utf8::{self, Utf8Error}; /// Concatenate the elements given by the iterator together into a single /// `Vec<u8>`. @@ -877,7 +875,7 @@ pub trait ByteVec: Sealed { /// assert_eq!(s, "foar".as_bytes()); /// ``` #[inline] - fn drain_bytes<R>(&mut self, range: R) -> DrainBytes + fn drain_bytes<R>(&mut self, range: R) -> DrainBytes<'_> where R: ops::RangeBounds<usize>, { @@ -1040,15 +1038,14 @@ impl error::Error for FromUtf8Error { impl fmt::Display for FromUtf8Error { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.err) } } #[cfg(test)] mod tests { - use ext_slice::B; - use ext_vec::ByteVec; + use crate::ext_vec::ByteVec; #[test] fn insert() { diff --git a/src/impls.rs b/src/impls.rs index 4cba48e..85a27ba 100644 --- a/src/impls.rs +++ b/src/impls.rs @@ -67,20 +67,20 @@ mod bstring { use std::iter::FromIterator; use std::ops; - use bstr::BStr; - use bstring::BString; - use ext_vec::ByteVec; + use crate::bstr::BStr; + use crate::bstring::BString; + use crate::ext_vec::ByteVec; impl fmt::Display for BString { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self.as_bstr(), f) } } impl fmt::Debug for BString { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(self.as_bstr(), f) } } @@ -308,15 +308,15 @@ mod bstr { use core::fmt; use core::ops; - use bstr::BStr; - use ext_slice::ByteSlice; + use crate::bstr::BStr; + use crate::ext_slice::ByteSlice; impl fmt::Display for BStr { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { /// Write the given bstr (lossily) to the given formatter. fn write_bstr( - f: &mut fmt::Formatter, + f: &mut fmt::Formatter<'_>, bstr: &BStr, ) -> Result<(), fmt::Error> { for chunk in bstr.utf8_chunks() { @@ -329,7 +329,10 @@ mod bstr { } /// Write 'num' fill characters to the given formatter. - fn write_pads(f: &mut fmt::Formatter, num: usize) -> fmt::Result { + fn write_pads( + f: &mut fmt::Formatter<'_>, + num: usize, + ) -> fmt::Result { let fill = f.fill(); for _ in 0..num { f.write_fmt(format_args!("{}", fill))?; @@ -372,7 +375,7 @@ mod bstr { impl fmt::Debug for BStr { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "\"")?; for (s, e, ch) in self.char_indices() { match ch { @@ -685,7 +688,7 @@ mod bstr_serde { Serializer, }; - use bstr::BStr; + use crate::bstr::BStr; impl Serialize for BStr { #[inline] @@ -744,7 +747,7 @@ mod bstring_serde { Serialize, Serializer, }; - use bstring::BString; + use crate::bstring::BString; impl Serialize for BString { #[inline] @@ -824,8 +827,8 @@ mod bstring_serde { #[cfg(test)] mod display { + use crate::bstring::BString; use crate::ByteSlice; - use bstring::BString; #[test] fn clean() { @@ -923,7 +926,7 @@ mod display { ); } - quickcheck! { + quickcheck::quickcheck! { fn total_length(bstr: BString) -> bool { let size = bstr.chars().count(); format!("{:<1$}", bstr.as_bstr(), size).chars().count() >= size @@ -933,7 +936,7 @@ mod display { #[cfg(test)] mod bstring_arbitrary { - use bstring::BString; + use crate::bstring::BString; use quickcheck::{Arbitrary, Gen}; @@ -9,8 +9,8 @@ More APIs may be added in the future. use std::io; -use ext_slice::ByteSlice; -use ext_vec::ByteVec; +use crate::ext_slice::ByteSlice; +use crate::ext_vec::ByteVec; /// An extention trait for /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) @@ -441,7 +441,7 @@ fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] { #[cfg(test)] mod tests { use super::BufReadExt; - use bstring::BString; + use crate::bstring::BString; fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> { let mut lines = vec![]; @@ -1,5 +1,5 @@ /*! -An experimental byte string library. +A byte string library. Byte strings are just like standard Unicode strings with one very important difference: byte strings are only *conventionally* UTF-8 while Rust's standard @@ -98,10 +98,10 @@ method converts any `&[u8]` to a `&BStr`. # When should I use byte strings? -This library is somewhat of an experiment that reflects my hypothesis that -UTF-8 by convention is a better trade off in some circumstances than guaranteed -UTF-8. It's possible, perhaps even likely, that this is a niche concern for -folks working closely with core text primitives. +This library reflects my hypothesis that UTF-8 by convention is a better trade +off in some circumstances than guaranteed UTF-8. It's possible, perhaps even +likely, that this is a niche concern for folks working closely with core text +primitives. The first time this idea hit me was in the implementation of Rust's regex engine. In particular, very little of the internal implementation cares at all @@ -134,15 +134,16 @@ incremental way by only parsing chunks at a time, but this is often complex to do or impractical. For example, many regex engines only accept one contiguous sequence of bytes at a time with no way to perform incremental matching. -In summary, the conventional UTF-8 byte strings provided by this library is an -experiment. They are definitely useful in some limited circumstances, but how -useful they are more broadly isn't clear yet. +In summary, conventional UTF-8 byte strings provided by this library are +definitely useful in some limited circumstances, but how useful they are more +broadly isn't clear yet. # `bstr` in public APIs -Since this library is still experimental, you should not use it in the public -API of your crates until it hits `1.0` (unless you're OK with with tracking -breaking releases of `bstr`). +Since this library is not yet `1.0`, you should not use it in the public API of +your crates until it hits `1.0` (unless you're OK with with tracking breaking +releases of `bstr`). It is expected that `bstr 1.0` will be released before +2022. In general, it should be possible to avoid putting anything in this crate into your public APIs. Namely, you should never need to use the `ByteSlice` or @@ -367,41 +368,23 @@ Windows. */ #![cfg_attr(not(feature = "std"), no_std)] -#![allow(dead_code)] +pub use crate::bstr::BStr; #[cfg(feature = "std")] -extern crate core; - -#[cfg(feature = "unicode")] -#[macro_use] -extern crate lazy_static; -extern crate memchr; -#[cfg(test)] -#[macro_use] -extern crate quickcheck; -#[cfg(feature = "unicode")] -extern crate regex_automata; -#[cfg(feature = "serde1-nostd")] -extern crate serde; -#[cfg(test)] -extern crate ucd_parse; - -pub use bstr::BStr; -#[cfg(feature = "std")] -pub use bstring::BString; -pub use ext_slice::{ +pub use crate::bstring::BString; +pub use crate::ext_slice::{ ByteSlice, Bytes, Fields, FieldsWith, Find, FindReverse, Finder, FinderReverse, Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, SplitReverse, B, }; #[cfg(feature = "std")] -pub use ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error}; +pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error}; #[cfg(feature = "unicode")] -pub use unicode::{ +pub use crate::unicode::{ GraphemeIndices, Graphemes, SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks, }; -pub use utf8::{ +pub use crate::utf8::{ decode as decode_utf8, decode_last as decode_last_utf8, CharIndices, Chars, Utf8Chunk, Utf8Chunks, Utf8Error, }; @@ -411,14 +394,12 @@ mod bstr; #[cfg(feature = "std")] mod bstring; mod byteset; -mod cow; mod ext_slice; #[cfg(feature = "std")] mod ext_vec; mod impls; #[cfg(feature = "std")] pub mod io; -mod search; #[cfg(test)] mod tests; #[cfg(feature = "unicode")] @@ -427,9 +408,9 @@ mod utf8; #[cfg(test)] mod apitests { - use bstr::BStr; - use bstring::BString; - use ext_slice::{Finder, FinderReverse}; + use crate::bstr::BStr; + use crate::bstring::BString; + use crate::ext_slice::{Finder, FinderReverse}; #[test] fn oibits() { @@ -446,11 +427,11 @@ mod apitests { assert_sync::<BString>(); assert_unwind_safe::<BString>(); - assert_send::<Finder>(); - assert_sync::<Finder>(); - assert_unwind_safe::<Finder>(); - assert_send::<FinderReverse>(); - assert_sync::<FinderReverse>(); - assert_unwind_safe::<FinderReverse>(); + assert_send::<Finder<'_>>(); + assert_sync::<Finder<'_>>(); + assert_unwind_safe::<Finder<'_>>(); + assert_send::<FinderReverse<'_>>(); + assert_sync::<FinderReverse<'_>>(); + assert_unwind_safe::<FinderReverse<'_>>(); } } diff --git a/src/search/byte_frequencies.rs b/src/search/byte_frequencies.rs deleted file mode 100644 index c313b62..0000000 --- a/src/search/byte_frequencies.rs +++ /dev/null @@ -1,258 +0,0 @@ -pub const BYTE_FREQUENCIES: [u8; 256] = [ - 55, // '\x00' - 52, // '\x01' - 51, // '\x02' - 50, // '\x03' - 49, // '\x04' - 48, // '\x05' - 47, // '\x06' - 46, // '\x07' - 45, // '\x08' - 103, // '\t' - 242, // '\n' - 66, // '\x0b' - 67, // '\x0c' - 229, // '\r' - 44, // '\x0e' - 43, // '\x0f' - 42, // '\x10' - 41, // '\x11' - 40, // '\x12' - 39, // '\x13' - 38, // '\x14' - 37, // '\x15' - 36, // '\x16' - 35, // '\x17' - 34, // '\x18' - 33, // '\x19' - 56, // '\x1a' - 32, // '\x1b' - 31, // '\x1c' - 30, // '\x1d' - 29, // '\x1e' - 28, // '\x1f' - 255, // ' ' - 148, // '!' - 164, // '"' - 149, // '#' - 136, // '$' - 160, // '%' - 155, // '&' - 173, // "'" - 221, // '(' - 222, // ')' - 134, // '*' - 122, // '+' - 232, // ',' - 202, // '-' - 215, // '.' - 224, // '/' - 208, // '0' - 220, // '1' - 204, // '2' - 187, // '3' - 183, // '4' - 179, // '5' - 177, // '6' - 168, // '7' - 178, // '8' - 200, // '9' - 226, // ':' - 195, // ';' - 154, // '<' - 184, // '=' - 174, // '>' - 126, // '?' - 120, // '@' - 191, // 'A' - 157, // 'B' - 194, // 'C' - 170, // 'D' - 189, // 'E' - 162, // 'F' - 161, // 'G' - 150, // 'H' - 193, // 'I' - 142, // 'J' - 137, // 'K' - 171, // 'L' - 176, // 'M' - 185, // 'N' - 167, // 'O' - 186, // 'P' - 112, // 'Q' - 175, // 'R' - 192, // 'S' - 188, // 'T' - 156, // 'U' - 140, // 'V' - 143, // 'W' - 123, // 'X' - 133, // 'Y' - 128, // 'Z' - 147, // '[' - 138, // '\\' - 146, // ']' - 114, // '^' - 223, // '_' - 151, // '`' - 249, // 'a' - 216, // 'b' - 238, // 'c' - 236, // 'd' - 253, // 'e' - 227, // 'f' - 218, // 'g' - 230, // 'h' - 247, // 'i' - 135, // 'j' - 180, // 'k' - 241, // 'l' - 233, // 'm' - 246, // 'n' - 244, // 'o' - 231, // 'p' - 139, // 'q' - 245, // 'r' - 243, // 's' - 251, // 't' - 235, // 'u' - 201, // 'v' - 196, // 'w' - 240, // 'x' - 214, // 'y' - 152, // 'z' - 182, // '{' - 205, // '|' - 181, // '}' - 127, // '~' - 27, // '\x7f' - 212, // '\x80' - 211, // '\x81' - 210, // '\x82' - 213, // '\x83' - 228, // '\x84' - 197, // '\x85' - 169, // '\x86' - 159, // '\x87' - 131, // '\x88' - 172, // '\x89' - 105, // '\x8a' - 80, // '\x8b' - 98, // '\x8c' - 96, // '\x8d' - 97, // '\x8e' - 81, // '\x8f' - 207, // '\x90' - 145, // '\x91' - 116, // '\x92' - 115, // '\x93' - 144, // '\x94' - 130, // '\x95' - 153, // '\x96' - 121, // '\x97' - 107, // '\x98' - 132, // '\x99' - 109, // '\x9a' - 110, // '\x9b' - 124, // '\x9c' - 111, // '\x9d' - 82, // '\x9e' - 108, // '\x9f' - 118, // '\xa0' - 141, // '¡' - 113, // '¢' - 129, // '£' - 119, // '¤' - 125, // '¥' - 165, // '¦' - 117, // '§' - 92, // '¨' - 106, // '©' - 83, // 'ª' - 72, // '«' - 99, // '¬' - 93, // '\xad' - 65, // '®' - 79, // '¯' - 166, // '°' - 237, // '±' - 163, // '²' - 199, // '³' - 190, // '´' - 225, // 'µ' - 209, // '¶' - 203, // '·' - 198, // '¸' - 217, // '¹' - 219, // 'º' - 206, // '»' - 234, // '¼' - 248, // '½' - 158, // '¾' - 239, // '¿' - 255, // 'À' - 255, // 'Á' - 255, // 'Â' - 255, // 'Ã' - 255, // 'Ä' - 255, // 'Å' - 255, // 'Æ' - 255, // 'Ç' - 255, // 'È' - 255, // 'É' - 255, // 'Ê' - 255, // 'Ë' - 255, // 'Ì' - 255, // 'Í' - 255, // 'Î' - 255, // 'Ï' - 255, // 'Ð' - 255, // 'Ñ' - 255, // 'Ò' - 255, // 'Ó' - 255, // 'Ô' - 255, // 'Õ' - 255, // 'Ö' - 255, // '×' - 255, // 'Ø' - 255, // 'Ù' - 255, // 'Ú' - 255, // 'Û' - 255, // 'Ü' - 255, // 'Ý' - 255, // 'Þ' - 255, // 'ß' - 255, // 'à' - 255, // 'á' - 255, // 'â' - 255, // 'ã' - 255, // 'ä' - 255, // 'å' - 255, // 'æ' - 255, // 'ç' - 255, // 'è' - 255, // 'é' - 255, // 'ê' - 255, // 'ë' - 255, // 'ì' - 255, // 'í' - 255, // 'î' - 255, // 'ï' - 255, // 'ð' - 255, // 'ñ' - 255, // 'ò' - 255, // 'ó' - 255, // 'ô' - 255, // 'õ' - 255, // 'ö' - 255, // '÷' - 255, // 'ø' - 255, // 'ù' - 255, // 'ú' - 255, // 'û' - 255, // 'ü' - 255, // 'ý' - 255, // 'þ' - 255, // 'ÿ' -]; diff --git a/src/search/mod.rs b/src/search/mod.rs deleted file mode 100644 index a0d1b45..0000000 --- a/src/search/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub use self::prefilter::PrefilterState; -pub use self::twoway::TwoWay; - -mod byte_frequencies; -mod prefilter; -#[cfg(test)] -mod tests; -mod twoway; diff --git a/src/search/prefilter.rs b/src/search/prefilter.rs deleted file mode 100644 index 00e6acf..0000000 --- a/src/search/prefilter.rs +++ /dev/null @@ -1,424 +0,0 @@ -use core::mem; - -use ext_slice::ByteSlice; -use search::byte_frequencies::BYTE_FREQUENCIES; - -/// PrefilterState tracks state associated with the effectiveness of a -/// prefilter. It is used to track how many bytes, on average, are skipped by -/// the prefilter. If this average dips below a certain threshold over time, -/// then the state renders the prefilter inert and stops using it. -/// -/// A prefilter state should be created for each search. (Where creating an -/// iterator via, e.g., `find_iter`, is treated as a single search.) -#[derive(Clone, Debug)] -pub struct PrefilterState { - /// The number of skips that has been executed. - skips: usize, - /// The total number of bytes that have been skipped. - skipped: usize, - /// The maximum length of a match. This is used to help determine how many - /// bytes on average should be skipped in order for a prefilter to be - /// effective. - max_match_len: usize, - /// Once this heuristic has been deemed ineffective, it will be inert - /// throughout the rest of its lifetime. This serves as a cheap way to - /// check inertness. - inert: bool, -} - -impl PrefilterState { - /// The minimum number of skip attempts to try before considering whether - /// a prefilter is effective or not. - const MIN_SKIPS: usize = 50; - - /// The minimum amount of bytes that skipping must average. - /// - /// This value was chosen based on varying it and checking the bstr/find/ - /// microbenchmarks. In particular, this can impact the - /// pathological/repeated-{huge,small} benchmarks quite a bit if it's - /// set too low. - const MIN_SKIP_BYTES: usize = 8; - - /// Create a fresh prefilter state. - pub fn new(max_match_len: usize) -> PrefilterState { - if max_match_len == 0 { - return PrefilterState::inert(); - } - PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false } - } - - /// Create a fresh prefilter state that is always inert. - fn inert() -> PrefilterState { - PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true } - } - - /// Update this state with the number of bytes skipped on the last - /// invocation of the prefilter. - #[inline] - pub fn update(&mut self, skipped: usize) { - self.skips += 1; - self.skipped += skipped; - } - - /// Return true if and only if this state indicates that a prefilter is - /// still effective. - #[inline] - pub fn is_effective(&mut self) -> bool { - if self.inert { - return false; - } - if self.skips < PrefilterState::MIN_SKIPS { - return true; - } - if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips { - return true; - } - - // We're inert. - self.inert = true; - false - } -} - -/// A heuristic frequency based prefilter for searching a single needle. -/// -/// This prefilter attempts to pick out the byte in a needle that is predicted -/// to occur least frequently, and search for that using fast vectorized -/// routines. If a rare enough byte could not be found, then this prefilter's -/// constructors will return `None`. -/// -/// This can be combined with `PrefilterState` to dynamically render this -/// prefilter inert if it proves to ineffective. -#[derive(Clone, Debug)] -pub struct Freqy { - /// Whether this prefilter should be used or not. - inert: bool, - /// The length of the needle we're searching for. - needle_len: usize, - /// The rarest byte in the needle, according to pre-computed frequency - /// analysis. - rare1: u8, - /// The leftmost offset of the rarest byte in the needle. - rare1i: usize, - /// The second rarest byte in the needle, according to pre-computed - /// frequency analysis. (This may be equivalent to the rarest byte.) - /// - /// The second rarest byte is used as a type of guard for quickly detecting - /// a mismatch after memchr locates an instance of the rarest byte. This - /// is a hedge against pathological cases where the pre-computed frequency - /// analysis may be off. (But of course, does not prevent *all* - /// pathological cases.) - rare2: u8, - /// The leftmost offset of the second rarest byte in the needle. - rare2i: usize, -} - -impl Freqy { - /// The maximum frequency rank permitted. If the rarest byte in the needle - /// has a frequency rank above this value, then Freqy is not used. - const MAX_RANK: usize = 200; - - /// Return a fresh prefilter state that can be used with this prefilter. A - /// prefilter state is used to track the effectiveness of a prefilter for - /// speeding up searches. Therefore, the prefilter state should generally - /// be reused on subsequent searches (such as in an iterator). For searches - /// on a different haystack, then a new prefilter state should be used. - pub fn prefilter_state(&self) -> PrefilterState { - if self.inert { - PrefilterState::inert() - } else { - PrefilterState::new(self.needle_len) - } - } - - /// Returns a valid but inert prefilter. This is valid for both the forward - /// and reverse direction. - /// - /// It is never correct to use an inert prefilter. The results of finding - /// the next (or previous) candidate are unspecified. - fn inert() -> Freqy { - Freqy { - inert: true, - needle_len: 0, - rare1: 0, - rare1i: 0, - rare2: 0, - rare2i: 0, - } - } - - /// Return search info for the given needle in the forward direction. - pub fn forward(needle: &[u8]) -> Freqy { - if needle.is_empty() { - return Freqy::inert(); - } - - // Find the rarest two bytes. Try to make them distinct (but it's not - // required). - let (mut rare1, mut rare1i) = (needle[0], 0); - let (mut rare2, mut rare2i) = (needle[0], 0); - if needle.len() >= 2 { - rare2 = needle[1]; - rare2i = 1; - } - if Freqy::rank(rare2) < Freqy::rank(rare1) { - mem::swap(&mut rare1, &mut rare2); - mem::swap(&mut rare1i, &mut rare2i); - } - for (i, b) in needle.bytes().enumerate().skip(2) { - if Freqy::rank(b) < Freqy::rank(rare1) { - rare2 = rare1; - rare2i = rare1i; - rare1 = b; - rare1i = i; - } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) { - rare2 = b; - rare2i = i; - } - } - if Freqy::rank(rare1) > Freqy::MAX_RANK { - return Freqy::inert(); - } - let needle_len = needle.len(); - Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i } - } - - /// Return search info for the given needle in the reverse direction. - pub fn reverse(needle: &[u8]) -> Freqy { - if needle.is_empty() { - return Freqy::inert(); - } - - // Find the rarest two bytes. Try to make them distinct (but it's not - // required). In reverse, the offsets correspond to the number of bytes - // from the end of the needle. So `0` is the last byte in the needle. - let (mut rare1i, mut rare2i) = (0, 0); - if needle.len() >= 2 { - rare2i += 1; - } - let mut rare1 = needle[needle.len() - rare1i - 1]; - let mut rare2 = needle[needle.len() - rare2i - 1]; - if Freqy::rank(rare2) < Freqy::rank(rare1) { - mem::swap(&mut rare1, &mut rare2); - mem::swap(&mut rare1i, &mut rare2i); - } - for (i, b) in needle.bytes().rev().enumerate().skip(2) { - if Freqy::rank(b) < Freqy::rank(rare1) { - rare2 = rare1; - rare2i = rare1i; - rare1 = b; - rare1i = i; - } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) { - rare2 = b; - rare2i = i; - } - } - if Freqy::rank(rare1) > Freqy::MAX_RANK { - return Freqy::inert(); - } - let needle_len = needle.len(); - Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i } - } - - /// Look for a possible occurrence of needle. The position returned - /// corresponds to the beginning of the occurrence, if one exists. - /// - /// Callers may assume that this never returns false negatives (i.e., it - /// never misses an actual occurrence), but must check that the returned - /// position corresponds to a match. That is, it can return false - /// positives. - /// - /// This should only be used when Freqy is constructed for forward - /// searching. - pub fn find_candidate( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - ) -> Option<usize> { - debug_assert!(!self.inert); - - let mut i = 0; - while prestate.is_effective() { - // Use a fast vectorized implementation to skip to the next - // occurrence of the rarest byte (heuristically chosen) in the - // needle. - i += match haystack[i..].find_byte(self.rare1) { - None => return None, - Some(found) => { - prestate.update(found); - found - } - }; - - // If we can't align our first match with the haystack, then a - // match is impossible. - if i < self.rare1i { - i += 1; - continue; - } - - // Align our rare2 byte with the haystack. A mismatch means that - // a match is impossible. - let aligned_rare2i = i - self.rare1i + self.rare2i; - if haystack.get(aligned_rare2i) != Some(&self.rare2) { - i += 1; - continue; - } - - // We've done what we can. There might be a match here. - return Some(i - self.rare1i); - } - // The only way we get here is if we believe our skipping heuristic - // has become ineffective. We're allowed to return false positives, - // so return the position at which we advanced to, aligned to the - // haystack. - Some(i.saturating_sub(self.rare1i)) - } - - /// Look for a possible occurrence of needle, in reverse, starting from the - /// end of the given haystack. The position returned corresponds to the - /// position immediately after the end of the occurrence, if one exists. - /// - /// Callers may assume that this never returns false negatives (i.e., it - /// never misses an actual occurrence), but must check that the returned - /// position corresponds to a match. That is, it can return false - /// positives. - /// - /// This should only be used when Freqy is constructed for reverse - /// searching. - pub fn rfind_candidate( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - ) -> Option<usize> { - debug_assert!(!self.inert); - - let mut i = haystack.len(); - while prestate.is_effective() { - // Use a fast vectorized implementation to skip to the next - // occurrence of the rarest byte (heuristically chosen) in the - // needle. - i = match haystack[..i].rfind_byte(self.rare1) { - None => return None, - Some(found) => { - prestate.update(i - found); - found - } - }; - - // If we can't align our first match with the haystack, then a - // match is impossible. - if i + self.rare1i + 1 > haystack.len() { - continue; - } - - // Align our rare2 byte with the haystack. A mismatch means that - // a match is impossible. - let aligned = match (i + self.rare1i).checked_sub(self.rare2i) { - None => continue, - Some(aligned) => aligned, - }; - if haystack.get(aligned) != Some(&self.rare2) { - continue; - } - - // We've done what we can. There might be a match here. - return Some(i + self.rare1i + 1); - } - // The only way we get here is if we believe our skipping heuristic - // has become ineffective. We're allowed to return false positives, - // so return the position at which we advanced to, aligned to the - // haystack. - Some(i + self.rare1i + 1) - } - - /// Return the heuristical frequency rank of the given byte. A lower rank - /// means the byte is believed to occur less frequently. - fn rank(b: u8) -> usize { - BYTE_FREQUENCIES[b as usize] as usize - } -} - -#[cfg(test)] -mod tests { - use super::*; - use ext_slice::B; - - #[test] - fn freqy_forward() { - // N.B. We sometimes use uppercase here since that mostly ensures freqy - // will be constructable. Lowercase letters may be too common for freqy - // to work. - - let s = Freqy::forward(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO"))); - - let s = Freqy::forward(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR"))); - - let s = Freqy::forward(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz"))); - - let s = Freqy::forward(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy"))); - - let s = Freqy::forward(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.find_candidate(&mut pre, B("zazb"))); - - let s = Freqy::forward(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy"))); - - let s = Freqy::forward(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz"))); - - let s = Freqy::forward(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.find_candidate(&mut pre, B("yayb"))); - } - - #[test] - fn freqy_reverse() { - // N.B. We sometimes use uppercase here since that mostly ensures freqy - // will be constructable. Lowercase letters may be too common for freqy - // to work. - - let s = Freqy::reverse(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO"))); - - let s = Freqy::reverse(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR"))); - - let s = Freqy::reverse(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz"))); - - let s = Freqy::reverse(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy"))); - - let s = Freqy::reverse(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb"))); - - let s = Freqy::reverse(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy"))); - - let s = Freqy::reverse(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz"))); - - let s = Freqy::reverse(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb"))); - } -} diff --git a/src/search/tests.rs b/src/search/tests.rs deleted file mode 100644 index 827df92..0000000 --- a/src/search/tests.rs +++ /dev/null @@ -1,225 +0,0 @@ -use search::twoway::TwoWay; - -/// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple. -type SearchTest = (&'static str, &'static str, Option<usize>, Option<usize>); - -const SEARCH_TESTS: &'static [SearchTest] = &[ - ("", "", Some(0), Some(0)), - ("", "a", Some(0), Some(1)), - ("", "ab", Some(0), Some(2)), - ("", "abc", Some(0), Some(3)), - ("a", "", None, None), - ("a", "a", Some(0), Some(0)), - ("a", "aa", Some(0), Some(1)), - ("a", "ba", Some(1), Some(1)), - ("a", "bba", Some(2), Some(2)), - ("a", "bbba", Some(3), Some(3)), - ("a", "bbbab", Some(3), Some(3)), - ("a", "bbbabb", Some(3), Some(3)), - ("a", "bbbabbb", Some(3), Some(3)), - ("a", "bbbbbb", None, None), - ("ab", "", None, None), - ("ab", "a", None, None), - ("ab", "b", None, None), - ("ab", "ab", Some(0), Some(0)), - ("ab", "aab", Some(1), Some(1)), - ("ab", "aaab", Some(2), Some(2)), - ("ab", "abaab", Some(0), Some(3)), - ("ab", "baaab", Some(3), Some(3)), - ("ab", "acb", None, None), - ("ab", "abba", Some(0), Some(0)), - ("abc", "ab", None, None), - ("abc", "abc", Some(0), Some(0)), - ("abc", "abcz", Some(0), Some(0)), - ("abc", "abczz", Some(0), Some(0)), - ("abc", "zabc", Some(1), Some(1)), - ("abc", "zzabc", Some(2), Some(2)), - ("abc", "azbc", None, None), - ("abc", "abzc", None, None), - ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)), - ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)), - // Failures caught by quickcheck. - ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)), - ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None), -]; - -#[test] -fn unit_twoway_fwd() { - run_search_tests_fwd("TwoWay", |n, h| TwoWay::forward(n).find(h)); -} - -#[test] -fn unit_twoway_rev() { - run_search_tests_rev("TwoWay", |n, h| TwoWay::reverse(n).rfind(h)); -} - -/// Run the substring search tests. `name` should be the type of searcher used, -/// for diagnostics. `search` should be a closure that accepts a needle and a -/// haystack and returns the starting position of the first occurrence of -/// needle in the haystack, or `None` if one doesn't exist. -fn run_search_tests_fwd( - name: &str, - mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, -) { - for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS { - let (n, h) = (needle.as_bytes(), haystack.as_bytes()); - assert_eq!( - expected_fwd, - search(n, h), - "{}: needle: {:?}, haystack: {:?}, expected: {:?}", - name, - n, - h, - expected_fwd - ); - } -} - -/// Run the substring search tests. `name` should be the type of searcher used, -/// for diagnostics. `search` should be a closure that accepts a needle and a -/// haystack and returns the starting position of the last occurrence of -/// needle in the haystack, or `None` if one doesn't exist. -fn run_search_tests_rev( - name: &str, - mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, -) { - for &(needle, haystack, _, expected_rev) in SEARCH_TESTS { - let (n, h) = (needle.as_bytes(), haystack.as_bytes()); - assert_eq!( - expected_rev, - search(n, h), - "{}: needle: {:?}, haystack: {:?}, expected: {:?}", - name, - n, - h, - expected_rev - ); - } -} - -quickcheck! { - fn qc_twoway_fwd_prefix_is_substring(bs: Vec<u8>) -> bool { - prop_prefix_is_substring(false, &bs, |n, h| TwoWay::forward(n).find(h)) - } - - fn qc_twoway_fwd_suffix_is_substring(bs: Vec<u8>) -> bool { - prop_suffix_is_substring(false, &bs, |n, h| TwoWay::forward(n).find(h)) - } - - fn qc_twoway_rev_prefix_is_substring(bs: Vec<u8>) -> bool { - prop_prefix_is_substring(true, &bs, |n, h| TwoWay::reverse(n).rfind(h)) - } - - fn qc_twoway_rev_suffix_is_substring(bs: Vec<u8>) -> bool { - prop_suffix_is_substring(true, &bs, |n, h| TwoWay::reverse(n).rfind(h)) - } - - fn qc_twoway_fwd_matches_naive( - needle: Vec<u8>, - haystack: Vec<u8> - ) -> bool { - prop_matches_naive( - false, - &needle, - &haystack, - |n, h| TwoWay::forward(n).find(h), - ) - } - - fn qc_twoway_rev_matches_naive( - needle: Vec<u8>, - haystack: Vec<u8> - ) -> bool { - prop_matches_naive( - true, - &needle, - &haystack, - |n, h| TwoWay::reverse(n).rfind(h), - ) - } -} - -/// Check that every prefix of the given byte string is a substring. -fn prop_prefix_is_substring( - reverse: bool, - bs: &[u8], - mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, -) -> bool { - if bs.is_empty() { - return true; - } - for i in 0..(bs.len() - 1) { - let prefix = &bs[..i]; - if reverse { - assert_eq!(naive_rfind(prefix, bs), search(prefix, bs)); - } else { - assert_eq!(naive_find(prefix, bs), search(prefix, bs)); - } - } - true -} - -/// Check that every suffix of the given byte string is a substring. -fn prop_suffix_is_substring( - reverse: bool, - bs: &[u8], - mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, -) -> bool { - if bs.is_empty() { - return true; - } - for i in 0..(bs.len() - 1) { - let suffix = &bs[i..]; - if reverse { - assert_eq!(naive_rfind(suffix, bs), search(suffix, bs)); - } else { - assert_eq!(naive_find(suffix, bs), search(suffix, bs)); - } - } - true -} - -/// Check that naive substring search matches the result of the given search -/// algorithm. -fn prop_matches_naive( - reverse: bool, - needle: &[u8], - haystack: &[u8], - mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, -) -> bool { - if reverse { - naive_rfind(needle, haystack) == search(needle, haystack) - } else { - naive_find(needle, haystack) == search(needle, haystack) - } -} - -/// Naively search forwards for the given needle in the given haystack. -fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> { - if needle.is_empty() { - return Some(0); - } else if haystack.len() < needle.len() { - return None; - } - for i in 0..(haystack.len() - needle.len() + 1) { - if needle == &haystack[i..i + needle.len()] { - return Some(i); - } - } - None -} - -/// Naively search in reverse for the given needle in the given haystack. -fn naive_rfind(needle: &[u8], haystack: &[u8]) -> Option<usize> { - if needle.is_empty() { - return Some(haystack.len()); - } else if haystack.len() < needle.len() { - return None; - } - for i in (0..(haystack.len() - needle.len() + 1)).rev() { - if needle == &haystack[i..i + needle.len()] { - return Some(i); - } - } - None -} diff --git a/src/search/twoway.rs b/src/search/twoway.rs deleted file mode 100644 index 5f1e8cf..0000000 --- a/src/search/twoway.rs +++ /dev/null @@ -1,871 +0,0 @@ -use core::cmp; - -use cow::CowBytes; -use ext_slice::ByteSlice; -use search::prefilter::{Freqy, PrefilterState}; - -/// An implementation of the TwoWay substring search algorithm, with heuristics -/// for accelerating search based on frequency analysis. -/// -/// This searcher supports forward and reverse search, although not -/// simultaneously. It runs in O(n + m) time and O(1) space, where -/// `n ~ len(needle)` and `m ~ len(haystack)`. -/// -/// The implementation here roughly matches that which was developed by -/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The -/// only change in this implementation is the use of zero-based indices and -/// the addition of heuristics for a fast skip loop. That is, this will detect -/// bytes that are believed to be rare in the needle and use fast vectorized -/// instructions to find their occurrences quickly. The Two-Way algorithm is -/// then used to confirm whether a match at that location occurred. -/// -/// The heuristic for fast skipping is automatically shut off if it's -/// detected to be ineffective at search time. Generally, this only occurs in -/// pathological cases. But this is generally necessary in order to preserve -/// a `O(n + m)` time bound. -/// -/// The code below is fairly complex and not obviously correct at all. It's -/// likely necessary to read the Two-Way paper cited above in order to fully -/// grok this code. -#[derive(Clone, Debug)] -pub struct TwoWay<'b> { - /// The needle that we're looking for. - needle: CowBytes<'b>, - /// An implementation of a fast skip loop based on hard-coded frequency - /// data. This is only used when conditions are deemed favorable. - freqy: Freqy, - /// A critical position in needle. Specifically, this position corresponds - /// to beginning of either the minimal or maximal suffix in needle. (N.B. - /// See SuffixType below for why "minimal" isn't quite the correct word - /// here.) - /// - /// This is the position at which every search begins. Namely, search - /// starts by scanning text to the right of this position, and only if - /// there's a match does the text to the left of this position get scanned. - critical_pos: usize, - /// The amount we shift by in the Two-Way search algorithm. This - /// corresponds to the "small period" and "large period" cases. - shift: Shift, -} - -impl<'b> TwoWay<'b> { - /// Create a searcher that uses the Two-Way algorithm by searching forwards - /// through any haystack. - pub fn forward(needle: &'b [u8]) -> TwoWay<'b> { - let freqy = Freqy::forward(needle); - if needle.is_empty() { - return TwoWay { - needle: CowBytes::new(needle), - freqy, - critical_pos: 0, - shift: Shift::Large { shift: 0 }, - }; - } - - let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); - let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); - let (period_lower_bound, critical_pos) = - if min_suffix.pos > max_suffix.pos { - (min_suffix.period, min_suffix.pos) - } else { - (max_suffix.period, max_suffix.pos) - }; - let shift = Shift::forward(needle, period_lower_bound, critical_pos); - let needle = CowBytes::new(needle); - TwoWay { needle, freqy, critical_pos, shift } - } - - /// Create a searcher that uses the Two-Way algorithm by searching in - /// reverse through any haystack. - pub fn reverse(needle: &'b [u8]) -> TwoWay<'b> { - let freqy = Freqy::reverse(needle); - if needle.is_empty() { - return TwoWay { - needle: CowBytes::new(needle), - freqy, - critical_pos: 0, - shift: Shift::Large { shift: 0 }, - }; - } - - let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); - let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); - let (period_lower_bound, critical_pos) = - if min_suffix.pos < max_suffix.pos { - (min_suffix.period, min_suffix.pos) - } else { - (max_suffix.period, max_suffix.pos) - }; - let shift = Shift::reverse(needle, period_lower_bound, critical_pos); - let needle = CowBytes::new(needle); - TwoWay { needle, freqy, critical_pos, shift } - } - - /// Return a fresh prefilter state that can be used with this searcher. - /// A prefilter state is used to track the effectiveness of a searcher's - /// prefilter for speeding up searches. Therefore, the prefilter state - /// should generally be reused on subsequent searches (such as in an - /// iterator). For searches on a different haystack, then a new prefilter - /// state should be used. - /// - /// This always initializes a valid prefilter state even if this searcher - /// does not have a prefilter enabled. - pub fn prefilter_state(&self) -> PrefilterState { - self.freqy.prefilter_state() - } - - /// Return the needle used by this searcher. - pub fn needle(&self) -> &[u8] { - self.needle.as_slice() - } - - /// Convert this searched into an owned version, where the needle is - /// copied if it isn't already owned. - #[cfg(feature = "std")] - pub fn into_owned(self) -> TwoWay<'static> { - TwoWay { - needle: self.needle.into_owned(), - freqy: self.freqy, - critical_pos: self.critical_pos, - shift: self.shift, - } - } - - /// Find the position of the first occurrence of this searcher's needle in - /// the given haystack. If one does not exist, then return None. - /// - /// This will automatically initialize prefilter state. This should only - /// be used for one-off searches. - pub fn find(&self, haystack: &[u8]) -> Option<usize> { - self.find_with(&mut self.prefilter_state(), haystack) - } - - /// Find the position of the last occurrence of this searcher's needle - /// in the given haystack. If one does not exist, then return None. - /// - /// This will automatically initialize prefilter state. This should only - /// be used for one-off searches. - pub fn rfind(&self, haystack: &[u8]) -> Option<usize> { - self.rfind_with(&mut self.prefilter_state(), haystack) - } - - /// Find the position of the first occurrence of this searcher's needle in - /// the given haystack. If one does not exist, then return None. - /// - /// This accepts prefilter state that is useful when using the same - /// searcher multiple times, such as in an iterator. - pub fn find_with( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - ) -> Option<usize> { - if self.needle.is_empty() { - return Some(0); - } else if haystack.len() < self.needle.len() { - return None; - } else if self.needle.len() == 1 { - return haystack.find_byte(self.needle[0]); - } - match self.shift { - Shift::Small { period } => { - self.find_small(prestate, haystack, period) - } - Shift::Large { shift } => { - self.find_large(prestate, haystack, shift) - } - } - } - - /// Find the position of the last occurrence of this searcher's needle - /// in the given haystack. If one does not exist, then return None. - /// - /// This accepts prefilter state that is useful when using the same - /// searcher multiple times, such as in an iterator. - pub fn rfind_with( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - ) -> Option<usize> { - if self.needle.is_empty() { - return Some(haystack.len()); - } else if haystack.len() < self.needle.len() { - return None; - } else if self.needle.len() == 1 { - return haystack.rfind_byte(self.needle[0]); - } - match self.shift { - Shift::Small { period } => { - self.rfind_small(prestate, haystack, period) - } - Shift::Large { shift } => { - self.rfind_large(prestate, haystack, shift) - } - } - } - - // Below is the actual implementation of TwoWay searching, including both - // forwards and backwards searching. Each forward and reverse search has - // two fairly similar implementations, each handling the small and large - // period cases, for a total 4 different search routines. - // - // On top of that, each search implementation can be accelerated by a - // Freqy prefilter, but it is not always enabled. To avoid its overhead - // when its disabled, we explicitly inline each search implementation based - // on whether Freqy will be used or not. This brings us up to a total of - // 8 monomorphized versions of the search code. - - #[inline(never)] - fn find_small( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - period: usize, - ) -> Option<usize> { - if prestate.is_effective() { - self.find_small_imp(prestate, true, haystack, period) - } else { - self.find_small_imp(prestate, false, haystack, period) - } - } - - #[inline(always)] - fn find_small_imp( - &self, - prestate: &mut PrefilterState, - prefilter: bool, - haystack: &[u8], - period: usize, - ) -> Option<usize> { - let needle = self.needle.as_slice(); - let mut pos = 0; - let mut shift = 0; - while pos + needle.len() <= haystack.len() { - let mut i = cmp::max(self.critical_pos, shift); - if prefilter && prestate.is_effective() { - match self.freqy.find_candidate(prestate, &haystack[pos..]) { - None => return None, - Some(found) => { - shift = 0; - i = self.critical_pos; - pos += found; - if pos + needle.len() > haystack.len() { - return None; - } - } - } - } - while i < needle.len() && needle[i] == haystack[pos + i] { - i += 1; - } - if i < needle.len() { - pos += i - self.critical_pos + 1; - shift = 0; - } else { - let mut j = self.critical_pos; - while j > shift && needle[j] == haystack[pos + j] { - j -= 1; - } - if j <= shift && needle[shift] == haystack[pos + shift] { - return Some(pos); - } - pos += period; - shift = needle.len() - period; - } - } - None - } - - #[inline(never)] - fn find_large( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - shift: usize, - ) -> Option<usize> { - if prestate.is_effective() { - self.find_large_imp(prestate, true, haystack, shift) - } else { - self.find_large_imp(prestate, false, haystack, shift) - } - } - - #[inline(always)] - fn find_large_imp( - &self, - prestate: &mut PrefilterState, - prefilter: bool, - haystack: &[u8], - shift: usize, - ) -> Option<usize> { - let needle = self.needle.as_slice(); - let mut pos = 0; - while pos + needle.len() <= haystack.len() { - let mut i = self.critical_pos; - if prefilter && prestate.is_effective() { - match self.freqy.find_candidate(prestate, &haystack[pos..]) { - None => return None, - Some(found) => { - pos += found; - if pos + needle.len() > haystack.len() { - return None; - } - } - } - } - while i < needle.len() && needle[i] == haystack[pos + i] { - i += 1; - } - if i < needle.len() { - pos += i - self.critical_pos + 1; - } else { - let mut j = self.critical_pos; - while j > 0 && needle[j] == haystack[pos + j] { - j -= 1; - } - if j == 0 && needle[0] == haystack[pos] { - return Some(pos); - } - pos += shift; - } - } - None - } - - #[inline(never)] - fn rfind_small( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - period: usize, - ) -> Option<usize> { - if prestate.is_effective() { - self.rfind_small_imp(prestate, true, haystack, period) - } else { - self.rfind_small_imp(prestate, false, haystack, period) - } - } - - #[inline(always)] - fn rfind_small_imp( - &self, - prestate: &mut PrefilterState, - prefilter: bool, - haystack: &[u8], - period: usize, - ) -> Option<usize> { - let needle = &*self.needle; - let nlen = needle.len(); - let mut pos = haystack.len(); - let mut shift = nlen; - while pos >= nlen { - let mut i = cmp::min(self.critical_pos, shift); - if prefilter && prestate.is_effective() { - match self.freqy.rfind_candidate(prestate, &haystack[..pos]) { - None => return None, - Some(found) => { - shift = nlen; - i = self.critical_pos; - pos = found; - if pos < nlen { - return None; - } - } - } - } - while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { - i -= 1; - } - if i > 0 || needle[0] != haystack[pos - nlen] { - pos -= self.critical_pos - i + 1; - shift = nlen; - } else { - let mut j = self.critical_pos; - while j < shift && needle[j] == haystack[pos - nlen + j] { - j += 1; - } - if j == shift { - return Some(pos - nlen); - } - pos -= period; - shift = period; - } - } - None - } - - #[inline(never)] - fn rfind_large( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - shift: usize, - ) -> Option<usize> { - if prestate.is_effective() { - self.rfind_large_imp(prestate, true, haystack, shift) - } else { - self.rfind_large_imp(prestate, false, haystack, shift) - } - } - - #[inline(always)] - fn rfind_large_imp( - &self, - prestate: &mut PrefilterState, - prefilter: bool, - haystack: &[u8], - shift: usize, - ) -> Option<usize> { - let needle = &*self.needle; - let nlen = needle.len(); - let mut pos = haystack.len(); - while pos >= nlen { - if prefilter && prestate.is_effective() { - match self.freqy.rfind_candidate(prestate, &haystack[..pos]) { - None => return None, - Some(found) => { - pos = found; - if pos < nlen { - return None; - } - } - } - } - - let mut i = self.critical_pos; - while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { - i -= 1; - } - if i > 0 || needle[0] != haystack[pos - nlen] { - pos -= self.critical_pos - i + 1; - } else { - let mut j = self.critical_pos; - while j < nlen && needle[j] == haystack[pos - nlen + j] { - j += 1; - } - if j == nlen { - return Some(pos - nlen); - } - pos -= shift; - } - } - None - } -} - -/// A representation of the amount we're allowed to shift by during Two-Way -/// search. -/// -/// When computing a critical factorization of the needle, we find the position -/// of the critical factorization by finding the needle's maximal (or minimal) -/// suffix, along with the period of that suffix. It turns out that the period -/// of that suffix is a lower bound on the period of the needle itself. -/// -/// This lower bound is equivalent to the actual period of the needle in -/// some cases. To describe that case, we denote the needle as `x` where -/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower -/// bound given here is always the period of `v`, which is `<= period(x)`. The -/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and -/// where `u` is a suffix of `v[0..period(v)]`. -/// -/// This case is important because the search algorithm for when the -/// periods are equivalent is slightly different than the search algorithm -/// for when the periods are not equivalent. In particular, when they aren't -/// equivalent, we know that the period of the needle is no less than half its -/// length. In this case, we shift by an amount less than or equal to the -/// period of the needle (determined by the maximum length of the components -/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. -/// -/// The above two cases are represented by the variants below. Each entails -/// a different instantiation of the Two-Way search algorithm. -/// -/// N.B. If we could find a way to compute the exact period in all cases, -/// then we could collapse this case analysis and simplify the algorithm. The -/// Two-Way paper suggests this is possible, but more reading is required to -/// grok why the authors didn't pursue that path. -#[derive(Clone, Debug)] -enum Shift { - Small { period: usize }, - Large { shift: usize }, -} - -impl Shift { - /// Compute the shift for a given needle in the forward direction. - /// - /// This requires a lower bound on the period and a critical position. - /// These can be computed by extracting both the minimal and maximal - /// lexicographic suffixes, and choosing the right-most starting position. - /// The lower bound on the period is then the period of the chosen suffix. - fn forward( - needle: &[u8], - period_lower_bound: usize, - critical_pos: usize, - ) -> Shift { - let large = cmp::max(critical_pos, needle.len() - critical_pos); - if critical_pos * 2 >= needle.len() { - return Shift::Large { shift: large }; - } - - let (u, v) = needle.split_at(critical_pos); - if !v[..period_lower_bound].ends_with(u) { - return Shift::Large { shift: large }; - } - Shift::Small { period: period_lower_bound } - } - - /// Compute the shift for a given needle in the reverse direction. - /// - /// This requires a lower bound on the period and a critical position. - /// These can be computed by extracting both the minimal and maximal - /// lexicographic suffixes, and choosing the left-most starting position. - /// The lower bound on the period is then the period of the chosen suffix. - fn reverse( - needle: &[u8], - period_lower_bound: usize, - critical_pos: usize, - ) -> Shift { - let large = cmp::max(critical_pos, needle.len() - critical_pos); - if (needle.len() - critical_pos) * 2 >= needle.len() { - return Shift::Large { shift: large }; - } - - let (v, u) = needle.split_at(critical_pos); - if !v[v.len() - period_lower_bound..].starts_with(u) { - return Shift::Large { shift: large }; - } - Shift::Small { period: period_lower_bound } - } -} - -/// A suffix extracted from a needle along with its period. -#[derive(Debug)] -struct Suffix { - /// The starting position of this suffix. - /// - /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this - /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for - /// forward suffixes, this is an inclusive starting position, where as for - /// reverse suffixes, this is an exclusive ending position. - pos: usize, - /// The period of this suffix. - /// - /// Note that this is NOT necessarily the period of the string from which - /// this suffix comes from. (It is always less than or equal to the period - /// of the original string.) - period: usize, -} - -impl Suffix { - fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { - debug_assert!(!needle.is_empty()); - - // suffix represents our maximal (or minimal) suffix, along with - // its period. - let mut suffix = Suffix { pos: 0, period: 1 }; - // The start of a suffix in `needle` that we are considering as a - // more maximal (or minimal) suffix than what's in `suffix`. - let mut candidate_start = 1; - // The current offset of our suffixes that we're comparing. - // - // When the characters at this offset are the same, then we mush on - // to the next position since no decision is possible. When the - // candidate's character is greater (or lesser) than the corresponding - // character than our current maximal (or minimal) suffix, then the - // current suffix is changed over to the candidate and we restart our - // search. Otherwise, the candidate suffix is no good and we restart - // our search on the next candidate. - // - // The three cases above correspond to the three cases in the loop - // below. - let mut offset = 0; - - while candidate_start + offset < needle.len() { - let current = needle[suffix.pos + offset]; - let candidate = needle[candidate_start + offset]; - match kind.cmp(current, candidate) { - SuffixOrdering::Accept => { - suffix = Suffix { pos: candidate_start, period: 1 }; - candidate_start += 1; - offset = 0; - } - SuffixOrdering::Skip => { - candidate_start += offset + 1; - offset = 0; - suffix.period = candidate_start - suffix.pos; - } - SuffixOrdering::Push => { - if offset + 1 == suffix.period { - candidate_start += suffix.period; - offset = 0; - } else { - offset += 1; - } - } - } - } - suffix - } - - fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { - debug_assert!(!needle.is_empty()); - - // See the comments in `forward` for how this works. - let mut suffix = Suffix { pos: needle.len(), period: 1 }; - if needle.len() == 1 { - return suffix; - } - let mut candidate_start = needle.len() - 1; - let mut offset = 0; - - while offset < candidate_start { - let current = needle[suffix.pos - offset - 1]; - let candidate = needle[candidate_start - offset - 1]; - match kind.cmp(current, candidate) { - SuffixOrdering::Accept => { - suffix = Suffix { pos: candidate_start, period: 1 }; - candidate_start -= 1; - offset = 0; - } - SuffixOrdering::Skip => { - candidate_start -= offset + 1; - offset = 0; - suffix.period = suffix.pos - candidate_start; - } - SuffixOrdering::Push => { - if offset + 1 == suffix.period { - candidate_start -= suffix.period; - offset = 0; - } else { - offset += 1; - } - } - } - } - suffix - } -} - -/// The kind of suffix to extract. -#[derive(Clone, Copy, Debug)] -enum SuffixKind { - /// Extract the smallest lexicographic suffix from a string. - /// - /// Technically, this doesn't actually pick the smallest lexicographic - /// suffix. e.g., Given the choice between `a` and `aa`, this will choose - /// the latter over the former, even though `a < aa`. The reasoning for - /// this isn't clear from the paper, but it still smells like a minimal - /// suffix. - Minimal, - /// Extract the largest lexicographic suffix from a string. - /// - /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given - /// the choice between `z` and `zz`, this will choose the latter over the - /// former. - Maximal, -} - -/// The result of comparing corresponding bytes between two suffixes. -#[derive(Clone, Copy, Debug)] -enum SuffixOrdering { - /// This occurs when the given candidate byte indicates that the candidate - /// suffix is better than the current maximal (or minimal) suffix. That is, - /// the current candidate suffix should supplant the current maximal (or - /// minimal) suffix. - Accept, - /// This occurs when the given candidate byte excludes the candidate suffix - /// from being better than the current maximal (or minimal) suffix. That - /// is, the current candidate suffix should be dropped and the next one - /// should be considered. - Skip, - /// This occurs when no decision to accept or skip the candidate suffix - /// can be made, e.g., when corresponding bytes are equivalent. In this - /// case, the next corresponding bytes should be compared. - Push, -} - -impl SuffixKind { - /// Returns true if and only if the given candidate byte indicates that - /// it should replace the current suffix as the maximal (or minimal) - /// suffix. - fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { - use self::SuffixOrdering::*; - - match self { - SuffixKind::Minimal if candidate < current => Accept, - SuffixKind::Minimal if candidate > current => Skip, - SuffixKind::Minimal => Push, - SuffixKind::Maximal if candidate > current => Accept, - SuffixKind::Maximal if candidate < current => Skip, - SuffixKind::Maximal => Push, - } - } -} - -// N.B. There are more holistic tests in src/search/tests.rs. -#[cfg(test)] -mod tests { - use super::*; - use ext_slice::B; - - /// Convenience wrapper for computing the suffix as a byte string. - fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { - let s = Suffix::forward(needle, kind); - (&needle[s.pos..], s.period) - } - - /// Convenience wrapper for computing the reverse suffix as a byte string. - fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { - let s = Suffix::reverse(needle, kind); - (&needle[..s.pos], s.period) - } - - /// Return all of the non-empty suffixes in the given byte string. - fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { - (0..bytes.len()).map(|i| &bytes[i..]).collect() - } - - /// Return the lexicographically maximal suffix of the given byte string. - fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { - let mut sufs = suffixes(needle); - sufs.sort(); - sufs.pop().unwrap() - } - - /// Return the lexicographically maximal suffix of the reverse of the given - /// byte string. - fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { - let mut reversed = needle.to_vec(); - reversed.reverse(); - let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); - got.reverse(); - got - } - - #[test] - fn suffix_forward() { - macro_rules! assert_suffix_min { - ($given:expr, $expected:expr, $period:expr) => { - let (got_suffix, got_period) = - get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); - assert_eq!((B($expected), $period), (got_suffix, got_period)); - }; - } - - macro_rules! assert_suffix_max { - ($given:expr, $expected:expr, $period:expr) => { - let (got_suffix, got_period) = - get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); - assert_eq!((B($expected), $period), (got_suffix, got_period)); - }; - } - - assert_suffix_min!("a", "a", 1); - assert_suffix_max!("a", "a", 1); - - assert_suffix_min!("ab", "ab", 2); - assert_suffix_max!("ab", "b", 1); - - assert_suffix_min!("ba", "a", 1); - assert_suffix_max!("ba", "ba", 2); - - assert_suffix_min!("abc", "abc", 3); - assert_suffix_max!("abc", "c", 1); - - assert_suffix_min!("acb", "acb", 3); - assert_suffix_max!("acb", "cb", 2); - - assert_suffix_min!("cba", "a", 1); - assert_suffix_max!("cba", "cba", 3); - - assert_suffix_min!("abcabc", "abcabc", 3); - assert_suffix_max!("abcabc", "cabc", 3); - - assert_suffix_min!("abcabcabc", "abcabcabc", 3); - assert_suffix_max!("abcabcabc", "cabcabc", 3); - - assert_suffix_min!("abczz", "abczz", 5); - assert_suffix_max!("abczz", "zz", 1); - - assert_suffix_min!("zzabc", "abc", 3); - assert_suffix_max!("zzabc", "zzabc", 5); - - assert_suffix_min!("aaa", "aaa", 1); - assert_suffix_max!("aaa", "aaa", 1); - - assert_suffix_min!("foobar", "ar", 2); - assert_suffix_max!("foobar", "r", 1); - } - - #[test] - fn suffix_reverse() { - macro_rules! assert_suffix_min { - ($given:expr, $expected:expr, $period:expr) => { - let (got_suffix, got_period) = - get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); - assert_eq!((B($expected), $period), (got_suffix, got_period)); - }; - } - - macro_rules! assert_suffix_max { - ($given:expr, $expected:expr, $period:expr) => { - let (got_suffix, got_period) = - get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); - assert_eq!((B($expected), $period), (got_suffix, got_period)); - }; - } - - assert_suffix_min!("a", "a", 1); - assert_suffix_max!("a", "a", 1); - - assert_suffix_min!("ab", "a", 1); - assert_suffix_max!("ab", "ab", 2); - - assert_suffix_min!("ba", "ba", 2); - assert_suffix_max!("ba", "b", 1); - - assert_suffix_min!("abc", "a", 1); - assert_suffix_max!("abc", "abc", 3); - - assert_suffix_min!("acb", "a", 1); - assert_suffix_max!("acb", "ac", 2); - - assert_suffix_min!("cba", "cba", 3); - assert_suffix_max!("cba", "c", 1); - - assert_suffix_min!("abcabc", "abca", 3); - assert_suffix_max!("abcabc", "abcabc", 3); - - assert_suffix_min!("abcabcabc", "abcabca", 3); - assert_suffix_max!("abcabcabc", "abcabcabc", 3); - - assert_suffix_min!("abczz", "a", 1); - assert_suffix_max!("abczz", "abczz", 5); - - assert_suffix_min!("zzabc", "zza", 3); - assert_suffix_max!("zzabc", "zz", 1); - - assert_suffix_min!("aaa", "aaa", 1); - assert_suffix_max!("aaa", "aaa", 1); - } - - quickcheck! { - fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { - if bytes.is_empty() { - return true; - } - - let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); - let expected = naive_maximal_suffix_forward(&bytes); - got == expected - } - - fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { - if bytes.is_empty() { - return true; - } - - let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); - let expected = naive_maximal_suffix_reverse(&bytes); - expected == got - } - } -} diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs index 317ba96..b53b1d7 100644 --- a/src/unicode/fsm/grapheme_break_fwd.rs +++ b/src/unicode/fsm/grapheme_break_fwd.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs index db6b6ee..93e888c 100644 --- a/src/unicode/fsm/grapheme_break_rev.rs +++ b/src/unicode/fsm/grapheme_break_rev.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs index 3b6beff..2bf7e4c 100644 --- a/src/unicode/fsm/regional_indicator_rev.rs +++ b/src/unicode/fsm/regional_indicator_rev.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator} // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"), - }; - - unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"), }; + + unsafe { + ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"), - }; - - unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"), }; + + unsafe { + ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs index 46ecfcf..cc937a4 100644 --- a/src/unicode/fsm/sentence_break_fwd.rs +++ b/src/unicode/fsm/sentence_break_fwd.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs index c5fabe3..f1f3da5 100644 --- a/src/unicode/fsm/simple_word_fwd.rs +++ b/src/unicode/fsm/simple_word_fwd.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs index ea68582..419b5d4 100644 --- a/src/unicode/fsm/whitespace_anchored_fwd.rs +++ b/src/unicode/fsm/whitespace_anchored_fwd.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+ // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"), - }; - - unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"), }; + + unsafe { + ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"), - }; - - unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"), }; + + unsafe { + ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa b/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa Binary files differindex bb217f1..427d3a9 100644 --- a/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa +++ b/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa diff --git a/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa b/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa Binary files differindex a7cb5a7..7cc3a0a 100644 --- a/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa +++ b/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs index 72b444e..301b03c 100644 --- a/src/unicode/fsm/whitespace_anchored_rev.rs +++ b/src/unicode/fsm/whitespace_anchored_rev.rs @@ -1,41 +1,45 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --minimize --state-size 1 src/unicode/fsm/ \s+ +// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+ // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u16; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"), - }; - - unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"), }; + + unsafe { + ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u16; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"), - }; - - unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"), }; + + unsafe { + ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs index 52e6bc2..fb041b7 100644 --- a/src/unicode/fsm/word_break_fwd.rs +++ b/src/unicode/fsm/word_break_fwd.rs @@ -2,40 +2,44 @@ // // ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.8 is available on crates.io. +// ucd-generate 0.2.9 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static! { - pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("word_break_fwd.bigendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("word_break_fwd.bigendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } #[cfg(target_endian = "little")] -lazy_static! { - pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { - #[repr(C)] - struct Aligned<B: ?Sized> { - _align: [u8; 0], - bytes: B, - } +lazy_static::lazy_static! { + pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { + #[repr(C)] + struct Aligned<B: ?Sized> { + _align: [u8; 0], + bytes: B, + } - static ALIGNED: &'static Aligned<[u8]> = &Aligned { - _align: [], - bytes: *include_bytes!("word_break_fwd.littleendian.dfa"), - }; - - unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } + static ALIGNED: &'static Aligned<[u8]> = &Aligned { + _align: [], + bytes: *include_bytes!("word_break_fwd.littleendian.dfa"), }; + + unsafe { + ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) + } + }; } diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs index e40a0de..ad31cf1 100644 --- a/src/unicode/grapheme.rs +++ b/src/unicode/grapheme.rs @@ -1,10 +1,10 @@ use regex_automata::DFA; -use ext_slice::ByteSlice; -use unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD; -use unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV; -use unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV; -use utf8; +use crate::ext_slice::ByteSlice; +use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD; +use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV; +use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV; +use crate::utf8; /// An iterator over grapheme clusters in a byte string. /// @@ -262,8 +262,8 @@ mod tests { use ucd_parse::GraphemeClusterBreakTest; use super::*; - use ext_slice::ByteSlice; - use tests::LOSSY_TESTS; + use crate::ext_slice::ByteSlice; + use crate::tests::LOSSY_TESTS; #[test] fn forward_ucd() { diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs index 01f5473..063f342 100644 --- a/src/unicode/sentence.rs +++ b/src/unicode/sentence.rs @@ -1,8 +1,8 @@ use regex_automata::DFA; -use ext_slice::ByteSlice; -use unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD; -use utf8; +use crate::ext_slice::ByteSlice; +use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD; +use crate::utf8; /// An iterator over sentences in a byte string. /// @@ -160,7 +160,7 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) { mod tests { use ucd_parse::SentenceBreakTest; - use ext_slice::ByteSlice; + use crate::ext_slice::ByteSlice; #[test] fn forward_ucd() { diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs index a8da144..949a83f 100644 --- a/src/unicode/whitespace.rs +++ b/src/unicode/whitespace.rs @@ -1,7 +1,7 @@ use regex_automata::DFA; -use unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD; -use unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV; +use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD; +use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV; /// Return the first position of a non-whitespace character. pub fn whitespace_len_fwd(slice: &[u8]) -> usize { diff --git a/src/unicode/word.rs b/src/unicode/word.rs index 1260e52..e0a5701 100644 --- a/src/unicode/word.rs +++ b/src/unicode/word.rs @@ -1,9 +1,9 @@ use regex_automata::DFA; -use ext_slice::ByteSlice; -use unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD; -use unicode::fsm::word_break_fwd::WORD_BREAK_FWD; -use utf8; +use crate::ext_slice::ByteSlice; +use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD; +use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD; +use crate::utf8; /// An iterator over words in a byte string. /// @@ -320,7 +320,7 @@ fn decode_word(bs: &[u8]) -> (&str, usize) { mod tests { use ucd_parse::WordBreakTest; - use ext_slice::ByteSlice; + use crate::ext_slice::ByteSlice; #[test] fn forward_ucd() { diff --git a/src/utf8.rs b/src/utf8.rs index 2d4035d..5c7de36 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -5,9 +5,9 @@ use core::str; #[cfg(feature = "std")] use std::error; -use ascii; -use bstr::BStr; -use ext_slice::ByteSlice; +use crate::ascii; +use crate::bstr::BStr; +use crate::ext_slice::ByteSlice; // The UTF-8 decoder provided here is based on the one presented here: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ @@ -459,7 +459,7 @@ impl error::Error for Utf8Error { } impl fmt::Display for Utf8Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to) } } @@ -858,9 +858,9 @@ fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { mod tests { use std::char; - use ext_slice::{ByteSlice, B}; - use tests::LOSSY_TESTS; - use utf8::{self, Utf8Error}; + use crate::ext_slice::{ByteSlice, B}; + use crate::tests::LOSSY_TESTS; + use crate::utf8::{self, Utf8Error}; fn utf8e(valid_up_to: usize) -> Utf8Error { Utf8Error { valid_up_to, error_len: None } |