aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeff Vander Stoep <jeffv@google.com>2022-12-12 15:18:30 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2022-12-12 15:18:30 +0000
commit8a7be3d891f0132465e3609fdb2082d3a92b3eaa (patch)
tree065f4b90fc4351f93487c5554f9cacd93fe29cbe
parent1faff9be927c85d1dfb151bc7975d02f697854df (diff)
parent8cc600fbcfb556b18ae725de8cd6690ae9cbf3e0 (diff)
downloadbstr-8a7be3d891f0132465e3609fdb2082d3a92b3eaa.tar.gz
Upgrade bstr to 1.0.1 am: e3d458e404 am: 4e28fb8bc1 am: 8cc600fbcf
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/bstr/+/2337924 Change-Id: If4b1c0d1cf3c8ca75186dd9ad7e89cbf73a94cd6 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--.cargo_vcs_info.json7
-rw-r--r--Android.bp13
-rw-r--r--COPYING4
-rw-r--r--Cargo.lock.saved127
-rw-r--r--Cargo.toml80
-rw-r--r--Cargo.toml.orig45
-rw-r--r--METADATA14
-rw-r--r--README.md56
-rw-r--r--src/ascii.rs23
-rw-r--r--src/bstr.rs32
-rw-r--r--src/bstring.rs46
-rw-r--r--src/byteset/mod.rs3
-rw-r--r--src/byteset/scalar.rs46
-rw-r--r--src/ext_slice.rs666
-rw-r--r--src/ext_vec.rs121
-rw-r--r--src/impls.rs144
-rw-r--r--src/io.rs33
-rw-r--r--src/lib.rs111
-rw-r--r--src/tests.rs2
-rw-r--r--src/unicode/data/GraphemeBreakTest.txt6
-rw-r--r--src/unicode/data/SentenceBreakTest.txt6
-rw-r--r--src/unicode/data/WordBreakTest.txt6
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.bigendian.dfabin10589 -> 10781 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.littleendian.dfabin10589 -> 10781 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.rs26
-rw-r--r--src/unicode/fsm/grapheme_break_rev.bigendian.dfabin53905 -> 55271 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_rev.littleendian.dfabin53905 -> 55271 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_rev.rs26
-rw-r--r--src/unicode/fsm/regional_indicator_rev.rs26
-rw-r--r--src/unicode/fsm/sentence_break_fwd.bigendian.dfabin149903 -> 153619 bytes
-rw-r--r--src/unicode/fsm/sentence_break_fwd.littleendian.dfabin149903 -> 153619 bytes
-rw-r--r--src/unicode/fsm/sentence_break_fwd.rs26
-rw-r--r--src/unicode/fsm/simple_word_fwd.bigendian.dfabin8975 -> 9237 bytes
-rw-r--r--src/unicode/fsm/simple_word_fwd.littleendian.dfabin8975 -> 9237 bytes
-rw-r--r--src/unicode/fsm/simple_word_fwd.rs26
-rw-r--r--src/unicode/fsm/whitespace_anchored_fwd.rs26
-rw-r--r--src/unicode/fsm/whitespace_anchored_rev.rs26
-rw-r--r--src/unicode/fsm/word_break_fwd.bigendian.dfabin229739 -> 236309 bytes
-rw-r--r--src/unicode/fsm/word_break_fwd.littleendian.dfabin229739 -> 236309 bytes
-rw-r--r--src/unicode/fsm/word_break_fwd.rs26
-rw-r--r--src/unicode/grapheme.rs44
-rw-r--r--src/unicode/mod.rs10
-rw-r--r--src/unicode/sentence.rs15
-rw-r--r--src/unicode/whitespace.rs6
-rw-r--r--src/unicode/word.rs26
-rw-r--r--src/utf8.rs29
46 files changed, 1258 insertions, 671 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index ef4fb69..81b3348 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
{
"git": {
- "sha1": "e38e7a7ca986f9499b30202f49d79e531d14d192"
- }
-}
+ "sha1": "2900d6016b16acb907c70d2d87aa82d0172cd057"
+ },
+ "path_in_vcs": ""
+} \ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 7fb581b..f41d2c2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -44,19 +44,22 @@ rust_library {
host_supported: true,
crate_name: "bstr",
cargo_env_compat: true,
- cargo_pkg_version: "0.2.17",
+ cargo_pkg_version: "1.0.1",
srcs: ["src/lib.rs"],
- edition: "2018",
+ edition: "2021",
features: [
+ "alloc",
"default",
- "lazy_static",
- "regex-automata",
"std",
"unicode",
],
rustlibs: [
- "liblazy_static",
"libmemchr",
+ "libonce_cell",
"libregex_automata",
],
+ apex_available: [
+ "//apex_available:platform",
+ "//apex_available:anyapex",
+ ],
}
diff --git a/COPYING b/COPYING
index d5a7d7e..e343d38 100644
--- a/COPYING
+++ b/COPYING
@@ -1,8 +1,8 @@
This project is licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
- http://www.apache.org/licenses/LICENSE-2.0)
+ https://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or
- http://opensource.org/licenses/MIT)
+ https://opensource.org/licenses/MIT)
at your option.
diff --git a/Cargo.lock.saved b/Cargo.lock.saved
new file mode 100644
index 0000000..bcc42fb
--- /dev/null
+++ b/Cargo.lock.saved
@@ -0,0 +1,127 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "bstr"
+version = "1.0.1"
+dependencies = [
+ "memchr",
+ "once_cell",
+ "quickcheck",
+ "regex-automata",
+ "serde",
+ "ucd-parse",
+ "unicode-segmentation",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "getrandom"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.132"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
+
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
+[[package]]
+name = "once_cell"
+version = "1.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0"
+
+[[package]]
+name = "quickcheck"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
+dependencies = [
+ "rand",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "regex"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
+dependencies = [
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
+
+[[package]]
+name = "serde"
+version = "1.0.144"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
+
+[[package]]
+name = "ucd-parse"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc2d0556a998f4c55500ce1730901ba32bafbe820068cbdc091421525d61253b"
+dependencies = [
+ "once_cell",
+ "regex",
+]
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99"
+
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
diff --git a/Cargo.toml b/Cargo.toml
index 0f206ba..3d72ff5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,32 +10,77 @@
# See Cargo.toml.orig for the original contents.
[package]
-edition = "2018"
+edition = "2021"
+rust-version = "1.60"
name = "bstr"
-version = "0.2.17"
+version = "1.0.1"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/.github"]
description = "A string type that is not required to be valid UTF-8."
homepage = "https://github.com/BurntSushi/bstr"
documentation = "https://docs.rs/bstr"
readme = "README.md"
-keywords = ["string", "str", "byte", "bytes", "text"]
-categories = ["text-processing", "encoding"]
+keywords = [
+ "string",
+ "str",
+ "byte",
+ "bytes",
+ "text",
+]
+categories = [
+ "text-processing",
+ "encoding",
+]
license = "MIT OR Apache-2.0"
repository = "https://github.com/BurntSushi/bstr"
+resolver = "2"
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = [
+ "--cfg",
+ "docsrs",
+]
+
[profile.release]
debug = true
[lib]
bench = false
-[dependencies.lazy_static]
-version = "1.2.0"
-optional = true
+
+[[example]]
+name = "graphemes"
+required-features = [
+ "std",
+ "unicode",
+]
+
+[[example]]
+name = "lines"
+required-features = ["std"]
+
+[[example]]
+name = "uppercase"
+required-features = [
+ "std",
+ "unicode",
+]
+
+[[example]]
+name = "words"
+required-features = [
+ "std",
+ "unicode",
+]
[dependencies.memchr]
version = "2.4.0"
default-features = false
+[dependencies.once_cell]
+version = "1.14.0"
+optional = true
+
[dependencies.regex-automata]
version = "0.1.5"
optional = true
@@ -45,6 +90,7 @@ default-features = false
version = "1.0.85"
optional = true
default-features = false
+
[dev-dependencies.quickcheck]
version = "1"
default-features = false
@@ -56,8 +102,18 @@ version = "0.1.3"
version = "1.2.1"
[features]
-default = ["std", "unicode"]
-serde1 = ["std", "serde1-nostd", "serde/std"]
-serde1-nostd = ["serde"]
-std = ["memchr/std"]
-unicode = ["lazy_static", "regex-automata"]
+alloc = ["serde?/alloc"]
+default = [
+ "std",
+ "unicode",
+]
+serde = ["dep:serde"]
+std = [
+ "alloc",
+ "memchr/std",
+ "serde?/std",
+]
+unicode = [
+ "dep:once_cell",
+ "dep:regex-automata",
+]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index cbb6283..c2a17a2 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "bstr"
-version = "0.2.17" #:version
+version = "1.0.1" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "A string type that is not required to be valid UTF-8."
documentation = "https://docs.rs/bstr"
@@ -11,7 +11,9 @@ keywords = ["string", "str", "byte", "bytes", "text"]
license = "MIT OR Apache-2.0"
categories = ["text-processing", "encoding"]
exclude = ["/.github"]
-edition = "2018"
+edition = "2021"
+rust-version = "1.60"
+resolver = "2"
[workspace]
members = ["bench"]
@@ -21,14 +23,14 @@ bench = false
[features]
default = ["std", "unicode"]
-std = ["memchr/std"]
-unicode = ["lazy_static", "regex-automata"]
-serde1 = ["std", "serde1-nostd", "serde/std"]
-serde1-nostd = ["serde"]
+std = ["alloc", "memchr/std", "serde?/std"]
+alloc = ["serde?/alloc"]
+unicode = ["dep:once_cell", "dep:regex-automata"]
+serde = ["dep:serde"]
[dependencies]
memchr = { version = "2.4.0", default-features = false }
-lazy_static = { version = "1.2.0", optional = true }
+once_cell = { version = "1.14.0", optional = true }
regex-automata = { version = "0.1.5", default-features = false, optional = true }
serde = { version = "1.0.85", default-features = false, optional = true }
@@ -37,5 +39,34 @@ quickcheck = { version = "1", default-features = false }
ucd-parse = "0.1.3"
unicode-segmentation = "1.2.1"
+[package.metadata.docs.rs]
+# We want to document all features.
+all-features = true
+# Since this crate's feature setup is pretty complicated, it is worth opting
+# into a nightly unstable option to show the features that need to be enabled
+# for public API items. To do that, we set 'docsrs', and when that's enabled,
+# we enable the 'doc_auto_cfg' feature.
+#
+# To test this locally, run:
+#
+# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
+rustdoc-args = ["--cfg", "docsrs"]
+
[profile.release]
debug = true
+
+[[example]]
+name = "graphemes"
+required-features = ["std", "unicode"]
+
+[[example]]
+name = "lines"
+required-features = ["std"]
+
+[[example]]
+name = "uppercase"
+required-features = ["std", "unicode"]
+
+[[example]]
+name = "words"
+required-features = ["std", "unicode"]
diff --git a/METADATA b/METADATA
index bfc1d19..912fe76 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/bstr
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
name: "bstr"
description: "A string type that is not required to be valid UTF-8."
third_party {
@@ -7,13 +11,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/bstr/bstr-0.2.17.crate"
+ value: "https://static.crates.io/crates/bstr/bstr-1.0.1.crate"
}
- version: "0.2.17"
+ version: "1.0.1"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 9
- day: 22
+ year: 2022
+ month: 12
+ day: 12
}
}
diff --git a/README.md b/README.md
index 13bf0fc..675540a 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ differs from the standard library's `String` and `str` types in that they are
not required to be valid UTF-8, but may be fully or partially valid UTF-8.
[![Build status](https://github.com/BurntSushi/bstr/workflows/ci/badge.svg)](https://github.com/BurntSushi/bstr/actions)
-[![](https://meritbadge.herokuapp.com/bstr)](https://crates.io/crates/bstr)
+[![Crates.io](https://img.shields.io/crates/v/bstr.svg)](https://crates.io/crates/bstr)
### Documentation
@@ -17,7 +17,7 @@ https://docs.rs/bstr
### When should I use byte strings?
See this part of the documentation for more details:
-https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings.
+<https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings>.
The short story is that byte strings are useful when it is inconvenient or
incorrect to require valid UTF-8.
@@ -29,7 +29,7 @@ Add this to your `Cargo.toml`:
```toml
[dependencies]
-bstr = "0.2"
+bstr = "1.0.0"
```
@@ -42,9 +42,7 @@ This first example simply shows how to efficiently iterate over lines in
stdin, and print out lines containing a particular substring:
```rust
-use std::error::Error;
-use std::io::{self, Write};
-
+use std::{error::Error, io::{self, Write}};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -65,9 +63,7 @@ This example shows how to count all of the words (Unicode-aware) in stdin,
line-by-line:
```rust
-use std::error::Error;
-use std::io;
-
+use std::{error::Error, io};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -88,9 +84,7 @@ text, this is quite a bit faster than what you can (easily) do with standard
library APIs. (N.B. Any invalid UTF-8 bytes are passed through unchanged.)
```rust
-use std::error::Error;
-use std::io::{self, Write};
-
+use std::{error::Error, io::{self, Write}};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -113,9 +107,7 @@ clusters) from each line, where invalid UTF-8 sequences are generally treated
as a single character and are passed through correctly:
```rust
-use std::error::Error;
-use std::io::{self, Write};
-
+use std::{error::Error, io::{self, Write}};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -144,21 +136,23 @@ This crates comes with a few features that control standard library, serde
and Unicode support.
* `std` - **Enabled** by default. This provides APIs that require the standard
- library, such as `Vec<u8>`.
+ library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
+ the `alloc` feature.
+* `alloc` - **Enabled** by default. This provides APIs that require allocations
+ via the `alloc` crate, such as `Vec<u8>`.
* `unicode` - **Enabled** by default. This provides APIs that require sizable
Unicode data compiled into the binary. This includes, but is not limited to,
grapheme/word/sentence segmenters. When this is disabled, basic support such
- as UTF-8 decoding is still included.
-* `serde1` - **Disabled** by default. Enables implementations of serde traits
- for the `BStr` and `BString` types.
-* `serde1-nostd` - **Disabled** by default. Enables implementations of serde
- traits for the `BStr` type only, intended for use without the standard
- library. Generally, you either want `serde1` or `serde1-nostd`, not both.
+ as UTF-8 decoding is still included. Note that currently, enabling this
+ feature also requires enabling the `std` feature. It is expected that this
+ limitation will be lifted at some point.
+* `serde` - Enables implementations of serde traits for `BStr`, and also
+ `BString` when `alloc` is enabled.
### Minimum Rust version policy
-This crate's minimum supported `rustc` version (MSRV) is `1.41.1`.
+This crate's minimum supported `rustc` version (MSRV) is `1.60.0`.
In general, this crate will be conservative with respect to the minimum
supported version of Rust. MSRV may be bumped in minor version releases.
@@ -166,13 +160,16 @@ supported version of Rust. MSRV may be bumped in minor version releases.
### Future work
-Since this is meant to be a core crate, getting a `1.0` release is a priority.
-My hope is to move to `1.0` within the next year and commit to its API so that
-`bstr` can be used as a public dependency.
+Since it is plausible that some of the types in this crate might end up in
+your public API (e.g., `BStr` and `BString`), we will commit to being very
+conservative with respect to new major version releases. It's difficult to say
+precisely how conservative, but unless there is a major issue with the `1.0`
+release, I wouldn't expect a `2.0` release to come out any sooner than some
+period of years.
A large part of the API surface area was taken from the standard library, so
-from an API design perspective, a good portion of this crate should be on solid
-ground already. The main differences from the standard library are in how the
+from an API design perspective, a good portion of this crate should be on
+solid ground. The main differences from the standard library are in how the
various substring search routines work. The standard library provides generic
infrastructure for supporting different types of searches with a single method,
where as this library prefers to define new methods for each type of search and
@@ -180,13 +177,10 @@ drop the generic infrastructure.
Some _probable_ future considerations for APIs include, but are not limited to:
-* A convenience layer on top of the `aho-corasick` crate.
* Unicode normalization.
* More sophisticated support for dealing with Unicode case, perhaps by
combining the use cases supported by [`caseless`](https://docs.rs/caseless)
and [`unicase`](https://docs.rs/unicase).
-* Add facilities for dealing with OS strings and file paths, probably via
- simple conversion routines.
Here are some examples that are _probably_ out of scope for this crate:
diff --git a/src/ascii.rs b/src/ascii.rs
index bb2b679..259d41f 100644
--- a/src/ascii.rs
+++ b/src/ascii.rs
@@ -23,18 +23,18 @@ use core::mem;
// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
// _mm_movemask_epi8.
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const USIZE_BYTES: usize = mem::size_of::<usize>();
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
// This is a mask where the most significant bit of each byte in the usize
// is set. We test this bit to determine whether a character is ASCII or not.
// Namely, a single byte is regarded as an ASCII codepoint if and only if it's
// most significant bit is not set.
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK_U64: u64 = 0x8080808080808080;
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// Returns the index of the first non ASCII byte in the given slice.
@@ -42,18 +42,18 @@ const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// If slice only contains ASCII bytes, then the length of the slice is
/// returned.
pub fn first_non_ascii_byte(slice: &[u8]) -> usize {
- #[cfg(not(target_arch = "x86_64"))]
+ #[cfg(any(miri, not(target_arch = "x86_64")))]
{
first_non_ascii_byte_fallback(slice)
}
- #[cfg(target_arch = "x86_64")]
+ #[cfg(all(not(miri), target_arch = "x86_64"))]
{
first_non_ascii_byte_sse2(slice)
}
}
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
let align = USIZE_BYTES - 1;
let start_ptr = slice.as_ptr();
@@ -115,7 +115,7 @@ fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
}
}
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(not(miri), target_arch = "x86_64"))]
fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
use core::arch::x86_64::*;
@@ -221,7 +221,7 @@ unsafe fn first_non_ascii_byte_slow(
/// bytes is not an ASCII byte.
///
/// The position returned is always in the inclusive range [0, 7].
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_mask(mask: usize) -> usize {
#[cfg(target_endian = "little")]
{
@@ -245,7 +245,7 @@ unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
ptr.offset((amt as isize).wrapping_neg())
}
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
use core::ptr;
@@ -286,6 +286,7 @@ mod tests {
#[test]
#[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
fn positive_sse2_forward() {
for i in 0..517 {
let b = "a".repeat(i).into_bytes();
@@ -294,6 +295,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn negative_fallback_forward() {
for i in 0..517 {
for align in 0..65 {
@@ -315,6 +317,7 @@ mod tests {
#[test]
#[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
fn negative_sse2_forward() {
for i in 0..517 {
for align in 0..65 {
diff --git a/src/bstr.rs b/src/bstr.rs
index 1e3c91b..5036f06 100644
--- a/src/bstr.rs
+++ b/src/bstr.rs
@@ -1,5 +1,8 @@
use core::mem;
+#[cfg(feature = "alloc")]
+use alloc::boxed::Box;
+
/// A wrapper for `&[u8]` that provides convenient string oriented trait impls.
///
/// If you need ownership or a growable byte string buffer, then use
@@ -33,8 +36,31 @@ pub struct BStr {
}
impl BStr {
+ /// Directly creates a `BStr` slice from anything that can be converted
+ /// to a byte slice.
+ ///
+ /// This is very similar to the [`B`](crate::B) function, except this
+ /// returns a `&BStr` instead of a `&[u8]`.
+ ///
+ /// This is a cost-free conversion.
+ ///
+ /// # Example
+ ///
+ /// You can create `BStr`'s from byte arrays, byte slices or even string
+ /// slices:
+ ///
+ /// ```
+ /// use bstr::BStr;
+ ///
+ /// let a = BStr::new(b"abc");
+ /// let b = BStr::new(&b"abc"[..]);
+ /// let c = BStr::new("abc");
+ ///
+ /// assert_eq!(a, b);
+ /// assert_eq!(a, c);
+ /// ```
#[inline]
- pub(crate) fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &BStr {
+ pub fn new<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr {
BStr::from_bytes(bytes.as_ref())
}
@@ -56,13 +82,13 @@ impl BStr {
}
#[inline]
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> {
unsafe { Box::from_raw(Box::into_raw(slice) as _) }
}
#[inline]
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> {
unsafe { Box::from_raw(Box::into_raw(slice) as _) }
}
diff --git a/src/bstring.rs b/src/bstring.rs
index 30093ba..d144b1d 100644
--- a/src/bstring.rs
+++ b/src/bstring.rs
@@ -1,3 +1,5 @@
+use alloc::vec::Vec;
+
use crate::bstr::BStr;
/// A wrapper for `Vec<u8>` that provides convenient string oriented trait
@@ -38,16 +40,43 @@ use crate::bstr::BStr;
/// region of memory containing the bytes, a length and a capacity.
#[derive(Clone, Hash)]
pub struct BString {
- pub(crate) bytes: Vec<u8>,
+ bytes: Vec<u8>,
}
impl BString {
+ /// Constructs a new `BString` from the given [`Vec`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use bstr::BString;
+ ///
+ /// let mut b = BString::new(Vec::with_capacity(10));
+ /// ```
+ ///
+ /// This function is `const`:
+ ///
+ /// ```
+ /// use bstr::BString;
+ ///
+ /// const B: BString = BString::new(vec![]);
+ /// ```
+ #[inline]
+ pub const fn new(bytes: Vec<u8>) -> BString {
+ BString { bytes }
+ }
+
#[inline]
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.bytes
}
#[inline]
+ pub(crate) fn as_bytes_mut(&mut self) -> &mut [u8] {
+ &mut self.bytes
+ }
+
+ #[inline]
pub(crate) fn as_bstr(&self) -> &BStr {
BStr::new(&self.bytes)
}
@@ -56,4 +85,19 @@ impl BString {
pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr {
BStr::new_mut(&mut self.bytes)
}
+
+ #[inline]
+ pub(crate) fn as_vec(&self) -> &Vec<u8> {
+ &self.bytes
+ }
+
+ #[inline]
+ pub(crate) fn as_vec_mut(&mut self) -> &mut Vec<u8> {
+ &mut self.bytes
+ }
+
+ #[inline]
+ pub(crate) fn into_vec(self) -> Vec<u8> {
+ self.bytes
+ }
}
diff --git a/src/byteset/mod.rs b/src/byteset/mod.rs
index 043d309..c6c697c 100644
--- a/src/byteset/mod.rs
+++ b/src/byteset/mod.rs
@@ -1,4 +1,5 @@
use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
mod scalar;
#[inline]
@@ -79,7 +80,7 @@ pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
quickcheck::quickcheck! {
fn qc_byteset_forward_matches_naive(
diff --git a/src/byteset/scalar.rs b/src/byteset/scalar.rs
index 9bd34a8..28bff67 100644
--- a/src/byteset/scalar.rs
+++ b/src/byteset/scalar.rs
@@ -1,9 +1,8 @@
// This is adapted from `fallback.rs` from rust-memchr. It's modified to return
-// the 'inverse' query of memchr, e.g. finding the first byte not in the provided
-// set. This is simple for the 1-byte case.
+// the 'inverse' query of memchr, e.g. finding the first byte not in the
+// provided set. This is simple for the 1-byte case.
-use core::cmp;
-use core::usize;
+use core::{cmp, usize};
#[cfg(target_pointer_width = "32")]
const USIZE_BYTES: usize = 4;
@@ -29,10 +28,11 @@ pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
let loop_size = cmp::min(LOOP_SIZE, haystack.len());
let align = USIZE_BYTES - 1;
let start_ptr = haystack.as_ptr();
- let end_ptr = haystack[haystack.len()..].as_ptr();
- let mut ptr = start_ptr;
unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = start_ptr;
+
if haystack.len() < USIZE_BYTES {
return forward_search(start_ptr, end_ptr, ptr, confirm);
}
@@ -68,10 +68,11 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
let loop_size = cmp::min(LOOP_SIZE, haystack.len());
let align = USIZE_BYTES - 1;
let start_ptr = haystack.as_ptr();
- let end_ptr = haystack[haystack.len()..].as_ptr();
- let mut ptr = end_ptr;
unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = end_ptr;
+
if haystack.len() < USIZE_BYTES {
return reverse_search(start_ptr, end_ptr, ptr, confirm);
}
@@ -81,7 +82,7 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
return reverse_search(start_ptr, end_ptr, ptr, confirm);
}
- ptr = (end_ptr as usize & !align) as *const u8;
+ ptr = ptr.sub(end_ptr as usize & align);
debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
@@ -174,9 +175,10 @@ pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>(
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use super::{inv_memchr, inv_memrchr};
+
// search string, search byte, inv_memchr result, inv_memrchr result.
// these are expanded into a much larger set of tests in build_tests
const TESTS: &[(&[u8], u8, usize, usize)] = &[
@@ -192,10 +194,15 @@ mod tests {
type TestCase = (Vec<u8>, u8, Option<(usize, usize)>);
fn build_tests() -> Vec<TestCase> {
+ #[cfg(not(miri))]
+ const MAX_PER: usize = 515;
+ #[cfg(miri)]
+ const MAX_PER: usize = 10;
+
let mut result = vec![];
for &(search, byte, fwd_pos, rev_pos) in TESTS {
result.push((search.to_vec(), byte, Some((fwd_pos, rev_pos))));
- for i in 1..515 {
+ for i in 1..MAX_PER {
// add a bunch of copies of the search byte to the end.
let mut suffixed: Vec<u8> = search.into();
suffixed.extend(std::iter::repeat(byte).take(i));
@@ -225,7 +232,7 @@ mod tests {
}
// build non-matching tests for several sizes
- for i in 0..515 {
+ for i in 0..MAX_PER {
result.push((
std::iter::repeat(b'\0').take(i).collect(),
b'\0',
@@ -239,6 +246,12 @@ mod tests {
#[test]
fn test_inv_memchr() {
use crate::{ByteSlice, B};
+
+ #[cfg(not(miri))]
+ const MAX_OFFSET: usize = 130;
+ #[cfg(miri)]
+ const MAX_OFFSET: usize = 13;
+
for (search, byte, matching) in build_tests() {
assert_eq!(
inv_memchr(byte, &search),
@@ -256,13 +269,14 @@ mod tests {
// better printing
B(&search).as_bstr(),
);
- // Test a rather large number off offsets for potential alignment issues
- for offset in 1..130 {
+ // Test a rather large number off offsets for potential alignment
+ // issues.
+ for offset in 1..MAX_OFFSET {
if offset >= search.len() {
break;
}
- // If this would cause us to shift the results off the end, skip
- // it so that we don't have to recompute them.
+ // If this would cause us to shift the results off the end,
+ // skip it so that we don't have to recompute them.
if let Some((f, r)) = matching {
if offset > f || offset > r {
break;
diff --git a/src/ext_slice.rs b/src/ext_slice.rs
index 0cc73af..ec52a61 100644
--- a/src/ext_slice.rs
+++ b/src/ext_slice.rs
@@ -1,17 +1,16 @@
+use core::{iter, slice, str};
+
+#[cfg(all(feature = "alloc", feature = "unicode"))]
+use alloc::vec;
+#[cfg(feature = "alloc")]
+use alloc::{borrow::Cow, string::String, vec::Vec};
+
#[cfg(feature = "std")]
-use std::borrow::Cow;
-#[cfg(feature = "std")]
-use std::ffi::OsStr;
-#[cfg(feature = "std")]
-use std::path::Path;
+use std::{ffi::OsStr, path::Path};
-use core::{iter, ops, ptr, slice, str};
use memchr::{memchr, memmem, memrchr};
-use crate::ascii;
-use crate::bstr::BStr;
-use crate::byteset;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
use crate::ext_vec::ByteVec;
#[cfg(feature = "unicode")]
use crate::unicode::{
@@ -19,7 +18,12 @@ use crate::unicode::{
SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
WordsWithBreaks,
};
-use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
+use crate::{
+ ascii,
+ bstr::BStr,
+ byteset,
+ utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error},
+};
/// A short-hand constructor for building a `&[u8]`.
///
@@ -149,11 +153,12 @@ pub trait ByteSlice: Sealed {
/// Create an immutable byte string from an OS string slice.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns `None` if the given OS string is not valid UTF-8. (For
- /// example, on Windows, file paths are allowed to be a sequence of
- /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
- /// valid UTF-8.)
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns `None` if the
+ /// given OS string is not valid UTF-8. (For example, when the underlying
+ /// bytes are inaccessible on Windows, file paths are allowed to be a
+ /// sequence of arbitrary 16-bit integers. Not all such sequences can be
+ /// transcoded to valid UTF-8.)
///
/// # Examples
///
@@ -190,10 +195,12 @@ pub trait ByteSlice: Sealed {
/// Create an immutable byte string from a file path.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns `None` if the given path is not valid UTF-8. (For example,
- /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit
- /// integers. Not all such sequences can be transcoded to valid UTF-8.)
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns `None` if the given
+ /// path is not valid UTF-8. (For example, when the underlying bytes are
+ /// inaccessible on Windows, file paths are allowed to be a sequence of
+ /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
+ /// valid UTF-8.)
///
/// # Examples
///
@@ -230,6 +237,7 @@ pub trait ByteSlice: Sealed {
/// Basic usage:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// # fn example() -> Result<(), bstr::Utf8Error> {
@@ -241,6 +249,7 @@ pub trait ByteSlice: Sealed {
/// let err = bstring.to_str().unwrap_err();
/// assert_eq!(8, err.valid_up_to());
/// # Ok(()) }; example().unwrap()
+ /// # }
/// ```
#[inline]
fn to_str(&self) -> Result<&str, Utf8Error> {
@@ -301,7 +310,7 @@ pub trait ByteSlice: Sealed {
/// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
/// For a more precise description of the maximal subpart strategy, see
/// the Unicode Standard, Chapter 3, Section 9. See also
- /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html).
+ /// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html).
///
/// N.B. Rust's standard library also appears to use the same strategy,
/// but it does not appear to be an API guarantee.
@@ -341,7 +350,7 @@ pub trait ByteSlice: Sealed {
/// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
/// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy(&self) -> Cow<'_, str> {
match utf8::validate(self.as_bytes()) {
@@ -398,7 +407,7 @@ pub trait ByteSlice: Sealed {
/// bstring.to_str_lossy_into(&mut dest);
/// assert_eq!("☃βツ\u{FFFD}", dest);
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy_into(&self, dest: &mut String) {
let mut bytes = self.as_bytes();
@@ -428,12 +437,15 @@ pub trait ByteSlice: Sealed {
/// Create an OS string slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns a UTF-8 decoding error if this byte string is not valid
- /// UTF-8. (For example, on Windows, file paths are allowed to be a
- /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
- /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
- /// 16-bit integers.)
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `OsStr` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `OsStr`
+ /// is ever opened up, then this will convert any sequence of bytes to an
+ /// `OsStr` without cost.)
///
/// # Examples
///
@@ -467,13 +479,13 @@ pub trait ByteSlice: Sealed {
/// Lossily create an OS string slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsStr` is opaque.
///
/// # Examples
///
@@ -512,12 +524,15 @@ pub trait ByteSlice: Sealed {
/// Create a path slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns a UTF-8 decoding error if this byte string is not valid
- /// UTF-8. (For example, on Windows, file paths are allowed to be a
- /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
- /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
- /// 16-bit integers.)
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `Path` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `Path`
+ /// is ever opened up, then this will convert any sequence of bytes to a
+ /// `Path` without cost.)
///
/// # Examples
///
@@ -537,13 +552,13 @@ pub trait ByteSlice: Sealed {
/// Lossily create a path slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `Path` is opaque.
///
/// # Examples
///
@@ -584,15 +599,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
/// assert_eq!(b"foo".repeatn(0), B(""));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn repeatn(&self, n: usize) -> Vec<u8> {
- let bs = self.as_bytes();
- let mut dst = vec![0; bs.len() * n];
- for i in 0..n {
- dst[i * bs.len()..(i + 1) * bs.len()].copy_from_slice(bs);
- }
- dst
+ self.as_bytes().repeat(n)
}
/// Returns true if and only if this byte string contains the given needle.
@@ -759,10 +769,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(matches, vec![0]);
/// ```
#[inline]
- fn find_iter<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- needle: &'a B,
- ) -> Find<'a> {
+ fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B,
+ ) -> Find<'h, 'n> {
Find::new(self.as_bytes(), needle.as_ref())
}
@@ -804,10 +814,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(matches, vec![0]);
/// ```
#[inline]
- fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- needle: &'a B,
- ) -> FindReverse<'a> {
+ fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B,
+ ) -> FindReverse<'h, 'n> {
FindReverse::new(self.as_bytes(), needle.as_ref())
}
@@ -926,14 +936,17 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
/// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
/// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
+ /// // The empty byteset never matches.
+ /// assert_eq!(None, b"abc".find_byteset(b""));
+ /// assert_eq!(None, b"".find_byteset(b""));
/// ```
#[inline]
fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
byteset::find(self.as_bytes(), byteset.as_ref())
}
- /// Returns the index of the first occurrence of a byte that is not a member
- /// of the provided set.
+ /// Returns the index of the first occurrence of a byte that is not a
+ /// member of the provided set.
///
/// The `byteset` may be any type that can be cheaply converted into a
/// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
@@ -963,6 +976,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
/// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
/// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
+ /// // The negation of the empty byteset matches everything.
+ /// assert_eq!(Some(0), b"abc".find_not_byteset(b""));
+ /// // But an empty string never contains anything.
+ /// assert_eq!(None, b"".find_not_byteset(b""));
/// ```
#[inline]
fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
@@ -1043,8 +1060,9 @@ pub trait ByteSlice: Sealed {
byteset::rfind_not(self.as_bytes(), byteset.as_ref())
}
- /// Returns an iterator over the fields in a byte string, separated by
- /// contiguous whitespace.
+ /// Returns an iterator over the fields in a byte string, separated
+ /// by contiguous whitespace (according to the Unicode property
+ /// `White_Space`).
///
/// # Example
///
@@ -1065,6 +1083,7 @@ pub trait ByteSlice: Sealed {
///
/// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
/// ```
+ #[cfg(feature = "unicode")]
#[inline]
fn fields(&self) -> Fields<'_> {
Fields::new(self.as_bytes())
@@ -1191,10 +1210,10 @@ pub trait ByteSlice: Sealed {
/// It does *not* give you `["a", "b", "c"]`. For that behavior, use
/// [`fields`](#method.fields) instead.
#[inline]
- fn split_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- splitter: &'a B,
- ) -> Split<'a> {
+ fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B,
+ ) -> Split<'h, 's> {
Split::new(self.as_bytes(), splitter.as_ref())
}
@@ -1285,13 +1304,101 @@ pub trait ByteSlice: Sealed {
///
/// It does *not* give you `["a", "b", "c"]`.
#[inline]
- fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- splitter: &'a B,
- ) -> SplitReverse<'a> {
+ fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B,
+ ) -> SplitReverse<'h, 's> {
SplitReverse::new(self.as_bytes(), splitter.as_ref())
}
+ /// Split this byte string at the first occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+ /// containing the parts of the string before and after the first occurrence
+ /// of `splitter` respectively. Otherwise, if there are no occurrences of
+ /// `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *last* instance of a delimiter instead, see
+ /// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").split_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").split_once_str(","),
+ /// Some((B("foo"), B("bar,baz"))),
+ /// );
+ /// assert_eq!(B("foo").split_once_str(","), None);
+ /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+ let bytes = self.as_bytes();
+ let splitter = splitter.as_ref();
+ let start = Finder::new(splitter).find(bytes)?;
+ let end = start + splitter.len();
+ Some((&bytes[..start], &bytes[end..]))
+ }
+
+ /// Split this byte string at the last occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+ /// containing the parts of the string before and after the last occurrence
+ /// of `splitter`, respectively. Otherwise, if there are no occurrences of
+ /// `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *first* instance of a delimiter instead, see
+ /// the [`ByteSlice::split_once_str`](#method.split_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").rsplit_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").rsplit_once_str(","),
+ /// Some((B("foo,bar"), B("baz"))),
+ /// );
+ /// assert_eq!(B("foo").rsplit_once_str(","), None);
+ /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+ let bytes = self.as_bytes();
+ let splitter = splitter.as_ref();
+ let start = FinderReverse::new(splitter).rfind(bytes)?;
+ let end = start + splitter.len();
+ Some((&bytes[..start], &bytes[end..]))
+ }
+
/// Returns an iterator of at most `limit` substrings of this byte string,
/// separated by the given byte string. If `limit` substrings are yielded,
/// then the last substring will contain the remainder of this byte string.
@@ -1328,11 +1435,11 @@ pub trait ByteSlice: Sealed {
/// assert!(x.is_empty());
/// ```
#[inline]
- fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
+ fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
limit: usize,
- splitter: &'a B,
- ) -> SplitN<'a> {
+ splitter: &'s B,
+ ) -> SplitN<'h, 's> {
SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
}
@@ -1374,11 +1481,11 @@ pub trait ByteSlice: Sealed {
/// assert!(x.is_empty());
/// ```
#[inline]
- fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
+ fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
limit: usize,
- splitter: &'a B,
- ) -> SplitNReverse<'a> {
+ splitter: &'s B,
+ ) -> SplitNReverse<'h, 's> {
SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
}
@@ -1416,7 +1523,7 @@ pub trait ByteSlice: Sealed {
/// let s = b"foo".replace("", "Z");
/// assert_eq!(s, "ZfZoZoZ".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1462,7 +1569,7 @@ pub trait ByteSlice: Sealed {
/// let s = b"foo".replacen("", "Z", 2);
/// assert_eq!(s, "ZfZoo".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1520,7 +1627,7 @@ pub trait ByteSlice: Sealed {
/// s.replace_into("", "Z", &mut dest);
/// assert_eq!(dest, "ZfZoZoZ".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1584,7 +1691,7 @@ pub trait ByteSlice: Sealed {
/// s.replacen_into("", "Z", 2, &mut dest);
/// assert_eq!(dest, "ZfZoo".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1800,6 +1907,7 @@ pub trait ByteSlice: Sealed {
/// not necessarily correspond to the length of the `&str` returned!
///
/// ```
+ /// # #[cfg(all(feature = "alloc"))] {
/// use bstr::{ByteSlice, ByteVec};
///
/// let mut bytes = vec![];
@@ -1813,6 +1921,7 @@ pub trait ByteSlice: Sealed {
/// graphemes,
/// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")]
/// );
+ /// # }
/// ```
#[cfg(feature = "unicode")]
#[inline]
@@ -2277,7 +2386,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
/// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
let mut buf = vec![];
@@ -2339,7 +2448,7 @@ pub trait ByteSlice: Sealed {
/// s.to_lowercase_into(&mut buf);
/// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
// TODO: This is the best we can do given what std exposes I think.
@@ -2394,7 +2503,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
/// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
self.as_bytes().to_ascii_lowercase()
@@ -2424,11 +2533,13 @@ pub trait ByteSlice: Sealed {
/// Invalid UTF-8 remains as is:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
/// s.make_ascii_lowercase();
/// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
+ /// # }
/// ```
#[inline]
fn make_ascii_lowercase(&mut self) {
@@ -2480,7 +2591,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"foo\xFFbar\xE2\x98baz");
/// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
let mut buf = vec![];
@@ -2542,7 +2653,7 @@ pub trait ByteSlice: Sealed {
/// s.to_uppercase_into(&mut buf);
/// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
// TODO: This is the best we can do given what std exposes I think.
@@ -2594,7 +2705,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"foo\xFFbar\xE2\x98baz");
/// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
self.as_bytes().to_ascii_uppercase()
@@ -2624,11 +2735,13 @@ pub trait ByteSlice: Sealed {
/// Invalid UTF-8 remains as is:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
/// s.make_ascii_uppercase();
/// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// # }
/// ```
#[inline]
fn make_ascii_uppercase(&mut self) {
@@ -2900,72 +3013,6 @@ pub trait ByteSlice: Sealed {
Some(index)
}
}
-
- /// Copies elements from one part of the slice to another part of itself,
- /// where the parts may be overlapping.
- ///
- /// `src` is the range within this byte string to copy from, while `dest`
- /// is the starting index of the range within this byte string to copy to.
- /// The length indicated by `src` must be less than or equal to the number
- /// of bytes from `dest` to the end of the byte string.
- ///
- /// # Panics
- ///
- /// Panics if either range is out of bounds, or if `src` is too big to fit
- /// into `dest`, or if the end of `src` is before the start.
- ///
- /// # Examples
- ///
- /// Copying four bytes within a byte string:
- ///
- /// ```
- /// use bstr::{B, ByteSlice};
- ///
- /// let mut buf = *b"Hello, World!";
- /// let s = &mut buf;
- /// s.copy_within_str(1..5, 8);
- /// assert_eq!(s, B("Hello, Wello!"));
- /// ```
- #[inline]
- fn copy_within_str<R>(&mut self, src: R, dest: usize)
- where
- R: ops::RangeBounds<usize>,
- {
- // TODO: Deprecate this once slice::copy_within stabilizes.
- let src_start = match src.start_bound() {
- ops::Bound::Included(&n) => n,
- ops::Bound::Excluded(&n) => {
- n.checked_add(1).expect("attempted to index slice beyond max")
- }
- ops::Bound::Unbounded => 0,
- };
- let src_end = match src.end_bound() {
- ops::Bound::Included(&n) => {
- n.checked_add(1).expect("attempted to index slice beyond max")
- }
- ops::Bound::Excluded(&n) => n,
- ops::Bound::Unbounded => self.as_bytes().len(),
- };
- assert!(src_start <= src_end, "src end is before src start");
- assert!(src_end <= self.as_bytes().len(), "src is out of bounds");
- let count = src_end - src_start;
- assert!(
- dest <= self.as_bytes().len() - count,
- "dest is out of bounds",
- );
-
- // SAFETY: This is safe because we use ptr::copy to handle overlapping
- // copies, and is also safe because we've checked all the bounds above.
- // Finally, we are only dealing with u8 data, which is Copy, which
- // means we can copy without worrying about ownership/destructors.
- unsafe {
- ptr::copy(
- self.as_bytes().get_unchecked(src_start),
- self.as_bytes_mut().get_unchecked_mut(dest),
- count,
- );
- }
- }
}
/// A single substring searcher fixed to a particular needle.
@@ -3138,22 +3185,22 @@ impl<'a> FinderReverse<'a> {
///
/// Matches are reported by the byte offset at which they begin.
///
-/// `'a` is the shorter of two lifetimes: the byte string being searched or the
-/// byte string being looked for.
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
#[derive(Debug)]
-pub struct Find<'a> {
- it: memmem::FindIter<'a, 'a>,
- haystack: &'a [u8],
- needle: &'a [u8],
+pub struct Find<'h, 'n> {
+ it: memmem::FindIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
}
-impl<'a> Find<'a> {
- fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
+impl<'h, 'n> Find<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> {
Find { it: memmem::find_iter(haystack, needle), haystack, needle }
}
}
-impl<'a> Iterator for Find<'a> {
+impl<'h, 'n> Iterator for Find<'h, 'n> {
type Item = usize;
#[inline]
@@ -3166,17 +3213,17 @@ impl<'a> Iterator for Find<'a> {
///
/// Matches are reported by the byte offset at which they begin.
///
-/// `'a` is the shorter of two lifetimes: the byte string being searched or the
-/// byte string being looked for.
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
#[derive(Debug)]
-pub struct FindReverse<'a> {
- it: memmem::FindRevIter<'a, 'a>,
- haystack: &'a [u8],
- needle: &'a [u8],
+pub struct FindReverse<'h, 'n> {
+ it: memmem::FindRevIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
}
-impl<'a> FindReverse<'a> {
- fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
+impl<'h, 'n> FindReverse<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> {
FindReverse {
it: memmem::rfind_iter(haystack, needle),
haystack,
@@ -3184,16 +3231,16 @@ impl<'a> FindReverse<'a> {
}
}
- fn haystack(&self) -> &'a [u8] {
+ fn haystack(&self) -> &'h [u8] {
self.haystack
}
- fn needle(&self) -> &[u8] {
+ fn needle(&self) -> &'n [u8] {
self.needle
}
}
-impl<'a> Iterator for FindReverse<'a> {
+impl<'h, 'n> Iterator for FindReverse<'h, 'n> {
type Item = usize;
#[inline]
@@ -3215,7 +3262,7 @@ impl<'a> Bytes<'a> {
/// This has the same lifetime as the original slice,
/// and so the iterator can continue to be used while this exists.
#[inline]
- pub fn as_slice(&self) -> &'a [u8] {
+ pub fn as_bytes(&self) -> &'a [u8] {
self.it.as_slice()
}
}
@@ -3252,21 +3299,27 @@ impl<'a> iter::FusedIterator for Bytes<'a> {}
/// An iterator over the fields in a byte string, separated by whitespace.
///
+/// Whitespace for this iterator is defined by the Unicode property
+/// `White_Space`.
+///
/// This iterator splits on contiguous runs of whitespace, such that the fields
/// in `foo\t\t\n \nbar` are `foo` and `bar`.
///
/// `'a` is the lifetime of the byte string being split.
+#[cfg(feature = "unicode")]
#[derive(Debug)]
pub struct Fields<'a> {
it: FieldsWith<'a, fn(char) -> bool>,
}
+#[cfg(feature = "unicode")]
impl<'a> Fields<'a> {
fn new(bytes: &'a [u8]) -> Fields<'a> {
Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) }
}
}
+#[cfg(feature = "unicode")]
impl<'a> Iterator for Fields<'a> {
type Item = &'a [u8];
@@ -3328,10 +3381,11 @@ impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
/// An iterator over substrings in a byte string, split by a separator.
///
-/// `'a` is the lifetime of the byte string being split.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct Split<'a> {
- finder: Find<'a>,
+pub struct Split<'h, 's> {
+ finder: Find<'h, 's>,
/// The end position of the previous match of our splitter. The element
/// we yield corresponds to the substring starting at `last` up to the
/// beginning of the next match of the splitter.
@@ -3342,18 +3396,18 @@ pub struct Split<'a> {
done: bool,
}
-impl<'a> Split<'a> {
- fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> {
+impl<'h, 's> Split<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> {
let finder = haystack.find_iter(splitter);
Split { finder, last: 0, done: false }
}
}
-impl<'a> Iterator for Split<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for Split<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
let haystack = self.finder.haystack;
match self.finder.next() {
Some(start) => {
@@ -3383,11 +3437,11 @@ impl<'a> Iterator for Split<'a> {
/// An iterator over substrings in a byte string, split by a separator, in
/// reverse.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitReverse<'a> {
- finder: FindReverse<'a>,
+pub struct SplitReverse<'h, 's> {
+ finder: FindReverse<'h, 's>,
/// The end position of the previous match of our splitter. The element
/// we yield corresponds to the substring starting at `last` up to the
/// beginning of the next match of the splitter.
@@ -3398,18 +3452,18 @@ pub struct SplitReverse<'a> {
done: bool,
}
-impl<'a> SplitReverse<'a> {
- fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> {
+impl<'h, 's> SplitReverse<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> {
let finder = haystack.rfind_iter(splitter);
SplitReverse { finder, last: haystack.len(), done: false }
}
}
-impl<'a> Iterator for SplitReverse<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitReverse<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
let haystack = self.finder.haystack();
match self.finder.next() {
Some(start) => {
@@ -3440,31 +3494,31 @@ impl<'a> Iterator for SplitReverse<'a> {
/// An iterator over at most `n` substrings in a byte string, split by a
/// separator.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitN<'a> {
- split: Split<'a>,
+pub struct SplitN<'h, 's> {
+ split: Split<'h, 's>,
limit: usize,
count: usize,
}
-impl<'a> SplitN<'a> {
+impl<'h, 's> SplitN<'h, 's> {
fn new(
- haystack: &'a [u8],
- splitter: &'a [u8],
+ haystack: &'h [u8],
+ splitter: &'s [u8],
limit: usize,
- ) -> SplitN<'a> {
+ ) -> SplitN<'h, 's> {
let split = haystack.split_str(splitter);
SplitN { split, limit, count: 0 }
}
}
-impl<'a> Iterator for SplitN<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitN<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
self.count += 1;
if self.count > self.limit || self.split.done {
None
@@ -3479,31 +3533,31 @@ impl<'a> Iterator for SplitN<'a> {
/// An iterator over at most `n` substrings in a byte string, split by a
/// separator, in reverse.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitNReverse<'a> {
- split: SplitReverse<'a>,
+pub struct SplitNReverse<'h, 's> {
+ split: SplitReverse<'h, 's>,
limit: usize,
count: usize,
}
-impl<'a> SplitNReverse<'a> {
+impl<'h, 's> SplitNReverse<'h, 's> {
fn new(
- haystack: &'a [u8],
- splitter: &'a [u8],
+ haystack: &'h [u8],
+ splitter: &'s [u8],
limit: usize,
- ) -> SplitNReverse<'a> {
+ ) -> SplitNReverse<'h, 's> {
let split = haystack.rsplit_str(splitter);
SplitNReverse { split, limit, count: 0 }
}
}
-impl<'a> Iterator for SplitNReverse<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitNReverse<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
self.count += 1;
if self.count > self.limit || self.split.done {
None
@@ -3521,6 +3575,7 @@ impl<'a> Iterator for SplitNReverse<'a> {
/// `\n`.
///
/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
pub struct Lines<'a> {
it: LinesWithTerminator<'a>,
}
@@ -3529,6 +3584,28 @@ impl<'a> Lines<'a> {
fn new(bytes: &'a [u8]) -> Lines<'a> {
Lines { it: LinesWithTerminator::new(bytes) }
}
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines();
+ /// assert_eq!(lines.next(), Some(B("foo")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.it.bytes
+ }
}
impl<'a> Iterator for Lines<'a> {
@@ -3536,17 +3613,19 @@ impl<'a> Iterator for Lines<'a> {
#[inline]
fn next(&mut self) -> Option<&'a [u8]> {
- let mut line = self.it.next()?;
- if line.last_byte() == Some(b'\n') {
- line = &line[..line.len() - 1];
- if line.last_byte() == Some(b'\r') {
- line = &line[..line.len() - 1];
- }
- }
- Some(line)
+ Some(trim_last_terminator(self.it.next()?))
+ }
+}
+
+impl<'a> DoubleEndedIterator for Lines<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ Some(trim_last_terminator(self.it.next_back()?))
}
}
+impl<'a> iter::FusedIterator for Lines<'a> {}
+
/// An iterator over all lines in a byte string, including their terminators.
///
/// For this iterator, the only line terminator recognized is `\n`. (Since
@@ -3560,6 +3639,7 @@ impl<'a> Iterator for Lines<'a> {
/// the original byte string.
///
/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
pub struct LinesWithTerminator<'a> {
bytes: &'a [u8],
}
@@ -3568,6 +3648,28 @@ impl<'a> LinesWithTerminator<'a> {
fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
LinesWithTerminator { bytes }
}
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines_with_terminator();
+ /// assert_eq!(lines.next(), Some(B("foo\n")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.bytes
+ }
}
impl<'a> Iterator for LinesWithTerminator<'a> {
@@ -3591,10 +3693,43 @@ impl<'a> Iterator for LinesWithTerminator<'a> {
}
}
-#[cfg(test)]
+impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ let end = self.bytes.len().checked_sub(1)?;
+ match self.bytes[..end].rfind_byte(b'\n') {
+ None => {
+ let line = self.bytes;
+ self.bytes = b"";
+ Some(line)
+ }
+ Some(end) => {
+ let line = &self.bytes[end + 1..];
+ self.bytes = &self.bytes[..end + 1];
+ Some(line)
+ }
+ }
+ }
+}
+
+impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {}
+
+fn trim_last_terminator(mut s: &[u8]) -> &[u8] {
+ if s.last_byte() == Some(b'\n') {
+ s = &s[..s.len() - 1];
+ if s.last_byte() == Some(b'\r') {
+ s = &s[..s.len() - 1];
+ }
+ }
+ s
+}
+
+#[cfg(all(test, feature = "std"))]
mod tests {
- use crate::ext_slice::{ByteSlice, B};
- use crate::tests::LOSSY_TESTS;
+ use crate::{
+ ext_slice::{ByteSlice, Lines, LinesWithTerminator, B},
+ tests::LOSSY_TESTS,
+ };
#[test]
fn to_str_lossy() {
@@ -3622,34 +3757,55 @@ mod tests {
}
#[test]
- #[should_panic]
- fn copy_within_fail1() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(0..2, 5);
- }
+ fn lines_iteration() {
+ macro_rules! t {
+ ($it:expr, $forward:expr) => {
+ let mut res: Vec<&[u8]> = Vec::from($forward);
+ assert_eq!($it.collect::<Vec<_>>(), res);
+ res.reverse();
+ assert_eq!($it.rev().collect::<Vec<_>>(), res);
+ };
+ }
- #[test]
- #[should_panic]
- fn copy_within_fail2() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(3..2, 0);
- }
+ t!(Lines::new(b""), []);
+ t!(LinesWithTerminator::new(b""), []);
- #[test]
- #[should_panic]
- fn copy_within_fail3() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(5..7, 0);
- }
+ t!(Lines::new(b"\n"), [B("")]);
+ t!(Lines::new(b"\r\n"), [B("")]);
+ t!(LinesWithTerminator::new(b"\n"), [B("\n")]);
- #[test]
- #[should_panic]
- fn copy_within_fail4() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(0..1, 6);
+ t!(Lines::new(b"a"), [B("a")]);
+ t!(LinesWithTerminator::new(b"a"), [B("a")]);
+
+ t!(Lines::new(b"abc"), [B("abc")]);
+ t!(LinesWithTerminator::new(b"abc"), [B("abc")]);
+
+ t!(Lines::new(b"abc\n"), [B("abc")]);
+ t!(Lines::new(b"abc\r\n"), [B("abc")]);
+ t!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]);
+
+ t!(Lines::new(b"abc\n\n"), [B("abc"), B("")]);
+ t!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]);
+
+ t!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]);
+ t!(
+ LinesWithTerminator::new(b"abc\n\ndef"),
+ [B("abc\n"), B("\n"), B("def")]
+ );
+
+ t!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]);
+ t!(
+ LinesWithTerminator::new(b"abc\n\ndef\n"),
+ [B("abc\n"), B("\n"), B("def\n")]
+ );
+
+ t!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]);
+ t!(
+ LinesWithTerminator::new(b"\na\nb\n"),
+ [B("\n"), B("a\n"), B("b\n")]
+ );
+
+ t!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]);
+ t!(LinesWithTerminator::new(b"\n\n\n"), [B("\n"), B("\n"), B("\n")]);
}
}
diff --git a/src/ext_vec.rs b/src/ext_vec.rs
index 5beb0e1..5effdd0 100644
--- a/src/ext_vec.rs
+++ b/src/ext_vec.rs
@@ -1,16 +1,21 @@
-use std::borrow::Cow;
-use std::error;
-use std::ffi::{OsStr, OsString};
-use std::fmt;
-use std::iter;
-use std::ops;
-use std::path::{Path, PathBuf};
-use std::ptr;
-use std::str;
-use std::vec;
-
-use crate::ext_slice::ByteSlice;
-use crate::utf8::{self, Utf8Error};
+use core::fmt;
+use core::iter;
+use core::ops;
+use core::ptr;
+
+use alloc::{borrow::Cow, string::String, vec, vec::Vec};
+
+#[cfg(feature = "std")]
+use std::{
+ error,
+ ffi::{OsStr, OsString},
+ path::{Path, PathBuf},
+};
+
+use crate::{
+ ext_slice::ByteSlice,
+ utf8::{self, Utf8Error},
+};
/// Concatenate the elements given by the iterator together into a single
/// `Vec<u8>`.
@@ -154,8 +159,9 @@ pub trait ByteVec: Sealed {
/// Create a new byte string from an owned OS string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original OS string if it is not valid UTF-8.
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns the given
+ /// `OsString` if it is not valid UTF-8.
///
/// # Examples
///
@@ -171,6 +177,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_os_string(os_str: OsString) -> Result<Vec<u8>, OsString> {
#[cfg(unix)]
#[inline]
@@ -191,10 +198,11 @@ pub trait ByteVec: Sealed {
/// Lossily create a new byte string from an OS string slice.
///
- /// On Unix, this always succeeds, is zero cost and always returns a slice.
- /// On non-Unix systems, this does a UTF-8 check. If the given OS string
- /// slice is not valid UTF-8, then it is lossily decoded into valid UTF-8
- /// (with invalid bytes replaced by the Unicode replacement codepoint).
+ /// When the underlying bytes of OS strings are accessible, then this is
+ /// zero cost and always returns a slice. Otherwise, a UTF-8 check is
+ /// performed and if the given OS string is not valid UTF-8, then it is
+ /// lossily decoded into valid UTF-8 (with invalid bytes replaced by the
+ /// Unicode replacement codepoint).
///
/// # Examples
///
@@ -210,6 +218,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_os_str_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> {
#[cfg(unix)]
#[inline]
@@ -233,8 +242,9 @@ pub trait ByteVec: Sealed {
/// Create a new byte string from an owned file path.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original path if it is not valid UTF-8.
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns the given `PathBuf`
+ /// if it is not valid UTF-8.
///
/// # Examples
///
@@ -250,16 +260,18 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_path_buf(path: PathBuf) -> Result<Vec<u8>, PathBuf> {
Vec::from_os_string(path.into_os_string()).map_err(PathBuf::from)
}
/// Lossily create a new byte string from a file path.
///
- /// On Unix, this always succeeds, is zero cost and always returns a slice.
- /// On non-Unix systems, this does a UTF-8 check. If the given path is not
- /// valid UTF-8, then it is lossily decoded into valid UTF-8 (with invalid
- /// bytes replaced by the Unicode replacement codepoint).
+ /// When the underlying bytes of paths are accessible, then this is
+ /// zero cost and always returns a slice. Otherwise, a UTF-8 check is
+ /// performed and if the given path is not valid UTF-8, then it is lossily
+ /// decoded into valid UTF-8 (with invalid bytes replaced by the Unicode
+ /// replacement codepoint).
///
/// # Examples
///
@@ -275,6 +287,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_path_lossy<'a>(path: &'a Path) -> Cow<'a, [u8]> {
Vec::from_os_str_lossy(path.as_os_str())
}
@@ -363,12 +376,10 @@ pub trait ByteVec: Sealed {
/// ```
/// use bstr::ByteVec;
///
- /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = Vec::from("hello");
- /// let string = bytes.into_string()?;
+ /// let string = bytes.into_string().unwrap();
///
/// assert_eq!("hello", string);
- /// # Ok(()) }; example().unwrap()
/// ```
///
/// If this byte string is not valid UTF-8, then an error will be returned.
@@ -469,8 +480,9 @@ pub trait ByteVec: Sealed {
/// Converts this byte string into an OS string, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original byte string if it is not valid UTF-8.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, if this byte string is not
+ /// valid UTF-8, then an error (with the original byte string) is returned.
///
/// # Examples
///
@@ -485,14 +497,15 @@ pub trait ByteVec: Sealed {
/// let os_str = bs.into_os_string().expect("should be valid UTF-8");
/// assert_eq!(os_str, OsStr::new("foo"));
/// ```
+ #[cfg(feature = "std")]
#[inline]
- fn into_os_string(self) -> Result<OsString, Vec<u8>>
+ fn into_os_string(self) -> Result<OsString, FromUtf8Error>
where
Self: Sized,
{
#[cfg(unix)]
#[inline]
- fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> {
+ fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> {
use std::os::unix::ffi::OsStringExt;
Ok(OsString::from_vec(v))
@@ -500,11 +513,8 @@ pub trait ByteVec: Sealed {
#[cfg(not(unix))]
#[inline]
- fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> {
- match v.into_string() {
- Ok(s) => Ok(OsString::from(s)),
- Err(err) => Err(err.into_vec()),
- }
+ fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> {
+ v.into_string().map(OsString::from)
}
imp(self.into_vec())
@@ -512,13 +522,13 @@ pub trait ByteVec: Sealed {
/// Lossily converts this byte string into an OS string, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always succeeds. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsString` is opaque.
///
/// # Examples
///
@@ -532,6 +542,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn into_os_string_lossy(self) -> OsString
where
Self: Sized,
@@ -555,8 +566,9 @@ pub trait ByteVec: Sealed {
/// Converts this byte string into an owned file path, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original byte string if it is not valid UTF-8.
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, if this byte string is not
+ /// valid UTF-8, then an error (with the original byte string) is returned.
///
/// # Examples
///
@@ -569,8 +581,9 @@ pub trait ByteVec: Sealed {
/// let path = bs.into_path_buf().expect("should be valid UTF-8");
/// assert_eq!(path.as_os_str(), "foo");
/// ```
+ #[cfg(feature = "std")]
#[inline]
- fn into_path_buf(self) -> Result<PathBuf, Vec<u8>>
+ fn into_path_buf(self) -> Result<PathBuf, FromUtf8Error>
where
Self: Sized,
{
@@ -579,13 +592,13 @@ pub trait ByteVec: Sealed {
/// Lossily converts this byte string into an owned file path, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always succeeds. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `PathBuf` is opaque.
///
/// # Examples
///
@@ -599,6 +612,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn into_path_buf_lossy(self) -> PathBuf
where
Self: Sized,
@@ -1029,6 +1043,7 @@ impl FromUtf8Error {
}
}
+#[cfg(feature = "std")]
impl error::Error for FromUtf8Error {
#[inline]
fn description(&self) -> &str {
@@ -1043,7 +1058,7 @@ impl fmt::Display for FromUtf8Error {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use crate::ext_vec::ByteVec;
diff --git a/src/impls.rs b/src/impls.rs
index 85a27ba..669aee6 100644
--- a/src/impls.rs
+++ b/src/impls.rs
@@ -18,7 +18,7 @@ macro_rules! impl_partial_eq {
};
}
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
macro_rules! impl_partial_eq_cow {
($lhs:ty, $rhs:ty) => {
impl<'a, 'b> PartialEq<$rhs> for $lhs {
@@ -59,17 +59,22 @@ macro_rules! impl_partial_ord {
};
}
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod bstring {
- use std::borrow::{Borrow, Cow, ToOwned};
- use std::cmp::Ordering;
- use std::fmt;
- use std::iter::FromIterator;
- use std::ops;
+ use core::{
+ cmp::Ordering, convert::TryFrom, fmt, iter::FromIterator, ops,
+ };
- use crate::bstr::BStr;
- use crate::bstring::BString;
- use crate::ext_vec::ByteVec;
+ use alloc::{
+ borrow::{Borrow, Cow, ToOwned},
+ string::String,
+ vec,
+ vec::Vec,
+ };
+
+ use crate::{
+ bstr::BStr, bstring::BString, ext_slice::ByteSlice, ext_vec::ByteVec,
+ };
impl fmt::Display for BString {
#[inline]
@@ -90,21 +95,21 @@ mod bstring {
#[inline]
fn deref(&self) -> &Vec<u8> {
- &self.bytes
+ self.as_vec()
}
}
impl ops::DerefMut for BString {
#[inline]
fn deref_mut(&mut self) -> &mut Vec<u8> {
- &mut self.bytes
+ self.as_vec_mut()
}
}
impl AsRef<[u8]> for BString {
#[inline]
fn as_ref(&self) -> &[u8] {
- &self.bytes
+ self.as_bytes()
}
}
@@ -118,7 +123,7 @@ mod bstring {
impl AsMut<[u8]> for BString {
#[inline]
fn as_mut(&mut self) -> &mut [u8] {
- &mut self.bytes
+ self.as_bytes_mut()
}
}
@@ -161,14 +166,14 @@ mod bstring {
impl From<Vec<u8>> for BString {
#[inline]
fn from(s: Vec<u8>) -> BString {
- BString { bytes: s }
+ BString::new(s)
}
}
impl From<BString> for Vec<u8> {
#[inline]
fn from(s: BString) -> Vec<u8> {
- s.bytes
+ s.into_vec()
}
}
@@ -200,6 +205,24 @@ mod bstring {
}
}
+ impl TryFrom<BString> for String {
+ type Error = crate::FromUtf8Error;
+
+ #[inline]
+ fn try_from(s: BString) -> Result<String, crate::FromUtf8Error> {
+ s.into_vec().into_string()
+ }
+ }
+
+ impl<'a> TryFrom<&'a BString> for &'a str {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BString) -> Result<&'a str, crate::Utf8Error> {
+ s.as_bytes().to_str()
+ }
+ }
+
impl FromIterator<char> for BString {
#[inline]
fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> BString {
@@ -279,7 +302,7 @@ mod bstring {
impl PartialOrd for BString {
#[inline]
fn partial_cmp(&self, other: &BString) -> Option<Ordering> {
- PartialOrd::partial_cmp(&self.bytes, &other.bytes)
+ PartialOrd::partial_cmp(self.as_bytes(), other.as_bytes())
}
}
@@ -301,15 +324,12 @@ mod bstring {
}
mod bstr {
- #[cfg(feature = "std")]
- use std::borrow::Cow;
+ use core::{cmp::Ordering, convert::TryFrom, fmt, ops};
- use core::cmp::Ordering;
- use core::fmt;
- use core::ops;
+ #[cfg(feature = "alloc")]
+ use alloc::{borrow::Cow, boxed::Box, string::String, vec::Vec};
- use crate::bstr::BStr;
- use crate::ext_slice::ByteSlice;
+ use crate::{bstr::BStr, ext_slice::ByteSlice};
impl fmt::Display for BStr {
#[inline]
@@ -590,6 +610,13 @@ mod bstr {
}
}
+ impl<'a> From<&'a BStr> for &'a [u8] {
+ #[inline]
+ fn from(s: &'a BStr) -> &'a [u8] {
+ BStr::as_bytes(s)
+ }
+ }
+
impl<'a> From<&'a str> for &'a BStr {
#[inline]
fn from(s: &'a str) -> &'a BStr {
@@ -597,7 +624,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl<'a> From<&'a BStr> for Cow<'a, BStr> {
#[inline]
fn from(s: &'a BStr) -> Cow<'a, BStr> {
@@ -605,7 +632,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl From<Box<[u8]>> for Box<BStr> {
#[inline]
fn from(s: Box<[u8]>) -> Box<BStr> {
@@ -613,7 +640,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl From<Box<BStr>> for Box<[u8]> {
#[inline]
fn from(s: Box<BStr>) -> Box<[u8]> {
@@ -621,6 +648,25 @@ mod bstr {
}
}
+ impl<'a> TryFrom<&'a BStr> for &'a str {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BStr) -> Result<&'a str, crate::Utf8Error> {
+ s.as_bytes().to_str()
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<'a> TryFrom<&'a BStr> for String {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BStr) -> Result<String, crate::Utf8Error> {
+ Ok(s.as_bytes().to_str()?.into())
+ }
+ }
+
impl Eq for BStr {}
impl PartialEq<BStr> for BStr {
@@ -635,19 +681,19 @@ mod bstr {
impl_partial_eq!(BStr, str);
impl_partial_eq!(BStr, &'a str);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(&'a BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(&'a BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, BStr>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, str>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, [u8]>);
impl PartialOrd for BStr {
@@ -669,17 +715,17 @@ mod bstr {
impl_partial_ord!(BStr, str);
impl_partial_ord!(BStr, &'a str);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(&'a BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(&'a BStr, String);
}
-#[cfg(feature = "serde1-nostd")]
+#[cfg(feature = "serde")]
mod bstr_serde {
use core::fmt;
@@ -737,10 +783,11 @@ mod bstr_serde {
}
}
-#[cfg(feature = "serde1")]
+#[cfg(all(feature = "serde", feature = "alloc"))]
mod bstring_serde {
- use std::cmp;
- use std::fmt;
+ use core::{cmp, fmt};
+
+ use alloc::{string::String, vec::Vec};
use serde::{
de::Error, de::SeqAccess, de::Visitor, Deserialize, Deserializer,
@@ -825,8 +872,9 @@ mod bstring_serde {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod display {
+ #[cfg(not(miri))]
use crate::bstring::BString;
use crate::ByteSlice;
@@ -926,6 +974,7 @@ mod display {
);
}
+ #[cfg(not(miri))]
quickcheck::quickcheck! {
fn total_length(bstr: BString) -> bool {
let size = bstr.chars().count();
@@ -934,7 +983,7 @@ mod display {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "alloc"))]
mod bstring_arbitrary {
use crate::bstring::BString;
@@ -946,12 +995,13 @@ mod bstring_arbitrary {
}
fn shrink(&self) -> Box<dyn Iterator<Item = BString>> {
- Box::new(self.bytes.shrink().map(BString::from))
+ Box::new(self.as_vec().shrink().map(BString::from))
}
}
}
#[test]
+#[cfg(feature = "std")]
fn test_debug() {
use crate::{ByteSlice, B};
@@ -973,10 +1023,12 @@ fn test_debug() {
// See: https://github.com/BurntSushi/bstr/issues/82
#[test]
+#[cfg(feature = "std")]
fn test_cows_regression() {
- use crate::ByteSlice;
use std::borrow::Cow;
+ use crate::ByteSlice;
+
let c1 = Cow::from(b"hello bstr".as_bstr());
let c2 = b"goodbye bstr".as_bstr();
assert_ne!(c1, c2);
diff --git a/src/io.rs b/src/io.rs
index ad6f3c1..1386bf3 100644
--- a/src/io.rs
+++ b/src/io.rs
@@ -7,10 +7,11 @@ facilities for conveniently and efficiently working with lines as byte strings.
More APIs may be added in the future.
*/
+use alloc::{vec, vec::Vec};
+
use std::io;
-use crate::ext_slice::ByteSlice;
-use crate::ext_vec::ByteVec;
+use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
/// An extention trait for
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
@@ -36,7 +37,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// for result in cursor.byte_lines() {
@@ -79,7 +80,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// for result in cursor.byte_records(b'\x00') {
@@ -122,7 +123,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// cursor.for_byte_line(|line| {
@@ -135,7 +136,7 @@ pub trait BufReadExt: io::BufRead {
/// assert_eq!(lines[2], "dolor".as_bytes());
/// # Ok(()) }; example().unwrap()
/// ```
- fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()>
+ fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
where
Self: Sized,
F: FnMut(&[u8]) -> io::Result<bool>,
@@ -169,7 +170,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// cursor.for_byte_record(b'\x00', |record| {
@@ -183,7 +184,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_record<F>(
- self,
+ &mut self,
terminator: u8,
mut for_each_record: F,
) -> io::Result<()>
@@ -223,7 +224,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// cursor.for_byte_line_with_terminator(|line| {
@@ -237,7 +238,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_line_with_terminator<F>(
- self,
+ &mut self,
for_each_line: F,
) -> io::Result<()>
where
@@ -269,11 +270,10 @@ pub trait BufReadExt: io::BufRead {
/// ```
/// use std::io;
///
- /// use bstr::B;
- /// use bstr::io::BufReadExt;
+ /// use bstr::{io::BufReadExt, B};
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// cursor.for_byte_record_with_terminator(b'\x00', |record| {
@@ -287,7 +287,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_record_with_terminator<F>(
- mut self,
+ &mut self,
terminator: u8,
mut for_each_record: F,
) -> io::Result<()>
@@ -438,11 +438,12 @@ fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
record
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
- use super::BufReadExt;
use crate::bstring::BString;
+ use super::BufReadExt;
+
fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
let mut lines = vec![];
slice
diff --git a/src/lib.rs b/src/lib.rs
index 41142c9..09e17b0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,23 +52,27 @@ Here's another example showing how to do a search and replace (and also showing
use of the `B` function):
```
+# #[cfg(feature = "alloc")] {
use bstr::{B, ByteSlice};
let old = B("foo ☃☃☃ foo foo quux foo");
let new = old.replace("foo", "hello");
assert_eq!(new, B("hello ☃☃☃ hello hello quux hello"));
+# }
```
And here's an example that shows case conversion, even in the presence of
invalid UTF-8:
```
+# #[cfg(all(feature = "alloc", feature = "unicode"))] {
use bstr::{ByteSlice, ByteVec};
let mut lower = Vec::from("hello β");
lower[0] = b'\xFF';
// lowercase β is uppercased to Β
assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92");
+# }
```
# Convenient debug representation
@@ -98,10 +102,8 @@ method converts any `&[u8]` to a `&BStr`.
# When should I use byte strings?
-This library reflects my hypothesis that UTF-8 by convention is a better trade
-off in some circumstances than guaranteed UTF-8. It's possible, perhaps even
-likely, that this is a niche concern for folks working closely with core text
-primitives.
+This library reflects my belief that UTF-8 by convention is a better trade
+off in some circumstances than guaranteed UTF-8.
The first time this idea hit me was in the implementation of Rust's regex
engine. In particular, very little of the internal implementation cares at all
@@ -134,24 +136,26 @@ incremental way by only parsing chunks at a time, but this is often complex to
do or impractical. For example, many regex engines only accept one contiguous
sequence of bytes at a time with no way to perform incremental matching.
-In summary, conventional UTF-8 byte strings provided by this library are
-definitely useful in some limited circumstances, but how useful they are more
-broadly isn't clear yet.
-
# `bstr` in public APIs
-Since this library is not yet `1.0`, you should not use it in the public API of
-your crates until it hits `1.0` (unless you're OK with with tracking breaking
-releases of `bstr`). It is expected that `bstr 1.0` will be released before
-2022.
+This library is past version `1` and is expected to remain at version `1` for
+the foreseeable future. Therefore, it is encouraged to put types from `bstr`
+(like `BStr` and `BString`) in your public API if that makes sense for your
+crate.
+
+With that said, in general, it should be possible to avoid putting anything
+in this crate into your public APIs. Namely, you should never need to use the
+`ByteSlice` or `ByteVec` traits as bounds on public APIs, since their only
+purpose is to extend the methods on the concrete types `[u8]` and `Vec<u8>`,
+respectively. Similarly, it should not be necessary to put either the `BStr` or
+`BString` types into public APIs. If you want to use them internally, then they
+can be converted to/from `[u8]`/`Vec<u8>` as needed. The conversions are free.
+
+So while it shouldn't ever be 100% necessary to make `bstr` a public
+dependency, there may be cases where it is convenient to do so. This is an
+explicitly supported use case of `bstr`, and as such, major version releases
+should be exceptionally rare.
-In general, it should be possible to avoid putting anything in this crate into
-your public APIs. Namely, you should never need to use the `ByteSlice` or
-`ByteVec` traits as bounds on public APIs, since their only purpose is to
-extend the methods on the concrete types `[u8]` and `Vec<u8>`, respectively.
-Similarly, it should not be necessary to put either the `BStr` or `BString`
-types into public APIs. If you want to use them internally, then they can
-be converted to/from `[u8]`/`Vec<u8>` as needed.
# Differences with standard strings
@@ -318,7 +322,8 @@ they can do:
by accessing their underlying 16-bit integer representation. Unfortunately,
this isn't zero cost (it introduces a second WTF-8 decoding step) and it's
not clear this is a good thing to do, since WTF-8 should ideally remain an
- internal implementation detail.
+ internal implementation detail. This is roughly the approach taken by the
+ [`os_str_bytes`](https://crates.io/crates/os_str_bytes) crate.
2. One could instead declare that they will not handle paths on Windows that
are not valid UTF-16, and return an error when one is encountered.
3. Like (2), but instead of returning an error, lossily decode the file path
@@ -365,19 +370,57 @@ UTF-8, and thus contain latent bugs on Unix where paths with invalid UTF-8 are
not terribly uncommon. If you instead use byte strings, then you're guaranteed
to write correct code for Unix, at the cost of getting a corner case wrong on
Windows.
+
+# Cargo features
+
+This crate comes with a few features that control standard library, serde
+and Unicode support.
+
+* `std` - **Enabled** by default. This provides APIs that require the standard
+ library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
+ the `alloc` feature and any other relevant `std` features for dependencies.
+* `alloc` - **Enabled** by default. This provides APIs that require allocations
+ via the `alloc` crate, such as `Vec<u8>`.
+* `unicode` - **Enabled** by default. This provides APIs that require sizable
+ Unicode data compiled into the binary. This includes, but is not limited to,
+ grapheme/word/sentence segmenters. When this is disabled, basic support such
+ as UTF-8 decoding is still included. Note that currently, enabling this
+ feature also requires enabling the `std` feature. It is expected that this
+ limitation will be lifted at some point.
+* `serde` - Enables implementations of serde traits for `BStr`, and also
+ `BString` when `alloc` is enabled.
*/
-#![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(not(any(feature = "std", test)), no_std)]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+
+// Why do we do this? Well, in order for us to use once_cell's 'Lazy' type to
+// load DFAs, it requires enabling its 'std' feature. Yet, there is really
+// nothing about our 'unicode' feature that requires 'std'. We could declare
+// that 'unicode = [std, ...]', which would be fine, but once regex-automata
+// 0.3 is a thing, I believe we can drop once_cell altogether and thus drop
+// the need for 'std' to be enabled when 'unicode' is enabled. But if we make
+// 'unicode' also enable 'std', then it would be a breaking change to remove
+// 'std' from that list.
+//
+// So, for right now, we force folks to explicitly say they want 'std' if they
+// want 'unicode'. In the future, we should be able to relax this.
+#[cfg(all(feature = "unicode", not(feature = "std")))]
+compile_error!("enabling 'unicode' requires enabling 'std'");
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
pub use crate::bstr::BStr;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
pub use crate::bstring::BString;
+#[cfg(feature = "unicode")]
+pub use crate::ext_slice::Fields;
pub use crate::ext_slice::{
- ByteSlice, Bytes, Fields, FieldsWith, Find, FindReverse, Finder,
- FinderReverse, Lines, LinesWithTerminator, Split, SplitN, SplitNReverse,
- SplitReverse, B,
+ ByteSlice, Bytes, FieldsWith, Find, FindReverse, Finder, FinderReverse,
+ Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, SplitReverse, B,
};
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error};
#[cfg(feature = "unicode")]
pub use crate::unicode::{
@@ -391,26 +434,28 @@ pub use crate::utf8::{
mod ascii;
mod bstr;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod bstring;
mod byteset;
mod ext_slice;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod ext_vec;
mod impls;
#[cfg(feature = "std")]
pub mod io;
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests;
#[cfg(feature = "unicode")]
mod unicode;
mod utf8;
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod apitests {
- use crate::bstr::BStr;
- use crate::bstring::BString;
- use crate::ext_slice::{Finder, FinderReverse};
+ use crate::{
+ bstr::BStr,
+ bstring::BString,
+ ext_slice::{Finder, FinderReverse},
+ };
#[test]
fn oibits() {
diff --git a/src/tests.rs b/src/tests.rs
index f4179fd..03a4461 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -6,7 +6,7 @@
///
/// The first element in each tuple is the expected result of lossy decoding,
/// while the second element is the input given.
-pub const LOSSY_TESTS: &[(&str, &[u8])] = &[
+pub(crate) const LOSSY_TESTS: &[(&str, &[u8])] = &[
("a", b"a"),
("\u{FFFD}", b"\xFF"),
("\u{FFFD}\u{FFFD}", b"\xFF\xFF"),
diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt
index fb4fec9..eff2fd3 100644
--- a/src/unicode/data/GraphemeBreakTest.txt
+++ b/src/unicode/data/GraphemeBreakTest.txt
@@ -1,6 +1,6 @@
-# GraphemeBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:12 GMT
-# © 2019 Unicode®, Inc.
+# GraphemeBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:32 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt
index 7c1c34a..61ea42c 100644
--- a/src/unicode/data/SentenceBreakTest.txt
+++ b/src/unicode/data/SentenceBreakTest.txt
@@ -1,6 +1,6 @@
-# SentenceBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:28 GMT
-# © 2019 Unicode®, Inc.
+# SentenceBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt
index facd892..1d1435b 100644
--- a/src/unicode/data/WordBreakTest.txt
+++ b/src/unicode/data/WordBreakTest.txt
@@ -1,6 +1,6 @@
-# WordBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:29 GMT
-# © 2019 Unicode®, Inc.
+# WordBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
index 0efaaf2..31f99c1 100644
--- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
index eb24025..3a51728 100644
--- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
index b53b1d7..dea4a7e 100644
--- a/src/unicode/fsm/grapheme_break_fwd.rs
+++ b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
index d42cd36..742d2a6 100644
--- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
index c75ea5f..d1937f2 100644
--- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
index 93e888c..2d2cd54 100644
--- a/src/unicode/fsm/grapheme_break_rev.rs
+++ b/src/unicode/fsm/grapheme_break_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
index 2bf7e4c..db7a40f 100644
--- a/src/unicode/fsm/regional_indicator_rev.rs
+++ b/src/unicode/fsm/regional_indicator_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator}
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
index a1813d7..1abdae8 100644
--- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
index 2763583..2f8aadd 100644
--- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
index cc937a4..97dd658 100644
--- a/src/unicode/fsm/sentence_break_fwd.rs
+++ b/src/unicode/fsm/sentence_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
index adc64c1..888e465 100644
--- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
index dd48386..a1d527c 100644
--- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
index f1f3da5..32b69b6 100644
--- a/src/unicode/fsm/simple_word_fwd.rs
+++ b/src/unicode/fsm/simple_word_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
index 419b5d4..0780412 100644
--- a/src/unicode/fsm/whitespace_anchored_fwd.rs
+++ b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
index 301b03c..3d0d7a6 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.rs
+++ b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa
index 1e75db6..efb9c81 100644
--- a/src/unicode/fsm/word_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa
index e3093a3..9a716d0 100644
--- a/src/unicode/fsm/word_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
index fb041b7..dcb5f6b 100644
--- a/src/unicode/fsm/word_break_fwd.rs
+++ b/src/unicode/fsm/word_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index ad31cf1..13b730c 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,14 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ grapheme_break_fwd::GRAPHEME_BREAK_FWD,
+ grapheme_break_rev::GRAPHEME_BREAK_REV,
+ regional_indicator_rev::REGIONAL_INDICATOR_REV,
+ },
+ utf8,
+};
/// An iterator over grapheme clusters in a byte string.
///
@@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> {
impl<'a> GraphemeIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
- GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
if bs.is_empty() {
("", 0)
+ } else if bs.len() >= 2
+ && bs[0].is_ascii()
+ && bs[1].is_ascii()
+ && !bs[0].is_ascii_whitespace()
+ {
+ // FIXME: It is somewhat sad that we have to special case this, but it
+ // leads to a significant speed up in predominantly ASCII text. The
+ // issue here is that the DFA has a bit of overhead, and running it for
+ // every byte in mostly ASCII text results in a bit slowdown. We should
+ // re-litigate this once regex-automata 0.3 is out, but it might be
+ // hard to avoid the special case. A DFA is always going to at least
+ // require some memory access.
+
+ // Safe because all ASCII bytes are valid UTF-8.
+ let grapheme = unsafe { bs[..1].to_str_unchecked() };
+ (grapheme, 1)
} else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
// Safe because a match can only occur for valid UTF-8.
let grapheme = unsafe { bs[..end].to_str_unchecked() };
@@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::GraphemeClusterBreakTest;
+ use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
+
use super::*;
- use crate::ext_slice::ByteSlice;
- use crate::tests::LOSSY_TESTS;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -288,6 +310,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn reverse_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -329,15 +352,18 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn uniescape(s: &str) -> String {
s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
}
+ #[cfg(not(miri))]
fn uniescape_vec(strs: &[String]) -> Vec<String> {
strs.iter().map(|s| uniescape(s)).collect()
}
/// Return all of the UCD for grapheme breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
const TESTDATA: &'static str =
include_str!("data/GraphemeBreakTest.txt");
diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs
index 60318f4..80638e8 100644
--- a/src/unicode/mod.rs
+++ b/src/unicode/mod.rs
@@ -1,8 +1,8 @@
-pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes};
-pub use self::sentence::{SentenceIndices, Sentences};
-pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev};
-pub use self::word::{
- WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks,
+pub use self::{
+ grapheme::{decode_grapheme, GraphemeIndices, Graphemes},
+ sentence::{SentenceIndices, Sentences},
+ whitespace::{whitespace_len_fwd, whitespace_len_rev},
+ word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks},
};
mod fsm;
diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs
index 063f342..ff29c7e 100644
--- a/src/unicode/sentence.rs
+++ b/src/unicode/sentence.rs
@@ -1,8 +1,9 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
+};
/// An iterator over sentences in a byte string.
///
@@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> {
impl<'a> SentenceIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
- SentenceIndices { bs: bs, forward_index: 0 }
+ SentenceIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::SentenceBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.sentences.concat();
@@ -198,11 +201,13 @@ mod tests {
bytes.sentences().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for sentence breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<SentenceBreakTest> {
const TESTDATA: &'static str =
include_str!("data/SentenceBreakTest.txt");
diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs
index 949a83f..b5eff30 100644
--- a/src/unicode/whitespace.rs
+++ b/src/unicode/whitespace.rs
@@ -1,7 +1,9 @@
use regex_automata::DFA;
-use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
-use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
+use crate::unicode::fsm::{
+ whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD,
+ whitespace_anchored_rev::WHITESPACE_ANCHORED_REV,
+};
/// Return the first position of a non-whitespace character.
pub fn whitespace_len_fwd(slice: &[u8]) -> usize {
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index e0a5701..849f0c8 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,12 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
+ },
+ utf8,
+};
/// An iterator over words in a byte string.
///
@@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> {
impl<'a> WordsWithBreakIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
- WordsWithBreakIndices { bs: bs, forward_index: 0 }
+ WordsWithBreakIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::WordBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.words.concat();
@@ -379,17 +384,26 @@ mod tests {
assert_eq!(vec!["1XY"], words(b"1XY"));
assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
+
+ // Tests that Vithkuqi works, which was introduced in Unicode 14.
+ // This test fails prior to Unicode 14.
+ assert_eq!(
+ vec!["\u{10570}\u{10597}"],
+ words("\u{10570}\u{10597}".as_bytes())
+ );
}
fn words(bytes: &[u8]) -> Vec<&str> {
bytes.words_with_breaks().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for word breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<WordBreakTest> {
const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
diff --git a/src/utf8.rs b/src/utf8.rs
index 5c7de36..bc9bc52 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -1,13 +1,9 @@
-use core::char;
-use core::cmp;
-use core::fmt;
-use core::str;
+use core::{char, cmp, fmt, str};
+
#[cfg(feature = "std")]
use std::error;
-use crate::ascii;
-use crate::bstr::BStr;
-use crate::ext_slice::ByteSlice;
+use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
// The UTF-8 decoder provided here is based on the one presented here:
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
@@ -75,7 +71,7 @@ const STATES_FORWARD: &'static [u8] = &[
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
-/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
@@ -146,7 +142,7 @@ impl<'a> DoubleEndedIterator for Chars<'a> {
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
-/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
@@ -168,7 +164,7 @@ pub struct CharIndices<'a> {
impl<'a> CharIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
- CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -406,7 +402,7 @@ impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
-#[derive(Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
valid_up_to: usize,
error_len: Option<usize>,
@@ -854,13 +850,15 @@ fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
(b & 0b1100_0000) != 0b1000_0000
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use std::char;
- use crate::ext_slice::{ByteSlice, B};
- use crate::tests::LOSSY_TESTS;
- use crate::utf8::{self, Utf8Error};
+ use crate::{
+ ext_slice::{ByteSlice, B},
+ tests::LOSSY_TESTS,
+ utf8::{self, Utf8Error},
+ };
fn utf8e(valid_up_to: usize) -> Utf8Error {
Utf8Error { valid_up_to, error_len: None }
@@ -871,6 +869,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn validate_all_codepoints() {
for i in 0..(0x10FFFF + 1) {
let cp = match char::from_u32(i) {