aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Elliott Hughes <enh@google.com> 2021-04-02 21:33:14 +0000
committer Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2021-04-02 21:33:14 +0000
commit b6bd22e71f7fc5ace8beb388f0af7dcc82f81798 (patch)
tree 5f360a0d078f211e4938d8bdb073d5aa685ed554
parent 79147025d287492dc9d886e2f2714b8238d49e5c (diff)
parent 58996b6c89bd8d7b1a2d4489979e10c9bb2afcd0 (diff)
download regex-android12L-d2-s2-release.tar.gz
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/regex/+/1663020 Change-Id: Iff37928c656f6da74cf19236958bc5f6f7cf5e95
-rw-r--r-- .cargo_vcs_info.json 2
-rw-r--r-- Android.bp 5
-rw-r--r-- CHANGELOG.md 29
-rw-r--r-- Cargo.toml 14
-rw-r--r-- Cargo.toml.orig 16
-rw-r--r-- METADATA 8
-rw-r--r-- README.md 6
-rw-r--r-- TEST_MAPPING 6
-rw-r--r-- UNICODE.md 34
-rw-r--r-- src/backtrack.rs 4
-rw-r--r-- src/cache.rs 100
-rw-r--r-- src/dfa.rs 20
-rw-r--r-- src/exec.rs 41
-rw-r--r-- src/expand.rs 3
-rw-r--r-- src/lib.rs 18
-rw-r--r-- src/pool.rs 333
-rw-r--r-- src/re_bytes.rs 59
-rw-r--r-- src/re_set.rs 2
-rw-r--r-- src/re_unicode.rs 59
-rw-r--r-- src/sparse.rs 2
-rw-r--r-- tests/consistent.rs 5
-rw-r--r-- tests/crazy.rs 5
-rw-r--r-- tests/macros_bytes.rs 1
-rw-r--r-- tests/macros_str.rs 1
-rw-r--r-- tests/replace.rs 98
-rw-r--r-- tests/test_default.rs 42
26 files changed, 710 insertions, 203 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 99a59fe..33a143b 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,5 @@
{
"git": {
- "sha1": "373d5ca4c594f5018be95cae3ad5b9dfc602945d"
+ "sha1": "ff283badce21dcebd581909d38b81f2c8c9bfb54"
}
}
diff --git a/Android.bp b/Android.bp
index 744661e..99eac66 100644
--- a/Android.bp
+++ b/Android.bp
@@ -40,7 +40,6 @@ license {
rust_library {
name: "libregex",
- // has rustc warnings
host_supported: true,
crate_name: "regex",
srcs: ["src/lib.rs"],
@@ -55,7 +54,6 @@ rust_library {
"perf-inline",
"perf-literal",
"std",
- "thread_local",
"unicode",
"unicode-age",
"unicode-bool",
@@ -69,13 +67,10 @@ rust_library {
"libaho_corasick",
"libmemchr",
"libregex_syntax",
- "libthread_local",
],
}
// dependent_library ["feature_list"]
// aho-corasick-0.7.15 "default,std"
// memchr-2.3.4 "default,std,use_std"
-// once_cell-1.7.2 "alloc,default,race,std"
// regex-syntax-0.6.23 "default,unicode,unicode-age,unicode-bool,unicode-case,unicode-gencat,unicode-perl,unicode-script,unicode-segment"
-// thread_local-1.1.3
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf020d2..f294972 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,32 @@
+1.4.5 (2021-03-14)
+==================
+This is a small patch release that fixes a regression in the size of a `Regex`
+in the 1.4.4 release. Prior to 1.4.4, a `Regex` was 552 bytes. In the 1.4.4
+release, it was 856 bytes due to internal changes. In this release, a `Regex`
+is now 16 bytes. In general, the size of a `Regex` was never something that was
+on my radar, but this increased size in the 1.4.4 release seems to have crossed
+a threshold and resulted in stack overflows in some programs.
+
+* [BUG #750](https://github.com/rust-lang/regex/pull/750):
+ Fixes stack overflows seemingly caused by a large `Regex` size by decreasing
+ its size.
+
+
+1.4.4 (2021-03-11)
+==================
+This is a small patch release that contains some bug fixes. Notably, it also
+drops the `thread_local` (and `lazy_static`, via transitivity) dependencies.
+
+Bug fixes:
+
+* [BUG #362](https://github.com/rust-lang/regex/pull/362):
+ Memory leaks caused by an internal caching strategy should now be fixed.
+* [BUG #576](https://github.com/rust-lang/regex/pull/576):
+ All regex types now implement `UnwindSafe` and `RefUnwindSafe`.
+* [BUG #728](https://github.com/rust-lang/regex/pull/749):
+ Add missing `Replacer` impls for `Vec<u8>`, `String`, `Cow`, etc.
+
+
1.4.3 (2021-01-08)
==================
This is a small patch release that adds some missing standard trait
diff --git a/Cargo.toml b/Cargo.toml
index e43a1ba..6f8ef4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@
[package]
name = "regex"
-version = "1.4.3"
+version = "1.4.5"
authors = ["The Rust Project Developers"]
exclude = ["/scripts/*", "/.github/*"]
autotests = false
@@ -82,25 +82,23 @@ optional = true
[dependencies.regex-syntax]
version = "0.6.22"
default-features = false
-
-[dependencies.thread_local]
-version = "1"
-optional = true
[dev-dependencies.lazy_static]
version = "1"
[dev-dependencies.quickcheck]
-version = "0.8"
+version = "1.0.3"
default-features = false
[dev-dependencies.rand]
-version = "0.6.5"
+version = "0.8.3"
+features = ["getrandom", "small_rng"]
+default-features = false
[features]
default = ["std", "perf", "unicode", "regex-syntax/default"]
pattern = []
perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
-perf-cache = ["thread_local"]
+perf-cache = []
perf-dfa = []
perf-inline = []
perf-literal = ["aho-corasick", "memchr"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 5e1545f..4b9ca7f 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "regex"
-version = "1.4.3" #:version
+version = "1.4.5" #:version
authors = ["The Rust Project Developers"]
license = "MIT OR Apache-2.0"
readme = "README.md"
@@ -51,7 +51,10 @@ use_std = ["std"]
# Enables all performance features.
perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
# Enables fast caching. (If disabled, caching is still used, but is slower.)
-perf-cache = ["thread_local"]
+# Currently, this feature has no effect. Disabling it used to remove the
+# thread_local dependency and use a slower internal cache, but now the default
+# cache has been improved and thread_local is no longer a dependency at all.
+perf-cache = []
# Enables use of a lazy DFA when possible.
perf-dfa = []
# Enables aggressive use of inlining.
@@ -110,11 +113,6 @@ optional = true
version = "2.2.1"
optional = true
-# For managing regex caches quickly across multiple threads.
-[dependencies.thread_local]
-version = "1"
-optional = true
-
# For parsing regular expressions.
[dependencies.regex-syntax]
path = "regex-syntax"
@@ -125,9 +123,9 @@ default-features = false
# For examples.
lazy_static = "1"
# For property based tests.
-quickcheck = { version = "0.8", default-features = false }
+quickcheck = { version = "1.0.3", default-features = false }
# For generating random test data.
-rand = "0.6.5"
+rand = { version = "0.8.3", default-features = false, features = ["getrandom", "small_rng"] }
# To check README's example
# TODO: Re-enable this once the MSRV is 1.43 or greater.
# See: https://github.com/rust-lang/regex/issues/684
diff --git a/METADATA b/METADATA
index 1e7c0d3..52d2bfe 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/regex/regex-1.4.3.crate"
+ value: "https://static.crates.io/crates/regex/regex-1.4.5.crate"
}
- version: "1.4.3"
+ version: "1.4.5"
license_type: NOTICE
last_upgrade_date {
year: 2021
- month: 1
- day: 8
+ month: 4
+ day: 1
}
}
diff --git a/README.md b/README.md
index 8c05a90..f7a2554 100644
--- a/README.md
+++ b/README.md
@@ -245,12 +245,12 @@ supported version of Rust.
This project is licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
- http://www.apache.org/licenses/LICENSE-2.0)
+ https://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or
- http://opensource.org/licenses/MIT)
+ https://opensource.org/licenses/MIT)
at your option.
The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode
License Agreement
-([LICENSE-UNICODE](http://www.unicode.org/copyright.html#License)).
+([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)).
diff --git a/TEST_MAPPING b/TEST_MAPPING
index 3ad2d9a..ee45373 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -2,13 +2,13 @@
{
"presubmit": [
{
- "name": "vpnprofilestore_test"
+ "name": "keystore2_test"
},
{
- "name": "libsqlite3-sys_device_test_src_lib"
+ "name": "vpnprofilestore_test"
},
{
- "name": "keystore2_test"
+ "name": "libsqlite3-sys_device_test_src_lib"
}
]
}
diff --git a/UNICODE.md b/UNICODE.md
index 18fa9b1..df7d21e 100644
--- a/UNICODE.md
+++ b/UNICODE.md
@@ -1,7 +1,7 @@
# Unicode conformance
This document describes the regex crate's conformance to Unicode's
-[UTS#18](http://unicode.org/reports/tr18/)
+[UTS#18](https://unicode.org/reports/tr18/)
report, which lays out 3 levels of support: Basic, Extended and Tailored.
Full support for Level 1 ("Basic Unicode Support") is provided with two
@@ -10,7 +10,7 @@ exceptions:
1. Line boundaries are not Unicode aware. Namely, only the `\n`
(`END OF LINE`) character is recognized as a line boundary.
2. The compatibility properties specified by
- [RL1.2a](http://unicode.org/reports/tr18/#RL1.2a)
+ [RL1.2a](https://unicode.org/reports/tr18/#RL1.2a)
are ASCII-only definitions.
Little to no support is provided for either Level 2 or Level 3. For the most
@@ -61,18 +61,18 @@ provide a convenient way to construct character classes of groups of code
points specified by Unicode. The regex crate does not provide exhaustive
support, but covers a useful subset. In particular:
-* [General categories](http://unicode.org/reports/tr18/#General_Category_Property)
-* [Scripts and Script Extensions](http://unicode.org/reports/tr18/#Script_Property)
-* [Age](http://unicode.org/reports/tr18/#Age)
+* [General categories](https://unicode.org/reports/tr18/#General_Category_Property)
+* [Scripts and Script Extensions](https://unicode.org/reports/tr18/#Script_Property)
+* [Age](https://unicode.org/reports/tr18/#Age)
* A smattering of boolean properties, including all of those specified by
- [RL1.2](http://unicode.org/reports/tr18/#RL1.2) explicitly.
+ [RL1.2](https://unicode.org/reports/tr18/#RL1.2) explicitly.
In all cases, property name and value abbreviations are supported, and all
names/values are matched loosely without regard for case, whitespace or
underscores. Property name aliases can be found in Unicode's
-[`PropertyAliases.txt`](http://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt)
+[`PropertyAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt)
file, while property value aliases can be found in Unicode's
-[`PropertyValueAliases.txt`](http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
+[`PropertyValueAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
file.
The syntax supported is also consistent with the UTS#18 recommendation:
@@ -149,10 +149,10 @@ properties correspond to properties required by RL1.2):
## RL1.2a Compatibility Properties
-[UTS#18 RL1.2a](http://unicode.org/reports/tr18/#RL1.2a)
+[UTS#18 RL1.2a](https://unicode.org/reports/tr18/#RL1.2a)
The regex crate only provides ASCII definitions of the
-[compatibility properties documented in UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties)
+[compatibility properties documented in UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties)
(sans the `\X` class, for matching grapheme clusters, which isn't provided
at all). This is because it seems to be consistent with most other regular
expression engines, and in particular, because these are often referred to as
@@ -165,7 +165,7 @@ Their traditional ASCII definition can be used by disabling Unicode. That is,
## RL1.3 Subtraction and Intersection
-[UTS#18 RL1.3](http://unicode.org/reports/tr18/#Subtraction_and_Intersection)
+[UTS#18 RL1.3](https://unicode.org/reports/tr18/#Subtraction_and_Intersection)
The regex crate provides full support for nested character classes, along with
union, intersection (`&&`), difference (`--`) and symmetric difference (`~~`)
@@ -178,7 +178,7 @@ For example, to match all non-ASCII letters, you could use either
## RL1.4 Simple Word Boundaries
-[UTS#18 RL1.4](http://unicode.org/reports/tr18/#Simple_Word_Boundaries)
+[UTS#18 RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries)
The regex crate provides basic Unicode aware word boundary assertions. A word
boundary assertion can be written as `\b`, or `\B` as its negation. A word
@@ -196,9 +196,9 @@ the following classes:
* `\p{gc:Connector_Punctuation}`
In particular, this differs slightly from the
-[prescription given in RL1.4](http://unicode.org/reports/tr18/#Simple_Word_Boundaries)
+[prescription given in RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries)
but is permissible according to
-[UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties).
+[UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
Namely, it is convenient and simpler to have `\w` and `\b` be in sync with
one another.
@@ -211,7 +211,7 @@ boundaries is currently sub-optimal on non-ASCII text.
## RL1.5 Simple Loose Matches
-[UTS#18 RL1.5](http://unicode.org/reports/tr18/#Simple_Loose_Matches)
+[UTS#18 RL1.5](https://unicode.org/reports/tr18/#Simple_Loose_Matches)
The regex crate provides full support for case insensitive matching in
accordance with RL1.5. That is, it uses the "simple" case folding mapping. The
@@ -226,7 +226,7 @@ then all characters classes are case folded as well.
## RL1.6 Line Boundaries
-[UTS#18 RL1.6](http://unicode.org/reports/tr18/#Line_Boundaries)
+[UTS#18 RL1.6](https://unicode.org/reports/tr18/#Line_Boundaries)
The regex crate only provides support for recognizing the `\n` (`END OF LINE`)
character as a line boundary. This choice was made mostly for implementation
@@ -239,7 +239,7 @@ well, and in theory, this could be done efficiently.
## RL1.7 Code Points
-[UTS#18 RL1.7](http://unicode.org/reports/tr18/#Supplementary_Characters)
+[UTS#18 RL1.7](https://unicode.org/reports/tr18/#Supplementary_Characters)
The regex crate provides full support for Unicode code point matching. Namely,
the fundamental atom of any match is always a single code point.
diff --git a/src/backtrack.rs b/src/backtrack.rs
index 2eaeb72..6100c17 100644
--- a/src/backtrack.rs
+++ b/src/backtrack.rs
@@ -115,8 +115,8 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
// Then we reset all existing allocated space to 0.
// Finally, we request more space if we need it.
//
- // This is all a little circuitous, but doing this unsafely
- // doesn't seem to have a measurable impact on performance.
+ // This is all a little circuitous, but doing this using unchecked
+ // operations doesn't seem to have a measurable impact on performance.
// (Probably because backtracking is limited to such small
// inputs/regexes in the first place.)
let visited_len =
diff --git a/src/cache.rs b/src/cache.rs
deleted file mode 100644
index dbb7e64..0000000
--- a/src/cache.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-// This module defines a common API for caching internal runtime state.
-// The `thread_local` crate provides an extremely optimized version of this.
-// However, if the perf-cache feature is disabled, then we drop the
-// thread_local dependency and instead use a pretty naive caching mechanism
-// with a mutex.
-//
-// Strictly speaking, the CachedGuard isn't necessary for the much more
-// flexible thread_local API, but implementing thread_local's API doesn't
-// seem possible in purely safe code.
-
-pub use self::imp::{Cached, CachedGuard};
-
-#[cfg(feature = "perf-cache")]
-mod imp {
- use thread_local::CachedThreadLocal;
-
- #[derive(Debug)]
- pub struct Cached<T: Send>(CachedThreadLocal<T>);
-
- #[derive(Debug)]
- pub struct CachedGuard<'a, T: 'a>(&'a T);
-
- impl<T: Send> Cached<T> {
- pub fn new() -> Cached<T> {
- Cached(CachedThreadLocal::new())
- }
-
- pub fn get_or(&self, create: impl FnOnce() -> T) -> CachedGuard<T> {
- CachedGuard(self.0.get_or(|| create()))
- }
- }
-
- impl<'a, T: Send> CachedGuard<'a, T> {
- pub fn value(&self) -> &T {
- self.0
- }
- }
-}
-
-#[cfg(not(feature = "perf-cache"))]
-mod imp {
- use std::marker::PhantomData;
- use std::panic::UnwindSafe;
- use std::sync::Mutex;
-
- #[derive(Debug)]
- pub struct Cached<T: Send> {
- stack: Mutex<Vec<T>>,
- /// When perf-cache is enabled, the thread_local crate is used, and
- /// its CachedThreadLocal impls Send, Sync and UnwindSafe, but NOT
- /// RefUnwindSafe. However, a Mutex impls RefUnwindSafe. So in order
- /// to keep the APIs consistent regardless of whether perf-cache is
- /// enabled, we force this type to NOT impl RefUnwindSafe too.
- ///
- /// Ideally, we should always impl RefUnwindSafe, but it seems a little
- /// tricky to do that right now.
- ///
- /// See also: https://github.com/rust-lang/regex/issues/576
- _phantom: PhantomData<Box<dyn Send + Sync + UnwindSafe>>,
- }
-
- #[derive(Debug)]
- pub struct CachedGuard<'a, T: 'a + Send> {
- cache: &'a Cached<T>,
- value: Option<T>,
- }
-
- impl<T: Send> Cached<T> {
- pub fn new() -> Cached<T> {
- Cached { stack: Mutex::new(vec![]), _phantom: PhantomData }
- }
-
- pub fn get_or(&self, create: impl FnOnce() -> T) -> CachedGuard<T> {
- let mut stack = self.stack.lock().unwrap();
- match stack.pop() {
- None => CachedGuard { cache: self, value: Some(create()) },
- Some(value) => CachedGuard { cache: self, value: Some(value) },
- }
- }
-
- fn put(&self, value: T) {
- let mut stack = self.stack.lock().unwrap();
- stack.push(value);
- }
- }
-
- impl<'a, T: Send> CachedGuard<'a, T> {
- pub fn value(&self) -> &T {
- self.value.as_ref().unwrap()
- }
- }
-
- impl<'a, T: Send> Drop for CachedGuard<'a, T> {
- fn drop(&mut self) {
- if let Some(value) = self.value.take() {
- self.cache.put(value);
- }
- }
- }
-}
diff --git a/src/dfa.rs b/src/dfa.rs
index 2a365ee..9ac0c2c 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -848,7 +848,7 @@ impl<'a> Fsm<'a> {
/// next_si transitions to the next state, where the transition input
/// corresponds to text[i].
///
- /// This elides bounds checks, and is therefore unsafe.
+ /// This elides bounds checks, and is therefore not safe.
#[cfg_attr(feature = "perf-inline", inline(always))]
unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
// What is the argument for safety here?
@@ -1688,7 +1688,7 @@ impl Transitions {
self.num_byte_classes * mem::size_of::<StatePtr>()
}
- /// Like `next`, but uses unchecked access and is therefore unsafe.
+ /// Like `next`, but uses unchecked access and is therefore not safe.
unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
debug_assert!((si as usize) < self.table.len());
debug_assert!(cls < self.num_byte_classes);
@@ -1895,12 +1895,22 @@ mod tests {
push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
State, StateFlags,
};
- use quickcheck::{quickcheck, QuickCheck, StdGen};
+ use quickcheck::{quickcheck, Gen, QuickCheck};
use std::sync::Arc;
#[test]
fn prop_state_encode_decode() {
- fn p(ips: Vec<u32>, flags: u8) -> bool {
+ fn p(mut ips: Vec<u32>, flags: u8) -> bool {
+ // It looks like our encoding scheme can't handle instruction
+ // pointers at or above 2**31. We should fix that, but it seems
+ // unlikely to occur in real code due to the amount of memory
+ // required for such a state machine. So for now, we just clamp
+ // our test data.
+ for ip in &mut ips {
+ if *ip >= 1 << 31 {
+ *ip = (1 << 31) - 1;
+ }
+ }
let mut data = vec![flags];
let mut prev = 0;
for &ip in ips.iter() {
@@ -1914,7 +1924,7 @@ mod tests {
expected == got && state.flags() == StateFlags(flags)
}
QuickCheck::new()
- .gen(StdGen::new(self::rand::thread_rng(), 10_000))
+ .gen(Gen::new(10_000))
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
}
diff --git a/src/exec.rs b/src/exec.rs
index e1aae87..3d5a52b 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -1,5 +1,6 @@
use std::cell::RefCell;
use std::collections::HashMap;
+use std::panic::AssertUnwindSafe;
use std::sync::Arc;
#[cfg(feature = "perf-literal")]
@@ -9,7 +10,6 @@ use syntax::hir::Hir;
use syntax::ParserBuilder;
use backtrack;
-use cache::{Cached, CachedGuard};
use compile::Compiler;
#[cfg(feature = "perf-dfa")]
use dfa;
@@ -17,6 +17,7 @@ use error::Error;
use input::{ByteInput, CharInput};
use literal::LiteralSearcher;
use pikevm;
+use pool::{Pool, PoolGuard};
use prog::Program;
use re_builder::RegexOptions;
use re_bytes;
@@ -34,8 +35,15 @@ use utf8::next_utf8;
pub struct Exec {
/// All read only state.
ro: Arc<ExecReadOnly>,
- /// Caches for the various matching engines.
- cache: Cached<ProgramCache>,
+ /// A pool of reusable values for the various matching engines.
+ ///
+ /// Note that boxing this value is not strictly necessary, but it is an
+ /// easy way to ensure that T does not bloat the stack size used by a pool
+ /// in the case where T is big. And this turns out to be the case at the
+ /// time of writing for regex's use of this pool. At the time of writing,
+ /// the size of a Regex on the stack is 856 bytes. Boxing this value
+ /// reduces that size to 16 bytes.
+ pool: Box<Pool<ProgramCache>>,
}
/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
@@ -46,7 +54,7 @@ pub struct ExecNoSync<'c> {
/// All read only state.
ro: &'c Arc<ExecReadOnly>,
/// Caches for the various matching engines.
- cache: CachedGuard<'c, ProgramCache>,
+ cache: PoolGuard<'c, ProgramCache>,
}
/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
@@ -302,7 +310,8 @@ impl ExecBuilder {
ac: None,
match_type: MatchType::Nothing,
});
- return Ok(Exec { ro: ro, cache: Cached::new() });
+ let pool = ExecReadOnly::new_pool(&ro);
+ return Ok(Exec { ro: ro, pool });
}
let parsed = self.parse()?;
let mut nfa = Compiler::new()
@@ -342,7 +351,8 @@ impl ExecBuilder {
ro.match_type = ro.choose_match_type(self.match_type);
let ro = Arc::new(ro);
- Ok(Exec { ro: ro, cache: Cached::new() })
+ let pool = ExecReadOnly::new_pool(&ro);
+ Ok(Exec { ro, pool })
}
#[cfg(feature = "perf-literal")]
@@ -1254,10 +1264,9 @@ impl Exec {
/// Get a searcher that isn't Sync.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn searcher(&self) -> ExecNoSync {
- let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
ExecNoSync {
ro: &self.ro, // a clone is too expensive here! (and not needed)
- cache: self.cache.get_or(create),
+ cache: self.pool.get(),
}
}
@@ -1309,7 +1318,8 @@ impl Exec {
impl Clone for Exec {
fn clone(&self) -> Exec {
- Exec { ro: self.ro.clone(), cache: Cached::new() }
+ let pool = ExecReadOnly::new_pool(&self.ro);
+ Exec { ro: self.ro.clone(), pool }
}
}
@@ -1442,6 +1452,13 @@ impl ExecReadOnly {
let lcs_len = self.suffixes.lcs().char_len();
lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
}
+
+ fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
+ let ro = ro.clone();
+ Box::new(Pool::new(Box::new(move || {
+ AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
+ })))
+ }
}
#[derive(Clone, Copy, Debug)]
@@ -1500,7 +1517,11 @@ enum MatchNfaType {
/// `ProgramCache` maintains reusable allocations for each matching engine
/// available to a particular program.
-pub type ProgramCache = RefCell<ProgramCacheInner>;
+///
+/// We declare this as unwind safe since it's a cache that's only used for
+/// performance purposes. If a panic occurs, it is (or should be) always safe
+/// to continue using the same regex object.
+pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
#[derive(Debug)]
pub struct ProgramCacheInner {
diff --git a/src/expand.rs b/src/expand.rs
index fd2ab03..70dbf91 100644
--- a/src/expand.rs
+++ b/src/expand.rs
@@ -144,7 +144,8 @@ fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
}
// We just verified that the range 0..cap_end is valid ASCII, so it must
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
- // check with either unsafe or by parsing the number straight from &[u8].
+ // check via an unchecked conversion or by parsing the number straight from
+ // &[u8].
let cap =
str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
Some(CaptureRef {
diff --git a/src/lib.rs b/src/lib.rs
index d3dc58d..357ac0d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -253,7 +253,7 @@ assert_eq!((mat.start(), mat.end()), (3, 23));
```
For a more detailed breakdown of Unicode support with respect to
-[UTS#18](http://unicode.org/reports/tr18/),
+[UTS#18](https://unicode.org/reports/tr18/),
please see the
[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md)
document in the root of the regex repository.
@@ -455,7 +455,7 @@ assert_eq!(&cap[0], "abc");
## Perl character classes (Unicode friendly)
These classes are based on the definitions provided in
-[UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
+[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
<pre class="rust">
\d digit (\p{Nd})
@@ -523,11 +523,6 @@ All features below are enabled by default.
Enables all performance related features. This feature is enabled by default
and will always cover all features that improve performance, even if more
are added in the future.
-* **perf-cache** -
- Enables the use of very fast thread safe caching for internal match state.
- When this is disabled, caching is still used, but with a slower and simpler
- implementation. Disabling this drops the `thread_local` and `lazy_static`
- dependencies.
* **perf-dfa** -
Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
portions of a regex to a very fast DFA on an as-needed basis. This can
@@ -542,6 +537,11 @@ All features below are enabled by default.
Enables the use of literal optimizations for speeding up matches. In some
cases, literal optimizations can result in speedups of _several_ orders of
magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies.
+* **perf-cache** -
+ This feature used to enable a faster internal cache at the cost of using
+ additional dependencies, but this is no longer an option. A fast internal
+ cache is now used unconditionally with no additional dependencies. This may
+ change in the future.
### Unicode features
@@ -631,8 +631,6 @@ extern crate memchr;
#[cfg_attr(feature = "perf-literal", macro_use)]
extern crate quickcheck;
extern crate regex_syntax as syntax;
-#[cfg(feature = "perf-cache")]
-extern crate thread_local;
// #[cfg(doctest)]
// doc_comment::doctest!("../README.md");
@@ -749,7 +747,6 @@ pub mod bytes {
}
mod backtrack;
-mod cache;
mod compile;
#[cfg(feature = "perf-dfa")]
mod dfa;
@@ -764,6 +761,7 @@ mod literal;
#[cfg(feature = "pattern")]
mod pattern;
mod pikevm;
+mod pool;
mod prog;
mod re_builder;
mod re_bytes;
diff --git a/src/pool.rs b/src/pool.rs
new file mode 100644
index 0000000..a506ee9
--- /dev/null
+++ b/src/pool.rs
@@ -0,0 +1,333 @@
+// This module provides a relatively simple thread-safe pool of reusable
+// objects. For the most part, it's implemented by a stack represented by a
+// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat
+// costly, in the case where a pool is accessed by the first thread that tried
+// to get a value, we bypass the mutex. Here are some benchmarks showing the
+// difference.
+//
+// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
+// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
+// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
+// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
+//
+// (1) represents our baseline: the master branch at the time of writing when
+// using the 'thread_local' crate to implement the pool below.
+//
+// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
+// is no special trick for bypassing the mutex.
+//
+// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
+// fast because a Box<T> is much smaller than the T we use with a Pool in this
+// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
+// than for T.
+//
+// (4) is the same as (3), but with the trick for bypassing the mutex in the
+// case of the first-to-get thread.
+//
+// Why move off of thread_local? Even though (4) is a hair faster than (1)
+// above, this was not the main goal. The main goal was to move off of
+// thread_local and find a way to *simply* re-capture some of its speed for
+// regex's specific case. So again, why move off of it? The *primary* reason is
+// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
+// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
+// "use as much safe code as possible to minimize risk and be as sure as I can
+// be that it is correct.")
+//
+// My guess is that the thread_local design is probably not appropriate for
+// regex since its memory usage scales to the number of active threads that
+// have used a regex, whereas the pool below scales to the number of threads
+// that simultaneously use a regex. While neither case permits contraction,
+// since we own the pool data structure below, we can add contraction if a
+// clear use case pops up in the wild. More pressingly though, it seems that
+// there are at least some use case patterns where one might have many threads
+// sitting around that might have used a regex at one point. While thread_local
+// does try to reuse space previously used by a thread that has since stopped,
+// its maximal memory usage still scales with the total number of active
+// threads. In contrast, the pool below scales with the total number of threads
+// *simultaneously* using the pool. The hope is that this uses less memory
+// overall. And if it doesn't, we can hopefully tune it somehow.
+//
+// It seems that these sort of conditions happen frequently
+// in FFI inside of other more "managed" languages. This was
+// mentioned in the issue linked above, and also mentioned here:
+// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
+// confirm that disabling the use of thread_local resolves the leak.
+//
+// There were other weaker reasons for moving off of thread_local as well.
+// Namely, at the time, I was looking to reduce dependencies. And for something
+// like regex, maintenance can be simpler when we own the full dependency tree.
+
+use std::panic::{RefUnwindSafe, UnwindSafe};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Mutex;
+
+/// An atomic counter used to allocate thread IDs.
+static COUNTER: AtomicUsize = AtomicUsize::new(1);
+
+thread_local!(
+ /// A thread local used to assign an ID to a thread.
+ static THREAD_ID: usize = {
+ let next = COUNTER.fetch_add(1, Ordering::Relaxed);
+ // SAFETY: We cannot permit the reuse of thread IDs since reusing a
+ // thread ID might result in more than one thread "owning" a pool,
+ // and thus, permit accessing a mutable value from multiple threads
+ // simultaneously without synchronization. The intent of this panic is
+ // to be a sanity check. It is not expected that the thread ID space
+ // will actually be exhausted in practice.
+ //
+ // This checks that the counter never wraps around, since atomic
+ // addition wraps around on overflow.
+ if next == 0 {
+ panic!("regex: thread ID allocation space exhausted");
+ }
+ next
+ };
+);
+
+/// The type of the function used to create values in a pool when the pool is
+/// empty and the caller requests one.
+type CreateFn<T> =
+ Box<dyn Fn() -> T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
+
+/// A simple thread safe pool for reusing values.
+///
+/// Getting a value out comes with a guard. When that guard is dropped, the
+/// value is automatically put back in the pool.
+///
+/// A Pool<T> impls Sync when T is Send (even if it's not Sync). This means
+/// that T can use interior mutability. This is possible because a pool is
+/// guaranteed to provide a value to exactly one thread at any time.
+///
+/// Currently, a pool never contracts in size. Its size is proportional to the
+/// number of simultaneous uses.
+pub struct Pool<T> {
+ /// A stack of T values to hand out. These are used when a Pool is
+ /// accessed by a thread that is not its owner.
+ stack: Mutex<Vec<Box<T>>>,
+ /// A function to create more T values when stack is empty and a caller
+ /// has requested a T.
+ create: CreateFn<T>,
+ /// The ID of the thread that owns this pool. The owner is the thread
+ /// that makes the first call to 'get'. When the owner calls 'get', it
+ /// gets 'owner_val' directly instead of returning a T from 'stack'.
+ /// See comments elsewhere for details, but this is intended to be an
+ /// optimization for the common case that makes getting a T faster.
+ ///
+ /// It is initialized to a value of zero (an impossible thread ID) as a
+ /// sentinel to indicate that it is unowned.
+ owner: AtomicUsize,
+ /// A value to return when the caller is the thread that owns the Pool,
+ /// i.e., the first thread that called 'get' on it.
+ owner_val: T,
+}
+
+// SAFETY: Since we want to use a Pool from multiple threads simultaneously
+// behind an Arc, we need for it to be Sync. In cases where T is sync, Pool<T>
+// would be Sync. However, since we use a Pool to store mutable scratch space,
+// we wind up using a T that has interior mutability and is thus itself not
+// Sync. So what *really* want is for our Pool<T> to be Sync even when T is
+// not Sync (but is at least Send).
+//
+// The only non-sync aspect of a Pool is its 'owner_val' field, which is used
+// to implement faster access to a pool value in the common case of a pool
+// being accessed by the first thread to call 'get'. The 'stack' field
+// is also shared, but a Mutex<T> where T: Send is already Sync. So we only
+// need to worry about 'owner_val'.
+//
+// The key is to guarantee that 'owner_val' can only ever be accessed from one
+// thread. In our implementation below, we guarantee this by only returning the
+// 'owner_val' when the ID of the current thread matches the ID of the thread
+// that first called 'get'. Since this can only ever be one thread, it follows
+// that only one thread can access 'owner_val' at any point in time. Thus, it
+// is safe to declare that Pool<T> is Sync when T is Send.
+//
+// NOTE: The owning thread is the *first* thread that tries to get a value out
+// of a Pool, which is not necessarily the thread that created it. The owner is
+// recorded with an atomic compare-and-swap on 'owner', so at most one thread
+// can ever claim ownership.
+//
+// If there is a way to achieve our performance goals using safe code, then
+// I would very much welcome a patch. As it stands, the implementation below
+// tries to balance safety with performance. The case where a Regex is used
+// from multiple threads simultaneously will suffer a bit since getting a cache
+// will require unlocking a mutex.
+unsafe impl<T: Send> Sync for Pool<T> {}
+
+impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> {
+ fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
+ f.debug_struct("Pool")
+ .field("stack", &self.stack)
+ .field("owner", &self.owner)
+ .field("owner_val", &self.owner_val)
+ .finish()
+ }
+}
+
+/// A guard that is returned when a caller requests a value from the pool.
+///
+/// The purpose of the guard is to use RAII to automatically put the value back
+/// in the pool once it's dropped.
+#[derive(Debug)]
+pub struct PoolGuard<'a, T: 'a + Send> {
+ /// The pool that this guard is attached to.
+ pool: &'a Pool<T>,
+ /// This is None when the guard represents the special "owned" value. In
+ /// which case, the value is retrieved from 'pool.owner_val'.
+ value: Option<Box<T>>,
+}
+
+impl<T: Send> Pool<T> {
+ /// Create a new pool. The given closure is used to create values in the
+ /// pool when necessary.
+ pub fn new(create: CreateFn<T>) -> Pool<T> {
+ let owner = AtomicUsize::new(0);
+ let owner_val = create();
+ Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
+ }
+
+ /// Get a value from the pool. The caller is guaranteed to have exclusive
+ /// access to the given value.
+ ///
+ /// Note that there is no guarantee provided about which value in the
+ /// pool is returned. That is, calling get, dropping the guard (causing
+ /// the value to go back into the pool) and then calling get again is NOT
+ /// guaranteed to return the same value received in the first get call.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn get(&self) -> PoolGuard<T> {
+ // Our fast path checks if the caller is the thread that "owns" this
+ // pool. Or stated differently, whether it is the first thread that
+ // tried to extract a value from the pool. If it is, then we can return
+ // a T to the caller without going through a mutex.
+ //
+ // SAFETY: We must guarantee that only one thread gets access to this
+ // value. Since a thread is uniquely identified by the THREAD_ID thread
+ // local, it follows that if the caller's thread ID is equal to the
+ // owner, then only one thread may receive this value.
+ let caller = THREAD_ID.with(|id| *id);
+ let owner = self.owner.load(Ordering::Relaxed);
+ if caller == owner {
+ return self.guard_owned();
+ }
+ self.get_slow(caller, owner)
+ }
+
+ /// This is the "slow" version that goes through a mutex to pop an
+ /// allocated value off a stack to return to the caller. (Or, if the stack
+ /// is empty, a new value is created.)
+ ///
+ /// If the pool has no owner, then this will set the owner.
+ #[cold]
+ fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<T> {
+ use std::sync::atomic::Ordering::Relaxed;
+
+ if owner == 0 {
+ // The sentinel 0 value means this pool is not yet owned. We
+ // try to atomically set the owner. If we do, then this thread
+ // becomes the owner and we can return a guard that represents
+ // the special T for the owner.
+ let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed);
+ if res.is_ok() {
+ return self.guard_owned();
+ }
+ }
+ let mut stack = self.stack.lock().unwrap();
+ let value = match stack.pop() {
+ None => Box::new((self.create)()),
+ Some(value) => value,
+ };
+ self.guard_stack(value)
+ }
+
+ /// Puts a value back into the pool. Callers don't need to call this. Once
+ /// the guard that's returned by 'get' is dropped, it is put back into the
+ /// pool automatically.
+ fn put(&self, value: Box<T>) {
+ let mut stack = self.stack.lock().unwrap();
+ stack.push(value);
+ }
+
+ /// Create a guard that represents the special owned T.
+ fn guard_owned(&self) -> PoolGuard<'_, T> {
+ PoolGuard { pool: self, value: None }
+ }
+
+ /// Create a guard that contains a value from the pool's stack.
+ fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T> {
+ PoolGuard { pool: self, value: Some(value) }
+ }
+}
+
+impl<'a, T: Send> PoolGuard<'a, T> {
+ /// Return the underlying value.
+ pub fn value(&self) -> &T {
+ match self.value {
+ None => &self.pool.owner_val,
+ Some(ref v) => &**v,
+ }
+ }
+}
+
+impl<'a, T: Send> Drop for PoolGuard<'a, T> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn drop(&mut self) {
+ if let Some(value) = self.value.take() {
+ self.pool.put(value);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ use super::*;
+
+ #[test]
+ fn oibits() {
+ use exec::ProgramCache;
+
+ fn has_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
+ has_oibits::<Pool<ProgramCache>>();
+ }
+
+ // Tests that Pool implements the "single owner" optimization. That is, the
+ // thread that first accesses the pool gets its own copy, while all other
+ // threads get distinct copies.
+ #[test]
+ fn thread_owner_optimization() {
+ use std::cell::RefCell;
+ use std::sync::Arc;
+
+ let pool: Arc<Pool<RefCell<Vec<char>>>> =
+ Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a']))));
+ pool.get().value().borrow_mut().push('x');
+
+ let pool1 = pool.clone();
+ let t1 = std::thread::spawn(move || {
+ let guard = pool1.get();
+ let v = guard.value();
+ v.borrow_mut().push('y');
+ });
+
+ let pool2 = pool.clone();
+ let t2 = std::thread::spawn(move || {
+ let guard = pool2.get();
+ let v = guard.value();
+ v.borrow_mut().push('z');
+ });
+
+ t1.join().unwrap();
+ t2.join().unwrap();
+
+ // If we didn't implement the single owner optimization, then one of
+ // the threads above is likely to have mutated the [a, x] vec that
+ // we stuffed in the pool before spawning the threads. But since
+ // neither thread was first to access the pool, and because of the
+ // optimization, we should be guaranteed that neither thread mutates
+ // the special owned pool value.
+ //
+ // (Technically this is an implementation detail and not a contract of
+ // Pool's API.)
+ assert_eq!(vec!['a', 'x'], *pool.get().value().borrow());
+ }
+}
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index a091436..204a70a 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -1105,9 +1105,9 @@ impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
/// string.
///
/// In general, users of this crate shouldn't need to implement this trait,
-/// since implementations are already provided for `&[u8]` and
-/// `FnMut(&Captures) -> Vec<u8>` (or any `FnMut(&Captures) -> T`
-/// where `T: AsRef<[u8]>`), which covers most use cases.
+/// since implementations are already provided for `&[u8]` along with other
+/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any
+/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases.
pub trait Replacer {
/// Appends text to `dst` to replace the current match.
///
@@ -1176,10 +1176,55 @@ impl<'a> Replacer for &'a [u8] {
}
fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
- match find_byte(b'$', *self) {
- Some(_) => None,
- None => Some(Cow::Borrowed(*self)),
- }
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Vec<u8> {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
+ caps.expand(*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl Replacer for Vec<u8> {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
+ caps.expand(self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for Cow<'a, [u8]> {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
+ caps.expand(self.as_ref(), dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Cow<'a, [u8]> {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
+ caps.expand(self.as_ref(), dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
+ no_expansion(self)
+ }
+}
+
+fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<[u8]>> {
+ let s = t.as_ref();
+ match find_byte(b'$', s) {
+ Some(_) => None,
+ None => Some(Cow::Borrowed(s)),
}
}
diff --git a/src/re_set.rs b/src/re_set.rs
index 0a00229..5cb47ad 100644
--- a/src/re_set.rs
+++ b/src/re_set.rs
@@ -43,7 +43,7 @@ $(#[$doc_regexset_example])*
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
-/// ```ignore
+/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index df87c34..1b478cd 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -1147,9 +1147,9 @@ impl<'r, 't> FusedIterator for Matches<'r, 't> {}
/// Replacer describes types that can be used to replace matches in a string.
///
/// In general, users of this crate shouldn't need to implement this trait,
-/// since implementations are already provided for `&str` and
-/// `FnMut(&Captures) -> String` (or any `FnMut(&Captures) -> T`
-/// where `T: AsRef<str>`), which covers most use cases.
+/// since implementations are already provided for `&str` along with other
+/// variants of string types and `FnMut(&Captures) -> String` (or any
+/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases.
pub trait Replacer {
/// Appends text to `dst` to replace the current match.
///
@@ -1218,10 +1218,55 @@ impl<'a> Replacer for &'a str {
}
fn no_expansion(&mut self) -> Option<Cow<str>> {
- match find_byte(b'$', self.as_bytes()) {
- Some(_) => None,
- None => Some(Cow::Borrowed(*self)),
- }
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a String {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
+ self.as_str().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<str>> {
+ no_expansion(self)
+ }
+}
+
+impl Replacer for String {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
+ self.as_str().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<str>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for Cow<'a, str> {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
+ self.as_ref().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<str>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Cow<'a, str> {
+ fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
+ self.as_ref().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<str>> {
+ no_expansion(self)
+ }
+}
+
+fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<str>> {
+ let s = t.as_ref();
+ match find_byte(b'$', s.as_bytes()) {
+ Some(_) => None,
+ None => Some(Cow::Borrowed(s)),
}
}
diff --git a/src/sparse.rs b/src/sparse.rs
index bc1b2b5..421d6b6 100644
--- a/src/sparse.rs
+++ b/src/sparse.rs
@@ -8,7 +8,7 @@ use std::slice;
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
-/// The data structure is based on: http://research.swtch.com/sparse
+/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse allocations, so the initial allocation cost is bareable. However,
/// its other properties listed above are extremely useful.
diff --git a/tests/consistent.rs b/tests/consistent.rs
index 0f9ea53..722f2a5 100644
--- a/tests/consistent.rs
+++ b/tests/consistent.rs
@@ -157,10 +157,7 @@ macro_rules! checker {
}
impl quickcheck::Testable for RegexEqualityTest {
- fn result<G: quickcheck::Gen>(
- &self,
- gen: &mut G,
- ) -> TestResult {
+ fn result(&self, gen: &mut quickcheck::Gen) -> TestResult {
let input = $mk_input(gen);
let input = &input;
diff --git a/tests/crazy.rs b/tests/crazy.rs
index 56f6cad..293ac1a 100644
--- a/tests/crazy.rs
+++ b/tests/crazy.rs
@@ -137,9 +137,10 @@ matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2));
#[test]
fn dfa_handles_pathological_case() {
fn ones_and_zeroes(count: usize) -> String {
- use rand::{thread_rng, Rng};
+ use rand::rngs::SmallRng;
+ use rand::{Rng, SeedableRng};
- let mut rng = thread_rng();
+ let mut rng = SmallRng::from_entropy();
let mut s = String::new();
for _ in 0..count {
if rng.gen() {
diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs
index 03c370d..3d6c8c3 100644
--- a/tests/macros_bytes.rs
+++ b/tests/macros_bytes.rs
@@ -4,7 +4,6 @@ macro_rules! t { ($re:expr) => { text!($re) } }
macro_rules! match_text { ($text:expr) => { $text.as_bytes() } }
macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } }
macro_rules! empty_vec { () => { <Vec<&[u8]>>::new() } }
-
macro_rules! bytes { ($text:expr) => { $text } }
macro_rules! no_expand {
diff --git a/tests/macros_str.rs b/tests/macros_str.rs
index 9b996b3..7b7eb11 100644
--- a/tests/macros_str.rs
+++ b/tests/macros_str.rs
@@ -4,6 +4,7 @@ macro_rules! t { ($text:expr) => { text!($text) } }
macro_rules! match_text { ($text:expr) => { $text.as_str() } }
macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } }
macro_rules! empty_vec { () => { <Vec<&str>>::new() } }
+macro_rules! bytes { ($text:expr) => { std::str::from_utf8($text.as_ref()).unwrap() } }
macro_rules! no_expand {
($text:expr) => {{
diff --git a/tests/replace.rs b/tests/replace.rs
index c156a39..700aff2 100644
--- a/tests/replace.rs
+++ b/tests/replace.rs
@@ -130,3 +130,101 @@ replace!(
t!("${1}a $1a"),
"ba "
);
+
+replace!(
+ impl_string,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!("Z".to_string()),
+ "age: Z6"
+);
+replace!(
+ impl_string_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(&"Z".to_string()),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_borrowed,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(std::borrow::Cow::<'_, str>::Borrowed("Z")),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_borrowed_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(&std::borrow::Cow::<'_, str>::Borrowed("Z")),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_owned,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_owned_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(&std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
+ "age: Z6"
+);
+
+replace!(
+ impl_vec_u8,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(vec![b'Z']),
+ "age: Z6"
+);
+replace!(
+ impl_vec_u8_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(&vec![b'Z']),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_borrowed,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_borrowed_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(&std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_owned,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_owned_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
+ "age: Z6"
+);
diff --git a/tests/test_default.rs b/tests/test_default.rs
index 241e580..af634a0 100644
--- a/tests/test_default.rs
+++ b/tests/test_default.rs
@@ -83,26 +83,49 @@ fn allow_octal() {
#[test]
fn oibits() {
use regex::bytes;
- use regex::{Regex, RegexBuilder};
- use std::panic::UnwindSafe;
+ use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
+ use std::panic::{RefUnwindSafe, UnwindSafe};
fn assert_send<T: Send>() {}
fn assert_sync<T: Sync>() {}
fn assert_unwind_safe<T: UnwindSafe>() {}
+ fn assert_ref_unwind_safe<T: RefUnwindSafe>() {}
assert_send::<Regex>();
assert_sync::<Regex>();
assert_unwind_safe::<Regex>();
+ assert_ref_unwind_safe::<Regex>();
assert_send::<RegexBuilder>();
assert_sync::<RegexBuilder>();
assert_unwind_safe::<RegexBuilder>();
+ assert_ref_unwind_safe::<RegexBuilder>();
assert_send::<bytes::Regex>();
assert_sync::<bytes::Regex>();
assert_unwind_safe::<bytes::Regex>();
+ assert_ref_unwind_safe::<bytes::Regex>();
assert_send::<bytes::RegexBuilder>();
assert_sync::<bytes::RegexBuilder>();
assert_unwind_safe::<bytes::RegexBuilder>();
+ assert_ref_unwind_safe::<bytes::RegexBuilder>();
+
+ assert_send::<RegexSet>();
+ assert_sync::<RegexSet>();
+ assert_unwind_safe::<RegexSet>();
+ assert_ref_unwind_safe::<RegexSet>();
+ assert_send::<RegexSetBuilder>();
+ assert_sync::<RegexSetBuilder>();
+ assert_unwind_safe::<RegexSetBuilder>();
+ assert_ref_unwind_safe::<RegexSetBuilder>();
+
+ assert_send::<bytes::RegexSet>();
+ assert_sync::<bytes::RegexSet>();
+ assert_unwind_safe::<bytes::RegexSet>();
+ assert_ref_unwind_safe::<bytes::RegexSet>();
+ assert_send::<bytes::RegexSetBuilder>();
+ assert_sync::<bytes::RegexSetBuilder>();
+ assert_unwind_safe::<bytes::RegexSetBuilder>();
+ assert_ref_unwind_safe::<bytes::RegexSetBuilder>();
}
// See: https://github.com/rust-lang/regex/issues/568
@@ -113,3 +136,18 @@ fn oibits_regression() {
let _ = panic::catch_unwind(|| Regex::new("a").unwrap());
}
+
+// See: https://github.com/rust-lang/regex/issues/750
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn regex_is_reasonably_small() {
+ use std::mem::size_of;
+
+ use regex::bytes;
+ use regex::{Regex, RegexSet};
+
+ assert_eq!(16, size_of::<Regex>());
+ assert_eq!(16, size_of::<RegexSet>());
+ assert_eq!(16, size_of::<bytes::Regex>());
+ assert_eq!(16, size_of::<bytes::RegexSet>());
+}