author    Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-07-07 05:08:34 +0000
committer Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-07-07 05:08:34 +0000
commit    040d3c7b6da7ad05ec0b007fec02bf26479c256e (patch)
tree      d050b095d3d08bc7800c98bf5fe9e2ec15a35921
parent    6e92301fee3c847c54ffdab6da1b21c4a7deb1c6 (diff)
parent    14bea0e36ba5a038a98be2269a0450fe90c9d229 (diff)
download  regex-android14-mainline-permission-release.tar.gz
Change-Id: Ide292ec0881a70df1db3d8cab48f3617f3c61953
-rw-r--r--  .cargo_vcs_info.json    |  7
-rw-r--r--  Android.bp              | 24
-rw-r--r--  CHANGELOG.md            | 97
-rw-r--r--  Cargo.toml              | 61
-rw-r--r--  Cargo.toml.orig         |  4
-rw-r--r--  METADATA                | 14
-rw-r--r--  README.md               | 10
-rw-r--r--  TEST_MAPPING            | 31
-rw-r--r--  cargo2android.json      |  3
-rw-r--r--  src/backtrack.rs        | 12
-rw-r--r--  src/compile.rs          | 84
-rw-r--r--  src/dfa.rs              | 40
-rw-r--r--  src/exec.rs             | 18
-rw-r--r--  src/expand.rs           |  8
-rw-r--r--  src/input.rs            |  4
-rw-r--r--  src/lib.rs              |  4
-rw-r--r--  src/literal/imp.rs      |  4
-rw-r--r--  src/pattern.rs          |  2
-rw-r--r--  src/pikevm.rs           |  2
-rw-r--r--  src/prog.rs             |  2
-rw-r--r--  src/re_bytes.rs         | 24
-rw-r--r--  src/re_set.rs           | 44
-rw-r--r--  src/re_trait.rs         | 13
-rw-r--r--  src/re_unicode.rs       | 38
-rw-r--r--  src/utf8.rs             |  2
-rw-r--r--  tests/regression.rs     |  3
-rw-r--r--  tests/replace.rs        | 18
-rw-r--r--  tests/test_default.rs   | 80
-rw-r--r--  tests/unicode.rs        | 17
29 files changed, 490 insertions, 180 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 51b3cd6..a82e282 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
{
"git": {
- "sha1": "f2dc1b788f773a49f1b6633a6302054978344452"
- }
-}
+ "sha1": "9582040009820380a16819ca0d1ae262c7d454b0"
+ },
+ "path_in_vcs": ""
+}
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 4fe3543..5b27d25 100644
--- a/Android.bp
+++ b/Android.bp
@@ -43,7 +43,7 @@ rust_library {
host_supported: true,
crate_name: "regex",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["src/lib.rs"],
edition: "2018",
features: [
@@ -75,6 +75,8 @@ rust_library {
"com.android.compos",
"com.android.virt",
],
+ product_available: true,
+ vendor_available: true,
}
rust_test {
@@ -82,7 +84,7 @@ rust_test {
host_supported: true,
crate_name: "regex",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["src/lib.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -124,7 +126,7 @@ rust_test {
host_supported: true,
crate_name: "backtrack",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_backtrack.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -167,7 +169,7 @@ rust_test {
host_supported: true,
crate_name: "backtrack_bytes",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_backtrack_bytes.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -210,7 +212,7 @@ rust_test {
host_supported: true,
crate_name: "backtrack_utf8bytes",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_backtrack_utf8bytes.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -253,7 +255,7 @@ rust_test {
host_supported: true,
crate_name: "crates_regex",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_crates_regex.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -296,7 +298,7 @@ rust_test {
host_supported: true,
crate_name: "default",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_default.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -339,7 +341,7 @@ rust_test {
host_supported: true,
crate_name: "default_bytes",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_default_bytes.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -382,7 +384,7 @@ rust_test {
host_supported: true,
crate_name: "nfa",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_nfa.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -425,7 +427,7 @@ rust_test {
host_supported: true,
crate_name: "nfa_bytes",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_nfa_bytes.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -468,7 +470,7 @@ rust_test {
host_supported: true,
crate_name: "nfa_utf8bytes",
cargo_env_compat: true,
- cargo_pkg_version: "1.5.4",
+ cargo_pkg_version: "1.7.3",
srcs: ["tests/test_nfa_utf8bytes.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71d1963..44274ac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,98 @@
+1.7.3 (2023-03-24)
+==================
+This is a small release that fixes a bug in `Regex::shortest_match_at` that
+could cause it to panic, even when the offset given is valid.
+
+Bug fixes:
+
+* [BUG #969](https://github.com/rust-lang/regex/issues/969):
+ Fix a bug in how the reverse DFA was called for `Regex::shortest_match_at`.
+
+
+1.7.2 (2023-03-21)
+==================
+This is a small release that fixes a failing test on FreeBSD.
+
+Bug fixes:
+
+* [BUG #967](https://github.com/rust-lang/regex/issues/967):
+ Fix "no stack overflow" test which can fail due to the small stack size.
+
+
+1.7.1 (2023-01-09)
+==================
+This release was done principally to try and fix the docs.rs rendering for the
+regex crate.
+
+Performance improvements:
+
+* [PERF #930](https://github.com/rust-lang/regex/pull/930):
+ Optimize `replacen`. This also applies to `replace`, but not `replace_all`.
+
+Bug fixes:
+
+* [BUG #945](https://github.com/rust-lang/regex/issues/945):
+ Maybe fix rustdoc rendering by just bumping a new release?
+
+
+1.7.0 (2022-11-05)
+==================
+This release principally includes an upgrade to Unicode 15.
+
+New features:
+
+* [FEATURE #916](https://github.com/rust-lang/regex/issues/916):
+ Upgrade to Unicode 15.
+
+
+1.6.0 (2022-07-05)
+==================
+This release principally includes an upgrade to Unicode 14.
+
+New features:
+
+* [FEATURE #832](https://github.com/rust-lang/regex/pull/832):
+ Clarify that `Captures::len` includes all groups, not just matching groups.
+* [FEATURE #857](https://github.com/rust-lang/regex/pull/857):
+ Add an `ExactSizeIterator` impl for `SubCaptureMatches`.
+* [FEATURE #861](https://github.com/rust-lang/regex/pull/861):
+ Improve `RegexSet` documentation examples.
+* [FEATURE #877](https://github.com/rust-lang/regex/issues/877):
+ Upgrade to Unicode 14.
+
+Bug fixes:
+
+* [BUG #792](https://github.com/rust-lang/regex/issues/792):
+ Fix error message rendering bug.
+
+
+1.5.6 (2022-05-20)
+==================
+This release includes a few bug fixes, including a bug that produced incorrect
+matches when a non-greedy `?` operator was used.
+
+* [BUG #680](https://github.com/rust-lang/regex/issues/680):
+ Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
+* [BUG #859](https://github.com/rust-lang/regex/issues/859):
+ Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`.
+* [BUG #862](https://github.com/rust-lang/regex/issues/862):
+ Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'.
+
+
+1.5.5 (2022-03-08)
+==================
+This release fixes a security bug in the regex compiler. This bug permits a
+vector for a denial-of-service attack in cases where the regex being compiled
+is untrusted. There are no known problems where the regex is itself trusted,
+including in cases of untrusted haystacks.
+
+* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8):
+ Fixes a bug in the regex compiler where empty sub-expressions subverted the
+ existing mitigations in place to enforce a size limit on compiled regexes.
+ The Rust Security Response WG published an advisory about this:
+ https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw
+
+
1.5.4 (2021-05-06)
==================
This release fixes another compilation failure when building regex. This time,
@@ -669,7 +764,7 @@ New features:
* Empty sub-expressions are now permitted in most places. That is, `()+` is
now a valid regex.
* Almost everything in regex-syntax now uses constant stack space, even when
- performing anaylsis that requires structural induction. This reduces the risk
+ performing analysis that requires structural induction. This reduces the risk
of a user provided regular expression causing a stack overflow.
* [FEATURE #174](https://github.com/rust-lang/regex/issues/174):
The `Ast` type in `regex-syntax` now contains span information.
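The 1.5.5 and 1.5.6 entries above describe behavior that can be exercised directly from user code. A minimal sketch, assuming the post-upgrade 1.7.3 behavior rather than anything in the patch itself:

```rust
use regex::Regex;

fn main() {
    // GHSA-m5pq-gvj9-9vr8 / CVE-2022-24713: a huge repetition of an empty
    // sub-expression now counts against the size limit and fails to compile.
    assert!(Regex::new("(?:){4294967295}").is_err());

    // BUG #862: the non-greedy `ab??` now matches only "a" in "ab".
    let m = Regex::new("ab??").unwrap().find("ab").unwrap();
    assert_eq!((0, 1), (m.start(), m.end()));
}
```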
diff --git a/Cargo.toml b/Cargo.toml
index 260acec..37e44fb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,27 +3,33 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "regex"
-version = "1.5.4"
+version = "1.7.3"
authors = ["The Rust Project Developers"]
-exclude = ["/scripts/*", "/.github/*"]
+exclude = [
+ "/scripts/*",
+ "/.github/*",
+]
autotests = false
-description = "An implementation of regular expressions for Rust. This implementation uses\nfinite automata and guarantees linear time matching on all inputs.\n"
+description = """
+An implementation of regular expressions for Rust. This implementation uses
+finite automata and guarantees linear time matching on all inputs.
+"""
homepage = "https://github.com/rust-lang/regex"
documentation = "https://docs.rs/regex"
readme = "README.md"
categories = ["text-processing"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex"
+
[profile.bench]
debug = true
@@ -72,6 +78,7 @@ path = "tests/test_backtrack_bytes.rs"
[[test]]
name = "crates-regex"
path = "tests/test_crates_regex.rs"
+
[dependencies.aho-corasick]
version = "0.7.18"
optional = true
@@ -81,8 +88,9 @@ version = "2.4.0"
optional = true
[dependencies.regex-syntax]
-version = "0.6.25"
+version = "0.6.29"
default-features = false
+
[dev-dependencies.lazy_static]
version = "1"
@@ -92,19 +100,44 @@ default-features = false
[dev-dependencies.rand]
version = "0.8.3"
-features = ["getrandom", "small_rng"]
+features = [
+ "getrandom",
+ "small_rng",
+]
default-features = false
[features]
-default = ["std", "perf", "unicode", "regex-syntax/default"]
+default = [
+ "std",
+ "perf",
+ "unicode",
+ "regex-syntax/default",
+]
pattern = []
-perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
+perf = [
+ "perf-cache",
+ "perf-dfa",
+ "perf-inline",
+ "perf-literal",
+]
perf-cache = []
perf-dfa = []
perf-inline = []
-perf-literal = ["aho-corasick", "memchr"]
+perf-literal = [
+ "aho-corasick",
+ "memchr",
+]
std = []
-unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", "regex-syntax/unicode"]
+unicode = [
+ "unicode-age",
+ "unicode-bool",
+ "unicode-case",
+ "unicode-gencat",
+ "unicode-perl",
+ "unicode-script",
+ "unicode-segment",
+ "regex-syntax/unicode",
+]
unicode-age = ["regex-syntax/unicode-age"]
unicode-bool = ["regex-syntax/unicode-bool"]
unicode-case = ["regex-syntax/unicode-case"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 468230b..4c5bd1c 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "regex"
-version = "1.5.4" #:version
+version = "1.7.3" #:version
authors = ["The Rust Project Developers"]
license = "MIT OR Apache-2.0"
readme = "README.md"
@@ -117,7 +117,7 @@ optional = true
# For parsing regular expressions.
[dependencies.regex-syntax]
path = "regex-syntax"
-version = "0.6.25"
+version = "0.6.29"
default-features = false
[dev-dependencies]
diff --git a/METADATA b/METADATA
index 6611aac..f8d1a17 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/regex
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
name: "regex"
description: "An implementation of regular expressions for Rust. This implementation uses finite automata and guarantees linear time matching on all inputs."
third_party {
@@ -7,13 +11,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/regex/regex-1.5.4.crate"
+ value: "https://static.crates.io/crates/regex/regex-1.7.3.crate"
}
- version: "1.5.4"
+ version: "1.7.3"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 5
- day: 19
+ year: 2023
+ month: 4
+ day: 3
}
}
diff --git a/README.md b/README.md
index 86d6996..861417d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Much of the syntax and implementation is inspired
by [RE2](https://github.com/google/re2).
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
-[![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex)
+[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex)
[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
### Documentation
@@ -23,12 +23,8 @@ can be found on the
### Usage
-Add this to your `Cargo.toml`:
-
-```toml
-[dependencies]
-regex = "1.5"
-```
+To bring this crate into your repository, either add `regex` to your
+`Cargo.toml`, or run `cargo add regex`.
Here's a simple example that matches a date in YYYY-MM-DD format and prints the
year, month and day:
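The updated usage text assumes `regex` is already declared as a dependency (via `Cargo.toml` or `cargo add regex`). A minimal sketch of the kind of date matching the README goes on to describe, written here for illustration rather than copied from the README:

```rust
use regex::Regex;

fn main() {
    // Capture year, month and day from a YYYY-MM-DD date.
    let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
    let caps = re.captures("2023-04-03").unwrap();
    println!("year: {}, month: {}, day: {}", &caps[1], &caps[2], &caps[3]);
}
```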
diff --git a/TEST_MAPPING b/TEST_MAPPING
index 6064d70..c99af76 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -5,6 +5,9 @@
"path": "external/rust/crates/base64"
},
{
+ "path": "external/rust/crates/clap/2.33.3"
+ },
+ {
"path": "external/rust/crates/libsqlite3-sys"
},
{
@@ -18,16 +21,22 @@
},
{
"path": "external/rust/crates/unicode-xid"
- }
- ],
- "presubmit": [
+ },
+ {
+ "path": "packages/modules/Virtualization/virtualizationmanager"
+ },
{
- "name": "keystore2_test"
+ "path": "system/keymint/hal"
},
{
- "name": "legacykeystore_test"
+ "path": "system/security/keystore2"
},
{
+ "path": "system/security/keystore2/legacykeystore"
+ }
+ ],
+ "presubmit": [
+ {
"name": "regex_test_src_lib"
},
{
@@ -56,19 +65,10 @@
},
{
"name": "regex_test_tests_test_nfa_utf8bytes"
- },
- {
- "name": "virtualizationservice_device_test"
}
],
"presubmit-rust": [
{
- "name": "keystore2_test"
- },
- {
- "name": "legacykeystore_test"
- },
- {
"name": "regex_test_src_lib"
},
{
@@ -97,9 +97,6 @@
},
{
"name": "regex_test_tests_test_nfa_utf8bytes"
- },
- {
- "name": "virtualizationservice_device_test"
}
]
}
diff --git a/cargo2android.json b/cargo2android.json
index 0e54308..bef74ca 100644
--- a/cargo2android.json
+++ b/cargo2android.json
@@ -7,5 +7,6 @@
"dependencies": true,
"device": true,
"run": true,
- "tests": true
+ "tests": true,
+ "vendor-available": true
}
diff --git a/src/backtrack.rs b/src/backtrack.rs
index a3d25d6..4d83856 100644
--- a/src/backtrack.rs
+++ b/src/backtrack.rs
@@ -93,13 +93,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
let mut cache = cache.borrow_mut();
let cache = &mut cache.backtrack;
let start = input.at(start);
- let mut b = Bounded {
- prog: prog,
- input: input,
- matches: matches,
- slots: slots,
- m: cache,
- };
+ let mut b = Bounded { prog, input, matches, slots, m: cache };
b.exec_(start, end)
}
@@ -220,14 +214,14 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
// job is popped and the old capture index is restored.
self.m.jobs.push(Job::SaveRestore {
slot: inst.slot,
- old_pos: old_pos,
+ old_pos,
});
self.slots[inst.slot] = Some(at.pos());
}
ip = inst.goto;
}
Split(ref inst) => {
- self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
+ self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
ip = inst.goto1;
}
EmptyLook(ref inst) => {
diff --git a/src/compile.rs b/src/compile.rs
index 9a2ed5e..90ca250 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -38,6 +38,16 @@ pub struct Compiler {
suffix_cache: SuffixCache,
utf8_seqs: Option<Utf8Sequences>,
byte_classes: ByteClassSet,
+ // This keeps track of extra bytes allocated while compiling the regex
+ // program. Currently, this corresponds to two things. First is the heap
+ // memory allocated by Unicode character classes ('InstRanges'). Second is
+ // a "fake" amount of memory used by empty sub-expressions, so that enough
+ // empty sub-expressions will ultimately trigger the compiler to bail
+ // because of a size limit restriction. (That empty sub-expressions don't
+ // add to heap memory usage is more-or-less an implementation detail.) In
+ // the second case, if we don't bail, then an excessively large repetition
+ // on an empty sub-expression can result in the compiler using a very large
+ // amount of CPU time.
extra_inst_bytes: usize,
}
@@ -139,7 +149,8 @@ impl Compiler {
self.compiled.start = dotstar_patch.entry;
}
self.compiled.captures = vec![None];
- let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
+ let patch =
+ self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
if self.compiled.needs_dotstar() {
self.fill(dotstar_patch.hole, patch.entry);
} else {
@@ -175,7 +186,7 @@ impl Compiler {
self.fill_to_next(prev_hole);
let split = self.push_split_hole();
let Patch { hole, entry } =
- self.c_capture(0, expr)?.unwrap_or(self.next_inst());
+ self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
self.fill_to_next(hole);
self.compiled.matches.push(self.insts.len());
self.push_compiled(Inst::Match(i));
@@ -183,7 +194,7 @@ impl Compiler {
}
let i = exprs.len() - 1;
let Patch { hole, entry } =
- self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
+ self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
self.fill(prev_hole, entry);
self.fill_to_next(hole);
self.compiled.matches.push(self.insts.len());
@@ -260,7 +271,7 @@ impl Compiler {
self.check_size()?;
match *expr.kind() {
- Empty => Ok(None),
+ Empty => self.c_empty(),
Literal(hir::Literal::Unicode(c)) => self.c_char(c),
Literal(hir::Literal::Byte(b)) => {
assert!(self.compiled.uses_bytes());
@@ -378,6 +389,19 @@ impl Compiler {
}
}
+ fn c_empty(&mut self) -> ResultOrEmpty {
+ // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+ // See: CVE-2022-24713
+ //
+ // Since 'empty' sub-expressions don't increase the size of
+ // the actual compiled object, we "fake" an increase in its
+ // size so that our 'check_size_limit' routine will eventually
+ // stop compilation if there are too many empty sub-expressions
+ // (e.g., via a large repetition).
+ self.extra_inst_bytes += std::mem::size_of::<Inst>();
+ Ok(None)
+ }
+
fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
if self.num_exprs > 1 || self.compiled.is_dfa {
// Don't ever compile Save instructions for regex sets because
@@ -387,11 +411,11 @@ impl Compiler {
} else {
let entry = self.insts.len();
let hole = self.push_hole(InstHole::Save { slot: first_slot });
- let patch = self.c(expr)?.unwrap_or(self.next_inst());
+ let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
self.fill(hole, patch.entry);
self.fill_to_next(patch.hole);
let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
- Ok(Some(Patch { hole: hole, entry: entry }))
+ Ok(Some(Patch { hole, entry }))
}
}
@@ -425,7 +449,7 @@ impl Compiler {
self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
}
} else {
- let hole = self.push_hole(InstHole::Char { c: c });
+ let hole = self.push_hole(InstHole::Char { c });
Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
}
}
@@ -435,7 +459,7 @@ impl Compiler {
assert!(!ranges.is_empty());
if self.compiled.uses_bytes() {
- Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
+ Ok(Some(CompileClass { c: self, ranges }.compile()?))
} else {
let ranges: Vec<(char, char)> =
ranges.iter().map(|r| (r.start(), r.end())).collect();
@@ -444,9 +468,9 @@ impl Compiler {
} else {
self.extra_inst_bytes +=
ranges.len() * (size_of::<char>() * 2);
- self.push_hole(InstHole::Ranges { ranges: ranges })
+ self.push_hole(InstHole::Ranges { ranges })
};
- Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
+ Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
}
}
@@ -485,8 +509,8 @@ impl Compiler {
}
fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
- let hole = self.push_hole(InstHole::EmptyLook { look: look });
- Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
+ let hole = self.push_hole(InstHole::EmptyLook { look });
+ Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
}
fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
@@ -496,7 +520,7 @@ impl Compiler {
let mut exprs = exprs.into_iter();
let Patch { mut hole, entry } = loop {
match exprs.next() {
- None => return Ok(None),
+ None => return self.c_empty(),
Some(e) => {
if let Some(p) = self.c(e)? {
break p;
@@ -510,7 +534,7 @@ impl Compiler {
hole = p.hole;
}
}
- Ok(Some(Patch { hole: hole, entry: entry }))
+ Ok(Some(Patch { hole, entry }))
}
fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
@@ -653,7 +677,7 @@ impl Compiler {
// None).
let patch_concat = self
.c_concat(iter::repeat(expr).take(min))?
- .unwrap_or(self.next_inst());
+ .unwrap_or_else(|| self.next_inst());
if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
self.fill(patch_concat.hole, patch_rep.entry);
Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
@@ -677,7 +701,7 @@ impl Compiler {
}
// Same reasoning as in c_repeat_range_min_or_more (we know that min <
// max at this point).
- let patch_concat = patch_concat.unwrap_or(self.next_inst());
+ let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
let initial_entry = patch_concat.entry;
// It is much simpler to compile, e.g., `a{2,5}` as:
//
@@ -856,14 +880,14 @@ impl MaybeInst {
}
MaybeInst::Split1(goto1) => {
MaybeInst::Compiled(Inst::Split(InstSplit {
- goto1: goto1,
+ goto1,
goto2: goto,
}))
}
MaybeInst::Split2(goto2) => {
MaybeInst::Compiled(Inst::Split(InstSplit {
goto1: goto,
- goto2: goto2,
+ goto2,
}))
}
_ => unreachable!(
@@ -877,9 +901,7 @@ impl MaybeInst {
fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
let filled = match *self {
- MaybeInst::Split => {
- Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
- }
+ MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
_ => unreachable!(
"must be called on Split instruction, \
instead it was called on: {:?}",
@@ -937,19 +959,17 @@ enum InstHole {
impl InstHole {
fn fill(&self, goto: InstPtr) -> Inst {
match *self {
- InstHole::Save { slot } => {
- Inst::Save(InstSave { goto: goto, slot: slot })
- }
+ InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
InstHole::EmptyLook { look } => {
- Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
+ Inst::EmptyLook(InstEmptyLook { goto, look })
}
- InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
+ InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
- goto: goto,
+ goto,
ranges: ranges.clone().into_boxed_slice(),
}),
InstHole::Bytes { start, end } => {
- Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
+ Inst::Bytes(InstBytes { goto, start, end })
}
}
}
@@ -1019,7 +1039,7 @@ impl<'a, 'b> CompileClass<'a, 'b> {
let mut last_hole = Hole::None;
for byte_range in seq {
let key = SuffixCacheKey {
- from_inst: from_inst,
+ from_inst,
start: byte_range.start,
end: byte_range.end,
};
@@ -1109,7 +1129,7 @@ impl SuffixCache {
}
}
*pos = self.dense.len();
- self.dense.push(SuffixCacheEntry { key: key, pc: pc });
+ self.dense.push(SuffixCacheEntry { key, pc });
None
}
@@ -1120,8 +1140,8 @@ impl SuffixCache {
fn hash(&self, suffix: &SuffixCacheKey) -> usize {
// Basic FNV-1a hash as described:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
- const FNV_PRIME: u64 = 1099511628211;
- let mut h = 14695981039346656037;
+ const FNV_PRIME: u64 = 1_099_511_628_211;
+ let mut h = 14_695_981_039_346_656_037;
h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
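The `extra_inst_bytes` bookkeeping and the new `c_empty` above feed the compiler's size-limit check, so patterns built almost entirely from empty sub-expressions are now rejected instead of burning compile time. A minimal sketch of the user-visible effect, assuming the default (or an explicitly configured) size limit:

```rust
use regex::RegexBuilder;

fn main() {
    // Each empty sub-expression is charged a fake instruction's worth of
    // memory, so this pathological repetition trips the size limit.
    assert!(RegexBuilder::new("(?:){4294967295}").build().is_err());

    // Ordinary patterns still compile comfortably under an explicit limit.
    let re = RegexBuilder::new(r"a{2,5}")
        .size_limit(1 << 20)
        .build()
        .unwrap();
    assert!(re.is_match("aaa"));
}
```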
diff --git a/src/dfa.rs b/src/dfa.rs
index 4b60f4d..dc99521 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -31,7 +31,7 @@ considerably more complex than one might expect out of a DFA. A number of
tricks are employed to make it fast. Tread carefully.
N.B. While this implementation is heavily commented, Russ Cox's series of
-articles on regexes is strongly recommended: https://swtch.com/~rsc/regexp/
+articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
(As is the DFA implementation in RE2, which heavily influenced this
implementation.)
*/
@@ -454,10 +454,10 @@ impl<'a> Fsm<'a> {
let mut cache = cache.borrow_mut();
let cache = &mut cache.dfa;
let mut dfa = Fsm {
- prog: prog,
+ prog,
start: 0, // filled in below
- at: at,
- quit_after_match: quit_after_match,
+ at,
+ quit_after_match,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
cache: &mut cache.inner,
@@ -484,10 +484,10 @@ impl<'a> Fsm<'a> {
let mut cache = cache.borrow_mut();
let cache = &mut cache.dfa_reverse;
let mut dfa = Fsm {
- prog: prog,
+ prog,
start: 0, // filled in below
- at: at,
- quit_after_match: quit_after_match,
+ at,
+ quit_after_match,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
cache: &mut cache.inner,
@@ -515,9 +515,9 @@ impl<'a> Fsm<'a> {
let mut cache = cache.borrow_mut();
let cache = &mut cache.dfa;
let mut dfa = Fsm {
- prog: prog,
+ prog,
start: 0, // filled in below
- at: at,
+ at,
quit_after_match: false,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
@@ -1353,7 +1353,6 @@ impl<'a> Fsm<'a> {
match self.cache.trans.next(si, self.byte_class(b)) {
STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
STATE_QUIT => None,
- STATE_DEAD => Some(STATE_DEAD),
nsi => Some(nsi),
}
}
@@ -1387,7 +1386,6 @@ impl<'a> Fsm<'a> {
};
match self.cache.start_states[flagi] {
STATE_UNKNOWN => {}
- STATE_DEAD => return Some(STATE_DEAD),
si => return Some(si),
}
q.clear();
@@ -1608,11 +1606,7 @@ struct StateMap {
impl StateMap {
fn new(num_byte_classes: usize) -> StateMap {
- StateMap {
- map: HashMap::new(),
- states: vec![],
- num_byte_classes: num_byte_classes,
- }
+ StateMap { map: HashMap::new(), states: vec![], num_byte_classes }
}
fn len(&self) -> usize {
@@ -1648,7 +1642,7 @@ impl Transitions {
/// The number of byte classes corresponds to the stride. Every state will
/// have `num_byte_classes` slots for transitions.
fn new(num_byte_classes: usize) -> Transitions {
- Transitions { table: vec![], num_byte_classes: num_byte_classes }
+ Transitions { table: vec![], num_byte_classes }
}
/// Returns the total number of states currently in this table.
@@ -1698,27 +1692,27 @@ impl Transitions {
impl StateFlags {
fn is_match(&self) -> bool {
- self.0 & 0b0000000_1 > 0
+ self.0 & 0b0000_0001 > 0
}
fn set_match(&mut self) {
- self.0 |= 0b0000000_1;
+ self.0 |= 0b0000_0001;
}
fn is_word(&self) -> bool {
- self.0 & 0b000000_1_0 > 0
+ self.0 & 0b0000_0010 > 0
}
fn set_word(&mut self) {
- self.0 |= 0b000000_1_0;
+ self.0 |= 0b0000_0010;
}
fn has_empty(&self) -> bool {
- self.0 & 0b00000_1_00 > 0
+ self.0 & 0b0000_0100 > 0
}
fn set_empty(&mut self) {
- self.0 |= 0b00000_1_00;
+ self.0 |= 0b0000_0100;
}
}
diff --git a/src/exec.rs b/src/exec.rs
index d5fad1c..b9abcdc 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -288,10 +288,10 @@ impl ExecBuilder {
exprs.push(expr);
}
Ok(Parsed {
- exprs: exprs,
+ exprs,
prefixes: prefixes.unwrap_or_else(Literals::empty),
suffixes: suffixes.unwrap_or_else(Literals::empty),
- bytes: bytes,
+ bytes,
})
}
@@ -311,7 +311,7 @@ impl ExecBuilder {
match_type: MatchType::Nothing,
});
let pool = ExecReadOnly::new_pool(&ro);
- return Ok(Exec { ro: ro, pool });
+ return Ok(Exec { ro, pool });
}
let parsed = self.parse()?;
let mut nfa = Compiler::new()
@@ -340,12 +340,12 @@ impl ExecBuilder {
let mut ro = ExecReadOnly {
res: self.options.pats,
- nfa: nfa,
- dfa: dfa,
- dfa_reverse: dfa_reverse,
+ nfa,
+ dfa,
+ dfa_reverse,
suffixes: LiteralSearcher::suffixes(parsed.suffixes),
#[cfg(feature = "perf-literal")]
- ac: ac,
+ ac,
match_type: MatchType::Nothing,
};
ro.match_type = ro.choose_match_type(self.match_type);
@@ -459,7 +459,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
self.cache.value(),
true,
&text[start..],
- text.len(),
+ text.len() - start,
) {
dfa::Result::Match(_) => Some(text.len()),
dfa::Result::NoMatch(_) => None,
@@ -511,7 +511,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
self.cache.value(),
true,
&text[start..],
- text.len(),
+ text.len() - start,
) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch(_) => false,
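The two `text.len() - start` fixes above change how the DFA is invoked when a search begins at a nonzero offset; this is the fix tracked as BUG #969 in the CHANGELOG. A minimal sketch mirroring the regression test added later in this patch, assuming 1.7.3 behavior:

```rust
use regex::Regex;

fn main() {
    // Searching from offset 4 previously mishandled the end bound; with the
    // fix, the end-anchored pattern reports a match ending at byte 6.
    let re = Regex::new(r"c.*d\z").unwrap();
    assert_eq!(Some(6), re.shortest_match_at("ababcd", 4));
    assert!(re.is_match_at("ababcd", 4));
}
```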
diff --git a/src/expand.rs b/src/expand.rs
index fd9c2d0..67b5149 100644
--- a/src/expand.rs
+++ b/src/expand.rs
@@ -127,7 +127,7 @@ impl From<usize> for Ref<'static> {
/// If no such valid reference could be found, None is returned.
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
let mut i = 0;
- let rep: &[u8] = replacement.as_ref();
+ let rep: &[u8] = replacement;
if rep.len() <= 1 || rep[0] != b'$' {
return None;
}
@@ -136,7 +136,7 @@ fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
return find_cap_ref_braced(rep, i + 1);
}
let mut cap_end = i;
- while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
+ while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
cap_end += 1;
}
if cap_end == i {
@@ -183,8 +183,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
}
/// Returns true if and only if the given byte is allowed in a capture name.
-fn is_valid_cap_letter(b: &u8) -> bool {
- match *b {
+fn is_valid_cap_letter(b: u8) -> bool {
+ match b {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
_ => false,
}
diff --git a/src/input.rs b/src/input.rs
index 5d50ee3..df6c3e0 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -160,7 +160,7 @@ impl<'t> Input for CharInput<'t> {
InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
} else {
let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
- InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
+ InputAt { pos: i, c, byte: None, len: c.len_utf8() }
}
}
@@ -231,7 +231,7 @@ pub struct ByteInput<'t> {
impl<'t> ByteInput<'t> {
/// Return a new byte-based input reader for the given string.
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
- ByteInput { text: text, only_utf8: only_utf8 }
+ ByteInput { text, only_utf8 }
}
}
diff --git a/src/lib.rs b/src/lib.rs
index 7f2dec8..6b95739 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -353,6 +353,9 @@ $ the end of text (or end-of-line with multi-line mode)
\B not a Unicode word boundary
</pre>
+The empty regex is valid and matches the empty string. For example, the empty
+regex matches `abc` at positions `0`, `1`, `2` and `3`.
+
## Grouping and flags
<pre class="rust">
@@ -628,7 +631,6 @@ pub use crate::re_builder::unicode::*;
#[cfg(feature = "std")]
pub use crate::re_set::unicode::*;
#[cfg(feature = "std")]
-#[cfg(feature = "std")]
pub use crate::re_unicode::{
escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
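The sentence added to the module docs above can be demonstrated directly; a small sketch of my own, not part of the crate docs:

```rust
use regex::Regex;

fn main() {
    // The empty pattern is valid and matches the empty string at every
    // position of the haystack, including the end.
    let re = Regex::new("").unwrap();
    let starts: Vec<usize> = re.find_iter("abc").map(|m| m.start()).collect();
    assert_eq!(vec![0, 1, 2, 3], starts);
}
```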
diff --git a/src/literal/imp.rs b/src/literal/imp.rs
index 82f050a..90b2f11 100644
--- a/src/literal/imp.rs
+++ b/src/literal/imp.rs
@@ -57,10 +57,10 @@ impl LiteralSearcher {
fn new(lits: Literals, matcher: Matcher) -> Self {
let complete = lits.all_complete();
LiteralSearcher {
- complete: complete,
+ complete,
lcp: Memmem::new(lits.longest_common_prefix()),
lcs: Memmem::new(lits.longest_common_suffix()),
- matcher: matcher,
+ matcher,
}
}
diff --git a/src/pattern.rs b/src/pattern.rs
index b4ffd8e..00549e5 100644
--- a/src/pattern.rs
+++ b/src/pattern.rs
@@ -15,7 +15,7 @@ impl<'r, 't> Pattern<'t> for &'r Regex {
fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> {
RegexSearcher {
- haystack: haystack,
+ haystack,
it: self.find_iter(haystack),
last_step_end: 0,
next_match: None,
diff --git a/src/pikevm.rs b/src/pikevm.rs
index 9a14240..8c9eac2 100644
--- a/src/pikevm.rs
+++ b/src/pikevm.rs
@@ -100,7 +100,7 @@ impl<'r, I: Input> Fsm<'r, I> {
cache.clist.resize(prog.len(), prog.captures.len());
cache.nlist.resize(prog.len(), prog.captures.len());
let at = input.at(start);
- Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
+ Fsm { prog, stack: &mut cache.stack, input }.exec_(
&mut cache.clist,
&mut cache.nlist,
matches,
diff --git a/src/prog.rs b/src/prog.rs
index 475a811..c211f71 100644
--- a/src/prog.rs
+++ b/src/prog.rs
@@ -233,7 +233,7 @@ impl fmt::Debug for Program {
if pc == self.start {
write!(f, " (start)")?;
}
- write!(f, "\n")?;
+ writeln!(f)?;
}
Ok(())
}
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index ae55d6d..07e9f98 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -53,7 +53,7 @@ impl<'t> Match<'t> {
/// Creates a new match from the given haystack and byte offsets.
#[inline]
fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
- Match { text: haystack, start: start, end: end }
+ Match { text: haystack, start, end }
}
}
@@ -255,7 +255,7 @@ impl Regex {
pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
let mut locs = self.capture_locations();
self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
- text: text,
+ text,
locs: locs.0,
named_groups: self.0.capture_name_idx().clone(),
})
@@ -496,12 +496,12 @@ impl Regex {
let mut new = Vec::with_capacity(text.len());
let mut last_match = 0;
for (i, m) in it {
- if limit > 0 && i >= limit {
- break;
- }
new.extend_from_slice(&text[last_match..m.start()]);
new.extend_from_slice(&rep);
last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
}
new.extend_from_slice(&text[last_match..]);
return Cow::Owned(new);
@@ -516,14 +516,14 @@ impl Regex {
let mut new = Vec::with_capacity(text.len());
let mut last_match = 0;
for (i, cap) in it {
- if limit > 0 && i >= limit {
- break;
- }
// unwrap on 0 is OK because captures only reports matches
let m = cap.get(0).unwrap();
new.extend_from_slice(&text[last_match..m.start()]);
rep.replace_append(&cap, &mut new);
last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
}
new.extend_from_slice(&text[last_match..]);
Cow::Owned(new)
@@ -578,7 +578,7 @@ impl Regex {
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
- self.shortest_match_at(text, start).is_some()
+ self.0.searcher().is_match_at(text, start)
}
/// Returns the same as find, but starts the search at the given
@@ -723,7 +723,7 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
fn next(&mut self) -> Option<Captures<'t>> {
self.0.next().map(|locs| Captures {
text: self.0.text(),
- locs: locs,
+ locs,
named_groups: self.0.regex().capture_name_idx().clone(),
})
}
@@ -877,7 +877,7 @@ impl CaptureLocations {
self.0.pos(i)
}
- /// Returns the total number of capturing groups.
+ /// Returns the total number of capture groups (even if they didn't match).
///
/// This is always at least `1` since every regex has at least `1`
/// capturing group that corresponds to the entire match.
@@ -979,7 +979,7 @@ impl<'t> Captures<'t> {
expand_bytes(self, replacement, dst)
}
- /// Returns the number of captured groups.
+ /// Returns the total number of capture groups (even if they didn't match).
///
/// This is always at least `1`, since every regex has at least one capture
/// group that corresponds to the full match.
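The reordered limit check in `replacen` above keeps the documented contract (at most `limit` replacements, with `0` meaning unlimited) while skipping the extra search that used to run only to discover the limit had been reached; this is the PERF #930 optimization from the CHANGELOG. A small sketch of that contract using the bytes API, mirroring the new test in `tests/replace.rs`:

```rust
use regex::bytes::Regex;

fn main() {
    let re = Regex::new(r"[0-9]").unwrap();
    // Replace at most two digits; passing 0 instead would replace them all.
    let replaced = re.replacen(b"age: 1234", 2, &b"Z"[..]);
    assert_eq!(&b"age: ZZ34"[..], &*replaced);
}
```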
diff --git a/src/re_set.rs b/src/re_set.rs
index 73d5953..a6d886d 100644
--- a/src/re_set.rs
+++ b/src/re_set.rs
@@ -59,13 +59,45 @@ $(#[$doc_regexset_example])*
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
-/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
-/// since the matching engines can stop after the first match is found.
+/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
+/// instead of (2) since the matching engines can stop after the first match
+/// is found.
///
-/// Other features like finding the location of successive matches or their
-/// sub-captures aren't supported. If you need this functionality, the
-/// recommended approach is to compile each regex in the set independently and
-/// selectively match them based on which regexes in the set matched.
+/// You cannot directly extract [`Match`][crate::Match] or
+/// [`Captures`][crate::Captures] objects from a regex set. If you need these
+/// operations, the recommended approach is to compile each pattern in the set
+/// independently and scan the exact same input a second time with those
+/// independently compiled patterns:
+///
+/// ```rust
+/// use regex::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let text = "barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(&patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set.patterns().iter()
+/// .map(|pat| Regex::new(pat).unwrap())
+/// .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&str> = set.matches(text).into_iter()
+/// // Dereference the match index to get the corresponding
+/// // compiled pattern.
+/// .map(|match_idx| &regexes[match_idx])
+/// // To get match locations or any other info, we then have to search
+/// // the exact same text again, using our separately-compiled pattern.
+/// .map(|pat| pat.find(text).unwrap().as_str())
+/// .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the input.
+/// assert_eq!(vec!["foo", "bar"], matches);
+/// ```
///
/// # Performance
///
diff --git a/src/re_trait.rs b/src/re_trait.rs
index 680aa54..d0c717d 100644
--- a/src/re_trait.rs
+++ b/src/re_trait.rs
@@ -74,8 +74,19 @@ impl<'c> Iterator for SubCapturesPosIter<'c> {
self.idx += 1;
x
}
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ let len = self.locs.len() - self.idx;
+ (len, Some(len))
+ }
+
+ fn count(self) -> usize {
+ self.len()
+ }
}
+impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
+
impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
/// `RegularExpression` describes types that can implement regex searching.
@@ -139,7 +150,7 @@ pub trait RegularExpression: Sized + fmt::Debug {
/// Returns an iterator over all non-overlapping successive leftmost-first
/// matches.
fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
- Matches { re: self, text: text, last_end: 0, last_match: None }
+ Matches { re: self, text, last_end: 0, last_match: None }
}
/// Returns an iterator over all non-overlapping successive leftmost-first
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index 142c78f..197510e 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -61,7 +61,7 @@ impl<'t> Match<'t> {
/// Creates a new match from the given haystack and byte offsets.
#[inline]
fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
- Match { text: haystack, start: start, end: end }
+ Match { text: haystack, start, end }
}
}
@@ -129,7 +129,7 @@ impl<'t> From<Match<'t>> for Range<usize> {
/// assert!(haystack.contains(&re));
/// assert_eq!(haystack.find(&re), Some(1));
/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
-/// vec![(1, 4), (5, 8)]);
+/// vec![(1, "111"), (5, "222")]);
/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
/// ```
#[derive(Clone)]
@@ -311,7 +311,7 @@ impl Regex {
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
let mut locs = self.capture_locations();
self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
- text: text,
+ text,
locs: locs.0,
named_groups: self.0.capture_name_idx().clone(),
})
@@ -538,7 +538,7 @@ impl Regex {
mut rep: R,
) -> Cow<'t, str> {
// If we know that the replacement doesn't have any capture expansions,
- // then we can fast path. The fast path can make a tremendous
+ // then we can use the fast path. The fast path can make a tremendous
// difference:
//
// 1) We use `find_iter` instead of `captures_iter`. Not asking for
@@ -554,12 +554,12 @@ impl Regex {
let mut new = String::with_capacity(text.len());
let mut last_match = 0;
for (i, m) in it {
- if limit > 0 && i >= limit {
- break;
- }
new.push_str(&text[last_match..m.start()]);
new.push_str(&rep);
last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
}
new.push_str(&text[last_match..]);
return Cow::Owned(new);
@@ -574,14 +574,14 @@ impl Regex {
let mut new = String::with_capacity(text.len());
let mut last_match = 0;
for (i, cap) in it {
- if limit > 0 && i >= limit {
- break;
- }
// unwrap on 0 is OK because captures only reports matches
let m = cap.get(0).unwrap();
new.push_str(&text[last_match..m.start()]);
rep.replace_append(&cap, &mut new);
last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
}
new.push_str(&text[last_match..]);
Cow::Owned(new)
@@ -636,7 +636,7 @@ impl Regex {
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
pub fn is_match_at(&self, text: &str, start: usize) -> bool {
- self.shortest_match_at(text, start).is_some()
+ self.0.searcher_str().is_match_at(text, start)
}
/// Returns the same as find, but starts the search at the given
@@ -887,7 +887,7 @@ impl CaptureLocations {
self.0.pos(i)
}
- /// Returns the total number of capturing groups.
+ /// Returns the total number of capture groups (even if they didn't match).
///
/// This is always at least `1` since every regex has at least `1`
/// capturing group that corresponds to the entire match.
@@ -989,7 +989,7 @@ impl<'t> Captures<'t> {
expand_str(self, replacement, dst)
}
- /// Returns the number of captured groups.
+ /// Returns the total number of capture groups (even if they didn't match).
///
/// This is always at least `1`, since every regex has at least one capture
/// group that corresponds to the full match.
@@ -1092,8 +1092,18 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
.next()
.map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
}
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+
+ fn count(self) -> usize {
+ self.it.count()
+ }
}
+impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {}
+
impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
/// An iterator that yields all non-overlapping capture groups matching a
@@ -1114,7 +1124,7 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
fn next(&mut self) -> Option<Captures<'t>> {
self.0.next().map(|locs| Captures {
text: self.0.text(),
- locs: locs,
+ locs,
named_groups: self.0.regex().capture_name_idx().clone(),
})
}
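With the `size_hint`/`count` methods and the `ExactSizeIterator` impl added above (FEATURE #857 in the CHANGELOG), the capture-group iterator reports an exact length that covers every group, matched or not. A brief sketch of my own:

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
    let caps = re.captures("2023-04-03").unwrap();
    // Group 0 (the whole match) plus three capture groups.
    assert_eq!(4, caps.len());
    assert_eq!(4, caps.iter().len());
}
```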
diff --git a/src/utf8.rs b/src/utf8.rs
index 6e0608f..2dfd2c0 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -108,7 +108,7 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
| ((b2 & !TAG_CONT) as u32) << 6
| ((b3 & !TAG_CONT) as u32);
match cp {
- 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
+ 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
_ => None,
}
}
diff --git a/tests/regression.rs b/tests/regression.rs
index 44b9083..e8b2525 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -217,3 +217,6 @@ matiter!(
// https://en.wikipedia.org/wiki/Je_(Cyrillic)
ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
+
+// See: https://github.com/rust-lang/regex/issues/862
+mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));
diff --git a/tests/replace.rs b/tests/replace.rs
index 1dc6106..d65be07 100644
--- a/tests/replace.rs
+++ b/tests/replace.rs
@@ -228,3 +228,21 @@ replace!(
bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
"age: Z6"
);
+
+#[test]
+fn replacen_no_captures() {
+ let re = regex!(r"[0-9]");
+ assert_eq!(
+ re.replacen(text!("age: 1234"), 2, t!("Z")),
+ text!("age: ZZ34")
+ );
+}
+
+#[test]
+fn replacen_with_captures() {
+ let re = regex!(r"([0-9])");
+ assert_eq!(
+ re.replacen(text!("age: 1234"), 2, t!("${1}Z")),
+ text!("age: 1Z2Z34")
+ );
+}
diff --git a/tests/test_default.rs b/tests/test_default.rs
index d4365fb..19a319a 100644
--- a/tests/test_default.rs
+++ b/tests/test_default.rs
@@ -150,3 +150,83 @@ fn regex_is_reasonably_small() {
assert_eq!(16, size_of::<bytes::Regex>());
assert_eq!(16, size_of::<bytes::RegexSet>());
}
+
+// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+// See: CVE-2022-24713
+//
+// We test that our regex compiler will correctly return a "too big" error when
+// we try to use a very large repetition on an *empty* sub-expression.
+//
+// At the time this test was written, the regex compiler does not represent
+// empty sub-expressions with any bytecode instructions. In effect, it's an
+// "optimization" to leave them out, since they would otherwise correspond
+// to an unconditional JUMP in the regex bytecode (i.e., an unconditional
+// epsilon transition in the NFA graph). Therefore, an empty sub-expression
+// represents an interesting case for the compiler's size limits. Since it
+// doesn't actually contribute any additional memory to the compiled regex
+// instructions, the size limit machinery never detects it. Instead, it just
+// dumbly tries to compile the empty sub-expression N times, where N is the
+// repetition size.
+//
+// When N is very large, this will cause the compiler to essentially spin and
+// do nothing for a decently large amount of time. It causes the regex to take
+// quite a bit of time to compile, despite the concrete syntax of the regex
+// being quite small.
+//
+// The degree to which this is actually a problem is somewhat of a judgment
+// call. Some regexes simply take a long time to compile. But in general, you
+// should be able to reasonably control this by setting lower or higher size
+// limits on the compiled object size. But this mitigation doesn't work at all
+// for this case.
+//
+// This particular test is somewhat narrow. It merely checks that regex
+// compilation will, at some point, return a "too big" error. Before the
+// fix landed, this test would eventually fail because the regex would be
+// successfully compiled (after enough time elapsed). So while this test
+// doesn't check that we exit in a reasonable amount of time, it does at least
+// check that we are properly returning an error at some point.
+#[test]
+fn big_empty_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new("(?:){4294967295}");
+ assert!(result.is_err());
+}
+
+// Below is a "billion laughs" variant of the previous test case.
+#[test]
+fn big_empty_reps_chain_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}");
+ assert!(result.is_err());
+}
+
+// Below is another situation where a zero-length sub-expression can be
+// introduced.
+#[test]
+fn big_zero_reps_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new(r"x{0}{4294967295}");
+ assert!(result.is_err());
+}
+
+// Testing another case for completeness.
+#[test]
+fn empty_alt_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new(r"(?:|){4294967295}");
+ assert!(result.is_err());
+}
+
+// Regression test for: https://github.com/rust-lang/regex/issues/969
+#[test]
+fn regression_i969() {
+ use regex::Regex;
+
+ let re = Regex::new(r"c.*d\z").unwrap();
+ assert_eq!(Some(6), re.shortest_match_at("ababcd", 4));
+ assert_eq!(Some(6), re.find_at("ababcd", 4).map(|m| m.end()));
+}
diff --git a/tests/unicode.rs b/tests/unicode.rs
index 9f1cd0c..9b32286 100644
--- a/tests/unicode.rs
+++ b/tests/unicode.rs
@@ -232,3 +232,20 @@ mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2)));
mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3)));
mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4)));
mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3)));
+
+// Test 'Vithkuqi' support, which was added in Unicode 14.
+// See: https://github.com/rust-lang/regex/issues/877
+mat!(
+ uni_vithkuqi_literal_upper,
+ r"(?i)^\u{10570}$",
+ "\u{10570}",
+ Some((0, 4))
+);
+mat!(
+ uni_vithkuqi_literal_lower,
+ r"(?i)^\u{10570}$",
+ "\u{10597}",
+ Some((0, 4))
+);
+mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4)));
+mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4)));