diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-06-15 21:45:16 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-06-15 21:45:16 +0000 |
commit | 5856c588c52d87fd4c1408b0d62ee40eda83634d (patch) | |
tree | 5f360a0d078f211e4938d8bdb073d5aa685ed554 | |
parent | 471713278e7990f6326a0d142d68c416a997f7fb (diff) | |
parent | e47e2cd7550328fccccb1a12de97d62011f04bd7 (diff) | |
download | regex-aml_tz3_314012010.tar.gz |
Snap for 8730993 from e47e2cd7550328fccccb1a12de97d62011f04bd7 to mainline-tzdata3-releaseaml_tz3_314012070aml_tz3_314012050aml_tz3_314012010aml_tz3_313110000aml_tz3_312511020aml_tz3_312511010aml_tz3_312410020aml_tz3_312410010android12-mainline-tzdata3-releaseaml_tz3_314012010
Change-Id: I09b4f76635b861187223f7fb274f45ebfd84724e
48 files changed, 1080 insertions, 935 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index 51b3cd6..33a143b 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,5 @@ { "git": { - "sha1": "f2dc1b788f773a49f1b6633a6302054978344452" + "sha1": "ff283badce21dcebd581909d38b81f2c8c9bfb54" } } @@ -1,4 +1,4 @@ -// This file is generated by cargo2android.py --config cargo2android.json. +// This file is generated by cargo2android.py --run --device --dependencies. // Do not modify this file as changes will be overridden on upgrade. package { @@ -42,10 +42,8 @@ rust_library { name: "libregex", host_supported: true, crate_name: "regex", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", srcs: ["src/lib.rs"], - edition: "2018", + edition: "2015", features: [ "aho-corasick", "default", @@ -70,438 +68,9 @@ rust_library { "libmemchr", "libregex_syntax", ], - apex_available: [ - "//apex_available:platform", - "com.android.compos", - "com.android.virt", - ], -} - -rust_test { - name: "regex_test_src_lib", - host_supported: true, - crate_name: "regex", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["src/lib.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_backtrack", - host_supported: true, - crate_name: "backtrack", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_backtrack.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_backtrack_bytes", - host_supported: true, - crate_name: "backtrack_bytes", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_backtrack_bytes.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_backtrack_utf8bytes", - host_supported: true, - crate_name: "backtrack_utf8bytes", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_backtrack_utf8bytes.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_crates_regex", - host_supported: true, - crate_name: "crates_regex", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_crates_regex.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_default", - host_supported: true, - crate_name: "default", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_default.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_default_bytes", - host_supported: true, - crate_name: "default_bytes", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_default_bytes.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_nfa", - host_supported: true, - crate_name: "nfa", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_nfa.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} - -rust_test { - name: "regex_test_tests_test_nfa_bytes", - host_supported: true, - crate_name: "nfa_bytes", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_nfa_bytes.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], } -rust_test { - name: "regex_test_tests_test_nfa_utf8bytes", - host_supported: true, - crate_name: "nfa_utf8bytes", - cargo_env_compat: true, - cargo_pkg_version: "1.5.4", - srcs: ["tests/test_nfa_utf8bytes.rs"], - test_suites: ["general-tests"], - auto_gen_config: true, - test_options: { - unit_test: true, - }, - edition: "2018", - features: [ - "aho-corasick", - "default", - "memchr", - "perf", - "perf-cache", - "perf-dfa", - "perf-inline", - "perf-literal", - "std", - "unicode", - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", - ], - rustlibs: [ - "libaho_corasick", - "liblazy_static", - "libmemchr", - "libquickcheck", - "librand", - "libregex", - "libregex_syntax", - ], -} +// dependent_library ["feature_list"] +// aho-corasick-0.7.15 "default,std" +// memchr-2.3.4 "default,std,use_std" +// regex-syntax-0.6.23 "default,unicode,unicode-age,unicode-bool,unicode-case,unicode-gencat,unicode-perl,unicode-script,unicode-segment" diff --git a/CHANGELOG.md b/CHANGELOG.md index 71d1963..f294972 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,67 +1,3 @@ -1.5.4 (2021-05-06) -================== -This release fixes another compilation failure when building regex. This time, -the fix is for when the `pattern` feature is enabled, which only works on -nightly Rust. CI has been updated to test this case. - -* [BUG #772](https://github.com/rust-lang/regex/pull/772): - Fix build when `pattern` feature is enabled. - - -1.5.3 (2021-05-01) -================== -This releases fixes a bug when building regex with only the `unicode-perl` -feature. It turns out that while CI was building this configuration, it wasn't -actually failing the overall build on a failed compilation. - -* [BUG #769](https://github.com/rust-lang/regex/issues/769): - Fix build in `regex-syntax` when only the `unicode-perl` feature is enabled. - - -1.5.2 (2021-05-01) -================== -This release fixes a performance bug when Unicode word boundaries are used. -Namely, for certain regexes on certain inputs, it's possible for the lazy DFA -to stop searching (causing a fallback to a slower engine) when it doesn't -actually need to. - -[PR #768](https://github.com/rust-lang/regex/pull/768) fixes the bug, which was -originally reported in -[ripgrep#1860](https://github.com/BurntSushi/ripgrep/issues/1860). - - -1.5.1 (2021-04-30) -================== -This is a patch release that fixes a compilation error when the `perf-literal` -feature is not enabled. - - -1.5.0 (2021-04-30) -================== -This release primarily updates to Rust 2018 (finally) and bumps the MSRV to -Rust 1.41 (from Rust 1.28). Rust 1.41 was chosen because it's still reasonably -old, and is what's in Debian stable at the time of writing. - -This release also drops this crate's own bespoke substring search algorithms -in favor of a new -[`memmem` implementation provided by the `memchr` crate](https://docs.rs/memchr/2.4.0/memchr/memmem/index.html). -This will change the performance profile of some regexes, sometimes getting a -little worse, and hopefully more frequently, getting a lot better. Please -report any serious performance regressions if you find them. - - -1.4.6 (2021-04-22) -================== -This is a small patch release that fixes the compiler's size check on how much -heap memory a regex uses. Previously, the compiler did not account for the -heap usage of Unicode character classes. Now it does. It's possible that this -may make some regexes fail to compile that previously did compile. If that -happens, please file an issue. - -* [BUG OSS-fuzz#33579](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579): - Some regexes can use more heap memory than one would expect. - - 1.4.5 (2021-03-14) ================== This is a small patch release that fixes a regression in the size of a `Regex` @@ -11,9 +11,8 @@ # will likely look very different (and much more reasonable) [package] -edition = "2018" name = "regex" -version = "1.5.4" +version = "1.4.5" authors = ["The Rust Project Developers"] exclude = ["/scripts/*", "/.github/*"] autotests = false @@ -73,15 +72,15 @@ path = "tests/test_backtrack_bytes.rs" name = "crates-regex" path = "tests/test_crates_regex.rs" [dependencies.aho-corasick] -version = "0.7.18" +version = "0.7.6" optional = true [dependencies.memchr] -version = "2.4.0" +version = "2.2.1" optional = true [dependencies.regex-syntax] -version = "0.6.25" +version = "0.6.22" default-features = false [dev-dependencies.lazy_static] version = "1" diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 468230b..4b9ca7f 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.5.4" #:version +version = "1.4.5" #:version authors = ["The Rust Project Developers"] license = "MIT OR Apache-2.0" readme = "README.md" @@ -14,7 +14,6 @@ finite automata and guarantees linear time matching on all inputs. categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] -edition = "2018" [workspace] members = [ @@ -106,18 +105,18 @@ pattern = [] # For very fast prefix literal matching. [dependencies.aho-corasick] -version = "0.7.18" +version = "0.7.6" optional = true # For skipping along search text quickly when a leading byte is known. [dependencies.memchr] -version = "2.4.0" +version = "2.2.1" optional = true # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.6.25" +version = "0.6.22" default-features = false [dev-dependencies] @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/regex/regex-1.5.4.crate" + value: "https://static.crates.io/crates/regex/regex-1.4.5.crate" } - version: "1.5.4" + version: "1.4.5" license_type: NOTICE last_upgrade_date { year: 2021 - month: 5 - day: 19 + month: 4 + day: 1 } } diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 8cd0d9c..b4aeb89 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -62,7 +62,9 @@ on how your program is structured. Thankfully, the [`lazy_static`](https://crates.io/crates/lazy_static) crate provides an answer that works well: - use lazy_static::lazy_static; + #[macro_use] extern crate lazy_static; + extern crate regex; + use regex::Regex; fn some_helper_function(text: &str) -> bool { @@ -145,9 +147,9 @@ In general, these are ordered from fastest to slowest. `is_match` is fastest because it doesn't actually need to find the start or the end of the leftmost-first match. It can quit immediately after it knows there is a match. For example, given the regex `a+` and the haystack, `aaaaa`, the -search will quit after examining the first byte. +search will quit after examing the first byte. -In contrast, `find` must return both the start and end location of the +In constrast, `find` must return both the start and end location of the leftmost-first match. It can use the DFA matcher for this, but must run it forwards once to find the end of the match *and then run it backwards* to find the start of the match. The two scans and the cost of finding the real end of @@ -196,7 +198,7 @@ a few examples of regexes that get literal prefixes detected: Literals in anchored regexes can also be used for detecting non-matches very quickly. For example, `^foo\w+` and `\w+foo$` may be able to detect a non-match -just by examining the first (or last) three bytes of the haystack. +just by examing the first (or last) three bytes of the haystack. ## Unicode word boundaries may prevent the DFA from being used @@ -9,7 +9,7 @@ by [RE2](https://github.com/google/re2). [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) -[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) +[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation @@ -27,7 +27,13 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -regex = "1.5" +regex = "1" +``` + +and this to your crate root (if you're using Rust 2015): + +```rust +extern crate regex; ``` Here's a simple example that matches a date in YYYY-MM-DD format and prints the @@ -222,7 +228,7 @@ The full set of features one can disable are ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.41.1`. +This crate's minimum supported `rustc` version is `1.28.0`. The current **tentative** policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if diff --git a/TEST_MAPPING b/TEST_MAPPING index 6064d70..ee45373 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -1,105 +1,14 @@ // Generated by update_crate_tests.py for tests that depend on this crate. { - "imports": [ - { - "path": "external/rust/crates/base64" - }, - { - "path": "external/rust/crates/libsqlite3-sys" - }, - { - "path": "external/rust/crates/once_cell" - }, - { - "path": "external/rust/crates/tinytemplate" - }, - { - "path": "external/rust/crates/tinyvec" - }, - { - "path": "external/rust/crates/unicode-xid" - } - ], "presubmit": [ { "name": "keystore2_test" }, { - "name": "legacykeystore_test" - }, - { - "name": "regex_test_src_lib" - }, - { - "name": "regex_test_tests_test_backtrack" - }, - { - "name": "regex_test_tests_test_backtrack_bytes" - }, - { - "name": "regex_test_tests_test_backtrack_utf8bytes" - }, - { - "name": "regex_test_tests_test_crates_regex" - }, - { - "name": "regex_test_tests_test_default" - }, - { - "name": "regex_test_tests_test_default_bytes" - }, - { - "name": "regex_test_tests_test_nfa" - }, - { - "name": "regex_test_tests_test_nfa_bytes" - }, - { - "name": "regex_test_tests_test_nfa_utf8bytes" - }, - { - "name": "virtualizationservice_device_test" - } - ], - "presubmit-rust": [ - { - "name": "keystore2_test" - }, - { - "name": "legacykeystore_test" - }, - { - "name": "regex_test_src_lib" - }, - { - "name": "regex_test_tests_test_backtrack" - }, - { - "name": "regex_test_tests_test_backtrack_bytes" - }, - { - "name": "regex_test_tests_test_backtrack_utf8bytes" - }, - { - "name": "regex_test_tests_test_crates_regex" - }, - { - "name": "regex_test_tests_test_default" - }, - { - "name": "regex_test_tests_test_default_bytes" - }, - { - "name": "regex_test_tests_test_nfa" - }, - { - "name": "regex_test_tests_test_nfa_bytes" - }, - { - "name": "regex_test_tests_test_nfa_utf8bytes" + "name": "vpnprofilestore_test" }, { - "name": "virtualizationservice_device_test" + "name": "libsqlite3-sys_device_test_src_lib" } ] } diff --git a/cargo2android.json b/cargo2android.json deleted file mode 100644 index 0e54308..0000000 --- a/cargo2android.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "apex-available": [ - "//apex_available:platform", - "com.android.compos", - "com.android.virt" - ], - "dependencies": true, - "device": true, - "run": true, - "tests": true -} diff --git a/examples/shootout-regex-dna-bytes.rs b/examples/shootout-regex-dna-bytes.rs index 773fd9b..ac4c115 100644 --- a/examples/shootout-regex-dna-bytes.rs +++ b/examples/shootout-regex-dna-bytes.rs @@ -5,6 +5,8 @@ // contributed by TeXitoi // contributed by BurntSushi +extern crate regex; + use std::io::{self, Read}; use std::sync::Arc; use std::thread; diff --git a/examples/shootout-regex-dna-cheat.rs b/examples/shootout-regex-dna-cheat.rs index 1bde7ab..c7395b7 100644 --- a/examples/shootout-regex-dna-cheat.rs +++ b/examples/shootout-regex-dna-cheat.rs @@ -10,6 +10,8 @@ // replacing them with a single linear scan. i.e., it re-implements // `replace_all`. As a result, this is around 25% faster. ---AG +extern crate regex; + use std::io::{self, Read}; use std::sync::Arc; use std::thread; diff --git a/examples/shootout-regex-dna-replace.rs b/examples/shootout-regex-dna-replace.rs index 20694e0..681e077 100644 --- a/examples/shootout-regex-dna-replace.rs +++ b/examples/shootout-regex-dna-replace.rs @@ -1,3 +1,5 @@ +extern crate regex; + use std::io::{self, Read}; macro_rules! regex { diff --git a/examples/shootout-regex-dna-single-cheat.rs b/examples/shootout-regex-dna-single-cheat.rs index 70a979c..04ed05a 100644 --- a/examples/shootout-regex-dna-single-cheat.rs +++ b/examples/shootout-regex-dna-single-cheat.rs @@ -5,6 +5,8 @@ // contributed by TeXitoi // contributed by BurntSushi +extern crate regex; + use std::io::{self, Read}; macro_rules! regex { diff --git a/examples/shootout-regex-dna-single.rs b/examples/shootout-regex-dna-single.rs index b474059..a70c711 100644 --- a/examples/shootout-regex-dna-single.rs +++ b/examples/shootout-regex-dna-single.rs @@ -5,6 +5,8 @@ // contributed by TeXitoi // contributed by BurntSushi +extern crate regex; + use std::io::{self, Read}; macro_rules! regex { diff --git a/examples/shootout-regex-dna.rs b/examples/shootout-regex-dna.rs index b96518e..4527422 100644 --- a/examples/shootout-regex-dna.rs +++ b/examples/shootout-regex-dna.rs @@ -5,6 +5,8 @@ // contributed by TeXitoi // contributed by BurntSushi +extern crate regex; + use std::io::{self, Read}; use std::sync::Arc; use std::thread; diff --git a/src/backtrack.rs b/src/backtrack.rs index a3d25d6..6100c17 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -16,10 +16,10 @@ // the bitset has to be zeroed on each execution, which becomes quite expensive // on large bitsets. -use crate::exec::ProgramCache; -use crate::input::{Input, InputAt}; -use crate::prog::{InstPtr, Program}; -use crate::re_trait::Slot; +use exec::ProgramCache; +use input::{Input, InputAt}; +use prog::{InstPtr, Program}; +use re_trait::Slot; type Bits = u32; @@ -196,7 +196,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { } fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { - use crate::prog::Inst::*; + use prog::Inst::*; loop { // This loop is an optimization to avoid constantly pushing/popping // from the stack. Namely, if we're pushing a job only to run it diff --git a/src/compile.rs b/src/compile.rs index 9a2ed5e..9ffd347 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -4,16 +4,16 @@ use std::iter; use std::result; use std::sync::Arc; -use regex_syntax::hir::{self, Hir}; -use regex_syntax::is_word_byte; -use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; +use syntax::hir::{self, Hir}; +use syntax::is_word_byte; +use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; -use crate::prog::{ +use prog::{ EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, InstSave, InstSplit, Program, }; -use crate::Error; +use Error; type Result = result::Result<Patch, Error>; type ResultOrEmpty = result::Result<Option<Patch>, Error>; @@ -38,7 +38,6 @@ pub struct Compiler { suffix_cache: SuffixCache, utf8_seqs: Option<Utf8Sequences>, byte_classes: ByteClassSet, - extra_inst_bytes: usize, } impl Compiler { @@ -55,7 +54,6 @@ impl Compiler { suffix_cache: SuffixCache::new(1000), utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), byte_classes: ByteClassSet::new(), - extra_inst_bytes: 0, } } @@ -255,8 +253,8 @@ impl Compiler { /// Ok(None) is returned when an expression is compiled to no /// instruction, and so no patch.entry value makes sense. fn c(&mut self, expr: &Hir) -> ResultOrEmpty { - use crate::prog; - use regex_syntax::hir::HirKind::*; + use prog; + use syntax::hir::HirKind::*; self.check_size()?; match *expr.kind() { @@ -318,13 +316,6 @@ impl Compiler { } self.compiled.has_unicode_word_boundary = true; self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); self.c_empty_look(prog::EmptyLook::WordBoundary) } WordBoundary(hir::WordBoundary::UnicodeNegate) => { @@ -337,8 +328,6 @@ impl Compiler { } self.compiled.has_unicode_word_boundary = true; self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. - self.byte_classes.set_range(0, 0x7F); self.c_empty_look(prog::EmptyLook::NotWordBoundary) } WordBoundary(hir::WordBoundary::Ascii) => { @@ -431,8 +420,6 @@ impl Compiler { } fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { - use std::mem::size_of; - assert!(!ranges.is_empty()); if self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) @@ -442,8 +429,6 @@ impl Compiler { let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { self.push_hole(InstHole::Char { c: ranges[0].0 }) } else { - self.extra_inst_bytes += - ranges.len() * (size_of::<char>() * 2); self.push_hole(InstHole::Ranges { ranges: ranges }) }; Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) @@ -563,7 +548,7 @@ impl Compiler { } fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - use regex_syntax::hir::RepetitionKind::*; + use syntax::hir::RepetitionKind::*; match rep.kind { ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), @@ -810,9 +795,7 @@ impl Compiler { fn check_size(&self) -> result::Result<(), Error> { use std::mem::size_of; - let size = - self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); - if size > self.size_limit { + if self.insts.len() * size_of::<Inst>() > self.size_limit { Err(Error::CompiledTooBig(self.size_limit)) } else { Ok(()) @@ -944,10 +927,9 @@ impl InstHole { Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) } InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), - InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { - goto: goto, - ranges: ranges.clone().into_boxed_slice(), - }), + InstHole::Ranges { ref ranges } => { + Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() }) + } InstHole::Bytes { start, end } => { Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) } @@ -42,9 +42,9 @@ use std::iter::repeat; use std::mem; use std::sync::Arc; -use crate::exec::ProgramCache; -use crate::prog::{Inst, Program}; -use crate::sparse::SparseSet; +use exec::ProgramCache; +use prog::{Inst, Program}; +use sparse::SparseSet; /// Return true if and only if the given program can be executed by a DFA. /// @@ -55,7 +55,7 @@ use crate::sparse::SparseSet; /// This function will also return false if the given program has any Unicode /// instructions (Char or Ranges) since the DFA operates on bytes only. pub fn can_exec(insts: &Program) -> bool { - use crate::prog::Inst::*; + use prog::Inst::*; // If for some reason we manage to allocate a regex program with more // than i32::MAX instructions, then we can't execute the DFA because we // use 32 bit instruction pointer deltas for memory savings. @@ -306,7 +306,7 @@ impl State { StateFlags(self.data[0]) } - fn inst_ptrs(&self) -> InstPtrs<'_> { + fn inst_ptrs(&self) -> InstPtrs { InstPtrs { base: 0, data: &self.data[1..] } } } @@ -894,7 +894,7 @@ impl<'a> Fsm<'a> { mut si: StatePtr, b: Byte, ) -> Option<StatePtr> { - use crate::prog::Inst::*; + use prog::Inst::*; // Initialize a queue with the current DFA state's NFA states. qcur.clear(); @@ -1056,8 +1056,8 @@ impl<'a> Fsm<'a> { q: &mut SparseSet, flags: EmptyFlags, ) { - use crate::prog::EmptyLook::*; - use crate::prog::Inst::*; + use prog::EmptyLook::*; + use prog::Inst::*; // We need to traverse the NFA to follow epsilon transitions, so avoid // recursion with an explicit stack. @@ -1190,7 +1190,7 @@ impl<'a> Fsm<'a> { q: &SparseSet, state_flags: &mut StateFlags, ) -> Option<State> { - use crate::prog::Inst::*; + use prog::Inst::*; // We need to build up enough information to recognize pre-built states // in the DFA. Generally speaking, this includes every instruction @@ -1754,7 +1754,7 @@ impl Byte { } impl fmt::Debug for State { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let ips: Vec<usize> = self.inst_ptrs().collect(); f.debug_struct("State") .field("flags", &self.flags()) @@ -1764,7 +1764,7 @@ impl fmt::Debug for State { } impl fmt::Debug for Transitions { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut fmtd = f.debug_map(); for si in 0..self.num_states() { let s = si * self.num_byte_classes; @@ -1778,7 +1778,7 @@ impl fmt::Debug for Transitions { struct TransitionsRow<'a>(&'a [StatePtr]); impl<'a> fmt::Debug for TransitionsRow<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut fmtd = f.debug_map(); for (b, si) in self.0.iter().enumerate() { match *si { @@ -1796,7 +1796,7 @@ impl<'a> fmt::Debug for TransitionsRow<'a> { } impl fmt::Debug for StateFlags { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("StateFlags") .field("is_match", &self.is_match()) .field("is_word", &self.is_word()) @@ -1889,6 +1889,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { #[cfg(test)] mod tests { + extern crate rand; use super::{ push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, diff --git a/src/error.rs b/src/error.rs index 3e0ec75..1c32c85 100644 --- a/src/error.rs +++ b/src/error.rs @@ -31,7 +31,7 @@ impl ::std::error::Error for Error { } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Error::Syntax(ref err) => err.fmt(f), Error::CompiledTooBig(limit) => write!( @@ -49,7 +49,7 @@ impl fmt::Display for Error { // but the `Syntax` variant is already storing a `String` anyway, so we might // as well format it nicely. impl fmt::Debug for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Error::Syntax(ref err) => { let hr: String = repeat('~').take(79).collect(); diff --git a/src/exec.rs b/src/exec.rs index d5fad1c..3d5a52b 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -5,26 +5,26 @@ use std::sync::Arc; #[cfg(feature = "perf-literal")] use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; -use regex_syntax::ParserBuilder; +use syntax::hir::literal::Literals; +use syntax::hir::Hir; +use syntax::ParserBuilder; -use crate::backtrack; -use crate::compile::Compiler; +use backtrack; +use compile::Compiler; #[cfg(feature = "perf-dfa")] -use crate::dfa; -use crate::error::Error; -use crate::input::{ByteInput, CharInput}; -use crate::literal::LiteralSearcher; -use crate::pikevm; -use crate::pool::{Pool, PoolGuard}; -use crate::prog::Program; -use crate::re_builder::RegexOptions; -use crate::re_bytes; -use crate::re_set; -use crate::re_trait::{Locations, RegularExpression, Slot}; -use crate::re_unicode; -use crate::utf8::next_utf8; +use dfa; +use error::Error; +use input::{ByteInput, CharInput}; +use literal::LiteralSearcher; +use pikevm; +use pool::{Pool, PoolGuard}; +use prog::Program; +use re_builder::RegexOptions; +use re_bytes; +use re_set; +use re_trait::{Locations, RegularExpression, Slot}; +use re_unicode; +use utf8::next_utf8; /// `Exec` manages the execution of a regular expression. /// @@ -140,7 +140,7 @@ impl ExecBuilder { /// /// Note that when compiling 2 or more regular expressions, capture groups /// are completely unsupported. (This means both `find` and `captures` - /// won't work.) + /// wont work.) pub fn new_many<I, S>(res: I) -> Self where S: AsRef<str>, @@ -373,6 +373,9 @@ impl ExecBuilder { AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostFirst) .auto_configure(&lits) + // We always want this to reduce size, regardless + // of what auto-configure does. + .byte_classes(true) .build_with_size::<u32, _, _>(&lits) // This should never happen because we'd long exceed the // compilation limit for regexes first. @@ -736,7 +739,7 @@ impl<'c> ExecNoSync<'c> { text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)> { - use crate::dfa::Result::*; + use dfa::Result::*; let end = match dfa::Fsm::forward( &self.ro.dfa, self.cache.value(), @@ -776,7 +779,7 @@ impl<'c> ExecNoSync<'c> { text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)> { - use crate::dfa::Result::*; + use dfa::Result::*; match dfa::Fsm::reverse( &self.ro.dfa_reverse, self.cache.value(), @@ -832,7 +835,7 @@ impl<'c> ExecNoSync<'c> { text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>> { - use crate::dfa::Result::*; + use dfa::Result::*; let lcs = self.ro.suffixes.lcs(); debug_assert!(lcs.len() >= 1); @@ -877,7 +880,7 @@ impl<'c> ExecNoSync<'c> { text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)> { - use crate::dfa::Result::*; + use dfa::Result::*; let match_start = match self.exec_dfa_reverse_suffix(text, start) { None => return self.find_dfa_forward(text, start), @@ -1260,7 +1263,7 @@ impl<'c> ExecNoSyncStr<'c> { impl Exec { /// Get a searcher that isn't Sync. #[cfg_attr(feature = "perf-inline", inline(always))] - pub fn searcher(&self) -> ExecNoSync<'_> { + pub fn searcher(&self) -> ExecNoSync { ExecNoSync { ro: &self.ro, // a clone is too expensive here! (and not needed) cache: self.pool.get(), @@ -1269,7 +1272,7 @@ impl Exec { /// Get a searcher that isn't Sync and can match on &str. #[cfg_attr(feature = "perf-inline", inline(always))] - pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { + pub fn searcher_str(&self) -> ExecNoSyncStr { ExecNoSyncStr(self.searcher()) } @@ -1547,7 +1550,7 @@ impl ProgramCacheInner { /// literals, and if so, returns them. Otherwise, this returns None. #[cfg(feature = "perf-literal")] fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { - use regex_syntax::hir::{HirKind, Literal}; + use syntax::hir::{HirKind, Literal}; // This is pretty hacky, but basically, if `is_alternation_literal` is // true, then we can make several assumptions about the structure of our @@ -1599,7 +1602,7 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { mod test { #[test] fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { - use crate::internal::ExecBuilder; + use internal::ExecBuilder; let backtrack_bytes_re = ExecBuilder::new("^S") .bounded_backtracking() @@ -1627,7 +1630,7 @@ mod test { #[test] fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { - use crate::internal::ExecBuilder; + use internal::ExecBuilder; let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") .bounded_backtracking() diff --git a/src/expand.rs b/src/expand.rs index fd9c2d0..70dbf91 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -1,12 +1,12 @@ use std::str; -use crate::find_byte::find_byte; +use find_byte::find_byte; -use crate::re_bytes; -use crate::re_unicode; +use re_bytes; +use re_unicode; pub fn expand_str( - caps: &re_unicode::Captures<'_>, + caps: &re_unicode::Captures, mut replacement: &str, dst: &mut String, ) { @@ -48,7 +48,7 @@ pub fn expand_str( } pub fn expand_bytes( - caps: &re_bytes::Captures<'_>, + caps: &re_bytes::Captures, mut replacement: &[u8], dst: &mut Vec<u8>, ) { @@ -125,7 +125,7 @@ impl From<usize> for Ref<'static> { /// starting at the beginning of `replacement`. /// /// If no such valid reference could be found, None is returned. -fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> { let mut i = 0; let rep: &[u8] = replacement.as_ref(); if rep.len() <= 1 || rep[0] != b'$' { @@ -157,7 +157,7 @@ fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { }) } -fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> { let start = i; while rep.get(i).map_or(false, |&b| b != b'}') { i += 1; diff --git a/src/input.rs b/src/input.rs index 5d50ee3..3afa2d0 100644 --- a/src/input.rs +++ b/src/input.rs @@ -4,9 +4,11 @@ use std::fmt; use std::ops; use std::u32; -use crate::literal::LiteralSearcher; -use crate::prog::InstEmptyLook; -use crate::utf8::{decode_last_utf8, decode_utf8}; +use syntax; + +use literal::LiteralSearcher; +use prog::InstEmptyLook; +use utf8::{decode_last_utf8, decode_utf8}; /// Represents a location in the input. #[derive(Clone, Copy, Debug)] @@ -173,7 +175,7 @@ impl<'t> Input for CharInput<'t> { } fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { - use crate::prog::EmptyLook::*; + use prog::EmptyLook::*; match empty.look { StartLine => { let c = self.previous_char(at); @@ -266,7 +268,7 @@ impl<'t> Input for ByteInput<'t> { } fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { - use crate::prog::EmptyLook::*; + use prog::EmptyLook::*; match empty.look { StartLine => { let c = self.previous_char(at); @@ -346,7 +348,7 @@ impl<'t> Input for ByteInput<'t> { pub struct Char(u32); impl fmt::Debug for Char { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match char::from_u32(self.0) { None => write!(f, "Empty"), Some(c) => write!(f, "{:?}", c), @@ -377,7 +379,7 @@ impl Char { // available. However, our compiler ensures that if a Unicode word // boundary is used, then the data must also be available. If it isn't, // then the compiler returns an error. - char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) + char::from_u32(self.0).map_or(false, syntax::is_word_character) } /// Returns true iff the byte is a word byte. @@ -385,7 +387,7 @@ impl Char { /// If the byte is absent, then false is returned. pub fn is_word_byte(self) -> bool { match char::from_u32(self.0) { - Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), + Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8), None | Some(_) => false, } } @@ -22,6 +22,12 @@ used by adding `regex` to your dependencies in your project's `Cargo.toml`. regex = "1" ``` +If you're using Rust 2015, then you'll also need to add it to your crate root: + +```rust +extern crate regex; +``` + # Example: find a date General use of regular expressions in this package involves compiling an @@ -62,7 +68,9 @@ regular expressions are compiled exactly once. For example: ```rust -use lazy_static::lazy_static; +#[macro_use] extern crate lazy_static; +extern crate regex; + use regex::Regex; fn some_helper_function(text: &str) -> bool { @@ -86,7 +94,7 @@ matches. For example, to find all dates in a string and be able to access them by their component pieces: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); let text = "2012-03-14, 2013-01-01 and 2014-07-05"; @@ -111,7 +119,7 @@ clearer, we can *name* our capture groups and use those names as variables in our replacement text: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap(); let before = "2012-03-14, 2013-01-01 and 2014-07-05"; @@ -128,7 +136,7 @@ Note that if your regex gets complicated, you can use the `x` flag to enable insignificant whitespace mode, which also lets you write comments: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"(?x) (?P<y>\d{4}) # the year @@ -209,7 +217,7 @@ Unicode scalar values. This means you can use Unicode characters directly in your expression: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"(?i)Δ+").unwrap(); let mat = re.find("ΔδΔ").unwrap(); @@ -236,7 +244,7 @@ of boolean properties are available as character classes. For example, you can match a sequence of numerals, Greek or Cherokee letters: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); @@ -383,7 +391,7 @@ Flags can be toggled within a pattern. Here's an example that matches case-insensitively for the first part but case-sensitively for the second part: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); let cap = re.captures("AaAaAbbBBBb").unwrap(); @@ -417,7 +425,7 @@ Here is an example that uses an ASCII word boundary instead of a Unicode word boundary: ```rust -# use regex::Regex; +# extern crate regex; use regex::Regex; # fn main() { let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); let cap = re.captures("$$abc$$").unwrap(); @@ -606,30 +614,38 @@ another matching engine with fixed memory requirements. */ #![deny(missing_docs)] +#![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] #![warn(missing_debug_implementations)] #[cfg(not(feature = "std"))] compile_error!("`std` feature is currently required to build this crate"); -// To check README's example -// TODO: Re-enable this once the MSRV is 1.43 or greater. -// See: https://github.com/rust-lang/regex/issues/684 -// See: https://github.com/rust-lang/regex/issues/685 +#[cfg(feature = "perf-literal")] +extern crate aho_corasick; +// #[cfg(doctest)] +// extern crate doc_comment; +#[cfg(feature = "perf-literal")] +extern crate memchr; +#[cfg(test)] +#[cfg_attr(feature = "perf-literal", macro_use)] +extern crate quickcheck; +extern crate regex_syntax as syntax; + // #[cfg(doctest)] // doc_comment::doctest!("../README.md"); #[cfg(feature = "std")] -pub use crate::error::Error; +pub use error::Error; #[cfg(feature = "std")] -pub use crate::re_builder::set_unicode::*; +pub use re_builder::set_unicode::*; #[cfg(feature = "std")] -pub use crate::re_builder::unicode::*; +pub use re_builder::unicode::*; #[cfg(feature = "std")] -pub use crate::re_set::unicode::*; +pub use re_set::unicode::*; #[cfg(feature = "std")] #[cfg(feature = "std")] -pub use crate::re_unicode::{ +pub use re_unicode::{ escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, SplitN, SubCaptureMatches, @@ -724,10 +740,10 @@ performance on `&str`. */ #[cfg(feature = "std")] pub mod bytes { - pub use crate::re_builder::bytes::*; - pub use crate::re_builder::set_bytes::*; - pub use crate::re_bytes::*; - pub use crate::re_set::bytes::*; + pub use re_builder::bytes::*; + pub use re_builder::set_bytes::*; + pub use re_bytes::*; + pub use re_set::bytes::*; } mod backtrack; @@ -738,6 +754,8 @@ mod error; mod exec; mod expand; mod find_byte; +#[cfg(feature = "perf-literal")] +mod freqs; mod input; mod literal; #[cfg(feature = "pattern")] @@ -759,9 +777,9 @@ mod utf8; #[doc(hidden)] #[cfg(feature = "std")] pub mod internal { - pub use crate::compile::Compiler; - pub use crate::exec::{Exec, ExecBuilder}; - pub use crate::input::{Char, CharInput, Input, InputAt}; - pub use crate::literal::LiteralSearcher; - pub use crate::prog::{EmptyLook, Inst, InstRanges, Program}; + pub use compile::Compiler; + pub use exec::{Exec, ExecBuilder}; + pub use input::{Char, CharInput, Input, InputAt}; + pub use literal::LiteralSearcher; + pub use prog::{EmptyLook, Inst, InstRanges, Program}; } diff --git a/src/literal/imp.rs b/src/literal/imp.rs index 82f050a..e4d04ed 100644 --- a/src/literal/imp.rs +++ b/src/literal/imp.rs @@ -1,8 +1,11 @@ +use std::cmp; use std::mem; use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; -use memchr::{memchr, memchr2, memchr3, memmem}; -use regex_syntax::hir::literal::{Literal, Literals}; +use memchr::{memchr, memchr2, memchr3}; +use syntax::hir::literal::{Literal, Literals}; + +use freqs::BYTE_FREQUENCIES; /// A prefix extracted from a compiled regular expression. /// @@ -12,8 +15,8 @@ use regex_syntax::hir::literal::{Literal, Literals}; #[derive(Clone, Debug)] pub struct LiteralSearcher { complete: bool, - lcp: Memmem, - lcs: Memmem, + lcp: FreqyPacked, + lcs: FreqyPacked, matcher: Matcher, } @@ -23,8 +26,10 @@ enum Matcher { Empty, /// A set of four or more single byte literals. Bytes(SingleByteSet), - /// A single substring, using vector accelerated routines when available. - Memmem(Memmem), + /// A single substring, find using memchr and frequency analysis. + FreqyPacked(FreqyPacked), + /// A single substring, find using Boyer-Moore. + BoyerMoore(BoyerMooreSearch), /// An Aho-Corasick automaton. AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, /// A packed multiple substring searcher, using SIMD. @@ -58,8 +63,8 @@ impl LiteralSearcher { let complete = lits.all_complete(); LiteralSearcher { complete: complete, - lcp: Memmem::new(lits.longest_common_prefix()), - lcs: Memmem::new(lits.longest_common_suffix()), + lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()), + lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()), matcher: matcher, } } @@ -81,7 +86,8 @@ impl LiteralSearcher { match self.matcher { Empty => Some((0, 0)), Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), - Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())), AC { ref ac, .. } => { ac.find(haystack).map(|m| (m.start(), m.end())) } @@ -118,23 +124,24 @@ impl LiteralSearcher { } /// Returns an iterator over all literals to be matched. - pub fn iter(&self) -> LiteralIter<'_> { + pub fn iter(&self) -> LiteralIter { match self.matcher { Matcher::Empty => LiteralIter::Empty, Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), - Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), + Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), + Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), } } /// Returns a matcher for the longest common prefix of this matcher. - pub fn lcp(&self) -> &Memmem { + pub fn lcp(&self) -> &FreqyPacked { &self.lcp } /// Returns a matcher for the longest common suffix of this matcher. - pub fn lcs(&self) -> &Memmem { + pub fn lcs(&self) -> &FreqyPacked { &self.lcs } @@ -149,7 +156,8 @@ impl LiteralSearcher { match self.matcher { Empty => 0, Bytes(ref sset) => sset.dense.len(), - Memmem(_) => 1, + FreqyPacked(_) => 1, + BoyerMoore(_) => 1, AC { ref ac, .. } => ac.pattern_count(), Packed { ref lits, .. } => lits.len(), } @@ -161,7 +169,8 @@ impl LiteralSearcher { match self.matcher { Empty => 0, Bytes(ref sset) => sset.approximate_size(), - Memmem(ref single) => single.approximate_size(), + FreqyPacked(ref single) => single.approximate_size(), + BoyerMoore(ref single) => single.approximate_size(), AC { ref ac, .. } => ac.heap_bytes(), Packed { ref s, .. } => s.heap_bytes(), } @@ -196,7 +205,12 @@ impl Matcher { return Matcher::Bytes(sset); } if lits.literals().len() == 1 { - return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + let lit = lits.literals()[0].to_vec(); + if BoyerMooreSearch::should_use(lit.as_slice()) { + return Matcher::BoyerMoore(BoyerMooreSearch::new(lit)); + } else { + return Matcher::FreqyPacked(FreqyPacked::new(lit)); + } } let pats = lits.literals().to_owned(); @@ -353,27 +367,116 @@ impl SingleByteSet { } } -/// A simple wrapper around the memchr crate's memmem implementation. +/// Provides an implementation of fast subtring search using frequency +/// analysis. /// -/// The API this exposes mirrors the API of previous substring searchers that -/// this supplanted. +/// memchr is so fast that we do everything we can to keep the loop in memchr +/// for as long as possible. The easiest way to do this is to intelligently +/// pick the byte to send to memchr. The best byte is the byte that occurs +/// least frequently in the haystack. Since doing frequency analysis on the +/// haystack is far too expensive, we compute a set of fixed frequencies up +/// front and hard code them in src/freqs.rs. Frequency analysis is done via +/// scripts/frequencies.py. #[derive(Clone, Debug)] -pub struct Memmem { - finder: memmem::Finder<'static>, +pub struct FreqyPacked { + /// The pattern. + pat: Vec<u8>, + /// The number of Unicode characters in the pattern. This is useful for + /// determining the effective length of a pattern when deciding which + /// optimizations to perform. A trailing incomplete UTF-8 sequence counts + /// as one character. char_len: usize, + /// The rarest byte in the pattern, according to pre-computed frequency + /// analysis. + rare1: u8, + /// The offset of the rarest byte in `pat`. + rare1i: usize, + /// The second rarest byte in the pattern, according to pre-computed + /// frequency analysis. (This may be equivalent to the rarest byte.) + /// + /// The second rarest byte is used as a type of guard for quickly detecting + /// a mismatch after memchr locates an instance of the rarest byte. This + /// is a hedge against pathological cases where the pre-computed frequency + /// analysis may be off. (But of course, does not prevent *all* + /// pathological cases.) + rare2: u8, + /// The offset of the second rarest byte in `pat`. + rare2i: usize, } -impl Memmem { - fn new(pat: &[u8]) -> Memmem { - Memmem { - finder: memmem::Finder::new(pat).into_owned(), - char_len: char_len_lossy(pat), +impl FreqyPacked { + fn new(pat: Vec<u8>) -> FreqyPacked { + if pat.is_empty() { + return FreqyPacked::empty(); + } + + // Find the rarest two bytes. Try to make them distinct (but it's not + // required). + let mut rare1 = pat[0]; + let mut rare2 = pat[0]; + for b in pat[1..].iter().cloned() { + if freq_rank(b) < freq_rank(rare1) { + rare1 = b; + } + } + for &b in &pat { + if rare1 == rare2 { + rare2 = b + } else if b != rare1 && freq_rank(b) < freq_rank(rare2) { + rare2 = b; + } + } + + // And find the offsets of their last occurrences. + let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap(); + let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap(); + + let char_len = char_len_lossy(&pat); + FreqyPacked { + pat: pat, + char_len: char_len, + rare1: rare1, + rare1i: rare1i, + rare2: rare2, + rare2i: rare2i, + } + } + + fn empty() -> FreqyPacked { + FreqyPacked { + pat: vec![], + char_len: 0, + rare1: 0, + rare1i: 0, + rare2: 0, + rare2i: 0, } } #[cfg_attr(feature = "perf-inline", inline(always))] pub fn find(&self, haystack: &[u8]) -> Option<usize> { - self.finder.find(haystack) + let pat = &*self.pat; + if haystack.len() < pat.len() || pat.is_empty() { + return None; + } + let mut i = self.rare1i; + while i < haystack.len() { + i += match memchr(self.rare1, &haystack[i..]) { + None => return None, + Some(i) => i, + }; + let start = i - self.rare1i; + let end = start + pat.len(); + if end > haystack.len() { + return None; + } + let aligned = &haystack[start..end]; + if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat { + return Some(start); + } + i += 1; + } + None } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -381,11 +484,11 @@ impl Memmem { if text.len() < self.len() { return false; } - &text[text.len() - self.len()..] == self.finder.needle() + text[text.len() - self.len()..] == *self.pat } pub fn len(&self) -> usize { - self.finder.needle().len() + self.pat.len() } pub fn char_len(&self) -> usize { @@ -393,10 +496,627 @@ impl Memmem { } fn approximate_size(&self) -> usize { - self.finder.needle().len() * mem::size_of::<u8>() + self.pat.len() * mem::size_of::<u8>() } } fn char_len_lossy(bytes: &[u8]) -> usize { String::from_utf8_lossy(bytes).chars().count() } + +/// An implementation of Tuned Boyer-Moore as laid out by +/// Andrew Hume and Daniel Sunday in "Fast String Searching". +/// O(n) in the size of the input. +/// +/// Fast string searching algorithms come in many variations, +/// but they can generally be described in terms of three main +/// components. +/// +/// The skip loop is where the string searcher wants to spend +/// as much time as possible. Exactly which character in the +/// pattern the skip loop examines varies from algorithm to +/// algorithm, but in the simplest case this loop repeated +/// looks at the last character in the pattern and jumps +/// forward in the input if it is not in the pattern. +/// Robert Boyer and J Moore called this the "fast" loop in +/// their original paper. +/// +/// The match loop is responsible for actually examining the +/// whole potentially matching substring. In order to fail +/// faster, the match loop sometimes has a guard test attached. +/// The guard test uses frequency analysis of the different +/// characters in the pattern to choose the least frequency +/// occurring character and use it to find match failures +/// as quickly as possible. +/// +/// The shift rule governs how the algorithm will shuffle its +/// test window in the event of a failure during the match loop. +/// Certain shift rules allow the worst-case run time of the +/// algorithm to be shown to be O(n) in the size of the input +/// rather than O(nm) in the size of the input and the size +/// of the pattern (as naive Boyer-Moore is). +/// +/// "Fast String Searching", in addition to presenting a tuned +/// algorithm, provides a comprehensive taxonomy of the many +/// different flavors of string searchers. Under that taxonomy +/// TBM, the algorithm implemented here, uses an unrolled fast +/// skip loop with memchr fallback, a forward match loop with guard, +/// and the mini Sunday's delta shift rule. To unpack that you'll have to +/// read the paper. +#[derive(Clone, Debug)] +pub struct BoyerMooreSearch { + /// The pattern we are going to look for in the haystack. + pattern: Vec<u8>, + + /// The skip table for the skip loop. + /// + /// Maps the character at the end of the input + /// to a shift. + skip_table: Vec<usize>, + + /// The guard character (least frequently occurring char). + guard: u8, + /// The reverse-index of the guard character in the pattern. + guard_reverse_idx: usize, + + /// Daniel Sunday's mini generalized delta2 shift table. + /// + /// We use a skip loop, so we only have to provide a shift + /// for the skip char (last char). This is why it is a mini + /// shift rule. + md2_shift: usize, +} + +impl BoyerMooreSearch { + /// Create a new string searcher, performing whatever + /// compilation steps are required. + fn new(pattern: Vec<u8>) -> Self { + debug_assert!(!pattern.is_empty()); + + let (g, gi) = Self::select_guard(pattern.as_slice()); + let skip_table = Self::compile_skip_table(pattern.as_slice()); + let md2_shift = Self::compile_md2_shift(pattern.as_slice()); + BoyerMooreSearch { + pattern: pattern, + skip_table: skip_table, + guard: g, + guard_reverse_idx: gi, + md2_shift: md2_shift, + } + } + + /// Find the pattern in `haystack`, returning the offset + /// of the start of the first occurrence of the pattern + /// in `haystack`. + #[inline] + fn find(&self, haystack: &[u8]) -> Option<usize> { + if haystack.len() < self.pattern.len() { + return None; + } + + let mut window_end = self.pattern.len() - 1; + + // Inspired by the grep source. It is a way + // to do correct loop unrolling without having to place + // a crashpad of terminating charicters at the end in + // the way described in the Fast String Searching paper. + const NUM_UNROLL: usize = 10; + // 1 for the initial position, and 1 for the md2 shift + let short_circut = (NUM_UNROLL + 2) * self.pattern.len(); + + if haystack.len() > short_circut { + // just 1 for the md2 shift + let backstop = + haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len()); + loop { + window_end = + match self.skip_loop(haystack, window_end, backstop) { + Some(i) => i, + None => return None, + }; + if window_end >= backstop { + break; + } + + if self.check_match(haystack, window_end) { + return Some(window_end - (self.pattern.len() - 1)); + } else { + let skip = self.skip_table[haystack[window_end] as usize]; + window_end += + if skip == 0 { self.md2_shift } else { skip }; + continue; + } + } + } + + // now process the input after the backstop + while window_end < haystack.len() { + let mut skip = self.skip_table[haystack[window_end] as usize]; + if skip == 0 { + if self.check_match(haystack, window_end) { + return Some(window_end - (self.pattern.len() - 1)); + } else { + skip = self.md2_shift; + } + } + window_end += skip; + } + + None + } + + fn len(&self) -> usize { + return self.pattern.len(); + } + + /// The key heuristic behind which the BoyerMooreSearch lives. + /// + /// See `rust-lang/regex/issues/408`. + /// + /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled + /// platform-specific memchr routine with a bit of frequency + /// analysis sprinkled on top actually wins most of the time. + /// However, there are a few cases where Tuned Boyer-Moore still + /// wins. + /// + /// If the haystack is random, frequency analysis doesn't help us, + /// so Boyer-Moore will win for sufficiently large needles. + /// Unfortunately, there is no obvious way to determine this + /// ahead of time. + /// + /// If the pattern itself consists of very common characters, + /// frequency analysis won't get us anywhere. The most extreme + /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately, + /// this case is wholly determined by the pattern, so we can actually + /// implement the heuristic. + /// + /// A third case is if the pattern is sufficiently long. The idea + /// here is that once the pattern gets long enough the Tuned + /// Boyer-Moore skip loop will start making strides long enough + /// to beat the asm deep magic that is memchr. + fn should_use(pattern: &[u8]) -> bool { + // The minimum pattern length required to use TBM. + const MIN_LEN: usize = 9; + // The minimum frequency rank (lower is rarer) that every byte in the + // pattern must have in order to use TBM. That is, if the pattern + // contains _any_ byte with a lower rank, then TBM won't be used. + const MIN_CUTOFF: usize = 150; + // The maximum frequency rank for any byte. + const MAX_CUTOFF: usize = 255; + // The scaling factor used to determine the actual cutoff frequency + // to use (keeping in mind that the minimum frequency rank is bounded + // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more + // likely to be used as the pattern grows longer. That is, longer + // patterns permit somewhat less frequent bytes than shorter patterns, + // under the assumption that TBM gets better as the pattern gets + // longer. + const LEN_CUTOFF_PROPORTION: usize = 4; + + let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION); + let cutoff = cmp::max( + MIN_CUTOFF, + MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank), + ); + // The pattern must be long enough to be worthwhile. e.g., memchr will + // be faster on `e` because it is short even though e is quite common. + pattern.len() > MIN_LEN + // all the bytes must be more common than the cutoff. + && pattern.iter().all(|c| freq_rank(*c) >= cutoff) + } + + /// Check to see if there is a match at the given position + #[inline] + fn check_match(&self, haystack: &[u8], window_end: usize) -> bool { + // guard test + if haystack[window_end - self.guard_reverse_idx] != self.guard { + return false; + } + + // match loop + let window_start = window_end - (self.pattern.len() - 1); + for i in 0..self.pattern.len() { + if self.pattern[i] != haystack[window_start + i] { + return false; + } + } + + true + } + + /// Skip forward according to the shift table. + /// + /// Returns the offset of the next occurrence + /// of the last char in the pattern, or the none + /// if it never reappears. If `skip_loop` hits the backstop + /// it will leave early. + #[inline] + fn skip_loop( + &self, + haystack: &[u8], + mut window_end: usize, + backstop: usize, + ) -> Option<usize> { + let window_end_snapshot = window_end; + let skip_of = |we: usize| -> usize { + // Unsafe might make this faster, but the benchmarks + // were hard to interpret. + self.skip_table[haystack[we] as usize] + }; + + loop { + let mut skip = skip_of(window_end); + window_end += skip; + skip = skip_of(window_end); + window_end += skip; + if skip != 0 { + skip = skip_of(window_end); + window_end += skip; + skip = skip_of(window_end); + window_end += skip; + skip = skip_of(window_end); + window_end += skip; + if skip != 0 { + skip = skip_of(window_end); + window_end += skip; + skip = skip_of(window_end); + window_end += skip; + skip = skip_of(window_end); + window_end += skip; + if skip != 0 { + skip = skip_of(window_end); + window_end += skip; + skip = skip_of(window_end); + window_end += skip; + + // If ten iterations did not make at least 16 words + // worth of progress, we just fall back on memchr. + if window_end - window_end_snapshot + > 16 * mem::size_of::<usize>() + { + // Returning a window_end >= backstop will + // immediatly break us out of the inner loop in + // `find`. + if window_end >= backstop { + return Some(window_end); + } + + continue; // we made enough progress + } else { + // In case we are already there, and so that + // we will catch the guard char. + window_end = window_end + .checked_sub(1 + self.guard_reverse_idx) + .unwrap_or(0); + + match memchr(self.guard, &haystack[window_end..]) { + None => return None, + Some(g_idx) => { + return Some( + window_end + + g_idx + + self.guard_reverse_idx, + ); + } + } + } + } + } + } + + return Some(window_end); + } + } + + /// Compute the ufast skip table. + fn compile_skip_table(pattern: &[u8]) -> Vec<usize> { + let mut tab = vec![pattern.len(); 256]; + + // For every char in the pattern, we write a skip + // that will line us up with the rightmost occurrence. + // + // N.B. the sentinel (0) is written by the last + // loop iteration. + for (i, c) in pattern.iter().enumerate() { + tab[*c as usize] = (pattern.len() - 1) - i; + } + + tab + } + + /// Select the guard character based off of the precomputed + /// frequency table. + fn select_guard(pattern: &[u8]) -> (u8, usize) { + let mut rarest = pattern[0]; + let mut rarest_rev_idx = pattern.len() - 1; + for (i, c) in pattern.iter().enumerate() { + if freq_rank(*c) < freq_rank(rarest) { + rarest = *c; + rarest_rev_idx = (pattern.len() - 1) - i; + } + } + + (rarest, rarest_rev_idx) + } + + /// If there is another occurrence of the skip + /// char, shift to it, otherwise just shift to + /// the next window. + fn compile_md2_shift(pattern: &[u8]) -> usize { + let shiftc = *pattern.last().unwrap(); + + // For a pattern of length 1 we will never apply the + // shift rule, so we use a poison value on the principle + // that failing fast is a good thing. + if pattern.len() == 1 { + return 0xDEADBEAF; + } + + let mut i = pattern.len() - 2; + while i > 0 { + if pattern[i] == shiftc { + return (pattern.len() - 1) - i; + } + i -= 1; + } + + // The skip char never re-occurs in the pattern, so + // we can just shift the whole window length. + pattern.len() - 1 + } + + fn approximate_size(&self) -> usize { + (self.pattern.len() * mem::size_of::<u8>()) + + (256 * mem::size_of::<usize>()) // skip table + } +} + +fn freq_rank(b: u8) -> usize { + BYTE_FREQUENCIES[b as usize] as usize +} + +#[cfg(test)] +mod tests { + use super::{BoyerMooreSearch, FreqyPacked}; + + // + // Unit Tests + // + + // The "hello, world" of string searching + #[test] + fn bm_find_subs() { + let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..])); + let haystack = b"I keep seeing patterns in this text"; + assert_eq!(14, searcher.find(haystack).unwrap()); + } + + #[test] + fn bm_find_no_subs() { + let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..])); + let haystack = b"I keep seeing needles in this text"; + assert_eq!(None, searcher.find(haystack)); + } + + // + // Regression Tests + // + + #[test] + fn bm_skip_reset_bug() { + let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0]; + let needle = vec![0, 1, 1, 0]; + + let searcher = BoyerMooreSearch::new(needle); + let offset = searcher.find(haystack.as_slice()).unwrap(); + assert_eq!(4, offset); + } + + #[test] + fn bm_backstop_underflow_bug() { + let haystack = vec![0, 0]; + let needle = vec![0, 0]; + + let searcher = BoyerMooreSearch::new(needle); + let offset = searcher.find(haystack.as_slice()).unwrap(); + assert_eq!(0, offset); + } + + #[test] + fn bm_naive_off_by_one_bug() { + let haystack = vec![91]; + let needle = vec![91]; + + let naive_offset = naive_find(&needle, &haystack).unwrap(); + assert_eq!(0, naive_offset); + } + + #[test] + fn bm_memchr_fallback_indexing_bug() { + let mut haystack = vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let needle = vec![1, 1, 1, 1, 32, 32, 87]; + let needle_start = haystack.len(); + haystack.extend(needle.clone()); + + let searcher = BoyerMooreSearch::new(needle); + assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap()); + } + + #[test] + fn bm_backstop_boundary() { + let haystack = b"\ +// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +e_data.clone_created(entity_id, entity_to_add.entity_id); +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +" + .to_vec(); + let needle = b"clone_created".to_vec(); + + let searcher = BoyerMooreSearch::new(needle); + let result = searcher.find(&haystack); + assert_eq!(Some(43), result); + } + + #[test] + fn bm_win_gnu_indexing_bug() { + let haystack_raw = vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let needle = vec![1, 1, 1, 1, 1, 1, 1]; + let haystack = haystack_raw.as_slice(); + + BoyerMooreSearch::new(needle.clone()).find(haystack); + } + + // + // QuickCheck Properties + // + + use quickcheck::TestResult; + + fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> { + assert!(needle.len() <= haystack.len()); + + for i in 0..(haystack.len() - (needle.len() - 1)) { + if haystack[i] == needle[0] + && &haystack[i..(i + needle.len())] == needle + { + return Some(i); + } + } + + None + } + + quickcheck! { + fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult { + if pile1.len() == 0 || pile2.len() == 0 { + return TestResult::discard(); + } + + let (needle, haystack) = if pile1.len() < pile2.len() { + (pile1, pile2.as_slice()) + } else { + (pile2, pile1.as_slice()) + }; + + let searcher = BoyerMooreSearch::new(needle.clone()); + TestResult::from_bool( + searcher.find(haystack) == naive_find(&needle, haystack)) + } + + fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult { + if pile1.len() == 0 || pile2.len() == 0 { + return TestResult::discard(); + } + + let (needle, haystack) = if pile1.len() < pile2.len() { + (pile1, pile2.as_slice()) + } else { + (pile2, pile1.as_slice()) + }; + + let bm_searcher = BoyerMooreSearch::new(needle.clone()); + let freqy_memchr = FreqyPacked::new(needle); + TestResult::from_bool( + bm_searcher.find(haystack) == freqy_memchr.find(haystack)) + } + + fn qc_bm_finds_trailing_needle( + haystack_pre: Vec<u8>, + needle: Vec<u8> + ) -> TestResult { + if needle.len() == 0 { + return TestResult::discard(); + } + + let mut haystack = haystack_pre.clone(); + let searcher = BoyerMooreSearch::new(needle.clone()); + + if haystack.len() >= needle.len() && + searcher.find(haystack.as_slice()).is_some() { + return TestResult::discard(); + } + + haystack.extend(needle.clone()); + + // What if the the tail of the haystack can start the + // needle? + let start = haystack_pre.len() + .checked_sub(needle.len()) + .unwrap_or(0); + for i in 0..(needle.len() - 1) { + if searcher.find(&haystack[(i + start)..]).is_some() { + return TestResult::discard(); + } + } + + TestResult::from_bool( + searcher.find(haystack.as_slice()) + .map(|x| x == haystack_pre.len()) + .unwrap_or(false)) + } + + // qc_equals_* is only testing the negative case as @burntsushi + // pointed out in https://github.com/rust-lang/regex/issues/446. + // This quickcheck prop represents an effort to force testing of + // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle + // already check some of the positive cases, but they don't cover + // cases where the needle is in the middle of haystack. This prop + // fills that hole. + fn qc_bm_finds_subslice( + haystack: Vec<u8>, + needle_start: usize, + needle_length: usize + ) -> TestResult { + if haystack.len() == 0 { + return TestResult::discard(); + } + + let needle_start = needle_start % haystack.len(); + let needle_length = needle_length % (haystack.len() - needle_start); + + if needle_length == 0 { + return TestResult::discard(); + } + + let needle = &haystack[needle_start..(needle_start + needle_length)]; + + let bm_searcher = BoyerMooreSearch::new(needle.to_vec()); + + let start = naive_find(&needle, &haystack); + match start { + None => TestResult::from_bool(false), + Some(nf_start) => + TestResult::from_bool( + nf_start <= needle_start + && bm_searcher.find(&haystack) == start + ) + } + } + + fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult { + if needle.len() == 0 { + return TestResult::discard(); + } + + let mut haystack = needle.clone(); + let searcher = BoyerMooreSearch::new(needle.clone()); + haystack.extend(needle); + + TestResult::from_bool( + searcher.find(haystack.as_slice()) + .map(|x| x == 0) + .unwrap_or(false)) + } + } +} diff --git a/src/literal/mod.rs b/src/literal/mod.rs index 980f523..783c63b 100644 --- a/src/literal/mod.rs +++ b/src/literal/mod.rs @@ -6,7 +6,7 @@ mod imp; #[allow(missing_docs)] #[cfg(not(feature = "perf-literal"))] mod imp { - use regex_syntax::hir::literal::Literals; + use syntax::hir::literal::Literals; #[derive(Clone, Debug)] pub struct LiteralSearcher(()); diff --git a/src/pattern.rs b/src/pattern.rs index b4ffd8e..e942667 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,8 +1,7 @@ use std::str::pattern::{Pattern, SearchStep, Searcher}; -use crate::re_unicode::{Matches, Regex}; +use re_unicode::{Matches, Regex}; -#[derive(Debug)] pub struct RegexSearcher<'r, 't> { haystack: &'t str, it: Matches<'r, 't>, diff --git a/src/pikevm.rs b/src/pikevm.rs index 9a14240..299087d 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -17,11 +17,11 @@ use std::mem; -use crate::exec::ProgramCache; -use crate::input::{Input, InputAt}; -use crate::prog::{InstPtr, Program}; -use crate::re_trait::Slot; -use crate::sparse::SparseSet; +use exec::ProgramCache; +use input::{Input, InputAt}; +use prog::{InstPtr, Program}; +use re_trait::Slot; +use sparse::SparseSet; /// An NFA simulation matching engine. #[derive(Debug)] @@ -231,7 +231,7 @@ impl<'r, I: Input> Fsm<'r, I> { at: InputAt, at_next: InputAt, ) -> bool { - use crate::prog::Inst::*; + use prog::Inst::*; match self.prog[ip] { Match(match_slot) => { if match_slot < matches.len() { @@ -300,7 +300,7 @@ impl<'r, I: Input> Fsm<'r, I> { // traverse the set of states. We only push to the stack when we // absolutely need recursion (restoring captures or following a // branch). - use crate::prog::Inst::*; + use prog::Inst::*; loop { // Don't visit states we've already added. if nlist.set.contains(ip) { diff --git a/src/pool.rs b/src/pool.rs index 6a6f15b..a506ee9 100644 --- a/src/pool.rs +++ b/src/pool.rs @@ -154,7 +154,7 @@ pub struct Pool<T> { unsafe impl<T: Send> Sync for Pool<T> {} impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> { - fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { f.debug_struct("Pool") .field("stack", &self.stack) .field("owner", &self.owner) @@ -168,7 +168,7 @@ impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> { /// The purpose of the guard is to use RAII to automatically put the value back /// in the pool once it's dropped. #[derive(Debug)] -pub struct PoolGuard<'a, T: Send> { +pub struct PoolGuard<'a, T: 'a + Send> { /// The pool that this guard is attached to. pool: &'a Pool<T>, /// This is None when the guard represents the special "owned" value. In @@ -193,7 +193,7 @@ impl<T: Send> Pool<T> { /// the value to go back into the pool) and then calling get again is NOT /// guaranteed to return the same value received in the first get call. #[cfg_attr(feature = "perf-inline", inline(always))] - pub fn get(&self) -> PoolGuard<'_, T> { + pub fn get(&self) -> PoolGuard<T> { // Our fast path checks if the caller is the thread that "owns" this // pool. Or stated differently, whether it is the first thread that // tried to extract a value from the pool. If it is, then we can return @@ -217,7 +217,7 @@ impl<T: Send> Pool<T> { /// /// If the pool has no owner, then this will set the owner. #[cold] - fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> { + fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<T> { use std::sync::atomic::Ordering::Relaxed; if owner == 0 { @@ -284,7 +284,7 @@ mod tests { #[test] fn oibits() { - use crate::exec::ProgramCache; + use exec::ProgramCache; fn has_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} has_oibits::<Pool<ProgramCache>>(); diff --git a/src/prog.rs b/src/prog.rs index 475a811..74e5f2f 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -6,8 +6,8 @@ use std::ops::Deref; use std::slice; use std::sync::Arc; -use crate::input::Char; -use crate::literal::LiteralSearcher; +use input::Char; +use literal::LiteralSearcher; /// `InstPtr` represents the index of an instruction in a regex program. pub type InstPtr = usize; @@ -168,7 +168,7 @@ impl Deref for Program { } impl fmt::Debug for Program { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Inst::*; fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { @@ -259,8 +259,8 @@ impl<'a> IntoIterator for &'a Program { /// /// Other than the benefit of moving invariants into the type system, another /// benefit is the decreased size. If we remove the `Char` and `Ranges` -/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to -/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` +/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to +/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges` /// variant.) Given that byte based machines are typically much bigger than /// their Unicode analogues (because they can decode UTF-8 directly), this ends /// up being a pretty significant savings. @@ -374,7 +374,7 @@ pub struct InstRanges { /// succeeds. pub goto: InstPtr, /// The set of Unicode scalar value ranges to test. - pub ranges: Box<[(char, char)]>, + pub ranges: Vec<(char, char)>, } impl InstRanges { @@ -432,16 +432,3 @@ impl InstBytes { self.start <= byte && byte <= self.end } } - -#[cfg(test)] -mod test { - #[test] - #[cfg(target_pointer_width = "64")] - fn test_size_of_inst() { - use std::mem::size_of; - - use super::Inst; - - assert_eq!(32, size_of::<Inst>()); - } -} diff --git a/src/re_builder.rs b/src/re_builder.rs index ee63836..fc140f8 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -37,10 +37,10 @@ macro_rules! define_builder { ($name:ident, $regex_mod:ident, $only_utf8:expr) => { pub mod $name { use super::RegexOptions; - use crate::error::Error; - use crate::exec::ExecBuilder; + use error::Error; + use exec::ExecBuilder; - use crate::$regex_mod::Regex; + use $regex_mod::Regex; /// A configurable builder for a regular expression. /// @@ -235,10 +235,10 @@ macro_rules! define_set_builder { ($name:ident, $regex_mod:ident, $only_utf8:expr) => { pub mod $name { use super::RegexOptions; - use crate::error::Error; - use crate::exec::ExecBuilder; + use error::Error; + use exec::ExecBuilder; - use crate::re_set::$regex_mod::RegexSet; + use re_set::$regex_mod::RegexSet; /// A configurable builder for a set of regular expressions. /// diff --git a/src/re_bytes.rs b/src/re_bytes.rs index ae55d6d..204a70a 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -6,13 +6,13 @@ use std::ops::{Index, Range}; use std::str::FromStr; use std::sync::Arc; -use crate::find_byte::find_byte; +use find_byte::find_byte; -use crate::error::Error; -use crate::exec::{Exec, ExecNoSync}; -use crate::expand::expand_bytes; -use crate::re_builder::bytes::RegexBuilder; -use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; +use error::Error; +use exec::{Exec, ExecNoSync}; +use expand::expand_bytes; +use re_builder::bytes::RegexBuilder; +use re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Match represents a single match of a regex in a haystack. /// @@ -79,14 +79,14 @@ pub struct Regex(Exec); impl fmt::Display for Regex { /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl fmt::Debug for Regex { /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fmt::Display::fmt(self, f) } } @@ -133,7 +133,7 @@ impl Regex { /// bytes: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"I categorically deny having triskaidekaphobia."; /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); @@ -156,7 +156,7 @@ impl Regex { /// ASCII word bytes: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"I categorically deny having triskaidekaphobia."; /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); @@ -177,7 +177,7 @@ impl Regex { /// word bytes: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"Retroactively relinquishing remunerations is reprehensible."; /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { @@ -205,7 +205,7 @@ impl Regex { /// year separately. /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; @@ -227,7 +227,7 @@ impl Regex { /// We can make this example a bit clearer by using *named* capture groups: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") /// .unwrap(); @@ -271,7 +271,7 @@ impl Regex { /// some text, where the movie is formatted like "'Title' (xxxx)": /// /// ```rust - /// # use std::str; use regex::bytes::Regex; + /// # extern crate regex; use std::str; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") /// .unwrap(); @@ -305,7 +305,7 @@ impl Regex { /// To split a string delimited by arbitrary amounts of spaces or tabs: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"[ \t]+").unwrap(); /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect(); @@ -331,7 +331,7 @@ impl Regex { /// Get the first two words in some text: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"\W+").unwrap(); /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect(); @@ -379,7 +379,7 @@ impl Regex { /// In typical usage, this can just be a normal byte string: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new("[^01]+").unwrap(); /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); @@ -392,7 +392,7 @@ impl Regex { /// group matches easily: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # use regex::bytes::Captures; fn main() { /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { @@ -411,7 +411,7 @@ impl Regex { /// with named capture groups: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); @@ -428,7 +428,7 @@ impl Regex { /// underscore: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]); @@ -445,7 +445,7 @@ impl Regex { /// byte string with `NoExpand`: /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// use regex::bytes::NoExpand; /// @@ -546,7 +546,7 @@ impl Regex { /// `a`. /// /// ```rust - /// # use regex::bytes::Regex; + /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"aaaaa"; /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); @@ -658,7 +658,7 @@ impl Regex { } /// Returns an iterator over the capture names. - pub fn capture_names(&self) -> CaptureNames<'_> { + pub fn capture_names(&self) -> CaptureNames { CaptureNames(self.0.capture_names().iter()) } @@ -990,15 +990,15 @@ impl<'t> Captures<'t> { } impl<'t> fmt::Debug for Captures<'t> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() } } -struct CapturesDebug<'c, 't>(&'c Captures<'t>); +struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>); impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn escape_bytes(bytes: &[u8]) -> String { let mut s = String::new(); for &b in bytes { @@ -1084,7 +1084,7 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and /// the lifetime `'t` corresponds to the originally matched text. #[derive(Clone, Debug)] -pub struct SubCaptureMatches<'c, 't> { +pub struct SubCaptureMatches<'c, 't: 'c> { caps: &'c Captures<'t>, it: SubCapturesPosIter<'c>, } @@ -1116,7 +1116,7 @@ pub trait Replacer { /// /// For example, a no-op replacement would be /// `dst.extend(&caps[0])`. - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>); + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>); /// Return a fixed unchanging replacement byte string. /// @@ -1159,10 +1159,10 @@ pub trait Replacer { /// /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). #[derive(Debug)] -pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); +pub struct ReplacerRef<'a, R: ?Sized + 'a>(&'a mut R); impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { self.0.replace_append(caps, dst) } fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { @@ -1171,56 +1171,56 @@ impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { } impl<'a> Replacer for &'a [u8] { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { caps.expand(*self, dst); } - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + fn no_expansion(&mut self) -> Option<Cow<[u8]>> { no_expansion(self) } } impl<'a> Replacer for &'a Vec<u8> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { caps.expand(*self, dst); } - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + fn no_expansion(&mut self) -> Option<Cow<[u8]>> { no_expansion(self) } } impl Replacer for Vec<u8> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { caps.expand(self, dst); } - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + fn no_expansion(&mut self) -> Option<Cow<[u8]>> { no_expansion(self) } } impl<'a> Replacer for Cow<'a, [u8]> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { caps.expand(self.as_ref(), dst); } - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + fn no_expansion(&mut self) -> Option<Cow<[u8]>> { no_expansion(self) } } impl<'a> Replacer for &'a Cow<'a, [u8]> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { caps.expand(self.as_ref(), dst); } - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + fn no_expansion(&mut self) -> Option<Cow<[u8]>> { no_expansion(self) } } -fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { +fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<[u8]>> { let s = t.as_ref(); match find_byte(b'$', s) { Some(_) => None, @@ -1230,10 +1230,10 @@ fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { impl<F, T> Replacer for F where - F: FnMut(&Captures<'_>) -> T, + F: FnMut(&Captures) -> T, T: AsRef<[u8]>, { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { dst.extend_from_slice((*self)(caps).as_ref()); } } @@ -1250,11 +1250,11 @@ where pub struct NoExpand<'t>(pub &'t [u8]); impl<'t> Replacer for NoExpand<'t> { - fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { + fn replace_append(&mut self, _: &Captures, dst: &mut Vec<u8>) { dst.extend_from_slice(self.0); } - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + fn no_expansion(&mut self) -> Option<Cow<[u8]>> { Some(Cow::Borrowed(self.0)) } } diff --git a/src/re_set.rs b/src/re_set.rs index 73d5953..5cb47ad 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -7,10 +7,10 @@ macro_rules! define_set { use std::slice; use std::vec; - use crate::error::Error; - use crate::exec::Exec; - use crate::re_builder::$builder_mod::RegexSetBuilder; - use crate::re_trait::RegularExpression; + use error::Error; + use exec::Exec; + use re_builder::$builder_mod::RegexSetBuilder; + use re_trait::RegularExpression; /// Match multiple (possibly overlapping) regular expressions in a single scan. /// @@ -292,7 +292,7 @@ impl SetMatches { /// This will always produces matches in ascending order of index, where /// the index corresponds to the index of the regex that matched with /// respect to its position when initially building the set. - pub fn iter(&self) -> SetMatchesIter<'_> { + pub fn iter(&self) -> SetMatchesIter { SetMatchesIter((&*self.matches).into_iter().enumerate()) } } @@ -405,7 +405,7 @@ impl From<Exec> for RegexSet { } impl fmt::Debug for RegexSet { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "RegexSet({:?})", self.0.regex_strings()) } } diff --git a/src/re_trait.rs b/src/re_trait.rs index 680aa54..ea6be9c 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -30,7 +30,7 @@ impl Locations { /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. - pub fn iter(&self) -> SubCapturesPosIter<'_> { + pub fn iter(&self) -> SubCapturesPosIter { SubCapturesPosIter { idx: 0, locs: self } } @@ -138,13 +138,13 @@ pub trait RegularExpression: Sized + fmt::Debug { /// Returns an iterator over all non-overlapping successive leftmost-first /// matches. - fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { + fn find_iter(self, text: &Self::Text) -> Matches<Self> { Matches { re: self, text: text, last_end: 0, last_match: None } } /// Returns an iterator over all non-overlapping successive leftmost-first /// matches with captures. - fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { + fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> { CaptureMatches(self.find_iter(text)) } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 142c78f..1b478cd 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -6,20 +6,21 @@ use std::ops::{Index, Range}; use std::str::FromStr; use std::sync::Arc; -use crate::find_byte::find_byte; +use find_byte::find_byte; +use syntax; -use crate::error::Error; -use crate::exec::{Exec, ExecNoSyncStr}; -use crate::expand::expand_str; -use crate::re_builder::unicode::RegexBuilder; -use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; +use error::Error; +use exec::{Exec, ExecNoSyncStr}; +use expand::expand_str; +use re_builder::unicode::RegexBuilder; +use re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Escapes all regular expression meta characters in `text`. /// /// The string returned may be safely used as a literal in a regular /// expression. pub fn escape(text: &str) -> String { - regex_syntax::escape(text) + syntax::escape(text) } /// Match represents a single match of a regex in a haystack. @@ -137,14 +138,14 @@ pub struct Regex(Exec); impl fmt::Display for Regex { /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl fmt::Debug for Regex { /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fmt::Display::fmt(self, f) } } @@ -188,7 +189,7 @@ impl Regex { /// Unicode word characters: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "I categorically deny having triskaidekaphobia."; /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); @@ -211,7 +212,7 @@ impl Regex { /// Unicode word characters: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "I categorically deny having triskaidekaphobia."; /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); @@ -233,7 +234,7 @@ impl Regex { /// word characters: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "Retroactively relinquishing remunerations is reprehensible."; /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { @@ -261,7 +262,7 @@ impl Regex { /// year separately. /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; @@ -283,7 +284,7 @@ impl Regex { /// We can make this example a bit clearer by using *named* capture groups: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") /// .unwrap(); @@ -327,7 +328,7 @@ impl Regex { /// some text, where the movie is formatted like "'Title' (xxxx)": /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") /// .unwrap(); @@ -360,7 +361,7 @@ impl Regex { /// To split a string delimited by arbitrary amounts of spaces or tabs: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"[ \t]+").unwrap(); /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); @@ -384,7 +385,7 @@ impl Regex { /// Get the first two words in some text: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"\W+").unwrap(); /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); @@ -431,7 +432,7 @@ impl Regex { /// In typical usage, this can just be a normal string: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new("[^01]+").unwrap(); /// assert_eq!(re.replace("1078910", ""), "1010"); @@ -444,7 +445,7 @@ impl Regex { /// capturing group matches easily: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # use regex::Captures; fn main() { /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { @@ -460,7 +461,7 @@ impl Regex { /// with named capture groups: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", "$first $last"); @@ -477,7 +478,7 @@ impl Regex { /// underscore: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); /// let result = re.replace("deep fried", "${first}_$second"); @@ -494,7 +495,7 @@ impl Regex { /// byte string with `NoExpand`: /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// use regex::NoExpand; /// @@ -604,7 +605,7 @@ impl Regex { /// `a`. /// /// ```rust - /// # use regex::Regex; + /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "aaaaa"; /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); @@ -716,7 +717,7 @@ impl Regex { } /// Returns an iterator over the capture names. - pub fn capture_names(&self) -> CaptureNames<'_> { + pub fn capture_names(&self) -> CaptureNames { CaptureNames(self.0.capture_names().iter()) } @@ -1000,15 +1001,15 @@ impl<'t> Captures<'t> { } impl<'t> fmt::Debug for Captures<'t> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() } } -struct CapturesDebug<'c, 't>(&'c Captures<'t>); +struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>); impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We'd like to show something nice here, even if it means an // allocation to build a reverse index. let slot_to_name: HashMap<&usize, &String> = @@ -1079,7 +1080,7 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and /// the lifetime `'t` corresponds to the originally matched text. #[derive(Clone, Debug)] -pub struct SubCaptureMatches<'c, 't> { +pub struct SubCaptureMatches<'c, 't: 'c> { caps: &'c Captures<'t>, it: SubCapturesPosIter<'c>, } @@ -1157,7 +1158,7 @@ pub trait Replacer { /// /// For example, a no-op replacement would be /// `dst.push_str(caps.get(0).unwrap().as_str())`. - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); + fn replace_append(&mut self, caps: &Captures, dst: &mut String); /// Return a fixed unchanging replacement string. /// @@ -1200,68 +1201,68 @@ pub trait Replacer { /// /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). #[derive(Debug)] -pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); +pub struct ReplacerRef<'a, R: ?Sized + 'a>(&'a mut R); impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { self.0.replace_append(caps, dst) } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { self.0.no_expansion() } } impl<'a> Replacer for &'a str { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { caps.expand(*self, dst); } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { no_expansion(self) } } impl<'a> Replacer for &'a String { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { self.as_str().replace_append(caps, dst) } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { no_expansion(self) } } impl Replacer for String { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { self.as_str().replace_append(caps, dst) } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { no_expansion(self) } } impl<'a> Replacer for Cow<'a, str> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { self.as_ref().replace_append(caps, dst) } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { no_expansion(self) } } impl<'a> Replacer for &'a Cow<'a, str> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { self.as_ref().replace_append(caps, dst) } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { no_expansion(self) } } -fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { +fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<str>> { let s = t.as_ref(); match find_byte(b'$', s.as_bytes()) { Some(_) => None, @@ -1271,10 +1272,10 @@ fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { impl<F, T> Replacer for F where - F: FnMut(&Captures<'_>) -> T, + F: FnMut(&Captures) -> T, T: AsRef<str>, { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { dst.push_str((*self)(caps).as_ref()); } } @@ -1291,11 +1292,11 @@ where pub struct NoExpand<'t>(pub &'t str); impl<'t> Replacer for NoExpand<'t> { - fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { + fn replace_append(&mut self, _: &Captures, dst: &mut String) { dst.push_str(self.0); } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + fn no_expansion(&mut self) -> Option<Cow<str>> { Some(Cow::Borrowed(self.0)) } } diff --git a/src/sparse.rs b/src/sparse.rs index 98b7266..421d6b6 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -62,7 +62,7 @@ impl SparseSet { } impl fmt::Debug for SparseSet { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "SparseSet({:?})", self.dense) } } @@ -1,7 +1,5 @@ #!/bin/bash -set -e - # This is a convenience script for running a broad swath of tests across # features. We don't test the complete space, since the complete space is quite # large. Hopefully once we migrate the test suite to better infrastructure diff --git a/tests/regression_fuzz.rs b/tests/regression_fuzz.rs index 4e76704..5f92ed0 100644 --- a/tests/regression_fuzz.rs +++ b/tests/regression_fuzz.rs @@ -17,15 +17,3 @@ fn fuzz1() { fn empty_any_errors_no_panic() { assert!(regex_new!(r"\P{any}").is_err()); } - -// This tests that a very large regex errors during compilation instead of -// using gratuitous amounts of memory. The specific problem is that the -// compiler wasn't accounting for the memory used by Unicode character classes -// correctly. -// -// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579 -#[test] -fn big_regex_fails_to_compile() { - let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}"; - assert!(regex_new!(pat).is_err()); -} diff --git a/tests/replace.rs b/tests/replace.rs index 1dc6106..700aff2 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -94,7 +94,7 @@ replace!( replace, r"([0-9]+)", "age: 26", - |captures: &Captures<'_>| { + |captures: &Captures| { match_text!(captures.get(1).unwrap())[0..1].to_owned() }, "age: 2" @@ -104,7 +104,7 @@ replace!( replace, r"[0-9]+", "age: 26", - |_captures: &Captures<'_>| t!("Z").to_owned(), + |_captures: &Captures| t!("Z").to_owned(), "age: Z" ); diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs index fb934e2..617185f 100644 --- a/tests/test_backtrack.rs +++ b/tests/test_backtrack.rs @@ -1,5 +1,8 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs index a59426c..17df4d8 100644 --- a/tests/test_backtrack_bytes.rs +++ b/tests/test_backtrack_bytes.rs @@ -1,3 +1,6 @@ +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs index 6d308e9..78a0135 100644 --- a/tests/test_backtrack_utf8bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -1,5 +1,8 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; diff --git a/tests/test_crates_regex.rs b/tests/test_crates_regex.rs index a681604..d697683 100644 --- a/tests/test_crates_regex.rs +++ b/tests/test_crates_regex.rs @@ -1,3 +1,6 @@ +extern crate quickcheck; +extern crate regex; + /* * This test is a minimal version of <rofl_0> and <subdiff_0> * diff --git a/tests/test_default.rs b/tests/test_default.rs index d4365fb..af634a0 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -1,6 +1,7 @@ #![cfg_attr(feature = "pattern", feature(pattern))] -use regex; +extern crate rand; +extern crate regex; // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs index f200596..e4a25dc 100644 --- a/tests/test_default_bytes.rs +++ b/tests/test_default_bytes.rs @@ -1,3 +1,6 @@ +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::bytes::Regex; diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs index e5a67d1..05dad23 100644 --- a/tests/test_nfa.rs +++ b/tests/test_nfa.rs @@ -1,5 +1,8 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs index 0a10e03..1042318 100644 --- a/tests/test_nfa_bytes.rs +++ b/tests/test_nfa_bytes.rs @@ -1,3 +1,6 @@ +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs index 36a572b..86487a1 100644 --- a/tests/test_nfa_utf8bytes.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -1,5 +1,8 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; +extern crate regex; + macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; |