summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Cronet Mainline Eng <cronet-mainline-eng+copybara@google.com> 2024-02-02 09:37:13 +0000
committer Chidera Olibie <colibie@google.com> 2024-02-02 09:53:24 +0000
commit 5cfdd35118d5a23349255971e97737e32895ec0f (patch)
tree f6b803e3a8bbddaf4814d1a43930799c3d7f4d8e
parent abce8a39488511c10b95ac52d1a3fdd2e886da83 (diff)
download cronet-upstream-import.tar.gz
Cronet 121.0.6167.71: import third_party/re2 (upstream-import)
Bug: b/322154153
FolderOrigin-RevId: /tmp/copybara-origin/src
Change-Id: Ic5f3b7c7578bf4e12b03944d863325abfc88853a
-rw-r--r-- third_party/re2/BUILD.gn 63
-rw-r--r-- third_party/re2/DEPS 7
-rw-r--r-- third_party/re2/DIR_METADATA 3
-rw-r--r-- third_party/re2/LICENSE 27
-rw-r--r-- third_party/re2/README.chromium 13
-rw-r--r-- third_party/re2/src/.bazelrc 23
-rwxr-xr-x third_party/re2/src/.github/bazel.sh 24
-rwxr-xr-x third_party/re2/src/.github/cmake.sh 12
-rw-r--r-- third_party/re2/src/.github/workflows/ci-bazel.yml 19
-rw-r--r-- third_party/re2/src/.github/workflows/ci-cmake.yml 60
-rw-r--r-- third_party/re2/src/.github/workflows/ci.yml 73
-rw-r--r-- third_party/re2/src/.github/workflows/pr.yml 26
-rw-r--r-- third_party/re2/src/.github/workflows/python.yml 222
-rw-r--r-- third_party/re2/src/AUTHORS 13
-rw-r--r-- third_party/re2/src/BUILD.bazel 400
-rw-r--r-- third_party/re2/src/CMakeLists.txt 263
-rw-r--r-- third_party/re2/src/CONTRIBUTING.md 2
-rw-r--r-- third_party/re2/src/CONTRIBUTORS 41
-rw-r--r-- third_party/re2/src/LICENSE 27
-rw-r--r-- third_party/re2/src/MODULE.bazel 27
-rw-r--r-- third_party/re2/src/Makefile 399
-rw-r--r-- third_party/re2/src/README 47
-rw-r--r-- third_party/re2/src/SECURITY.md 4
-rw-r--r-- third_party/re2/src/WORKSPACE.bazel 7
-rw-r--r-- third_party/re2/src/WORKSPACE.bzlmod 7
-rw-r--r-- third_party/re2/src/app/BUILD.bazel 24
-rw-r--r-- third_party/re2/src/app/_re2.cc 94
-rw-r--r-- third_party/re2/src/app/_re2.d.ts 23
-rw-r--r-- third_party/re2/src/app/app.ts 111
-rwxr-xr-x third_party/re2/src/app/build.sh 44
-rw-r--r-- third_party/re2/src/app/index.html 5
-rw-r--r-- third_party/re2/src/app/package.json 14
-rw-r--r-- third_party/re2/src/app/rollup.config.js 28
-rw-r--r-- third_party/re2/src/app/tsconfig.json 17
-rw-r--r-- third_party/re2/src/benchlog/benchlog.c2 2211
-rw-r--r-- third_party/re2/src/benchlog/benchlog.mini 582
-rw-r--r-- third_party/re2/src/benchlog/benchlog.r70 1475
-rw-r--r-- third_party/re2/src/benchlog/benchlog.wreck 1058
-rwxr-xr-x third_party/re2/src/benchlog/benchplot.py 98
-rwxr-xr-x third_party/re2/src/benchlog/mktable 155
-rwxr-xr-x third_party/re2/src/doc/mksyntaxgo 42
-rwxr-xr-x third_party/re2/src/doc/mksyntaxhtml 42
-rwxr-xr-x third_party/re2/src/doc/mksyntaxwiki 36
-rw-r--r-- third_party/re2/src/doc/syntax.html 477
-rw-r--r-- third_party/re2/src/doc/syntax.txt 463
-rwxr-xr-x third_party/re2/src/lib/git/commit-msg.hook 104
-rw-r--r-- third_party/re2/src/libre2.symbols 16
-rw-r--r-- third_party/re2/src/libre2.symbols.darwin 12
-rw-r--r-- third_party/re2/src/python/BUILD.bazel 36
l--------- third_party/re2/src/python/LICENSE 1
-rw-r--r-- third_party/re2/src/python/README 1
-rw-r--r-- third_party/re2/src/python/_re2.cc 338
-rw-r--r-- third_party/re2/src/python/re2.py 582
-rw-r--r-- third_party/re2/src/python/re2_test.py 482
-rw-r--r-- third_party/re2/src/python/setup.py 117
-rw-r--r-- third_party/re2/src/re2.pc.in 9
-rw-r--r-- third_party/re2/src/re2/bitmap256.cc 44
-rw-r--r-- third_party/re2/src/re2/bitmap256.h 86
-rw-r--r-- third_party/re2/src/re2/bitstate.cc 381
-rw-r--r-- third_party/re2/src/re2/compile.cc 1262
-rw-r--r-- third_party/re2/src/re2/dfa.cc 2132
-rw-r--r-- third_party/re2/src/re2/filtered_re2.cc 134
-rw-r--r-- third_party/re2/src/re2/filtered_re2.h 115
-rw-r--r-- third_party/re2/src/re2/fuzzing/re2_fuzzer.cc 282
-rwxr-xr-x third_party/re2/src/re2/make_perl_groups.pl 116
-rwxr-xr-x third_party/re2/src/re2/make_unicode_casefold.py 151
-rwxr-xr-x third_party/re2/src/re2/make_unicode_groups.py 117
-rw-r--r-- third_party/re2/src/re2/mimics_pcre.cc 196
-rw-r--r-- third_party/re2/src/re2/nfa.cc 710
-rw-r--r-- third_party/re2/src/re2/onepass.cc 621
-rw-r--r-- third_party/re2/src/re2/parse.cc 2479
-rw-r--r-- third_party/re2/src/re2/perl_groups.cc 119
-rw-r--r-- third_party/re2/src/re2/pod_array.h 55
-rw-r--r-- third_party/re2/src/re2/prefilter.cc 709
-rw-r--r-- third_party/re2/src/re2/prefilter.h 167
-rw-r--r-- third_party/re2/src/re2/prefilter_tree.cc 374
-rw-r--r-- third_party/re2/src/re2/prefilter_tree.h 152
-rw-r--r-- third_party/re2/src/re2/prog.cc 1174
-rw-r--r-- third_party/re2/src/re2/prog.h 466
-rw-r--r-- third_party/re2/src/re2/re2.cc 1345
-rw-r--r-- third_party/re2/src/re2/re2.h 1078
-rw-r--r-- third_party/re2/src/re2/regexp.cc 1002
-rw-r--r-- third_party/re2/src/re2/regexp.h 664
-rw-r--r-- third_party/re2/src/re2/set.cc 174
-rw-r--r-- third_party/re2/src/re2/set.h 86
-rw-r--r-- third_party/re2/src/re2/simplify.cc 685
-rw-r--r-- third_party/re2/src/re2/sparse_array.h 392
-rw-r--r-- third_party/re2/src/re2/sparse_set.h 264
-rw-r--r-- third_party/re2/src/re2/stringpiece.h 18
-rw-r--r-- third_party/re2/src/re2/testing/backtrack.cc 272
-rw-r--r-- third_party/re2/src/re2/testing/charclass_test.cc 228
-rw-r--r-- third_party/re2/src/re2/testing/compile_test.cc 428
-rw-r--r-- third_party/re2/src/re2/testing/dfa_test.cc 373
-rw-r--r-- third_party/re2/src/re2/testing/dump.cc 163
-rw-r--r-- third_party/re2/src/re2/testing/exhaustive1_test.cc 39
-rw-r--r-- third_party/re2/src/re2/testing/exhaustive2_test.cc 72
-rw-r--r-- third_party/re2/src/re2/testing/exhaustive3_test.cc 100
-rw-r--r-- third_party/re2/src/re2/testing/exhaustive_test.cc 36
-rw-r--r-- third_party/re2/src/re2/testing/exhaustive_tester.cc 195
-rw-r--r-- third_party/re2/src/re2/testing/exhaustive_tester.h 104
-rw-r--r-- third_party/re2/src/re2/testing/filtered_re2_test.cc 342
-rw-r--r-- third_party/re2/src/re2/testing/mimics_pcre_test.cc 78
-rw-r--r-- third_party/re2/src/re2/testing/null_walker.cc 49
-rw-r--r-- third_party/re2/src/re2/testing/parse_test.cc 528
-rw-r--r-- third_party/re2/src/re2/testing/possible_match_test.cc 248
-rw-r--r-- third_party/re2/src/re2/testing/random_test.cc 102
-rw-r--r-- third_party/re2/src/re2/testing/re2_arg_test.cc 183
-rw-r--r-- third_party/re2/src/re2/testing/re2_test.cc 1661
-rw-r--r-- third_party/re2/src/re2/testing/regexp_benchmark.cc 1569
-rw-r--r-- third_party/re2/src/re2/testing/regexp_generator.cc 280
-rw-r--r-- third_party/re2/src/re2/testing/regexp_generator.h 76
-rw-r--r-- third_party/re2/src/re2/testing/regexp_test.cc 86
-rw-r--r-- third_party/re2/src/re2/testing/required_prefix_test.cc 200
-rw-r--r-- third_party/re2/src/re2/testing/search_test.cc 335
-rw-r--r-- third_party/re2/src/re2/testing/set_test.cc 230
-rw-r--r-- third_party/re2/src/re2/testing/simplify_test.cc 290
-rw-r--r-- third_party/re2/src/re2/testing/string_generator.cc 141
-rw-r--r-- third_party/re2/src/re2/testing/string_generator.h 75
-rw-r--r-- third_party/re2/src/re2/testing/string_generator_test.cc 110
-rw-r--r-- third_party/re2/src/re2/testing/tester.cc 684
-rw-r--r-- third_party/re2/src/re2/testing/tester.h 121
-rw-r--r-- third_party/re2/src/re2/tostring.cc 350
-rw-r--r-- third_party/re2/src/re2/unicode.py 303
-rw-r--r-- third_party/re2/src/re2/unicode_casefold.cc 604
-rw-r--r-- third_party/re2/src/re2/unicode_casefold.h 77
-rw-r--r-- third_party/re2/src/re2/unicode_groups.cc 6517
-rw-r--r-- third_party/re2/src/re2/unicode_groups.h 66
-rw-r--r-- third_party/re2/src/re2/walker-inl.h 248
-rw-r--r-- third_party/re2/src/re2Config.cmake.in 28
-rwxr-xr-x third_party/re2/src/runtests 33
-rw-r--r-- third_party/re2/src/testinstall.cc 27
-rw-r--r-- third_party/re2/src/ucs2.diff 567
-rw-r--r-- third_party/re2/src/util/logging.h 109
-rw-r--r-- third_party/re2/src/util/malloc_counter.h 19
-rw-r--r-- third_party/re2/src/util/pcre.cc 956
-rw-r--r-- third_party/re2/src/util/pcre.h 671
-rw-r--r-- third_party/re2/src/util/rune.cc 260
-rw-r--r-- third_party/re2/src/util/strutil.cc 26
-rw-r--r-- third_party/re2/src/util/strutil.h 16
-rw-r--r-- third_party/re2/src/util/utf.h 44
140 files changed, 48575 insertions, 0 deletions
diff --git a/third_party/re2/BUILD.gn b/third_party/re2/BUILD.gn
new file mode 100644
index 000000000..046bf297e
--- /dev/null
+++ b/third_party/re2/BUILD.gn
@@ -0,0 +1,63 @@
+# Copyright 2014 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import("//testing/libfuzzer/fuzzer_test.gni")
+
+config("re2_config") {
+ include_dirs = [ "src" ]
+}
+
+static_library("re2") {
+ sources = [
+ "src/re2/bitmap256.cc",
+ "src/re2/bitmap256.h",
+ "src/re2/bitstate.cc",
+ "src/re2/compile.cc",
+ "src/re2/dfa.cc",
+ "src/re2/filtered_re2.cc",
+ "src/re2/filtered_re2.h",
+ "src/re2/mimics_pcre.cc",
+ "src/re2/nfa.cc",
+ "src/re2/onepass.cc",
+ "src/re2/parse.cc",
+ "src/re2/perl_groups.cc",
+ "src/re2/prefilter.cc",
+ "src/re2/prefilter.h",
+ "src/re2/prefilter_tree.cc",
+ "src/re2/prefilter_tree.h",
+ "src/re2/prog.cc",
+ "src/re2/prog.h",
+ "src/re2/re2.cc",
+ "src/re2/re2.h",
+ "src/re2/regexp.cc",
+ "src/re2/regexp.h",
+ "src/re2/set.cc",
+ "src/re2/set.h",
+ "src/re2/simplify.cc",
+ "src/re2/sparse_array.h",
+ "src/re2/sparse_set.h",
+ "src/re2/stringpiece.h",
+ "src/re2/tostring.cc",
+ "src/re2/unicode_casefold.cc",
+ "src/re2/unicode_casefold.h",
+ "src/re2/unicode_groups.cc",
+ "src/re2/unicode_groups.h",
+ "src/re2/walker-inl.h",
+ "src/util/logging.h",
+ "src/util/rune.cc",
+ "src/util/strutil.cc",
+ "src/util/strutil.h",
+ "src/util/utf.h",
+ ]
+
+ configs -= [ "//build/config/compiler:chromium_code" ]
+ configs += [ "//build/config/compiler:no_chromium_code" ]
+ public_configs = [ ":re2_config" ]
+ public_deps = [ "//third_party/abseil-cpp:absl" ]
+}
+
+fuzzer_test("third_party_re2_fuzzer") {
+ sources = [ "src/re2/fuzzing/re2_fuzzer.cc" ]
+ deps = [ ":re2" ]
+}
diff --git a/third_party/re2/DEPS b/third_party/re2/DEPS
new file mode 100644
index 000000000..82c266c5b
--- /dev/null
+++ b/third_party/re2/DEPS
@@ -0,0 +1,7 @@
+include_rules = [
+ '+base',
+ '+build',
+ '+re2',
+ '+utest',
+ '+util',
+]
diff --git a/third_party/re2/DIR_METADATA b/third_party/re2/DIR_METADATA
new file mode 100644
index 000000000..d366dc732
--- /dev/null
+++ b/third_party/re2/DIR_METADATA
@@ -0,0 +1,3 @@
+monorail: {
+ component: "Internals"
+}
diff --git a/third_party/re2/LICENSE b/third_party/re2/LICENSE
new file mode 100644
index 000000000..09e5ec1c7
--- /dev/null
+++ b/third_party/re2/LICENSE
@@ -0,0 +1,27 @@
+// Copyright (c) 2009 The RE2 Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/re2/README.chromium b/third_party/re2/README.chromium
new file mode 100644
index 000000000..77c90c2da
--- /dev/null
+++ b/third_party/re2/README.chromium
@@ -0,0 +1,13 @@
+Name: re2 - an efficient, principled regular expression library
+Short Name: re2
+URL: https://github.com/google/re2
+Version: 1e44e72d31ddc66b783a545e9d9fcaa876a146b7
+Date: 2023-05-31
+License: BSD 3-Clause
+License File: LICENSE
+Security Critical: yes
+Shipped: yes
+
+Description:
+RE2 is a fast, safe, thread-friendly alternative to backtracking regular
+expression engines like those used in PCRE, Perl, and Python.
diff --git a/third_party/re2/src/.bazelrc b/third_party/re2/src/.bazelrc
new file mode 100644
index 000000000..540fb5738
--- /dev/null
+++ b/third_party/re2/src/.bazelrc
@@ -0,0 +1,23 @@
+# Copyright 2022 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Enable Bzlmod. This will be the default eventually...
+build --enable_bzlmod
+# Enable layering check features. Useful on Clang only.
+build --features=layering_check
+# Enable parse headers features. Enforcing that headers are self-contained.
+build --features=parse_headers
+
+# Abseil requires C++14 at minimum.
+# Previously, the flag was set via `BAZEL_CXXOPTS`. On macOS, we also had to set
+# `BAZEL_USE_CPP_ONLY_TOOLCHAIN` since Bazel wouldn't respect the former without
+# the latter. However, the latter stopped Bazel from using Xcode and `-framework
+# Foundation`, which CCTZ (vendored into Abseil) requires.
+build --enable_platform_specific_config
+build:linux --cxxopt=-std=c++14
+build:macos --cxxopt=-std=c++14
+build:windows --cxxopt=/std:c++14
+
+# Print test logs for failed tests.
+test --test_output=errors
diff --git a/third_party/re2/src/.github/bazel.sh b/third_party/re2/src/.github/bazel.sh
new file mode 100755
index 000000000..7295ec6a8
--- /dev/null
+++ b/third_party/re2/src/.github/bazel.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -eux
+
+bazel clean
+bazel build --compilation_mode=dbg -- //:all
+bazel test --compilation_mode=dbg -- //:all \
+ -//:dfa_test \
+ -//:exhaustive1_test \
+ -//:exhaustive2_test \
+ -//:exhaustive3_test \
+ -//:exhaustive_test \
+ -//:random_test
+
+bazel clean
+bazel build --compilation_mode=opt -- //:all
+bazel test --compilation_mode=opt -- //:all \
+ -//:dfa_test \
+ -//:exhaustive1_test \
+ -//:exhaustive2_test \
+ -//:exhaustive3_test \
+ -//:exhaustive_test \
+ -//:random_test
+
+exit 0
diff --git a/third_party/re2/src/.github/cmake.sh b/third_party/re2/src/.github/cmake.sh
new file mode 100755
index 000000000..782334e81
--- /dev/null
+++ b/third_party/re2/src/.github/cmake.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -eux
+
+cmake . -D CMAKE_BUILD_TYPE=Debug -D RE2_BUILD_TESTING=ON "$@"
+cmake --build . --config Debug --clean-first
+ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random'
+
+cmake . -D CMAKE_BUILD_TYPE=Release -D RE2_BUILD_TESTING=ON "$@"
+cmake --build . --config Release --clean-first
+ctest -C Release --output-on-failure -E 'dfa|exhaustive|random'
+
+exit 0
diff --git a/third_party/re2/src/.github/workflows/ci-bazel.yml b/third_party/re2/src/.github/workflows/ci-bazel.yml
new file mode 100644
index 000000000..013b52ca4
--- /dev/null
+++ b/third_party/re2/src/.github/workflows/ci-bazel.yml
@@ -0,0 +1,19 @@
+name: CI (Bazel)
+on:
+ push:
+ branches: [main]
+jobs:
+ build:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [macos-latest, ubuntu-latest, windows-latest]
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ steps:
+ - uses: actions/checkout@v3
+ # TODO(junyer): Use `v2` whenever a new release is tagged.
+ - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51
+ - run: .github/bazel.sh
+ shell: bash
diff --git a/third_party/re2/src/.github/workflows/ci-cmake.yml b/third_party/re2/src/.github/workflows/ci-cmake.yml
new file mode 100644
index 000000000..d2d03afab
--- /dev/null
+++ b/third_party/re2/src/.github/workflows/ci-cmake.yml
@@ -0,0 +1,60 @@
+name: CI (CMake)
+on:
+ push:
+ branches: [main]
+jobs:
+ build-linux:
+ runs-on: ubuntu-latest
+ # The Benchmark package on Ubuntu 22.04 LTS is problematic whereas this
+ # Docker container is based on Debian bookworm and has a newer version.
+ container: gcc:13
+ strategy:
+ fail-fast: false
+ matrix:
+ build_shared_libs: [OFF, ON]
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install CMake
+ run: |
+ apt update -y
+ apt install -y cmake
+ shell: bash
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ apt update -y
+ apt install -y libabsl-dev libgtest-dev libbenchmark-dev
+ shell: bash
+ - run: .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }}
+ shell: bash
+ build-macos:
+ runs-on: macos-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ build_shared_libs: [OFF, ON]
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ brew update
+ brew install abseil googletest google-benchmark
+ shell: bash
+ - run: .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }}
+ shell: bash
+ build-windows:
+ runs-on: windows-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ build_shared_libs: [OFF, ON]
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ vcpkg update
+ vcpkg install abseil gtest benchmark
+ shell: bash
+ - run: |
+ .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }} \
+ -D CMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake
+ shell: bash
diff --git a/third_party/re2/src/.github/workflows/ci.yml b/third_party/re2/src/.github/workflows/ci.yml
new file mode 100644
index 000000000..44ac9dc29
--- /dev/null
+++ b/third_party/re2/src/.github/workflows/ci.yml
@@ -0,0 +1,73 @@
+name: CI
+on:
+ push:
+ branches: [main]
+jobs:
+ build-appleclang:
+ runs-on: macos-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ ver: [17, 20]
+ env:
+ CC: clang
+ CXX: clang++
+ # Unlike GCC and upstream Clang, AppleClang still defaults to `-std=c++98`
+ # for some reason. Also, the macOS image on GitHub Actions provides wildly
+ # numbered Xcode versions. Thus, rather than varying the compiler version,
+ # we set the `-std` flag explicitly in order to vary the language version.
+ # (The other two flags are the default provided for CXXFLAGS in Makefile.)
+ CXXFLAGS: -O3 -g -std=c++${{ matrix.ver }}
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ brew update
+ brew install abseil googletest google-benchmark
+ shell: bash
+ - run: make && make test
+ shell: bash
+ build-clang:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ ver: [15, 16, 17]
+ env:
+ CC: clang-${{ matrix.ver }}
+ CXX: clang++-${{ matrix.ver }}
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Clang ${{ matrix.ver }}
+ run: |
+ # Avoid `Conflicts: python3-lldb-x.y` between packages.
+ sudo apt purge -y python3-lldb-14
+ wget https://apt.llvm.org/llvm.sh
+ chmod +x ./llvm.sh
+ sudo ./llvm.sh ${{ matrix.ver }}
+ shell: bash
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ sudo apt update -y
+ sudo apt install -y libabsl-dev libgtest-dev libbenchmark-dev
+ shell: bash
+ - run: make && make test
+ shell: bash
+ build-gcc:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ ver: [11, 12, 13]
+ env:
+ CC: gcc-${{ matrix.ver }}
+ CXX: g++-${{ matrix.ver }}
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ sudo apt update -y
+ sudo apt install -y libabsl-dev libgtest-dev libbenchmark-dev
+ shell: bash
+ - run: make && make test
+ shell: bash
diff --git a/third_party/re2/src/.github/workflows/pr.yml b/third_party/re2/src/.github/workflows/pr.yml
new file mode 100644
index 000000000..860da6236
--- /dev/null
+++ b/third_party/re2/src/.github/workflows/pr.yml
@@ -0,0 +1,26 @@
+name: PR
+on:
+ pull_request_target:
+ branches: [main]
+ types: [opened]
+jobs:
+ close:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: actions/github-script@v6
+ with:
+ script: |
+ const fs = require('fs');
+ console.log(await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: fs.readFileSync('CONTRIBUTING.md', { encoding: 'utf8', }),
+ }));
+ console.log(await github.rest.pulls.update({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ pull_number: context.issue.number,
+ state: 'closed',
+ }));
diff --git a/third_party/re2/src/.github/workflows/python.yml b/third_party/re2/src/.github/workflows/python.yml
new file mode 100644
index 000000000..2680db24c
--- /dev/null
+++ b/third_party/re2/src/.github/workflows/python.yml
@@ -0,0 +1,222 @@
+name: Python
+on:
+ workflow_dispatch:
+ inputs:
+ build:
+ required: true
+ type: number
+jobs:
+ wheel-linux:
+ name: Linux ${{ matrix.os }}, ${{ matrix.arch.name }}, Python ${{ matrix.ver }}
+ runs-on: ${{ matrix.arch.runs-on }}
+ container:
+ image: quay.io/pypa/${{ matrix.os }}_${{ matrix.arch.python-name }}
+ # Don't run as root within the container.
+ # Neither Git nor Bazel appreciates that.
+ # 1001 is the GitHub Actions runner user.
+ options: --init --user 1001
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: X64, python-name: x86_64, runs-on: [ubuntu-latest] }
+ - { name: ARM64, python-name: aarch64, runs-on: [self-hosted, linux, arm64] }
+ os: [manylinux2014, manylinux_2_28]
+ ver: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ PYTHON: /usr/local/bin/python${{ matrix.ver }}
+ # Bazel fails if the username is unknown.
+ USER: runner
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ # TODO(junyer): Use `v2` whenever a new release is tagged.
+ - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51
+ - name: Prepare Python ${{ matrix.ver }} environment
+ run: |
+ "${PYTHON}" -m pip install --upgrade pip
+ "${PYTHON}" -m pip install --upgrade build wheel auditwheel
+ "${PYTHON}" -m pip install --upgrade absl-py
+ shell: bash
+ - name: Build wheel
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ "${PYTHON}" -m build --wheel
+ "${PYTHON}" -m auditwheel repair --wheel-dir=. dist/*
+ shell: bash
+ working-directory: python
+ - name: Test wheel
+ run: |
+ "${PYTHON}" -m pip install google_re2-*.whl
+ "${PYTHON}" re2_test.py
+ shell: bash
+ working-directory: python
+ - uses: actions/upload-artifact@v3
+ with:
+ name: ${{ hashFiles('python/google_re2-*.whl') }}
+ path: python/google_re2-*.whl
+ retention-days: 1
+ wheel-macos:
+ name: macOS ${{ matrix.os }}, ${{ matrix.arch.name }}, Python ${{ matrix.ver }}
+ runs-on: macos-${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: X64, bazel-name: x86_64, python-name: x86_64 }
+ - { name: ARM64, bazel-name: arm64, python-name: arm64 }
+ os: [11, 12, 13]
+ ver: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ BAZEL_CPU: darwin_${{ matrix.arch.bazel-name }}
+ PLAT_NAME: macosx-${{ matrix.os }}.0-${{ matrix.arch.python-name }}
+ # Stop macOS from reporting the system version as 10.x.
+ # Otherwise, Python refuses to install the built wheel!
+ SYSTEM_VERSION_COMPAT: 0
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ # TODO(junyer): Use `v2` whenever a new release is tagged.
+ - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.ver }}
+ - name: Prepare Python ${{ matrix.ver }} environment
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel delocate
+ python -m pip install --upgrade absl-py
+ shell: bash
+ - name: Build wheel
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ python -m build --wheel
+ python -m delocate.cmd.delocate_wheel --wheel-dir=. dist/*
+ shell: bash
+ working-directory: python
+ - if: matrix.arch.name == runner.arch
+ name: Test wheel
+ run: |
+ python -m pip install google_re2-*.whl
+ python re2_test.py
+ shell: bash
+ working-directory: python
+ - uses: actions/upload-artifact@v3
+ with:
+ name: ${{ hashFiles('python/google_re2-*.whl') }}
+ path: python/google_re2-*.whl
+ retention-days: 1
+ wheel-windows:
+ name: Windows, ${{ matrix.arch.name }}, Python ${{ matrix.ver }}
+ runs-on: windows-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: X86, bazel-name: x64_x86, python-name: win32 }
+ - { name: X64, bazel-name: x64, python-name: win_amd64 }
+ ver: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ BAZEL_CPU: ${{ matrix.arch.bazel-name }}_windows
+ PLAT_NAME: ${{ matrix.arch.python-name }}
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ # Avoid the Chocolatey install of Bazel getting in the way;
+ # `bazelbuild/setup-bazelisk` doesn't work for some reason.
+ - run: |
+ choco uninstall -y bazel
+ choco install -y bazelisk
+ shell: bash
+ # Lowercase the architecture name for `actions/setup-python`.
+ - run: |
+ ARCHITECTURE=${{ matrix.arch.name }}
+ echo "architecture=${ARCHITECTURE,,}" >> "${GITHUB_ENV}"
+ shell: bash
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.ver }}
+ architecture: ${{ env.architecture }}
+ - name: Prepare Python ${{ matrix.ver }} environment
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel delvewheel
+ python -m pip install --upgrade absl-py
+ shell: bash
+ - name: Build wheel
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ python -m build --wheel
+ python -m delvewheel repair --wheel-dir=. dist/*
+ shell: bash
+ working-directory: python
+ - name: Test wheel
+ run: |
+ python -m pip install google_re2-*.whl
+ python re2_test.py
+ shell: bash
+ working-directory: python
+ - uses: actions/upload-artifact@v3
+ with:
+ name: ${{ hashFiles('python/google_re2-*.whl') }}
+ path: python/google_re2-*.whl
+ retention-days: 1
+ publish:
+ needs:
+ - wheel-linux
+ - wheel-macos
+ - wheel-windows
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.x'
+ - name: Prepare Python 3.x environment
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel
+ shell: bash
+ - if: inputs.build == 1
+ name: Build source
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ python -m build --sdist
+ shell: bash
+ working-directory: python
+ - uses: actions/download-artifact@v3
+ with:
+ path: python
+ - name: Set build number to ${{ inputs.build }}
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ mkdir -p dist
+ for WHL in */google_re2-*.whl; do
+ python -m wheel unpack "${WHL}"
+ python -m wheel pack --dest-dir=dist --build-number=${{ inputs.build }} google_re2-*
+ rm -rf google_re2-*
+ done
+ shell: bash
+ working-directory: python
+ - if: inputs.build >= 1
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ password: ${{ secrets.PYPI_API_TOKEN }}
+ packages_dir: python/dist
diff --git a/third_party/re2/src/AUTHORS b/third_party/re2/src/AUTHORS
new file mode 100644
index 000000000..0754006fe
--- /dev/null
+++ b/third_party/re2/src/AUTHORS
@@ -0,0 +1,13 @@
+# This is the official list of RE2 authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+# Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Google Inc.
+Samsung Electronics
+Stefano Rivera <stefano.rivera@gmail.com>
diff --git a/third_party/re2/src/BUILD.bazel b/third_party/re2/src/BUILD.bazel
new file mode 100644
index 000000000..ffe56c0c5
--- /dev/null
+++ b/third_party/re2/src/BUILD.bazel
@@ -0,0 +1,400 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) BUILD file for RE2.
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+ name = "re2",
+ srcs = [
+ "re2/bitmap256.cc",
+ "re2/bitmap256.h",
+ "re2/bitstate.cc",
+ "re2/compile.cc",
+ "re2/dfa.cc",
+ "re2/filtered_re2.cc",
+ "re2/mimics_pcre.cc",
+ "re2/nfa.cc",
+ "re2/onepass.cc",
+ "re2/parse.cc",
+ "re2/perl_groups.cc",
+ "re2/pod_array.h",
+ "re2/prefilter.cc",
+ "re2/prefilter.h",
+ "re2/prefilter_tree.cc",
+ "re2/prefilter_tree.h",
+ "re2/prog.cc",
+ "re2/prog.h",
+ "re2/re2.cc",
+ "re2/regexp.cc",
+ "re2/regexp.h",
+ "re2/set.cc",
+ "re2/simplify.cc",
+ "re2/sparse_array.h",
+ "re2/sparse_set.h",
+ "re2/tostring.cc",
+ "re2/unicode_casefold.cc",
+ "re2/unicode_casefold.h",
+ "re2/unicode_groups.cc",
+ "re2/unicode_groups.h",
+ "re2/walker-inl.h",
+ "util/logging.h",
+ "util/rune.cc",
+ "util/strutil.cc",
+ "util/strutil.h",
+ "util/utf.h",
+ ],
+ hdrs = [
+ "re2/filtered_re2.h",
+ "re2/re2.h",
+ "re2/set.h",
+ "re2/stringpiece.h",
+ ],
+ copts = select({
+ # WebAssembly support for threads is... fraught at every level.
+ "@platforms//cpu:wasm32": [],
+ "@platforms//cpu:wasm64": [],
+ "@platforms//os:wasi": [],
+ "@platforms//os:windows": [],
+ "//conditions:default": ["-pthread"],
+ }),
+ linkopts = select({
+ # macOS doesn't need `-pthread' when linking and it appears that
+ # older versions of Clang will warn about the unused command line
+ # argument, so just don't pass it.
+ "@platforms//os:macos": [],
+ # WebAssembly support for threads is... fraught at every level.
+ "@platforms//cpu:wasm32": [],
+ "@platforms//cpu:wasm64": [],
+ "@platforms//os:wasi": [],
+ "@platforms//os:windows": [],
+ "//conditions:default": ["-pthread"],
+ }),
+ visibility = ["//visibility:public"],
+ deps = [
+ "@com_google_absl//absl/base",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/container:fixed_array",
+ "@com_google_absl//absl/container:flat_hash_map",
+ "@com_google_absl//absl/container:flat_hash_set",
+ "@com_google_absl//absl/container:inlined_vector",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/types:optional",
+ "@com_google_absl//absl/types:span",
+ ],
+)
+
+cc_library(
+ name = "testing",
+ testonly = 1,
+ srcs = [
+ "re2/testing/backtrack.cc",
+ "re2/testing/dump.cc",
+ "re2/testing/exhaustive_tester.cc",
+ "re2/testing/null_walker.cc",
+ "re2/testing/regexp_generator.cc",
+ "re2/testing/string_generator.cc",
+ "re2/testing/tester.cc",
+ "util/pcre.cc",
+ ],
+ hdrs = [
+ "re2/testing/exhaustive_tester.h",
+ "re2/testing/regexp_generator.h",
+ "re2/testing/string_generator.h",
+ "re2/testing/tester.h",
+ "util/malloc_counter.h",
+ "util/pcre.h",
+
+ # Exposed for testing only.
+ "re2/bitmap256.h",
+ "re2/pod_array.h",
+ "re2/prefilter.h",
+ "re2/prefilter_tree.h",
+ "re2/prog.h",
+ "re2/regexp.h",
+ "re2/sparse_array.h",
+ "re2/sparse_set.h",
+ "re2/unicode_casefold.h",
+ "re2/unicode_groups.h",
+ "re2/walker-inl.h",
+ "util/logging.h",
+ "util/strutil.h",
+ "util/utf.h",
+ ],
+ visibility = [":__subpackages__"],
+ deps = [
+ ":re2",
+ "@com_google_absl//absl/base",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ ],
+)
+
+cc_test(
+ name = "charclass_test",
+ size = "small",
+ srcs = ["re2/testing/charclass_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "compile_test",
+ size = "small",
+ srcs = ["re2/testing/compile_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "filtered_re2_test",
+ size = "small",
+ srcs = ["re2/testing/filtered_re2_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "mimics_pcre_test",
+ size = "small",
+ srcs = ["re2/testing/mimics_pcre_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "parse_test",
+ size = "small",
+ srcs = ["re2/testing/parse_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "possible_match_test",
+ size = "small",
+ srcs = ["re2/testing/possible_match_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "re2_arg_test",
+ size = "small",
+ srcs = ["re2/testing/re2_arg_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "re2_test",
+ size = "small",
+ srcs = ["re2/testing/re2_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "regexp_test",
+ size = "small",
+ srcs = ["re2/testing/regexp_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "required_prefix_test",
+ size = "small",
+ srcs = ["re2/testing/required_prefix_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "search_test",
+ size = "small",
+ srcs = ["re2/testing/search_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "set_test",
+ size = "small",
+ srcs = ["re2/testing/set_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "simplify_test",
+ size = "small",
+ srcs = ["re2/testing/simplify_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "string_generator_test",
+ size = "small",
+ srcs = ["re2/testing/string_generator_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "dfa_test",
+ size = "large",
+ srcs = ["re2/testing/dfa_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive1_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive1_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive2_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive2_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive3_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive3_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "random_test",
+ size = "large",
+ srcs = ["re2/testing/random_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_binary(
+ name = "regexp_benchmark",
+ testonly = 1,
+ srcs = ["re2/testing/regexp_benchmark.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/container:flat_hash_map",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/synchronization",
+ "@google_benchmark//:benchmark_main",
+ ],
+)
diff --git a/third_party/re2/src/CMakeLists.txt b/third_party/re2/src/CMakeLists.txt
new file mode 100644
index 000000000..bdac5afd6
--- /dev/null
+++ b/third_party/re2/src/CMakeLists.txt
@@ -0,0 +1,263 @@
+# Copyright 2015 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md
+cmake_minimum_required(VERSION 3.13)
+
+project(RE2 CXX)
+include(CMakePackageConfigHelpers)
+include(CTest)
+include(GNUInstallDirs)
+
+option(BUILD_SHARED_LIBS "build shared libraries" OFF)
+option(RE2_USE_ICU "build against ICU for full Unicode properties support" OFF)
+
+# For historical reasons, this is just "USEPCRE", not "RE2_USE_PCRE".
+option(USEPCRE "build against PCRE for testing and benchmarking" OFF)
+
+# See https://groups.google.com/g/re2-dev/c/P6_NM0YIWvA for details.
+# This has no effect unless RE2 is being built for an Apple platform
+# such as macOS or iOS.
+option(RE2_BUILD_FRAMEWORK "build RE2 as a framework" OFF)
+
+# CMake seems to have no way to enable/disable testing per subproject,
+# so we provide an option similar to BUILD_TESTING, but just for RE2.
+option(RE2_BUILD_TESTING "enable testing for RE2" OFF)
+
+# The pkg-config Requires: field.
+set(REQUIRES)
+
+# ABI version
+# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
+set(SONAME 11)
+
+set(EXTRA_TARGET_LINK_LIBRARIES)
+
+if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+ if(MSVC_VERSION LESS 1920)
+ message(FATAL_ERROR "you need Visual Studio 2019 or later")
+ endif()
+ if(BUILD_SHARED_LIBS)
+ set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+ endif()
+ # CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX,
+ # so we disable various warnings that aren't particularly helpful.
+ add_compile_options(/wd4100 /wd4201 /wd4456 /wd4457 /wd4702 /wd4815)
+ # Without a byte order mark (BOM), Visual Studio assumes that the source
+ # file is encoded using the current user code page, so we specify UTF-8.
+ add_compile_options(/utf-8)
+endif()
+
+if(WIN32)
+ add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX)
+ add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS)
+endif()
+
+if(UNIX)
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+ find_package(Threads REQUIRED)
+endif()
+
+set(ABSL_DEPS
+ absl_base
+ absl_core_headers
+ absl_fixed_array
+ absl_flags
+ absl_flat_hash_map
+ absl_flat_hash_set
+ absl_inlined_vector
+ absl_optional
+ absl_span
+ absl_str_format
+ absl_strings
+ absl_synchronization
+ )
+
+# If a top-level project has called add_subdirectory(abseil-cpp) already (possibly
+# indirectly), let that take precedence over any copy of Abseil that might have
+# been installed on the system. And likewise for ICU, GoogleTest and Benchmark.
+if(NOT TARGET absl::base)
+ find_package(absl REQUIRED)
+endif()
+list(APPEND REQUIRES ${ABSL_DEPS})
+
+if(RE2_USE_ICU)
+ if(NOT TARGET ICU::uc)
+ find_package(ICU REQUIRED COMPONENTS uc)
+ endif()
+ add_definitions(-DRE2_USE_ICU)
+ list(APPEND REQUIRES icu-uc)
+endif()
+
+if(USEPCRE)
+ add_definitions(-DUSEPCRE)
+ list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre)
+endif()
+
+list(JOIN REQUIRES " " REQUIRES)
+
+set(RE2_SOURCES
+ re2/bitmap256.cc
+ re2/bitstate.cc
+ re2/compile.cc
+ re2/dfa.cc
+ re2/filtered_re2.cc
+ re2/mimics_pcre.cc
+ re2/nfa.cc
+ re2/onepass.cc
+ re2/parse.cc
+ re2/perl_groups.cc
+ re2/prefilter.cc
+ re2/prefilter_tree.cc
+ re2/prog.cc
+ re2/re2.cc
+ re2/regexp.cc
+ re2/set.cc
+ re2/simplify.cc
+ re2/tostring.cc
+ re2/unicode_casefold.cc
+ re2/unicode_groups.cc
+ util/rune.cc
+ util/strutil.cc
+ )
+
+set(RE2_HEADERS
+ re2/filtered_re2.h
+ re2/re2.h
+ re2/set.h
+ re2/stringpiece.h
+ )
+
+add_library(re2 ${RE2_SOURCES})
+target_compile_features(re2 PUBLIC cxx_std_14)
+target_include_directories(re2 PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+# CMake gives "set_target_properties called with incorrect number of arguments."
+# errors if we don't quote ${RE2_HEADERS}, so quote it despite prevailing style.
+set_target_properties(re2 PROPERTIES PUBLIC_HEADER "${RE2_HEADERS}")
+set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0)
+add_library(re2::re2 ALIAS re2)
+
+if(APPLE AND RE2_BUILD_FRAMEWORK)
+ set_target_properties(re2 PROPERTIES
+ FRAMEWORK TRUE
+ FRAMEWORK_VERSION A
+ MACOSX_FRAMEWORK_IDENTIFIER com.googlesource.code.re2)
+endif()
+
+if(UNIX)
+ target_link_libraries(re2 PUBLIC Threads::Threads)
+endif()
+
+foreach(dep ${ABSL_DEPS})
+ string(REGEX REPLACE "^absl_" "absl::" dep ${dep})
+ target_link_libraries(re2 PUBLIC ${dep})
+endforeach()
+
+if(RE2_USE_ICU)
+ target_link_libraries(re2 PUBLIC ICU::uc)
+endif()
+
+if(RE2_BUILD_TESTING)
+ if(NOT TARGET GTest::gtest)
+ find_package(GTest REQUIRED)
+ endif()
+ if(NOT TARGET benchmark::benchmark)
+ find_package(benchmark REQUIRED)
+ endif()
+
+ set(TESTING_SOURCES
+ re2/testing/backtrack.cc
+ re2/testing/dump.cc
+ re2/testing/exhaustive_tester.cc
+ re2/testing/null_walker.cc
+ re2/testing/regexp_generator.cc
+ re2/testing/string_generator.cc
+ re2/testing/tester.cc
+ util/pcre.cc
+ )
+
+ add_library(testing ${TESTING_SOURCES})
+ if(BUILD_SHARED_LIBS AND WIN32)
+ target_compile_definitions(testing PRIVATE -DRE2_BUILD_TESTING_DLL)
+ endif()
+ target_compile_features(testing PUBLIC cxx_std_14)
+ target_link_libraries(testing PUBLIC re2 GTest::gtest)
+
+ set(TEST_TARGETS
+ charclass_test
+ compile_test
+ filtered_re2_test
+ mimics_pcre_test
+ parse_test
+ possible_match_test
+ re2_test
+ re2_arg_test
+ regexp_test
+ required_prefix_test
+ search_test
+ set_test
+ simplify_test
+ string_generator_test
+
+ dfa_test
+ exhaustive1_test
+ exhaustive2_test
+ exhaustive3_test
+ exhaustive_test
+ random_test
+ )
+
+ set(BENCHMARK_TARGETS
+ regexp_benchmark
+ )
+
+ foreach(target ${TEST_TARGETS})
+ add_executable(${target} re2/testing/${target}.cc)
+ if(BUILD_SHARED_LIBS AND WIN32)
+ target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL)
+ endif()
+ target_compile_features(${target} PUBLIC cxx_std_14)
+ target_link_libraries(${target} PUBLIC testing GTest::gtest_main ${EXTRA_TARGET_LINK_LIBRARIES})
+ add_test(NAME ${target} COMMAND ${target})
+ endforeach()
+
+ foreach(target ${BENCHMARK_TARGETS})
+ add_executable(${target} re2/testing/${target}.cc)
+ if(BUILD_SHARED_LIBS AND WIN32)
+ target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL)
+ endif()
+ target_compile_features(${target} PUBLIC cxx_std_14)
+ target_link_libraries(${target} PUBLIC testing benchmark::benchmark_main ${EXTRA_TARGET_LINK_LIBRARIES})
+ endforeach()
+endif()
+
+install(TARGETS re2
+ EXPORT re2Targets
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ FRAMEWORK DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2
+ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(EXPORT re2Targets
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2
+ NAMESPACE re2::)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/re2Config.cmake.in
+ ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2)
+write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake
+ VERSION ${SONAME}.0.0
+ COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/re2.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/re2.pc
+ @ONLY)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2.pc
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/third_party/re2/src/CONTRIBUTING.md b/third_party/re2/src/CONTRIBUTING.md
new file mode 100644
index 000000000..882b0e2f3
--- /dev/null
+++ b/third_party/re2/src/CONTRIBUTING.md
@@ -0,0 +1,2 @@
+RE2 uses Gerrit instead of GitHub pull requests.
+See the [Contribute](https://github.com/google/re2/wiki/Contribute) wiki page.
diff --git a/third_party/re2/src/CONTRIBUTORS b/third_party/re2/src/CONTRIBUTORS
new file mode 100644
index 000000000..1a1c84827
--- /dev/null
+++ b/third_party/re2/src/CONTRIBUTORS
@@ -0,0 +1,41 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the RE2 repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people. For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+# http://code.google.com/legal/individual-cla-v1.0.html
+# http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+# Name <email address>
+
+# Please keep the list sorted.
+
+Dominic Battré <battre@chromium.org>
+Doug Kwan <dougkwan@google.com>
+Dmitriy Vyukov <dvyukov@google.com>
+John Millikin <jmillikin@gmail.com>
+Mike Nazarewicz <mpn@google.com>
+Nico Weber <thakis@chromium.org>
+Pawel Hajdan <phajdan.jr@gmail.com>
+Rob Pike <r@google.com>
+Russ Cox <rsc@swtch.com>
+Sanjay Ghemawat <sanjay@google.com>
+Stefano Rivera <stefano.rivera@gmail.com>
+Srinivasan Venkatachary <vsri@google.com>
+Viatcheslav Ostapenko <sl.ostapenko@samsung.com>
diff --git a/third_party/re2/src/LICENSE b/third_party/re2/src/LICENSE
new file mode 100644
index 000000000..09e5ec1c7
--- /dev/null
+++ b/third_party/re2/src/LICENSE
@@ -0,0 +1,27 @@
+// Copyright (c) 2009 The RE2 Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/re2/src/MODULE.bazel b/third_party/re2/src/MODULE.bazel
new file mode 100644
index 000000000..87a5576a1
--- /dev/null
+++ b/third_party/re2/src/MODULE.bazel
@@ -0,0 +1,27 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) MODULE file for RE2.
+
+module(
+ name = "re2",
+ version = "2023-11-01",
+ compatibility_level = 1,
+)
+
+bazel_dep(name = "platforms", version = "0.0.8")
+bazel_dep(name = "rules_cc", version = "0.0.9")
+bazel_dep(name = "abseil-cpp", version = "20230802.0", repo_name = "com_google_absl")
+bazel_dep(name = "rules_python", version = "0.26.0")
+bazel_dep(name = "pybind11_bazel", version = "2.11.1")
+
+python_configure = use_extension("@pybind11_bazel//:python_configure.bzl", "extension")
+python_configure.toolchain(python_version = "3") # ignored when non-root module
+use_repo(python_configure, "local_config_python", "pybind11")
+
+# These dependencies will be ignored when the `re2` module is not
+# the root module (or when `--ignore_dev_dependency` is enabled).
+bazel_dep(name = "google_benchmark", version = "1.8.3", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0.bcr.1", dev_dependency = True)
+bazel_dep(name = "abseil-py", version = "1.4.0", dev_dependency = True)
diff --git a/third_party/re2/src/Makefile b/third_party/re2/src/Makefile
new file mode 100644
index 000000000..017ab5567
--- /dev/null
+++ b/third_party/re2/src/Makefile
@@ -0,0 +1,399 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Build against Abseil.
+ABSL_DEPS=\
+ absl_base\
+ absl_core_headers\
+ absl_fixed_array\
+ absl_flags\
+ absl_flat_hash_map\
+ absl_flat_hash_set\
+ absl_inlined_vector\
+ absl_optional\
+ absl_span\
+ absl_str_format\
+ absl_strings\
+ absl_synchronization\
+
+PKG_CONFIG?=pkg-config
+CCABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --cflags)
+# GCC barfs on `-Wl` whereas Clang doesn't mind, but it's unclear what
+# causes it to manifest on Ubuntu 22.04 LTS, so filter it out for now.
+# Similar is needed for `static-testinstall` and `shared-testinstall`.
+LDABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --libs | sed -e 's/-Wl / /g')
+
+# To build against ICU for full Unicode properties support,
+# uncomment the next two lines:
+# CCICU=$(shell $(PKG_CONFIG) icu-uc --cflags) -DRE2_USE_ICU
+# LDICU=$(shell $(PKG_CONFIG) icu-uc --libs)
+
+# To build against PCRE for testing and benchmarking,
+# uncomment the next two lines:
+# CCPCRE=-I/usr/local/include -DUSEPCRE
+# LDPCRE=-L/usr/local/lib -lpcre
+
+CXX?=g++
+# can override
+CXXFLAGS?=-O3 -g
+LDFLAGS?=
+# required
+RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCABSL) $(CCICU) $(CCPCRE)
+RE2_LDFLAGS?=-pthread $(LDABSL) $(LDICU) $(LDPCRE)
+AR?=ar
+ARFLAGS?=rsc
+NM?=nm
+NMFLAGS?=-p
+
+# Variables mandated by GNU, the arbiter of all good taste on the internet.
+# http://www.gnu.org/prep/standards/standards.html
+prefix=/usr/local
+exec_prefix=$(prefix)
+includedir=$(prefix)/include
+libdir=$(exec_prefix)/lib
+INSTALL=install
+INSTALL_DATA=$(INSTALL) -m 644
+
+# sed(1) on Darwin requires a backup suffix argument after -i (empty here). :/
+ifeq ($(shell uname),Darwin)
+SED_INPLACE=sed -i ''
+else ifeq ($(shell uname),SunOS)
+SED_INPLACE=sed -i
+else
+SED_INPLACE=sed -i
+endif
+
+# The pkg-config Requires: field.
+REQUIRES=$(ABSL_DEPS)
+ifdef LDICU
+REQUIRES+=icu-uc
+endif
+
+# ABI version
+# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
+SONAME=11
+
+# To rebuild the tables generated by Perl and Python scripts (requires Internet
+# access for Unicode data), uncomment the following line:
+# REBUILD_TABLES=1
+
+# The SunOS linker does not support wildcards. :(
+ifeq ($(shell uname),Darwin)
+SOEXT=dylib
+SOEXTVER=$(SONAME).$(SOEXT)
+SOEXTVER00=$(SONAME).0.0.$(SOEXT)
+MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin
+else ifeq ($(shell uname),SunOS)
+SOEXT=so
+SOEXTVER=$(SOEXT).$(SONAME)
+SOEXTVER00=$(SOEXT).$(SONAME).0.0
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER)
+else
+SOEXT=so
+SOEXTVER=$(SOEXT).$(SONAME)
+SOEXTVER00=$(SOEXT).$(SONAME).0.0
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols
+endif
+
+.PHONY: all
+all: obj/libre2.a obj/so/libre2.$(SOEXT)
+
+INSTALL_HFILES=\
+ re2/filtered_re2.h\
+ re2/re2.h\
+ re2/set.h\
+ re2/stringpiece.h\
+
+HFILES=\
+ util/logging.h\
+ util/malloc_counter.h\
+ util/pcre.h\
+ util/strutil.h\
+ util/utf.h\
+ re2/bitmap256.h\
+ re2/filtered_re2.h\
+ re2/pod_array.h\
+ re2/prefilter.h\
+ re2/prefilter_tree.h\
+ re2/prog.h\
+ re2/re2.h\
+ re2/regexp.h\
+ re2/set.h\
+ re2/sparse_array.h\
+ re2/sparse_set.h\
+ re2/stringpiece.h\
+ re2/testing/exhaustive_tester.h\
+ re2/testing/regexp_generator.h\
+ re2/testing/string_generator.h\
+ re2/testing/tester.h\
+ re2/unicode_casefold.h\
+ re2/unicode_groups.h\
+ re2/walker-inl.h\
+
+OFILES=\
+ obj/util/rune.o\
+ obj/util/strutil.o\
+ obj/re2/bitmap256.o\
+ obj/re2/bitstate.o\
+ obj/re2/compile.o\
+ obj/re2/dfa.o\
+ obj/re2/filtered_re2.o\
+ obj/re2/mimics_pcre.o\
+ obj/re2/nfa.o\
+ obj/re2/onepass.o\
+ obj/re2/parse.o\
+ obj/re2/perl_groups.o\
+ obj/re2/prefilter.o\
+ obj/re2/prefilter_tree.o\
+ obj/re2/prog.o\
+ obj/re2/re2.o\
+ obj/re2/regexp.o\
+ obj/re2/set.o\
+ obj/re2/simplify.o\
+ obj/re2/tostring.o\
+ obj/re2/unicode_casefold.o\
+ obj/re2/unicode_groups.o\
+
+TESTOFILES=\
+ obj/util/pcre.o\
+ obj/re2/testing/backtrack.o\
+ obj/re2/testing/dump.o\
+ obj/re2/testing/exhaustive_tester.o\
+ obj/re2/testing/null_walker.o\
+ obj/re2/testing/regexp_generator.o\
+ obj/re2/testing/string_generator.o\
+ obj/re2/testing/tester.o\
+
+TESTS=\
+ obj/test/charclass_test\
+ obj/test/compile_test\
+ obj/test/filtered_re2_test\
+ obj/test/mimics_pcre_test\
+ obj/test/parse_test\
+ obj/test/possible_match_test\
+ obj/test/re2_test\
+ obj/test/re2_arg_test\
+ obj/test/regexp_test\
+ obj/test/required_prefix_test\
+ obj/test/search_test\
+ obj/test/set_test\
+ obj/test/simplify_test\
+ obj/test/string_generator_test\
+
+BIGTESTS=\
+ obj/test/dfa_test\
+ obj/test/exhaustive1_test\
+ obj/test/exhaustive2_test\
+ obj/test/exhaustive3_test\
+ obj/test/exhaustive_test\
+ obj/test/random_test\
+
+SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
+# We use TESTOFILES for testing the shared lib, only it is built differently.
+STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
+SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
+
+DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
+DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
+DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
+DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
+
+.PRECIOUS: obj/%.o
+obj/%.o: %.cc $(HFILES)
+ @mkdir -p $$(dirname $@)
+ $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
+
+.PRECIOUS: obj/dbg/%.o
+obj/dbg/%.o: %.cc $(HFILES)
+ @mkdir -p $$(dirname $@)
+ $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc
+
+.PRECIOUS: obj/so/%.o
+obj/so/%.o: %.cc $(HFILES)
+ @mkdir -p $$(dirname $@)
+ $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
+
+.PRECIOUS: obj/libre2.a
+obj/libre2.a: $(OFILES)
+ @mkdir -p obj
+ $(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
+
+.PRECIOUS: obj/dbg/libre2.a
+obj/dbg/libre2.a: $(DOFILES)
+ @mkdir -p obj/dbg
+ $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
+
+.PRECIOUS: obj/so/libre2.$(SOEXT)
+obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin
+ @mkdir -p obj/so
+ $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) $(RE2_LDFLAGS) $(LDFLAGS)
+ ln -sf libre2.$(SOEXTVER) $@
+
+.PRECIOUS: obj/dbg/test/%
+obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES)
+ @mkdir -p obj/dbg/test
+ $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) -lgtest -lgtest_main obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
+.PRECIOUS: obj/test/%
+obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES)
+ @mkdir -p obj/test
+ $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -lgtest -lgtest_main obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
+# Test the shared lib, falling back to the static lib for private symbols
+.PRECIOUS: obj/so/test/%
+obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES)
+ @mkdir -p obj/so/test
+ $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -lgtest -lgtest_main -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
+obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES)
+ @mkdir -p obj/test
+ $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) -lgtest -lbenchmark -lbenchmark_main obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
+obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o
+ @mkdir -p obj/test
+ $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
+ifdef REBUILD_TABLES
+.PRECIOUS: re2/perl_groups.cc
+re2/perl_groups.cc: re2/make_perl_groups.pl
+ perl $< > $@
+
+.PRECIOUS: re2/unicode_%.cc
+re2/unicode_%.cc: re2/make_unicode_%.py re2/unicode.py
+ python3 $< > $@
+endif
+
+.PHONY: distclean
+distclean: clean
+ rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
+
+.PHONY: clean
+clean:
+ rm -rf obj
+ rm -f re2/*.pyc
+
+.PHONY: testofiles
+testofiles: $(TESTOFILES)
+
+.PHONY: test
+test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
+
+.PHONY: debug-test
+debug-test: $(DTESTS)
+ @./runtests $(DTESTS)
+
+.PHONY: static-test
+static-test: $(TESTS)
+ @./runtests $(TESTS)
+
+.PHONY: shared-test
+shared-test: $(STESTS)
+ @./runtests -shared-library-path obj/so $(STESTS)
+
+.PHONY: debug-bigtest
+debug-bigtest: $(DTESTS) $(DBIGTESTS)
+ @./runtests $(DTESTS) $(DBIGTESTS)
+
+.PHONY: static-bigtest
+static-bigtest: $(TESTS) $(BIGTESTS)
+ @./runtests $(TESTS) $(BIGTESTS)
+
+.PHONY: shared-bigtest
+shared-bigtest: $(STESTS) $(SBIGTESTS)
+ @./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS)
+
+.PHONY: benchmark
+benchmark: obj/test/regexp_benchmark
+
+.PHONY: fuzz
+fuzz: obj/test/re2_fuzzer
+
+.PHONY: install
+install: static-install shared-install
+
+.PHONY: static
+static: obj/libre2.a
+
+.PHONY: static-install
+static-install: obj/libre2.a common-install
+ $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
+
+.PHONY: shared
+shared: obj/so/libre2.$(SOEXT)
+
+.PHONY: shared-install
+shared-install: obj/so/libre2.$(SOEXT) common-install
+ $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
+ ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
+ ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
+
+.PHONY: common-install
+common-install:
+ mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
+ $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
+ $(INSTALL_DATA) re2.pc.in $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@CMAKE_INSTALL_FULL_INCLUDEDIR@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@CMAKE_INSTALL_FULL_LIBDIR@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@REQUIRES@#$(REQUIRES)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@SONAME@#$(SONAME)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+
+.PHONY: testinstall
+testinstall: static-testinstall shared-testinstall
+ @echo
+ @echo Install tests passed.
+ @echo
+
+.PHONY: static-testinstall
+static-testinstall:
+ifeq ($(shell uname),Darwin)
+ @echo Skipping test for libre2.a on Darwin.
+else ifeq ($(shell uname),SunOS)
+ @echo Skipping test for libre2.a on SunOS.
+else
+ @mkdir -p obj
+ @cp testinstall.cc obj/static-testinstall.cc
+ (cd obj && export PKG_CONFIG_PATH=$(DESTDIR)$(libdir)/pkgconfig; \
+ $(CXX) static-testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS) \
+ $$($(PKG_CONFIG) re2 --cflags) \
+ $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g' | sed -e 's/-lre2/-l:libre2.a/'))
+ obj/static-testinstall
+endif
+
+.PHONY: shared-testinstall
+shared-testinstall:
+ @mkdir -p obj
+ @cp testinstall.cc obj/shared-testinstall.cc
+ (cd obj && export PKG_CONFIG_PATH=$(DESTDIR)$(libdir)/pkgconfig; \
+ $(CXX) shared-testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS) \
+ $$($(PKG_CONFIG) re2 --cflags) \
+ $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g'))
+ifeq ($(shell uname),Darwin)
+ DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/shared-testinstall
+else
+ LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/shared-testinstall
+endif
+
+.PHONY: benchlog
+benchlog: obj/test/regexp_benchmark
+ (echo '==BENCHMARK==' `hostname` `date`; \
+ (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
+ echo; \
+ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
+
+.PHONY: log
+log:
+ $(MAKE) clean
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
+ $(filter obj/test/exhaustive%_test,$(BIGTESTS))
+ echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt
+ echo '#' $$(date) >>re2-exhaustive.txt
+ obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
+ obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt
+ obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
+ obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt
+
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
+ echo '#' RE2 basic search tests built by make $@ >re2-search.txt
+ echo '#' $$(date) >>re2-search.txt
+ obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
diff --git a/third_party/re2/src/README b/third_party/re2/src/README
new file mode 100644
index 000000000..469d6f397
--- /dev/null
+++ b/third_party/re2/src/README
@@ -0,0 +1,47 @@
+This is the source code repository for RE2, a regular expression library.
+
+For documentation about how to install and use RE2,
+visit https://github.com/google/re2/.
+
+The short version is:
+
+make
+make test
+make install
+make testinstall
+
+Building RE2 requires Abseil (https://github.com/abseil/abseil-cpp)
+to be installed on your system. Building the tests for RE2 requires
+GoogleTest (https://github.com/google/googletest) and Benchmark
+(https://github.com/google/benchmark) to be installed as well.
+
+There is a fair amount of documentation (including code snippets) in
+the re2.h header file.
+
+More information can be found on the wiki:
+https://github.com/google/re2/wiki
+
+Issue tracker:
+https://github.com/google/re2/issues
+
+Mailing list:
+https://groups.google.com/group/re2-dev
+
+Unless otherwise noted, the RE2 source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+RE2's native language is C++.
+
+The Python wrapper is at https://github.com/google/re2/tree/abseil/python
+and on PyPI (https://pypi.org/project/google-re2/).
+
+A C wrapper is at https://github.com/marcomaggi/cre2/.
+A D wrapper is at https://github.com/ShigekiKarita/re2d/ and on DUB (code.dlang.org).
+An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm).
+An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
+A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com).
+An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org).
+A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org).
+An R wrapper is at https://github.com/girishji/re2/ and on CRAN (cran.r-project.org).
+A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org).
+A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com).
diff --git a/third_party/re2/src/SECURITY.md b/third_party/re2/src/SECURITY.md
new file mode 100644
index 000000000..39ba0e93f
--- /dev/null
+++ b/third_party/re2/src/SECURITY.md
@@ -0,0 +1,4 @@
+To report a security issue, please use https://g.co/vulnz. We use
+https://g.co/vulnz for our intake, and do coordination and disclosure here on
+GitHub (including using GitHub Security Advisory). The Google Security Team will
+respond within 5 working days of your report on https://g.co/vulnz.
diff --git a/third_party/re2/src/WORKSPACE.bazel b/third_party/re2/src/WORKSPACE.bazel
new file mode 100644
index 000000000..fa514a877
--- /dev/null
+++ b/third_party/re2/src/WORKSPACE.bazel
@@ -0,0 +1,7 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) WORKSPACE file for RE2.
+
+workspace(name = "com_googlesource_code_re2")
diff --git a/third_party/re2/src/WORKSPACE.bzlmod b/third_party/re2/src/WORKSPACE.bzlmod
new file mode 100644
index 000000000..fa514a877
--- /dev/null
+++ b/third_party/re2/src/WORKSPACE.bzlmod
@@ -0,0 +1,7 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) WORKSPACE file for RE2.
+
+workspace(name = "com_googlesource_code_re2")
diff --git a/third_party/re2/src/app/BUILD.bazel b/third_party/re2/src/app/BUILD.bazel
new file mode 100644
index 000000000..cb510af90
--- /dev/null
+++ b/third_party/re2/src/app/BUILD.bazel
@@ -0,0 +1,24 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) BUILD file for RE2 app.
+
+cc_binary(
+ name = "_re2.js",
+ testonly = 1,
+ srcs = ["_re2.cc"],
+ linkopts = [
+ "--bind",
+ "-sENVIRONMENT=web",
+ "-sSINGLE_FILE=1",
+ "-sMODULARIZE=1",
+ "-sEXPORT_ES6=1",
+ "-sEXPORT_NAME=loadModule",
+ "-sUSE_PTHREADS=0",
+ ],
+ deps = [
+ "//:re2",
+ "//:testing",
+ ],
+)
diff --git a/third_party/re2/src/app/_re2.cc b/third_party/re2/src/app/_re2.cc
new file mode 100644
index 000000000..a63313e0d
--- /dev/null
+++ b/third_party/re2/src/app/_re2.cc
@@ -0,0 +1,94 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <memory>
+#include <string>
+
+#include <emscripten/bind.h>
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+namespace re2_app {
+
+// Result of analysing a single pattern. Exposed to JavaScript as a value
+// object by the EMSCRIPTEN_BINDINGS block in this file.
+struct Info {
+ std::string pattern;
+ std::string error;
+ std::string prefix;
+ bool prefix_foldcase = false;
+ std::string accel_prefix;
+ bool accel_prefix_foldcase = false;
+ // Default-initialised so that the early-return error paths in GetInfo()
+ // never hand indeterminate values to the bindings.
+ int num_captures = 0;
+ bool is_one_pass = false;
+ bool can_bit_state = false;
+ std::string bytecode;
+ std::string bytemap;
+};
+
+// Collects diagnostic information about `pattern` for display by the app.
+//
+// The pattern is parsed with the parse flags of a default-constructed
+// RE2::Options. If parsing or compilation fails, `error` is set on the
+// returned Info and the function returns early.
+Info GetInfo(const std::string& pattern) {
+ Info info;
+ info.pattern = pattern;
+
+ RE2::Options options;
+ re2::RegexpStatus status;
+ re2::Regexp* regexp = re2::Regexp::Parse(
+ pattern, static_cast<re2::Regexp::ParseFlags>(options.ParseFlags()),
+ &status);
+ if (regexp == nullptr) {
+ info.error = "failed to parse pattern: " + status.Text();
+ return info;
+ }
+
+ // On success RequiredPrefix() fills in `suffix`; otherwise alias it to
+ // `regexp` with an extra reference so that both paths can Decref() it the
+ // same way below.
+ std::string prefix;
+ bool prefix_foldcase;
+ re2::Regexp* suffix;
+ if (regexp->RequiredPrefix(&prefix, &prefix_foldcase, &suffix)) {
+ info.prefix = prefix;
+ info.prefix_foldcase = prefix_foldcase;
+ } else {
+ suffix = regexp->Incref();
+ }
+
+ // Compile `suffix` (not `regexp`) using the default options' memory limit.
+ std::unique_ptr<re2::Prog> prog(suffix->CompileToProg(options.max_mem()));
+ if (prog == nullptr) {
+ info.error = "failed to compile forward Prog";
+ suffix->Decref();
+ regexp->Decref();
+ return info;
+ }
+
+ // `prefix` and `prefix_foldcase` are reused here as scratch outputs; the
+ // values recorded in `info` above are unaffected.
+ if (regexp->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) {
+ info.accel_prefix = prefix;
+ info.accel_prefix_foldcase = prefix_foldcase;
+ }
+
+ info.num_captures = suffix->NumCaptures();
+ info.is_one_pass = prog->IsOnePass();
+ info.can_bit_state = prog->CanBitState();
+ info.bytecode = prog->Dump();
+ info.bytemap = prog->DumpByteMap();
+
+ // Drop the references held on the parsed regexp and its suffix.
+ suffix->Decref();
+ regexp->Decref();
+ return info;
+}
+
+EMSCRIPTEN_BINDINGS(_re2) {
+ emscripten::value_object<Info>("Info")
+ .field("pattern", &Info::pattern)
+ .field("error", &Info::error)
+ .field("prefix", &Info::prefix)
+ .field("prefix_foldcase", &Info::prefix_foldcase)
+ .field("accel_prefix", &Info::accel_prefix)
+ .field("accel_prefix_foldcase", &Info::accel_prefix_foldcase)
+ .field("num_captures", &Info::num_captures)
+ .field("is_one_pass", &Info::is_one_pass)
+ .field("can_bit_state", &Info::can_bit_state)
+ .field("bytecode", &Info::bytecode)
+ .field("bytemap", &Info::bytemap);
+
+ emscripten::function("getInfo", &GetInfo);
+}
+
+} // namespace re2_app
diff --git a/third_party/re2/src/app/_re2.d.ts b/third_party/re2/src/app/_re2.d.ts
new file mode 100644
index 000000000..dff5e49de
--- /dev/null
+++ b/third_party/re2/src/app/_re2.d.ts
@@ -0,0 +1,23 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+export type Info = {
+ pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ error: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ prefix_foldcase: boolean,
+ accel_prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ accel_prefix_foldcase: boolean,
+ num_captures: number,
+ is_one_pass: boolean,
+ can_bit_state: boolean,
+ bytecode: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ bytemap: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+};
+
+export interface MainModule {
+ getInfo(pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string): Info;
+}
+
+export default function loadModule(): Promise<MainModule>;
diff --git a/third_party/re2/src/app/app.ts b/third_party/re2/src/app/app.ts
new file mode 100644
index 000000000..4b9e7bdd1
--- /dev/null
+++ b/third_party/re2/src/app/app.ts
@@ -0,0 +1,111 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+import {css, html, LitElement, render} from 'lit';
+import {customElement} from 'lit/decorators.js';
+
+import /*default*/ loadModule from './_re2';
+import {Info, MainModule} from './_re2';
+
+var _re2: MainModule;
+loadModule().then((module: MainModule) => {
+ _re2 = module;
+ render(html`<title>re2-dev</title><re2-dev></re2-dev>`, document.body);
+});
+
+/**
+ * <re2-dev> element: a text input for a pattern plus a read-out of the
+ * Info that `_re2.getInfo()` reports for it. The pattern is mirrored into
+ * the URL fragment by `_onChange` and read back from it on construction.
+ */
+@customElement('re2-dev')
+export class RE2Dev extends LitElement {
+ private _pattern: string = '';
+ private _info: Info|null = null;
+
+ constructor() {
+ super();
+ // Seed the pattern from the URL fragment, which _onChange keeps in sync.
+ const fragment = window.location.hash.slice(1);
+ try {
+ this._pattern = decodeURIComponent(fragment);
+ } catch {
+ // decodeURIComponent() throws URIError on malformed percent-escapes
+ // (e.g. "#%"); fall back to the raw fragment instead of failing to
+ // construct the element.
+ this._pattern = fragment;
+ }
+ this._info = this._pattern ? _re2.getInfo(this._pattern) : null;
+ this.requestUpdate();
+ }
+
+ // Handles the input's `change` event: re-analyses the new pattern,
+ // schedules a re-render and records the pattern in the URL fragment.
+ private _onChange = (e: Event) => {
+ this._pattern = (e.target as HTMLInputElement).value;
+ this._info = this._pattern ? _re2.getInfo(this._pattern) : null;
+ this.requestUpdate();
+ window.location.hash = '#' + encodeURIComponent(this._pattern);
+ };
+
+ static override styles = css`
+.code {
+ font-family: monospace;
+ white-space: pre-line;
+}
+`;
+
+ // Renders the input box, followed by either nothing (empty pattern), the
+ // error message, or the full set of Info fields.
+ override render() {
+ var fragments = [];
+ fragments.push(html`
+<div>
+ <input type="text" size="48" @change=${this._onChange} .value=${this._pattern}>
+</div>
+`);
+
+ if (this._info === null) {
+ return html`${fragments}`;
+ }
+
+ if (this._info.error) {
+ fragments.push(html`
+<br>
+<div>
+ error:
+ <span class="code">${this._info.error}</span>
+</div>
+`);
+ return html`${fragments}`;
+ }
+
+ fragments.push(html`
+<br>
+<div>
+ pattern:
+ <span class="code">${this._info.pattern}</span>
+ <br>
+ prefix:
+ <span class="code">${this._info.prefix}</span>
+ ·
+ _foldcase:
+ <span class="code">${this._info.prefix_foldcase}</span>
+ <br>
+ accel_prefix:
+ <span class="code">${this._info.accel_prefix}</span>
+ ·
+ _foldcase:
+ <span class="code">${this._info.accel_prefix_foldcase}</span>
+ <br>
+ num_captures:
+ <span class="code">${this._info.num_captures}</span>
+ <br>
+ is_one_pass:
+ <span class="code">${this._info.is_one_pass}</span>
+ <br>
+ can_bit_state:
+ <span class="code">${this._info.can_bit_state}</span>
+ <br>
+ <br>
+ bytecode:
+ <br>
+ <span class="code">${this._info.bytecode}</span>
+ <br>
+ bytemap:
+ <br>
+ <span class="code">${this._info.bytemap}</span>
+</div>
+`);
+ return html`${fragments}`;
+ }
+}
+
+declare global {
+ interface HTMLElementTagNameMap {
+ 're2-dev': RE2Dev;
+ }
+}
diff --git a/third_party/re2/src/app/build.sh b/third_party/re2/src/app/build.sh
new file mode 100755
index 000000000..09d931fcb
--- /dev/null
+++ b/third_party/re2/src/app/build.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Builds the RE2 web app.
+#
+# Run as a non-root user, this creates ./deploy and re-runs itself inside the
+# emscripten/emsdk Docker image. Run as root (expected to be inside that
+# container), it fetches Bazelisk, builds _re2.js, bundles the app with
+# rollup and moves the result into /dst/deploy.
+set -eux
+
+# Quote every expansion so that paths containing spaces survive word splitting.
+SRCDIR="$(readlink --canonicalize "$(dirname "$0")")"
+DSTDIR="$(mktemp --directory --tmpdir "$(basename "$0").XXXXXXXXXX")"
+# With `set -e` any failing step exits immediately; make sure the scratch
+# directory is removed on that path too.
+trap 'rm -rf "${DSTDIR}"' EXIT
+
+BAZEL=/tmp/bazel
+BAZELISK_RELEASE=v1.17.0
+
+if [[ ${UID} -ne 0 ]]; then
+ if [[ -d deploy ]]; then
+ echo -e '\033[1;31m' "** The ${PWD}/deploy directory exists! Refusing to clobber it! **" '\033[0m'
+ exit 1
+ fi
+ mkdir deploy
+ sudo docker run -i -t --pull always --rm -v "${SRCDIR}/..:/src" -v "${PWD}:/dst" emscripten/emsdk "/src/app/$(basename "$0")"
+ ls -l deploy
+else
+ wget -O "${BAZEL}" "https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_RELEASE}/bazelisk-linux-amd64"
+ chmod +x "${BAZEL}"
+
+ cd "${SRCDIR}"
+ # Emscripten doesn't support `-fstack-protector`.
+ AR=emar CC=emcc \
+ "${BAZEL}" build --compilation_mode=opt \
+ --copt=-fno-stack-protector \
+ -- :all
+ cp ../bazel-bin/app/_re2.js "${DSTDIR}"
+ # Clean up the sundry Bazel output directories.
+ "${BAZEL}" clean --expunge
+ cp app.ts index.html _re2.d.ts "${DSTDIR}"
+ cp package.json rollup.config.js tsconfig.json "${DSTDIR}"
+
+ cd "${DSTDIR}"
+ npm install
+ npx tsc
+ npx rollup -c rollup.config.js -d deploy
+ mv deploy/* /dst/deploy
+fi
+
+cd "${SRCDIR}"
+rm -rf "${DSTDIR}"
+
+exit 0
diff --git a/third_party/re2/src/app/index.html b/third_party/re2/src/app/index.html
new file mode 100644
index 000000000..d229e56dd
--- /dev/null
+++ b/third_party/re2/src/app/index.html
@@ -0,0 +1,5 @@
+<!DOCTYPE html>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style>:root { color-scheme: dark light; }</style>
+<script type="module" src="app.js"></script>
diff --git a/third_party/re2/src/app/package.json b/third_party/re2/src/app/package.json
new file mode 100644
index 000000000..e70278976
--- /dev/null
+++ b/third_party/re2/src/app/package.json
@@ -0,0 +1,14 @@
+{
+ "dependencies": {
+ "lit": "*"
+ },
+ "devDependencies": {
+ "@rollup/plugin-node-resolve": "*",
+ "@rollup/plugin-terser": "*",
+ "@web/rollup-plugin-html": "*",
+ "@web/rollup-plugin-import-meta-assets": "*",
+ "rollup": "~2",
+ "tslib": "*",
+ "typescript": "*"
+ }
+}
diff --git a/third_party/re2/src/app/rollup.config.js b/third_party/re2/src/app/rollup.config.js
new file mode 100644
index 000000000..3a20e6649
--- /dev/null
+++ b/third_party/re2/src/app/rollup.config.js
@@ -0,0 +1,28 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+import nodeResolve from '@rollup/plugin-node-resolve';
+import terser from '@rollup/plugin-terser';
+import html from '@web/rollup-plugin-html';
+import {importMetaAssets} from '@web/rollup-plugin-import-meta-assets';
+
+export default {
+ input: 'index.html',
+ output: {
+ entryFileNames: '[hash].js',
+ chunkFileNames: '[hash].js',
+ assetFileNames: '[hash][extname]',
+ format: 'es',
+ },
+ preserveEntrySignatures: false,
+ plugins:
+ [
+ html({
+ minify: true,
+ }),
+ nodeResolve(),
+ terser(),
+ importMetaAssets(),
+ ],
+};
diff --git a/third_party/re2/src/app/tsconfig.json b/third_party/re2/src/app/tsconfig.json
new file mode 100644
index 000000000..86cc30207
--- /dev/null
+++ b/third_party/re2/src/app/tsconfig.json
@@ -0,0 +1,17 @@
+{
+ "compilerOptions": {
+ "target": "esnext",
+ "module": "esnext",
+ "moduleResolution": "node",
+ "noEmitOnError": true,
+ "lib": ["esnext", "dom"],
+ "strict": true,
+ "esModuleInterop": false,
+ "allowSyntheticDefaultImports": true,
+ "experimentalDecorators": true,
+ "importHelpers": true,
+ "sourceMap": true,
+ "inlineSources": true,
+ "incremental": true
+ }
+}
diff --git a/third_party/re2/src/benchlog/benchlog.c2 b/third_party/re2/src/benchlog/benchlog.c2
new file mode 100644
index 000000000..2c1664c69
--- /dev/null
+++ b/third_party/re2/src/benchlog/benchlog.c2
@@ -0,0 +1,2211 @@
+c2=; apt-cache show libpcre3-dev
+Package: libpcre3-dev
+Priority: optional
+Section: libdevel
+Installed-Size: 712
+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
+Original-Maintainer: Mark Baker <mark@mnb.org.uk>
+Architecture: amd64
+Source: pcre3
+Version: 7.8-3
+Depends: libc6-dev, libpcre3 (= 7.8-3), libpcrecpp0 (= 7.8-3)
+Conflicts: libpcre1-dev, libpcre2-dev
+Filename: pool/main/p/pcre3/libpcre3-dev_7.8-3_amd64.deb
+Size: 263634
+MD5sum: 0a081735710002405b16b9fde7db3f90
+SHA1: 73d6ba90280a6d897f420ac99c62aceed8bc9886
+SHA256: 75a8ac25fba93e72d043f58cd4cde5af0c266a3764527c58b7b059cdc58d7d72
+Description: Perl 5 Compatible Regular Expression Library - development files
+ This is a library of functions to support regular expressions whose syntax
+ and semantics are as close as possible to those of the Perl 5 language.
+ .
+ This package contains the development files, including headers, static
+ libraries, and documentation.
+Bugs: https://bugs.launchpad.net/ubuntu/+filebug
+Origin: Ubuntu
+
+c2=;
+
+
+==BENCHMARK== c2 Fri Feb 26 11:56:53 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# 10870db2d7b5+ tip
+
+Search_Easy0_CachedPCRE/8 10000000 127 ns/op 62.57 MB/s
+Search_Easy0_CachedPCRE/16 10000000 156 ns/op 102.26 MB/s
+Search_Easy0_CachedPCRE/32 5000000 213 ns/op 150.12 MB/s
+Search_Easy0_CachedPCRE/64 5000000 326 ns/op 195.76 MB/s
+Search_Easy0_CachedPCRE/128 5000000 427 ns/op 299.08 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1080 ns/op 236.95 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1741 ns/op 294.04 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3395 ns/op 301.54 MB/s
+Search_Easy0_CachedPCRE/2K 200000 5680 ns/op 360.53 MB/s
+Search_Easy0_CachedPCRE/4K 100000 10664 ns/op 384.07 MB/s
+Search_Easy0_CachedPCRE/8K 50000 21849 ns/op 374.93 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42327 ns/op 387.08 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85374 ns/op 383.81 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169652 ns/op 386.30 MB/s
+Search_Easy0_CachedPCRE/128K 5000 340683 ns/op 384.73 MB/s
+Search_Easy0_CachedPCRE/256K 2000 679601 ns/op 385.73 MB/s
+Search_Easy0_CachedPCRE/512K 1000 1361625 ns/op 385.05 MB/s
+Search_Easy0_CachedPCRE/1M 500 2723438 ns/op 385.02 MB/s
+Search_Easy0_CachedPCRE/2M 200 5470390 ns/op 383.36 MB/s
+Search_Easy0_CachedPCRE/4M 100 11041050 ns/op 379.88 MB/s
+Search_Easy0_CachedPCRE/8M 50 22165440 ns/op 378.45 MB/s
+Search_Easy0_CachedPCRE/16M 50 44294160 ns/op 378.77 MB/s
+Search_Easy0_CachedRE2/8 5000000 316 ns/op 25.25 MB/s
+Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.32 MB/s
+Search_Easy0_CachedRE2/32 5000000 332 ns/op 96.26 MB/s
+Search_Easy0_CachedRE2/64 5000000 334 ns/op 191.58 MB/s
+Search_Easy0_CachedRE2/128 5000000 371 ns/op 344.40 MB/s
+Search_Easy0_CachedRE2/256 5000000 391 ns/op 653.11 MB/s
+Search_Easy0_CachedRE2/512 5000000 465 ns/op 1099.45 MB/s
+Search_Easy0_CachedRE2/1K 2000000 664 ns/op 1541.25 MB/s
+Search_Easy0_CachedRE2/2K 1000000 1015 ns/op 2015.77 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1581 ns/op 2590.41 MB/s
+Search_Easy0_CachedRE2/8K 500000 2914 ns/op 2810.63 MB/s
+Search_Easy0_CachedRE2/16K 200000 5406 ns/op 3030.64 MB/s
+Search_Easy0_CachedRE2/32K 100000 10992 ns/op 2980.97 MB/s
+Search_Easy0_CachedRE2/64K 50000 21829 ns/op 3002.12 MB/s
+Search_Easy0_CachedRE2/128K 50000 44263 ns/op 2961.20 MB/s
+Search_Easy0_CachedRE2/256K 20000 88222 ns/op 2971.39 MB/s
+Search_Easy0_CachedRE2/512K 10000 177626 ns/op 2951.64 MB/s
+Search_Easy0_CachedRE2/1M 5000 356519 ns/op 2941.15 MB/s
+Search_Easy0_CachedRE2/2M 2000 730121 ns/op 2872.33 MB/s
+Search_Easy0_CachedRE2/4M 1000 1522926 ns/op 2754.11 MB/s
+Search_Easy0_CachedRE2/8M 500 3093982 ns/op 2711.27 MB/s
+Search_Easy0_CachedRE2/16M 200 6173845 ns/op 2717.47 MB/s
+Search_Easy1_CachedPCRE/8 10000000 129 ns/op 61.93 MB/s
+Search_Easy1_CachedPCRE/16 10000000 156 ns/op 102.24 MB/s
+Search_Easy1_CachedPCRE/32 5000000 213 ns/op 150.10 MB/s
+Search_Easy1_CachedPCRE/64 5000000 326 ns/op 195.85 MB/s
+Search_Easy1_CachedPCRE/128 2000000 648 ns/op 197.35 MB/s
+Search_Easy1_CachedPCRE/256 2000000 934 ns/op 273.84 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1971 ns/op 259.73 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3432 ns/op 298.32 MB/s
+Search_Easy1_CachedPCRE/2K 200000 6255 ns/op 327.39 MB/s
+Search_Easy1_CachedPCRE/4K 100000 11212 ns/op 365.31 MB/s
+Search_Easy1_CachedPCRE/8K 50000 22182 ns/op 369.31 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42713 ns/op 383.58 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85747 ns/op 382.14 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170670 ns/op 383.99 MB/s
+Search_Easy1_CachedPCRE/128K 5000 342979 ns/op 382.16 MB/s
+Search_Easy1_CachedPCRE/256K 2000 683959 ns/op 383.27 MB/s
+Search_Easy1_CachedPCRE/512K 1000 1370065 ns/op 382.67 MB/s
+Search_Easy1_CachedPCRE/1M 500 2742576 ns/op 382.33 MB/s
+Search_Easy1_CachedPCRE/2M 200 5507205 ns/op 380.80 MB/s
+Search_Easy1_CachedPCRE/4M 100 11116960 ns/op 377.29 MB/s
+Search_Easy1_CachedPCRE/8M 50 22302540 ns/op 376.13 MB/s
+Search_Easy1_CachedPCRE/16M 50 44593400 ns/op 376.23 MB/s
+Search_Easy1_CachedRE2/8 5000000 316 ns/op 25.30 MB/s
+Search_Easy1_CachedRE2/16 5000000 318 ns/op 50.29 MB/s
+Search_Easy1_CachedRE2/32 5000000 331 ns/op 96.45 MB/s
+Search_Easy1_CachedRE2/64 5000000 334 ns/op 191.09 MB/s
+Search_Easy1_CachedRE2/128 5000000 367 ns/op 348.34 MB/s
+Search_Easy1_CachedRE2/256 5000000 399 ns/op 640.68 MB/s
+Search_Easy1_CachedRE2/512 5000000 476 ns/op 1073.44 MB/s
+Search_Easy1_CachedRE2/1K 2000000 655 ns/op 1563.28 MB/s
+Search_Easy1_CachedRE2/2K 1000000 1002 ns/op 2043.22 MB/s
+Search_Easy1_CachedRE2/4K 1000000 1582 ns/op 2588.68 MB/s
+Search_Easy1_CachedRE2/8K 500000 2916 ns/op 2808.86 MB/s
+Search_Easy1_CachedRE2/16K 200000 5435 ns/op 3014.34 MB/s
+Search_Easy1_CachedRE2/32K 100000 10957 ns/op 2990.35 MB/s
+Search_Easy1_CachedRE2/64K 50000 21824 ns/op 3002.80 MB/s
+Search_Easy1_CachedRE2/128K 50000 44255 ns/op 2961.71 MB/s
+Search_Easy1_CachedRE2/256K 20000 88214 ns/op 2971.66 MB/s
+Search_Easy1_CachedRE2/512K 10000 177657 ns/op 2951.12 MB/s
+Search_Easy1_CachedRE2/1M 5000 356560 ns/op 2940.81 MB/s
+Search_Easy1_CachedRE2/2M 2000 730094 ns/op 2872.44 MB/s
+Search_Easy1_CachedRE2/4M 1000 1522720 ns/op 2754.48 MB/s
+Search_Easy1_CachedRE2/8M 500 3093050 ns/op 2712.08 MB/s
+Search_Easy1_CachedRE2/16M 200 6171535 ns/op 2718.48 MB/s
+Search_Medium_CachedPCRE/8 10000000 128 ns/op 62.07 MB/s
+Search_Medium_CachedPCRE/16 10000000 157 ns/op 101.40 MB/s
+Search_Medium_CachedPCRE/32 5000000 214 ns/op 149.24 MB/s
+Search_Medium_CachedPCRE/64 5000000 336 ns/op 190.31 MB/s
+Search_Medium_CachedPCRE/128 5000000 430 ns/op 297.32 MB/s
+Search_Medium_CachedPCRE/256 200000 8892 ns/op 28.79 MB/s
+Search_Medium_CachedPCRE/512 50000 21295 ns/op 24.04 MB/s
+Search_Medium_CachedPCRE/1K 50000 41581 ns/op 24.63 MB/s
+Search_Medium_CachedPCRE/2K 20000 61200 ns/op 33.46 MB/s
+Search_Medium_CachedPCRE/4K 10000 173807 ns/op 23.57 MB/s
+Search_Medium_CachedPCRE/8K 5000 382058 ns/op 21.44 MB/s
+Search_Medium_CachedPCRE/16K 2000 773090 ns/op 21.19 MB/s
+Search_Medium_CachedPCRE/32K 1000 1545797 ns/op 21.20 MB/s
+Search_Medium_CachedPCRE/64K 500 3076340 ns/op 21.30 MB/s
+Search_Medium_CachedPCRE/128K 200 6134010 ns/op 21.37 MB/s
+Search_Medium_CachedPCRE/256K 100 12315460 ns/op 21.29 MB/s
+Search_Medium_CachedRE2/8 5000000 338 ns/op 23.62 MB/s
+Search_Medium_CachedRE2/16 5000000 363 ns/op 43.99 MB/s
+Search_Medium_CachedRE2/32 5000000 413 ns/op 77.32 MB/s
+Search_Medium_CachedRE2/64 2000000 515 ns/op 124.15 MB/s
+Search_Medium_CachedRE2/128 2000000 722 ns/op 177.20 MB/s
+Search_Medium_CachedRE2/256 1000000 1126 ns/op 227.29 MB/s
+Search_Medium_CachedRE2/512 1000000 1937 ns/op 264.32 MB/s
+Search_Medium_CachedRE2/1K 500000 3553 ns/op 288.18 MB/s
+Search_Medium_CachedRE2/2K 200000 6788 ns/op 301.71 MB/s
+Search_Medium_CachedRE2/4K 100000 13258 ns/op 308.92 MB/s
+Search_Medium_CachedRE2/8K 50000 26198 ns/op 312.69 MB/s
+Search_Medium_CachedRE2/16K 20000 52097 ns/op 314.48 MB/s
+Search_Medium_CachedRE2/32K 10000 103975 ns/op 315.15 MB/s
+Search_Medium_CachedRE2/64K 5000 207487 ns/op 315.86 MB/s
+Search_Medium_CachedRE2/128K 5000 414637 ns/op 316.11 MB/s
+Search_Medium_CachedRE2/256K 2000 828752 ns/op 316.31 MB/s
+Search_Medium_CachedRE2/512K 1000 1657280 ns/op 316.35 MB/s
+Search_Medium_CachedRE2/1M 500 3314560 ns/op 316.35 MB/s
+Search_Medium_CachedRE2/2M 200 6643535 ns/op 315.67 MB/s
+Search_Medium_CachedRE2/4M 100 13338160 ns/op 314.46 MB/s
+Search_Medium_CachedRE2/8M 50 26716200 ns/op 313.99 MB/s
+Search_Medium_CachedRE2/16M 20 53439850 ns/op 313.95 MB/s
+Search_Hard_CachedPCRE/8 10000000 128 ns/op 62.06 MB/s
+Search_Hard_CachedPCRE/16 10000000 157 ns/op 101.42 MB/s
+Search_Hard_CachedPCRE/32 5000000 214 ns/op 149.37 MB/s
+Search_Hard_CachedPCRE/64 5000000 336 ns/op 190.26 MB/s
+Search_Hard_CachedPCRE/128 5000000 430 ns/op 297.44 MB/s
+Search_Hard_CachedPCRE/256 2000 780527 ns/op 0.33 MB/s
+Search_Hard_CachedPCRE/512 500 3210270 ns/op 0.16 MB/s
+Search_Hard_CachedPCRE/1K 100 12762760 ns/op 0.08 MB/s
+Search_Hard_CachedPCRE/2K 50 46734020 ns/op 0.04 MB/s
+Search_Hard_CachedPCRE/4K 5 201439400 ns/op 0.02 MB/s
+Search_Hard_CachedRE2/8 5000000 338 ns/op 23.65 MB/s
+Search_Hard_CachedRE2/16 5000000 363 ns/op 44.05 MB/s
+Search_Hard_CachedRE2/32 5000000 411 ns/op 77.70 MB/s
+Search_Hard_CachedRE2/64 2000000 512 ns/op 124.89 MB/s
+Search_Hard_CachedRE2/128 2000000 721 ns/op 177.47 MB/s
+Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.50 MB/s
+Search_Hard_CachedRE2/512 1000000 1933 ns/op 264.85 MB/s
+Search_Hard_CachedRE2/1K 500000 3550 ns/op 288.42 MB/s
+Search_Hard_CachedRE2/2K 200000 6786 ns/op 301.80 MB/s
+Search_Hard_CachedRE2/4K 100000 13256 ns/op 308.97 MB/s
+Search_Hard_CachedRE2/8K 50000 26197 ns/op 312.71 MB/s
+Search_Hard_CachedRE2/16K 20000 52077 ns/op 314.61 MB/s
+Search_Hard_CachedRE2/32K 10000 103962 ns/op 315.19 MB/s
+Search_Hard_CachedRE2/64K 5000 207496 ns/op 315.84 MB/s
+Search_Hard_CachedRE2/128K 5000 414609 ns/op 316.13 MB/s
+Search_Hard_CachedRE2/256K 2000 828753 ns/op 316.31 MB/s
+Search_Hard_CachedRE2/512K 1000 1657228 ns/op 316.36 MB/s
+Search_Hard_CachedRE2/1M 500 3314250 ns/op 316.38 MB/s
+Search_Hard_CachedRE2/2M 200 6643040 ns/op 315.69 MB/s
+Search_Hard_CachedRE2/4M 100 13337040 ns/op 314.49 MB/s
+Search_Hard_CachedRE2/8M 50 26716100 ns/op 313.99 MB/s
+Search_Hard_CachedRE2/16M 20 53433550 ns/op 313.98 MB/s
+Search_Parens_CachedPCRE/8 5000000 213 ns/op 37.43 MB/s
+Search_Parens_CachedRE2/8 5000000 337 ns/op 23.73 MB/s
+Search_Parens_CachedRE2/16 5000000 362 ns/op 44.12 MB/s
+Search_Parens_CachedRE2/32 5000000 412 ns/op 77.59 MB/s
+Search_Parens_CachedRE2/64 2000000 514 ns/op 124.37 MB/s
+Search_Parens_CachedRE2/128 2000000 721 ns/op 177.32 MB/s
+Search_Parens_CachedRE2/256 1000000 1125 ns/op 227.50 MB/s
+Search_Parens_CachedRE2/512 1000000 1932 ns/op 264.92 MB/s
+Search_Parens_CachedRE2/1K 500000 3550 ns/op 288.37 MB/s
+Search_Parens_CachedRE2/2K 200000 6786 ns/op 301.78 MB/s
+Search_Parens_CachedRE2/4K 100000 13258 ns/op 308.94 MB/s
+Search_Parens_CachedRE2/8K 50000 26199 ns/op 312.68 MB/s
+Search_Parens_CachedRE2/16K 20000 52095 ns/op 314.50 MB/s
+Search_Parens_CachedRE2/32K 10000 103958 ns/op 315.20 MB/s
+Search_Parens_CachedRE2/64K 5000 207520 ns/op 315.81 MB/s
+Search_Parens_CachedRE2/128K 5000 414602 ns/op 316.14 MB/s
+Search_Parens_CachedRE2/256K 2000 828782 ns/op 316.30 MB/s
+Search_Parens_CachedRE2/512K 1000 1657076 ns/op 316.39 MB/s
+Search_Parens_CachedRE2/1M 500 3314154 ns/op 316.39 MB/s
+Search_Parens_CachedRE2/2M 200 6643900 ns/op 315.65 MB/s
+Search_Parens_CachedRE2/4M 100 13336670 ns/op 314.49 MB/s
+Search_Parens_CachedRE2/8M 50 26714480 ns/op 314.01 MB/s
+Search_Parens_CachedRE2/16M 20 53434900 ns/op 313.97 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 251 ns/op 31.76 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 314 ns/op 50.80 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 441 ns/op 72.49 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 694 ns/op 92.21 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1066 ns/op 119.99 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 1933 ns/op 132.39 MB/s
+Search_BigFixed_CachedPCRE/512 500000 3652 ns/op 140.19 MB/s
+Search_BigFixed_CachedPCRE/1K 200000 7089 ns/op 144.43 MB/s
+Search_BigFixed_CachedPCRE/2K 100000 13964 ns/op 146.66 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 27716 ns/op 147.78 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 55232 ns/op 148.32 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 110321 ns/op 148.51 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 220561 ns/op 148.57 MB/s
+Search_BigFixed_CachedRE2/8 10000000 131 ns/op 60.69 MB/s
+Search_BigFixed_CachedRE2/16 5000000 374 ns/op 42.75 MB/s
+Search_BigFixed_CachedRE2/32 5000000 410 ns/op 77.99 MB/s
+Search_BigFixed_CachedRE2/64 5000000 489 ns/op 130.84 MB/s
+Search_BigFixed_CachedRE2/128 2000000 635 ns/op 201.43 MB/s
+Search_BigFixed_CachedRE2/256 2000000 945 ns/op 270.66 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1552 ns/op 329.85 MB/s
+Search_BigFixed_CachedRE2/1K 500000 2766 ns/op 370.19 MB/s
+Search_BigFixed_CachedRE2/2K 200000 5191 ns/op 394.49 MB/s
+Search_BigFixed_CachedRE2/4K 100000 10046 ns/op 407.71 MB/s
+Search_BigFixed_CachedRE2/8K 100000 19752 ns/op 414.74 MB/s
+Search_BigFixed_CachedRE2/16K 50000 39168 ns/op 418.30 MB/s
+Search_BigFixed_CachedRE2/32K 20000 78114 ns/op 419.49 MB/s
+Search_BigFixed_CachedRE2/64K 10000 155895 ns/op 420.38 MB/s
+Search_BigFixed_CachedRE2/128K 5000 311573 ns/op 420.68 MB/s
+Search_BigFixed_CachedRE2/256K 2000 624241 ns/op 419.94 MB/s
+Search_BigFixed_CachedRE2/512K 1000 1253377 ns/op 418.30 MB/s
+Search_BigFixed_CachedRE2/1M 500 2530874 ns/op 414.31 MB/s
+Search_Success_PCRE/8 1000000 1836 ns/op 4.36 MB/s
+Search_Success_PCRE/16 1000000 1880 ns/op 8.51 MB/s
+Search_Success_PCRE/32 1000000 1970 ns/op 16.24 MB/s
+Search_Success_PCRE/64 500000 2106 ns/op 30.38 MB/s
+Search_Success_PCRE/128 500000 2447 ns/op 52.29 MB/s
+Search_Success_PCRE/256 500000 3103 ns/op 82.48 MB/s
+Search_Success_PCRE/512 500000 4428 ns/op 115.62 MB/s
+Search_Success_PCRE/1K 200000 7053 ns/op 145.17 MB/s
+Search_Success_PCRE/2K 100000 12308 ns/op 166.39 MB/s
+Search_Success_PCRE/4K 50000 22793 ns/op 179.70 MB/s
+Search_Success_PCRE/8K 50000 43847 ns/op 186.83 MB/s
+Search_Success_PCRE/16K 20000 85952 ns/op 190.62 MB/s
+Search_Success_PCRE/32K 10000 170305 ns/op 192.41 MB/s
+Search_Success_PCRE/64K 5000 338862 ns/op 193.40 MB/s
+Search_Success_PCRE/128K 2000 676940 ns/op 193.62 MB/s
+Search_Success_PCRE/256K 1000 1355784 ns/op 193.35 MB/s
+Search_Success_PCRE/512K 500 2725254 ns/op 192.38 MB/s
+Search_Success_PCRE/1M 200 5542255 ns/op 189.20 MB/s
+Search_Success_PCRE/2M 100 11433880 ns/op 183.42 MB/s
+Search_Success_PCRE/4M 50 24217120 ns/op 173.20 MB/s
+Search_Success_PCRE/8M 20 56016550 ns/op 149.75 MB/s
+Search_Success_PCRE/16M 10 137107400 ns/op 122.37 MB/s
+Search_Success_RE2/8 200000 8525 ns/op 0.94 MB/s
+Search_Success_RE2/16 100000 19567 ns/op 0.82 MB/s
+Search_Success_RE2/32 100000 19549 ns/op 1.64 MB/s
+Search_Success_RE2/64 100000 19744 ns/op 3.24 MB/s
+Search_Success_RE2/128 100000 19919 ns/op 6.43 MB/s
+Search_Success_RE2/256 50000 20201 ns/op 12.67 MB/s
+Search_Success_RE2/512 50000 20993 ns/op 24.39 MB/s
+Search_Success_RE2/1K 50000 22581 ns/op 45.35 MB/s
+Search_Success_RE2/2K 50000 25897 ns/op 79.08 MB/s
+Search_Success_RE2/4K 50000 32389 ns/op 126.46 MB/s
+Search_Success_RE2/8K 50000 45266 ns/op 180.97 MB/s
+Search_Success_RE2/16K 20000 71222 ns/op 230.04 MB/s
+Search_Success_RE2/32K 10000 123342 ns/op 265.67 MB/s
+Search_Success_RE2/64K 5000 227134 ns/op 288.53 MB/s
+Search_Success_RE2/128K 5000 434534 ns/op 301.64 MB/s
+Search_Success_RE2/256K 2000 852033 ns/op 307.67 MB/s
+Search_Success_RE2/512K 1000 1692057 ns/op 309.85 MB/s
+Search_Success_RE2/1M 500 3396306 ns/op 308.74 MB/s
+Search_Success_RE2/2M 200 6984505 ns/op 300.26 MB/s
+Search_Success_RE2/4M 100 14632000 ns/op 286.65 MB/s
+Search_Success_RE2/8M 50 31782800 ns/op 263.94 MB/s
+Search_Success_RE2/16M 10 103645400 ns/op 161.87 MB/s
+Search_Success_CachedPCRE/8 5000000 257 ns/op 31.04 MB/s
+Search_Success_CachedPCRE/16 5000000 308 ns/op 51.88 MB/s
+Search_Success_CachedPCRE/32 5000000 409 ns/op 78.14 MB/s
+Search_Success_CachedPCRE/64 2000000 611 ns/op 104.66 MB/s
+Search_Success_CachedPCRE/128 2000000 889 ns/op 143.85 MB/s
+Search_Success_CachedPCRE/256 1000000 1546 ns/op 165.48 MB/s
+Search_Success_CachedPCRE/512 500000 2861 ns/op 178.95 MB/s
+Search_Success_CachedPCRE/1K 200000 5491 ns/op 186.46 MB/s
+Search_Success_CachedPCRE/2K 100000 10746 ns/op 190.57 MB/s
+Search_Success_CachedPCRE/4K 50000 21262 ns/op 192.64 MB/s
+Search_Success_CachedPCRE/8K 50000 42295 ns/op 193.69 MB/s
+Search_Success_CachedPCRE/16K 20000 84375 ns/op 194.18 MB/s
+Search_Success_CachedPCRE/32K 10000 168635 ns/op 194.31 MB/s
+Search_Success_CachedPCRE/64K 5000 337158 ns/op 194.38 MB/s
+Search_Success_CachedPCRE/128K 2000 675199 ns/op 194.12 MB/s
+Search_Success_CachedPCRE/256K 1000 1353970 ns/op 193.61 MB/s
+Search_Success_CachedPCRE/512K 500 2723300 ns/op 192.52 MB/s
+Search_Success_CachedPCRE/1M 200 5539695 ns/op 189.28 MB/s
+Search_Success_CachedPCRE/2M 100 11424760 ns/op 183.56 MB/s
+Search_Success_CachedPCRE/4M 50 24204760 ns/op 173.28 MB/s
+Search_Success_CachedPCRE/8M 20 55998450 ns/op 149.80 MB/s
+Search_Success_CachedPCRE/16M 10 137082500 ns/op 122.39 MB/s
+Search_Success_CachedRE2/8 10000000 126 ns/op 63.05 MB/s
+Search_Success_CachedRE2/16 5000000 373 ns/op 42.86 MB/s
+Search_Success_CachedRE2/32 5000000 423 ns/op 75.51 MB/s
+Search_Success_CachedRE2/64 2000000 523 ns/op 122.33 MB/s
+Search_Success_CachedRE2/128 2000000 730 ns/op 175.15 MB/s
+Search_Success_CachedRE2/256 1000000 1135 ns/op 225.51 MB/s
+Search_Success_CachedRE2/512 1000000 1942 ns/op 263.51 MB/s
+Search_Success_CachedRE2/1K 500000 3562 ns/op 287.44 MB/s
+Search_Success_CachedRE2/2K 200000 6797 ns/op 301.31 MB/s
+Search_Success_CachedRE2/4K 100000 13268 ns/op 308.70 MB/s
+Search_Success_CachedRE2/8K 50000 26210 ns/op 312.55 MB/s
+Search_Success_CachedRE2/16K 20000 52116 ns/op 314.37 MB/s
+Search_Success_CachedRE2/32K 10000 104050 ns/op 314.92 MB/s
+Search_Success_CachedRE2/64K 5000 207912 ns/op 315.21 MB/s
+Search_Success_CachedRE2/128K 5000 415393 ns/op 315.54 MB/s
+Search_Success_CachedRE2/256K 2000 832643 ns/op 314.83 MB/s
+Search_Success_CachedRE2/512K 1000 1672561 ns/op 313.46 MB/s
+Search_Success_CachedRE2/1M 500 3376196 ns/op 310.58 MB/s
+Search_Success_CachedRE2/2M 200 6957190 ns/op 301.44 MB/s
+Search_Success_CachedRE2/4M 100 14592130 ns/op 287.44 MB/s
+Search_Success_CachedRE2/8M 50 31731860 ns/op 264.36 MB/s
+Search_Success_CachedRE2/16M 10 103597500 ns/op 161.95 MB/s
+Search_Success1_PCRE/8 500000 2053 ns/op 3.90 MB/s
+Search_Success1_PCRE/16 500000 2061 ns/op 7.76 MB/s
+Search_Success1_PCRE/32 500000 2169 ns/op 14.75 MB/s
+Search_Success1_PCRE/64 500000 2310 ns/op 27.70 MB/s
+Search_Success1_PCRE/128 500000 2640 ns/op 48.48 MB/s
+Search_Success1_PCRE/256 500000 3292 ns/op 77.76 MB/s
+Search_Success1_PCRE/512 500000 4593 ns/op 111.47 MB/s
+Search_Success1_PCRE/1K 200000 7241 ns/op 141.40 MB/s
+Search_Success1_PCRE/2K 100000 12489 ns/op 163.98 MB/s
+Search_Success1_PCRE/4K 50000 22994 ns/op 178.13 MB/s
+Search_Success1_PCRE/8K 50000 44014 ns/op 186.12 MB/s
+Search_Success1_PCRE/16K 20000 86120 ns/op 190.24 MB/s
+Search_Success1_PCRE/32K 10000 170489 ns/op 192.20 MB/s
+Search_Success1_PCRE/64K 5000 339029 ns/op 193.30 MB/s
+Search_Success1_PCRE/128K 2000 677115 ns/op 193.57 MB/s
+Search_Success1_PCRE/256K 1000 1355861 ns/op 193.34 MB/s
+Search_Success1_PCRE/512K 500 2725160 ns/op 192.39 MB/s
+Search_Success1_PCRE/1M 200 5543665 ns/op 189.15 MB/s
+Search_Success1_PCRE/2M 100 11434390 ns/op 183.41 MB/s
+Search_Success1_PCRE/4M 50 24215940 ns/op 173.20 MB/s
+Search_Success1_PCRE/8M 20 56027250 ns/op 149.72 MB/s
+Search_Success1_PCRE/16M 10 137103200 ns/op 122.37 MB/s
+Search_Success1_RE2/8 50000 26411 ns/op 0.30 MB/s
+Search_Success1_RE2/16 50000 27068 ns/op 0.59 MB/s
+Search_Success1_RE2/32 50000 27117 ns/op 1.18 MB/s
+Search_Success1_RE2/64 50000 27405 ns/op 2.34 MB/s
+Search_Success1_RE2/128 50000 27398 ns/op 4.67 MB/s
+Search_Success1_RE2/256 50000 27580 ns/op 9.28 MB/s
+Search_Success1_RE2/512 50000 28504 ns/op 17.96 MB/s
+Search_Success1_RE2/1K 50000 29993 ns/op 34.14 MB/s
+Search_Success1_RE2/2K 50000 33373 ns/op 61.37 MB/s
+Search_Success1_RE2/4K 50000 39867 ns/op 102.74 MB/s
+Search_Success1_RE2/8K 20000 52940 ns/op 154.74 MB/s
+Search_Success1_RE2/16K 20000 78818 ns/op 207.87 MB/s
+Search_Success1_RE2/32K 10000 130836 ns/op 250.45 MB/s
+Search_Success1_RE2/64K 5000 234725 ns/op 279.20 MB/s
+Search_Success1_RE2/128K 5000 442253 ns/op 296.37 MB/s
+Search_Success1_RE2/256K 2000 859671 ns/op 304.94 MB/s
+Search_Success1_RE2/512K 1000 1699921 ns/op 308.42 MB/s
+Search_Success1_RE2/1M 500 3404204 ns/op 308.02 MB/s
+Search_Success1_RE2/2M 200 6992400 ns/op 299.92 MB/s
+Search_Success1_RE2/4M 100 14641200 ns/op 286.47 MB/s
+Search_Success1_RE2/8M 50 31788680 ns/op 263.89 MB/s
+Search_Success1_RE2/16M 10 103656000 ns/op 161.85 MB/s
+Search_Success1_Cached_PCRE/8 5000000 305 ns/op 26.22 MB/s
+Search_Success1_Cached_PCRE/16 5000000 355 ns/op 44.96 MB/s
+Search_Success1_Cached_PCRE/32 5000000 456 ns/op 70.06 MB/s
+Search_Success1_Cached_PCRE/64 2000000 658 ns/op 97.12 MB/s
+Search_Success1_Cached_PCRE/128 2000000 936 ns/op 136.62 MB/s
+Search_Success1_Cached_PCRE/256 1000000 1593 ns/op 160.62 MB/s
+Search_Success1_Cached_PCRE/512 500000 2908 ns/op 176.05 MB/s
+Search_Success1_Cached_PCRE/1K 200000 5537 ns/op 184.93 MB/s
+Search_Success1_Cached_PCRE/2K 100000 10793 ns/op 189.74 MB/s
+Search_Success1_Cached_PCRE/4K 50000 21311 ns/op 192.19 MB/s
+Search_Success1_Cached_PCRE/8K 50000 42340 ns/op 193.48 MB/s
+Search_Success1_Cached_PCRE/16K 20000 84417 ns/op 194.08 MB/s
+Search_Success1_Cached_PCRE/32K 10000 168689 ns/op 194.25 MB/s
+Search_Success1_Cached_PCRE/64K 5000 337219 ns/op 194.34 MB/s
+Search_Success1_Cached_PCRE/128K 2000 675255 ns/op 194.11 MB/s
+Search_Success1_Cached_PCRE/256K 1000 1354027 ns/op 193.60 MB/s
+Search_Success1_Cached_PCRE/512K 500 2723352 ns/op 192.52 MB/s
+Search_Success1_Cached_PCRE/1M 200 5539800 ns/op 189.28 MB/s
+Search_Success1_Cached_PCRE/2M 100 11426990 ns/op 183.53 MB/s
+Search_Success1_Cached_PCRE/4M 50 24206500 ns/op 173.27 MB/s
+Search_Success1_Cached_PCRE/8M 20 56008200 ns/op 149.77 MB/s
+Search_Success1_Cached_PCRE/16M 10 137084600 ns/op 122.39 MB/s
+Search_Success1_Cached_RE2/8 5000000 347 ns/op 22.99 MB/s
+Search_Success1_Cached_RE2/16 5000000 373 ns/op 42.83 MB/s
+Search_Success1_Cached_RE2/32 5000000 421 ns/op 75.97 MB/s
+Search_Success1_Cached_RE2/64 2000000 520 ns/op 122.97 MB/s
+Search_Success1_Cached_RE2/128 2000000 729 ns/op 175.43 MB/s
+Search_Success1_Cached_RE2/256 1000000 1133 ns/op 225.82 MB/s
+Search_Success1_Cached_RE2/512 1000000 1945 ns/op 263.23 MB/s
+Search_Success1_Cached_RE2/1K 500000 3559 ns/op 287.66 MB/s
+Search_Success1_Cached_RE2/2K 200000 6795 ns/op 301.39 MB/s
+Search_Success1_Cached_RE2/4K 100000 13266 ns/op 308.74 MB/s
+Search_Success1_Cached_RE2/8K 50000 26210 ns/op 312.54 MB/s
+Search_Success1_Cached_RE2/16K 20000 52116 ns/op 314.37 MB/s
+Search_Success1_Cached_RE2/32K 10000 104042 ns/op 314.95 MB/s
+Search_Success1_Cached_RE2/64K 5000 207904 ns/op 315.22 MB/s
+Search_Success1_Cached_RE2/128K 5000 415336 ns/op 315.58 MB/s
+Search_Success1_Cached_RE2/256K 2000 832674 ns/op 314.82 MB/s
+Search_Success1_Cached_RE2/512K 1000 1672745 ns/op 313.43 MB/s
+Search_Success1_Cached_RE2/1M 500 3376504 ns/op 310.55 MB/s
+Search_Success1_Cached_RE2/2M 200 6957405 ns/op 301.43 MB/s
+Search_Success1_Cached_RE2/4M 100 14592660 ns/op 287.43 MB/s
+Search_Success1_Cached_RE2/8M 50 31728560 ns/op 264.39 MB/s
+Search_Success1_Cached_RE2/16M 10 103598300 ns/op 161.94 MB/s
+Search_Digits_PCRE 200000 5212 ns/op
+Search_Digits_RE2 50000 21686 ns/op
+Parse_Digits_PCRE 200000 5229 ns/op
+Parse_Digits_RE2 200000 9825 ns/op
+Parse_CachedDigits_PCRE 2000000 519 ns/op
+Parse_CachedDigits_RE2 5000000 271 ns/op
+Parse_DigitDs_PCRE 500000 4224 ns/op
+Parse_DigitDs_RE2 200000 9706 ns/op
+Parse_CachedDigitDs_PCRE 2000000 505 ns/op
+Parse_CachedDigitDs_RE2 5000000 279 ns/op
+Parse_Split_PCRE 500000 3533 ns/op
+Parse_Split_RE2 100000 11256 ns/op
+Parse_CachedSplit_PCRE 5000000 373 ns/op
+Parse_CachedSplit_RE2 10000000 167 ns/op
+Parse_SplitHard_PCRE 500000 3350 ns/op
+Parse_SplitHard_RE2 100000 13959 ns/op
+Parse_CachedSplitHard_PCRE 5000000 352 ns/op
+Parse_CachedSplitHard_RE2 1000000 1780 ns/op
+Parse_CachedSplitBig1_PCRE 500 4902314 ns/op
+Parse_CachedSplitBig1_RE2 2000 674772 ns/op
+Parse_CachedSplitBig2_PCRE 2000 513858 ns/op
+Parse_CachedSplitBig2_RE2 20 52044800 ns/op
+BM_PCRE_Compile 500000 3767 ns/op
+BM_RE2_Compile 100000 10752 ns/op
+SearchPhone_CachedPCRE/8 1000000 1231 ns/op 6.50 MB/s
+SearchPhone_CachedPCRE/16 500000 2026 ns/op 7.89 MB/s
+SearchPhone_CachedPCRE/32 500000 3623 ns/op 8.83 MB/s
+SearchPhone_CachedPCRE/64 200000 6813 ns/op 9.39 MB/s
+SearchPhone_CachedPCRE/128 100000 13330 ns/op 9.60 MB/s
+SearchPhone_CachedPCRE/256 50000 25832 ns/op 9.91 MB/s
+SearchPhone_CachedPCRE/512 20000 51132 ns/op 10.01 MB/s
+SearchPhone_CachedPCRE/1K 10000 101950 ns/op 10.04 MB/s
+SearchPhone_CachedPCRE/2K 10000 199960 ns/op 10.24 MB/s
+SearchPhone_CachedPCRE/4K 5000 397105 ns/op 10.31 MB/s
+SearchPhone_CachedPCRE/8K 2000 792685 ns/op 10.33 MB/s
+SearchPhone_CachedPCRE/16K 1000 1576834 ns/op 10.39 MB/s
+SearchPhone_CachedPCRE/32K 500 3152026 ns/op 10.40 MB/s
+SearchPhone_CachedPCRE/64K 200 6293925 ns/op 10.41 MB/s
+SearchPhone_CachedPCRE/128K 100 12613350 ns/op 10.39 MB/s
+SearchPhone_CachedPCRE/256K 50 25253020 ns/op 10.38 MB/s
+SearchPhone_CachedPCRE/512K 20 50462800 ns/op 10.39 MB/s
+SearchPhone_CachedPCRE/1M 10 101412600 ns/op 10.34 MB/s
+SearchPhone_CachedPCRE/2M 5 203302200 ns/op 10.32 MB/s
+SearchPhone_CachedPCRE/4M 5 404935400 ns/op 10.36 MB/s
+SearchPhone_CachedPCRE/8M 2 810444500 ns/op 10.35 MB/s
+SearchPhone_CachedPCRE/16M 1 1615334000 ns/op 10.39 MB/s
+SearchPhone_CachedRE2/8 2000000 897 ns/op 8.91 MB/s
+SearchPhone_CachedRE2/16 2000000 928 ns/op 17.24 MB/s
+SearchPhone_CachedRE2/32 2000000 968 ns/op 33.04 MB/s
+SearchPhone_CachedRE2/64 1000000 1069 ns/op 59.84 MB/s
+SearchPhone_CachedRE2/128 1000000 1286 ns/op 99.52 MB/s
+SearchPhone_CachedRE2/256 1000000 1691 ns/op 151.31 MB/s
+SearchPhone_CachedRE2/512 500000 2496 ns/op 205.07 MB/s
+SearchPhone_CachedRE2/1K 500000 4107 ns/op 249.27 MB/s
+SearchPhone_CachedRE2/2K 200000 7347 ns/op 278.74 MB/s
+SearchPhone_CachedRE2/4K 100000 13824 ns/op 296.29 MB/s
+SearchPhone_CachedRE2/8K 50000 26758 ns/op 306.15 MB/s
+SearchPhone_CachedRE2/16K 20000 52773 ns/op 310.46 MB/s
+SearchPhone_CachedRE2/32K 10000 104775 ns/op 312.75 MB/s
+SearchPhone_CachedRE2/64K 5000 208321 ns/op 314.59 MB/s
+SearchPhone_CachedRE2/128K 5000 415436 ns/op 315.50 MB/s
+SearchPhone_CachedRE2/256K 2000 829659 ns/op 315.97 MB/s
+SearchPhone_CachedRE2/512K 1000 1658073 ns/op 316.20 MB/s
+SearchPhone_CachedRE2/1M 500 3315418 ns/op 316.27 MB/s
+SearchPhone_CachedRE2/2M 200 6645570 ns/op 315.57 MB/s
+SearchPhone_CachedRE2/4M 100 13341780 ns/op 314.37 MB/s
+SearchPhone_CachedRE2/8M 50 26722980 ns/op 313.91 MB/s
+SearchPhone_CachedRE2/16M 20 53451450 ns/op 313.88 MB/s
+EmptyPartialMatchPCRE 10000000 139 ns/op
+EmptyPartialMatchRE2 5000000 314 ns/op
+SimplePartialMatchPCRE 10000000 195 ns/op
+SimplePartialMatchRE2 5000000 352 ns/op
+HTTPPartialMatchPCRE 2000000 577 ns/op
+HTTPPartialMatchRE2 2000000 624 ns/op
+SmallHTTPPartialMatchPCRE 2000000 577 ns/op
+SmallHTTPPartialMatchRE2 2000000 622 ns/op
+DotMatchPCRE 5000000 455 ns/op
+DotMatchRE2 2000000 671 ns/op
+ASCIIMatchPCRE 5000000 400 ns/op
+ASCIIMatchRE2 2000000 676 ns/op
+==BENCHMARK== c2 Fri Feb 26 14:16:33 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# c268b421d457+ tip
+
+Search_Easy0_CachedPCRE/8 10000000 132 ns/op 60.47 MB/s
+Search_Easy0_CachedPCRE/16 10000000 160 ns/op 99.62 MB/s
+Search_Easy0_CachedPCRE/32 5000000 217 ns/op 147.10 MB/s
+Search_Easy0_CachedPCRE/64 5000000 331 ns/op 193.17 MB/s
+Search_Easy0_CachedPCRE/128 5000000 431 ns/op 296.60 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1081 ns/op 236.78 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1741 ns/op 294.06 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3393 ns/op 301.76 MB/s
+Search_Easy0_CachedPCRE/2K 200000 5681 ns/op 360.45 MB/s
+Search_Easy0_CachedPCRE/4K 100000 10623 ns/op 385.57 MB/s
+Search_Easy0_CachedPCRE/8K 50000 21776 ns/op 376.18 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42162 ns/op 388.59 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85140 ns/op 384.87 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169773 ns/op 386.02 MB/s
+Search_Easy0_CachedPCRE/128K 5000 340985 ns/op 384.39 MB/s
+Search_Easy0_CachedPCRE/256K 2000 680440 ns/op 385.26 MB/s
+Search_Easy0_CachedPCRE/512K 1000 1362919 ns/op 384.68 MB/s
+Search_Easy0_CachedPCRE/1M 500 2726382 ns/op 384.60 MB/s
+Search_Easy0_CachedPCRE/2M 200 5456280 ns/op 384.36 MB/s
+Search_Easy0_CachedPCRE/4M 100 11055420 ns/op 379.39 MB/s
+Search_Easy0_CachedPCRE/8M 50 22173320 ns/op 378.32 MB/s
+Search_Easy0_CachedPCRE/16M 50 44321260 ns/op 378.54 MB/s
+Search_Easy0_CachedRE2/8 5000000 314 ns/op 25.47 MB/s
+Search_Easy0_CachedRE2/16 5000000 315 ns/op 50.72 MB/s
+Search_Easy0_CachedRE2/32 5000000 331 ns/op 96.44 MB/s
+Search_Easy0_CachedRE2/64 5000000 332 ns/op 192.59 MB/s
+Search_Easy0_CachedRE2/128 5000000 363 ns/op 352.00 MB/s
+Search_Easy0_CachedRE2/256 5000000 389 ns/op 658.00 MB/s
+Search_Easy0_CachedRE2/512 5000000 469 ns/op 1089.76 MB/s
+Search_Easy0_CachedRE2/1K 2000000 652 ns/op 1569.80 MB/s
+Search_Easy0_CachedRE2/2K 1000000 1013 ns/op 2020.66 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1571 ns/op 2606.84 MB/s
+Search_Easy0_CachedRE2/8K 500000 2911 ns/op 2814.06 MB/s
+Search_Easy0_CachedRE2/16K 200000 5405 ns/op 3030.77 MB/s
+Search_Easy0_CachedRE2/32K 100000 10989 ns/op 2981.79 MB/s
+Search_Easy0_CachedRE2/64K 50000 21839 ns/op 3000.77 MB/s
+Search_Easy0_CachedRE2/128K 50000 44376 ns/op 2953.66 MB/s
+Search_Easy0_CachedRE2/256K 20000 88364 ns/op 2966.64 MB/s
+Search_Easy0_CachedRE2/512K 10000 177685 ns/op 2950.64 MB/s
+Search_Easy0_CachedRE2/1M 5000 356602 ns/op 2940.46 MB/s
+Search_Easy0_CachedRE2/2M 2000 715631 ns/op 2930.49 MB/s
+Search_Easy0_CachedRE2/4M 1000 1529594 ns/op 2742.10 MB/s
+Search_Easy0_CachedRE2/8M 500 3089266 ns/op 2715.40 MB/s
+Search_Easy0_CachedRE2/16M 200 6153925 ns/op 2726.26 MB/s
+Search_Easy1_CachedPCRE/8 10000000 132 ns/op 60.48 MB/s
+Search_Easy1_CachedPCRE/16 10000000 160 ns/op 99.52 MB/s
+Search_Easy1_CachedPCRE/32 5000000 217 ns/op 147.11 MB/s
+Search_Easy1_CachedPCRE/64 5000000 331 ns/op 193.20 MB/s
+Search_Easy1_CachedPCRE/128 2000000 648 ns/op 197.44 MB/s
+Search_Easy1_CachedPCRE/256 2000000 935 ns/op 273.76 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1966 ns/op 260.32 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3418 ns/op 299.53 MB/s
+Search_Easy1_CachedPCRE/2K 200000 6237 ns/op 328.33 MB/s
+Search_Easy1_CachedPCRE/4K 100000 11125 ns/op 368.18 MB/s
+Search_Easy1_CachedPCRE/8K 50000 22022 ns/op 371.98 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42402 ns/op 386.39 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85237 ns/op 384.43 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170201 ns/op 385.05 MB/s
+Search_Easy1_CachedPCRE/128K 5000 342009 ns/op 383.24 MB/s
+Search_Easy1_CachedPCRE/256K 2000 682201 ns/op 384.26 MB/s
+Search_Easy1_CachedPCRE/512K 1000 1366471 ns/op 383.68 MB/s
+Search_Easy1_CachedPCRE/1M 500 2735128 ns/op 383.37 MB/s
+Search_Easy1_CachedPCRE/2M 200 5471205 ns/op 383.31 MB/s
+Search_Easy1_CachedPCRE/4M 100 11093340 ns/op 378.09 MB/s
+Search_Easy1_CachedPCRE/8M 50 22240420 ns/op 377.18 MB/s
+Search_Easy1_CachedPCRE/16M 50 44464400 ns/op 377.32 MB/s
+Search_Easy1_CachedRE2/8 5000000 316 ns/op 25.27 MB/s
+Search_Easy1_CachedRE2/16 5000000 317 ns/op 50.44 MB/s
+Search_Easy1_CachedRE2/32 5000000 330 ns/op 96.79 MB/s
+Search_Easy1_CachedRE2/64 5000000 334 ns/op 191.06 MB/s
+Search_Easy1_CachedRE2/128 5000000 365 ns/op 350.44 MB/s
+Search_Easy1_CachedRE2/256 5000000 400 ns/op 639.30 MB/s
+Search_Easy1_CachedRE2/512 5000000 472 ns/op 1083.02 MB/s
+Search_Easy1_CachedRE2/1K 2000000 652 ns/op 1570.19 MB/s
+Search_Easy1_CachedRE2/2K 1000000 1002 ns/op 2043.19 MB/s
+Search_Easy1_CachedRE2/4K 1000000 1576 ns/op 2598.26 MB/s
+Search_Easy1_CachedRE2/8K 500000 2924 ns/op 2801.57 MB/s
+Search_Easy1_CachedRE2/16K 200000 5449 ns/op 3006.54 MB/s
+Search_Easy1_CachedRE2/32K 100000 10985 ns/op 2982.90 MB/s
+Search_Easy1_CachedRE2/64K 50000 21837 ns/op 3001.13 MB/s
+Search_Easy1_CachedRE2/128K 50000 44336 ns/op 2956.31 MB/s
+Search_Easy1_CachedRE2/256K 20000 88350 ns/op 2967.08 MB/s
+Search_Easy1_CachedRE2/512K 10000 177698 ns/op 2950.43 MB/s
+Search_Easy1_CachedRE2/1M 5000 356645 ns/op 2940.11 MB/s
+Search_Easy1_CachedRE2/2M 2000 715710 ns/op 2930.17 MB/s
+Search_Easy1_CachedRE2/4M 1000 1529932 ns/op 2741.50 MB/s
+Search_Easy1_CachedRE2/8M 500 3087586 ns/op 2716.88 MB/s
+Search_Easy1_CachedRE2/16M 200 6155690 ns/op 2725.48 MB/s
+Search_Medium_CachedPCRE/8 10000000 133 ns/op 59.81 MB/s
+Search_Medium_CachedPCRE/16 10000000 162 ns/op 98.58 MB/s
+Search_Medium_CachedPCRE/32 5000000 219 ns/op 145.96 MB/s
+Search_Medium_CachedPCRE/64 5000000 340 ns/op 188.00 MB/s
+Search_Medium_CachedPCRE/128 5000000 434 ns/op 294.67 MB/s
+Search_Medium_CachedPCRE/256 200000 9076 ns/op 28.20 MB/s
+Search_Medium_CachedPCRE/512 50000 21579 ns/op 23.73 MB/s
+Search_Medium_CachedPCRE/1K 50000 42391 ns/op 24.16 MB/s
+Search_Medium_CachedPCRE/2K 20000 62367 ns/op 32.84 MB/s
+Search_Medium_CachedPCRE/4K 10000 153667 ns/op 26.66 MB/s
+Search_Medium_CachedPCRE/8K 5000 332606 ns/op 24.63 MB/s
+Search_Medium_CachedPCRE/16K 2000 677805 ns/op 24.17 MB/s
+Search_Medium_CachedPCRE/32K 1000 1355730 ns/op 24.17 MB/s
+Search_Medium_CachedPCRE/64K 500 2707474 ns/op 24.21 MB/s
+Search_Medium_CachedPCRE/128K 200 5409525 ns/op 24.23 MB/s
+Search_Medium_CachedPCRE/256K 100 10821290 ns/op 24.22 MB/s
+Search_Medium_CachedRE2/8 5000000 335 ns/op 23.87 MB/s
+Search_Medium_CachedRE2/16 5000000 362 ns/op 44.16 MB/s
+Search_Medium_CachedRE2/32 5000000 408 ns/op 78.36 MB/s
+Search_Medium_CachedRE2/64 2000000 510 ns/op 125.32 MB/s
+Search_Medium_CachedRE2/128 2000000 723 ns/op 176.99 MB/s
+Search_Medium_CachedRE2/256 1000000 1125 ns/op 227.47 MB/s
+Search_Medium_CachedRE2/512 1000000 1935 ns/op 264.50 MB/s
+Search_Medium_CachedRE2/1K 500000 3553 ns/op 288.20 MB/s
+Search_Medium_CachedRE2/2K 200000 6794 ns/op 301.41 MB/s
+Search_Medium_CachedRE2/4K 100000 13257 ns/op 308.96 MB/s
+Search_Medium_CachedRE2/8K 50000 26198 ns/op 312.69 MB/s
+Search_Medium_CachedRE2/16K 20000 52087 ns/op 314.55 MB/s
+Search_Medium_CachedRE2/32K 10000 103942 ns/op 315.25 MB/s
+Search_Medium_CachedRE2/64K 5000 207481 ns/op 315.86 MB/s
+Search_Medium_CachedRE2/128K 5000 414561 ns/op 316.17 MB/s
+Search_Medium_CachedRE2/256K 2000 828789 ns/op 316.30 MB/s
+Search_Medium_CachedRE2/512K 1000 1657133 ns/op 316.38 MB/s
+Search_Medium_CachedRE2/1M 500 3314164 ns/op 316.39 MB/s
+Search_Medium_CachedRE2/2M 200 6632795 ns/op 316.18 MB/s
+Search_Medium_CachedRE2/4M 100 13340680 ns/op 314.40 MB/s
+Search_Medium_CachedRE2/8M 50 26721100 ns/op 313.93 MB/s
+Search_Medium_CachedRE2/16M 20 53443050 ns/op 313.93 MB/s
+Search_Hard_CachedPCRE/8 10000000 133 ns/op 59.77 MB/s
+Search_Hard_CachedPCRE/16 10000000 162 ns/op 98.62 MB/s
+Search_Hard_CachedPCRE/32 5000000 219 ns/op 145.97 MB/s
+Search_Hard_CachedPCRE/64 5000000 340 ns/op 188.06 MB/s
+Search_Hard_CachedPCRE/128 5000000 434 ns/op 294.69 MB/s
+Search_Hard_CachedPCRE/256 2000 573267 ns/op 0.45 MB/s
+Search_Hard_CachedPCRE/512 500 2347118 ns/op 0.22 MB/s
+Search_Hard_CachedPCRE/1K 200 9316730 ns/op 0.11 MB/s
+Search_Hard_CachedPCRE/2K 50 34064460 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/4K 10 146725200 ns/op 0.03 MB/s
+Search_Hard_CachedRE2/8 5000000 335 ns/op 23.87 MB/s
+Search_Hard_CachedRE2/16 5000000 363 ns/op 44.03 MB/s
+Search_Hard_CachedRE2/32 5000000 411 ns/op 77.80 MB/s
+Search_Hard_CachedRE2/64 2000000 510 ns/op 125.28 MB/s
+Search_Hard_CachedRE2/128 2000000 720 ns/op 177.74 MB/s
+Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.38 MB/s
+Search_Hard_CachedRE2/512 1000000 1936 ns/op 264.45 MB/s
+Search_Hard_CachedRE2/1K 500000 3552 ns/op 288.25 MB/s
+Search_Hard_CachedRE2/2K 200000 6794 ns/op 301.41 MB/s
+Search_Hard_CachedRE2/4K 100000 13257 ns/op 308.96 MB/s
+Search_Hard_CachedRE2/8K 50000 26201 ns/op 312.66 MB/s
+Search_Hard_CachedRE2/16K 20000 52089 ns/op 314.53 MB/s
+Search_Hard_CachedRE2/32K 10000 103959 ns/op 315.20 MB/s
+Search_Hard_CachedRE2/64K 5000 207483 ns/op 315.86 MB/s
+Search_Hard_CachedRE2/128K 5000 414583 ns/op 316.15 MB/s
+Search_Hard_CachedRE2/256K 2000 828720 ns/op 316.32 MB/s
+Search_Hard_CachedRE2/512K 1000 1657121 ns/op 316.38 MB/s
+Search_Hard_CachedRE2/1M 500 3314102 ns/op 316.40 MB/s
+Search_Hard_CachedRE2/2M 200 6632065 ns/op 316.21 MB/s
+Search_Hard_CachedRE2/4M 100 13339990 ns/op 314.42 MB/s
+Search_Hard_CachedRE2/8M 50 26721960 ns/op 313.92 MB/s
+Search_Hard_CachedRE2/16M 20 53440900 ns/op 313.94 MB/s
+Search_Parens_CachedPCRE/8 10000000 197 ns/op 40.42 MB/s
+Search_Parens_CachedRE2/8 5000000 334 ns/op 23.90 MB/s
+Search_Parens_CachedRE2/16 5000000 359 ns/op 44.46 MB/s
+Search_Parens_CachedRE2/32 5000000 413 ns/op 77.42 MB/s
+Search_Parens_CachedRE2/64 2000000 511 ns/op 125.07 MB/s
+Search_Parens_CachedRE2/128 2000000 722 ns/op 177.10 MB/s
+Search_Parens_CachedRE2/256 1000000 1128 ns/op 226.81 MB/s
+Search_Parens_CachedRE2/512 1000000 1935 ns/op 264.47 MB/s
+Search_Parens_CachedRE2/1K 500000 3561 ns/op 287.49 MB/s
+Search_Parens_CachedRE2/2K 200000 6787 ns/op 301.72 MB/s
+Search_Parens_CachedRE2/4K 100000 13262 ns/op 308.84 MB/s
+Search_Parens_CachedRE2/8K 50000 26204 ns/op 312.61 MB/s
+Search_Parens_CachedRE2/16K 20000 52095 ns/op 314.50 MB/s
+Search_Parens_CachedRE2/32K 10000 103945 ns/op 315.24 MB/s
+Search_Parens_CachedRE2/64K 5000 207517 ns/op 315.81 MB/s
+Search_Parens_CachedRE2/128K 5000 414628 ns/op 316.12 MB/s
+Search_Parens_CachedRE2/256K 2000 828799 ns/op 316.29 MB/s
+Search_Parens_CachedRE2/512K 1000 1657224 ns/op 316.37 MB/s
+Search_Parens_CachedRE2/1M 500 3314264 ns/op 316.38 MB/s
+Search_Parens_CachedRE2/2M 200 6633485 ns/op 316.15 MB/s
+Search_Parens_CachedRE2/4M 100 13340780 ns/op 314.40 MB/s
+Search_Parens_CachedRE2/8M 50 26719280 ns/op 313.95 MB/s
+Search_Parens_CachedRE2/16M 20 53447850 ns/op 313.90 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 242 ns/op 32.93 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 301 ns/op 53.06 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 418 ns/op 76.48 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 652 ns/op 98.09 MB/s
+Search_BigFixed_CachedPCRE/128 2000000 985 ns/op 129.90 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 1775 ns/op 144.21 MB/s
+Search_BigFixed_CachedPCRE/512 500000 3342 ns/op 153.19 MB/s
+Search_BigFixed_CachedPCRE/1K 200000 6476 ns/op 158.12 MB/s
+Search_BigFixed_CachedPCRE/2K 100000 12745 ns/op 160.68 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 25284 ns/op 162.00 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 50366 ns/op 162.65 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 100603 ns/op 162.86 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 201124 ns/op 162.92 MB/s
+Search_BigFixed_CachedRE2/8 10000000 130 ns/op 61.36 MB/s
+Search_BigFixed_CachedRE2/16 5000000 375 ns/op 42.64 MB/s
+Search_BigFixed_CachedRE2/32 5000000 407 ns/op 78.61 MB/s
+Search_BigFixed_CachedRE2/64 5000000 486 ns/op 131.63 MB/s
+Search_BigFixed_CachedRE2/128 2000000 630 ns/op 203.08 MB/s
+Search_BigFixed_CachedRE2/256 2000000 945 ns/op 270.64 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1547 ns/op 330.90 MB/s
+Search_BigFixed_CachedRE2/1K 500000 2765 ns/op 370.29 MB/s
+Search_BigFixed_CachedRE2/2K 200000 5187 ns/op 394.77 MB/s
+Search_BigFixed_CachedRE2/4K 100000 10045 ns/op 407.74 MB/s
+Search_BigFixed_CachedRE2/8K 100000 19754 ns/op 414.68 MB/s
+Search_BigFixed_CachedRE2/16K 50000 39160 ns/op 418.39 MB/s
+Search_BigFixed_CachedRE2/32K 20000 78097 ns/op 419.58 MB/s
+Search_BigFixed_CachedRE2/64K 10000 155858 ns/op 420.48 MB/s
+Search_BigFixed_CachedRE2/128K 5000 311449 ns/op 420.85 MB/s
+Search_BigFixed_CachedRE2/256K 2000 623620 ns/op 420.36 MB/s
+Search_BigFixed_CachedRE2/512K 1000 1250862 ns/op 419.14 MB/s
+Search_BigFixed_CachedRE2/1M 500 2517654 ns/op 416.49 MB/s
+Search_Success_PCRE/8 1000000 1812 ns/op 4.41 MB/s
+Search_Success_PCRE/16 1000000 1852 ns/op 8.64 MB/s
+Search_Success_PCRE/32 1000000 1935 ns/op 16.53 MB/s
+Search_Success_PCRE/64 500000 2130 ns/op 30.04 MB/s
+Search_Success_PCRE/128 500000 2480 ns/op 51.61 MB/s
+Search_Success_PCRE/256 500000 3190 ns/op 80.25 MB/s
+Search_Success_PCRE/512 500000 4611 ns/op 111.02 MB/s
+Search_Success_PCRE/1K 200000 7430 ns/op 137.80 MB/s
+Search_Success_PCRE/2K 100000 13072 ns/op 156.66 MB/s
+Search_Success_PCRE/4K 50000 24385 ns/op 167.97 MB/s
+Search_Success_PCRE/8K 50000 47046 ns/op 174.13 MB/s
+Search_Success_PCRE/16K 20000 92417 ns/op 177.28 MB/s
+Search_Success_PCRE/32K 10000 183262 ns/op 178.80 MB/s
+Search_Success_PCRE/64K 5000 364683 ns/op 179.71 MB/s
+Search_Success_PCRE/128K 2000 728298 ns/op 179.97 MB/s
+Search_Success_PCRE/256K 1000 1457823 ns/op 179.82 MB/s
+Search_Success_PCRE/512K 500 2926208 ns/op 179.17 MB/s
+Search_Success_PCRE/1M 200 5926520 ns/op 176.93 MB/s
+Search_Success_PCRE/2M 100 12118480 ns/op 173.05 MB/s
+Search_Success_PCRE/4M 50 25402020 ns/op 165.12 MB/s
+Search_Success_PCRE/8M 20 56959600 ns/op 147.27 MB/s
+Search_Success_PCRE/16M 10 134219200 ns/op 125.00 MB/s
+Search_Success_RE2/8 200000 8371 ns/op 0.96 MB/s
+Search_Success_RE2/16 100000 19886 ns/op 0.80 MB/s
+Search_Success_RE2/32 100000 19774 ns/op 1.62 MB/s
+Search_Success_RE2/64 50000 20190 ns/op 3.17 MB/s
+Search_Success_RE2/128 50000 20169 ns/op 6.35 MB/s
+Search_Success_RE2/256 50000 20632 ns/op 12.41 MB/s
+Search_Success_RE2/512 50000 21598 ns/op 23.71 MB/s
+Search_Success_RE2/1K 50000 23051 ns/op 44.42 MB/s
+Search_Success_RE2/2K 50000 26258 ns/op 77.99 MB/s
+Search_Success_RE2/4K 50000 32804 ns/op 124.86 MB/s
+Search_Success_RE2/8K 50000 45835 ns/op 178.73 MB/s
+Search_Success_RE2/16K 20000 71685 ns/op 228.55 MB/s
+Search_Success_RE2/32K 10000 123817 ns/op 264.65 MB/s
+Search_Success_RE2/64K 5000 227706 ns/op 287.81 MB/s
+Search_Success_RE2/128K 5000 435094 ns/op 301.25 MB/s
+Search_Success_RE2/256K 2000 851813 ns/op 307.75 MB/s
+Search_Success_RE2/512K 1000 1689866 ns/op 310.25 MB/s
+Search_Success_RE2/1M 500 3385158 ns/op 309.76 MB/s
+Search_Success_RE2/2M 200 6914280 ns/op 303.31 MB/s
+Search_Success_RE2/4M 100 14404490 ns/op 291.18 MB/s
+Search_Success_RE2/8M 50 30838520 ns/op 272.02 MB/s
+Search_Success_RE2/16M 10 7977066800 ns/op 2.10 MB/s
+Search_Success_CachedPCRE/8 5000000 247 ns/op 32.27 MB/s
+Search_Success_CachedPCRE/16 5000000 289 ns/op 55.29 MB/s
+Search_Success_CachedPCRE/32 5000000 396 ns/op 80.68 MB/s
+Search_Success_CachedPCRE/64 2000000 611 ns/op 104.66 MB/s
+Search_Success_CachedPCRE/128 2000000 1760 ns/op 72.71 MB/s
+==BENCHMARK== c2 Fri Feb 26 14:31:16 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# c268b421d457+ tip
+
+Search_Easy0_CachedPCRE/8 10000000 131 ns/op 61.07 MB/s
+Search_Easy0_CachedPCRE/16 10000000 159 ns/op 100.32 MB/s
+Search_Easy0_CachedPCRE/32 5000000 216 ns/op 147.77 MB/s
+Search_Easy0_CachedPCRE/64 5000000 330 ns/op 193.83 MB/s
+Search_Easy0_CachedPCRE/128 5000000 430 ns/op 297.34 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1080 ns/op 236.97 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1739 ns/op 294.27 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3389 ns/op 302.07 MB/s
+Search_Easy0_CachedPCRE/2K 200000 5681 ns/op 360.45 MB/s
+Search_Easy0_CachedPCRE/4K 100000 10629 ns/op 385.34 MB/s
+Search_Easy0_CachedPCRE/8K 50000 21776 ns/op 376.18 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42192 ns/op 388.32 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85172 ns/op 384.73 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169816 ns/op 385.92 MB/s
+Search_Easy0_CachedPCRE/128K 5000 341102 ns/op 384.26 MB/s
+Search_Easy0_CachedPCRE/256K 2000 680651 ns/op 385.14 MB/s
+Search_Easy0_CachedPCRE/512K 1000 1362954 ns/op 384.67 MB/s
+Search_Easy0_CachedPCRE/1M 500 2726140 ns/op 384.64 MB/s
+Search_Easy0_CachedPCRE/2M 200 5463185 ns/op 383.87 MB/s
+Search_Easy0_CachedPCRE/4M 100 11055500 ns/op 379.39 MB/s
+Search_Easy0_CachedPCRE/8M 50 22168840 ns/op 378.40 MB/s
+Search_Easy0_CachedPCRE/16M 50 44330340 ns/op 378.46 MB/s
+Search_Easy0_CachedRE2/8 5000000 318 ns/op 25.09 MB/s
+Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.33 MB/s
+Search_Easy0_CachedRE2/32 5000000 341 ns/op 93.71 MB/s
+Search_Easy0_CachedRE2/64 5000000 350 ns/op 182.71 MB/s
+Search_Easy0_CachedRE2/128 5000000 383 ns/op 333.80 MB/s
+Search_Easy0_CachedRE2/256 5000000 401 ns/op 636.89 MB/s
+Search_Easy0_CachedRE2/512 5000000 483 ns/op 1058.27 MB/s
+Search_Easy0_CachedRE2/1K 2000000 672 ns/op 1523.32 MB/s
+Search_Easy0_CachedRE2/2K 1000000 1023 ns/op 2000.24 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1597 ns/op 2564.44 MB/s
+Search_Easy0_CachedRE2/8K 500000 2918 ns/op 2807.09 MB/s
+Search_Easy0_CachedRE2/16K 200000 5429 ns/op 3017.39 MB/s
+Search_Easy0_CachedRE2/32K 100000 11045 ns/op 2966.75 MB/s
+Search_Easy0_CachedRE2/64K 50000 21873 ns/op 2996.08 MB/s
+Search_Easy0_CachedRE2/128K 50000 44398 ns/op 2952.16 MB/s
+Search_Easy0_CachedRE2/256K 20000 88429 ns/op 2964.44 MB/s
+Search_Easy0_CachedRE2/512K 10000 177688 ns/op 2950.60 MB/s
+Search_Easy0_CachedRE2/1M 5000 356798 ns/op 2938.84 MB/s
+Search_Easy0_CachedRE2/2M 2000 721040 ns/op 2908.51 MB/s
+Search_Easy0_CachedRE2/4M 1000 1526733 ns/op 2747.24 MB/s
+Search_Easy0_CachedRE2/8M 500 3085732 ns/op 2718.51 MB/s
+Search_Easy0_CachedRE2/16M 200 6155395 ns/op 2725.61 MB/s
+Search_Easy1_CachedPCRE/8 10000000 131 ns/op 60.93 MB/s
+Search_Easy1_CachedPCRE/16 10000000 159 ns/op 100.31 MB/s
+Search_Easy1_CachedPCRE/32 5000000 216 ns/op 147.91 MB/s
+Search_Easy1_CachedPCRE/64 5000000 330 ns/op 193.81 MB/s
+Search_Easy1_CachedPCRE/128 2000000 647 ns/op 197.81 MB/s
+Search_Easy1_CachedPCRE/256 2000000 933 ns/op 274.18 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1969 ns/op 260.02 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3440 ns/op 297.67 MB/s
+Search_Easy1_CachedPCRE/2K 200000 6230 ns/op 328.73 MB/s
+Search_Easy1_CachedPCRE/4K 100000 11116 ns/op 368.45 MB/s
+Search_Easy1_CachedPCRE/8K 50000 22010 ns/op 372.19 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42395 ns/op 386.45 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85210 ns/op 384.55 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170224 ns/op 385.00 MB/s
+Search_Easy1_CachedPCRE/128K 5000 342017 ns/op 383.23 MB/s
+Search_Easy1_CachedPCRE/256K 2000 682168 ns/op 384.28 MB/s
+Search_Easy1_CachedPCRE/512K 1000 1366582 ns/op 383.65 MB/s
+Search_Easy1_CachedPCRE/1M 500 2735192 ns/op 383.36 MB/s
+Search_Easy1_CachedPCRE/2M 200 5480130 ns/op 382.68 MB/s
+Search_Easy1_CachedPCRE/4M 100 11087200 ns/op 378.30 MB/s
+Search_Easy1_CachedPCRE/8M 50 22238640 ns/op 377.21 MB/s
+Search_Easy1_CachedPCRE/16M 50 44462340 ns/op 377.34 MB/s
+Search_Easy1_CachedRE2/8 5000000 318 ns/op 25.09 MB/s
+Search_Easy1_CachedRE2/16 5000000 317 ns/op 50.36 MB/s
+Search_Easy1_CachedRE2/32 5000000 345 ns/op 92.55 MB/s
+Search_Easy1_CachedRE2/64 5000000 350 ns/op 182.79 MB/s
+Search_Easy1_CachedRE2/128 5000000 385 ns/op 331.75 MB/s
+Search_Easy1_CachedRE2/256 5000000 408 ns/op 626.83 MB/s
+Search_Easy1_CachedRE2/512 5000000 484 ns/op 1056.72 MB/s
+Search_Easy1_CachedRE2/1K 2000000 676 ns/op 1513.66 MB/s
+Search_Easy1_CachedRE2/2K 1000000 1020 ns/op 2007.55 MB/s
+Search_Easy1_CachedRE2/4K 1000000 1596 ns/op 2564.98 MB/s
+Search_Easy1_CachedRE2/8K 500000 2918 ns/op 2806.79 MB/s
+Search_Easy1_CachedRE2/16K 200000 5447 ns/op 3007.74 MB/s
+Search_Easy1_CachedRE2/32K 100000 11037 ns/op 2968.84 MB/s
+Search_Easy1_CachedRE2/64K 50000 21863 ns/op 2997.48 MB/s
+Search_Easy1_CachedRE2/128K 50000 44394 ns/op 2952.41 MB/s
+Search_Easy1_CachedRE2/256K 20000 88430 ns/op 2964.42 MB/s
+Search_Easy1_CachedRE2/512K 10000 177661 ns/op 2951.06 MB/s
+Search_Easy1_CachedRE2/1M 5000 356783 ns/op 2938.97 MB/s
+Search_Easy1_CachedRE2/2M 2000 721013 ns/op 2908.62 MB/s
+Search_Easy1_CachedRE2/4M 1000 1526313 ns/op 2748.00 MB/s
+Search_Easy1_CachedRE2/8M 500 3085670 ns/op 2718.57 MB/s
+Search_Easy1_CachedRE2/16M 200 6156380 ns/op 2725.18 MB/s
+Search_Medium_CachedPCRE/8 10000000 132 ns/op 60.24 MB/s
+Search_Medium_CachedPCRE/16 10000000 161 ns/op 99.22 MB/s
+Search_Medium_CachedPCRE/32 5000000 218 ns/op 146.72 MB/s
+Search_Medium_CachedPCRE/64 5000000 339 ns/op 188.54 MB/s
+Search_Medium_CachedPCRE/128 5000000 433 ns/op 295.45 MB/s
+Search_Medium_CachedPCRE/256 200000 9074 ns/op 28.21 MB/s
+Search_Medium_CachedPCRE/512 50000 21580 ns/op 23.73 MB/s
+Search_Medium_CachedPCRE/1K 50000 45469 ns/op 22.52 MB/s
+Search_Medium_CachedPCRE/2K 20000 62384 ns/op 32.83 MB/s
+Search_Medium_CachedPCRE/4K 10000 153718 ns/op 26.65 MB/s
+Search_Medium_CachedPCRE/8K 5000 332814 ns/op 24.61 MB/s
+Search_Medium_CachedPCRE/16K 2000 678531 ns/op 24.15 MB/s
+Search_Medium_CachedPCRE/32K 1000 1356201 ns/op 24.16 MB/s
+Search_Medium_CachedPCRE/64K 500 2708792 ns/op 24.19 MB/s
+Search_Medium_CachedPCRE/128K 200 5412745 ns/op 24.22 MB/s
+Search_Medium_CachedPCRE/256K 100 10830430 ns/op 24.20 MB/s
+Search_Medium_CachedRE2/8 5000000 326 ns/op 24.47 MB/s
+Search_Medium_CachedRE2/16 5000000 363 ns/op 43.98 MB/s
+Search_Medium_CachedRE2/32 5000000 412 ns/op 77.57 MB/s
+Search_Medium_CachedRE2/64 2000000 506 ns/op 126.31 MB/s
+Search_Medium_CachedRE2/128 2000000 715 ns/op 178.94 MB/s
+Search_Medium_CachedRE2/256 1000000 1119 ns/op 228.65 MB/s
+Search_Medium_CachedRE2/512 1000000 1928 ns/op 265.47 MB/s
+Search_Medium_CachedRE2/1K 500000 3546 ns/op 288.75 MB/s
+Search_Medium_CachedRE2/2K 200000 6782 ns/op 301.97 MB/s
+Search_Medium_CachedRE2/4K 100000 13257 ns/op 308.95 MB/s
+Search_Medium_CachedRE2/8K 50000 26197 ns/op 312.70 MB/s
+Search_Medium_CachedRE2/16K 20000 52081 ns/op 314.58 MB/s
+Search_Medium_CachedRE2/32K 10000 103926 ns/op 315.30 MB/s
+Search_Medium_CachedRE2/64K 5000 207484 ns/op 315.86 MB/s
+Search_Medium_CachedRE2/128K 5000 414545 ns/op 316.18 MB/s
+Search_Medium_CachedRE2/256K 2000 828791 ns/op 316.30 MB/s
+Search_Medium_CachedRE2/512K 1000 1657160 ns/op 316.38 MB/s
+Search_Medium_CachedRE2/1M 500 3314254 ns/op 316.38 MB/s
+Search_Medium_CachedRE2/2M 200 6636905 ns/op 315.98 MB/s
+Search_Medium_CachedRE2/4M 100 13339080 ns/op 314.44 MB/s
+Search_Medium_CachedRE2/8M 50 26718900 ns/op 313.96 MB/s
+Search_Medium_CachedRE2/16M 20 53442000 ns/op 313.93 MB/s
+Search_Hard_CachedPCRE/8 10000000 132 ns/op 60.21 MB/s
+Search_Hard_CachedPCRE/16 10000000 161 ns/op 99.25 MB/s
+Search_Hard_CachedPCRE/32 5000000 218 ns/op 146.67 MB/s
+Search_Hard_CachedPCRE/64 5000000 339 ns/op 188.62 MB/s
+Search_Hard_CachedPCRE/128 5000000 433 ns/op 295.34 MB/s
+Search_Hard_CachedPCRE/256 2000 573612 ns/op 0.45 MB/s
+Search_Hard_CachedPCRE/512 500 2344764 ns/op 0.22 MB/s
+Search_Hard_CachedPCRE/1K 200 9311170 ns/op 0.11 MB/s
+Search_Hard_CachedPCRE/2K 50 34066500 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/4K 10 146643800 ns/op 0.03 MB/s
+Search_Hard_CachedRE2/8 5000000 333 ns/op 23.98 MB/s
+Search_Hard_CachedRE2/16 5000000 358 ns/op 44.62 MB/s
+Search_Hard_CachedRE2/32 5000000 408 ns/op 78.26 MB/s
+Search_Hard_CachedRE2/64 2000000 509 ns/op 125.53 MB/s
+Search_Hard_CachedRE2/128 2000000 717 ns/op 178.52 MB/s
+Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.48 MB/s
+Search_Hard_CachedRE2/512 1000000 1929 ns/op 265.34 MB/s
+Search_Hard_CachedRE2/1K 500000 3547 ns/op 288.63 MB/s
+Search_Hard_CachedRE2/2K 200000 6782 ns/op 301.97 MB/s
+Search_Hard_CachedRE2/4K 100000 13254 ns/op 309.02 MB/s
+Search_Hard_CachedRE2/8K 50000 26193 ns/op 312.74 MB/s
+Search_Hard_CachedRE2/16K 20000 52077 ns/op 314.61 MB/s
+Search_Hard_CachedRE2/32K 10000 103944 ns/op 315.25 MB/s
+Search_Hard_CachedRE2/64K 5000 207487 ns/op 315.86 MB/s
+Search_Hard_CachedRE2/128K 5000 414578 ns/op 316.16 MB/s
+Search_Hard_CachedRE2/256K 2000 828793 ns/op 316.30 MB/s
+Search_Hard_CachedRE2/512K 1000 1657164 ns/op 316.38 MB/s
+Search_Hard_CachedRE2/1M 500 3314178 ns/op 316.39 MB/s
+Search_Hard_CachedRE2/2M 200 6636585 ns/op 316.00 MB/s
+Search_Hard_CachedRE2/4M 100 13339310 ns/op 314.43 MB/s
+Search_Hard_CachedRE2/8M 50 26720420 ns/op 313.94 MB/s
+Search_Hard_CachedRE2/16M 20 53443250 ns/op 313.93 MB/s
+Search_Parens_CachedPCRE/8 10000000 196 ns/op 40.66 MB/s
+Search_Parens_CachedRE2/8 5000000 331 ns/op 24.15 MB/s
+Search_Parens_CachedRE2/16 5000000 359 ns/op 44.46 MB/s
+Search_Parens_CachedRE2/32 5000000 409 ns/op 78.05 MB/s
+Search_Parens_CachedRE2/64 2000000 509 ns/op 125.63 MB/s
+Search_Parens_CachedRE2/128 2000000 720 ns/op 177.69 MB/s
+Search_Parens_CachedRE2/256 1000000 1127 ns/op 226.97 MB/s
+Search_Parens_CachedRE2/512 1000000 1937 ns/op 264.32 MB/s
+Search_Parens_CachedRE2/1K 500000 3547 ns/op 288.65 MB/s
+Search_Parens_CachedRE2/2K 200000 6784 ns/op 301.88 MB/s
+Search_Parens_CachedRE2/4K 100000 13253 ns/op 309.05 MB/s
+Search_Parens_CachedRE2/8K 50000 26195 ns/op 312.73 MB/s
+Search_Parens_CachedRE2/16K 20000 52085 ns/op 314.56 MB/s
+Search_Parens_CachedRE2/32K 10000 103948 ns/op 315.23 MB/s
+Search_Parens_CachedRE2/64K 5000 207519 ns/op 315.81 MB/s
+Search_Parens_CachedRE2/128K 5000 414605 ns/op 316.14 MB/s
+Search_Parens_CachedRE2/256K 2000 828800 ns/op 316.29 MB/s
+Search_Parens_CachedRE2/512K 1000 1657191 ns/op 316.37 MB/s
+Search_Parens_CachedRE2/1M 500 3314252 ns/op 316.38 MB/s
+Search_Parens_CachedRE2/2M 200 6637005 ns/op 315.98 MB/s
+Search_Parens_CachedRE2/4M 100 13338840 ns/op 314.44 MB/s
+Search_Parens_CachedRE2/8M 50 26718340 ns/op 313.96 MB/s
+Search_Parens_CachedRE2/16M 20 53436450 ns/op 313.97 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 242 ns/op 32.94 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 301 ns/op 53.07 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 418 ns/op 76.50 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 652 ns/op 98.14 MB/s
+Search_BigFixed_CachedPCRE/128 2000000 985 ns/op 129.90 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 1775 ns/op 144.21 MB/s
+Search_BigFixed_CachedPCRE/512 500000 3342 ns/op 153.17 MB/s
+Search_BigFixed_CachedPCRE/1K 200000 6476 ns/op 158.12 MB/s
+Search_BigFixed_CachedPCRE/2K 100000 12746 ns/op 160.68 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 25285 ns/op 161.99 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 50367 ns/op 162.64 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 100611 ns/op 162.84 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 201128 ns/op 162.92 MB/s
+Search_BigFixed_CachedRE2/8 10000000 130 ns/op 61.50 MB/s
+Search_BigFixed_CachedRE2/16 5000000 373 ns/op 42.81 MB/s
+Search_BigFixed_CachedRE2/32 5000000 406 ns/op 78.69 MB/s
+Search_BigFixed_CachedRE2/64 5000000 485 ns/op 131.89 MB/s
+Search_BigFixed_CachedRE2/128 2000000 630 ns/op 203.11 MB/s
+Search_BigFixed_CachedRE2/256 2000000 949 ns/op 269.70 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1547 ns/op 330.96 MB/s
+Search_BigFixed_CachedRE2/1K 500000 2765 ns/op 370.28 MB/s
+Search_BigFixed_CachedRE2/2K 200000 5186 ns/op 394.84 MB/s
+Search_BigFixed_CachedRE2/4K 100000 10045 ns/op 407.74 MB/s
+Search_BigFixed_CachedRE2/8K 100000 19751 ns/op 414.75 MB/s
+Search_BigFixed_CachedRE2/16K 50000 39158 ns/op 418.41 MB/s
+Search_BigFixed_CachedRE2/32K 20000 78112 ns/op 419.50 MB/s
+Search_BigFixed_CachedRE2/64K 10000 155876 ns/op 420.44 MB/s
+Search_BigFixed_CachedRE2/128K 5000 311462 ns/op 420.83 MB/s
+Search_BigFixed_CachedRE2/256K 2000 623684 ns/op 420.32 MB/s
+Search_BigFixed_CachedRE2/512K 1000 1251098 ns/op 419.06 MB/s
+Search_BigFixed_CachedRE2/1M 500 2517996 ns/op 416.43 MB/s
+Search_Success_PCRE/8 1000000 1816 ns/op 4.40 MB/s
+Search_Success_PCRE/16 1000000 1862 ns/op 8.59 MB/s
+Search_Success_PCRE/32 1000000 1963 ns/op 16.30 MB/s
+Search_Success_PCRE/64 500000 2143 ns/op 29.86 MB/s
+Search_Success_PCRE/128 500000 2492 ns/op 51.35 MB/s
+Search_Success_PCRE/256 500000 3226 ns/op 79.35 MB/s
+Search_Success_PCRE/512 500000 4627 ns/op 110.65 MB/s
+Search_Success_PCRE/1K 200000 7459 ns/op 137.28 MB/s
+Search_Success_PCRE/2K 100000 13114 ns/op 156.16 MB/s
+Search_Success_PCRE/4K 50000 24417 ns/op 167.75 MB/s
+Search_Success_PCRE/8K 50000 47082 ns/op 173.99 MB/s
+Search_Success_PCRE/16K 20000 92415 ns/op 177.29 MB/s
+Search_Success_PCRE/32K 10000 183255 ns/op 178.81 MB/s
+Search_Success_PCRE/64K 5000 364699 ns/op 179.70 MB/s
+Search_Success_PCRE/128K 2000 728375 ns/op 179.95 MB/s
+Search_Success_PCRE/256K 1000 1457928 ns/op 179.81 MB/s
+Search_Success_PCRE/512K 500 2926398 ns/op 179.16 MB/s
+Search_Success_PCRE/1M 200 5926725 ns/op 176.92 MB/s
+Search_Success_PCRE/2M 100 12130250 ns/op 172.89 MB/s
+Search_Success_PCRE/4M 50 25401120 ns/op 165.12 MB/s
+Search_Success_PCRE/8M 20 56961850 ns/op 147.27 MB/s
+Search_Success_PCRE/16M 10 134232100 ns/op 124.99 MB/s
+Search_Success_RE2/8 200000 8299 ns/op 0.96 MB/s
+Search_Success_RE2/16 50000 20306 ns/op 0.79 MB/s
+Search_Success_RE2/32 50000 20336 ns/op 1.57 MB/s
+Search_Success_RE2/64 50000 20557 ns/op 3.11 MB/s
+Search_Success_RE2/128 50000 20586 ns/op 6.22 MB/s
+Search_Success_RE2/256 50000 20882 ns/op 12.26 MB/s
+Search_Success_RE2/512 50000 21673 ns/op 23.62 MB/s
+Search_Success_RE2/1K 50000 23408 ns/op 43.75 MB/s
+Search_Success_RE2/2K 50000 26992 ns/op 75.87 MB/s
+Search_Success_RE2/4K 50000 33213 ns/op 123.33 MB/s
+Search_Success_RE2/8K 50000 46189 ns/op 177.36 MB/s
+Search_Success_RE2/16K 20000 72241 ns/op 226.79 MB/s
+Search_Success_RE2/32K 10000 124254 ns/op 263.72 MB/s
+Search_Success_RE2/64K 5000 228106 ns/op 287.30 MB/s
+Search_Success_RE2/128K 5000 435538 ns/op 300.94 MB/s
+Search_Success_RE2/256K 2000 852223 ns/op 307.60 MB/s
+Search_Success_RE2/512K 1000 1690298 ns/op 310.17 MB/s
+Search_Success_RE2/1M 500 3385618 ns/op 309.71 MB/s
+Search_Success_RE2/2M 200 6919025 ns/op 303.10 MB/s
+Search_Success_RE2/4M 100 14401900 ns/op 291.23 MB/s
+Search_Success_RE2/8M 50 30840700 ns/op 272.00 MB/s
+==BENCHMARK== c2 Fri Feb 26 15:45:38 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# fd9366132ce9+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 134 ns/op 59.52 MB/s
+Search_Easy0_CachedPCRE/16 10000000 161 ns/op 99.25 MB/s
+Search_Easy0_CachedPCRE/32 10000000 218 ns/op 146.77 MB/s
+Search_Easy0_CachedPCRE/64 5000000 331 ns/op 192.89 MB/s
+Search_Easy0_CachedPCRE/128 5000000 432 ns/op 296.26 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1081 ns/op 236.79 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1741 ns/op 294.05 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3390 ns/op 302.01 MB/s
+Search_Easy0_CachedPCRE/2K 500000 5686 ns/op 360.17 MB/s
+Search_Easy0_CachedPCRE/4K 200000 10629 ns/op 385.34 MB/s
+Search_Easy0_CachedPCRE/8K 100000 21787 ns/op 375.99 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42183 ns/op 388.40 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85149 ns/op 384.83 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169790 ns/op 385.98 MB/s
+Search_Easy0_CachedPCRE/128K 5000 340958 ns/op 384.42 MB/s
+Search_Easy0_CachedPCRE/256K 5000 680879 ns/op 385.01 MB/s
+Search_Easy0_CachedPCRE/512K 2000 1364074 ns/op 384.35 MB/s
+Search_Easy0_CachedPCRE/1M 1000 2728489 ns/op 384.31 MB/s
+Search_Easy0_CachedPCRE/2M 500 5460158 ns/op 384.08 MB/s
+Search_Easy0_CachedPCRE/4M 100 11069260 ns/op 378.91 MB/s
+Search_Easy0_CachedPCRE/8M 100 22189670 ns/op 378.04 MB/s
+Search_Easy0_CachedPCRE/16M 50 44364000 ns/op 378.17 MB/s
+Search_Easy0_CachedRE2/8 5000000 317 ns/op 25.22 MB/s
+Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.43 MB/s
+Search_Easy0_CachedRE2/32 5000000 331 ns/op 96.61 MB/s
+Search_Easy0_CachedRE2/64 5000000 334 ns/op 191.62 MB/s
+Search_Easy0_CachedRE2/128 5000000 377 ns/op 339.27 MB/s
+Search_Easy0_CachedRE2/256 5000000 404 ns/op 632.62 MB/s
+Search_Easy0_CachedRE2/512 5000000 483 ns/op 1058.96 MB/s
+Search_Easy0_CachedRE2/1K 5000000 664 ns/op 1542.06 MB/s
+Search_Easy0_CachedRE2/2K 1000000 1010 ns/op 2027.71 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1581 ns/op 2590.42 MB/s
+Search_Easy0_CachedRE2/8K 1000000 2939 ns/op 2786.56 MB/s
+Search_Easy0_CachedRE2/16K 500000 5439 ns/op 3011.88 MB/s
+Search_Easy0_CachedRE2/32K 200000 11066 ns/op 2961.13 MB/s
+Search_Easy0_CachedRE2/64K 100000 21875 ns/op 2995.87 MB/s
+Search_Easy0_CachedRE2/128K 50000 44331 ns/op 2956.66 MB/s
+Search_Easy0_CachedRE2/256K 20000 88335 ns/op 2967.61 MB/s
+Search_Easy0_CachedRE2/512K 10000 177855 ns/op 2947.84 MB/s
+Search_Easy0_CachedRE2/1M 5000 356896 ns/op 2938.04 MB/s
+Search_Easy0_CachedRE2/2M 5000 716469 ns/op 2927.07 MB/s
+Search_Easy0_CachedRE2/4M 1000 1532367 ns/op 2737.14 MB/s
+Search_Easy0_CachedRE2/8M 500 3086890 ns/op 2717.49 MB/s
+Search_Easy0_CachedRE2/16M 500 6157146 ns/op 2724.84 MB/s
+Search_Easy1_CachedPCRE/8 20000000 133 ns/op 60.08 MB/s
+Search_Easy1_CachedPCRE/16 10000000 161 ns/op 99.37 MB/s
+Search_Easy1_CachedPCRE/32 10000000 218 ns/op 146.70 MB/s
+Search_Easy1_CachedPCRE/64 5000000 331 ns/op 192.79 MB/s
+Search_Easy1_CachedPCRE/128 5000000 649 ns/op 197.15 MB/s
+Search_Easy1_CachedPCRE/256 2000000 935 ns/op 273.55 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1971 ns/op 259.76 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3421 ns/op 299.32 MB/s
+Search_Easy1_CachedPCRE/2K 500000 6236 ns/op 328.40 MB/s
+Search_Easy1_CachedPCRE/4K 200000 11135 ns/op 367.84 MB/s
+Search_Easy1_CachedPCRE/8K 100000 22040 ns/op 371.68 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42415 ns/op 386.28 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85249 ns/op 384.38 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170306 ns/op 384.81 MB/s
+Search_Easy1_CachedPCRE/128K 5000 342332 ns/op 382.88 MB/s
+Search_Easy1_CachedPCRE/256K 5000 682556 ns/op 384.06 MB/s
+Search_Easy1_CachedPCRE/512K 2000 1366952 ns/op 383.55 MB/s
+Search_Easy1_CachedPCRE/1M 1000 2736532 ns/op 383.18 MB/s
+Search_Easy1_CachedPCRE/2M 500 5477062 ns/op 382.90 MB/s
+Search_Easy1_CachedPCRE/4M 100 11097300 ns/op 377.96 MB/s
+Search_Easy1_CachedPCRE/8M 100 22254540 ns/op 376.94 MB/s
+Search_Easy1_CachedPCRE/16M 50 44510220 ns/op 376.93 MB/s
+Search_Easy1_CachedRE2/8 5000000 317 ns/op 25.19 MB/s
+Search_Easy1_CachedRE2/16 5000000 317 ns/op 50.42 MB/s
+Search_Easy1_CachedRE2/32 5000000 332 ns/op 96.25 MB/s
+Search_Easy1_CachedRE2/64 5000000 335 ns/op 190.94 MB/s
+Search_Easy1_CachedRE2/128 5000000 376 ns/op 340.07 MB/s
+Search_Easy1_CachedRE2/256 5000000 415 ns/op 615.50 MB/s
+Search_Easy1_CachedRE2/512 5000000 485 ns/op 1054.85 MB/s
+Search_Easy1_CachedRE2/1K 5000000 663 ns/op 1543.27 MB/s
+Search_Easy1_CachedRE2/2K 1000000 1009 ns/op 2029.62 MB/s
+Search_Easy1_CachedRE2/4K 1000000 1585 ns/op 2582.98 MB/s
+Search_Easy1_CachedRE2/8K 1000000 2947 ns/op 2779.39 MB/s
+Search_Easy1_CachedRE2/16K 500000 5474 ns/op 2992.97 MB/s
+Search_Easy1_CachedRE2/32K 200000 11058 ns/op 2963.24 MB/s
+Search_Easy1_CachedRE2/64K 100000 21872 ns/op 2996.27 MB/s
+Search_Easy1_CachedRE2/128K 50000 44328 ns/op 2956.85 MB/s
+Search_Easy1_CachedRE2/256K 20000 88325 ns/op 2967.95 MB/s
+Search_Easy1_CachedRE2/512K 10000 177870 ns/op 2947.58 MB/s
+Search_Easy1_CachedRE2/1M 5000 356912 ns/op 2937.91 MB/s
+Search_Easy1_CachedRE2/2M 5000 716384 ns/op 2927.41 MB/s
+Search_Easy1_CachedRE2/4M 1000 1532077 ns/op 2737.66 MB/s
+Search_Easy1_CachedRE2/8M 500 3087256 ns/op 2717.17 MB/s
+Search_Easy1_CachedRE2/16M 500 6163142 ns/op 2722.19 MB/s
+Search_Medium_CachedPCRE/8 20000000 134 ns/op 59.46 MB/s
+Search_Medium_CachedPCRE/16 10000000 162 ns/op 98.36 MB/s
+Search_Medium_CachedPCRE/32 10000000 219 ns/op 145.73 MB/s
+Search_Medium_CachedPCRE/64 5000000 340 ns/op 187.82 MB/s
+Search_Medium_CachedPCRE/128 5000000 434 ns/op 294.39 MB/s
+Search_Medium_CachedPCRE/256 200000 9077 ns/op 28.20 MB/s
+Search_Medium_CachedPCRE/512 100000 21579 ns/op 23.73 MB/s
+Search_Medium_CachedPCRE/1K 50000 42393 ns/op 24.15 MB/s
+Search_Medium_CachedPCRE/2K 50000 62381 ns/op 32.83 MB/s
+Search_Medium_CachedPCRE/4K 10000 153708 ns/op 26.65 MB/s
+Search_Medium_CachedPCRE/8K 5000 332752 ns/op 24.62 MB/s
+Search_Medium_CachedPCRE/16K 5000 678258 ns/op 24.16 MB/s
+Search_Medium_CachedPCRE/32K 2000 1355855 ns/op 24.17 MB/s
+Search_Medium_CachedPCRE/64K 1000 2707494 ns/op 24.21 MB/s
+Search_Medium_CachedPCRE/128K 500 5410032 ns/op 24.23 MB/s
+Search_Medium_CachedPCRE/256K 100 10825800 ns/op 24.21 MB/s
+Search_Medium_CachedRE2/8 5000000 337 ns/op 23.70 MB/s
+Search_Medium_CachedRE2/16 5000000 363 ns/op 44.02 MB/s
+Search_Medium_CachedRE2/32 5000000 414 ns/op 77.23 MB/s
+Search_Medium_CachedRE2/64 5000000 510 ns/op 125.47 MB/s
+Search_Medium_CachedRE2/128 5000000 724 ns/op 176.68 MB/s
+Search_Medium_CachedRE2/256 1000000 1124 ns/op 227.62 MB/s
+Search_Medium_CachedRE2/512 1000000 1933 ns/op 264.81 MB/s
+Search_Medium_CachedRE2/1K 500000 3551 ns/op 288.35 MB/s
+Search_Medium_CachedRE2/2K 500000 6786 ns/op 301.77 MB/s
+Search_Medium_CachedRE2/4K 200000 13256 ns/op 308.97 MB/s
+Search_Medium_CachedRE2/8K 100000 26198 ns/op 312.69 MB/s
+Search_Medium_CachedRE2/16K 50000 52085 ns/op 314.56 MB/s
+Search_Medium_CachedRE2/32K 10000 103940 ns/op 315.26 MB/s
+Search_Medium_CachedRE2/64K 10000 207489 ns/op 315.85 MB/s
+Search_Medium_CachedRE2/128K 5000 414571 ns/op 316.16 MB/s
+Search_Medium_CachedRE2/256K 2000 828757 ns/op 316.31 MB/s
+Search_Medium_CachedRE2/512K 1000 1657123 ns/op 316.38 MB/s
+Search_Medium_CachedRE2/1M 500 3314204 ns/op 316.39 MB/s
+Search_Medium_CachedRE2/2M 500 6633334 ns/op 316.15 MB/s
+Search_Medium_CachedRE2/4M 100 13342170 ns/op 314.36 MB/s
+Search_Medium_CachedRE2/8M 100 26718850 ns/op 313.96 MB/s
+Search_Medium_CachedRE2/16M 50 53433900 ns/op 313.98 MB/s
+Search_Hard_CachedPCRE/8 20000000 134 ns/op 59.43 MB/s
+Search_Hard_CachedPCRE/16 10000000 162 ns/op 98.34 MB/s
+Search_Hard_CachedPCRE/32 10000000 219 ns/op 145.72 MB/s
+Search_Hard_CachedPCRE/64 5000000 340 ns/op 187.84 MB/s
+Search_Hard_CachedPCRE/128 5000000 434 ns/op 294.64 MB/s
+Search_Hard_CachedPCRE/256 5000 572444 ns/op 0.45 MB/s
+Search_Hard_CachedPCRE/512 1000 2345148 ns/op 0.22 MB/s
+Search_Hard_CachedPCRE/1K 200 9327675 ns/op 0.11 MB/s
+Search_Hard_CachedPCRE/2K 50 34095380 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/4K 10 146669300 ns/op 0.03 MB/s
+Search_Hard_CachedRE2/8 5000000 338 ns/op 23.62 MB/s
+Search_Hard_CachedRE2/16 5000000 366 ns/op 43.65 MB/s
+Search_Hard_CachedRE2/32 5000000 422 ns/op 75.77 MB/s
+Search_Hard_CachedRE2/64 5000000 513 ns/op 124.66 MB/s
+Search_Hard_CachedRE2/128 5000000 721 ns/op 177.41 MB/s
+Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.38 MB/s
+Search_Hard_CachedRE2/512 1000000 1933 ns/op 264.75 MB/s
+Search_Hard_CachedRE2/1K 500000 3551 ns/op 288.31 MB/s
+Search_Hard_CachedRE2/2K 500000 6787 ns/op 301.74 MB/s
+Search_Hard_CachedRE2/4K 200000 13262 ns/op 308.84 MB/s
+Search_Hard_CachedRE2/8K 100000 26203 ns/op 312.63 MB/s
+Search_Hard_CachedRE2/16K 50000 52085 ns/op 314.56 MB/s
+Search_Hard_CachedRE2/32K 10000 103943 ns/op 315.25 MB/s
+Search_Hard_CachedRE2/64K 10000 207492 ns/op 315.85 MB/s
+Search_Hard_CachedRE2/128K 5000 414602 ns/op 316.14 MB/s
+Search_Hard_CachedRE2/256K 2000 828771 ns/op 316.30 MB/s
+Search_Hard_CachedRE2/512K 1000 1657138 ns/op 316.38 MB/s
+==BENCHMARK== c2 Fri Feb 26 15:52:36 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# fd9366132ce9+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 132 ns/op 60.19 MB/s
+Search_Easy0_CachedPCRE/16 10000000 160 ns/op 99.51 MB/s
+Search_Easy0_CachedPCRE/32 10000000 217 ns/op 147.30 MB/s
+Search_Easy0_CachedPCRE/64 5000000 330 ns/op 193.41 MB/s
+Search_Easy0_CachedPCRE/128 5000000 431 ns/op 296.81 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1082 ns/op 236.52 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1742 ns/op 293.84 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3391 ns/op 301.97 MB/s
+Search_Easy0_CachedPCRE/2K 500000 5680 ns/op 360.53 MB/s
+Search_Easy0_CachedPCRE/4K 200000 10620 ns/op 385.66 MB/s
+Search_Easy0_CachedPCRE/8K 100000 21760 ns/op 376.47 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42151 ns/op 388.70 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85091 ns/op 385.09 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169811 ns/op 385.93 MB/s
+Search_Easy0_CachedPCRE/128K 5000 340974 ns/op 384.40 MB/s
+Search_Easy0_CachedPCRE/256K 5000 680322 ns/op 385.32 MB/s
+Search_Easy0_CachedPCRE/512K 2000 1362887 ns/op 384.69 MB/s
+Search_Easy0_CachedPCRE/1M 1000 2726335 ns/op 384.61 MB/s
+Search_Easy0_CachedPCRE/2M 500 5466910 ns/op 383.61 MB/s
+Search_Easy0_CachedPCRE/4M 100 11043660 ns/op 379.79 MB/s
+Search_Easy0_CachedPCRE/8M 100 22181360 ns/op 378.18 MB/s
+Search_Easy0_CachedPCRE/16M 50 44332240 ns/op 378.44 MB/s
+Search_Easy0_CachedRE2/8 5000000 319 ns/op 25.07 MB/s
+Search_Easy0_CachedRE2/16 5000000 320 ns/op 49.98 MB/s
+Search_Easy0_CachedRE2/32 5000000 334 ns/op 95.62 MB/s
+Search_Easy0_CachedRE2/64 5000000 336 ns/op 190.00 MB/s
+Search_Easy0_CachedRE2/128 5000000 367 ns/op 348.26 MB/s
+Search_Easy0_CachedRE2/256 5000000 392 ns/op 651.98 MB/s
+Search_Easy0_CachedRE2/512 5000000 472 ns/op 1084.04 MB/s
+Search_Easy0_CachedRE2/1K 5000000 652 ns/op 1569.77 MB/s
+Search_Easy0_CachedRE2/2K 1000000 1002 ns/op 2041.98 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1588 ns/op 2579.04 MB/s
+Search_Easy0_CachedRE2/8K 1000000 2919 ns/op 2805.50 MB/s
+Search_Easy0_CachedRE2/16K 500000 5422 ns/op 3021.44 MB/s
+Search_Easy0_CachedRE2/32K 200000 11029 ns/op 2970.83 MB/s
+Search_Easy0_CachedRE2/64K 100000 21880 ns/op 2995.23 MB/s
+Search_Easy0_CachedRE2/128K 50000 44348 ns/op 2955.51 MB/s
+Search_Easy0_CachedRE2/256K 20000 88537 ns/op 2960.84 MB/s
+Search_Easy0_CachedRE2/512K 10000 178161 ns/op 2942.77 MB/s
+Search_Easy0_CachedRE2/1M 5000 357466 ns/op 2933.35 MB/s
+Search_Easy0_CachedRE2/2M 5000 726215 ns/op 2887.78 MB/s
+Search_Easy0_CachedRE2/4M 1000 1518925 ns/op 2761.36 MB/s
+Search_Easy0_CachedRE2/8M 500 3093556 ns/op 2711.64 MB/s
+Search_Easy0_CachedRE2/16M 500 6163216 ns/op 2722.15 MB/s
+Search_Easy1_CachedPCRE/8 20000000 133 ns/op 60.01 MB/s
+Search_Easy1_CachedPCRE/16 10000000 161 ns/op 99.01 MB/s
+Search_Easy1_CachedPCRE/32 10000000 218 ns/op 146.72 MB/s
+Search_Easy1_CachedPCRE/64 5000000 331 ns/op 192.81 MB/s
+Search_Easy1_CachedPCRE/128 5000000 651 ns/op 196.47 MB/s
+Search_Easy1_CachedPCRE/256 2000000 937 ns/op 273.13 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1971 ns/op 259.75 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3419 ns/op 299.44 MB/s
+Search_Easy1_CachedPCRE/2K 500000 6237 ns/op 328.32 MB/s
+Search_Easy1_CachedPCRE/4K 200000 11126 ns/op 368.14 MB/s
+Search_Easy1_CachedPCRE/8K 100000 22025 ns/op 371.94 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42414 ns/op 386.28 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85208 ns/op 384.56 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170269 ns/op 384.90 MB/s
+Search_Easy1_CachedPCRE/128K 5000 342014 ns/op 383.24 MB/s
+Search_Easy1_CachedPCRE/256K 5000 682258 ns/op 384.23 MB/s
+Search_Easy1_CachedPCRE/512K 2000 1366582 ns/op 383.65 MB/s
+Search_Easy1_CachedPCRE/1M 1000 2735046 ns/op 383.39 MB/s
+Search_Easy1_CachedPCRE/2M 500 5591430 ns/op 375.07 MB/s
+Search_Easy1_CachedPCRE/4M 100 11077680 ns/op 378.63 MB/s
+Search_Easy1_CachedPCRE/8M 100 22246570 ns/op 377.07 MB/s
+Search_Easy1_CachedPCRE/16M 50 44470360 ns/op 377.27 MB/s
+Search_Easy1_CachedRE2/8 5000000 319 ns/op 25.02 MB/s
+Search_Easy1_CachedRE2/16 5000000 326 ns/op 49.00 MB/s
+Search_Easy1_CachedRE2/32 5000000 334 ns/op 95.72 MB/s
+Search_Easy1_CachedRE2/64 5000000 337 ns/op 189.56 MB/s
+Search_Easy1_CachedRE2/128 5000000 365 ns/op 349.81 MB/s
+Search_Easy1_CachedRE2/256 5000000 399 ns/op 640.98 MB/s
+Search_Easy1_CachedRE2/512 5000000 469 ns/op 1089.44 MB/s
+Search_Easy1_CachedRE2/1K 5000000 652 ns/op 1569.62 MB/s
+Search_Easy1_CachedRE2/2K 1000000 1004 ns/op 2038.28 MB/s
+Search_Easy1_CachedRE2/4K 1000000 1584 ns/op 2584.99 MB/s
+Search_Easy1_CachedRE2/8K 1000000 2919 ns/op 2806.40 MB/s
+Search_Easy1_CachedRE2/16K 500000 5451 ns/op 3005.49 MB/s
+Search_Easy1_CachedRE2/32K 200000 10985 ns/op 2982.94 MB/s
+Search_Easy1_CachedRE2/64K 100000 21869 ns/op 2996.70 MB/s
+Search_Easy1_CachedRE2/128K 50000 44326 ns/op 2956.95 MB/s
+Search_Easy1_CachedRE2/256K 20000 88517 ns/op 2961.50 MB/s
+Search_Easy1_CachedRE2/512K 10000 178161 ns/op 2942.76 MB/s
+Search_Easy1_CachedRE2/1M 5000 357524 ns/op 2932.88 MB/s
+Search_Easy1_CachedRE2/2M 5000 726271 ns/op 2887.56 MB/s
+Search_Easy1_CachedRE2/4M 1000 1519940 ns/op 2759.52 MB/s
+Search_Easy1_CachedRE2/8M 500 3095036 ns/op 2710.34 MB/s
+Search_Easy1_CachedRE2/16M 500 6165230 ns/op 2721.26 MB/s
+Search_Medium_CachedPCRE/8 20000000 134 ns/op 59.33 MB/s
+Search_Medium_CachedPCRE/16 10000000 162 ns/op 98.23 MB/s
+Search_Medium_CachedPCRE/32 10000000 219 ns/op 145.68 MB/s
+Search_Medium_CachedPCRE/64 5000000 340 ns/op 187.74 MB/s
+Search_Medium_CachedPCRE/128 5000000 434 ns/op 294.36 MB/s
+Search_Medium_CachedPCRE/256 200000 9116 ns/op 28.08 MB/s
+Search_Medium_CachedPCRE/512 100000 21829 ns/op 23.45 MB/s
+Search_Medium_CachedPCRE/1K 50000 42878 ns/op 23.88 MB/s
+Search_Medium_CachedPCRE/2K 50000 62528 ns/op 32.75 MB/s
+Search_Medium_CachedPCRE/4K 10000 153909 ns/op 26.61 MB/s
+Search_Medium_CachedPCRE/8K 5000 333099 ns/op 24.59 MB/s
+Search_Medium_CachedPCRE/16K 5000 678554 ns/op 24.15 MB/s
+Search_Medium_CachedPCRE/32K 2000 1354963 ns/op 24.18 MB/s
+Search_Medium_CachedPCRE/64K 1000 2705485 ns/op 24.22 MB/s
+Search_Medium_CachedPCRE/128K 500 5407590 ns/op 24.24 MB/s
+Search_Medium_CachedPCRE/256K 100 10817570 ns/op 24.23 MB/s
+Search_Medium_CachedRE2/8 5000000 339 ns/op 23.55 MB/s
+Search_Medium_CachedRE2/16 5000000 364 ns/op 43.85 MB/s
+Search_Medium_CachedRE2/32 5000000 417 ns/op 76.70 MB/s
+Search_Medium_CachedRE2/64 5000000 515 ns/op 124.27 MB/s
+Search_Medium_CachedRE2/128 5000000 723 ns/op 176.82 MB/s
+Search_Medium_CachedRE2/256 1000000 1127 ns/op 227.13 MB/s
+Search_Medium_CachedRE2/512 1000000 1935 ns/op 264.52 MB/s
+Search_Medium_CachedRE2/1K 500000 3553 ns/op 288.18 MB/s
+Search_Medium_CachedRE2/2K 500000 6794 ns/op 301.41 MB/s
+Search_Medium_CachedRE2/4K 200000 13257 ns/op 308.96 MB/s
+Search_Medium_CachedRE2/8K 100000 26198 ns/op 312.69 MB/s
+Search_Medium_CachedRE2/16K 50000 52083 ns/op 314.57 MB/s
+Search_Medium_CachedRE2/32K 10000 103951 ns/op 315.22 MB/s
+Search_Medium_CachedRE2/64K 10000 207486 ns/op 315.86 MB/s
+Search_Medium_CachedRE2/128K 5000 414561 ns/op 316.17 MB/s
+Search_Medium_CachedRE2/256K 2000 828728 ns/op 316.32 MB/s
+Search_Medium_CachedRE2/512K 1000 1657039 ns/op 316.40 MB/s
+Search_Medium_CachedRE2/1M 500 3314040 ns/op 316.40 MB/s
+Search_Medium_CachedRE2/2M 500 6637874 ns/op 315.94 MB/s
+Search_Medium_CachedRE2/4M 100 13332420 ns/op 314.59 MB/s
+Search_Medium_CachedRE2/8M 100 26715300 ns/op 314.00 MB/s
+Search_Medium_CachedRE2/16M 50 53430940 ns/op 314.00 MB/s
+Search_Hard_CachedPCRE/8 20000000 134 ns/op 59.35 MB/s
+Search_Hard_CachedPCRE/16 10000000 162 ns/op 98.21 MB/s
+Search_Hard_CachedPCRE/32 10000000 219 ns/op 145.65 MB/s
+Search_Hard_CachedPCRE/64 5000000 340 ns/op 187.74 MB/s
+Search_Hard_CachedPCRE/128 5000000 434 ns/op 294.33 MB/s
+Search_Hard_CachedPCRE/256 5000 572641 ns/op 0.45 MB/s
+Search_Hard_CachedPCRE/512 1000 2348430 ns/op 0.22 MB/s
+Search_Hard_CachedPCRE/1K 200 9314740 ns/op 0.11 MB/s
+Search_Hard_CachedPCRE/2K 50 34077360 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/4K 10 146685100 ns/op 0.03 MB/s
+Search_Hard_CachedRE2/8 5000000 339 ns/op 23.56 MB/s
+Search_Hard_CachedRE2/16 5000000 364 ns/op 43.87 MB/s
+Search_Hard_CachedRE2/32 5000000 416 ns/op 76.91 MB/s
+Search_Hard_CachedRE2/64 5000000 514 ns/op 124.39 MB/s
+Search_Hard_CachedRE2/128 5000000 723 ns/op 176.97 MB/s
+Search_Hard_CachedRE2/256 1000000 1126 ns/op 227.29 MB/s
+Search_Hard_CachedRE2/512 1000000 1935 ns/op 264.58 MB/s
+Search_Hard_CachedRE2/1K 500000 3552 ns/op 288.24 MB/s
+Search_Hard_CachedRE2/2K 500000 6787 ns/op 301.73 MB/s
+Search_Hard_CachedRE2/4K 200000 13258 ns/op 308.93 MB/s
+Search_Hard_CachedRE2/8K 100000 26198 ns/op 312.69 MB/s
+Search_Hard_CachedRE2/16K 50000 52078 ns/op 314.60 MB/s
+Search_Hard_CachedRE2/32K 10000 103957 ns/op 315.21 MB/s
+Search_Hard_CachedRE2/64K 10000 207490 ns/op 315.85 MB/s
+Search_Hard_CachedRE2/128K 5000 414573 ns/op 316.16 MB/s
+Search_Hard_CachedRE2/256K 2000 828748 ns/op 316.31 MB/s
+Search_Hard_CachedRE2/512K 1000 1657141 ns/op 316.38 MB/s
+Search_Hard_CachedRE2/1M 500 3314048 ns/op 316.40 MB/s
+Search_Hard_CachedRE2/2M 500 6637896 ns/op 315.94 MB/s
+Search_Hard_CachedRE2/4M 100 13331710 ns/op 314.61 MB/s
+Search_Hard_CachedRE2/8M 100 26716050 ns/op 313.99 MB/s
+Search_Hard_CachedRE2/16M 50 53428900 ns/op 314.01 MB/s
+Search_Parens_CachedPCRE/8 10000000 197 ns/op 40.52 MB/s
+Search_Parens_CachedRE2/8 5000000 339 ns/op 23.55 MB/s
+Search_Parens_CachedRE2/16 5000000 365 ns/op 43.83 MB/s
+Search_Parens_CachedRE2/32 5000000 416 ns/op 76.85 MB/s
+Search_Parens_CachedRE2/64 5000000 518 ns/op 123.48 MB/s
+Search_Parens_CachedRE2/128 5000000 732 ns/op 174.84 MB/s
+Search_Parens_CachedRE2/256 1000000 1125 ns/op 227.38 MB/s
+Search_Parens_CachedRE2/512 1000000 1935 ns/op 264.54 MB/s
+Search_Parens_CachedRE2/1K 500000 3553 ns/op 288.18 MB/s
+Search_Parens_CachedRE2/2K 500000 6787 ns/op 301.73 MB/s
+Search_Parens_CachedRE2/4K 200000 13258 ns/op 308.93 MB/s
+Search_Parens_CachedRE2/8K 100000 26198 ns/op 312.68 MB/s
+Search_Parens_CachedRE2/16K 50000 52082 ns/op 314.58 MB/s
+Search_Parens_CachedRE2/32K 10000 103942 ns/op 315.25 MB/s
+Search_Parens_CachedRE2/64K 10000 207482 ns/op 315.86 MB/s
+Search_Parens_CachedRE2/128K 5000 414565 ns/op 316.17 MB/s
+Search_Parens_CachedRE2/256K 2000 828752 ns/op 316.31 MB/s
+Search_Parens_CachedRE2/512K 1000 1657114 ns/op 316.39 MB/s
+Search_Parens_CachedRE2/1M 500 3314130 ns/op 316.40 MB/s
+Search_Parens_CachedRE2/2M 500 6637822 ns/op 315.94 MB/s
+Search_Parens_CachedRE2/4M 100 13333110 ns/op 314.58 MB/s
+Search_Parens_CachedRE2/8M 100 26718660 ns/op 313.96 MB/s
+Search_Parens_CachedRE2/16M 50 53434420 ns/op 313.98 MB/s
+Search_BigFixed_CachedPCRE/8 10000000 245 ns/op 32.58 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 302 ns/op 52.87 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 419 ns/op 76.34 MB/s
+Search_BigFixed_CachedPCRE/64 5000000 657 ns/op 97.37 MB/s
+Search_BigFixed_CachedPCRE/128 2000000 986 ns/op 129.75 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 1776 ns/op 144.11 MB/s
+Search_BigFixed_CachedPCRE/512 500000 3343 ns/op 153.12 MB/s
+Search_BigFixed_CachedPCRE/1K 500000 6477 ns/op 158.09 MB/s
+Search_BigFixed_CachedPCRE/2K 200000 12745 ns/op 160.68 MB/s
+Search_BigFixed_CachedPCRE/4K 100000 25282 ns/op 162.01 MB/s
+Search_BigFixed_CachedPCRE/8K 50000 50360 ns/op 162.67 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 100599 ns/op 162.86 MB/s
+Search_BigFixed_CachedPCRE/32K 10000 201002 ns/op 163.02 MB/s
+Search_BigFixed_CachedRE2/8 20000000 130 ns/op 61.10 MB/s
+Search_BigFixed_CachedRE2/16 5000000 375 ns/op 42.65 MB/s
+Search_BigFixed_CachedRE2/32 5000000 412 ns/op 77.57 MB/s
+Search_BigFixed_CachedRE2/64 5000000 488 ns/op 130.92 MB/s
+Search_BigFixed_CachedRE2/128 5000000 635 ns/op 201.33 MB/s
+Search_BigFixed_CachedRE2/256 2000000 946 ns/op 270.51 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1551 ns/op 329.90 MB/s
+Search_BigFixed_CachedRE2/1K 1000000 2767 ns/op 369.95 MB/s
+Search_BigFixed_CachedRE2/2K 500000 5192 ns/op 394.43 MB/s
+Search_BigFixed_CachedRE2/4K 200000 10047 ns/op 407.68 MB/s
+Search_BigFixed_CachedRE2/8K 100000 19753 ns/op 414.70 MB/s
+Search_BigFixed_CachedRE2/16K 50000 39165 ns/op 418.33 MB/s
+Search_BigFixed_CachedRE2/32K 20000 78111 ns/op 419.50 MB/s
+Search_BigFixed_CachedRE2/64K 10000 155869 ns/op 420.45 MB/s
+Search_BigFixed_CachedRE2/128K 5000 311467 ns/op 420.82 MB/s
+Search_BigFixed_CachedRE2/256K 5000 622457 ns/op 421.14 MB/s
+Search_BigFixed_CachedRE2/512K 2000 1247149 ns/op 420.39 MB/s
+Search_BigFixed_CachedRE2/1M 1000 2502506 ns/op 419.01 MB/s
+Search_Success_PCRE/8 1000000 1835 ns/op 4.36 MB/s
+Search_Success_PCRE/16 1000000 1890 ns/op 8.46 MB/s
+Search_Success_PCRE/32 1000000 1981 ns/op 16.15 MB/s
+Search_Success_PCRE/64 1000000 2151 ns/op 29.75 MB/s
+Search_Success_PCRE/128 1000000 2511 ns/op 50.96 MB/s
+Search_Success_PCRE/256 500000 3229 ns/op 79.26 MB/s
+Search_Success_PCRE/512 500000 4647 ns/op 110.16 MB/s
+Search_Success_PCRE/1K 200000 7500 ns/op 136.52 MB/s
+Search_Success_PCRE/2K 200000 13134 ns/op 155.92 MB/s
+Search_Success_PCRE/4K 100000 24469 ns/op 167.39 MB/s
+Search_Success_PCRE/8K 50000 47127 ns/op 173.83 MB/s
+Search_Success_PCRE/16K 20000 92460 ns/op 177.20 MB/s
+Search_Success_PCRE/32K 10000 183255 ns/op 178.81 MB/s
+Search_Success_PCRE/64K 5000 364664 ns/op 179.72 MB/s
+Search_Success_PCRE/128K 2000 728382 ns/op 179.95 MB/s
+Search_Success_PCRE/256K 1000 1458071 ns/op 179.79 MB/s
+Search_Success_PCRE/512K 500 2927234 ns/op 179.11 MB/s
+Search_Success_PCRE/1M 500 5852934 ns/op 179.15 MB/s
+Search_Success_PCRE/2M 200 11886620 ns/op 176.43 MB/s
+Search_Success_PCRE/4M 100 24402710 ns/op 171.88 MB/s
+Search_Success_PCRE/8M 50 50996680 ns/op 164.49 MB/s
+Search_Success_PCRE/16M 10 135693000 ns/op 123.64 MB/s
+Search_Success_RE2/16M 20 74552300 ns/op 225.04 MB/s
+Search_Success_CachedPCRE/8 10000000 236 ns/op 33.88 MB/s
+Search_Success_CachedPCRE/16 10000000 289 ns/op 55.21 MB/s
+Search_Success_CachedPCRE/32 5000000 397 ns/op 80.58 MB/s
+Search_Success_CachedPCRE/64 5000000 611 ns/op 104.58 MB/s
+Search_Success_CachedPCRE/128 2000000 914 ns/op 139.91 MB/s
+Search_Success_CachedPCRE/256 1000000 1622 ns/op 157.81 MB/s
+Search_Success_CachedPCRE/512 500000 3037 ns/op 168.54 MB/s
+Search_Success_CachedPCRE/1K 500000 5867 ns/op 174.51 MB/s
+Search_Success_CachedPCRE/2K 200000 11529 ns/op 177.62 MB/s
+Search_Success_CachedPCRE/4K 100000 22852 ns/op 179.23 MB/s
+Search_Success_CachedPCRE/8K 50000 46293 ns/op 176.96 MB/s
+Search_Success_CachedPCRE/16K 20000 90812 ns/op 180.42 MB/s
+Search_Success_CachedPCRE/32K 10000 181517 ns/op 180.52 MB/s
+Search_Success_CachedPCRE/64K 5000 362941 ns/op 180.57 MB/s
+Search_Success_CachedPCRE/128K 2000 726534 ns/op 180.41 MB/s
+Search_Success_CachedPCRE/256K 1000 1456177 ns/op 180.02 MB/s
+Search_Success_CachedPCRE/512K 500 2925190 ns/op 179.23 MB/s
+Search_Success_CachedPCRE/1M 500 5850306 ns/op 179.23 MB/s
+Search_Success_CachedPCRE/2M 200 11879265 ns/op 176.54 MB/s
+Search_Success_CachedPCRE/4M 100 24386990 ns/op 171.99 MB/s
+Search_Success_CachedPCRE/8M 50 50981240 ns/op 164.54 MB/s
+Search_Success_CachedPCRE/16M 10 135670500 ns/op 123.66 MB/s
+Search_Success_CachedRE2/8 20000000 130 ns/op 61.27 MB/s
+Search_Success_CachedRE2/16 5000000 376 ns/op 42.54 MB/s
+Search_Success_CachedRE2/32 5000000 427 ns/op 74.93 MB/s
+Search_Success_CachedRE2/64 5000000 526 ns/op 121.48 MB/s
+Search_Success_CachedRE2/128 5000000 732 ns/op 174.78 MB/s
+Search_Success_CachedRE2/256 1000000 1135 ns/op 225.54 MB/s
+Search_Success_CachedRE2/512 1000000 1944 ns/op 263.31 MB/s
+Search_Success_CachedRE2/1K 500000 3563 ns/op 287.37 MB/s
+Search_Success_CachedRE2/2K 500000 6797 ns/op 301.31 MB/s
+Search_Success_CachedRE2/4K 200000 13268 ns/op 308.71 MB/s
+Search_Success_CachedRE2/8K 100000 26208 ns/op 312.57 MB/s
+Search_Success_CachedRE2/16K 50000 52094 ns/op 314.50 MB/s
+Search_Success_CachedRE2/32K 10000 104033 ns/op 314.98 MB/s
+Search_Success_CachedRE2/64K 10000 207643 ns/op 315.62 MB/s
+Search_Success_CachedRE2/128K 5000 415199 ns/op 315.68 MB/s
+Search_Success_CachedRE2/256K 2000 831963 ns/op 315.09 MB/s
+Search_Success_CachedRE2/512K 1000 1670044 ns/op 313.94 MB/s
+Search_Success_CachedRE2/1M 500 3366302 ns/op 311.49 MB/s
+Search_Success_CachedRE2/2M 200 6902225 ns/op 303.84 MB/s
+Search_Success_CachedRE2/4M 100 14383930 ns/op 291.60 MB/s
+Search_Success_CachedRE2/8M 50 30930940 ns/op 271.20 MB/s
+Search_Success_CachedRE2/16M 20 74507750 ns/op 225.17 MB/s
+Search_Success1_PCRE/8 1000000 1997 ns/op 4.00 MB/s
+Search_Success1_PCRE/16 1000000 2038 ns/op 7.85 MB/s
+Search_Success1_PCRE/32 1000000 2116 ns/op 15.12 MB/s
+Search_Success1_PCRE/64 1000000 2280 ns/op 28.07 MB/s
+Search_Success1_PCRE/128 1000000 2636 ns/op 48.56 MB/s
+Search_Success1_PCRE/256 500000 3351 ns/op 76.39 MB/s
+Search_Success1_PCRE/512 500000 4775 ns/op 107.22 MB/s
+Search_Success1_PCRE/1K 200000 7626 ns/op 134.27 MB/s
+Search_Success1_PCRE/2K 200000 13284 ns/op 154.17 MB/s
+Search_Success1_PCRE/4K 100000 24592 ns/op 166.56 MB/s
+Search_Success1_PCRE/8K 50000 47260 ns/op 173.34 MB/s
+Search_Success1_PCRE/16K 20000 92599 ns/op 176.93 MB/s
+Search_Success1_PCRE/32K 10000 183386 ns/op 178.68 MB/s
+Search_Success1_PCRE/64K 5000 364838 ns/op 179.63 MB/s
+Search_Success1_PCRE/128K 2000 728548 ns/op 179.91 MB/s
+Search_Success1_PCRE/256K 1000 1458127 ns/op 179.78 MB/s
+Search_Success1_PCRE/512K 500 2927280 ns/op 179.10 MB/s
+Search_Success1_PCRE/1M 500 5853210 ns/op 179.15 MB/s
+Search_Success1_PCRE/2M 200 11886770 ns/op 176.43 MB/s
+Search_Success1_PCRE/4M 100 24400970 ns/op 171.89 MB/s
+Search_Success1_PCRE/8M 50 50996900 ns/op 164.49 MB/s
+Search_Success1_PCRE/16M 10 135681900 ns/op 123.65 MB/s
+Search_Success1_RE2/8 100000 27018 ns/op 0.30 MB/s
+Search_Success1_RE2/16 100000 27002 ns/op 0.59 MB/s
+Search_Success1_RE2/32 100000 27343 ns/op 1.17 MB/s
+Search_Success1_RE2/64 100000 27321 ns/op 2.34 MB/s
+Search_Success1_RE2/128 100000 27794 ns/op 4.61 MB/s
+Search_Success1_RE2/256 100000 27807 ns/op 9.21 MB/s
+Search_Success1_RE2/512 100000 28664 ns/op 17.86 MB/s
+Search_Success1_RE2/1K 50000 30116 ns/op 34.00 MB/s
+Search_Success1_RE2/2K 50000 33360 ns/op 61.39 MB/s
+Search_Success1_RE2/4K 50000 40118 ns/op 102.10 MB/s
+Search_Success1_RE2/8K 50000 52901 ns/op 154.85 MB/s
+Search_Success1_RE2/16K 20000 78892 ns/op 207.68 MB/s
+Search_Success1_RE2/32K 10000 131013 ns/op 250.11 MB/s
+Search_Success1_RE2/64K 10000 234601 ns/op 279.35 MB/s
+Search_Success1_RE2/128K 5000 442246 ns/op 296.38 MB/s
+Search_Success1_RE2/256K 2000 859127 ns/op 305.13 MB/s
+Search_Success1_RE2/512K 1000 1697416 ns/op 308.87 MB/s
+Search_Success1_RE2/1M 500 3394086 ns/op 308.94 MB/s
+Search_Success1_RE2/2M 200 6933840 ns/op 302.45 MB/s
+Search_Success1_RE2/4M 100 14439980 ns/op 290.46 MB/s
+Search_Success1_RE2/8M 50 30989920 ns/op 270.69 MB/s
+Search_Success1_RE2/16M 20 74560700 ns/op 225.01 MB/s
+Search_Success1_Cached_PCRE/8 10000000 257 ns/op 31.10 MB/s
+Search_Success1_Cached_PCRE/16 5000000 311 ns/op 51.42 MB/s
+Search_Success1_Cached_PCRE/32 5000000 418 ns/op 76.48 MB/s
+Search_Success1_Cached_PCRE/64 5000000 633 ns/op 101.03 MB/s
+Search_Success1_Cached_PCRE/128 2000000 935 ns/op 136.78 MB/s
+Search_Success1_Cached_PCRE/256 1000000 1643 ns/op 155.79 MB/s
+Search_Success1_Cached_PCRE/512 500000 3058 ns/op 167.41 MB/s
+Search_Success1_Cached_PCRE/1K 500000 5888 ns/op 173.89 MB/s
+Search_Success1_Cached_PCRE/2K 200000 11550 ns/op 177.30 MB/s
+Search_Success1_Cached_PCRE/4K 100000 22873 ns/op 179.07 MB/s
+Search_Success1_Cached_PCRE/8K 50000 45522 ns/op 179.95 MB/s
+Search_Success1_Cached_PCRE/16K 20000 90830 ns/op 180.38 MB/s
+Search_Success1_Cached_PCRE/32K 10000 181547 ns/op 180.49 MB/s
+Search_Success1_Cached_PCRE/64K 5000 362960 ns/op 180.56 MB/s
+Search_Success1_Cached_PCRE/128K 2000 726612 ns/op 180.39 MB/s
+Search_Success1_Cached_PCRE/256K 1000 1456167 ns/op 180.02 MB/s
+Search_Success1_Cached_PCRE/512K 500 2924960 ns/op 179.25 MB/s
+Search_Success1_Cached_PCRE/1M 500 5850124 ns/op 179.24 MB/s
+Search_Success1_Cached_PCRE/2M 200 11879665 ns/op 176.53 MB/s
+Search_Success1_Cached_PCRE/4M 100 24385800 ns/op 172.00 MB/s
+Search_Success1_Cached_PCRE/8M 50 50977600 ns/op 164.55 MB/s
+Search_Success1_Cached_PCRE/16M 10 135651600 ns/op 123.68 MB/s
+Search_Success1_Cached_RE2/8 5000000 347 ns/op 23.00 MB/s
+Search_Success1_Cached_RE2/16 5000000 373 ns/op 42.83 MB/s
+Search_Success1_Cached_RE2/32 5000000 423 ns/op 75.55 MB/s
+Search_Success1_Cached_RE2/64 5000000 523 ns/op 122.23 MB/s
+Search_Success1_Cached_RE2/128 5000000 731 ns/op 174.97 MB/s
+Search_Success1_Cached_RE2/256 1000000 1133 ns/op 225.85 MB/s
+Search_Success1_Cached_RE2/512 1000000 1942 ns/op 263.56 MB/s
+Search_Success1_Cached_RE2/1K 500000 3560 ns/op 287.62 MB/s
+Search_Success1_Cached_RE2/2K 500000 6794 ns/op 301.40 MB/s
+Search_Success1_Cached_RE2/4K 200000 13267 ns/op 308.73 MB/s
+Search_Success1_Cached_RE2/8K 100000 26210 ns/op 312.54 MB/s
+Search_Success1_Cached_RE2/16K 50000 52100 ns/op 314.47 MB/s
+Search_Success1_Cached_RE2/32K 10000 104040 ns/op 314.95 MB/s
+Search_Success1_Cached_RE2/64K 10000 207650 ns/op 315.61 MB/s
+Search_Success1_Cached_RE2/128K 5000 415201 ns/op 315.68 MB/s
+Search_Success1_Cached_RE2/256K 2000 831979 ns/op 315.08 MB/s
+Search_Success1_Cached_RE2/512K 1000 1670071 ns/op 313.93 MB/s
+Search_Success1_Cached_RE2/1M 500 3366256 ns/op 311.50 MB/s
+Search_Success1_Cached_RE2/2M 200 6902045 ns/op 303.85 MB/s
+Search_Success1_Cached_RE2/4M 100 14384020 ns/op 291.59 MB/s
+Search_Success1_Cached_RE2/8M 50 30929640 ns/op 271.22 MB/s
+Search_Success1_Cached_RE2/16M 20 74502350 ns/op 225.19 MB/s
+Search_Digits_PCRE 500000 5023 ns/op
+Search_Digits_RE2 100000 21787 ns/op
+Parse_Digits_PCRE 500000 5015 ns/op
+Parse_Digits_RE2 200000 9912 ns/op
+Parse_CachedDigits_PCRE 5000000 448 ns/op
+Parse_CachedDigits_RE2 10000000 266 ns/op
+Parse_DigitDs_PCRE 500000 4128 ns/op
+Parse_DigitDs_RE2 200000 9679 ns/op
+Parse_CachedDigitDs_PCRE 5000000 459 ns/op
+Parse_CachedDigitDs_RE2 10000000 265 ns/op
+Parse_Split_PCRE 500000 3122 ns/op
+Parse_Split_RE2 200000 11139 ns/op
+Parse_CachedSplit_PCRE 5000000 333 ns/op
+Parse_CachedSplit_RE2 10000000 170 ns/op
+Parse_SplitHard_PCRE 500000 3113 ns/op
+Parse_SplitHard_RE2 200000 14117 ns/op
+Parse_CachedSplitHard_PCRE 5000000 328 ns/op
+Parse_CachedSplitHard_RE2 1000000 2472 ns/op
+Parse_CachedSplitBig1_PCRE 500 4502404 ns/op
+Parse_CachedSplitBig1_RE2 5000 635120 ns/op
+Parse_CachedSplitBig2_PCRE 5000 553267 ns/op
+Parse_CachedSplitBig2_RE2 50 51601920 ns/op
+BM_PCRE_Compile 500000 3798 ns/op
+BM_RE2_Compile 200000 10923 ns/op
+SearchPhone_CachedPCRE/8 1000000 1196 ns/op 6.68 MB/s
+SearchPhone_CachedPCRE/16 1000000 1969 ns/op 8.12 MB/s
+SearchPhone_CachedPCRE/32 500000 3511 ns/op 9.11 MB/s
+SearchPhone_CachedPCRE/64 500000 6563 ns/op 9.75 MB/s
+SearchPhone_CachedPCRE/128 200000 12796 ns/op 10.00 MB/s
+SearchPhone_CachedPCRE/256 100000 25045 ns/op 10.22 MB/s
+SearchPhone_CachedPCRE/512 50000 49381 ns/op 10.37 MB/s
+SearchPhone_CachedPCRE/1K 20000 98166 ns/op 10.43 MB/s
+SearchPhone_CachedPCRE/2K 10000 193434 ns/op 10.59 MB/s
+SearchPhone_CachedPCRE/4K 5000 382921 ns/op 10.70 MB/s
+SearchPhone_CachedPCRE/8K 2000 765255 ns/op 10.70 MB/s
+SearchPhone_CachedPCRE/16K 1000 1524376 ns/op 10.75 MB/s
+SearchPhone_CachedPCRE/32K 500 3046932 ns/op 10.75 MB/s
+SearchPhone_CachedPCRE/64K 500 6088620 ns/op 10.76 MB/s
+SearchPhone_CachedPCRE/128K 100 12170430 ns/op 10.77 MB/s
+SearchPhone_CachedPCRE/256K 100 24329780 ns/op 10.77 MB/s
+SearchPhone_CachedPCRE/512K 50 48663960 ns/op 10.77 MB/s
+SearchPhone_CachedPCRE/1M 20 97341800 ns/op 10.77 MB/s
+SearchPhone_CachedPCRE/2M 10 194512900 ns/op 10.78 MB/s
+SearchPhone_CachedPCRE/4M 5 389369200 ns/op 10.77 MB/s
+SearchPhone_CachedPCRE/8M 5 778852600 ns/op 10.77 MB/s
+SearchPhone_CachedPCRE/16M 1 1558273000 ns/op 10.77 MB/s
+SearchPhone_CachedRE2/8 2000000 884 ns/op 9.05 MB/s
+SearchPhone_CachedRE2/16 2000000 913 ns/op 17.52 MB/s
+SearchPhone_CachedRE2/32 2000000 965 ns/op 33.14 MB/s
+SearchPhone_CachedRE2/64 1000000 1078 ns/op 59.32 MB/s
+SearchPhone_CachedRE2/128 1000000 1269 ns/op 100.87 MB/s
+SearchPhone_CachedRE2/256 1000000 1678 ns/op 152.50 MB/s
+SearchPhone_CachedRE2/512 1000000 2482 ns/op 206.26 MB/s
+SearchPhone_CachedRE2/1K 500000 4110 ns/op 249.11 MB/s
+SearchPhone_CachedRE2/2K 500000 7347 ns/op 278.74 MB/s
+SearchPhone_CachedRE2/4K 200000 13805 ns/op 296.70 MB/s
+SearchPhone_CachedRE2/8K 100000 26763 ns/op 306.09 MB/s
+SearchPhone_CachedRE2/16K 50000 52718 ns/op 310.78 MB/s
+SearchPhone_CachedRE2/32K 10000 104770 ns/op 312.76 MB/s
+SearchPhone_CachedRE2/64K 10000 208323 ns/op 314.59 MB/s
+SearchPhone_CachedRE2/128K 5000 415437 ns/op 315.50 MB/s
+SearchPhone_CachedRE2/256K 2000 829593 ns/op 315.99 MB/s
+SearchPhone_CachedRE2/512K 1000 1657998 ns/op 316.22 MB/s
+SearchPhone_CachedRE2/1M 500 3314964 ns/op 316.32 MB/s
+SearchPhone_CachedRE2/2M 500 6639102 ns/op 315.88 MB/s
+SearchPhone_CachedRE2/4M 100 13334810 ns/op 314.54 MB/s
+SearchPhone_CachedRE2/8M 100 26721480 ns/op 313.93 MB/s
+SearchPhone_CachedRE2/16M 50 53438280 ns/op 313.96 MB/s
+EmptyPartialMatchPCRE 20000000 138 ns/op
+EmptyPartialMatchRE2 5000000 314 ns/op
+SimplePartialMatchPCRE 10000000 193 ns/op
+SimplePartialMatchRE2 5000000 344 ns/op
+HTTPPartialMatchPCRE 5000000 574 ns/op
+HTTPPartialMatchRE2 5000000 621 ns/op
+SmallHTTPPartialMatchPCRE 5000000 576 ns/op
+SmallHTTPPartialMatchRE2 5000000 625 ns/op
+DotMatchPCRE 5000000 414 ns/op
+DotMatchRE2 5000000 670 ns/op
+ASCIIMatchPCRE 5000000 395 ns/op
+ASCIIMatchRE2 5000000 668 ns/op
+==BENCHMARK== c2 Fri Feb 26 16:11:53 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# fd9366132ce9+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 132 ns/op 60.22 MB/s
+Search_Easy0_CachedPCRE/16 10000000 158 ns/op 100.63 MB/s
+Search_Easy0_CachedPCRE/32 10000000 215 ns/op 148.22 MB/s
+Search_Easy0_CachedPCRE/64 5000000 329 ns/op 194.12 MB/s
+Search_Easy0_CachedPCRE/128 5000000 429 ns/op 297.74 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1081 ns/op 236.75 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1740 ns/op 294.12 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3390 ns/op 302.01 MB/s
+Search_Easy0_CachedPCRE/2K 500000 5682 ns/op 360.42 MB/s
+Search_Easy0_CachedPCRE/4K 200000 10631 ns/op 385.26 MB/s
+Search_Easy0_CachedPCRE/8K 100000 21774 ns/op 376.22 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42171 ns/op 388.51 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85140 ns/op 384.87 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169833 ns/op 385.88 MB/s
+Search_Easy0_CachedPCRE/128K 5000 341039 ns/op 384.33 MB/s
+Search_Easy0_CachedPCRE/256K 5000 680619 ns/op 385.15 MB/s
+Search_Easy0_CachedPCRE/512K 2000 1363481 ns/op 384.52 MB/s
+Search_Easy0_CachedPCRE/1M 1000 2726584 ns/op 384.57 MB/s
+Search_Easy0_CachedPCRE/2M 500 5460554 ns/op 384.05 MB/s
+Search_Easy0_CachedPCRE/4M 100 11058850 ns/op 379.27 MB/s
+Search_Easy0_CachedPCRE/8M 100 22178340 ns/op 378.23 MB/s
+Search_Easy0_CachedPCRE/16M 50 44339640 ns/op 378.38 MB/s
+Search_Easy0_CachedRE2/8 5000000 315 ns/op 25.32 MB/s
+Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.40 MB/s
+Search_Easy0_CachedRE2/32 5000000 332 ns/op 96.12 MB/s
+Search_Easy0_CachedRE2/64 5000000 333 ns/op 191.81 MB/s
+Search_Easy0_CachedRE2/128 5000000 365 ns/op 349.86 MB/s
+Search_Easy0_CachedRE2/256 5000000 395 ns/op 646.63 MB/s
+Search_Easy0_CachedRE2/512 5000000 459 ns/op 1114.56 MB/s
+Search_Easy0_CachedRE2/1K 5000000 634 ns/op 1613.10 MB/s
+Search_Easy0_CachedRE2/2K 2000000 991 ns/op 2065.21 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1571 ns/op 2606.83 MB/s
+Search_Easy0_CachedRE2/8K 1000000 2919 ns/op 2805.81 MB/s
+Search_Easy0_CachedRE2/16K 500000 5406 ns/op 3030.65 MB/s
+Search_Easy0_CachedRE2/32K 200000 11015 ns/op 2974.76 MB/s
+Search_Easy0_CachedRE2/64K 100000 21911 ns/op 2990.89 MB/s
+Search_Easy0_CachedRE2/128K 50000 44356 ns/op 2954.95 MB/s
+Search_Easy0_CachedRE2/256K 20000 88544 ns/op 2960.58 MB/s
+Search_Easy0_CachedRE2/512K 10000 178349 ns/op 2939.67 MB/s
+Search_Easy0_CachedRE2/1M 5000 357706 ns/op 2931.39 MB/s
+Search_Easy0_CachedRE2/2M 5000 721832 ns/op 2905.32 MB/s
+Search_Easy0_CachedRE2/4M 1000 1529421 ns/op 2742.41 MB/s
+Search_Easy0_CachedRE2/8M 500 3092246 ns/op 2712.79 MB/s
+Search_Easy0_CachedRE2/16M 500 6166744 ns/op 2720.60 MB/s
+Search_Easy1_CachedPCRE/8 20000000 130 ns/op 61.31 MB/s
+Search_Easy1_CachedPCRE/16 10000000 158 ns/op 100.72 MB/s
+Search_Easy1_CachedPCRE/32 10000000 215 ns/op 148.32 MB/s
+Search_Easy1_CachedPCRE/64 5000000 329 ns/op 194.13 MB/s
+Search_Easy1_CachedPCRE/128 5000000 647 ns/op 197.60 MB/s
+Search_Easy1_CachedPCRE/256 2000000 934 ns/op 273.86 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1968 ns/op 260.14 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3418 ns/op 299.55 MB/s
+Search_Easy1_CachedPCRE/2K 500000 6235 ns/op 328.42 MB/s
+Search_Easy1_CachedPCRE/4K 200000 11128 ns/op 368.07 MB/s
+Search_Easy1_CachedPCRE/8K 100000 22016 ns/op 372.09 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42398 ns/op 386.43 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85215 ns/op 384.53 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170243 ns/op 384.95 MB/s
+Search_Easy1_CachedPCRE/128K 5000 342036 ns/op 383.21 MB/s
+Search_Easy1_CachedPCRE/256K 5000 682271 ns/op 384.22 MB/s
+Search_Easy1_CachedPCRE/512K 2000 1367025 ns/op 383.52 MB/s
+Search_Easy1_CachedPCRE/1M 1000 2735481 ns/op 383.32 MB/s
+Search_Easy1_CachedPCRE/2M 500 5477128 ns/op 382.89 MB/s
+==BENCHMARK== c2 Fri Feb 26 16:14:43 PST 2010
+# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux
+# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# fd9366132ce9+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 131 ns/op 60.99 MB/s
+Search_Easy0_CachedPCRE/16 10000000 159 ns/op 100.35 MB/s
+Search_Easy0_CachedPCRE/32 10000000 216 ns/op 147.95 MB/s
+Search_Easy0_CachedPCRE/64 5000000 330 ns/op 193.87 MB/s
+Search_Easy0_CachedPCRE/128 5000000 430 ns/op 297.35 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1080 ns/op 236.90 MB/s
+Search_Easy0_CachedPCRE/512 1000000 1740 ns/op 294.24 MB/s
+Search_Easy0_CachedPCRE/1K 500000 3390 ns/op 302.06 MB/s
+Search_Easy0_CachedPCRE/2K 500000 5681 ns/op 360.48 MB/s
+Search_Easy0_CachedPCRE/4K 200000 10630 ns/op 385.32 MB/s
+Search_Easy0_CachedPCRE/8K 100000 21770 ns/op 376.29 MB/s
+Search_Easy0_CachedPCRE/16K 50000 42147 ns/op 388.73 MB/s
+Search_Easy0_CachedPCRE/32K 20000 85149 ns/op 384.83 MB/s
+Search_Easy0_CachedPCRE/64K 10000 169788 ns/op 385.99 MB/s
+Search_Easy0_CachedPCRE/128K 5000 340959 ns/op 384.42 MB/s
+Search_Easy0_CachedPCRE/256K 5000 680407 ns/op 385.27 MB/s
+Search_Easy0_CachedPCRE/512K 2000 1363245 ns/op 384.59 MB/s
+Search_Easy0_CachedPCRE/1M 1000 2726837 ns/op 384.54 MB/s
+Search_Easy0_CachedPCRE/2M 500 5462792 ns/op 383.90 MB/s
+Search_Easy0_CachedPCRE/4M 100 11055460 ns/op 379.39 MB/s
+Search_Easy0_CachedPCRE/8M 100 22174870 ns/op 378.29 MB/s
+Search_Easy0_CachedPCRE/16M 50 44348440 ns/op 378.30 MB/s
+Search_Easy0_CachedRE2/8 5000000 312 ns/op 25.61 MB/s
+Search_Easy0_CachedRE2/16 5000000 311 ns/op 51.44 MB/s
+Search_Easy0_CachedRE2/32 5000000 329 ns/op 97.01 MB/s
+Search_Easy0_CachedRE2/64 5000000 331 ns/op 193.03 MB/s
+Search_Easy0_CachedRE2/128 5000000 366 ns/op 349.43 MB/s
+Search_Easy0_CachedRE2/256 5000000 382 ns/op 668.48 MB/s
+Search_Easy0_CachedRE2/512 5000000 469 ns/op 1091.00 MB/s
+Search_Easy0_CachedRE2/1K 5000000 650 ns/op 1574.64 MB/s
+Search_Easy0_CachedRE2/2K 1000000 1002 ns/op 2043.38 MB/s
+Search_Easy0_CachedRE2/4K 1000000 1577 ns/op 2596.54 MB/s
+Search_Easy0_CachedRE2/8K 1000000 2911 ns/op 2813.46 MB/s
+Search_Easy0_CachedRE2/16K 500000 5425 ns/op 3019.69 MB/s
+Search_Easy0_CachedRE2/32K 200000 11026 ns/op 2971.78 MB/s
+Search_Easy0_CachedRE2/64K 100000 21854 ns/op 2998.69 MB/s
+Search_Easy0_CachedRE2/128K 50000 44382 ns/op 2953.23 MB/s
+Search_Easy0_CachedRE2/256K 20000 88308 ns/op 2968.52 MB/s
+Search_Easy0_CachedRE2/512K 10000 177645 ns/op 2951.32 MB/s
+Search_Easy0_CachedRE2/1M 5000 356548 ns/op 2940.90 MB/s
+Search_Easy0_CachedRE2/2M 5000 720036 ns/op 2912.56 MB/s
+Search_Easy0_CachedRE2/4M 1000 1524214 ns/op 2751.78 MB/s
+Search_Easy0_CachedRE2/8M 500 3083238 ns/op 2720.71 MB/s
+Search_Easy0_CachedRE2/16M 500 6149012 ns/op 2728.44 MB/s
+Search_Easy1_CachedPCRE/8 20000000 131 ns/op 60.89 MB/s
+Search_Easy1_CachedPCRE/16 10000000 159 ns/op 100.17 MB/s
+Search_Easy1_CachedPCRE/32 10000000 216 ns/op 147.73 MB/s
+Search_Easy1_CachedPCRE/64 5000000 330 ns/op 193.67 MB/s
+Search_Easy1_CachedPCRE/128 5000000 647 ns/op 197.80 MB/s
+Search_Easy1_CachedPCRE/256 2000000 933 ns/op 274.19 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1963 ns/op 260.71 MB/s
+Search_Easy1_CachedPCRE/1K 500000 3417 ns/op 299.65 MB/s
+Search_Easy1_CachedPCRE/2K 500000 6237 ns/op 328.32 MB/s
+Search_Easy1_CachedPCRE/4K 200000 11124 ns/op 368.19 MB/s
+Search_Easy1_CachedPCRE/8K 100000 22020 ns/op 372.02 MB/s
+Search_Easy1_CachedPCRE/16K 50000 42400 ns/op 386.41 MB/s
+Search_Easy1_CachedPCRE/32K 20000 85208 ns/op 384.56 MB/s
+Search_Easy1_CachedPCRE/64K 10000 170218 ns/op 385.01 MB/s
+Search_Easy1_CachedPCRE/128K 5000 341992 ns/op 383.26 MB/s
+Search_Easy1_CachedPCRE/256K 5000 682192 ns/op 384.27 MB/s
+Search_Easy1_CachedPCRE/512K 2000 1366643 ns/op 383.63 MB/s
+Search_Easy1_CachedPCRE/1M 1000 2735060 ns/op 383.38 MB/s
+Search_Easy1_CachedPCRE/2M 500 5477962 ns/op 382.83 MB/s
+Search_Easy1_CachedPCRE/4M 100 11090380 ns/op 378.19 MB/s
+Search_Easy1_CachedPCRE/8M 100 22241800 ns/op 377.16 MB/s
+Search_Easy1_CachedPCRE/16M 50 44479060 ns/op 377.19 MB/s
+Search_Easy1_CachedRE2/8 5000000 314 ns/op 25.47 MB/s
+Search_Easy1_CachedRE2/16 5000000 316 ns/op 50.60 MB/s
+Search_Easy1_CachedRE2/32 5000000 332 ns/op 96.25 MB/s
+Search_Easy1_CachedRE2/64 5000000 338 ns/op 189.05 MB/s
+Search_Easy1_CachedRE2/128 5000000 367 ns/op 348.49 MB/s
+Search_Easy1_CachedRE2/256 5000000 399 ns/op 641.03 MB/s
+Search_Easy1_CachedRE2/512 5000000 468 ns/op 1092.75 MB/s
+Search_Easy1_CachedRE2/1K 5000000 650 ns/op 1573.57 MB/s
+Search_Easy1_CachedRE2/2K 1000000 1002 ns/op 2042.31 MB/s
+Search_Easy1_CachedRE2/4K 1000000 1576 ns/op 2598.20 MB/s
+Search_Easy1_CachedRE2/8K 1000000 2918 ns/op 2806.71 MB/s
+Search_Easy1_CachedRE2/16K 500000 5447 ns/op 3007.80 MB/s
+Search_Easy1_CachedRE2/32K 200000 10969 ns/op 2987.17 MB/s
+Search_Easy1_CachedRE2/64K 100000 21865 ns/op 2997.18 MB/s
+Search_Easy1_CachedRE2/128K 50000 44355 ns/op 2955.06 MB/s
+Search_Easy1_CachedRE2/256K 20000 88281 ns/op 2969.41 MB/s
+Search_Easy1_CachedRE2/512K 10000 177638 ns/op 2951.44 MB/s
+Search_Easy1_CachedRE2/1M 5000 356550 ns/op 2940.89 MB/s
+Search_Easy1_CachedRE2/2M 5000 720024 ns/op 2912.61 MB/s
+Search_Easy1_CachedRE2/4M 1000 1524169 ns/op 2751.86 MB/s
+Search_Easy1_CachedRE2/8M 500 3084670 ns/op 2719.45 MB/s
+Search_Easy1_CachedRE2/16M 500 6151972 ns/op 2727.13 MB/s
+Search_Medium_CachedPCRE/8 20000000 132 ns/op 60.22 MB/s
+Search_Medium_CachedPCRE/16 10000000 161 ns/op 99.16 MB/s
+Search_Medium_CachedPCRE/32 10000000 218 ns/op 146.58 MB/s
+Search_Medium_CachedPCRE/64 5000000 339 ns/op 188.59 MB/s
+Search_Medium_CachedPCRE/128 5000000 433 ns/op 295.34 MB/s
+Search_Medium_CachedPCRE/256 200000 9075 ns/op 28.21 MB/s
+Search_Medium_CachedPCRE/512 100000 21569 ns/op 23.74 MB/s
+Search_Medium_CachedPCRE/1K 50000 42379 ns/op 24.16 MB/s
+Search_Medium_CachedPCRE/2K 50000 62363 ns/op 32.84 MB/s
+Search_Medium_CachedPCRE/4K 10000 153731 ns/op 26.64 MB/s
+Search_Medium_CachedPCRE/8K 5000 332686 ns/op 24.62 MB/s
+Search_Medium_CachedPCRE/16K 5000 678481 ns/op 24.15 MB/s
+Search_Medium_CachedPCRE/32K 2000 1356329 ns/op 24.16 MB/s
+Search_Medium_CachedPCRE/64K 1000 2709033 ns/op 24.19 MB/s
+Search_Medium_CachedPCRE/128K 500 5413924 ns/op 24.21 MB/s
+Search_Medium_CachedPCRE/256K 100 10832790 ns/op 24.20 MB/s
+Search_Medium_CachedRE2/8 5000000 332 ns/op 24.08 MB/s
+Search_Medium_CachedRE2/16 5000000 358 ns/op 44.58 MB/s
+Search_Medium_CachedRE2/32 5000000 407 ns/op 78.49 MB/s
+Search_Medium_CachedRE2/64 5000000 508 ns/op 125.89 MB/s
+Search_Medium_CachedRE2/128 5000000 719 ns/op 177.95 MB/s
+Search_Medium_CachedRE2/256 1000000 1123 ns/op 227.89 MB/s
+Search_Medium_CachedRE2/512 1000000 1932 ns/op 264.94 MB/s
+Search_Medium_CachedRE2/1K 500000 3550 ns/op 288.40 MB/s
+Search_Medium_CachedRE2/2K 500000 6786 ns/op 301.78 MB/s
+Search_Medium_CachedRE2/4K 200000 13256 ns/op 308.98 MB/s
+Search_Medium_CachedRE2/8K 100000 26195 ns/op 312.72 MB/s
+Search_Medium_CachedRE2/16K 50000 52079 ns/op 314.60 MB/s
+Search_Medium_CachedRE2/32K 10000 103941 ns/op 315.25 MB/s
+Search_Medium_CachedRE2/64K 10000 207495 ns/op 315.84 MB/s
+Search_Medium_CachedRE2/128K 5000 414566 ns/op 316.17 MB/s
+Search_Medium_CachedRE2/256K 2000 828759 ns/op 316.31 MB/s
+Search_Medium_CachedRE2/512K 1000 1657168 ns/op 316.38 MB/s
+Search_Medium_CachedRE2/1M 500 3314174 ns/op 316.39 MB/s
+Search_Medium_CachedRE2/2M 500 6635590 ns/op 316.05 MB/s
+Search_Medium_CachedRE2/4M 100 13336940 ns/op 314.49 MB/s
+Search_Medium_CachedRE2/8M 100 26717640 ns/op 313.97 MB/s
+Search_Medium_CachedRE2/16M 50 53430720 ns/op 314.00 MB/s
+Search_Hard_CachedPCRE/8 20000000 133 ns/op 60.13 MB/s
+Search_Hard_CachedPCRE/16 10000000 161 ns/op 99.09 MB/s
+Search_Hard_CachedPCRE/32 10000000 218 ns/op 146.59 MB/s
+Search_Hard_CachedPCRE/64 5000000 339 ns/op 188.50 MB/s
+Search_Hard_CachedPCRE/128 5000000 433 ns/op 295.22 MB/s
+Search_Hard_CachedPCRE/256 5000 572457 ns/op 0.45 MB/s
+Search_Hard_CachedPCRE/512 1000 2346699 ns/op 0.22 MB/s
+Search_Hard_CachedPCRE/1K 200 9314450 ns/op 0.11 MB/s
+Search_Hard_CachedPCRE/2K 50 34065320 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/4K 10 146729800 ns/op 0.03 MB/s
+Search_Hard_CachedRE2/8 5000000 330 ns/op 24.19 MB/s
+Search_Hard_CachedRE2/16 5000000 358 ns/op 44.66 MB/s
+Search_Hard_CachedRE2/32 5000000 412 ns/op 77.51 MB/s
+Search_Hard_CachedRE2/64 5000000 507 ns/op 125.99 MB/s
+Search_Hard_CachedRE2/128 5000000 719 ns/op 178.01 MB/s
+Search_Hard_CachedRE2/256 1000000 1122 ns/op 228.01 MB/s
+Search_Hard_CachedRE2/512 1000000 1931 ns/op 265.03 MB/s
+Search_Hard_CachedRE2/1K 500000 3550 ns/op 288.44 MB/s
+Search_Hard_CachedRE2/2K 500000 6788 ns/op 301.70 MB/s
+Search_Hard_CachedRE2/4K 200000 13256 ns/op 308.98 MB/s
+Search_Hard_CachedRE2/8K 100000 26200 ns/op 312.67 MB/s
+Search_Hard_CachedRE2/16K 50000 52082 ns/op 314.58 MB/s
+Search_Hard_CachedRE2/32K 10000 103936 ns/op 315.27 MB/s
+Search_Hard_CachedRE2/64K 10000 207497 ns/op 315.84 MB/s
+Search_Hard_CachedRE2/128K 5000 414603 ns/op 316.14 MB/s
+Search_Hard_CachedRE2/256K 2000 828770 ns/op 316.30 MB/s
+Search_Hard_CachedRE2/512K 1000 1657127 ns/op 316.38 MB/s
+Search_Hard_CachedRE2/1M 500 3314338 ns/op 316.38 MB/s
+Search_Hard_CachedRE2/2M 500 6635802 ns/op 316.04 MB/s
+Search_Hard_CachedRE2/4M 100 13338440 ns/op 314.45 MB/s
+Search_Hard_CachedRE2/8M 100 26718310 ns/op 313.96 MB/s
+Search_Hard_CachedRE2/16M 50 53433380 ns/op 313.98 MB/s
+Search_Parens_CachedPCRE/8 10000000 196 ns/op 40.67 MB/s
+Search_Parens_CachedRE2/8 5000000 337 ns/op 23.70 MB/s
+Search_Parens_CachedRE2/16 5000000 365 ns/op 43.77 MB/s
+Search_Parens_CachedRE2/32 5000000 412 ns/op 77.62 MB/s
+Search_Parens_CachedRE2/64 5000000 517 ns/op 123.72 MB/s
+Search_Parens_CachedRE2/128 5000000 722 ns/op 177.17 MB/s
+Search_Parens_CachedRE2/256 1000000 1126 ns/op 227.26 MB/s
+Search_Parens_CachedRE2/512 1000000 1935 ns/op 264.56 MB/s
+Search_Parens_CachedRE2/1K 500000 3550 ns/op 288.41 MB/s
+Search_Parens_CachedRE2/2K 500000 6788 ns/op 301.68 MB/s
+Search_Parens_CachedRE2/4K 200000 13262 ns/op 308.84 MB/s
+Search_Parens_CachedRE2/8K 100000 26202 ns/op 312.65 MB/s
+Search_Parens_CachedRE2/16K 50000 52088 ns/op 314.54 MB/s
+Search_Parens_CachedRE2/32K 10000 103968 ns/op 315.17 MB/s
+Search_Parens_CachedRE2/64K 10000 207504 ns/op 315.83 MB/s
+Search_Parens_CachedRE2/128K 5000 414604 ns/op 316.14 MB/s
+Search_Parens_CachedRE2/256K 2000 828795 ns/op 316.30 MB/s
+Search_Parens_CachedRE2/512K 1000 1657211 ns/op 316.37 MB/s
+Search_Parens_CachedRE2/1M 500 3314290 ns/op 316.38 MB/s
+Search_Parens_CachedRE2/2M 500 6636392 ns/op 316.01 MB/s
+Search_Parens_CachedRE2/4M 100 13338070 ns/op 314.46 MB/s
+Search_Parens_CachedRE2/8M 100 26717640 ns/op 313.97 MB/s
+Search_Parens_CachedRE2/16M 50 53437080 ns/op 313.96 MB/s
+Search_BigFixed_CachedPCRE/8 10000000 242 ns/op 32.95 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 301 ns/op 53.10 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 418 ns/op 76.52 MB/s
+Search_BigFixed_CachedPCRE/64 5000000 654 ns/op 97.84 MB/s
+Search_BigFixed_CachedPCRE/128 2000000 985 ns/op 129.93 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 1775 ns/op 144.21 MB/s
+Search_BigFixed_CachedPCRE/512 500000 3342 ns/op 153.19 MB/s
+Search_BigFixed_CachedPCRE/1K 500000 6476 ns/op 158.12 MB/s
+Search_BigFixed_CachedPCRE/2K 200000 12744 ns/op 160.69 MB/s
+Search_BigFixed_CachedPCRE/4K 100000 25281 ns/op 162.01 MB/s
+Search_BigFixed_CachedPCRE/8K 50000 50359 ns/op 162.67 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 100607 ns/op 162.85 MB/s
+Search_BigFixed_CachedPCRE/32K 10000 200995 ns/op 163.03 MB/s
+Search_BigFixed_CachedRE2/8 20000000 131 ns/op 61.00 MB/s
+Search_BigFixed_CachedRE2/16 5000000 381 ns/op 41.95 MB/s
+Search_BigFixed_CachedRE2/32 5000000 412 ns/op 77.51 MB/s
+Search_BigFixed_CachedRE2/64 5000000 492 ns/op 129.84 MB/s
+Search_BigFixed_CachedRE2/128 5000000 636 ns/op 201.21 MB/s
+Search_BigFixed_CachedRE2/256 2000000 952 ns/op 268.71 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1552 ns/op 329.79 MB/s
+Search_BigFixed_CachedRE2/1K 1000000 2772 ns/op 369.32 MB/s
+Search_BigFixed_CachedRE2/2K 500000 5192 ns/op 394.39 MB/s
+Search_BigFixed_CachedRE2/4K 200000 10051 ns/op 407.48 MB/s
+Search_BigFixed_CachedRE2/8K 100000 19758 ns/op 414.61 MB/s
+Search_BigFixed_CachedRE2/16K 50000 39167 ns/op 418.31 MB/s
+Search_BigFixed_CachedRE2/32K 20000 78103 ns/op 419.55 MB/s
+Search_BigFixed_CachedRE2/64K 10000 155875 ns/op 420.44 MB/s
+Search_BigFixed_CachedRE2/128K 5000 311474 ns/op 420.81 MB/s
+Search_BigFixed_CachedRE2/256K 5000 622461 ns/op 421.14 MB/s
+Search_BigFixed_CachedRE2/512K 2000 1246952 ns/op 420.46 MB/s
+Search_BigFixed_CachedRE2/1M 1000 2502325 ns/op 419.04 MB/s
+Search_Success_PCRE/8 1000000 1783 ns/op 4.48 MB/s
+Search_Success_PCRE/16 1000000 1839 ns/op 8.70 MB/s
+Search_Success_PCRE/32 1000000 1934 ns/op 16.54 MB/s
+Search_Success_PCRE/64 1000000 2104 ns/op 30.41 MB/s
+Search_Success_PCRE/128 1000000 2484 ns/op 51.52 MB/s
+Search_Success_PCRE/256 500000 3181 ns/op 80.47 MB/s
+Search_Success_PCRE/512 500000 4598 ns/op 111.34 MB/s
+Search_Success_PCRE/1K 500000 7463 ns/op 137.20 MB/s
+Search_Success_PCRE/2K 200000 13079 ns/op 156.58 MB/s
+Search_Success_PCRE/4K 100000 24404 ns/op 167.84 MB/s
+Search_Success_PCRE/8K 50000 47074 ns/op 174.02 MB/s
+Search_Success_PCRE/16K 20000 92372 ns/op 177.37 MB/s
+Search_Success_PCRE/32K 10000 183212 ns/op 178.85 MB/s
+Search_Success_PCRE/64K 5000 364671 ns/op 179.71 MB/s
+Search_Success_PCRE/128K 2000 728337 ns/op 179.96 MB/s
+Search_Success_PCRE/256K 1000 1457798 ns/op 179.82 MB/s
+Search_Success_PCRE/512K 500 2926292 ns/op 179.16 MB/s
+Search_Success_PCRE/1M 500 5851210 ns/op 179.21 MB/s
+Search_Success_PCRE/2M 200 11872745 ns/op 176.64 MB/s
+Search_Success_PCRE/4M 50 25398520 ns/op 165.14 MB/s
+Search_Success_PCRE/8M 20 56956150 ns/op 147.28 MB/s
+Search_Success_PCRE/16M 10 134245000 ns/op 124.97 MB/s
+Search_Success_RE2/8 200000 8097 ns/op 0.99 MB/s
+Search_Success_RE2/16 100000 19992 ns/op 0.80 MB/s
+Search_Success_RE2/32 100000 19968 ns/op 1.60 MB/s
+Search_Success_RE2/64 100000 20151 ns/op 3.18 MB/s
+Search_Success_RE2/128 100000 20319 ns/op 6.30 MB/s
+Search_Success_RE2/256 100000 20646 ns/op 12.40 MB/s
+Search_Success_RE2/512 100000 21451 ns/op 23.87 MB/s
+Search_Success_RE2/1K 100000 23054 ns/op 44.42 MB/s
+Search_Success_RE2/2K 100000 26339 ns/op 77.75 MB/s
+Search_Success_RE2/4K 50000 32820 ns/op 124.80 MB/s
+Search_Success_RE2/8K 50000 45821 ns/op 178.78 MB/s
+Search_Success_RE2/16K 50000 71718 ns/op 228.45 MB/s
+Search_Success_RE2/32K 10000 123789 ns/op 264.71 MB/s
+Search_Success_RE2/64K 10000 227372 ns/op 288.23 MB/s
+Search_Success_RE2/128K 5000 435072 ns/op 301.26 MB/s
+Search_Success_RE2/256K 2000 851760 ns/op 307.77 MB/s
+Search_Success_RE2/512K 1000 1689906 ns/op 310.25 MB/s
+Search_Success_RE2/1M 500 3385400 ns/op 309.73 MB/s
+Search_Success_RE2/2M 200 6918485 ns/op 303.12 MB/s
+Search_Success_RE2/4M 100 14404850 ns/op 291.17 MB/s
+Search_Success_RE2/8M 50 30839480 ns/op 272.01 MB/s
+Search_Success_RE2/16M 20 73836050 ns/op 227.22 MB/s
+Search_Success_CachedPCRE/8 10000000 234 ns/op 34.15 MB/s
+Search_Success_CachedPCRE/16 10000000 287 ns/op 55.56 MB/s
+Search_Success_CachedPCRE/32 5000000 395 ns/op 80.93 MB/s
+Search_Success_CachedPCRE/64 5000000 610 ns/op 104.84 MB/s
+Search_Success_CachedPCRE/128 2000000 913 ns/op 140.13 MB/s
+Search_Success_CachedPCRE/256 1000000 1620 ns/op 157.98 MB/s
+Search_Success_CachedPCRE/512 500000 3036 ns/op 168.63 MB/s
+Search_Success_CachedPCRE/1K 500000 5866 ns/op 174.55 MB/s
+Search_Success_CachedPCRE/2K 200000 11528 ns/op 177.64 MB/s
+Search_Success_CachedPCRE/4K 100000 22851 ns/op 179.24 MB/s
+Search_Success_CachedPCRE/8K 50000 45501 ns/op 180.04 MB/s
+Search_Success_CachedPCRE/16K 20000 90807 ns/op 180.43 MB/s
+Search_Success_CachedPCRE/32K 10000 181512 ns/op 180.53 MB/s
+Search_Success_CachedPCRE/64K 5000 362934 ns/op 180.57 MB/s
+Search_Success_CachedPCRE/128K 2000 726545 ns/op 180.40 MB/s
+Search_Success_CachedPCRE/256K 1000 1455974 ns/op 180.05 MB/s
+Search_Success_CachedPCRE/512K 500 2924332 ns/op 179.28 MB/s
+Search_Success_CachedPCRE/1M 500 5848344 ns/op 179.29 MB/s
+Search_Success_CachedPCRE/2M 200 11865095 ns/op 176.75 MB/s
+Search_Success_CachedPCRE/4M 50 25384340 ns/op 165.23 MB/s
+Search_Success_CachedPCRE/8M 20 56942400 ns/op 147.32 MB/s
+Search_Success_CachedPCRE/16M 10 134227100 ns/op 124.99 MB/s
+Search_Success_CachedRE2/8 20000000 133 ns/op 59.99 MB/s
+Search_Success_CachedRE2/16 5000000 371 ns/op 43.03 MB/s
+Search_Success_CachedRE2/32 5000000 417 ns/op 76.69 MB/s
+Search_Success_CachedRE2/64 5000000 517 ns/op 123.61 MB/s
+Search_Success_CachedRE2/128 5000000 730 ns/op 175.23 MB/s
+Search_Success_CachedRE2/256 1000000 1134 ns/op 225.72 MB/s
+Search_Success_CachedRE2/512 1000000 1943 ns/op 263.49 MB/s
+Search_Success_CachedRE2/1K 500000 3560 ns/op 287.59 MB/s
+Search_Success_CachedRE2/2K 500000 6796 ns/op 301.32 MB/s
+Search_Success_CachedRE2/4K 200000 13266 ns/op 308.76 MB/s
+Search_Success_CachedRE2/8K 100000 26213 ns/op 312.51 MB/s
+Search_Success_CachedRE2/16K 50000 52097 ns/op 314.49 MB/s
+Search_Success_CachedRE2/32K 10000 104050 ns/op 314.92 MB/s
+Search_Success_CachedRE2/64K 10000 207657 ns/op 315.60 MB/s
+Search_Success_CachedRE2/128K 5000 415228 ns/op 315.66 MB/s
+Search_Success_CachedRE2/256K 2000 831992 ns/op 315.08 MB/s
+Search_Success_CachedRE2/512K 1000 1669679 ns/op 314.01 MB/s
+Search_Success_CachedRE2/1M 500 3364660 ns/op 311.64 MB/s
+Search_Success_CachedRE2/2M 200 6892065 ns/op 304.29 MB/s
+Search_Success_CachedRE2/4M 100 14355860 ns/op 292.17 MB/s
+Search_Success_CachedRE2/8M 50 30788480 ns/op 272.46 MB/s
+Search_Success_CachedRE2/16M 20 73781750 ns/op 227.39 MB/s
+Search_Success1_PCRE/8 1000000 1945 ns/op 4.11 MB/s
+Search_Success1_PCRE/16 1000000 2005 ns/op 7.98 MB/s
+Search_Success1_PCRE/32 1000000 2102 ns/op 15.22 MB/s
+Search_Success1_PCRE/64 1000000 2277 ns/op 28.10 MB/s
+Search_Success1_PCRE/128 1000000 2640 ns/op 48.48 MB/s
+Search_Success1_PCRE/256 500000 3321 ns/op 77.07 MB/s
+Search_Success1_PCRE/512 500000 4750 ns/op 107.79 MB/s
+Search_Success1_PCRE/1K 200000 7579 ns/op 135.11 MB/s
+Search_Success1_PCRE/2K 200000 13241 ns/op 154.67 MB/s
+Search_Success1_PCRE/4K 100000 24584 ns/op 166.61 MB/s
+Search_Success1_PCRE/8K 50000 47274 ns/op 173.29 MB/s
+Search_Success1_PCRE/16K 20000 92603 ns/op 176.93 MB/s
+Search_Success1_PCRE/32K 10000 183395 ns/op 178.67 MB/s
+Search_Success1_PCRE/64K 5000 364841 ns/op 179.63 MB/s
+Search_Success1_PCRE/128K 2000 728503 ns/op 179.92 MB/s
+Search_Success1_PCRE/256K 1000 1458071 ns/op 179.79 MB/s
+Search_Success1_PCRE/512K 500 2926604 ns/op 179.15 MB/s
+Search_Success1_PCRE/1M 500 5851218 ns/op 179.21 MB/s
+Search_Success1_PCRE/2M 200 11872985 ns/op 176.63 MB/s
+Search_Success1_PCRE/4M 50 25401620 ns/op 165.12 MB/s
+Search_Success1_PCRE/8M 20 56961950 ns/op 147.27 MB/s
+Search_Success1_PCRE/16M 10 134240500 ns/op 124.98 MB/s
+Search_Success1_RE2/8 100000 27528 ns/op 0.29 MB/s
+Search_Success1_RE2/16 100000 27909 ns/op 0.57 MB/s
+Search_Success1_RE2/32 100000 27939 ns/op 1.15 MB/s
+Search_Success1_RE2/64 100000 28296 ns/op 2.26 MB/s
+Search_Success1_RE2/128 100000 28485 ns/op 4.49 MB/s
+Search_Success1_RE2/256 100000 28656 ns/op 8.93 MB/s
+Search_Success1_RE2/512 100000 29337 ns/op 17.45 MB/s
+Search_Success1_RE2/1K 50000 31020 ns/op 33.01 MB/s
+Search_Success1_RE2/2K 50000 34197 ns/op 59.89 MB/s
+Search_Success1_RE2/4K 50000 40779 ns/op 100.44 MB/s
+Search_Success1_RE2/8K 50000 53805 ns/op 152.25 MB/s
+Search_Success1_RE2/16K 20000 79804 ns/op 205.30 MB/s
+Search_Success1_RE2/32K 10000 131917 ns/op 248.40 MB/s
+Search_Success1_RE2/64K 10000 235487 ns/op 278.30 MB/s
+Search_Success1_RE2/128K 5000 443078 ns/op 295.82 MB/s
+Search_Success1_RE2/256K 2000 859950 ns/op 304.84 MB/s
+Search_Success1_RE2/512K 1000 1697973 ns/op 308.77 MB/s
+Search_Success1_RE2/1M 500 3393262 ns/op 309.02 MB/s
+Search_Success1_RE2/2M 200 6926335 ns/op 302.78 MB/s
+Search_Success1_RE2/4M 100 14413600 ns/op 291.00 MB/s
+Search_Success1_RE2/8M 50 30850640 ns/op 271.91 MB/s
+Search_Success1_RE2/16M 20 73845250 ns/op 227.19 MB/s
+Search_Success1_Cached_PCRE/8 10000000 255 ns/op 31.28 MB/s
+Search_Success1_Cached_PCRE/16 5000000 309 ns/op 51.71 MB/s
+Search_Success1_Cached_PCRE/32 5000000 416 ns/op 76.76 MB/s
+Search_Success1_Cached_PCRE/64 5000000 632 ns/op 101.25 MB/s
+Search_Success1_Cached_PCRE/128 2000000 935 ns/op 136.88 MB/s
+Search_Success1_Cached_PCRE/256 1000000 1641 ns/op 155.95 MB/s
+Search_Success1_Cached_PCRE/512 500000 3057 ns/op 167.45 MB/s
+Search_Success1_Cached_PCRE/1K 500000 5888 ns/op 173.91 MB/s
+Search_Success1_Cached_PCRE/2K 200000 11550 ns/op 177.30 MB/s
+Search_Success1_Cached_PCRE/4K 100000 22873 ns/op 179.07 MB/s
+Search_Success1_Cached_PCRE/8K 50000 45523 ns/op 179.95 MB/s
+Search_Success1_Cached_PCRE/16K 20000 90831 ns/op 180.38 MB/s
+Search_Success1_Cached_PCRE/32K 10000 181548 ns/op 180.49 MB/s
+Search_Success1_Cached_PCRE/64K 5000 362962 ns/op 180.56 MB/s
+Search_Success1_Cached_PCRE/128K 2000 726556 ns/op 180.40 MB/s
+Search_Success1_Cached_PCRE/256K 1000 1455905 ns/op 180.06 MB/s
+Search_Success1_Cached_PCRE/512K 500 2924290 ns/op 179.29 MB/s
+Search_Success1_Cached_PCRE/1M 500 5848600 ns/op 179.29 MB/s
+Search_Success1_Cached_PCRE/2M 200 11865335 ns/op 176.75 MB/s
+Search_Success1_Cached_PCRE/4M 50 25381500 ns/op 165.25 MB/s
+Search_Success1_Cached_PCRE/8M 20 56935900 ns/op 147.33 MB/s
+Search_Success1_Cached_PCRE/16M 10 134214600 ns/op 125.00 MB/s
+Search_Success1_Cached_RE2/8 5000000 343 ns/op 23.27 MB/s
+Search_Success1_Cached_RE2/16 5000000 372 ns/op 43.01 MB/s
+Search_Success1_Cached_RE2/32 5000000 421 ns/op 75.96 MB/s
+Search_Success1_Cached_RE2/64 5000000 518 ns/op 123.53 MB/s
+Search_Success1_Cached_RE2/128 5000000 730 ns/op 175.31 MB/s
+Search_Success1_Cached_RE2/256 1000000 1133 ns/op 225.77 MB/s
+Search_Success1_Cached_RE2/512 1000000 1943 ns/op 263.48 MB/s
+Search_Success1_Cached_RE2/1K 500000 3560 ns/op 287.59 MB/s
+Search_Success1_Cached_RE2/2K 500000 6796 ns/op 301.33 MB/s
+Search_Success1_Cached_RE2/4K 200000 13269 ns/op 308.69 MB/s
+Search_Success1_Cached_RE2/8K 100000 26212 ns/op 312.52 MB/s
+Search_Success1_Cached_RE2/16K 50000 52104 ns/op 314.45 MB/s
+Search_Success1_Cached_RE2/32K 10000 104063 ns/op 314.88 MB/s
+Search_Success1_Cached_RE2/64K 10000 207703 ns/op 315.53 MB/s
+Search_Success1_Cached_RE2/128K 5000 415264 ns/op 315.64 MB/s
+Search_Success1_Cached_RE2/256K 2000 831974 ns/op 315.09 MB/s
+Search_Success1_Cached_RE2/512K 1000 1669692 ns/op 314.00 MB/s
+Search_Success1_Cached_RE2/1M 500 3364484 ns/op 311.66 MB/s
+Search_Success1_Cached_RE2/2M 200 6892295 ns/op 304.27 MB/s
+Search_Success1_Cached_RE2/4M 100 14355830 ns/op 292.17 MB/s
+Search_Success1_Cached_RE2/8M 50 30788400 ns/op 272.46 MB/s
+Search_Success1_Cached_RE2/16M 20 73781700 ns/op 227.39 MB/s
+Search_Digits_PCRE 500000 4957 ns/op
+Search_Digits_RE2 100000 22155 ns/op
+Parse_Digits_PCRE 500000 5045 ns/op
+Parse_Digits_RE2 200000 9570 ns/op
+Parse_CachedDigits_PCRE 5000000 448 ns/op
+Parse_CachedDigits_RE2 5000000 301 ns/op
+Parse_DigitDs_PCRE 500000 4075 ns/op
+Parse_DigitDs_RE2 200000 9567 ns/op
+Parse_CachedDigitDs_PCRE 5000000 453 ns/op
+Parse_CachedDigitDs_RE2 5000000 301 ns/op
+Parse_Split_PCRE 500000 3055 ns/op
+Parse_Split_RE2 200000 10818 ns/op
+Parse_CachedSplit_PCRE 5000000 329 ns/op
+Parse_CachedSplit_RE2 10000000 172 ns/op
+Parse_SplitHard_PCRE 500000 3069 ns/op
+Parse_SplitHard_RE2 200000 13016 ns/op
+Parse_CachedSplitHard_PCRE 5000000 325 ns/op
+Parse_CachedSplitHard_RE2 1000000 2140 ns/op
+Parse_CachedSplitBig1_PCRE 500 4502460 ns/op
+Parse_CachedSplitBig1_RE2 5000 674142 ns/op
+Parse_CachedSplitBig2_PCRE 5000 553268 ns/op
+Parse_CachedSplitBig2_RE2 50 55654780 ns/op
+BM_PCRE_Compile 500000 3780 ns/op
+BM_RE2_Compile 200000 10409 ns/op
+SearchPhone_CachedPCRE/8 1000000 1155 ns/op 6.92 MB/s
+SearchPhone_CachedPCRE/16 1000000 1900 ns/op 8.42 MB/s
+SearchPhone_CachedPCRE/32 500000 3414 ns/op 9.37 MB/s
+SearchPhone_CachedPCRE/64 500000 6265 ns/op 10.21 MB/s
+SearchPhone_CachedPCRE/128 200000 12227 ns/op 10.47 MB/s
+SearchPhone_CachedPCRE/256 100000 23880 ns/op 10.72 MB/s
+SearchPhone_CachedPCRE/512 50000 47672 ns/op 10.74 MB/s
+SearchPhone_CachedPCRE/1K 20000 94526 ns/op 10.83 MB/s
+SearchPhone_CachedPCRE/2K 10000 186297 ns/op 10.99 MB/s
+SearchPhone_CachedPCRE/4K 5000 365404 ns/op 11.21 MB/s
+SearchPhone_CachedPCRE/8K 5000 726987 ns/op 11.27 MB/s
+SearchPhone_CachedPCRE/16K 2000 1451414 ns/op 11.29 MB/s
+SearchPhone_CachedPCRE/32K 1000 2900737 ns/op 11.30 MB/s
+SearchPhone_CachedPCRE/64K 500 5795914 ns/op 11.31 MB/s
+SearchPhone_CachedPCRE/128K 100 11603080 ns/op 11.30 MB/s
+SearchPhone_CachedPCRE/256K 100 23178330 ns/op 11.31 MB/s
+SearchPhone_CachedPCRE/512K 50 46345740 ns/op 11.31 MB/s
+SearchPhone_CachedPCRE/1M 20 92692000 ns/op 11.31 MB/s
+SearchPhone_CachedPCRE/2M 10 185324900 ns/op 11.32 MB/s
+SearchPhone_CachedPCRE/4M 5 370957000 ns/op 11.31 MB/s
+SearchPhone_CachedPCRE/8M 5 741607400 ns/op 11.31 MB/s
+SearchPhone_CachedPCRE/16M 1 1482978000 ns/op 11.31 MB/s
+SearchPhone_CachedRE2/8 2000000 860 ns/op 9.30 MB/s
+SearchPhone_CachedRE2/16 2000000 903 ns/op 17.71 MB/s
+SearchPhone_CachedRE2/32 2000000 944 ns/op 33.87 MB/s
+SearchPhone_CachedRE2/64 1000000 1060 ns/op 60.34 MB/s
+SearchPhone_CachedRE2/128 1000000 1267 ns/op 100.95 MB/s
+SearchPhone_CachedRE2/256 1000000 1674 ns/op 152.88 MB/s
+SearchPhone_CachedRE2/512 1000000 2476 ns/op 206.78 MB/s
+SearchPhone_CachedRE2/1K 500000 4097 ns/op 249.91 MB/s
+SearchPhone_CachedRE2/2K 500000 7343 ns/op 278.89 MB/s
+SearchPhone_CachedRE2/4K 200000 13823 ns/op 296.31 MB/s
+SearchPhone_CachedRE2/8K 100000 26767 ns/op 306.04 MB/s
+SearchPhone_CachedRE2/16K 50000 52732 ns/op 310.70 MB/s
+SearchPhone_CachedRE2/32K 10000 104785 ns/op 312.72 MB/s
+SearchPhone_CachedRE2/64K 10000 208330 ns/op 314.58 MB/s
+SearchPhone_CachedRE2/128K 5000 415442 ns/op 315.50 MB/s
+SearchPhone_CachedRE2/256K 2000 829700 ns/op 315.95 MB/s
+SearchPhone_CachedRE2/512K 1000 1658075 ns/op 316.20 MB/s
+SearchPhone_CachedRE2/1M 500 3315348 ns/op 316.28 MB/s
+SearchPhone_CachedRE2/2M 500 6637420 ns/op 315.96 MB/s
+SearchPhone_CachedRE2/4M 100 13343750 ns/op 314.33 MB/s
+SearchPhone_CachedRE2/8M 100 26723120 ns/op 313.91 MB/s
+SearchPhone_CachedRE2/16M 50 53440620 ns/op 313.94 MB/s
+EmptyPartialMatchPCRE 20000000 137 ns/op
+EmptyPartialMatchRE2 5000000 310 ns/op
+SimplePartialMatchPCRE 10000000 188 ns/op
+SimplePartialMatchRE2 5000000 354 ns/op
+HTTPPartialMatchPCRE 5000000 574 ns/op
+HTTPPartialMatchRE2 5000000 627 ns/op
+SmallHTTPPartialMatchPCRE 5000000 574 ns/op
+SmallHTTPPartialMatchRE2 5000000 627 ns/op
+DotMatchPCRE 5000000 409 ns/op
+DotMatchRE2 5000000 691 ns/op
+ASCIIMatchPCRE 5000000 392 ns/op
+ASCIIMatchRE2 5000000 686 ns/op
diff --git a/third_party/re2/src/benchlog/benchlog.mini b/third_party/re2/src/benchlog/benchlog.mini
new file mode 100644
index 000000000..276483d95
--- /dev/null
+++ b/third_party/re2/src/benchlog/benchlog.mini
@@ -0,0 +1,582 @@
+hw.ncpu: 2
+hw.byteorder: 1234
+hw.memsize: 4294967296
+hw.activecpu: 2
+hw.physicalcpu: 2
+hw.physicalcpu_max: 2
+hw.logicalcpu: 2
+hw.logicalcpu_max: 2
+hw.cputype: 7
+hw.cpusubtype: 4
+hw.cpu64bit_capable: 1
+hw.cpufamily: 1114597871
+hw.cacheconfig: 2 1 2 0 0 0 0 0 0 0
+hw.cachesize: 3221225472 32768 2097152 0 0 0 0 0 0 0
+hw.pagesize: 4096
+hw.busfrequency: 664000000
+hw.busfrequency_min: 664000000
+hw.busfrequency_max: 664000000
+hw.cpufrequency: 1830000000
+hw.cpufrequency_min: 1830000000
+hw.cpufrequency_max: 1830000000
+hw.cachelinesize: 64
+hw.l1icachesize: 32768
+hw.l1dcachesize: 32768
+hw.l2cachesize: 2097152
+hw.tbfrequency: 1000000000
+hw.packages: 1
+hw.optional.floatingpoint: 1
+hw.optional.mmx: 1
+hw.optional.sse: 1
+hw.optional.sse2: 1
+hw.optional.sse3: 1
+hw.optional.supplementalsse3: 1
+hw.optional.sse4_1: 0
+hw.optional.sse4_2: 0
+hw.optional.x86_64: 1
+hw.machine = i386
+hw.model = Macmini2,1
+hw.ncpu = 2
+hw.byteorder = 1234
+hw.physmem = 2147483648
+hw.usermem = 1849147392
+hw.pagesize = 4096
+hw.epoch = 0
+hw.vectorunit = 1
+hw.busfrequency = 664000000
+hw.cpufrequency = 1830000000
+hw.cachelinesize = 64
+hw.l1icachesize = 32768
+hw.l1dcachesize = 32768
+hw.l2settings = 1
+hw.l2cachesize = 2097152
+hw.tbfrequency = 1000000000
+hw.memsize = 4294967296
+hw.availcpu = 2
+
+machdep.cpu.max_basic: 10
+machdep.cpu.max_ext: 2147483656
+machdep.cpu.vendor: GenuineIntel
+machdep.cpu.brand_string: Intel(R) Core(TM)2 CPU T5600 @ 1.83GHz
+machdep.cpu.family: 6
+machdep.cpu.model: 15
+machdep.cpu.extmodel: 0
+machdep.cpu.extfamily: 0
+machdep.cpu.stepping: 2
+machdep.cpu.feature_bits: 3219913727 58301
+machdep.cpu.extfeature_bits: 537921536 1
+machdep.cpu.signature: 1778
+machdep.cpu.brand: 0
+machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM
+machdep.cpu.extfeatures: SYSCALL XD EM64T
+machdep.cpu.logical_per_package: 2
+machdep.cpu.cores_per_package: 2
+machdep.cpu.microcode_version: 87
+machdep.cpu.mwait.linesize_min: 64
+machdep.cpu.mwait.linesize_max: 64
+machdep.cpu.mwait.extensions: 3
+machdep.cpu.mwait.sub_Cstates: 139808
+machdep.cpu.thermal.sensor: 1
+machdep.cpu.thermal.dynamic_acceleration: 0
+machdep.cpu.thermal.thresholds: 2
+machdep.cpu.thermal.ACNT_MCNT: 1
+machdep.cpu.arch_perf.version: 2
+machdep.cpu.arch_perf.number: 2
+machdep.cpu.arch_perf.width: 40
+machdep.cpu.arch_perf.events_number: 7
+machdep.cpu.arch_perf.events: 0
+machdep.cpu.arch_perf.fixed_number: 0
+machdep.cpu.arch_perf.fixed_width: 0
+machdep.cpu.cache.linesize: 64
+machdep.cpu.cache.L2_associativity: 6
+machdep.cpu.cache.size: 2048
+machdep.cpu.tlb.inst.small: 128
+machdep.cpu.tlb.inst.large: 8
+machdep.cpu.tlb.data.small: 16
+machdep.cpu.tlb.data.small_level1: 256
+machdep.cpu.tlb.data.large: 16
+machdep.cpu.tlb.data.large_level1: 32
+machdep.cpu.address_bits.physical: 36
+machdep.cpu.address_bits.virtual: 48
+machdep.cpu.core_count: 2
+machdep.cpu.thread_count: 2
+
+
+==BENCHMARK== mini.local Fri Feb 26 16:57:10 PST 2010
+# Darwin mini.local 10.2.0 Darwin Kernel Version 10.2.0: Tue Nov 3 10:37:10 PST 2009; root:xnu-1486.2.11~1/RELEASE_I386 i386
+# i686-apple-darwin10-g++-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5646) (dot 1)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# a94585d91e66+ tip
+# obj/test/regexp_benchmark: Mach-O 64-bit executable x86_64
+
+Search_Easy0_CachedPCRE/8 10000000 176 ns/op 45.40 MB/s
+Search_Easy0_CachedPCRE/16 10000000 209 ns/op 76.41 MB/s
+Search_Easy0_CachedPCRE/32 10000000 269 ns/op 118.53 MB/s
+Search_Easy0_CachedPCRE/64 5000000 398 ns/op 160.77 MB/s
+Search_Easy0_CachedPCRE/128 5000000 536 ns/op 238.69 MB/s
+Search_Easy0_CachedPCRE/256 2000000 897 ns/op 285.22 MB/s
+Search_Easy0_CachedPCRE/512 1000000 2161 ns/op 236.92 MB/s
+Search_Easy0_CachedPCRE/1K 500000 4769 ns/op 214.70 MB/s
+Search_Easy0_CachedPCRE/2K 200000 8031 ns/op 255.00 MB/s
+Search_Easy0_CachedPCRE/4K 100000 16208 ns/op 252.71 MB/s
+Search_Easy0_CachedPCRE/8K 50000 32219 ns/op 254.26 MB/s
+Search_Easy0_CachedPCRE/16K 50000 63347 ns/op 258.64 MB/s
+Search_Easy0_CachedPCRE/32K 10000 125875 ns/op 260.32 MB/s
+Search_Easy0_CachedPCRE/64K 10000 247829 ns/op 264.44 MB/s
+Search_Easy0_CachedPCRE/128K 5000 498699 ns/op 262.83 MB/s
+Search_Easy0_CachedPCRE/256K 2000 978021 ns/op 268.04 MB/s
+Search_Easy0_CachedPCRE/512K 1000 1975059 ns/op 265.45 MB/s
+Search_Easy0_CachedPCRE/1M 500 3994258 ns/op 262.52 MB/s
+Search_Easy0_CachedPCRE/2M 200 7959640 ns/op 263.47 MB/s
+Search_Easy0_CachedPCRE/4M 100 15950300 ns/op 262.96 MB/s
+Search_Easy0_CachedPCRE/8M 50 32435540 ns/op 258.62 MB/s
+Search_Easy0_CachedPCRE/16M 50 64686180 ns/op 259.36 MB/s
+Search_Easy0_CachedRE2/8 5000000 535 ns/op 14.95 MB/s
+Search_Easy0_CachedRE2/16 5000000 557 ns/op 28.70 MB/s
+Search_Easy0_CachedRE2/32 5000000 595 ns/op 53.75 MB/s
+Search_Easy0_CachedRE2/64 5000000 643 ns/op 99.50 MB/s
+Search_Easy0_CachedRE2/128 2000000 759 ns/op 168.64 MB/s
+Search_Easy0_CachedRE2/256 2000000 972 ns/op 263.30 MB/s
+Search_Easy0_CachedRE2/512 1000000 1458 ns/op 351.13 MB/s
+Search_Easy0_CachedRE2/1K 1000000 2544 ns/op 402.51 MB/s
+Search_Easy0_CachedRE2/2K 500000 4551 ns/op 449.99 MB/s
+Search_Easy0_CachedRE2/4K 200000 8677 ns/op 472.01 MB/s
+Search_Easy0_CachedRE2/8K 100000 17188 ns/op 476.59 MB/s
+Search_Easy0_CachedRE2/16K 50000 33869 ns/op 483.73 MB/s
+Search_Easy0_CachedRE2/32K 50000 67787 ns/op 483.39 MB/s
+Search_Easy0_CachedRE2/64K 10000 133362 ns/op 491.41 MB/s
+Search_Easy0_CachedRE2/128K 10000 266469 ns/op 491.88 MB/s
+Search_Easy0_CachedRE2/256K 5000 536980 ns/op 488.18 MB/s
+Search_Easy0_CachedRE2/512K 2000 1050843 ns/op 498.92 MB/s
+Search_Easy0_CachedRE2/1M 1000 2120649 ns/op 494.46 MB/s
+Search_Easy0_CachedRE2/2M 500 4273918 ns/op 490.69 MB/s
+Search_Easy0_CachedRE2/4M 200 8591285 ns/op 488.20 MB/s
+Search_Easy0_CachedRE2/8M 100 17197390 ns/op 487.78 MB/s
+Search_Easy0_CachedRE2/16M 50 34338780 ns/op 488.58 MB/s
+Search_Easy1_CachedPCRE/8 10000000 174 ns/op 45.74 MB/s
+Search_Easy1_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
+Search_Easy1_CachedPCRE/32 10000000 270 ns/op 118.43 MB/s
+Search_Easy1_CachedPCRE/64 5000000 402 ns/op 159.07 MB/s
+Search_Easy1_CachedPCRE/128 5000000 540 ns/op 236.84 MB/s
+Search_Easy1_CachedPCRE/256 2000000 909 ns/op 281.34 MB/s
+Search_Easy1_CachedPCRE/512 1000000 1852 ns/op 276.34 MB/s
+Search_Easy1_CachedPCRE/1K 500000 4318 ns/op 237.12 MB/s
+Search_Easy1_CachedPCRE/2K 200000 8346 ns/op 245.37 MB/s
+Search_Easy1_CachedPCRE/4K 100000 16214 ns/op 252.62 MB/s
+Search_Easy1_CachedPCRE/8K 50000 32438 ns/op 252.54 MB/s
+Search_Easy1_CachedPCRE/16K 50000 62914 ns/op 260.42 MB/s
+Search_Easy1_CachedPCRE/32K 10000 124792 ns/op 262.58 MB/s
+Search_Easy1_CachedPCRE/64K 10000 250941 ns/op 261.16 MB/s
+Search_Easy1_CachedPCRE/128K 5000 498405 ns/op 262.98 MB/s
+Search_Easy1_CachedPCRE/256K 2000 997305 ns/op 262.85 MB/s
+Search_Easy1_CachedPCRE/512K 1000 2023179 ns/op 259.14 MB/s
+Search_Easy1_CachedPCRE/1M 500 4005202 ns/op 261.80 MB/s
+Search_Easy1_CachedPCRE/2M 200 8116410 ns/op 258.38 MB/s
+Search_Easy1_CachedPCRE/4M 100 16145970 ns/op 259.77 MB/s
+Search_Easy1_CachedPCRE/8M 50 32471260 ns/op 258.34 MB/s
+Search_Easy1_CachedPCRE/16M 50 64734020 ns/op 259.17 MB/s
+Search_Easy1_CachedRE2/8 5000000 543 ns/op 14.72 MB/s
+Search_Easy1_CachedRE2/16 5000000 570 ns/op 28.07 MB/s
+Search_Easy1_CachedRE2/32 5000000 605 ns/op 52.81 MB/s
+Search_Easy1_CachedRE2/64 5000000 643 ns/op 99.39 MB/s
+Search_Easy1_CachedRE2/128 2000000 764 ns/op 167.45 MB/s
+Search_Easy1_CachedRE2/256 2000000 970 ns/op 263.85 MB/s
+Search_Easy1_CachedRE2/512 1000000 1455 ns/op 351.75 MB/s
+Search_Easy1_CachedRE2/1K 1000000 2506 ns/op 408.48 MB/s
+Search_Easy1_CachedRE2/2K 500000 4571 ns/op 447.97 MB/s
+Search_Easy1_CachedRE2/4K 200000 8812 ns/op 464.81 MB/s
+Search_Easy1_CachedRE2/8K 100000 17079 ns/op 479.65 MB/s
+Search_Easy1_CachedRE2/16K 50000 33802 ns/op 484.70 MB/s
+Search_Easy1_CachedRE2/32K 50000 67171 ns/op 487.83 MB/s
+Search_Easy1_CachedRE2/64K 10000 131505 ns/op 498.35 MB/s
+Search_Easy1_CachedRE2/128K 10000 263228 ns/op 497.94 MB/s
+Search_Easy1_CachedRE2/256K 5000 528135 ns/op 496.36 MB/s
+Search_Easy1_CachedRE2/512K 2000 1052768 ns/op 498.01 MB/s
+Search_Easy1_CachedRE2/1M 1000 2112714 ns/op 496.32 MB/s
+Search_Easy1_CachedRE2/2M 500 4289478 ns/op 488.91 MB/s
+Search_Easy1_CachedRE2/4M 200 8519430 ns/op 492.32 MB/s
+Search_Easy1_CachedRE2/8M 100 17002860 ns/op 493.36 MB/s
+Search_Easy1_CachedRE2/16M 50 34341100 ns/op 488.55 MB/s
+Search_Medium_CachedPCRE/8 10000000 175 ns/op 45.48 MB/s
+Search_Medium_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
+Search_Medium_CachedPCRE/32 10000000 273 ns/op 117.10 MB/s
+Search_Medium_CachedPCRE/64 5000000 427 ns/op 149.60 MB/s
+Search_Medium_CachedPCRE/128 200000 9382 ns/op 13.64 MB/s
+Search_Medium_CachedPCRE/256 100000 15339 ns/op 16.69 MB/s
+Search_Medium_CachedPCRE/512 50000 35837 ns/op 14.29 MB/s
+Search_Medium_CachedPCRE/1K 50000 71109 ns/op 14.40 MB/s
+Search_Medium_CachedPCRE/2K 10000 111371 ns/op 18.39 MB/s
+Search_Medium_CachedPCRE/4K 10000 264964 ns/op 15.46 MB/s
+Search_Medium_CachedPCRE/8K 5000 554964 ns/op 14.76 MB/s
+Search_Medium_CachedPCRE/16K 2000 1122116 ns/op 14.60 MB/s
+Search_Medium_CachedPCRE/32K 1000 2305129 ns/op 14.22 MB/s
+Search_Medium_CachedPCRE/64K 500 4401888 ns/op 14.89 MB/s
+Search_Medium_CachedPCRE/128K 200 8591800 ns/op 15.26 MB/s
+Search_Medium_CachedPCRE/256K 100 17534580 ns/op 14.95 MB/s
+Search_Medium_CachedRE2/8 5000000 531 ns/op 15.06 MB/s
+Search_Medium_CachedRE2/16 5000000 579 ns/op 27.60 MB/s
+Search_Medium_CachedRE2/32 5000000 666 ns/op 47.99 MB/s
+Search_Medium_CachedRE2/64 2000000 817 ns/op 78.31 MB/s
+Search_Medium_CachedRE2/128 1000000 1174 ns/op 108.94 MB/s
+Search_Medium_CachedRE2/256 1000000 1824 ns/op 140.30 MB/s
+Search_Medium_CachedRE2/512 500000 3097 ns/op 165.29 MB/s
+Search_Medium_CachedRE2/1K 500000 6101 ns/op 167.84 MB/s
+Search_Medium_CachedRE2/2K 200000 12024 ns/op 170.32 MB/s
+Search_Medium_CachedRE2/4K 100000 21483 ns/op 190.66 MB/s
+Search_Medium_CachedRE2/8K 50000 41321 ns/op 198.25 MB/s
+Search_Medium_CachedRE2/16K 20000 82227 ns/op 199.25 MB/s
+Search_Medium_CachedRE2/32K 10000 166314 ns/op 197.02 MB/s
+Search_Medium_CachedRE2/64K 5000 334190 ns/op 196.10 MB/s
+Search_Medium_CachedRE2/128K 5000 672222 ns/op 194.98 MB/s
+Search_Medium_CachedRE2/256K 2000 1335691 ns/op 196.26 MB/s
+Search_Medium_CachedRE2/512K 1000 2650973 ns/op 197.77 MB/s
+Search_Medium_CachedRE2/1M 500 5401168 ns/op 194.14 MB/s
+Search_Medium_CachedRE2/2M 100 10724160 ns/op 195.55 MB/s
+Search_Medium_CachedRE2/4M 100 21647840 ns/op 193.75 MB/s
+Search_Medium_CachedRE2/8M 50 43369000 ns/op 193.42 MB/s
+Search_Medium_CachedRE2/16M 20 85095750 ns/op 197.16 MB/s
+Search_Hard_CachedPCRE/8 10000000 178 ns/op 44.77 MB/s
+Search_Hard_CachedPCRE/16 10000000 211 ns/op 75.54 MB/s
+Search_Hard_CachedPCRE/32 10000000 274 ns/op 116.75 MB/s
+Search_Hard_CachedPCRE/64 5000000 401 ns/op 159.58 MB/s
+Search_Hard_CachedPCRE/128 5000 331833 ns/op 0.39 MB/s
+Search_Hard_CachedPCRE/256 2000 1299658 ns/op 0.20 MB/s
+Search_Hard_CachedPCRE/512 500 5361070 ns/op 0.10 MB/s
+Search_Hard_CachedPCRE/1K 100 20744900 ns/op 0.05 MB/s
+Search_Hard_CachedPCRE/2K 20 78382950 ns/op 0.03 MB/s
+Search_Hard_CachedPCRE/4K 5 335826800 ns/op 0.01 MB/s
+Search_Hard_CachedRE2/8 5000000 550 ns/op 14.53 MB/s
+Search_Hard_CachedRE2/16 5000000 600 ns/op 26.66 MB/s
+Search_Hard_CachedRE2/32 5000000 683 ns/op 46.80 MB/s
+Search_Hard_CachedRE2/64 2000000 834 ns/op 76.69 MB/s
+Search_Hard_CachedRE2/128 1000000 1168 ns/op 109.57 MB/s
+Search_Hard_CachedRE2/256 1000000 1833 ns/op 139.65 MB/s
+Search_Hard_CachedRE2/512 500000 3069 ns/op 166.81 MB/s
+Search_Hard_CachedRE2/1K 500000 5780 ns/op 177.14 MB/s
+Search_Hard_CachedRE2/2K 200000 11060 ns/op 185.17 MB/s
+Search_Hard_CachedRE2/4K 100000 21511 ns/op 190.41 MB/s
+Search_Hard_CachedRE2/8K 50000 41962 ns/op 195.22 MB/s
+Search_Hard_CachedRE2/16K 20000 82460 ns/op 198.69 MB/s
+Search_Hard_CachedRE2/32K 10000 164209 ns/op 199.55 MB/s
+Search_Hard_CachedRE2/64K 5000 326354 ns/op 200.81 MB/s
+Search_Hard_CachedRE2/128K 5000 659142 ns/op 198.85 MB/s
+Search_Hard_CachedRE2/256K 2000 1333642 ns/op 196.56 MB/s
+Search_Hard_CachedRE2/512K 1000 2687422 ns/op 195.09 MB/s
+Search_Hard_CachedRE2/1M 500 5351592 ns/op 195.94 MB/s
+Search_Hard_CachedRE2/2M 100 10581690 ns/op 198.19 MB/s
+Search_Hard_CachedRE2/4M 100 21324320 ns/op 196.69 MB/s
+Search_Hard_CachedRE2/8M 50 41892520 ns/op 200.24 MB/s
+Search_Hard_CachedRE2/16M 20 85475700 ns/op 196.28 MB/s
+Search_Parens_CachedPCRE/8 10000000 298 ns/op 26.80 MB/s
+Search_Parens_CachedRE2/8 5000000 562 ns/op 14.21 MB/s
+Search_Parens_CachedRE2/16 5000000 598 ns/op 26.71 MB/s
+Search_Parens_CachedRE2/32 5000000 676 ns/op 47.27 MB/s
+Search_Parens_CachedRE2/64 2000000 828 ns/op 77.21 MB/s
+Search_Parens_CachedRE2/128 1000000 1155 ns/op 110.73 MB/s
+Search_Parens_CachedRE2/256 1000000 1788 ns/op 143.13 MB/s
+Search_Parens_CachedRE2/512 500000 3064 ns/op 167.09 MB/s
+Search_Parens_CachedRE2/1K 500000 5698 ns/op 179.69 MB/s
+Search_Parens_CachedRE2/2K 200000 10961 ns/op 186.84 MB/s
+Search_Parens_CachedRE2/4K 100000 21527 ns/op 190.27 MB/s
+Search_Parens_CachedRE2/8K 50000 41923 ns/op 195.40 MB/s
+Search_Parens_CachedRE2/16K 20000 85505 ns/op 191.61 MB/s
+Search_Parens_CachedRE2/32K 10000 164437 ns/op 199.27 MB/s
+Search_Parens_CachedRE2/64K 5000 332654 ns/op 197.01 MB/s
+Search_Parens_CachedRE2/128K 5000 677745 ns/op 193.39 MB/s
+Search_Parens_CachedRE2/256K 2000 1331012 ns/op 196.95 MB/s
+Search_Parens_CachedRE2/512K 1000 2692594 ns/op 194.71 MB/s
+Search_Parens_CachedRE2/1M 500 5355880 ns/op 195.78 MB/s
+Search_Parens_CachedRE2/2M 100 10822340 ns/op 193.78 MB/s
+Search_Parens_CachedRE2/4M 100 21464430 ns/op 195.41 MB/s
+Search_Parens_CachedRE2/8M 50 42875940 ns/op 195.65 MB/s
+Search_Parens_CachedRE2/16M 20 84654300 ns/op 198.19 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 360 ns/op 22.21 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 442 ns/op 36.15 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 606 ns/op 52.73 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 935 ns/op 68.39 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1525 ns/op 83.91 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 2718 ns/op 94.18 MB/s
+Search_BigFixed_CachedPCRE/512 500000 5020 ns/op 101.98 MB/s
+Search_BigFixed_CachedPCRE/1K 200000 9761 ns/op 104.90 MB/s
+Search_BigFixed_CachedPCRE/2K 100000 19275 ns/op 106.25 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 38488 ns/op 106.42 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 76229 ns/op 107.46 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 155350 ns/op 105.46 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 309242 ns/op 105.96 MB/s
+Search_BigFixed_CachedRE2/8 10000000 194 ns/op 41.03 MB/s
+Search_BigFixed_CachedRE2/16 5000000 589 ns/op 27.15 MB/s
+Search_BigFixed_CachedRE2/32 5000000 655 ns/op 48.83 MB/s
+Search_BigFixed_CachedRE2/64 5000000 715 ns/op 89.46 MB/s
+Search_BigFixed_CachedRE2/128 2000000 882 ns/op 145.09 MB/s
+Search_BigFixed_CachedRE2/256 1000000 1293 ns/op 197.97 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1924 ns/op 266.06 MB/s
+Search_BigFixed_CachedRE2/1K 500000 3294 ns/op 310.79 MB/s
+Search_BigFixed_CachedRE2/2K 500000 6057 ns/op 338.10 MB/s
+Search_BigFixed_CachedRE2/4K 200000 11475 ns/op 356.93 MB/s
+Search_BigFixed_CachedRE2/8K 100000 22395 ns/op 365.79 MB/s
+Search_BigFixed_CachedRE2/16K 50000 44333 ns/op 369.56 MB/s
+Search_BigFixed_CachedRE2/32K 20000 88061 ns/op 372.11 MB/s
+Search_BigFixed_CachedRE2/64K 10000 173649 ns/op 377.40 MB/s
+Search_BigFixed_CachedRE2/128K 5000 347251 ns/op 377.46 MB/s
+Search_BigFixed_CachedRE2/256K 2000 702561 ns/op 373.13 MB/s
+Search_BigFixed_CachedRE2/512K 1000 1408041 ns/op 372.35 MB/s
+Search_BigFixed_CachedRE2/1M 500 3003070 ns/op 349.17 MB/s
+Search_Success_PCRE/8 500000 3891 ns/op 2.06 MB/s
+Search_Success_PCRE/16 500000 3865 ns/op 4.14 MB/s
+Search_Success_PCRE/32 500000 3861 ns/op 8.29 MB/s
+Search_Success_PCRE/64 500000 3921 ns/op 16.32 MB/s
+Search_Success_PCRE/128 500000 4677 ns/op 27.37 MB/s
+Search_Success_PCRE/256 500000 5362 ns/op 47.73 MB/s
+Search_Success_PCRE/512 500000 7125 ns/op 71.85 MB/s
+Search_Success_PCRE/1K 200000 10643 ns/op 96.21 MB/s
+Search_Success_PCRE/2K 100000 17620 ns/op 116.23 MB/s
+Search_Success_PCRE/4K 50000 31657 ns/op 129.39 MB/s
+Search_Success_PCRE/8K 50000 59290 ns/op 138.17 MB/s
+Search_Success_PCRE/16K 10000 115346 ns/op 142.04 MB/s
+Search_Success_PCRE/32K 10000 225258 ns/op 145.47 MB/s
+Search_Success_PCRE/64K 5000 452994 ns/op 144.67 MB/s
+Search_Success_PCRE/128K 2000 904745 ns/op 144.87 MB/s
+Search_Success_PCRE/256K 1000 1786683 ns/op 146.72 MB/s
+Search_Success_PCRE/512K 500 3600316 ns/op 145.62 MB/s
+Search_Success_PCRE/1M 200 7413055 ns/op 141.45 MB/s
+Search_Success_PCRE/2M 100 15261930 ns/op 137.41 MB/s
+Search_Success_PCRE/4M 50 32827960 ns/op 127.77 MB/s
+Search_Success_PCRE/8M 20 73886450 ns/op 113.53 MB/s
+Search_Success_PCRE/16M 5 247881200 ns/op 67.68 MB/s
+Search_Success_RE2/8 100000 18948 ns/op 0.42 MB/s
+Search_Success_RE2/16 50000 40076 ns/op 0.40 MB/s
+Search_Success_RE2/32 50000 40543 ns/op 0.79 MB/s
+Search_Success_RE2/64 50000 40520 ns/op 1.58 MB/s
+Search_Success_RE2/128 50000 41222 ns/op 3.11 MB/s
+Search_Success_RE2/256 50000 41361 ns/op 6.19 MB/s
+Search_Success_RE2/512 50000 42418 ns/op 12.07 MB/s
+Search_Success_RE2/1K 50000 45239 ns/op 22.64 MB/s
+Search_Success_RE2/2K 50000 50568 ns/op 40.50 MB/s
+Search_Success_RE2/4K 50000 60722 ns/op 67.45 MB/s
+Search_Success_RE2/8K 20000 82046 ns/op 99.85 MB/s
+Search_Success_RE2/16K 10000 125412 ns/op 130.64 MB/s
+Search_Success_RE2/32K 10000 211805 ns/op 154.71 MB/s
+Search_Success_RE2/64K 5000 373132 ns/op 175.64 MB/s
+Search_Success_RE2/128K 2000 710166 ns/op 184.57 MB/s
+Search_Success_RE2/256K 2000 1392231 ns/op 188.29 MB/s
+Search_Success_RE2/512K 1000 2763051 ns/op 189.75 MB/s
+Search_Success_RE2/1M 500 5547628 ns/op 189.01 MB/s
+Search_Success_RE2/2M 100 11709090 ns/op 179.10 MB/s
+Search_Success_RE2/4M 50 25220160 ns/op 166.31 MB/s
+Search_Success_RE2/8M 20 59411600 ns/op 141.19 MB/s
+Search_Success_RE2/16M 5 219468600 ns/op 76.44 MB/s
+Search_Success_CachedPCRE/8 5000000 328 ns/op 24.35 MB/s
+Search_Success_CachedPCRE/16 5000000 389 ns/op 41.06 MB/s
+Search_Success_CachedPCRE/32 5000000 507 ns/op 63.11 MB/s
+Search_Success_CachedPCRE/64 2000000 754 ns/op 84.80 MB/s
+Search_Success_CachedPCRE/128 1000000 1164 ns/op 109.89 MB/s
+Search_Success_CachedPCRE/256 1000000 2051 ns/op 124.81 MB/s
+Search_Success_CachedPCRE/512 500000 3831 ns/op 133.64 MB/s
+Search_Success_CachedPCRE/1K 500000 7280 ns/op 140.66 MB/s
+Search_Success_CachedPCRE/2K 200000 14254 ns/op 143.67 MB/s
+Search_Success_CachedPCRE/4K 100000 28223 ns/op 145.13 MB/s
+Search_Success_CachedPCRE/8K 50000 55445 ns/op 147.75 MB/s
+Search_Success_CachedPCRE/16K 10000 112739 ns/op 145.33 MB/s
+Search_Success_CachedPCRE/32K 10000 219943 ns/op 148.98 MB/s
+Search_Success_CachedPCRE/64K 5000 440884 ns/op 148.65 MB/s
+Search_Success_CachedPCRE/128K 2000 898950 ns/op 145.81 MB/s
+Search_Success_CachedPCRE/256K 1000 1775905 ns/op 147.61 MB/s
+Search_Success_CachedPCRE/512K 500 3579178 ns/op 146.48 MB/s
+Search_Success_CachedPCRE/1M 200 7278075 ns/op 144.07 MB/s
+Search_Success_CachedPCRE/2M 100 14954670 ns/op 140.23 MB/s
+Search_Success_CachedPCRE/4M 50 31865060 ns/op 131.63 MB/s
+Search_Success_CachedPCRE/8M 20 73977900 ns/op 113.39 MB/s
+Search_Success_CachedPCRE/16M 5 250587400 ns/op 66.95 MB/s
+Search_Success_CachedRE2/8 10000000 206 ns/op 38.80 MB/s
+Search_Success_CachedRE2/16 5000000 598 ns/op 26.75 MB/s
+Search_Success_CachedRE2/32 5000000 675 ns/op 47.39 MB/s
+Search_Success_CachedRE2/64 2000000 847 ns/op 75.52 MB/s
+Search_Success_CachedRE2/128 1000000 1211 ns/op 105.65 MB/s
+Search_Success_CachedRE2/256 1000000 1886 ns/op 135.73 MB/s
+Search_Success_CachedRE2/512 500000 3123 ns/op 163.90 MB/s
+Search_Success_CachedRE2/1K 500000 5754 ns/op 177.94 MB/s
+Search_Success_CachedRE2/2K 200000 10929 ns/op 187.38 MB/s
+Search_Success_CachedRE2/4K 100000 20887 ns/op 196.10 MB/s
+Search_Success_CachedRE2/8K 50000 41295 ns/op 198.37 MB/s
+Search_Success_CachedRE2/16K 20000 82338 ns/op 198.98 MB/s
+Search_Success_CachedRE2/32K 10000 168893 ns/op 194.02 MB/s
+Search_Success_CachedRE2/64K 5000 337449 ns/op 194.21 MB/s
+Search_Success_CachedRE2/128K 5000 670247 ns/op 195.56 MB/s
+Search_Success_CachedRE2/256K 2000 1342666 ns/op 195.24 MB/s
+Search_Success_CachedRE2/512K 1000 2711677 ns/op 193.34 MB/s
+Search_Success_CachedRE2/1M 500 5403052 ns/op 194.07 MB/s
+Search_Success_CachedRE2/2M 100 11697250 ns/op 179.29 MB/s
+Search_Success_CachedRE2/4M 50 24796680 ns/op 169.15 MB/s
+Search_Success_CachedRE2/8M 20 59587450 ns/op 140.78 MB/s
+Search_Success_CachedRE2/16M 5 225415400 ns/op 74.43 MB/s
+Search_Success1_PCRE/8 500000 4063 ns/op 1.97 MB/s
+Search_Success1_PCRE/16 500000 4104 ns/op 3.90 MB/s
+Search_Success1_PCRE/32 500000 4162 ns/op 7.69 MB/s
+Search_Success1_PCRE/64 500000 4284 ns/op 14.94 MB/s
+Search_Success1_PCRE/128 500000 4857 ns/op 26.35 MB/s
+Search_Success1_PCRE/256 500000 5507 ns/op 46.48 MB/s
+Search_Success1_PCRE/512 500000 7203 ns/op 71.08 MB/s
+Search_Success1_PCRE/1K 200000 10470 ns/op 97.80 MB/s
+Search_Success1_PCRE/2K 100000 17455 ns/op 117.33 MB/s
+Search_Success1_PCRE/4K 50000 31564 ns/op 129.77 MB/s
+Search_Success1_PCRE/8K 50000 59112 ns/op 138.58 MB/s
+Search_Success1_PCRE/16K 10000 115903 ns/op 141.36 MB/s
+Search_Success1_PCRE/32K 10000 223311 ns/op 146.74 MB/s
+Search_Success1_PCRE/64K 5000 447509 ns/op 146.45 MB/s
+Search_Success1_PCRE/128K 2000 874543 ns/op 149.87 MB/s
+Search_Success1_PCRE/256K 1000 1836342 ns/op 142.75 MB/s
+Search_Success1_PCRE/512K 500 3636250 ns/op 144.18 MB/s
+Search_Success1_PCRE/1M 200 7256345 ns/op 144.50 MB/s
+Search_Success1_PCRE/2M 100 15093450 ns/op 138.94 MB/s
+Search_Success1_PCRE/4M 50 32167920 ns/op 130.39 MB/s
+Search_Success1_PCRE/8M 20 74735800 ns/op 112.24 MB/s
+Search_Success1_PCRE/16M 5 252818600 ns/op 66.36 MB/s
+Search_Success1_RE2/8 50000 51778 ns/op 0.15 MB/s
+Search_Success1_RE2/16 50000 50754 ns/op 0.32 MB/s
+Search_Success1_RE2/32 50000 51127 ns/op 0.63 MB/s
+Search_Success1_RE2/64 50000 51305 ns/op 1.25 MB/s
+Search_Success1_RE2/128 50000 51580 ns/op 2.48 MB/s
+Search_Success1_RE2/256 50000 52019 ns/op 4.92 MB/s
+Search_Success1_RE2/512 50000 53145 ns/op 9.63 MB/s
+Search_Success1_RE2/1K 50000 55871 ns/op 18.33 MB/s
+Search_Success1_RE2/2K 50000 61477 ns/op 33.31 MB/s
+Search_Success1_RE2/4K 50000 71875 ns/op 56.99 MB/s
+Search_Success1_RE2/8K 20000 94822 ns/op 86.39 MB/s
+Search_Success1_RE2/16K 10000 137021 ns/op 119.57 MB/s
+Search_Success1_RE2/32K 10000 220596 ns/op 148.54 MB/s
+Search_Success1_RE2/64K 5000 377808 ns/op 173.46 MB/s
+Search_Success1_RE2/128K 5000 707546 ns/op 185.25 MB/s
+Search_Success1_RE2/256K 2000 1367308 ns/op 191.72 MB/s
+Search_Success1_RE2/512K 1000 2729291 ns/op 192.10 MB/s
+Search_Success1_RE2/1M 500 5439634 ns/op 192.77 MB/s
+Search_Success1_RE2/2M 100 11626860 ns/op 180.37 MB/s
+Search_Success1_RE2/4M 50 24603160 ns/op 170.48 MB/s
+Search_Success1_RE2/8M 20 59001300 ns/op 142.18 MB/s
+Search_Success1_RE2/16M 5 219520200 ns/op 76.43 MB/s
+Search_Success1_Cached_PCRE/8 5000000 373 ns/op 21.41 MB/s
+Search_Success1_Cached_PCRE/16 5000000 437 ns/op 36.61 MB/s
+Search_Success1_Cached_PCRE/32 5000000 543 ns/op 58.84 MB/s
+Search_Success1_Cached_PCRE/64 2000000 784 ns/op 81.60 MB/s
+Search_Success1_Cached_PCRE/128 1000000 1193 ns/op 107.29 MB/s
+Search_Success1_Cached_PCRE/256 1000000 2044 ns/op 125.23 MB/s
+Search_Success1_Cached_PCRE/512 500000 3734 ns/op 137.10 MB/s
+Search_Success1_Cached_PCRE/1K 500000 7121 ns/op 143.78 MB/s
+Search_Success1_Cached_PCRE/2K 200000 13767 ns/op 148.76 MB/s
+Search_Success1_Cached_PCRE/4K 100000 27176 ns/op 150.72 MB/s
+Search_Success1_Cached_PCRE/8K 50000 54155 ns/op 151.27 MB/s
+Search_Success1_Cached_PCRE/16K 10000 109309 ns/op 149.89 MB/s
+Search_Success1_Cached_PCRE/32K 10000 215890 ns/op 151.78 MB/s
+Search_Success1_Cached_PCRE/64K 5000 432550 ns/op 151.51 MB/s
+Search_Success1_Cached_PCRE/128K 2000 870568 ns/op 150.56 MB/s
+Search_Success1_Cached_PCRE/256K 1000 1756215 ns/op 149.27 MB/s
+Search_Success1_Cached_PCRE/512K 500 3671994 ns/op 142.78 MB/s
+Search_Success1_Cached_PCRE/1M 200 7134810 ns/op 146.97 MB/s
+Search_Success1_Cached_PCRE/2M 100 14672580 ns/op 142.93 MB/s
+Search_Success1_Cached_PCRE/4M 50 31146040 ns/op 134.67 MB/s
+Search_Success1_Cached_PCRE/8M 20 72224500 ns/op 116.15 MB/s
+Search_Success1_Cached_PCRE/16M 5 243683800 ns/op 68.85 MB/s
+Search_Success1_Cached_RE2/8 5000000 544 ns/op 14.69 MB/s
+Search_Success1_Cached_RE2/16 5000000 583 ns/op 27.43 MB/s
+Search_Success1_Cached_RE2/32 5000000 661 ns/op 48.37 MB/s
+Search_Success1_Cached_RE2/64 2000000 818 ns/op 78.23 MB/s
+Search_Success1_Cached_RE2/128 1000000 1148 ns/op 111.40 MB/s
+Search_Success1_Cached_RE2/256 1000000 1778 ns/op 143.95 MB/s
+Search_Success1_Cached_RE2/512 500000 3036 ns/op 168.64 MB/s
+Search_Success1_Cached_RE2/1K 500000 5549 ns/op 184.53 MB/s
+Search_Success1_Cached_RE2/2K 200000 10580 ns/op 193.56 MB/s
+Search_Success1_Cached_RE2/4K 100000 20645 ns/op 198.39 MB/s
+Search_Success1_Cached_RE2/8K 50000 40775 ns/op 200.90 MB/s
+Search_Success1_Cached_RE2/16K 20000 81030 ns/op 202.20 MB/s
+Search_Success1_Cached_RE2/32K 10000 162338 ns/op 201.85 MB/s
+Search_Success1_Cached_RE2/64K 5000 324387 ns/op 202.03 MB/s
+Search_Success1_Cached_RE2/128K 5000 648468 ns/op 202.13 MB/s
+Search_Success1_Cached_RE2/256K 2000 1299439 ns/op 201.74 MB/s
+Search_Success1_Cached_RE2/512K 1000 2608958 ns/op 200.96 MB/s
+Search_Success1_Cached_RE2/1M 500 5263964 ns/op 199.20 MB/s
+Search_Success1_Cached_RE2/2M 200 10793175 ns/op 194.30 MB/s
+Search_Success1_Cached_RE2/4M 50 24138120 ns/op 173.76 MB/s
+Search_Success1_Cached_RE2/8M 20 58223300 ns/op 144.08 MB/s
+Search_Success1_Cached_RE2/16M 5 215741400 ns/op 77.77 MB/s
+Search_Digits_PCRE 500000 7534 ns/op
+Search_Digits_RE2 50000 44162 ns/op
+Parse_Digits_PCRE 200000 7664 ns/op
+Parse_Digits_RE2 100000 22595 ns/op
+Parse_CachedDigits_PCRE 5000000 721 ns/op
+Parse_CachedDigits_RE2 5000000 413 ns/op
+Parse_DigitDs_PCRE 500000 7095 ns/op
+Parse_DigitDs_RE2 100000 22259 ns/op
+Parse_CachedDigitDs_PCRE 5000000 704 ns/op
+Parse_CachedDigitDs_RE2 5000000 415 ns/op
+Parse_Split_PCRE 500000 5540 ns/op
+Parse_Split_RE2 100000 23817 ns/op
+Parse_CachedSplit_PCRE 5000000 490 ns/op
+Parse_CachedSplit_RE2 10000000 251 ns/op
+Parse_SplitHard_PCRE 500000 5410 ns/op
+Parse_SplitHard_RE2 100000 28518 ns/op
+Parse_CachedSplitHard_PCRE 5000000 488 ns/op
+Parse_CachedSplitHard_RE2 1000000 2489 ns/op
+Parse_CachedSplitBig1_PCRE 500 7171752 ns/op
+Parse_CachedSplitBig1_RE2 2000 990722 ns/op
+Parse_CachedSplitBig2_PCRE 5000 658331 ns/op
+Parse_CachedSplitBig2_RE2 20 81205250 ns/op
+BM_PCRE_Compile 500000 6443 ns/op
+BM_RE2_Compile 100000 24103 ns/op
+SearchPhone_CachedPCRE/8 1000000 2010 ns/op 3.98 MB/s
+SearchPhone_CachedPCRE/16 500000 3286 ns/op 4.87 MB/s
+SearchPhone_CachedPCRE/32 500000 5953 ns/op 5.37 MB/s
+SearchPhone_CachedPCRE/64 200000 11181 ns/op 5.72 MB/s
+SearchPhone_CachedPCRE/128 100000 21634 ns/op 5.92 MB/s
+SearchPhone_CachedPCRE/256 50000 42315 ns/op 6.05 MB/s
+SearchPhone_CachedPCRE/512 20000 83969 ns/op 6.10 MB/s
+SearchPhone_CachedPCRE/1K 10000 166005 ns/op 6.17 MB/s
+SearchPhone_CachedPCRE/2K 5000 327433 ns/op 6.25 MB/s
+SearchPhone_CachedPCRE/4K 5000 654794 ns/op 6.26 MB/s
+SearchPhone_CachedPCRE/8K 2000 1302747 ns/op 6.29 MB/s
+SearchPhone_CachedPCRE/16K 1000 2601137 ns/op 6.30 MB/s
+SearchPhone_CachedPCRE/32K 500 5170166 ns/op 6.34 MB/s
+SearchPhone_CachedPCRE/64K 100 10378910 ns/op 6.31 MB/s
+SearchPhone_CachedPCRE/128K 100 20783360 ns/op 6.31 MB/s
+SearchPhone_CachedPCRE/256K 50 41632940 ns/op 6.30 MB/s
+SearchPhone_CachedPCRE/512K 20 83663300 ns/op 6.27 MB/s
+SearchPhone_CachedPCRE/1M 10 167093400 ns/op 6.28 MB/s
+SearchPhone_CachedPCRE/2M 5 335078800 ns/op 6.26 MB/s
+SearchPhone_CachedPCRE/4M 5 673405400 ns/op 6.23 MB/s
+SearchPhone_CachedPCRE/8M 1 1335761000 ns/op 6.28 MB/s
+SearchPhone_CachedPCRE/16M 1 2682908000 ns/op 6.25 MB/s
+SearchPhone_CachedRE2/8 1000000 1470 ns/op 5.44 MB/s
+SearchPhone_CachedRE2/16 1000000 1496 ns/op 10.69 MB/s
+SearchPhone_CachedRE2/32 1000000 1570 ns/op 20.38 MB/s
+SearchPhone_CachedRE2/64 1000000 1770 ns/op 36.15 MB/s
+SearchPhone_CachedRE2/128 1000000 2082 ns/op 61.46 MB/s
+SearchPhone_CachedRE2/256 1000000 2701 ns/op 94.78 MB/s
+SearchPhone_CachedRE2/512 500000 3963 ns/op 129.19 MB/s
+SearchPhone_CachedRE2/1K 500000 6487 ns/op 157.85 MB/s
+SearchPhone_CachedRE2/2K 200000 11527 ns/op 177.67 MB/s
+SearchPhone_CachedRE2/4K 100000 21579 ns/op 189.81 MB/s
+SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s
+SearchPhone_CachedRE2/16K 20000 82228 ns/op 199.25 MB/s
+SearchPhone_CachedRE2/32K 10000 163444 ns/op 200.48 MB/s
+SearchPhone_CachedRE2/64K 5000 325307 ns/op 201.46 MB/s
+SearchPhone_CachedRE2/128K 5000 648559 ns/op 202.10 MB/s
+SearchPhone_CachedRE2/256K 2000 1295574 ns/op 202.34 MB/s
+SearchPhone_CachedRE2/512K 1000 2591267 ns/op 202.33 MB/s
+SearchPhone_CachedRE2/1M 500 5178738 ns/op 202.48 MB/s
+SearchPhone_CachedRE2/2M 100 10389680 ns/op 201.85 MB/s
+SearchPhone_CachedRE2/4M 100 20851510 ns/op 201.15 MB/s
+SearchPhone_CachedRE2/8M 50 41763800 ns/op 200.86 MB/s
+SearchPhone_CachedRE2/16M 20 83492800 ns/op 200.94 MB/s
+EmptyPartialMatchPCRE 10000000 195 ns/op
+EmptyPartialMatchRE2 5000000 497 ns/op
+SimplePartialMatchPCRE 10000000 276 ns/op
+SimplePartialMatchRE2 5000000 548 ns/op
+HTTPPartialMatchPCRE 2000000 826 ns/op
+HTTPPartialMatchRE2 2000000 894 ns/op
+SmallHTTPPartialMatchPCRE 2000000 825 ns/op
+SmallHTTPPartialMatchRE2 2000000 895 ns/op
+DotMatchPCRE 2000000 810 ns/op
+DotMatchRE2 2000000 976 ns/op
+ASCIIMatchPCRE 5000000 604 ns/op
+ASCIIMatchRE2 2000000 976 ns/op
diff --git a/third_party/re2/src/benchlog/benchlog.r70 b/third_party/re2/src/benchlog/benchlog.r70
new file mode 100644
index 000000000..1e4e86b48
--- /dev/null
+++ b/third_party/re2/src/benchlog/benchlog.r70
@@ -0,0 +1,1475 @@
+processor : 0
+vendor_id : AuthenticAMD
+cpu family : 15
+model : 65
+model name : Dual-Core AMD Opteron(tm) Processor 8214 HE
+stepping : 2
+cpu MHz : 2200.000
+cache size : 1024 KB
+physical id : 0
+siblings : 2
+core id : 0
+cpu cores : 2
+fpu : yes
+fpu_exception : yes
+cpuid level : 1
+wp : yes
+flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt rdtscp lm 3dnowext 3dnow rep_good pni cx16 lahf_lm cmp_legacy svm extapic cr8_legacy
+bogomips : 4420.36
+TLB size : 1024 4K pages
+clflush size : 64
+cache_alignment : 64
+address sizes : 40 bits physical, 48 bits virtual
+power management: ts fid vid ttp tm stc
+
+MemTotal: 8235352 kB
+MemFree: 1083816 kB
+Buffers: 476688 kB
+Cached: 4809208 kB
+SwapCached: 88 kB
+Active: 2782140 kB
+Inactive: 3716900 kB
+SwapTotal: 2097136 kB
+SwapFree: 2047340 kB
+Dirty: 360 kB
+Writeback: 0 kB
+AnonPages: 1204244 kB
+Mapped: 123916 kB
+Slab: 583660 kB
+SReclaimable: 505492 kB
+SUnreclaim: 78168 kB
+PageTables: 13084 kB
+NFS_Unstable: 0 kB
+Bounce: 0 kB
+CommitLimit: 6214812 kB
+Committed_AS: 2371464 kB
+VmallocTotal: 34359738367 kB
+VmallocUsed: 58520 kB
+VmallocChunk: 34359678971 kB
+
+
+
+==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 14:10:56 PST 2010
+# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux
+# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# d7671f473f1a+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 149 ns/op 53.57 MB/s
+Search_Easy0_CachedPCRE/16 10000000 194 ns/op 82.16 MB/s
+Search_Easy0_CachedPCRE/32 5000000 291 ns/op 109.70 MB/s
+Search_Easy0_CachedPCRE/64 5000000 452 ns/op 141.37 MB/s
+Search_Easy0_CachedPCRE/128 2000000 773 ns/op 165.41 MB/s
+Search_Easy0_CachedPCRE/256 500000 2135 ns/op 119.88 MB/s
+Search_Easy0_CachedPCRE/512 500000 3674 ns/op 139.33 MB/s
+Search_Easy0_CachedPCRE/1K 200000 7051 ns/op 145.21 MB/s
+Search_Easy0_CachedPCRE/2K 100000 12536 ns/op 163.37 MB/s
+Search_Easy0_CachedPCRE/4K 50000 24447 ns/op 167.54 MB/s
+Search_Easy0_CachedPCRE/8K 20000 50421 ns/op 162.47 MB/s
+Search_Easy0_CachedPCRE/16K 20000 98404 ns/op 166.50 MB/s
+Search_Easy0_CachedPCRE/32K 10000 197440 ns/op 165.96 MB/s
+Search_Easy0_CachedPCRE/64K 5000 394161 ns/op 166.27 MB/s
+Search_Easy0_CachedPCRE/128K 2000 791340 ns/op 165.63 MB/s
+Search_Easy0_CachedPCRE/256K 1000 1577534 ns/op 166.17 MB/s
+Search_Easy0_CachedPCRE/512K 500 3165770 ns/op 165.61 MB/s
+Search_Easy0_CachedPCRE/1M 200 6435865 ns/op 162.93 MB/s
+Search_Easy0_CachedPCRE/2M 100 12895230 ns/op 162.63 MB/s
+Search_Easy0_CachedPCRE/4M 50 25771800 ns/op 162.75 MB/s
+Search_Easy0_CachedPCRE/8M 20 52665900 ns/op 159.28 MB/s
+Search_Easy0_CachedPCRE/16M 10 104448400 ns/op 160.63 MB/s
+Search_Easy0_CachedRE2/8 5000000 332 ns/op 24.08 MB/s
+Search_Easy0_CachedRE2/16 5000000 339 ns/op 47.16 MB/s
+Search_Easy0_CachedRE2/32 5000000 377 ns/op 84.77 MB/s
+Search_Easy0_CachedRE2/64 5000000 388 ns/op 164.54 MB/s
+Search_Easy0_CachedRE2/128 5000000 443 ns/op 288.35 MB/s
+Search_Easy0_CachedRE2/256 2000000 517 ns/op 495.00 MB/s
+Search_Easy0_CachedRE2/512 2000000 674 ns/op 759.60 MB/s
+Search_Easy0_CachedRE2/1K 1000000 1192 ns/op 858.67 MB/s
+Search_Easy0_CachedRE2/2K 500000 2145 ns/op 954.62 MB/s
+Search_Easy0_CachedRE2/4K 500000 3711 ns/op 1103.52 MB/s
+Search_Easy0_CachedRE2/8K 200000 7176 ns/op 1141.45 MB/s
+Search_Easy0_CachedRE2/16K 100000 13930 ns/op 1176.13 MB/s
+Search_Easy0_CachedRE2/32K 50000 28054 ns/op 1168.03 MB/s
+Search_Easy0_CachedRE2/64K 20000 55914 ns/op 1172.07 MB/s
+Search_Easy0_CachedRE2/128K 10000 119719 ns/op 1094.83 MB/s
+Search_Easy0_CachedRE2/256K 5000 238165 ns/op 1100.68 MB/s
+Search_Easy0_CachedRE2/512K 5000 480109 ns/op 1092.02 MB/s
+Search_Easy0_CachedRE2/1M 1000 1024370 ns/op 1023.63 MB/s
+Search_Easy0_CachedRE2/2M 500 2052224 ns/op 1021.89 MB/s
+Search_Easy0_CachedRE2/4M 500 4122288 ns/op 1017.47 MB/s
+Search_Easy0_CachedRE2/8M 200 8271160 ns/op 1014.20 MB/s
+Search_Easy0_CachedRE2/16M 100 16825980 ns/op 997.10 MB/s
+Search_Easy1_CachedPCRE/8 10000000 153 ns/op 52.14 MB/s
+Search_Easy1_CachedPCRE/16 10000000 194 ns/op 82.27 MB/s
+Search_Easy1_CachedPCRE/32 5000000 292 ns/op 109.48 MB/s
+Search_Easy1_CachedPCRE/64 5000000 451 ns/op 141.88 MB/s
+Search_Easy1_CachedPCRE/128 1000000 1148 ns/op 111.46 MB/s
+Search_Easy1_CachedPCRE/256 1000000 1857 ns/op 137.80 MB/s
+Search_Easy1_CachedPCRE/512 500000 4148 ns/op 123.43 MB/s
+Search_Easy1_CachedPCRE/1K 200000 7516 ns/op 136.23 MB/s
+Search_Easy1_CachedPCRE/2K 100000 14053 ns/op 145.73 MB/s
+Search_Easy1_CachedPCRE/4K 50000 26487 ns/op 154.64 MB/s
+Search_Easy1_CachedPCRE/8K 20000 52324 ns/op 156.56 MB/s
+Search_Easy1_CachedPCRE/16K 10000 101153 ns/op 161.97 MB/s
+Search_Easy1_CachedPCRE/32K 5000 202395 ns/op 161.90 MB/s
+Search_Easy1_CachedPCRE/64K 5000 403530 ns/op 162.41 MB/s
+Search_Easy1_CachedPCRE/128K 2000 817517 ns/op 160.33 MB/s
+Search_Easy1_CachedPCRE/256K 1000 1628277 ns/op 160.99 MB/s
+Search_Easy1_CachedPCRE/512K 500 3252172 ns/op 161.21 MB/s
+Search_Easy1_CachedPCRE/1M 200 6555365 ns/op 159.96 MB/s
+Search_Easy1_CachedPCRE/2M 100 13116580 ns/op 159.89 MB/s
+Search_Easy1_CachedPCRE/4M 50 26249100 ns/op 159.79 MB/s
+Search_Easy1_CachedPCRE/8M 20 52633400 ns/op 159.38 MB/s
+Search_Easy1_CachedPCRE/16M 10 105218400 ns/op 159.45 MB/s
+Search_Easy1_CachedRE2/8 5000000 340 ns/op 23.49 MB/s
+Search_Easy1_CachedRE2/16 5000000 341 ns/op 46.81 MB/s
+Search_Easy1_CachedRE2/32 5000000 380 ns/op 84.01 MB/s
+Search_Easy1_CachedRE2/64 5000000 395 ns/op 161.89 MB/s
+Search_Easy1_CachedRE2/128 5000000 465 ns/op 275.05 MB/s
+Search_Easy1_CachedRE2/256 2000000 512 ns/op 499.90 MB/s
+Search_Easy1_CachedRE2/512 2000000 678 ns/op 754.90 MB/s
+Search_Easy1_CachedRE2/1K 1000000 1194 ns/op 857.60 MB/s
+Search_Easy1_CachedRE2/2K 500000 2163 ns/op 946.49 MB/s
+Search_Easy1_CachedRE2/4K 500000 3722 ns/op 1100.32 MB/s
+Search_Easy1_CachedRE2/8K 200000 7134 ns/op 1148.27 MB/s
+Search_Easy1_CachedRE2/16K 100000 14008 ns/op 1169.56 MB/s
+Search_Easy1_CachedRE2/32K 50000 28535 ns/op 1148.33 MB/s
+Search_Easy1_CachedRE2/64K 20000 57155 ns/op 1146.64 MB/s
+Search_Easy1_CachedRE2/128K 10000 119610 ns/op 1095.82 MB/s
+Search_Easy1_CachedRE2/256K 5000 238525 ns/op 1099.02 MB/s
+Search_Easy1_CachedRE2/512K 5000 480327 ns/op 1091.52 MB/s
+Search_Easy1_CachedRE2/1M 1000 1026046 ns/op 1021.96 MB/s
+Search_Easy1_CachedRE2/2M 500 2035202 ns/op 1030.44 MB/s
+Search_Easy1_CachedRE2/4M 500 4095944 ns/op 1024.01 MB/s
+Search_Easy1_CachedRE2/8M 200 8295200 ns/op 1011.26 MB/s
+Search_Easy1_CachedRE2/16M 100 17081710 ns/op 982.17 MB/s
+Search_Medium_CachedPCRE/8 10000000 161 ns/op 49.55 MB/s
+Search_Medium_CachedPCRE/16 5000000 212 ns/op 75.29 MB/s
+Search_Medium_CachedPCRE/32 5000000 290 ns/op 110.22 MB/s
+Search_Medium_CachedPCRE/64 5000000 450 ns/op 142.01 MB/s
+Search_Medium_CachedPCRE/128 2000000 771 ns/op 165.99 MB/s
+Search_Medium_CachedPCRE/256 100000 18958 ns/op 13.50 MB/s
+Search_Medium_CachedPCRE/512 50000 44112 ns/op 11.61 MB/s
+Search_Medium_CachedPCRE/1K 20000 87173 ns/op 11.75 MB/s
+Search_Medium_CachedPCRE/2K 10000 129587 ns/op 15.80 MB/s
+Search_Medium_CachedPCRE/4K 5000 321362 ns/op 12.75 MB/s
+Search_Medium_CachedPCRE/8K 2000 694721 ns/op 11.79 MB/s
+Search_Medium_CachedPCRE/16K 1000 1480844 ns/op 11.06 MB/s
+Search_Medium_CachedPCRE/32K 500 3018562 ns/op 10.86 MB/s
+Search_Medium_CachedPCRE/64K 200 6037290 ns/op 10.86 MB/s
+Search_Medium_CachedPCRE/128K 100 12019360 ns/op 10.91 MB/s
+Search_Medium_CachedPCRE/256K 50 23983440 ns/op 10.93 MB/s
+Search_Medium_CachedRE2/8 5000000 335 ns/op 23.86 MB/s
+Search_Medium_CachedRE2/16 5000000 391 ns/op 40.87 MB/s
+Search_Medium_CachedRE2/32 5000000 496 ns/op 64.45 MB/s
+Search_Medium_CachedRE2/64 2000000 723 ns/op 88.46 MB/s
+Search_Medium_CachedRE2/128 1000000 1154 ns/op 110.89 MB/s
+Search_Medium_CachedRE2/256 500000 2027 ns/op 126.29 MB/s
+Search_Medium_CachedRE2/512 500000 3773 ns/op 135.68 MB/s
+Search_Medium_CachedRE2/1K 200000 7258 ns/op 141.08 MB/s
+Search_Medium_CachedRE2/2K 100000 14262 ns/op 143.59 MB/s
+Search_Medium_CachedRE2/4K 50000 28179 ns/op 145.35 MB/s
+Search_Medium_CachedRE2/8K 20000 56070 ns/op 146.10 MB/s
+Search_Medium_CachedRE2/16K 10000 111844 ns/op 146.49 MB/s
+Search_Medium_CachedRE2/32K 5000 224068 ns/op 146.24 MB/s
+Search_Medium_CachedRE2/64K 5000 447358 ns/op 146.50 MB/s
+Search_Medium_CachedRE2/128K 2000 901733 ns/op 145.36 MB/s
+Search_Medium_CachedRE2/256K 1000 1805851 ns/op 145.16 MB/s
+Search_Medium_CachedRE2/512K 500 3612816 ns/op 145.12 MB/s
+Search_Medium_CachedRE2/1M 200 7351105 ns/op 142.64 MB/s
+Search_Medium_CachedRE2/2M 100 14694290 ns/op 142.72 MB/s
+Search_Medium_CachedRE2/4M 50 29395260 ns/op 142.69 MB/s
+Search_Medium_CachedRE2/8M 20 58088750 ns/op 144.41 MB/s
+Search_Medium_CachedRE2/16M 10 116312400 ns/op 144.24 MB/s
+Search_Hard_CachedPCRE/8 10000000 162 ns/op 49.10 MB/s
+Search_Hard_CachedPCRE/16 5000000 209 ns/op 76.28 MB/s
+Search_Hard_CachedPCRE/32 5000000 289 ns/op 110.69 MB/s
+Search_Hard_CachedPCRE/64 5000000 449 ns/op 142.33 MB/s
+Search_Hard_CachedPCRE/128 2000000 769 ns/op 166.34 MB/s
+Search_Hard_CachedPCRE/256 1000 1243528 ns/op 0.21 MB/s
+Search_Hard_CachedPCRE/512 200 5089915 ns/op 0.10 MB/s
+Search_Hard_CachedPCRE/1K 50 20228240 ns/op 0.05 MB/s
+Search_Hard_CachedPCRE/2K 20 74096950 ns/op 0.03 MB/s
+Search_Hard_CachedPCRE/4K 5 318803000 ns/op 0.01 MB/s
+Search_Hard_CachedRE2/8 5000000 332 ns/op 24.03 MB/s
+Search_Hard_CachedRE2/16 5000000 385 ns/op 41.47 MB/s
+Search_Hard_CachedRE2/32 5000000 497 ns/op 64.38 MB/s
+Search_Hard_CachedRE2/64 2000000 716 ns/op 89.38 MB/s
+Search_Hard_CachedRE2/128 1000000 1146 ns/op 111.66 MB/s
+Search_Hard_CachedRE2/256 500000 2017 ns/op 126.92 MB/s
+Search_Hard_CachedRE2/512 500000 3765 ns/op 135.98 MB/s
+Search_Hard_CachedRE2/1K 200000 7257 ns/op 141.09 MB/s
+Search_Hard_CachedRE2/2K 100000 14209 ns/op 144.13 MB/s
+Search_Hard_CachedRE2/4K 50000 28224 ns/op 145.12 MB/s
+Search_Hard_CachedRE2/8K 20000 56015 ns/op 146.25 MB/s
+Search_Hard_CachedRE2/16K 10000 112066 ns/op 146.20 MB/s
+Search_Hard_CachedRE2/32K 5000 223212 ns/op 146.80 MB/s
+Search_Hard_CachedRE2/64K 5000 447573 ns/op 146.43 MB/s
+Search_Hard_CachedRE2/128K 2000 900290 ns/op 145.59 MB/s
+Search_Hard_CachedRE2/256K 1000 1803864 ns/op 145.32 MB/s
+Search_Hard_CachedRE2/512K 500 3608078 ns/op 145.31 MB/s
+Search_Hard_CachedRE2/1M 200 7270210 ns/op 144.23 MB/s
+Search_Hard_CachedRE2/2M 100 14554490 ns/op 144.09 MB/s
+Search_Hard_CachedRE2/4M 50 29162380 ns/op 143.83 MB/s
+Search_Hard_CachedRE2/8M 20 58978900 ns/op 142.23 MB/s
+Search_Hard_CachedRE2/16M 10 116714000 ns/op 143.75 MB/s
+Search_Parens_CachedPCRE/8 5000000 251 ns/op 31.75 MB/s
+Search_Parens_CachedRE2/8 5000000 328 ns/op 24.34 MB/s
+Search_Parens_CachedRE2/16 5000000 382 ns/op 41.82 MB/s
+Search_Parens_CachedRE2/32 5000000 495 ns/op 64.62 MB/s
+Search_Parens_CachedRE2/64 2000000 695 ns/op 92.04 MB/s
+Search_Parens_CachedRE2/128 1000000 1107 ns/op 115.62 MB/s
+Search_Parens_CachedRE2/256 1000000 2021 ns/op 126.63 MB/s
+Search_Parens_CachedRE2/512 500000 3768 ns/op 135.88 MB/s
+Search_Parens_CachedRE2/1K 200000 7242 ns/op 141.39 MB/s
+Search_Parens_CachedRE2/2K 100000 14241 ns/op 143.81 MB/s
+Search_Parens_CachedRE2/4K 50000 28148 ns/op 145.52 MB/s
+Search_Parens_CachedRE2/8K 20000 56228 ns/op 145.69 MB/s
+Search_Parens_CachedRE2/16K 10000 111761 ns/op 146.60 MB/s
+Search_Parens_CachedRE2/32K 5000 223183 ns/op 146.82 MB/s
+Search_Parens_CachedRE2/64K 5000 447285 ns/op 146.52 MB/s
+Search_Parens_CachedRE2/128K 2000 902123 ns/op 145.29 MB/s
+Search_Parens_CachedRE2/256K 1000 1803973 ns/op 145.31 MB/s
+Search_Parens_CachedRE2/512K 500 3642388 ns/op 143.94 MB/s
+Search_Parens_CachedRE2/1M 200 7339060 ns/op 142.88 MB/s
+Search_Parens_CachedRE2/2M 100 14671260 ns/op 142.94 MB/s
+Search_Parens_CachedRE2/4M 50 29267200 ns/op 143.31 MB/s
+Search_Parens_CachedRE2/8M 20 58361500 ns/op 143.74 MB/s
+Search_Parens_CachedRE2/16M 10 116252000 ns/op 144.32 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 400 ns/op 19.96 MB/s
+Search_BigFixed_CachedPCRE/16 2000000 506 ns/op 31.61 MB/s
+Search_BigFixed_CachedPCRE/32 2000000 697 ns/op 45.89 MB/s
+Search_BigFixed_CachedPCRE/64 1000000 1069 ns/op 59.84 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1812 ns/op 70.62 MB/s
+Search_BigFixed_CachedPCRE/256 500000 3311 ns/op 77.31 MB/s
+Search_BigFixed_CachedPCRE/512 200000 6284 ns/op 81.48 MB/s
+Search_BigFixed_CachedPCRE/1K 100000 12249 ns/op 83.60 MB/s
+Search_BigFixed_CachedPCRE/2K 50000 24210 ns/op 84.59 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 48501 ns/op 84.45 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 95883 ns/op 85.44 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 191855 ns/op 85.40 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 384026 ns/op 85.33 MB/s
+Search_BigFixed_CachedRE2/8 10000000 174 ns/op 45.91 MB/s
+Search_BigFixed_CachedRE2/16 5000000 357 ns/op 44.76 MB/s
+==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 14:19:30 PST 2010
+# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux
+# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# d7671f473f1a+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped
+
+==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 14:19:39 PST 2010
+# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux
+# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# d7671f473f1a+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 193 ns/op 41.33 MB/s
+Search_Easy0_CachedPCRE/16 5000000 233 ns/op 68.49 MB/s
+Search_Easy0_CachedPCRE/32 5000000 325 ns/op 98.43 MB/s
+Search_Easy0_CachedPCRE/64 5000000 471 ns/op 135.63 MB/s
+Search_Easy0_CachedPCRE/128 2000000 763 ns/op 167.55 MB/s
+Search_Easy0_CachedPCRE/256 500000 2056 ns/op 124.48 MB/s
+Search_Easy0_CachedPCRE/512 500000 3584 ns/op 142.84 MB/s
+Search_Easy0_CachedPCRE/1K 200000 7100 ns/op 144.21 MB/s
+Search_Easy0_CachedPCRE/2K 100000 12051 ns/op 169.93 MB/s
+Search_Easy0_CachedPCRE/4K 50000 24050 ns/op 170.31 MB/s
+Search_Easy0_CachedPCRE/8K 50000 49373 ns/op 165.92 MB/s
+Search_Easy0_CachedPCRE/16K 20000 96773 ns/op 169.30 MB/s
+Search_Easy0_CachedPCRE/32K 10000 194165 ns/op 168.76 MB/s
+Search_Easy0_CachedPCRE/64K 5000 386741 ns/op 169.46 MB/s
+Search_Easy0_CachedPCRE/128K 2000 777607 ns/op 168.56 MB/s
+Search_Easy0_CachedPCRE/256K 1000 1552988 ns/op 168.80 MB/s
+Search_Easy0_CachedPCRE/512K 500 3110474 ns/op 168.56 MB/s
+Search_Easy0_CachedPCRE/1M 200 6364800 ns/op 164.75 MB/s
+Search_Easy0_CachedPCRE/2M 100 12764160 ns/op 164.30 MB/s
+Search_Easy0_CachedPCRE/4M 50 25694680 ns/op 163.24 MB/s
+Search_Easy0_CachedPCRE/8M 20 51243350 ns/op 163.70 MB/s
+Search_Easy0_CachedPCRE/16M 10 102468300 ns/op 163.73 MB/s
+Search_Easy0_CachedRE2/8 5000000 316 ns/op 25.29 MB/s
+Search_Easy0_CachedRE2/16 5000000 325 ns/op 49.12 MB/s
+Search_Easy0_CachedRE2/32 5000000 371 ns/op 86.24 MB/s
+Search_Easy0_CachedRE2/64 5000000 369 ns/op 173.23 MB/s
+Search_Easy0_CachedRE2/128 5000000 451 ns/op 283.59 MB/s
+Search_Easy0_CachedRE2/256 2000000 562 ns/op 455.33 MB/s
+Search_Easy0_CachedRE2/512 2000000 744 ns/op 687.58 MB/s
+Search_Easy0_CachedRE2/1K 1000000 1306 ns/op 783.89 MB/s
+Search_Easy0_CachedRE2/2K 500000 2240 ns/op 913.97 MB/s
+Search_Easy0_CachedRE2/4K 500000 3941 ns/op 1039.23 MB/s
+Search_Easy0_CachedRE2/8K 200000 7648 ns/op 1071.01 MB/s
+Search_Easy0_CachedRE2/16K 100000 14759 ns/op 1110.05 MB/s
+Search_Easy0_CachedRE2/32K 50000 30302 ns/op 1081.38 MB/s
+Search_Easy0_CachedRE2/64K 20000 60931 ns/op 1075.56 MB/s
+Search_Easy0_CachedRE2/128K 10000 127313 ns/op 1029.52 MB/s
+Search_Easy0_CachedRE2/256K 5000 254134 ns/op 1031.52 MB/s
+Search_Easy0_CachedRE2/512K 5000 491892 ns/op 1065.86 MB/s
+Search_Easy0_CachedRE2/1M 1000 1133898 ns/op 924.75 MB/s
+Search_Easy0_CachedRE2/2M 500 2308796 ns/op 908.33 MB/s
+Search_Easy0_CachedRE2/4M 500 4548904 ns/op 922.05 MB/s
+Search_Easy0_CachedRE2/8M 200 9024270 ns/op 929.56 MB/s
+Search_Easy0_CachedRE2/16M 100 18438590 ns/op 909.90 MB/s
+Search_Easy1_CachedPCRE/8 10000000 191 ns/op 41.68 MB/s
+Search_Easy1_CachedPCRE/16 5000000 229 ns/op 69.63 MB/s
+Search_Easy1_CachedPCRE/32 5000000 324 ns/op 98.73 MB/s
+Search_Easy1_CachedPCRE/64 5000000 470 ns/op 136.08 MB/s
+Search_Easy1_CachedPCRE/128 1000000 1186 ns/op 107.88 MB/s
+Search_Easy1_CachedPCRE/256 1000000 1773 ns/op 144.34 MB/s
+Search_Easy1_CachedPCRE/512 500000 3948 ns/op 129.66 MB/s
+Search_Easy1_CachedPCRE/1K 200000 7218 ns/op 141.85 MB/s
+Search_Easy1_CachedPCRE/2K 100000 13609 ns/op 150.49 MB/s
+Search_Easy1_CachedPCRE/4K 50000 25314 ns/op 161.80 MB/s
+Search_Easy1_CachedPCRE/8K 20000 50313 ns/op 162.82 MB/s
+Search_Easy1_CachedPCRE/16K 20000 98301 ns/op 166.67 MB/s
+Search_Easy1_CachedPCRE/32K 10000 197269 ns/op 166.11 MB/s
+Search_Easy1_CachedPCRE/64K 5000 392344 ns/op 167.04 MB/s
+Search_Easy1_CachedPCRE/128K 2000 789037 ns/op 166.12 MB/s
+Search_Easy1_CachedPCRE/256K 1000 1572839 ns/op 166.67 MB/s
+Search_Easy1_CachedPCRE/512K 500 3152628 ns/op 166.30 MB/s
+Search_Easy1_CachedPCRE/1M 200 6467335 ns/op 162.13 MB/s
+Search_Easy1_CachedPCRE/2M 100 12945310 ns/op 162.00 MB/s
+Search_Easy1_CachedPCRE/4M 50 26107960 ns/op 160.65 MB/s
+Search_Easy1_CachedPCRE/8M 20 52019700 ns/op 161.26 MB/s
+Search_Easy1_CachedPCRE/16M 10 103979700 ns/op 161.35 MB/s
+Search_Easy1_CachedRE2/8 5000000 320 ns/op 24.99 MB/s
+Search_Easy1_CachedRE2/16 5000000 325 ns/op 49.14 MB/s
+Search_Easy1_CachedRE2/32 5000000 362 ns/op 88.30 MB/s
+Search_Easy1_CachedRE2/64 5000000 379 ns/op 168.55 MB/s
+Search_Easy1_CachedRE2/128 5000000 435 ns/op 294.25 MB/s
+Search_Easy1_CachedRE2/256 2000000 530 ns/op 482.85 MB/s
+Search_Easy1_CachedRE2/512 2000000 721 ns/op 710.03 MB/s
+Search_Easy1_CachedRE2/1K 1000000 1290 ns/op 793.55 MB/s
+Search_Easy1_CachedRE2/2K 500000 2223 ns/op 921.20 MB/s
+Search_Easy1_CachedRE2/4K 500000 3936 ns/op 1040.51 MB/s
+Search_Easy1_CachedRE2/8K 200000 7628 ns/op 1073.85 MB/s
+Search_Easy1_CachedRE2/16K 100000 14704 ns/op 1114.25 MB/s
+Search_Easy1_CachedRE2/32K 50000 30338 ns/op 1080.08 MB/s
+Search_Easy1_CachedRE2/64K 20000 60907 ns/op 1075.99 MB/s
+Search_Easy1_CachedRE2/128K 10000 127232 ns/op 1030.17 MB/s
+Search_Easy1_CachedRE2/256K 5000 253878 ns/op 1032.56 MB/s
+Search_Easy1_CachedRE2/512K 2000 510752 ns/op 1026.50 MB/s
+Search_Easy1_CachedRE2/1M 1000 1132564 ns/op 925.84 MB/s
+Search_Easy1_CachedRE2/2M 500 2265660 ns/op 925.63 MB/s
+Search_Easy1_CachedRE2/4M 500 4524512 ns/op 927.02 MB/s
+Search_Easy1_CachedRE2/8M 200 9113050 ns/op 920.50 MB/s
+Search_Easy1_CachedRE2/16M 100 18149050 ns/op 924.41 MB/s
+Search_Medium_CachedPCRE/8 10000000 196 ns/op 40.61 MB/s
+Search_Medium_CachedPCRE/16 5000000 239 ns/op 66.94 MB/s
+Search_Medium_CachedPCRE/32 5000000 311 ns/op 102.58 MB/s
+Search_Medium_CachedPCRE/64 5000000 458 ns/op 139.44 MB/s
+Search_Medium_CachedPCRE/128 2000000 747 ns/op 171.17 MB/s
+Search_Medium_CachedPCRE/256 100000 16750 ns/op 15.28 MB/s
+Search_Medium_CachedPCRE/512 50000 39824 ns/op 12.86 MB/s
+Search_Medium_CachedPCRE/1K 20000 78534 ns/op 13.04 MB/s
+Search_Medium_CachedPCRE/2K 10000 116649 ns/op 17.56 MB/s
+Search_Medium_CachedPCRE/4K 5000 289351 ns/op 14.16 MB/s
+Search_Medium_CachedPCRE/8K 2000 624232 ns/op 13.12 MB/s
+Search_Medium_CachedPCRE/16K 1000 1273353 ns/op 12.87 MB/s
+Search_Medium_CachedPCRE/32K 500 2547042 ns/op 12.87 MB/s
+Search_Medium_CachedPCRE/64K 200 5087635 ns/op 12.88 MB/s
+Search_Medium_CachedPCRE/128K 100 10218440 ns/op 12.83 MB/s
+Search_Medium_CachedPCRE/256K 50 20359020 ns/op 12.88 MB/s
+Search_Medium_CachedRE2/8 5000000 335 ns/op 23.85 MB/s
+Search_Medium_CachedRE2/16 5000000 387 ns/op 41.31 MB/s
+Search_Medium_CachedRE2/32 5000000 497 ns/op 64.31 MB/s
+Search_Medium_CachedRE2/64 2000000 698 ns/op 91.65 MB/s
+Search_Medium_CachedRE2/128 1000000 1116 ns/op 114.69 MB/s
+Search_Medium_CachedRE2/256 1000000 1941 ns/op 131.87 MB/s
+Search_Medium_CachedRE2/512 500000 3610 ns/op 141.81 MB/s
+Search_Medium_CachedRE2/1K 200000 6924 ns/op 147.87 MB/s
+Search_Medium_CachedRE2/2K 100000 13593 ns/op 150.66 MB/s
+Search_Medium_CachedRE2/4K 50000 26821 ns/op 152.71 MB/s
+Search_Medium_CachedRE2/8K 20000 53355 ns/op 153.54 MB/s
+Search_Medium_CachedRE2/16K 10000 106541 ns/op 153.78 MB/s
+Search_Medium_CachedRE2/32K 5000 213334 ns/op 153.60 MB/s
+Search_Medium_CachedRE2/64K 5000 425884 ns/op 153.88 MB/s
+Search_Medium_CachedRE2/128K 2000 861612 ns/op 152.12 MB/s
+Search_Medium_CachedRE2/256K 1000 1721932 ns/op 152.24 MB/s
+Search_Medium_CachedRE2/512K 500 3436138 ns/op 152.58 MB/s
+Search_Medium_CachedRE2/1M 200 6959260 ns/op 150.67 MB/s
+Search_Medium_CachedRE2/2M 100 13991020 ns/op 149.89 MB/s
+Search_Medium_CachedRE2/4M 50 27927180 ns/op 150.19 MB/s
+Search_Medium_CachedRE2/8M 20 56069500 ns/op 149.61 MB/s
+Search_Medium_CachedRE2/16M 10 112054200 ns/op 149.72 MB/s
+Search_Hard_CachedPCRE/8 10000000 198 ns/op 40.39 MB/s
+Search_Hard_CachedPCRE/16 5000000 240 ns/op 66.51 MB/s
+Search_Hard_CachedPCRE/32 5000000 312 ns/op 102.38 MB/s
+Search_Hard_CachedPCRE/64 5000000 459 ns/op 139.25 MB/s
+Search_Hard_CachedPCRE/128 2000000 749 ns/op 170.80 MB/s
+Search_Hard_CachedPCRE/256 1000 1035026 ns/op 0.25 MB/s
+Search_Hard_CachedPCRE/512 500 4247092 ns/op 0.12 MB/s
+Search_Hard_CachedPCRE/1K 100 16874720 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/2K 20 61595100 ns/op 0.03 MB/s
+Search_Hard_CachedPCRE/4K 5 266182000 ns/op 0.02 MB/s
+Search_Hard_CachedRE2/8 5000000 332 ns/op 24.04 MB/s
+Search_Hard_CachedRE2/16 5000000 389 ns/op 41.05 MB/s
+Search_Hard_CachedRE2/32 5000000 498 ns/op 64.25 MB/s
+Search_Hard_CachedRE2/64 2000000 695 ns/op 91.97 MB/s
+Search_Hard_CachedRE2/128 1000000 1109 ns/op 115.34 MB/s
+Search_Hard_CachedRE2/256 1000000 1944 ns/op 131.66 MB/s
+Search_Hard_CachedRE2/512 500000 3603 ns/op 142.09 MB/s
+Search_Hard_CachedRE2/1K 200000 6910 ns/op 148.17 MB/s
+Search_Hard_CachedRE2/2K 100000 13584 ns/op 150.76 MB/s
+Search_Hard_CachedRE2/4K 50000 26804 ns/op 152.81 MB/s
+Search_Hard_CachedRE2/8K 20000 53574 ns/op 152.91 MB/s
+Search_Hard_CachedRE2/16K 10000 106335 ns/op 154.08 MB/s
+Search_Hard_CachedRE2/32K 5000 212875 ns/op 153.93 MB/s
+Search_Hard_CachedRE2/64K 5000 426419 ns/op 153.69 MB/s
+Search_Hard_CachedRE2/128K 2000 860044 ns/op 152.40 MB/s
+Search_Hard_CachedRE2/256K 1000 1717290 ns/op 152.65 MB/s
+Search_Hard_CachedRE2/512K 500 3443368 ns/op 152.26 MB/s
+Search_Hard_CachedRE2/1M 200 6974745 ns/op 150.34 MB/s
+Search_Hard_CachedRE2/2M 100 13946120 ns/op 150.38 MB/s
+Search_Hard_CachedRE2/4M 50 27953660 ns/op 150.04 MB/s
+Search_Hard_CachedRE2/8M 20 55889600 ns/op 150.09 MB/s
+Search_Hard_CachedRE2/16M 10 111632200 ns/op 150.29 MB/s
+Search_Parens_CachedPCRE/8 5000000 306 ns/op 26.06 MB/s
+Search_Parens_CachedRE2/8 5000000 330 ns/op 24.18 MB/s
+Search_Parens_CachedRE2/16 5000000 384 ns/op 41.64 MB/s
+Search_Parens_CachedRE2/32 5000000 493 ns/op 64.80 MB/s
+Search_Parens_CachedRE2/64 2000000 713 ns/op 89.69 MB/s
+Search_Parens_CachedRE2/128 1000000 1148 ns/op 111.47 MB/s
+Search_Parens_CachedRE2/256 500000 2027 ns/op 126.29 MB/s
+Search_Parens_CachedRE2/512 500000 3767 ns/op 135.91 MB/s
+Search_Parens_CachedRE2/1K 200000 7264 ns/op 140.96 MB/s
+Search_Parens_CachedRE2/2K 100000 14217 ns/op 144.05 MB/s
+Search_Parens_CachedRE2/4K 50000 28234 ns/op 145.07 MB/s
+Search_Parens_CachedRE2/8K 20000 56090 ns/op 146.05 MB/s
+Search_Parens_CachedRE2/16K 10000 112201 ns/op 146.02 MB/s
+Search_Parens_CachedRE2/32K 5000 223654 ns/op 146.51 MB/s
+Search_Parens_CachedRE2/64K 5000 448713 ns/op 146.05 MB/s
+Search_Parens_CachedRE2/128K 2000 903401 ns/op 145.09 MB/s
+Search_Parens_CachedRE2/256K 1000 1801568 ns/op 145.51 MB/s
+Search_Parens_CachedRE2/512K 500 3611400 ns/op 145.18 MB/s
+Search_Parens_CachedRE2/1M 200 7303355 ns/op 143.57 MB/s
+Search_Parens_CachedRE2/2M 100 14659380 ns/op 143.06 MB/s
+Search_Parens_CachedRE2/4M 50 29371720 ns/op 142.80 MB/s
+Search_Parens_CachedRE2/8M 20 58387300 ns/op 143.67 MB/s
+Search_Parens_CachedRE2/16M 10 116634700 ns/op 143.84 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 384 ns/op 20.82 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 476 ns/op 33.60 MB/s
+Search_BigFixed_CachedPCRE/32 2000000 641 ns/op 49.87 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 969 ns/op 66.03 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1619 ns/op 79.03 MB/s
+Search_BigFixed_CachedPCRE/256 500000 2934 ns/op 87.23 MB/s
+Search_BigFixed_CachedPCRE/512 200000 5548 ns/op 92.28 MB/s
+Search_BigFixed_CachedPCRE/1K 100000 10777 ns/op 95.01 MB/s
+Search_BigFixed_CachedPCRE/2K 50000 21301 ns/op 96.14 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 42253 ns/op 96.94 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 84014 ns/op 97.51 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 168135 ns/op 97.45 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 336352 ns/op 97.42 MB/s
+Search_BigFixed_CachedRE2/8 10000000 173 ns/op 46.12 MB/s
+Search_BigFixed_CachedRE2/16 5000000 361 ns/op 44.31 MB/s
+Search_BigFixed_CachedRE2/32 5000000 428 ns/op 74.67 MB/s
+Search_BigFixed_CachedRE2/64 2000000 542 ns/op 117.92 MB/s
+Search_BigFixed_CachedRE2/128 2000000 804 ns/op 159.17 MB/s
+Search_BigFixed_CachedRE2/256 1000000 1306 ns/op 195.91 MB/s
+Search_BigFixed_CachedRE2/512 500000 2303 ns/op 222.23 MB/s
+Search_BigFixed_CachedRE2/1K 500000 4140 ns/op 247.33 MB/s
+Search_BigFixed_CachedRE2/2K 200000 8305 ns/op 246.59 MB/s
+Search_BigFixed_CachedRE2/4K 100000 16335 ns/op 250.74 MB/s
+Search_BigFixed_CachedRE2/8K 50000 32402 ns/op 252.82 MB/s
+Search_BigFixed_CachedRE2/16K 20000 61654 ns/op 265.74 MB/s
+Search_BigFixed_CachedRE2/32K 10000 123161 ns/op 266.06 MB/s
+Search_BigFixed_CachedRE2/64K 5000 250635 ns/op 261.48 MB/s
+Search_BigFixed_CachedRE2/128K 2000 501539 ns/op 261.34 MB/s
+Search_BigFixed_CachedRE2/256K 1000 1029773 ns/op 254.56 MB/s
+Search_BigFixed_CachedRE2/512K 500 2088812 ns/op 251.00 MB/s
+Search_BigFixed_CachedRE2/1M 500 4367148 ns/op 240.11 MB/s
+Search_Success_PCRE/8 500000 3069 ns/op 2.61 MB/s
+Search_Success_PCRE/16 500000 3145 ns/op 5.09 MB/s
+Search_Success_PCRE/32 500000 3288 ns/op 9.73 MB/s
+Search_Success_PCRE/64 500000 3564 ns/op 17.96 MB/s
+Search_Success_PCRE/128 500000 4104 ns/op 31.18 MB/s
+Search_Success_PCRE/256 200000 5214 ns/op 49.09 MB/s
+Search_Success_PCRE/512 200000 7414 ns/op 69.05 MB/s
+Search_Success_PCRE/1K 100000 11867 ns/op 86.29 MB/s
+Search_Success_PCRE/2K 50000 20669 ns/op 99.08 MB/s
+Search_Success_PCRE/4K 50000 38338 ns/op 106.84 MB/s
+Search_Success_PCRE/8K 20000 73632 ns/op 111.26 MB/s
+Search_Success_PCRE/16K 10000 144640 ns/op 113.27 MB/s
+Search_Success_PCRE/32K 5000 286497 ns/op 114.37 MB/s
+Search_Success_PCRE/64K 2000 571622 ns/op 114.65 MB/s
+Search_Success_PCRE/128K 1000 1141585 ns/op 114.82 MB/s
+Search_Success_PCRE/256K 500 2297252 ns/op 114.11 MB/s
+Search_Success_PCRE/512K 500 4580748 ns/op 114.45 MB/s
+Search_Success_PCRE/1M 200 9388870 ns/op 111.68 MB/s
+Search_Success_PCRE/2M 100 19154170 ns/op 109.49 MB/s
+Search_Success_PCRE/4M 50 39603180 ns/op 105.91 MB/s
+Search_Success_PCRE/8M 10 100235900 ns/op 83.69 MB/s
+Search_Success_PCRE/16M 5 249216000 ns/op 67.32 MB/s
+Search_Success_RE2/8 100000 10763 ns/op 0.74 MB/s
+Search_Success_RE2/16 50000 24745 ns/op 0.65 MB/s
+Search_Success_RE2/32 50000 24874 ns/op 1.29 MB/s
+Search_Success_RE2/64 50000 25512 ns/op 2.51 MB/s
+Search_Success_RE2/128 50000 25781 ns/op 4.96 MB/s
+Search_Success_RE2/256 50000 26515 ns/op 9.65 MB/s
+Search_Success_RE2/512 50000 28061 ns/op 18.25 MB/s
+Search_Success_RE2/1K 50000 31719 ns/op 32.28 MB/s
+Search_Success_RE2/2K 50000 38644 ns/op 53.00 MB/s
+Search_Success_RE2/4K 20000 52593 ns/op 77.88 MB/s
+Search_Success_RE2/8K 20000 80472 ns/op 101.80 MB/s
+Search_Success_RE2/16K 10000 136735 ns/op 119.82 MB/s
+Search_Success_RE2/32K 5000 248227 ns/op 132.01 MB/s
+Search_Success_RE2/64K 5000 474621 ns/op 138.08 MB/s
+Search_Success_RE2/128K 2000 926762 ns/op 141.43 MB/s
+Search_Success_RE2/256K 1000 1834769 ns/op 142.88 MB/s
+Search_Success_RE2/512K 500 3659356 ns/op 143.27 MB/s
+Search_Success_RE2/1M 200 7482580 ns/op 140.14 MB/s
+Search_Success_RE2/2M 100 15275510 ns/op 137.29 MB/s
+Search_Success_RE2/4M 50 32164720 ns/op 130.40 MB/s
+Search_Success_RE2/8M 20 71208250 ns/op 117.80 MB/s
+Search_Success_RE2/16M 5 215755600 ns/op 77.76 MB/s
+Search_Success_CachedPCRE/8 5000000 397 ns/op 20.12 MB/s
+Search_Success_CachedPCRE/16 5000000 466 ns/op 34.27 MB/s
+Search_Success_CachedPCRE/32 2000000 602 ns/op 53.10 MB/s
+Search_Success_CachedPCRE/64 2000000 881 ns/op 72.63 MB/s
+Search_Success_CachedPCRE/128 1000000 1432 ns/op 89.38 MB/s
+Search_Success_CachedPCRE/256 500000 2542 ns/op 100.69 MB/s
+Search_Success_CachedPCRE/512 500000 4750 ns/op 107.78 MB/s
+Search_Success_CachedPCRE/1K 200000 9157 ns/op 111.83 MB/s
+Search_Success_CachedPCRE/2K 100000 18016 ns/op 113.67 MB/s
+Search_Success_CachedPCRE/4K 50000 35707 ns/op 114.71 MB/s
+Search_Success_CachedPCRE/8K 20000 70955 ns/op 115.45 MB/s
+Search_Success_CachedPCRE/16K 10000 141912 ns/op 115.45 MB/s
+Search_Success_CachedPCRE/32K 5000 284777 ns/op 115.07 MB/s
+Search_Success_CachedPCRE/64K 2000 571111 ns/op 114.75 MB/s
+Search_Success_CachedPCRE/128K 1000 1142328 ns/op 114.74 MB/s
+Search_Success_CachedPCRE/256K 500 2289468 ns/op 114.50 MB/s
+Search_Success_CachedPCRE/512K 500 4566850 ns/op 114.80 MB/s
+Search_Success_CachedPCRE/1M 200 9379830 ns/op 111.79 MB/s
+Search_Success_CachedPCRE/2M 100 19115070 ns/op 109.71 MB/s
+Search_Success_CachedPCRE/4M 50 39568300 ns/op 106.00 MB/s
+Search_Success_CachedPCRE/8M 10 100039600 ns/op 83.85 MB/s
+Search_Success_CachedPCRE/16M 5 249181800 ns/op 67.33 MB/s
+Search_Success_CachedRE2/8 5000000 201 ns/op 39.73 MB/s
+Search_Success_CachedRE2/16 5000000 395 ns/op 40.43 MB/s
+Search_Success_CachedRE2/32 2000000 507 ns/op 63.06 MB/s
+Search_Success_CachedRE2/64 2000000 723 ns/op 88.42 MB/s
+Search_Success_CachedRE2/128 1000000 1157 ns/op 110.59 MB/s
+Search_Success_CachedRE2/256 500000 2032 ns/op 125.94 MB/s
+Search_Success_CachedRE2/512 500000 3778 ns/op 135.49 MB/s
+Search_Success_CachedRE2/1K 200000 7275 ns/op 140.75 MB/s
+Search_Success_CachedRE2/2K 100000 14222 ns/op 144.00 MB/s
+Search_Success_CachedRE2/4K 50000 28255 ns/op 144.96 MB/s
+Search_Success_CachedRE2/8K 20000 56056 ns/op 146.14 MB/s
+Search_Success_CachedRE2/16K 10000 112188 ns/op 146.04 MB/s
+Search_Success_CachedRE2/32K 5000 223466 ns/op 146.64 MB/s
+Search_Success_CachedRE2/64K 5000 448677 ns/op 146.06 MB/s
+Search_Success_CachedRE2/128K 2000 901883 ns/op 145.33 MB/s
+Search_Success_CachedRE2/256K 1000 1810495 ns/op 144.79 MB/s
+Search_Success_CachedRE2/512K 500 3631582 ns/op 144.37 MB/s
+Search_Success_CachedRE2/1M 200 7434340 ns/op 141.04 MB/s
+Search_Success_CachedRE2/2M 100 15224310 ns/op 137.75 MB/s
+Search_Success_CachedRE2/4M 50 31757460 ns/op 132.07 MB/s
+Search_Success_CachedRE2/8M 20 70959200 ns/op 118.22 MB/s
+Search_Success_CachedRE2/16M 5 215988600 ns/op 77.68 MB/s
+Search_Success1_PCRE/8 500000 3292 ns/op 2.43 MB/s
+Search_Success1_PCRE/16 500000 3360 ns/op 4.76 MB/s
+Search_Success1_PCRE/32 500000 3476 ns/op 9.21 MB/s
+Search_Success1_PCRE/64 500000 3775 ns/op 16.95 MB/s
+Search_Success1_PCRE/128 500000 4303 ns/op 29.75 MB/s
+Search_Success1_PCRE/256 200000 5430 ns/op 47.14 MB/s
+Search_Success1_PCRE/512 200000 7664 ns/op 66.80 MB/s
+Search_Success1_PCRE/1K 100000 12062 ns/op 84.89 MB/s
+Search_Success1_PCRE/2K 50000 20956 ns/op 97.73 MB/s
+Search_Success1_PCRE/4K 50000 38521 ns/op 106.33 MB/s
+Search_Success1_PCRE/8K 20000 73852 ns/op 110.92 MB/s
+Search_Success1_PCRE/16K 10000 144900 ns/op 113.07 MB/s
+Search_Success1_PCRE/32K 5000 286158 ns/op 114.51 MB/s
+Search_Success1_PCRE/64K 2000 569992 ns/op 114.98 MB/s
+Search_Success1_PCRE/128K 1000 1144770 ns/op 114.50 MB/s
+Search_Success1_PCRE/256K 500 2292086 ns/op 114.37 MB/s
+Search_Success1_PCRE/512K 500 4578494 ns/op 114.51 MB/s
+Search_Success1_PCRE/1M 200 9410760 ns/op 111.42 MB/s
+Search_Success1_PCRE/2M 100 19166460 ns/op 109.42 MB/s
+Search_Success1_PCRE/4M 50 39599000 ns/op 105.92 MB/s
+Search_Success1_PCRE/8M 10 100725900 ns/op 83.28 MB/s
+Search_Success1_PCRE/16M 5 249356000 ns/op 67.28 MB/s
+Search_Success1_RE2/8 50000 33188 ns/op 0.24 MB/s
+Search_Success1_RE2/16 50000 33012 ns/op 0.48 MB/s
+Search_Success1_RE2/32 50000 32845 ns/op 0.97 MB/s
+Search_Success1_RE2/64 50000 33133 ns/op 1.93 MB/s
+Search_Success1_RE2/128 50000 33536 ns/op 3.82 MB/s
+Search_Success1_RE2/256 50000 34548 ns/op 7.41 MB/s
+Search_Success1_RE2/512 50000 36303 ns/op 14.10 MB/s
+Search_Success1_RE2/1K 50000 39676 ns/op 25.81 MB/s
+Search_Success1_RE2/2K 50000 46563 ns/op 43.98 MB/s
+Search_Success1_RE2/4K 20000 60801 ns/op 67.37 MB/s
+Search_Success1_RE2/8K 20000 88743 ns/op 92.31 MB/s
+Search_Success1_RE2/16K 10000 145159 ns/op 112.87 MB/s
+Search_Success1_RE2/32K 5000 257245 ns/op 127.38 MB/s
+Search_Success1_RE2/64K 5000 482971 ns/op 135.69 MB/s
+Search_Success1_RE2/128K 2000 935136 ns/op 140.16 MB/s
+Search_Success1_RE2/256K 1000 1844695 ns/op 142.11 MB/s
+Search_Success1_RE2/512K 500 3676360 ns/op 142.61 MB/s
+Search_Success1_RE2/1M 200 7511915 ns/op 139.59 MB/s
+Search_Success1_RE2/2M 100 15301160 ns/op 137.06 MB/s
+Search_Success1_RE2/4M 50 31848480 ns/op 131.70 MB/s
+Search_Success1_RE2/8M 20 71078250 ns/op 118.02 MB/s
+Search_Success1_RE2/16M 5 215988000 ns/op 77.68 MB/s
+Search_Success1_Cached_PCRE/8 5000000 442 ns/op 18.08 MB/s
+Search_Success1_Cached_PCRE/16 2000000 511 ns/op 31.31 MB/s
+Search_Success1_Cached_PCRE/32 2000000 649 ns/op 49.30 MB/s
+Search_Success1_Cached_PCRE/64 2000000 926 ns/op 69.09 MB/s
+Search_Success1_Cached_PCRE/128 1000000 1476 ns/op 86.70 MB/s
+Search_Success1_Cached_PCRE/256 500000 2584 ns/op 99.04 MB/s
+Search_Success1_Cached_PCRE/512 500000 4787 ns/op 106.93 MB/s
+Search_Success1_Cached_PCRE/1K 200000 9217 ns/op 111.10 MB/s
+Search_Success1_Cached_PCRE/2K 100000 18078 ns/op 113.28 MB/s
+Search_Success1_Cached_PCRE/4K 50000 35681 ns/op 114.79 MB/s
+Search_Success1_Cached_PCRE/8K 20000 71032 ns/op 115.33 MB/s
+Search_Success1_Cached_PCRE/16K 10000 142121 ns/op 115.28 MB/s
+Search_Success1_Cached_PCRE/32K 5000 283243 ns/op 115.69 MB/s
+Search_Success1_Cached_PCRE/64K 2000 566937 ns/op 115.60 MB/s
+Search_Success1_Cached_PCRE/128K 1000 1141044 ns/op 114.87 MB/s
+Search_Success1_Cached_PCRE/256K 500 2283570 ns/op 114.80 MB/s
+Search_Success1_Cached_PCRE/512K 500 4573362 ns/op 114.64 MB/s
+Search_Success1_Cached_PCRE/1M 200 9377975 ns/op 111.81 MB/s
+Search_Success1_Cached_PCRE/2M 100 19150760 ns/op 109.51 MB/s
+Search_Success1_Cached_PCRE/4M 50 39578540 ns/op 105.97 MB/s
+Search_Success1_Cached_PCRE/8M 10 102111900 ns/op 82.15 MB/s
+Search_Success1_Cached_PCRE/16M 5 247123000 ns/op 67.89 MB/s
+Search_Success1_Cached_RE2/8 5000000 348 ns/op 22.94 MB/s
+Search_Success1_Cached_RE2/16 5000000 396 ns/op 40.35 MB/s
+Search_Success1_Cached_RE2/32 5000000 492 ns/op 65.01 MB/s
+Search_Success1_Cached_RE2/64 2000000 716 ns/op 89.38 MB/s
+Search_Success1_Cached_RE2/128 1000000 1131 ns/op 113.09 MB/s
+Search_Success1_Cached_RE2/256 1000000 1961 ns/op 130.49 MB/s
+Search_Success1_Cached_RE2/512 500000 3626 ns/op 141.17 MB/s
+Search_Success1_Cached_RE2/1K 200000 6941 ns/op 147.51 MB/s
+Search_Success1_Cached_RE2/2K 100000 13591 ns/op 150.69 MB/s
+Search_Success1_Cached_RE2/4K 50000 26867 ns/op 152.45 MB/s
+Search_Success1_Cached_RE2/8K 20000 53455 ns/op 153.25 MB/s
+Search_Success1_Cached_RE2/16K 10000 106632 ns/op 153.65 MB/s
+Search_Success1_Cached_RE2/32K 5000 213141 ns/op 153.74 MB/s
+Search_Success1_Cached_RE2/64K 5000 426628 ns/op 153.61 MB/s
+Search_Success1_Cached_RE2/128K 2000 861903 ns/op 152.07 MB/s
+Search_Success1_Cached_RE2/256K 1000 1729300 ns/op 151.59 MB/s
+Search_Success1_Cached_RE2/512K 500 3470894 ns/op 151.05 MB/s
+Search_Success1_Cached_RE2/1M 200 7120350 ns/op 147.26 MB/s
+Search_Success1_Cached_RE2/2M 100 14538650 ns/op 144.25 MB/s
+Search_Success1_Cached_RE2/4M 50 30323940 ns/op 138.32 MB/s
+Search_Success1_Cached_RE2/8M 20 68069300 ns/op 123.24 MB/s
+Search_Success1_Cached_RE2/16M 5 211011000 ns/op 79.51 MB/s
+Search_Digits_PCRE 200000 7008 ns/op
+Search_Digits_RE2 50000 27251 ns/op
+Parse_Digits_PCRE 200000 6887 ns/op
+Parse_Digits_RE2 100000 13239 ns/op
+Parse_CachedDigits_PCRE 2000000 776 ns/op
+Parse_CachedDigits_RE2 5000000 451 ns/op
+Parse_DigitDs_PCRE 200000 6558 ns/op
+Parse_DigitDs_RE2 100000 12946 ns/op
+Parse_CachedDigitDs_PCRE 2000000 766 ns/op
+Parse_CachedDigitDs_RE2 5000000 445 ns/op
+Parse_Split_PCRE 500000 4751 ns/op
+Parse_Split_RE2 100000 14060 ns/op
+Parse_CachedSplit_PCRE 2000000 568 ns/op
+Parse_CachedSplit_RE2 5000000 275 ns/op
+Parse_SplitHard_PCRE 500000 4650 ns/op
+Parse_SplitHard_RE2 100000 17606 ns/op
+Parse_CachedSplitHard_PCRE 2000000 554 ns/op
+Parse_CachedSplitHard_RE2 500000 2987 ns/op
+Parse_CachedSplitBig1_PCRE 200 8376500 ns/op
+Parse_CachedSplitBig1_RE2 1000 1342272 ns/op
+Parse_CachedSplitBig2_PCRE 2000 848859 ns/op
+Parse_CachedSplitBig2_RE2 10 781553500 ns/op
+BM_PCRE_Compile 200000 5582 ns/op
+BM_RE2_Compile 100000 13961 ns/op
+SearchPhone_CachedPCRE/8 500000 2107 ns/op 3.80 MB/s
+SearchPhone_CachedPCRE/16 500000 3526 ns/op 4.54 MB/s
+SearchPhone_CachedPCRE/32 200000 6320 ns/op 5.06 MB/s
+SearchPhone_CachedPCRE/64 100000 11953 ns/op 5.35 MB/s
+SearchPhone_CachedPCRE/128 50000 23357 ns/op 5.48 MB/s
+SearchPhone_CachedPCRE/256 50000 45919 ns/op 5.57 MB/s
+SearchPhone_CachedPCRE/512 20000 90828 ns/op 5.64 MB/s
+SearchPhone_CachedPCRE/1K 10000 181299 ns/op 5.65 MB/s
+SearchPhone_CachedPCRE/2K 5000 358095 ns/op 5.72 MB/s
+SearchPhone_CachedPCRE/4K 2000 709670 ns/op 5.77 MB/s
+SearchPhone_CachedPCRE/8K 1000 1412480 ns/op 5.80 MB/s
+SearchPhone_CachedPCRE/16K 500 2826286 ns/op 5.80 MB/s
+SearchPhone_CachedPCRE/32K 200 5643125 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/64K 100 11303300 ns/op 5.80 MB/s
+SearchPhone_CachedPCRE/128K 50 22564640 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/256K 50 45145780 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/512K 20 90272200 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/1M 10 180937900 ns/op 5.80 MB/s
+SearchPhone_CachedPCRE/2M 5 362303400 ns/op 5.79 MB/s
+SearchPhone_CachedPCRE/4M 2 725048500 ns/op 5.78 MB/s
+SearchPhone_CachedPCRE/8M 1 1449458000 ns/op 5.79 MB/s
+SearchPhone_CachedPCRE/16M 1 2898562000 ns/op 5.79 MB/s
+SearchPhone_CachedRE2/8 1000000 1038 ns/op 7.70 MB/s
+SearchPhone_CachedRE2/16 1000000 1106 ns/op 14.46 MB/s
+SearchPhone_CachedRE2/32 1000000 1210 ns/op 26.44 MB/s
+SearchPhone_CachedRE2/64 1000000 1429 ns/op 44.78 MB/s
+SearchPhone_CachedRE2/128 1000000 1864 ns/op 68.64 MB/s
+SearchPhone_CachedRE2/256 500000 2741 ns/op 93.38 MB/s
+SearchPhone_CachedRE2/512 500000 4483 ns/op 114.18 MB/s
+SearchPhone_CachedRE2/1K 200000 7984 ns/op 128.24 MB/s
+SearchPhone_CachedRE2/2K 100000 14957 ns/op 136.92 MB/s
+SearchPhone_CachedRE2/4K 50000 28994 ns/op 141.27 MB/s
+SearchPhone_CachedRE2/8K 20000 56950 ns/op 143.85 MB/s
+SearchPhone_CachedRE2/16K 10000 112907 ns/op 145.11 MB/s
+SearchPhone_CachedRE2/32K 5000 224855 ns/op 145.73 MB/s
+SearchPhone_CachedRE2/64K 5000 449976 ns/op 145.64 MB/s
+SearchPhone_CachedRE2/128K 2000 899644 ns/op 145.69 MB/s
+SearchPhone_CachedRE2/256K 1000 1798122 ns/op 145.79 MB/s
+SearchPhone_CachedRE2/512K 500 3597034 ns/op 145.76 MB/s
+SearchPhone_CachedRE2/1M 200 7261140 ns/op 144.41 MB/s
+SearchPhone_CachedRE2/2M 100 14532060 ns/op 144.31 MB/s
+SearchPhone_CachedRE2/4M 50 29033780 ns/op 144.46 MB/s
+SearchPhone_CachedRE2/8M 20 57850800 ns/op 145.00 MB/s
+SearchPhone_CachedRE2/16M 10 115699800 ns/op 145.01 MB/s
+EmptyPartialMatchPCRE 10000000 192 ns/op
+EmptyPartialMatchRE2 5000000 273 ns/op
+SimplePartialMatchPCRE 5000000 263 ns/op
+SimplePartialMatchRE2 5000000 347 ns/op
+HTTPPartialMatchPCRE 2000000 885 ns/op
+HTTPPartialMatchRE2 1000000 1099 ns/op
+SmallHTTPPartialMatchPCRE 2000000 890 ns/op
+SmallHTTPPartialMatchRE2 1000000 1097 ns/op
+DotMatchPCRE 2000000 860 ns/op
+DotMatchRE2 1000000 1175 ns/op
+ASCIIMatchPCRE 2000000 767 ns/op
+ASCIIMatchRE2 1000000 1174 ns/op
+==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 15:25:04 PST 2010
+# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux
+# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# d7671f473f1a+ tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 187 ns/op 42.72 MB/s
+Search_Easy0_CachedPCRE/16 5000000 225 ns/op 71.05 MB/s
+Search_Easy0_CachedPCRE/32 5000000 321 ns/op 99.66 MB/s
+Search_Easy0_CachedPCRE/64 5000000 472 ns/op 135.54 MB/s
+Search_Easy0_CachedPCRE/128 2000000 768 ns/op 166.47 MB/s
+Search_Easy0_CachedPCRE/256 500000 2071 ns/op 123.57 MB/s
+Search_Easy0_CachedPCRE/512 500000 3601 ns/op 142.17 MB/s
+Search_Easy0_CachedPCRE/1K 200000 7120 ns/op 143.81 MB/s
+Search_Easy0_CachedPCRE/2K 100000 12071 ns/op 169.66 MB/s
+Search_Easy0_CachedPCRE/4K 50000 24017 ns/op 170.54 MB/s
+Search_Easy0_CachedPCRE/8K 50000 49303 ns/op 166.15 MB/s
+Search_Easy0_CachedPCRE/16K 20000 96809 ns/op 169.24 MB/s
+Search_Easy0_CachedPCRE/32K 10000 194402 ns/op 168.56 MB/s
+Search_Easy0_CachedPCRE/64K 5000 387333 ns/op 169.20 MB/s
+Search_Easy0_CachedPCRE/128K 2000 785405 ns/op 166.88 MB/s
+Search_Easy0_CachedPCRE/256K 1000 1553127 ns/op 168.78 MB/s
+Search_Easy0_CachedPCRE/512K 500 3111082 ns/op 168.52 MB/s
+Search_Easy0_CachedPCRE/1M 200 6329655 ns/op 165.66 MB/s
+Search_Easy0_CachedPCRE/2M 100 12689760 ns/op 165.26 MB/s
+Search_Easy0_CachedPCRE/4M 50 25449660 ns/op 164.81 MB/s
+Search_Easy0_CachedPCRE/8M 20 50878350 ns/op 164.88 MB/s
+Search_Easy0_CachedPCRE/16M 10 101730500 ns/op 164.92 MB/s
+Search_Easy0_CachedRE2/8 5000000 293 ns/op 27.22 MB/s
+Search_Easy0_CachedRE2/16 5000000 296 ns/op 53.97 MB/s
+Search_Easy0_CachedRE2/32 5000000 332 ns/op 96.20 MB/s
+Search_Easy0_CachedRE2/64 5000000 344 ns/op 185.55 MB/s
+Search_Easy0_CachedRE2/128 5000000 419 ns/op 304.99 MB/s
+Search_Easy0_CachedRE2/256 5000000 498 ns/op 513.30 MB/s
+Search_Easy0_CachedRE2/512 2000000 693 ns/op 738.23 MB/s
+Search_Easy0_CachedRE2/1K 1000000 1178 ns/op 869.24 MB/s
+Search_Easy0_CachedRE2/2K 500000 2137 ns/op 958.09 MB/s
+Search_Easy0_CachedRE2/4K 500000 3802 ns/op 1077.05 MB/s
+Search_Easy0_CachedRE2/8K 200000 7338 ns/op 1116.29 MB/s
+Search_Easy0_CachedRE2/16K 100000 14223 ns/op 1151.89 MB/s
+Search_Easy0_CachedRE2/32K 50000 29212 ns/op 1121.69 MB/s
+Search_Easy0_CachedRE2/64K 20000 58571 ns/op 1118.91 MB/s
+Search_Easy0_CachedRE2/128K 10000 127652 ns/op 1026.79 MB/s
+Search_Easy0_CachedRE2/256K 5000 254173 ns/op 1031.36 MB/s
+Search_Easy0_CachedRE2/512K 2000 512878 ns/op 1022.25 MB/s
+Search_Easy0_CachedRE2/1M 1000 1025916 ns/op 1022.09 MB/s
+Search_Easy0_CachedRE2/2M 500 2025662 ns/op 1035.29 MB/s
+Search_Easy0_CachedRE2/4M 500 4102988 ns/op 1022.26 MB/s
+Search_Easy0_CachedRE2/8M 200 8215045 ns/op 1021.13 MB/s
+Search_Easy0_CachedRE2/16M 100 16434700 ns/op 1020.84 MB/s
+Search_Easy1_CachedPCRE/8 10000000 194 ns/op 41.08 MB/s
+Search_Easy1_CachedPCRE/16 5000000 234 ns/op 68.20 MB/s
+Search_Easy1_CachedPCRE/32 5000000 327 ns/op 97.57 MB/s
+Search_Easy1_CachedPCRE/64 5000000 474 ns/op 134.94 MB/s
+Search_Easy1_CachedPCRE/128 1000000 1189 ns/op 107.57 MB/s
+Search_Easy1_CachedPCRE/256 1000000 1782 ns/op 143.66 MB/s
+Search_Easy1_CachedPCRE/512 500000 3957 ns/op 129.38 MB/s
+Search_Easy1_CachedPCRE/1K 200000 7231 ns/op 141.60 MB/s
+Search_Easy1_CachedPCRE/2K 100000 13633 ns/op 150.22 MB/s
+Search_Easy1_CachedPCRE/4K 50000 25380 ns/op 161.39 MB/s
+Search_Easy1_CachedPCRE/8K 20000 50466 ns/op 162.32 MB/s
+Search_Easy1_CachedPCRE/16K 20000 98483 ns/op 166.36 MB/s
+Search_Easy1_CachedPCRE/32K 10000 197239 ns/op 166.13 MB/s
+Search_Easy1_CachedPCRE/64K 5000 393513 ns/op 166.54 MB/s
+Search_Easy1_CachedPCRE/128K 2000 792420 ns/op 165.41 MB/s
+Search_Easy1_CachedPCRE/256K 1000 1577956 ns/op 166.13 MB/s
+Search_Easy1_CachedPCRE/512K 500 3162854 ns/op 165.76 MB/s
+Search_Easy1_CachedPCRE/1M 200 6433560 ns/op 162.99 MB/s
+Search_Easy1_CachedPCRE/2M 100 12888530 ns/op 162.71 MB/s
+Search_Easy1_CachedPCRE/4M 50 25851040 ns/op 162.25 MB/s
+Search_Easy1_CachedPCRE/8M 20 51705700 ns/op 162.24 MB/s
+Search_Easy1_CachedPCRE/16M 10 103423200 ns/op 162.22 MB/s
+Search_Easy1_CachedRE2/8 5000000 292 ns/op 27.34 MB/s
+Search_Easy1_CachedRE2/16 5000000 293 ns/op 54.49 MB/s
+Search_Easy1_CachedRE2/32 5000000 330 ns/op 96.86 MB/s
+Search_Easy1_CachedRE2/64 5000000 343 ns/op 186.51 MB/s
+Search_Easy1_CachedRE2/128 5000000 421 ns/op 304.03 MB/s
+Search_Easy1_CachedRE2/256 5000000 499 ns/op 512.53 MB/s
+Search_Easy1_CachedRE2/512 2000000 697 ns/op 734.27 MB/s
+Search_Easy1_CachedRE2/1K 1000000 1180 ns/op 867.12 MB/s
+Search_Easy1_CachedRE2/2K 500000 2136 ns/op 958.55 MB/s
+Search_Easy1_CachedRE2/4K 500000 3808 ns/op 1075.53 MB/s
+Search_Easy1_CachedRE2/8K 200000 7335 ns/op 1116.83 MB/s
+Search_Easy1_CachedRE2/16K 100000 14184 ns/op 1155.10 MB/s
+Search_Easy1_CachedRE2/32K 50000 29181 ns/op 1122.91 MB/s
+Search_Easy1_CachedRE2/64K 20000 58567 ns/op 1118.98 MB/s
+Search_Easy1_CachedRE2/128K 10000 127629 ns/op 1026.98 MB/s
+Search_Easy1_CachedRE2/256K 5000 254045 ns/op 1031.88 MB/s
+Search_Easy1_CachedRE2/512K 5000 494356 ns/op 1060.55 MB/s
+Search_Easy1_CachedRE2/1M 1000 1027490 ns/op 1020.52 MB/s
+Search_Easy1_CachedRE2/2M 500 2033222 ns/op 1031.44 MB/s
+Search_Easy1_CachedRE2/4M 500 4106182 ns/op 1021.46 MB/s
+Search_Easy1_CachedRE2/8M 200 8215690 ns/op 1021.05 MB/s
+Search_Easy1_CachedRE2/16M 100 16420070 ns/op 1021.75 MB/s
+Search_Medium_CachedPCRE/8 10000000 200 ns/op 39.93 MB/s
+Search_Medium_CachedPCRE/16 5000000 242 ns/op 66.08 MB/s
+Search_Medium_CachedPCRE/32 5000000 315 ns/op 101.47 MB/s
+Search_Medium_CachedPCRE/64 5000000 461 ns/op 138.71 MB/s
+Search_Medium_CachedPCRE/128 2000000 753 ns/op 169.80 MB/s
+Search_Medium_CachedPCRE/256 100000 16809 ns/op 15.23 MB/s
+Search_Medium_CachedPCRE/512 50000 39860 ns/op 12.84 MB/s
+Search_Medium_CachedPCRE/1K 20000 78547 ns/op 13.04 MB/s
+Search_Medium_CachedPCRE/2K 10000 117089 ns/op 17.49 MB/s
+Search_Medium_CachedPCRE/4K 5000 289169 ns/op 14.16 MB/s
+Search_Medium_CachedPCRE/8K 2000 625908 ns/op 13.09 MB/s
+Search_Medium_CachedPCRE/16K 1000 1277969 ns/op 12.82 MB/s
+Search_Medium_CachedPCRE/32K 500 2554842 ns/op 12.83 MB/s
+Search_Medium_CachedPCRE/64K 200 5105160 ns/op 12.84 MB/s
+Search_Medium_CachedPCRE/128K 100 10206360 ns/op 12.84 MB/s
+Search_Medium_CachedPCRE/256K 50 20440340 ns/op 12.82 MB/s
+Search_Medium_CachedRE2/8 5000000 334 ns/op 23.89 MB/s
+Search_Medium_CachedRE2/16 5000000 388 ns/op 41.15 MB/s
+Search_Medium_CachedRE2/32 5000000 496 ns/op 64.50 MB/s
+Search_Medium_CachedRE2/64 2000000 717 ns/op 89.22 MB/s
+Search_Medium_CachedRE2/128 1000000 1157 ns/op 110.60 MB/s
+Search_Medium_CachedRE2/256 500000 2037 ns/op 125.64 MB/s
+Search_Medium_CachedRE2/512 500000 3792 ns/op 135.01 MB/s
+Search_Medium_CachedRE2/1K 200000 7288 ns/op 140.50 MB/s
+Search_Medium_CachedRE2/2K 100000 14294 ns/op 143.27 MB/s
+Search_Medium_CachedRE2/4K 50000 28286 ns/op 144.81 MB/s
+Search_Medium_CachedRE2/8K 20000 56393 ns/op 145.27 MB/s
+Search_Medium_CachedRE2/16K 10000 112792 ns/op 145.26 MB/s
+Search_Medium_CachedRE2/32K 5000 231024 ns/op 141.84 MB/s
+Search_Medium_CachedRE2/64K 5000 450957 ns/op 145.33 MB/s
+Search_Medium_CachedRE2/128K 2000 906402 ns/op 144.61 MB/s
+Search_Medium_CachedRE2/256K 1000 1813827 ns/op 144.53 MB/s
+Search_Medium_CachedRE2/512K 500 3619796 ns/op 144.84 MB/s
+Search_Medium_CachedRE2/1M 200 7317695 ns/op 143.29 MB/s
+Search_Medium_CachedRE2/2M 100 14642030 ns/op 143.23 MB/s
+Search_Medium_CachedRE2/4M 50 29237140 ns/op 143.46 MB/s
+Search_Medium_CachedRE2/8M 20 58367050 ns/op 143.72 MB/s
+Search_Medium_CachedRE2/16M 10 116398000 ns/op 144.14 MB/s
+Search_Hard_CachedPCRE/8 10000000 199 ns/op 40.08 MB/s
+Search_Hard_CachedPCRE/16 5000000 241 ns/op 66.13 MB/s
+Search_Hard_CachedPCRE/32 5000000 316 ns/op 101.13 MB/s
+Search_Hard_CachedPCRE/64 5000000 460 ns/op 138.85 MB/s
+Search_Hard_CachedPCRE/128 2000000 753 ns/op 169.98 MB/s
+Search_Hard_CachedPCRE/256 1000 1038013 ns/op 0.25 MB/s
+Search_Hard_CachedPCRE/512 500 4263992 ns/op 0.12 MB/s
+Search_Hard_CachedPCRE/1K 100 16899150 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/2K 20 61792450 ns/op 0.03 MB/s
+Search_Hard_CachedPCRE/4K 5 266424400 ns/op 0.02 MB/s
+Search_Hard_CachedRE2/8 5000000 331 ns/op 24.16 MB/s
+Search_Hard_CachedRE2/16 5000000 386 ns/op 41.36 MB/s
+Search_Hard_CachedRE2/32 5000000 492 ns/op 64.95 MB/s
+Search_Hard_CachedRE2/64 2000000 713 ns/op 89.67 MB/s
+Search_Hard_CachedRE2/128 1000000 1151 ns/op 111.15 MB/s
+Search_Hard_CachedRE2/256 500000 2025 ns/op 126.36 MB/s
+Search_Hard_CachedRE2/512 500000 3774 ns/op 135.64 MB/s
+Search_Hard_CachedRE2/1K 200000 7271 ns/op 140.82 MB/s
+Search_Hard_CachedRE2/2K 100000 14274 ns/op 143.48 MB/s
+Search_Hard_CachedRE2/4K 50000 28261 ns/op 144.93 MB/s
+Search_Hard_CachedRE2/8K 20000 56253 ns/op 145.63 MB/s
+Search_Hard_CachedRE2/16K 10000 112279 ns/op 145.92 MB/s
+Search_Hard_CachedRE2/32K 5000 224208 ns/op 146.15 MB/s
+Search_Hard_CachedRE2/64K 5000 448835 ns/op 146.01 MB/s
+Search_Hard_CachedRE2/128K 2000 906965 ns/op 144.52 MB/s
+Search_Hard_CachedRE2/256K 1000 1821843 ns/op 143.89 MB/s
+Search_Hard_CachedRE2/512K 500 3616856 ns/op 144.96 MB/s
+Search_Hard_CachedRE2/1M 200 7319770 ns/op 143.25 MB/s
+Search_Hard_CachedRE2/2M 100 14614680 ns/op 143.50 MB/s
+Search_Hard_CachedRE2/4M 50 29189100 ns/op 143.69 MB/s
+Search_Hard_CachedRE2/8M 20 58239300 ns/op 144.04 MB/s
+Search_Hard_CachedRE2/16M 10 116307800 ns/op 144.25 MB/s
+Search_Parens_CachedPCRE/8 5000000 307 ns/op 26.03 MB/s
+Search_Parens_CachedRE2/8 5000000 333 ns/op 24.01 MB/s
+Search_Parens_CachedRE2/16 5000000 383 ns/op 41.71 MB/s
+Search_Parens_CachedRE2/32 5000000 496 ns/op 64.49 MB/s
+Search_Parens_CachedRE2/64 2000000 696 ns/op 91.88 MB/s
+Search_Parens_CachedRE2/128 1000000 1113 ns/op 114.97 MB/s
+Search_Parens_CachedRE2/256 1000000 2025 ns/op 126.38 MB/s
+Search_Parens_CachedRE2/512 500000 3776 ns/op 135.58 MB/s
+Search_Parens_CachedRE2/1K 200000 7292 ns/op 140.41 MB/s
+Search_Parens_CachedRE2/2K 100000 14272 ns/op 143.49 MB/s
+Search_Parens_CachedRE2/4K 50000 28252 ns/op 144.98 MB/s
+Search_Parens_CachedRE2/8K 20000 56203 ns/op 145.76 MB/s
+Search_Parens_CachedRE2/16K 10000 112272 ns/op 145.93 MB/s
+Search_Parens_CachedRE2/32K 5000 224140 ns/op 146.19 MB/s
+Search_Parens_CachedRE2/64K 5000 448597 ns/op 146.09 MB/s
+Search_Parens_CachedRE2/128K 2000 903965 ns/op 145.00 MB/s
+Search_Parens_CachedRE2/256K 1000 1806597 ns/op 145.10 MB/s
+Search_Parens_CachedRE2/512K 500 3614264 ns/op 145.06 MB/s
+Search_Parens_CachedRE2/1M 200 7293425 ns/op 143.77 MB/s
+Search_Parens_CachedRE2/2M 100 14617970 ns/op 143.46 MB/s
+Search_Parens_CachedRE2/4M 50 29199860 ns/op 143.64 MB/s
+Search_Parens_CachedRE2/8M 20 58260650 ns/op 143.98 MB/s
+Search_Parens_CachedRE2/16M 10 116198600 ns/op 144.38 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 387 ns/op 20.67 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 475 ns/op 33.67 MB/s
+Search_BigFixed_CachedPCRE/32 2000000 637 ns/op 50.21 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 965 ns/op 66.28 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1618 ns/op 79.09 MB/s
+Search_BigFixed_CachedPCRE/256 500000 2936 ns/op 87.18 MB/s
+Search_BigFixed_CachedPCRE/512 200000 5558 ns/op 92.12 MB/s
+Search_BigFixed_CachedPCRE/1K 100000 10840 ns/op 94.46 MB/s
+Search_BigFixed_CachedPCRE/2K 50000 21291 ns/op 96.19 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 42377 ns/op 96.66 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 84365 ns/op 97.10 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 168595 ns/op 97.18 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 337102 ns/op 97.20 MB/s
+Search_BigFixed_CachedRE2/8 10000000 174 ns/op 45.85 MB/s
+Search_BigFixed_CachedRE2/16 5000000 360 ns/op 44.33 MB/s
+Search_BigFixed_CachedRE2/32 5000000 432 ns/op 74.06 MB/s
+Search_BigFixed_CachedRE2/64 2000000 556 ns/op 115.06 MB/s
+Search_BigFixed_CachedRE2/128 2000000 803 ns/op 159.24 MB/s
+Search_BigFixed_CachedRE2/256 1000000 1307 ns/op 195.83 MB/s
+Search_BigFixed_CachedRE2/512 500000 2308 ns/op 221.82 MB/s
+Search_BigFixed_CachedRE2/1K 500000 4321 ns/op 236.95 MB/s
+Search_BigFixed_CachedRE2/2K 200000 8334 ns/op 245.73 MB/s
+Search_BigFixed_CachedRE2/4K 100000 16361 ns/op 250.34 MB/s
+Search_BigFixed_CachedRE2/8K 50000 30995 ns/op 264.30 MB/s
+Search_BigFixed_CachedRE2/16K 20000 64632 ns/op 253.49 MB/s
+Search_BigFixed_CachedRE2/32K 10000 128875 ns/op 254.26 MB/s
+Search_BigFixed_CachedRE2/64K 5000 258009 ns/op 254.01 MB/s
+Search_BigFixed_CachedRE2/128K 2000 511023 ns/op 256.49 MB/s
+Search_BigFixed_CachedRE2/256K 1000 1031677 ns/op 254.10 MB/s
+Search_BigFixed_CachedRE2/512K 500 2124050 ns/op 246.83 MB/s
+Search_BigFixed_CachedRE2/1M 500 4316322 ns/op 242.93 MB/s
+Search_Success_PCRE/8 500000 3060 ns/op 2.61 MB/s
+Search_Success_PCRE/16 500000 3122 ns/op 5.12 MB/s
+Search_Success_PCRE/32 500000 3256 ns/op 9.83 MB/s
+Search_Success_PCRE/64 500000 3545 ns/op 18.05 MB/s
+Search_Success_PCRE/128 500000 4098 ns/op 31.23 MB/s
+Search_Success_PCRE/256 200000 5215 ns/op 49.08 MB/s
+Search_Success_PCRE/512 200000 7408 ns/op 69.11 MB/s
+Search_Success_PCRE/1K 100000 11838 ns/op 86.50 MB/s
+Search_Success_PCRE/2K 50000 20731 ns/op 98.79 MB/s
+Search_Success_PCRE/4K 50000 38394 ns/op 106.68 MB/s
+Search_Success_PCRE/8K 20000 73969 ns/op 110.75 MB/s
+Search_Success_PCRE/16K 10000 144799 ns/op 113.15 MB/s
+Search_Success_PCRE/32K 5000 286717 ns/op 114.29 MB/s
+Search_Success_PCRE/64K 2000 571529 ns/op 114.67 MB/s
+Search_Success_PCRE/128K 1000 1144131 ns/op 114.56 MB/s
+Search_Success_PCRE/256K 500 2292450 ns/op 114.35 MB/s
+Search_Success_PCRE/512K 500 4584198 ns/op 114.37 MB/s
+Search_Success_PCRE/1M 200 9385225 ns/op 111.73 MB/s
+Search_Success_PCRE/2M 100 19063720 ns/op 110.01 MB/s
+Search_Success_PCRE/4M 50 39404920 ns/op 106.44 MB/s
+==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 16:16:46 PST 2010
+# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux
+# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# a94585d91e66 tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped
+
+==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 16:29:12 PST 2010
+# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux
+# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4)
+# Copyright (C) 2007 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# a94585d91e66 tip
+# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped
+
+Search_Easy0_CachedPCRE/8 10000000 186 ns/op 42.88 MB/s
+Search_Easy0_CachedPCRE/16 10000000 225 ns/op 70.90 MB/s
+Search_Easy0_CachedPCRE/32 5000000 319 ns/op 100.14 MB/s
+Search_Easy0_CachedPCRE/64 5000000 461 ns/op 138.59 MB/s
+Search_Easy0_CachedPCRE/128 2000000 752 ns/op 170.02 MB/s
+Search_Easy0_CachedPCRE/256 1000000 2054 ns/op 124.63 MB/s
+Search_Easy0_CachedPCRE/512 500000 3573 ns/op 143.26 MB/s
+Search_Easy0_CachedPCRE/1K 500000 7078 ns/op 144.66 MB/s
+Search_Easy0_CachedPCRE/2K 200000 12015 ns/op 170.44 MB/s
+Search_Easy0_CachedPCRE/4K 100000 23912 ns/op 171.29 MB/s
+Search_Easy0_CachedPCRE/8K 50000 49095 ns/op 166.86 MB/s
+Search_Easy0_CachedPCRE/16K 20000 96360 ns/op 170.03 MB/s
+Search_Easy0_CachedPCRE/32K 10000 193533 ns/op 169.31 MB/s
+Search_Easy0_CachedPCRE/64K 5000 385817 ns/op 169.86 MB/s
+Search_Easy0_CachedPCRE/128K 2000 774428 ns/op 169.25 MB/s
+Search_Easy0_CachedPCRE/256K 1000 1548917 ns/op 169.24 MB/s
+Search_Easy0_CachedPCRE/512K 500 3100914 ns/op 169.08 MB/s
+Search_Easy0_CachedPCRE/1M 500 6304122 ns/op 166.33 MB/s
+Search_Easy0_CachedPCRE/2M 100 12604920 ns/op 166.38 MB/s
+Search_Easy0_CachedPCRE/4M 100 25405120 ns/op 165.10 MB/s
+Search_Easy0_CachedPCRE/8M 50 50940620 ns/op 164.67 MB/s
+Search_Easy0_CachedPCRE/16M 20 102598300 ns/op 163.52 MB/s
+Search_Easy0_CachedRE2/8 10000000 302 ns/op 26.41 MB/s
+Search_Easy0_CachedRE2/16 5000000 314 ns/op 50.87 MB/s
+Search_Easy0_CachedRE2/32 5000000 349 ns/op 91.67 MB/s
+Search_Easy0_CachedRE2/64 5000000 349 ns/op 183.19 MB/s
+Search_Easy0_CachedRE2/128 5000000 415 ns/op 308.21 MB/s
+Search_Easy0_CachedRE2/256 5000000 486 ns/op 526.44 MB/s
+Search_Easy0_CachedRE2/512 5000000 644 ns/op 794.39 MB/s
+Search_Easy0_CachedRE2/1K 1000000 1143 ns/op 895.18 MB/s
+Search_Easy0_CachedRE2/2K 1000000 2099 ns/op 975.57 MB/s
+Search_Easy0_CachedRE2/4K 500000 3655 ns/op 1120.64 MB/s
+Search_Easy0_CachedRE2/8K 500000 7055 ns/op 1161.07 MB/s
+Search_Easy0_CachedRE2/16K 200000 13913 ns/op 1177.56 MB/s
+Search_Easy0_CachedRE2/32K 100000 28452 ns/op 1151.67 MB/s
+Search_Easy0_CachedRE2/64K 50000 56987 ns/op 1150.02 MB/s
+Search_Easy0_CachedRE2/128K 10000 119200 ns/op 1099.60 MB/s
+Search_Easy0_CachedRE2/256K 10000 237730 ns/op 1102.70 MB/s
+Search_Easy0_CachedRE2/512K 5000 468114 ns/op 1120.00 MB/s
+Search_Easy0_CachedRE2/1M 2000 1021877 ns/op 1026.13 MB/s
+Search_Easy0_CachedRE2/2M 1000 2011649 ns/op 1042.50 MB/s
+Search_Easy0_CachedRE2/4M 500 4080682 ns/op 1027.84 MB/s
+Search_Easy0_CachedRE2/8M 200 8181740 ns/op 1025.28 MB/s
+Search_Easy0_CachedRE2/16M 100 16334010 ns/op 1027.13 MB/s
+Search_Easy1_CachedPCRE/8 10000000 192 ns/op 41.54 MB/s
+Search_Easy1_CachedPCRE/16 10000000 235 ns/op 67.96 MB/s
+Search_Easy1_CachedPCRE/32 5000000 322 ns/op 99.33 MB/s
+Search_Easy1_CachedPCRE/64 5000000 464 ns/op 137.87 MB/s
+Search_Easy1_CachedPCRE/128 1000000 1181 ns/op 108.34 MB/s
+Search_Easy1_CachedPCRE/256 1000000 1777 ns/op 144.06 MB/s
+Search_Easy1_CachedPCRE/512 500000 3954 ns/op 129.48 MB/s
+Search_Easy1_CachedPCRE/1K 500000 7239 ns/op 141.44 MB/s
+Search_Easy1_CachedPCRE/2K 200000 13617 ns/op 150.39 MB/s
+Search_Easy1_CachedPCRE/4K 100000 25383 ns/op 161.37 MB/s
+Search_Easy1_CachedPCRE/8K 50000 50456 ns/op 162.36 MB/s
+Search_Easy1_CachedPCRE/16K 20000 98470 ns/op 166.38 MB/s
+Search_Easy1_CachedPCRE/32K 10000 197261 ns/op 166.11 MB/s
+Search_Easy1_CachedPCRE/64K 5000 393359 ns/op 166.61 MB/s
+Search_Easy1_CachedPCRE/128K 2000 791563 ns/op 165.59 MB/s
+Search_Easy1_CachedPCRE/256K 1000 1584273 ns/op 165.47 MB/s
+Search_Easy1_CachedPCRE/512K 500 3164934 ns/op 165.66 MB/s
+Search_Easy1_CachedPCRE/1M 500 6457384 ns/op 162.38 MB/s
+Search_Easy1_CachedPCRE/2M 100 13022700 ns/op 161.04 MB/s
+Search_Easy1_CachedPCRE/4M 100 26111890 ns/op 160.63 MB/s
+Search_Easy1_CachedPCRE/8M 50 52238340 ns/op 160.58 MB/s
+Search_Easy1_CachedPCRE/16M 20 104536750 ns/op 160.49 MB/s
+Search_Easy1_CachedRE2/8 10000000 295 ns/op 27.08 MB/s
+Search_Easy1_CachedRE2/16 5000000 302 ns/op 52.81 MB/s
+Search_Easy1_CachedRE2/32 5000000 335 ns/op 95.38 MB/s
+Search_Easy1_CachedRE2/64 5000000 344 ns/op 185.80 MB/s
+Search_Easy1_CachedRE2/128 5000000 421 ns/op 303.85 MB/s
+Search_Easy1_CachedRE2/256 5000000 503 ns/op 508.89 MB/s
+Search_Easy1_CachedRE2/512 5000000 694 ns/op 736.74 MB/s
+Search_Easy1_CachedRE2/1K 1000000 1176 ns/op 870.59 MB/s
+Search_Easy1_CachedRE2/2K 1000000 2139 ns/op 957.38 MB/s
+Search_Easy1_CachedRE2/4K 500000 3803 ns/op 1076.84 MB/s
+Search_Easy1_CachedRE2/8K 500000 7336 ns/op 1116.54 MB/s
+Search_Easy1_CachedRE2/16K 200000 14191 ns/op 1154.47 MB/s
+Search_Easy1_CachedRE2/32K 100000 29177 ns/op 1123.07 MB/s
+Search_Easy1_CachedRE2/64K 50000 58598 ns/op 1118.38 MB/s
+Search_Easy1_CachedRE2/128K 10000 127625 ns/op 1027.01 MB/s
+Search_Easy1_CachedRE2/256K 10000 254186 ns/op 1031.30 MB/s
+Search_Easy1_CachedRE2/512K 5000 493326 ns/op 1062.76 MB/s
+Search_Easy1_CachedRE2/1M 2000 1135745 ns/op 923.25 MB/s
+Search_Easy1_CachedRE2/2M 1000 2250206 ns/op 931.98 MB/s
+Search_Easy1_CachedRE2/4M 500 4513804 ns/op 929.22 MB/s
+Search_Easy1_CachedRE2/8M 200 9019710 ns/op 930.03 MB/s
+Search_Easy1_CachedRE2/16M 100 18027570 ns/op 930.64 MB/s
+Search_Medium_CachedPCRE/8 10000000 172 ns/op 46.39 MB/s
+Search_Medium_CachedPCRE/16 10000000 215 ns/op 74.33 MB/s
+Search_Medium_CachedPCRE/32 5000000 298 ns/op 107.26 MB/s
+Search_Medium_CachedPCRE/64 5000000 441 ns/op 144.98 MB/s
+Search_Medium_CachedPCRE/128 5000000 729 ns/op 175.45 MB/s
+Search_Medium_CachedPCRE/256 100000 16796 ns/op 15.24 MB/s
+Search_Medium_CachedPCRE/512 50000 40007 ns/op 12.80 MB/s
+Search_Medium_CachedPCRE/1K 20000 78764 ns/op 13.00 MB/s
+Search_Medium_CachedPCRE/2K 10000 116986 ns/op 17.51 MB/s
+Search_Medium_CachedPCRE/4K 10000 289854 ns/op 14.13 MB/s
+Search_Medium_CachedPCRE/8K 5000 627300 ns/op 13.06 MB/s
+Search_Medium_CachedPCRE/16K 2000 1277751 ns/op 12.82 MB/s
+Search_Medium_CachedPCRE/32K 1000 2555076 ns/op 12.82 MB/s
+Search_Medium_CachedPCRE/64K 500 5106302 ns/op 12.83 MB/s
+Search_Medium_CachedPCRE/128K 100 10204640 ns/op 12.84 MB/s
+Search_Medium_CachedPCRE/256K 100 20416970 ns/op 12.84 MB/s
+Search_Medium_CachedRE2/8 5000000 333 ns/op 24.02 MB/s
+Search_Medium_CachedRE2/16 5000000 389 ns/op 41.12 MB/s
+Search_Medium_CachedRE2/32 5000000 498 ns/op 64.23 MB/s
+Search_Medium_CachedRE2/64 5000000 716 ns/op 89.35 MB/s
+Search_Medium_CachedRE2/128 1000000 1152 ns/op 111.08 MB/s
+Search_Medium_CachedRE2/256 1000000 2027 ns/op 126.29 MB/s
+Search_Medium_CachedRE2/512 500000 3772 ns/op 135.70 MB/s
+Search_Medium_CachedRE2/1K 500000 7264 ns/op 140.95 MB/s
+Search_Medium_CachedRE2/2K 200000 14266 ns/op 143.56 MB/s
+Search_Medium_CachedRE2/4K 100000 28230 ns/op 145.09 MB/s
+Search_Medium_CachedRE2/8K 50000 56221 ns/op 145.71 MB/s
+Search_Medium_CachedRE2/16K 10000 112045 ns/op 146.23 MB/s
+Search_Medium_CachedRE2/32K 10000 223917 ns/op 146.34 MB/s
+Search_Medium_CachedRE2/64K 5000 448381 ns/op 146.16 MB/s
+Search_Medium_CachedRE2/128K 2000 903067 ns/op 145.14 MB/s
+Search_Medium_CachedRE2/256K 1000 1804888 ns/op 145.24 MB/s
+Search_Medium_CachedRE2/512K 500 3621616 ns/op 144.77 MB/s
+Search_Medium_CachedRE2/1M 500 7316090 ns/op 143.32 MB/s
+Search_Medium_CachedRE2/2M 100 14672140 ns/op 142.93 MB/s
+Search_Medium_CachedRE2/4M 100 29322600 ns/op 143.04 MB/s
+Search_Medium_CachedRE2/8M 50 58591820 ns/op 143.17 MB/s
+Search_Medium_CachedRE2/16M 20 117035300 ns/op 143.35 MB/s
+Search_Hard_CachedPCRE/8 10000000 189 ns/op 42.19 MB/s
+Search_Hard_CachedPCRE/16 10000000 232 ns/op 68.88 MB/s
+Search_Hard_CachedPCRE/32 5000000 308 ns/op 103.56 MB/s
+Search_Hard_CachedPCRE/64 5000000 459 ns/op 139.43 MB/s
+Search_Hard_CachedPCRE/128 2000000 752 ns/op 170.21 MB/s
+Search_Hard_CachedPCRE/256 2000 1039441 ns/op 0.25 MB/s
+Search_Hard_CachedPCRE/512 500 4261278 ns/op 0.12 MB/s
+Search_Hard_CachedPCRE/1K 100 16900780 ns/op 0.06 MB/s
+Search_Hard_CachedPCRE/2K 50 61840340 ns/op 0.03 MB/s
+Search_Hard_CachedPCRE/4K 5 266433000 ns/op 0.02 MB/s
+Search_Hard_CachedRE2/8 5000000 333 ns/op 24.01 MB/s
+Search_Hard_CachedRE2/16 5000000 386 ns/op 41.42 MB/s
+Search_Hard_CachedRE2/32 5000000 498 ns/op 64.13 MB/s
+Search_Hard_CachedRE2/64 5000000 719 ns/op 88.97 MB/s
+Search_Hard_CachedRE2/128 1000000 1153 ns/op 110.93 MB/s
+Search_Hard_CachedRE2/256 1000000 2029 ns/op 126.12 MB/s
+Search_Hard_CachedRE2/512 500000 3765 ns/op 135.98 MB/s
+Search_Hard_CachedRE2/1K 500000 7257 ns/op 141.10 MB/s
+Search_Hard_CachedRE2/2K 200000 14263 ns/op 143.58 MB/s
+Search_Hard_CachedRE2/4K 100000 28235 ns/op 145.07 MB/s
+Search_Hard_CachedRE2/8K 50000 56166 ns/op 145.85 MB/s
+Search_Hard_CachedRE2/16K 10000 111887 ns/op 146.43 MB/s
+Search_Hard_CachedRE2/32K 10000 224057 ns/op 146.25 MB/s
+Search_Hard_CachedRE2/64K 5000 447562 ns/op 146.43 MB/s
+Search_Hard_CachedRE2/128K 2000 902071 ns/op 145.30 MB/s
+Search_Hard_CachedRE2/256K 1000 1804780 ns/op 145.25 MB/s
+Search_Hard_CachedRE2/512K 500 3601118 ns/op 145.59 MB/s
+Search_Hard_CachedRE2/1M 500 7287856 ns/op 143.88 MB/s
+Search_Hard_CachedRE2/2M 100 14713470 ns/op 142.53 MB/s
+Search_Hard_CachedRE2/4M 100 29151470 ns/op 143.88 MB/s
+Search_Hard_CachedRE2/8M 50 58191300 ns/op 144.16 MB/s
+Search_Hard_CachedRE2/16M 20 116104850 ns/op 144.50 MB/s
+Search_Parens_CachedPCRE/8 5000000 305 ns/op 26.22 MB/s
+Search_Parens_CachedRE2/8 5000000 329 ns/op 24.27 MB/s
+Search_Parens_CachedRE2/16 5000000 386 ns/op 41.35 MB/s
+Search_Parens_CachedRE2/32 5000000 494 ns/op 64.69 MB/s
+Search_Parens_CachedRE2/64 5000000 711 ns/op 89.92 MB/s
+Search_Parens_CachedRE2/128 1000000 1150 ns/op 111.21 MB/s
+Search_Parens_CachedRE2/256 1000000 2018 ns/op 126.81 MB/s
+Search_Parens_CachedRE2/512 500000 3767 ns/op 135.88 MB/s
+Search_Parens_CachedRE2/1K 500000 7254 ns/op 141.15 MB/s
+Search_Parens_CachedRE2/2K 200000 14250 ns/op 143.71 MB/s
+Search_Parens_CachedRE2/4K 100000 28199 ns/op 145.25 MB/s
+Search_Parens_CachedRE2/8K 50000 56158 ns/op 145.87 MB/s
+Search_Parens_CachedRE2/16K 10000 112139 ns/op 146.10 MB/s
+Search_Parens_CachedRE2/32K 10000 223758 ns/op 146.44 MB/s
+Search_Parens_CachedRE2/64K 5000 447242 ns/op 146.53 MB/s
+Search_Parens_CachedRE2/128K 2000 902342 ns/op 145.26 MB/s
+Search_Parens_CachedRE2/256K 1000 1804484 ns/op 145.27 MB/s
+Search_Parens_CachedRE2/512K 500 3603350 ns/op 145.50 MB/s
+Search_Parens_CachedRE2/1M 500 7275228 ns/op 144.13 MB/s
+Search_Parens_CachedRE2/2M 100 14546350 ns/op 144.17 MB/s
+Search_Parens_CachedRE2/4M 100 29132730 ns/op 143.97 MB/s
+Search_Parens_CachedRE2/8M 50 58143420 ns/op 144.27 MB/s
+Search_Parens_CachedRE2/16M 20 116224000 ns/op 144.35 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 386 ns/op 20.73 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 475 ns/op 33.64 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 639 ns/op 50.07 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 966 ns/op 66.19 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1619 ns/op 79.02 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 2927 ns/op 87.43 MB/s
+Search_BigFixed_CachedPCRE/512 500000 5547 ns/op 92.29 MB/s
+Search_BigFixed_CachedPCRE/1K 200000 10789 ns/op 94.91 MB/s
+Search_BigFixed_CachedPCRE/2K 100000 21254 ns/op 96.36 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 42248 ns/op 96.95 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 85732 ns/op 95.55 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 169041 ns/op 96.92 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 336530 ns/op 97.37 MB/s
+Search_BigFixed_CachedRE2/8 10000000 173 ns/op 46.13 MB/s
+Search_BigFixed_CachedRE2/16 5000000 358 ns/op 44.63 MB/s
+Search_BigFixed_CachedRE2/32 5000000 428 ns/op 74.60 MB/s
+Search_BigFixed_CachedRE2/64 5000000 552 ns/op 115.91 MB/s
+Search_BigFixed_CachedRE2/128 2000000 786 ns/op 162.81 MB/s
+Search_BigFixed_CachedRE2/256 1000000 1261 ns/op 202.95 MB/s
+Search_BigFixed_CachedRE2/512 1000000 2226 ns/op 229.95 MB/s
+Search_BigFixed_CachedRE2/1K 500000 4306 ns/op 237.77 MB/s
+Search_BigFixed_CachedRE2/2K 200000 8298 ns/op 246.80 MB/s
+Search_BigFixed_CachedRE2/4K 100000 15641 ns/op 261.87 MB/s
+Search_BigFixed_CachedRE2/8K 50000 32298 ns/op 253.63 MB/s
+Search_BigFixed_CachedRE2/16K 50000 64673 ns/op 253.33 MB/s
+Search_BigFixed_CachedRE2/32K 10000 128773 ns/op 254.46 MB/s
+Search_BigFixed_CachedRE2/64K 5000 260717 ns/op 251.37 MB/s
+Search_BigFixed_CachedRE2/128K 5000 511763 ns/op 256.12 MB/s
+Search_BigFixed_CachedRE2/256K 2000 1010685 ns/op 259.37 MB/s
+Search_BigFixed_CachedRE2/512K 1000 2045435 ns/op 256.32 MB/s
+Search_BigFixed_CachedRE2/1M 500 4194192 ns/op 250.01 MB/s
+Search_Success_PCRE/8 500000 3180 ns/op 2.52 MB/s
+Search_Success_PCRE/16 500000 3257 ns/op 4.91 MB/s
+Search_Success_PCRE/32 500000 3398 ns/op 9.42 MB/s
+Search_Success_PCRE/64 500000 3667 ns/op 17.45 MB/s
+Search_Success_PCRE/128 500000 4217 ns/op 30.35 MB/s
+Search_Success_PCRE/256 500000 5323 ns/op 48.09 MB/s
+Search_Success_PCRE/512 200000 7548 ns/op 67.82 MB/s
+Search_Success_PCRE/1K 200000 11978 ns/op 85.48 MB/s
+Search_Success_PCRE/2K 100000 20952 ns/op 97.74 MB/s
+Search_Success_PCRE/4K 50000 38810 ns/op 105.54 MB/s
+Search_Success_PCRE/8K 50000 74005 ns/op 110.69 MB/s
+Search_Success_PCRE/16K 10000 145100 ns/op 112.91 MB/s
+Search_Success_PCRE/32K 10000 286997 ns/op 114.18 MB/s
+Search_Success_PCRE/64K 5000 570876 ns/op 114.80 MB/s
+Search_Success_PCRE/128K 2000 1145287 ns/op 114.44 MB/s
+Search_Success_PCRE/256K 1000 2293161 ns/op 114.32 MB/s
+Search_Success_PCRE/512K 500 4615962 ns/op 113.58 MB/s
+Search_Success_PCRE/1M 200 9465575 ns/op 110.78 MB/s
+Search_Success_PCRE/2M 100 19204210 ns/op 109.20 MB/s
+Search_Success_PCRE/4M 50 39546740 ns/op 106.06 MB/s
+Search_Success_PCRE/8M 20 86620850 ns/op 96.84 MB/s
+Search_Success_PCRE/16M 5 249759000 ns/op 67.17 MB/s
+Search_Success_RE2/8 200000 11045 ns/op 0.72 MB/s
+Search_Success_RE2/16 100000 24945 ns/op 0.64 MB/s
+Search_Success_RE2/32 100000 25051 ns/op 1.28 MB/s
+Search_Success_RE2/64 100000 25231 ns/op 2.54 MB/s
+Search_Success_RE2/128 100000 25674 ns/op 4.99 MB/s
+Search_Success_RE2/256 100000 26494 ns/op 9.66 MB/s
+Search_Success_RE2/512 100000 28177 ns/op 18.17 MB/s
+Search_Success_RE2/1K 50000 31724 ns/op 32.28 MB/s
+Search_Success_RE2/2K 50000 38681 ns/op 52.95 MB/s
+Search_Success_RE2/4K 50000 52757 ns/op 77.64 MB/s
+Search_Success_RE2/8K 20000 81316 ns/op 100.74 MB/s
+Search_Success_RE2/16K 10000 137268 ns/op 119.36 MB/s
+Search_Success_RE2/32K 10000 250210 ns/op 130.96 MB/s
+Search_Success_RE2/64K 5000 475959 ns/op 137.69 MB/s
+Search_Success_RE2/128K 2000 932651 ns/op 140.54 MB/s
+Search_Success_RE2/256K 1000 1834279 ns/op 142.91 MB/s
+Search_Success_RE2/512K 500 3667904 ns/op 142.94 MB/s
+Search_Success_RE2/1M 200 7492295 ns/op 139.95 MB/s
+Search_Success_RE2/2M 100 15393340 ns/op 136.24 MB/s
+Search_Success_RE2/4M 50 31713440 ns/op 132.26 MB/s
+Search_Success_RE2/8M 20 70783000 ns/op 118.51 MB/s
+Search_Success_RE2/16M 5 214766800 ns/op 78.12 MB/s
+Search_Success_CachedPCRE/8 5000000 398 ns/op 20.07 MB/s
+Search_Success_CachedPCRE/16 5000000 467 ns/op 34.21 MB/s
+Search_Success_CachedPCRE/32 5000000 606 ns/op 52.78 MB/s
+Search_Success_CachedPCRE/64 2000000 889 ns/op 71.92 MB/s
+Search_Success_CachedPCRE/128 1000000 1435 ns/op 89.15 MB/s
+Search_Success_CachedPCRE/256 1000000 2548 ns/op 100.46 MB/s
+Search_Success_CachedPCRE/512 500000 4759 ns/op 107.58 MB/s
+Search_Success_CachedPCRE/1K 200000 9196 ns/op 111.34 MB/s
+Search_Success_CachedPCRE/2K 100000 18028 ns/op 113.60 MB/s
+Search_Success_CachedPCRE/4K 50000 35661 ns/op 114.86 MB/s
+Search_Success_CachedPCRE/8K 50000 71119 ns/op 115.19 MB/s
+Search_Success_CachedPCRE/16K 10000 141806 ns/op 115.54 MB/s
+Search_Success_CachedPCRE/32K 10000 283456 ns/op 115.60 MB/s
+Search_Success_CachedPCRE/64K 5000 567732 ns/op 115.43 MB/s
+Search_Success_CachedPCRE/128K 2000 1138747 ns/op 115.10 MB/s
+Search_Success_CachedPCRE/256K 1000 2313186 ns/op 113.33 MB/s
+Search_Success_CachedPCRE/512K 500 4577496 ns/op 114.54 MB/s
+Search_Success_CachedPCRE/1M 200 9356010 ns/op 112.08 MB/s
+Search_Success_CachedPCRE/2M 100 19004790 ns/op 110.35 MB/s
+Search_Success_CachedPCRE/4M 50 39343000 ns/op 106.61 MB/s
+Search_Success_CachedPCRE/8M 20 86153650 ns/op 97.37 MB/s
+Search_Success_CachedPCRE/16M 5 246868000 ns/op 67.96 MB/s
+Search_Success_CachedRE2/8 10000000 194 ns/op 41.10 MB/s
+Search_Success_CachedRE2/16 5000000 398 ns/op 40.20 MB/s
+Search_Success_CachedRE2/32 5000000 503 ns/op 63.59 MB/s
+Search_Success_CachedRE2/64 5000000 723 ns/op 88.49 MB/s
+Search_Success_CachedRE2/128 1000000 1158 ns/op 110.49 MB/s
+Search_Success_CachedRE2/256 1000000 2033 ns/op 125.88 MB/s
+Search_Success_CachedRE2/512 500000 3778 ns/op 135.49 MB/s
+Search_Success_CachedRE2/1K 500000 7267 ns/op 140.91 MB/s
+Search_Success_CachedRE2/2K 200000 14244 ns/op 143.77 MB/s
+Search_Success_CachedRE2/4K 100000 28205 ns/op 145.22 MB/s
+Search_Success_CachedRE2/8K 50000 56127 ns/op 145.95 MB/s
+Search_Success_CachedRE2/16K 10000 111843 ns/op 146.49 MB/s
+Search_Success_CachedRE2/32K 10000 223998 ns/op 146.29 MB/s
+Search_Success_CachedRE2/64K 5000 448512 ns/op 146.12 MB/s
+Search_Success_CachedRE2/128K 2000 901455 ns/op 145.40 MB/s
+Search_Success_CachedRE2/256K 1000 1806001 ns/op 145.15 MB/s
+Search_Success_CachedRE2/512K 500 3657618 ns/op 143.34 MB/s
+Search_Success_CachedRE2/1M 200 7519345 ns/op 139.45 MB/s
+Search_Success_CachedRE2/2M 100 15277030 ns/op 137.27 MB/s
+Search_Success_CachedRE2/4M 50 31999980 ns/op 131.07 MB/s
+Search_Success_CachedRE2/8M 20 70956150 ns/op 118.22 MB/s
+Search_Success_CachedRE2/16M 5 216152800 ns/op 77.62 MB/s
+Search_Success1_PCRE/8 500000 3423 ns/op 2.34 MB/s
+Search_Success1_PCRE/16 500000 3479 ns/op 4.60 MB/s
+Search_Success1_PCRE/32 500000 3569 ns/op 8.97 MB/s
+Search_Success1_PCRE/64 500000 3861 ns/op 16.57 MB/s
+Search_Success1_PCRE/128 500000 4451 ns/op 28.76 MB/s
+Search_Success1_PCRE/256 500000 5540 ns/op 46.21 MB/s
+Search_Success1_PCRE/512 200000 7746 ns/op 66.09 MB/s
+Search_Success1_PCRE/1K 200000 12197 ns/op 83.95 MB/s
+Search_Success1_PCRE/2K 100000 21043 ns/op 97.32 MB/s
+Search_Success1_PCRE/4K 50000 38724 ns/op 105.77 MB/s
+Search_Success1_PCRE/8K 50000 74377 ns/op 110.14 MB/s
+Search_Success1_PCRE/16K 10000 145584 ns/op 112.54 MB/s
+Search_Success1_PCRE/32K 10000 287938 ns/op 113.80 MB/s
+Search_Success1_PCRE/64K 5000 573818 ns/op 114.21 MB/s
+Search_Success1_PCRE/128K 2000 1143687 ns/op 114.60 MB/s
+Search_Success1_PCRE/256K 1000 2289906 ns/op 114.48 MB/s
+Search_Success1_PCRE/512K 500 4585568 ns/op 114.33 MB/s
+Search_Success1_PCRE/1M 200 9418160 ns/op 111.34 MB/s
+Search_Success1_PCRE/2M 100 19084930 ns/op 109.89 MB/s
+Search_Success1_PCRE/4M 50 39363100 ns/op 106.55 MB/s
+Search_Success1_PCRE/8M 20 86060150 ns/op 97.47 MB/s
+Search_Success1_PCRE/16M 5 250110600 ns/op 67.08 MB/s
+Search_Success1_RE2/8 50000 33378 ns/op 0.24 MB/s
+Search_Success1_RE2/16 50000 33315 ns/op 0.48 MB/s
+Search_Success1_RE2/32 50000 33282 ns/op 0.96 MB/s
+Search_Success1_RE2/64 50000 33648 ns/op 1.90 MB/s
+Search_Success1_RE2/128 50000 34114 ns/op 3.75 MB/s
+Search_Success1_RE2/256 50000 35068 ns/op 7.30 MB/s
+Search_Success1_RE2/512 50000 36888 ns/op 13.88 MB/s
+Search_Success1_RE2/1K 50000 40304 ns/op 25.41 MB/s
+Search_Success1_RE2/2K 50000 47214 ns/op 43.38 MB/s
+Search_Success1_RE2/4K 50000 61269 ns/op 66.85 MB/s
+Search_Success1_RE2/8K 20000 89250 ns/op 91.79 MB/s
+Search_Success1_RE2/16K 10000 146292 ns/op 111.99 MB/s
+Search_Success1_RE2/32K 10000 258737 ns/op 126.65 MB/s
+Search_Success1_RE2/64K 5000 484877 ns/op 135.16 MB/s
+Search_Success1_RE2/128K 2000 943913 ns/op 138.86 MB/s
+Search_Success1_RE2/256K 1000 1873214 ns/op 139.94 MB/s
+Search_Success1_RE2/512K 500 3705398 ns/op 141.49 MB/s
+Search_Success1_RE2/1M 200 7572110 ns/op 138.48 MB/s
+Search_Success1_RE2/2M 100 15408090 ns/op 136.11 MB/s
+Search_Success1_RE2/4M 50 31925020 ns/op 131.38 MB/s
+Search_Success1_RE2/8M 20 71334800 ns/op 117.59 MB/s
+Search_Success1_RE2/16M 5 215033000 ns/op 78.02 MB/s
+Search_Success1_Cached_PCRE/8 5000000 444 ns/op 18.02 MB/s
+Search_Success1_Cached_PCRE/16 5000000 512 ns/op 31.25 MB/s
+Search_Success1_Cached_PCRE/32 5000000 648 ns/op 49.31 MB/s
+Search_Success1_Cached_PCRE/64 2000000 924 ns/op 69.23 MB/s
+Search_Success1_Cached_PCRE/128 1000000 1479 ns/op 86.50 MB/s
+Search_Success1_Cached_PCRE/256 1000000 2583 ns/op 99.09 MB/s
+Search_Success1_Cached_PCRE/512 500000 4820 ns/op 106.21 MB/s
+Search_Success1_Cached_PCRE/1K 200000 9312 ns/op 109.95 MB/s
+Search_Success1_Cached_PCRE/2K 100000 18101 ns/op 113.14 MB/s
+Search_Success1_Cached_PCRE/4K 50000 35873 ns/op 114.18 MB/s
+Search_Success1_Cached_PCRE/8K 50000 71355 ns/op 114.81 MB/s
+Search_Success1_Cached_PCRE/16K 10000 142622 ns/op 114.88 MB/s
+Search_Success1_Cached_PCRE/32K 10000 284619 ns/op 115.13 MB/s
+Search_Success1_Cached_PCRE/64K 5000 569459 ns/op 115.08 MB/s
+Search_Success1_Cached_PCRE/128K 2000 1141538 ns/op 114.82 MB/s
+Search_Success1_Cached_PCRE/256K 1000 2284009 ns/op 114.77 MB/s
+Search_Success1_Cached_PCRE/512K 500 4600102 ns/op 113.97 MB/s
+Search_Success1_Cached_PCRE/1M 200 9412150 ns/op 111.41 MB/s
+Search_Success1_Cached_PCRE/2M 100 19149300 ns/op 109.52 MB/s
+Search_Success1_Cached_PCRE/4M 50 39554360 ns/op 106.04 MB/s
+Search_Success1_Cached_PCRE/8M 20 86455700 ns/op 97.03 MB/s
+Search_Success1_Cached_PCRE/16M 5 247629000 ns/op 67.75 MB/s
+Search_Success1_Cached_RE2/8 5000000 342 ns/op 23.34 MB/s
+Search_Success1_Cached_RE2/16 5000000 393 ns/op 40.65 MB/s
+Search_Success1_Cached_RE2/32 5000000 491 ns/op 65.09 MB/s
+Search_Success1_Cached_RE2/64 5000000 722 ns/op 88.62 MB/s
+Search_Success1_Cached_RE2/128 1000000 1157 ns/op 110.54 MB/s
+Search_Success1_Cached_RE2/256 1000000 2032 ns/op 125.94 MB/s
+Search_Success1_Cached_RE2/512 500000 3783 ns/op 135.32 MB/s
+Search_Success1_Cached_RE2/1K 500000 7283 ns/op 140.59 MB/s
+Search_Success1_Cached_RE2/2K 200000 14272 ns/op 143.49 MB/s
+Search_Success1_Cached_RE2/4K 100000 28247 ns/op 145.00 MB/s
+Search_Success1_Cached_RE2/8K 50000 56279 ns/op 145.56 MB/s
+Search_Success1_Cached_RE2/16K 10000 112283 ns/op 145.92 MB/s
+Search_Success1_Cached_RE2/32K 10000 224269 ns/op 146.11 MB/s
+Search_Success1_Cached_RE2/64K 5000 448363 ns/op 146.17 MB/s
+Search_Success1_Cached_RE2/128K 2000 903637 ns/op 145.05 MB/s
+Search_Success1_Cached_RE2/256K 1000 1811174 ns/op 144.74 MB/s
+Search_Success1_Cached_RE2/512K 500 3637266 ns/op 144.14 MB/s
+Search_Success1_Cached_RE2/1M 200 7452810 ns/op 140.70 MB/s
+Search_Success1_Cached_RE2/2M 100 15218540 ns/op 137.80 MB/s
+Search_Success1_Cached_RE2/4M 50 31624240 ns/op 132.63 MB/s
+Search_Success1_Cached_RE2/8M 20 70441100 ns/op 119.09 MB/s
+Search_Success1_Cached_RE2/16M 5 214653600 ns/op 78.16 MB/s
+Search_Digits_PCRE 500000 7117 ns/op
+Search_Digits_RE2 100000 27121 ns/op
+Parse_Digits_PCRE 500000 7214 ns/op
+Parse_Digits_RE2 200000 13193 ns/op
+Parse_CachedDigits_PCRE 2000000 771 ns/op
+Parse_CachedDigits_RE2 5000000 452 ns/op
+Parse_DigitDs_PCRE 500000 6655 ns/op
+Parse_DigitDs_RE2 200000 12935 ns/op
+Parse_CachedDigitDs_PCRE 2000000 761 ns/op
+Parse_CachedDigitDs_RE2 5000000 452 ns/op
+Parse_Split_PCRE 500000 4849 ns/op
+Parse_Split_RE2 200000 14149 ns/op
+Parse_CachedSplit_PCRE 5000000 572 ns/op
+Parse_CachedSplit_RE2 10000000 278 ns/op
+Parse_SplitHard_PCRE 500000 4695 ns/op
+Parse_SplitHard_RE2 100000 17776 ns/op
+Parse_CachedSplitHard_PCRE 5000000 558 ns/op
+Parse_CachedSplitHard_RE2 500000 2925 ns/op
+Parse_CachedSplitBig1_PCRE 200 8378325 ns/op
+Parse_CachedSplitBig1_RE2 2000 1296256 ns/op
+Parse_CachedSplitBig2_PCRE 2000 849668 ns/op
+Parse_CachedSplitBig2_RE2 20 93559400 ns/op
+BM_PCRE_Compile 500000 5773 ns/op
+BM_RE2_Compile 200000 14117 ns/op
+SearchPhone_CachedPCRE/8 1000000 2107 ns/op 3.80 MB/s
+SearchPhone_CachedPCRE/16 500000 3511 ns/op 4.56 MB/s
+SearchPhone_CachedPCRE/32 500000 6303 ns/op 5.08 MB/s
+SearchPhone_CachedPCRE/64 200000 11898 ns/op 5.38 MB/s
+SearchPhone_CachedPCRE/128 100000 23242 ns/op 5.51 MB/s
+SearchPhone_CachedPCRE/256 50000 45867 ns/op 5.58 MB/s
+SearchPhone_CachedPCRE/512 20000 90764 ns/op 5.64 MB/s
+SearchPhone_CachedPCRE/1K 10000 180150 ns/op 5.68 MB/s
+SearchPhone_CachedPCRE/2K 5000 356942 ns/op 5.74 MB/s
+SearchPhone_CachedPCRE/4K 5000 707356 ns/op 5.79 MB/s
+SearchPhone_CachedPCRE/8K 2000 1408777 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/16K 1000 2816931 ns/op 5.82 MB/s
+SearchPhone_CachedPCRE/32K 500 5630556 ns/op 5.82 MB/s
+SearchPhone_CachedPCRE/64K 100 11257450 ns/op 5.82 MB/s
+SearchPhone_CachedPCRE/128K 100 22480780 ns/op 5.83 MB/s
+SearchPhone_CachedPCRE/256K 50 44877320 ns/op 5.84 MB/s
+SearchPhone_CachedPCRE/512K 20 90030600 ns/op 5.82 MB/s
+SearchPhone_CachedPCRE/1M 10 180520400 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/2M 5 360229400 ns/op 5.82 MB/s
+SearchPhone_CachedPCRE/4M 5 720922200 ns/op 5.82 MB/s
+SearchPhone_CachedPCRE/8M 1 1443346000 ns/op 5.81 MB/s
+SearchPhone_CachedPCRE/16M 1 2885907000 ns/op 5.81 MB/s
+SearchPhone_CachedRE2/8 1000000 1035 ns/op 7.73 MB/s
+SearchPhone_CachedRE2/16 1000000 1096 ns/op 14.59 MB/s
+SearchPhone_CachedRE2/32 1000000 1206 ns/op 26.53 MB/s
+SearchPhone_CachedRE2/64 1000000 1421 ns/op 45.01 MB/s
+SearchPhone_CachedRE2/128 1000000 1868 ns/op 68.49 MB/s
+SearchPhone_CachedRE2/256 1000000 2742 ns/op 93.35 MB/s
+SearchPhone_CachedRE2/512 500000 4488 ns/op 114.06 MB/s
+SearchPhone_CachedRE2/1K 200000 7960 ns/op 128.63 MB/s
+SearchPhone_CachedRE2/2K 200000 14980 ns/op 136.71 MB/s
+SearchPhone_CachedRE2/4K 100000 28984 ns/op 141.32 MB/s
+SearchPhone_CachedRE2/8K 50000 56914 ns/op 143.93 MB/s
+SearchPhone_CachedRE2/16K 10000 113004 ns/op 144.99 MB/s
+SearchPhone_CachedRE2/32K 10000 224690 ns/op 145.84 MB/s
+SearchPhone_CachedRE2/64K 5000 449388 ns/op 145.83 MB/s
+SearchPhone_CachedRE2/128K 2000 898866 ns/op 145.82 MB/s
+SearchPhone_CachedRE2/256K 1000 1796509 ns/op 145.92 MB/s
+SearchPhone_CachedRE2/512K 500 3590754 ns/op 146.01 MB/s
+SearchPhone_CachedRE2/1M 500 7255254 ns/op 144.53 MB/s
+SearchPhone_CachedRE2/2M 100 14476190 ns/op 144.87 MB/s
+SearchPhone_CachedRE2/4M 100 28990300 ns/op 144.68 MB/s
+SearchPhone_CachedRE2/8M 50 57857200 ns/op 144.99 MB/s
+SearchPhone_CachedRE2/16M 20 115874300 ns/op 144.79 MB/s
+EmptyPartialMatchPCRE 10000000 190 ns/op
+EmptyPartialMatchRE2 10000000 272 ns/op
+SimplePartialMatchPCRE 10000000 271 ns/op
+SimplePartialMatchRE2 5000000 334 ns/op
+HTTPPartialMatchPCRE 2000000 896 ns/op
+HTTPPartialMatchRE2 1000000 1089 ns/op
+SmallHTTPPartialMatchPCRE 2000000 895 ns/op
+SmallHTTPPartialMatchRE2 1000000 1080 ns/op
+DotMatchPCRE 2000000 863 ns/op
+DotMatchRE2 1000000 1080 ns/op
+ASCIIMatchPCRE 2000000 780 ns/op
+ASCIIMatchRE2 1000000 1079 ns/op
diff --git a/third_party/re2/src/benchlog/benchlog.wreck b/third_party/re2/src/benchlog/benchlog.wreck
new file mode 100644
index 000000000..073ec4c6b
--- /dev/null
+++ b/third_party/re2/src/benchlog/benchlog.wreck
@@ -0,0 +1,1058 @@
+hw.machine = i386
+hw.model = MacPro1,1
+hw.ncpu = 4
+hw.byteorder = 1234
+hw.physmem = 2147483648
+hw.usermem = 1477443584
+hw.pagesize = 4096
+hw.epoch = 0
+hw.vectorunit = 1
+hw.busfrequency = 1332000000
+hw.cpufrequency = 2660000000
+hw.cachelinesize = 64
+hw.l1icachesize = 32768
+hw.l1dcachesize = 32768
+hw.l2settings = 1
+hw.l2cachesize = 4194304
+hw.tbfrequency = 1000000000
+hw.memsize = 4294967296
+hw.availcpu = 4
+net.link.ether.inet.apple_hwcksum_rx: 1
+net.link.ether.inet.apple_hwcksum_tx: 1
+hw.ncpu: 4
+hw.byteorder: 1234
+hw.memsize: 4294967296
+hw.activecpu: 4
+hw.optional.x86_64: 1
+hw.optional.sse4_2: 0
+hw.optional.sse4_1: 0
+hw.optional.supplementalsse3: 1
+hw.optional.sse3: 1
+hw.optional.sse2: 1
+hw.optional.sse: 1
+hw.optional.mmx: 1
+hw.optional.floatingpoint: 1
+hw.packages: 2
+hw.tbfrequency: 1000000000
+hw.l2cachesize: 4194304
+hw.l1dcachesize: 32768
+hw.l1icachesize: 32768
+hw.cachelinesize: 64
+hw.cpufrequency_max: 2660000000
+hw.cpufrequency_min: 2660000000
+hw.cpufrequency: 2660000000
+hw.busfrequency_max: 1332000000
+hw.busfrequency_min: 1332000000
+hw.busfrequency: 1332000000
+hw.pagesize: 4096
+hw.cachesize: 4294967296 32768 4194304 0 0 0 0 0 0 0
+hw.cacheconfig: 4 1 2 0 0 0 0 0 0 0
+hw.cpufamily: 1114597871
+hw.cpu64bit_capable: 1
+hw.cpusubtype: 4
+hw.cputype: 7
+hw.logicalcpu_max: 4
+hw.logicalcpu: 4
+hw.physicalcpu_max: 4
+hw.physicalcpu: 4
+machdep.pmap.hashwalks: 1141082341
+
+machdep.cpu.thread_count: 2
+machdep.cpu.core_count: 2
+machdep.cpu.address_bits.virtual: 48
+machdep.cpu.address_bits.physical: 36
+machdep.cpu.tlb.data_large: 32
+machdep.cpu.tlb.inst_large: 8
+machdep.cpu.tlb.data_small: 256
+machdep.cpu.tlb.inst_small: 128
+machdep.cpu.cache.size: 4096
+machdep.cpu.cache.L2_associativity: 8
+machdep.cpu.cache.linesize: 64
+machdep.cpu.arch_perf.fixed_width: 0
+machdep.cpu.arch_perf.fixed_number: 0
+machdep.cpu.arch_perf.events: 0
+machdep.cpu.arch_perf.events_number: 7
+machdep.cpu.arch_perf.width: 40
+machdep.cpu.arch_perf.number: 2
+machdep.cpu.arch_perf.version: 2
+machdep.cpu.thermal.ACNT_MCNT: 1
+machdep.cpu.thermal.thresholds: 2
+machdep.cpu.thermal.dynamic_acceleration: 0
+machdep.cpu.thermal.sensor: 1
+machdep.cpu.mwait.sub_Cstates: 32
+machdep.cpu.mwait.extensions: 3
+machdep.cpu.mwait.linesize_max: 64
+machdep.cpu.mwait.linesize_min: 64
+machdep.cpu.microcode_version: 68
+machdep.cpu.cores_per_package: 2
+machdep.cpu.logical_per_package: 2
+machdep.cpu.extfeatures: XD EM64T
+machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM
+machdep.cpu.brand: 0
+machdep.cpu.signature: 1782
+machdep.cpu.extfeature_bits: 537919488 1
+machdep.cpu.feature_bits: -1075053569 320445
+machdep.cpu.stepping: 6
+machdep.cpu.extfamily: 0
+machdep.cpu.extmodel: 0
+machdep.cpu.model: 15
+machdep.cpu.family: 6
+machdep.cpu.brand_string: Intel(R) Xeon(R) CPU 5150 @ 2.66GHz
+machdep.cpu.vendor: GenuineIntel
+
+==BENCHMARK== wreck.mtv.corp.google.com Fri Feb 26 13:45:06 PST 2010
+# Darwin wreck.mtv.corp.google.com 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:55:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_I386 i386
+# i686-apple-darwin9-g++-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5484)
+# Copyright (C) 2005 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# d7671f473f1a+ tip
+obj/test/regexp_benchmark: Mach-O executable i386
+
+Search_Easy0_CachedPCRE/8 10000000 142 ns/op 56.08 MB/s
+Search_Easy0_CachedPCRE/16 10000000 184 ns/op 86.70 MB/s
+Search_Easy0_CachedPCRE/32 5000000 266 ns/op 120.15 MB/s
+Search_Easy0_CachedPCRE/64 5000000 433 ns/op 147.54 MB/s
+Search_Easy0_CachedPCRE/128 2000000 782 ns/op 163.61 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1435 ns/op 178.37 MB/s
+Search_Easy0_CachedPCRE/512 500000 3151 ns/op 162.46 MB/s
+Search_Easy0_CachedPCRE/1K 200000 6522 ns/op 156.99 MB/s
+Search_Easy0_CachedPCRE/2K 100000 12024 ns/op 170.32 MB/s
+Search_Easy0_CachedPCRE/4K 50000 24372 ns/op 168.06 MB/s
+Search_Easy0_CachedPCRE/8K 50000 48326 ns/op 169.51 MB/s
+Search_Easy0_CachedPCRE/16K 20000 96331 ns/op 170.08 MB/s
+Search_Easy0_CachedPCRE/32K 10000 189172 ns/op 173.22 MB/s
+Search_Easy0_CachedPCRE/64K 5000 380022 ns/op 172.45 MB/s
+Search_Easy0_CachedPCRE/128K 2000 759526 ns/op 172.57 MB/s
+Search_Easy0_CachedPCRE/256K 1000 1514090 ns/op 173.14 MB/s
+Search_Easy0_CachedPCRE/512K 500 3039238 ns/op 172.51 MB/s
+Search_Easy0_CachedPCRE/1M 200 6089745 ns/op 172.19 MB/s
+Search_Easy0_CachedPCRE/2M 100 12326550 ns/op 170.13 MB/s
+Search_Easy0_CachedPCRE/4M 50 24663580 ns/op 170.06 MB/s
+Search_Easy0_CachedPCRE/8M 50 49853660 ns/op 168.26 MB/s
+Search_Easy0_CachedPCRE/16M 10 100141300 ns/op 167.54 MB/s
+Search_Easy0_CachedRE2/8 5000000 441 ns/op 18.13 MB/s
+Search_Easy0_CachedRE2/16 5000000 451 ns/op 35.44 MB/s
+Search_Easy0_CachedRE2/32 5000000 477 ns/op 66.96 MB/s
+Search_Easy0_CachedRE2/64 2000000 527 ns/op 121.31 MB/s
+Search_Easy0_CachedRE2/128 2000000 601 ns/op 212.78 MB/s
+Search_Easy0_CachedRE2/256 2000000 800 ns/op 319.67 MB/s
+Search_Easy0_CachedRE2/512 1000000 1189 ns/op 430.48 MB/s
+Search_Easy0_CachedRE2/1K 1000000 2010 ns/op 509.44 MB/s
+Search_Easy0_CachedRE2/2K 500000 3660 ns/op 559.55 MB/s
+Search_Easy0_CachedRE2/4K 200000 7036 ns/op 582.08 MB/s
+Search_Easy0_CachedRE2/8K 100000 13675 ns/op 599.04 MB/s
+Search_Easy0_CachedRE2/16K 50000 27108 ns/op 604.38 MB/s
+Search_Easy0_CachedRE2/32K 20000 53246 ns/op 615.40 MB/s
+Search_Easy0_CachedRE2/64K 10000 105293 ns/op 622.41 MB/s
+Search_Easy0_CachedRE2/128K 5000 210763 ns/op 621.89 MB/s
+Search_Easy0_CachedRE2/256K 5000 418416 ns/op 626.51 MB/s
+Search_Easy0_CachedRE2/512K 2000 840122 ns/op 624.06 MB/s
+Search_Easy0_CachedRE2/1M 1000 1686321 ns/op 621.81 MB/s
+Search_Easy0_CachedRE2/2M 500 3394234 ns/op 617.86 MB/s
+Search_Easy0_CachedRE2/4M 200 6926710 ns/op 605.53 MB/s
+Search_Easy0_CachedRE2/8M 100 13850090 ns/op 605.67 MB/s
+Search_Easy0_CachedRE2/16M 50 27810480 ns/op 603.27 MB/s
+Search_Easy1_CachedPCRE/8 10000000 145 ns/op 55.08 MB/s
+Search_Easy1_CachedPCRE/16 10000000 186 ns/op 85.59 MB/s
+Search_Easy1_CachedPCRE/32 5000000 268 ns/op 119.32 MB/s
+Search_Easy1_CachedPCRE/64 5000000 436 ns/op 146.76 MB/s
+Search_Easy1_CachedPCRE/128 2000000 786 ns/op 162.78 MB/s
+Search_Easy1_CachedPCRE/256 1000000 1446 ns/op 176.97 MB/s
+Search_Easy1_CachedPCRE/512 500000 2947 ns/op 173.70 MB/s
+Search_Easy1_CachedPCRE/1K 200000 6212 ns/op 164.82 MB/s
+Search_Easy1_CachedPCRE/2K 100000 12544 ns/op 163.25 MB/s
+Search_Easy1_CachedPCRE/4K 50000 24997 ns/op 163.85 MB/s
+Search_Easy1_CachedPCRE/8K 50000 49945 ns/op 164.02 MB/s
+Search_Easy1_CachedPCRE/16K 20000 98856 ns/op 165.73 MB/s
+Search_Easy1_CachedPCRE/32K 10000 196635 ns/op 166.64 MB/s
+Search_Easy1_CachedPCRE/64K 5000 392336 ns/op 167.04 MB/s
+Search_Easy1_CachedPCRE/128K 2000 781551 ns/op 167.71 MB/s
+Search_Easy1_CachedPCRE/256K 1000 1572536 ns/op 166.70 MB/s
+Search_Easy1_CachedPCRE/512K 500 3133634 ns/op 167.31 MB/s
+Search_Easy1_CachedPCRE/1M 200 6268370 ns/op 167.28 MB/s
+Search_Easy1_CachedPCRE/2M 100 12629380 ns/op 166.05 MB/s
+Search_Easy1_CachedPCRE/4M 50 25311280 ns/op 165.71 MB/s
+Search_Easy1_CachedPCRE/8M 20 50747250 ns/op 165.30 MB/s
+Search_Easy1_CachedPCRE/16M 10 102157400 ns/op 164.23 MB/s
+Search_Easy1_CachedRE2/8 5000000 431 ns/op 18.53 MB/s
+Search_Easy1_CachedRE2/16 5000000 448 ns/op 35.70 MB/s
+Search_Easy1_CachedRE2/32 5000000 475 ns/op 67.36 MB/s
+Search_Easy1_CachedRE2/64 2000000 526 ns/op 121.54 MB/s
+Search_Easy1_CachedRE2/128 2000000 603 ns/op 212.23 MB/s
+Search_Easy1_CachedRE2/256 2000000 799 ns/op 320.12 MB/s
+Search_Easy1_CachedRE2/512 1000000 1182 ns/op 433.15 MB/s
+Search_Easy1_CachedRE2/1K 1000000 2001 ns/op 511.61 MB/s
+Search_Easy1_CachedRE2/2K 500000 3639 ns/op 562.68 MB/s
+Search_Easy1_CachedRE2/4K 200000 7020 ns/op 583.43 MB/s
+Search_Easy1_CachedRE2/8K 100000 13720 ns/op 597.04 MB/s
+Search_Easy1_CachedRE2/16K 50000 27091 ns/op 604.76 MB/s
+Search_Easy1_CachedRE2/32K 20000 53363 ns/op 614.06 MB/s
+Search_Easy1_CachedRE2/64K 10000 104803 ns/op 625.32 MB/s
+Search_Easy1_CachedRE2/128K 5000 210012 ns/op 624.11 MB/s
+Search_Easy1_CachedRE2/256K 5000 416117 ns/op 629.98 MB/s
+Search_Easy1_CachedRE2/512K 2000 832909 ns/op 629.47 MB/s
+Search_Easy1_CachedRE2/1M 1000 1685969 ns/op 621.94 MB/s
+Search_Easy1_CachedRE2/2M 500 3388716 ns/op 618.86 MB/s
+Search_Easy1_CachedRE2/4M 200 6872645 ns/op 610.29 MB/s
+Search_Easy1_CachedRE2/8M 100 13975650 ns/op 600.23 MB/s
+Search_Easy1_CachedRE2/16M 50 27882420 ns/op 601.71 MB/s
+Search_Medium_CachedPCRE/8 10000000 144 ns/op 55.25 MB/s
+Search_Medium_CachedPCRE/16 10000000 192 ns/op 83.21 MB/s
+Search_Medium_CachedPCRE/32 5000000 280 ns/op 114.08 MB/s
+Search_Medium_CachedPCRE/64 5000000 452 ns/op 141.46 MB/s
+Search_Medium_CachedPCRE/128 200000 6086 ns/op 21.03 MB/s
+Search_Medium_CachedPCRE/256 100000 11456 ns/op 22.35 MB/s
+Search_Medium_CachedPCRE/512 50000 27208 ns/op 18.82 MB/s
+Search_Medium_CachedPCRE/1K 20000 53266 ns/op 19.22 MB/s
+Search_Medium_CachedPCRE/2K 20000 84985 ns/op 24.10 MB/s
+Search_Medium_CachedPCRE/4K 5000 205715 ns/op 19.91 MB/s
+Search_Medium_CachedPCRE/8K 5000 421092 ns/op 19.45 MB/s
+Search_Medium_CachedPCRE/16K 2000 847861 ns/op 19.32 MB/s
+Search_Medium_CachedPCRE/32K 1000 1688903 ns/op 19.40 MB/s
+Search_Medium_CachedPCRE/64K 500 3374828 ns/op 19.42 MB/s
+Search_Medium_CachedPCRE/128K 200 6737375 ns/op 19.45 MB/s
+Search_Medium_CachedPCRE/256K 100 13497210 ns/op 19.42 MB/s
+Search_Medium_CachedRE2/8 5000000 456 ns/op 17.53 MB/s
+Search_Medium_CachedRE2/16 5000000 499 ns/op 32.05 MB/s
+Search_Medium_CachedRE2/32 2000000 575 ns/op 55.62 MB/s
+Search_Medium_CachedRE2/64 2000000 730 ns/op 87.61 MB/s
+Search_Medium_CachedRE2/128 1000000 1051 ns/op 121.72 MB/s
+Search_Medium_CachedRE2/256 1000000 1695 ns/op 150.98 MB/s
+Search_Medium_CachedRE2/512 500000 2947 ns/op 173.73 MB/s
+Search_Medium_CachedRE2/1K 200000 5474 ns/op 187.04 MB/s
+Search_Medium_CachedRE2/2K 100000 10384 ns/op 197.21 MB/s
+Search_Medium_CachedRE2/4K 50000 20546 ns/op 199.35 MB/s
+Search_Medium_CachedRE2/8K 50000 39540 ns/op 207.18 MB/s
+Search_Medium_CachedRE2/16K 20000 77860 ns/op 210.43 MB/s
+Search_Medium_CachedRE2/32K 10000 154440 ns/op 212.17 MB/s
+Search_Medium_CachedRE2/64K 5000 306800 ns/op 213.61 MB/s
+Search_Medium_CachedRE2/128K 2000 627489 ns/op 208.88 MB/s
+Search_Medium_CachedRE2/256K 1000 1232221 ns/op 212.74 MB/s
+Search_Medium_CachedRE2/512K 500 2473372 ns/op 211.97 MB/s
+Search_Medium_CachedRE2/1M 500 4963800 ns/op 211.24 MB/s
+Search_Medium_CachedRE2/2M 200 10010555 ns/op 209.49 MB/s
+Search_Medium_CachedRE2/4M 50 20355180 ns/op 206.06 MB/s
+Search_Medium_CachedRE2/8M 50 40085120 ns/op 209.27 MB/s
+Search_Medium_CachedRE2/16M 20 81232650 ns/op 206.53 MB/s
+Search_Hard_CachedPCRE/8 10000000 145 ns/op 54.95 MB/s
+Search_Hard_CachedPCRE/16 10000000 191 ns/op 83.60 MB/s
+Search_Hard_CachedPCRE/32 5000000 279 ns/op 114.53 MB/s
+Search_Hard_CachedPCRE/64 5000000 463 ns/op 137.99 MB/s
+Search_Hard_CachedPCRE/128 5000 235508 ns/op 0.54 MB/s
+Search_Hard_CachedPCRE/256 2000 885356 ns/op 0.29 MB/s
+Search_Hard_CachedPCRE/512 500 3682430 ns/op 0.14 MB/s
+Search_Hard_CachedPCRE/1K 100 14493660 ns/op 0.07 MB/s
+Search_Hard_CachedPCRE/2K 20 54810600 ns/op 0.04 MB/s
+Search_Hard_CachedPCRE/4K 5 236421800 ns/op 0.02 MB/s
+Search_Hard_CachedRE2/8 5000000 460 ns/op 17.39 MB/s
+Search_Hard_CachedRE2/16 5000000 498 ns/op 32.11 MB/s
+Search_Hard_CachedRE2/32 2000000 570 ns/op 56.05 MB/s
+Search_Hard_CachedRE2/64 2000000 726 ns/op 88.08 MB/s
+Search_Hard_CachedRE2/128 1000000 1044 ns/op 122.53 MB/s
+Search_Hard_CachedRE2/256 1000000 1669 ns/op 153.37 MB/s
+Search_Hard_CachedRE2/512 500000 2910 ns/op 175.92 MB/s
+Search_Hard_CachedRE2/1K 200000 5380 ns/op 190.32 MB/s
+Search_Hard_CachedRE2/2K 100000 10730 ns/op 190.86 MB/s
+Search_Hard_CachedRE2/4K 50000 20827 ns/op 196.66 MB/s
+Search_Hard_CachedRE2/8K 50000 39641 ns/op 206.65 MB/s
+Search_Hard_CachedRE2/16K 20000 78174 ns/op 209.58 MB/s
+Search_Hard_CachedRE2/32K 10000 154236 ns/op 212.45 MB/s
+Search_Hard_CachedRE2/64K 5000 307131 ns/op 213.38 MB/s
+Search_Hard_CachedRE2/128K 2000 617929 ns/op 212.11 MB/s
+Search_Hard_CachedRE2/256K 1000 1235441 ns/op 212.19 MB/s
+Search_Hard_CachedRE2/512K 500 2465954 ns/op 212.61 MB/s
+Search_Hard_CachedRE2/1M 500 4943778 ns/op 212.10 MB/s
+Search_Hard_CachedRE2/2M 200 9957805 ns/op 210.60 MB/s
+Search_Hard_CachedRE2/4M 50 20109920 ns/op 208.57 MB/s
+Search_Hard_CachedRE2/8M 50 40249680 ns/op 208.41 MB/s
+Search_Hard_CachedRE2/16M 20 79626800 ns/op 210.70 MB/s
+Search_Parens_CachedPCRE/8 5000000 207 ns/op 38.46 MB/s
+Search_Parens_CachedRE2/8 5000000 460 ns/op 17.35 MB/s
+Search_Parens_CachedRE2/16 5000000 499 ns/op 32.01 MB/s
+Search_Parens_CachedRE2/32 2000000 566 ns/op 56.44 MB/s
+Search_Parens_CachedRE2/64 2000000 731 ns/op 87.44 MB/s
+Search_Parens_CachedRE2/128 1000000 1046 ns/op 122.35 MB/s
+Search_Parens_CachedRE2/256 1000000 1674 ns/op 152.87 MB/s
+Search_Parens_CachedRE2/512 500000 2889 ns/op 177.21 MB/s
+Search_Parens_CachedRE2/1K 200000 5456 ns/op 187.68 MB/s
+Search_Parens_CachedRE2/2K 100000 10527 ns/op 194.54 MB/s
+Search_Parens_CachedRE2/4K 50000 20632 ns/op 198.52 MB/s
+Search_Parens_CachedRE2/8K 50000 39791 ns/op 205.87 MB/s
+Search_Parens_CachedRE2/16K 20000 77748 ns/op 210.73 MB/s
+Search_Parens_CachedRE2/32K 10000 154317 ns/op 212.34 MB/s
+Search_Parens_CachedRE2/64K 5000 306631 ns/op 213.73 MB/s
+Search_Parens_CachedRE2/128K 2000 618071 ns/op 212.07 MB/s
+Search_Parens_CachedRE2/256K 1000 1231452 ns/op 212.87 MB/s
+Search_Parens_CachedRE2/512K 500 2463338 ns/op 212.84 MB/s
+Search_Parens_CachedRE2/1M 500 4945594 ns/op 212.02 MB/s
+Search_Parens_CachedRE2/2M 100 10028120 ns/op 209.13 MB/s
+Search_Parens_CachedRE2/4M 50 20201820 ns/op 207.62 MB/s
+Search_Parens_CachedRE2/8M 50 40668120 ns/op 206.27 MB/s
+Search_Parens_CachedRE2/16M 20 80655350 ns/op 208.01 MB/s
+Search_BigFixed_CachedPCRE/8 5000000 285 ns/op 28.06 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 371 ns/op 43.10 MB/s
+Search_BigFixed_CachedPCRE/32 2000000 544 ns/op 58.77 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 891 ns/op 71.75 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1599 ns/op 80.04 MB/s
+Search_BigFixed_CachedPCRE/256 500000 2995 ns/op 85.46 MB/s
+Search_BigFixed_CachedPCRE/512 200000 5724 ns/op 89.44 MB/s
+Search_BigFixed_CachedPCRE/1K 100000 11311 ns/op 90.53 MB/s
+Search_BigFixed_CachedPCRE/2K 50000 22347 ns/op 91.65 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 44379 ns/op 92.29 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 87509 ns/op 93.61 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 175594 ns/op 93.31 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 352953 ns/op 92.84 MB/s
+Search_BigFixed_CachedRE2/8 10000000 164 ns/op 48.68 MB/s
+Search_BigFixed_CachedRE2/16 5000000 487 ns/op 32.84 MB/s
+Search_BigFixed_CachedRE2/32 2000000 539 ns/op 59.28 MB/s
+Search_BigFixed_CachedRE2/64 2000000 612 ns/op 104.53 MB/s
+Search_BigFixed_CachedRE2/128 2000000 781 ns/op 163.87 MB/s
+Search_BigFixed_CachedRE2/256 1000000 1082 ns/op 236.58 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1689 ns/op 303.00 MB/s
+Search_BigFixed_CachedRE2/1K 500000 2924 ns/op 350.17 MB/s
+Search_BigFixed_CachedRE2/2K 200000 5753 ns/op 355.99 MB/s
+Search_BigFixed_CachedRE2/4K 100000 10436 ns/op 392.46 MB/s
+Search_BigFixed_CachedRE2/8K 50000 20223 ns/op 405.08 MB/s
+Search_BigFixed_CachedRE2/16K 50000 40733 ns/op 402.23 MB/s
+Search_BigFixed_CachedRE2/32K 20000 80342 ns/op 407.86 MB/s
+Search_BigFixed_CachedRE2/64K 10000 159585 ns/op 410.66 MB/s
+Search_BigFixed_CachedRE2/128K 5000 320376 ns/op 409.12 MB/s
+Search_BigFixed_CachedRE2/256K 2000 641718 ns/op 408.50 MB/s
+Search_BigFixed_CachedRE2/512K 1000 1290373 ns/op 406.31 MB/s
+Search_BigFixed_CachedRE2/1M 500 2638566 ns/op 397.40 MB/s
+Search_Success_PCRE/8 500000 3393 ns/op 2.36 MB/s
+Search_Success_PCRE/16 500000 3469 ns/op 4.61 MB/s
+Search_Success_PCRE/32 500000 3499 ns/op 9.15 MB/s
+Search_Success_PCRE/64 500000 3848 ns/op 16.63 MB/s
+Search_Success_PCRE/128 500000 4582 ns/op 27.93 MB/s
+Search_Success_PCRE/256 200000 5678 ns/op 45.08 MB/s
+Search_Success_PCRE/512 200000 8267 ns/op 61.93 MB/s
+Search_Success_PCRE/1K 100000 13341 ns/op 76.75 MB/s
+Search_Success_PCRE/2K 50000 23974 ns/op 85.42 MB/s
+Search_Success_PCRE/4K 50000 44459 ns/op 92.13 MB/s
+Search_Success_PCRE/8K 20000 87665 ns/op 93.45 MB/s
+Search_Success_PCRE/16K 10000 174412 ns/op 93.94 MB/s
+Search_Success_PCRE/32K 5000 348685 ns/op 93.98 MB/s
+Search_Success_PCRE/64K 2000 695853 ns/op 94.18 MB/s
+Search_Success_PCRE/128K 1000 1382530 ns/op 94.81 MB/s
+Search_Success_PCRE/256K 500 2777966 ns/op 94.37 MB/s
+Search_Success_PCRE/512K 200 5622585 ns/op 93.25 MB/s
+Search_Success_PCRE/1M 100 11355970 ns/op 92.34 MB/s
+Search_Success_PCRE/2M 50 23359260 ns/op 89.78 MB/s
+Search_Success_PCRE/4M 20 50359900 ns/op 83.29 MB/s
+Search_Success_PCRE/8M 10 111431900 ns/op 75.28 MB/s
+Search_Success_PCRE/16M 5 265918600 ns/op 63.09 MB/s
+Search_Success_RE2/8 100000 16060 ns/op 0.50 MB/s
+Search_Success_RE2/16 50000 34580 ns/op 0.46 MB/s
+Search_Success_RE2/32 50000 35094 ns/op 0.91 MB/s
+Search_Success_RE2/64 50000 35110 ns/op 1.82 MB/s
+Search_Success_RE2/128 50000 35001 ns/op 3.66 MB/s
+Search_Success_RE2/256 50000 35354 ns/op 7.24 MB/s
+Search_Success_RE2/512 50000 36899 ns/op 13.88 MB/s
+Search_Success_RE2/1K 50000 39012 ns/op 26.25 MB/s
+Search_Success_RE2/2K 50000 42906 ns/op 47.73 MB/s
+Search_Success_RE2/4K 20000 53136 ns/op 77.08 MB/s
+Search_Success_RE2/8K 20000 72624 ns/op 112.80 MB/s
+Search_Success_RE2/16K 10000 112251 ns/op 145.96 MB/s
+Search_Success_RE2/32K 10000 189404 ns/op 173.01 MB/s
+Search_Success_RE2/64K 5000 345391 ns/op 189.74 MB/s
+Search_Success_RE2/128K 2000 651836 ns/op 201.08 MB/s
+Search_Success_RE2/256K 1000 1265262 ns/op 207.19 MB/s
+Search_Success_RE2/512K 500 2516902 ns/op 208.31 MB/s
+Search_Success_RE2/1M 200 5097685 ns/op 205.70 MB/s
+Search_Success_RE2/2M 100 10551640 ns/op 198.75 MB/s
+Search_Success_RE2/4M 50 22130760 ns/op 189.52 MB/s
+Search_Success_RE2/8M 20 51212750 ns/op 163.80 MB/s
+Search_Success_RE2/16M 10 125281500 ns/op 133.92 MB/s
+Search_Success_CachedPCRE/8 5000000 276 ns/op 28.97 MB/s
+Search_Success_CachedPCRE/16 5000000 354 ns/op 45.15 MB/s
+Search_Success_CachedPCRE/32 2000000 515 ns/op 62.03 MB/s
+Search_Success_CachedPCRE/64 2000000 823 ns/op 77.73 MB/s
+Search_Success_CachedPCRE/128 1000000 1470 ns/op 87.05 MB/s
+Search_Success_CachedPCRE/256 500000 2739 ns/op 93.46 MB/s
+Search_Success_CachedPCRE/512 200000 5254 ns/op 97.44 MB/s
+Search_Success_CachedPCRE/1K 100000 10228 ns/op 100.11 MB/s
+Search_Success_CachedPCRE/2K 50000 20449 ns/op 100.15 MB/s
+Search_Success_CachedPCRE/4K 50000 41084 ns/op 99.70 MB/s
+Search_Success_CachedPCRE/8K 20000 84617 ns/op 96.81 MB/s
+Search_Success_CachedPCRE/16K 10000 168594 ns/op 97.18 MB/s
+Search_Success_CachedPCRE/32K 5000 339675 ns/op 96.47 MB/s
+Search_Success_CachedPCRE/64K 2000 682138 ns/op 96.07 MB/s
+Search_Success_CachedPCRE/128K 1000 1373131 ns/op 95.45 MB/s
+Search_Success_CachedPCRE/256K 500 2767366 ns/op 94.73 MB/s
+Search_Success_CachedPCRE/512K 200 5562225 ns/op 94.26 MB/s
+Search_Success_CachedPCRE/1M 100 11188570 ns/op 93.72 MB/s
+Search_Success_CachedPCRE/2M 50 23191460 ns/op 90.43 MB/s
+Search_Success_CachedPCRE/4M 20 50011200 ns/op 83.87 MB/s
+Search_Success_CachedPCRE/8M 10 111201800 ns/op 75.44 MB/s
+Search_Success_CachedPCRE/16M 5 266875000 ns/op 62.87 MB/s
+Search_Success_CachedRE2/8 10000000 183 ns/op 43.67 MB/s
+Search_Success_CachedRE2/16 5000000 491 ns/op 32.56 MB/s
+Search_Success_CachedRE2/32 2000000 582 ns/op 54.98 MB/s
+Search_Success_CachedRE2/64 2000000 738 ns/op 86.61 MB/s
+Search_Success_CachedRE2/128 1000000 1043 ns/op 122.69 MB/s
+Search_Success_CachedRE2/256 1000000 1623 ns/op 157.70 MB/s
+Search_Success_CachedRE2/512 500000 2854 ns/op 179.39 MB/s
+Search_Success_CachedRE2/1K 200000 5165 ns/op 198.23 MB/s
+Search_Success_CachedRE2/2K 100000 10648 ns/op 192.32 MB/s
+Search_Success_CachedRE2/4K 50000 20892 ns/op 196.05 MB/s
+Search_Success_CachedRE2/8K 50000 38909 ns/op 210.54 MB/s
+Search_Success_CachedRE2/16K 20000 76762 ns/op 213.44 MB/s
+Search_Success_CachedRE2/32K 10000 153917 ns/op 212.89 MB/s
+Search_Success_CachedRE2/64K 5000 307908 ns/op 212.84 MB/s
+Search_Success_CachedRE2/128K 2000 610789 ns/op 214.59 MB/s
+Search_Success_CachedRE2/256K 1000 1228572 ns/op 213.37 MB/s
+Search_Success_CachedRE2/512K 500 2467884 ns/op 212.44 MB/s
+Search_Success_CachedRE2/1M 200 5100045 ns/op 205.60 MB/s
+Search_Success_CachedRE2/2M 100 10388080 ns/op 201.88 MB/s
+Search_Success_CachedRE2/4M 50 22091760 ns/op 189.86 MB/s
+Search_Success_CachedRE2/8M 20 51066600 ns/op 164.27 MB/s
+Search_Success_CachedRE2/16M 10 124756300 ns/op 134.48 MB/s
+Search_Success1_PCRE/8 500000 3329 ns/op 2.40 MB/s
+Search_Success1_PCRE/16 500000 3422 ns/op 4.68 MB/s
+Search_Success1_PCRE/32 500000 3562 ns/op 8.98 MB/s
+Search_Success1_PCRE/64 500000 3875 ns/op 16.51 MB/s
+Search_Success1_PCRE/128 500000 4487 ns/op 28.52 MB/s
+Search_Success1_PCRE/256 200000 5781 ns/op 44.28 MB/s
+Search_Success1_PCRE/512 200000 8232 ns/op 62.20 MB/s
+Search_Success1_PCRE/1K 100000 13396 ns/op 76.44 MB/s
+Search_Success1_PCRE/2K 50000 24063 ns/op 85.11 MB/s
+Search_Success1_PCRE/4K 50000 44662 ns/op 91.71 MB/s
+Search_Success1_PCRE/8K 20000 87800 ns/op 93.30 MB/s
+Search_Success1_PCRE/16K 10000 173248 ns/op 94.57 MB/s
+Search_Success1_PCRE/32K 5000 345953 ns/op 94.72 MB/s
+Search_Success1_PCRE/64K 2000 690898 ns/op 94.86 MB/s
+Search_Success1_PCRE/128K 1000 1380064 ns/op 94.98 MB/s
+Search_Success1_PCRE/256K 500 2756944 ns/op 95.08 MB/s
+Search_Success1_PCRE/512K 200 5554180 ns/op 94.40 MB/s
+Search_Success1_PCRE/1M 100 11227360 ns/op 93.39 MB/s
+Search_Success1_PCRE/2M 50 23068500 ns/op 90.91 MB/s
+Search_Success1_PCRE/4M 50 46455720 ns/op 90.29 MB/s
+Search_Success1_PCRE/8M 10 112184900 ns/op 74.77 MB/s
+Search_Success1_PCRE/16M 5 267271800 ns/op 62.77 MB/s
+Search_Success1_RE2/8 50000 47078 ns/op 0.17 MB/s
+Search_Success1_RE2/16 50000 46927 ns/op 0.34 MB/s
+Search_Success1_RE2/32 50000 46852 ns/op 0.68 MB/s
+Search_Success1_RE2/64 50000 47478 ns/op 1.35 MB/s
+Search_Success1_RE2/128 50000 47471 ns/op 2.70 MB/s
+Search_Success1_RE2/256 50000 47911 ns/op 5.34 MB/s
+Search_Success1_RE2/512 50000 48982 ns/op 10.45 MB/s
+Search_Success1_RE2/1K 20000 50955 ns/op 20.10 MB/s
+Search_Success1_RE2/2K 20000 55280 ns/op 37.05 MB/s
+Search_Success1_RE2/4K 20000 65176 ns/op 62.84 MB/s
+Search_Success1_RE2/8K 20000 84613 ns/op 96.82 MB/s
+Search_Success1_RE2/16K 10000 125384 ns/op 130.67 MB/s
+Search_Success1_RE2/32K 5000 200634 ns/op 163.32 MB/s
+Search_Success1_RE2/64K 5000 352274 ns/op 186.04 MB/s
+Search_Success1_RE2/128K 2000 655683 ns/op 199.90 MB/s
+Search_Success1_RE2/256K 1000 1289421 ns/op 203.30 MB/s
+Search_Success1_RE2/512K 500 2514970 ns/op 208.47 MB/s
+Search_Success1_RE2/1M 200 5109155 ns/op 205.23 MB/s
+Search_Success1_RE2/2M 100 10655670 ns/op 196.81 MB/s
+Search_Success1_RE2/4M 50 22707220 ns/op 184.71 MB/s
+Search_Success1_RE2/8M 20 50906850 ns/op 164.78 MB/s
+Search_Success1_RE2/16M 10 125901300 ns/op 133.26 MB/s
+Search_Success1_Cached_PCRE/8 5000000 308 ns/op 25.89 MB/s
+Search_Success1_Cached_PCRE/16 5000000 390 ns/op 40.98 MB/s
+Search_Success1_Cached_PCRE/32 2000000 556 ns/op 57.51 MB/s
+Search_Success1_Cached_PCRE/64 2000000 862 ns/op 74.24 MB/s
+Search_Success1_Cached_PCRE/128 1000000 1585 ns/op 80.72 MB/s
+Search_Success1_Cached_PCRE/256 500000 2772 ns/op 92.34 MB/s
+Search_Success1_Cached_PCRE/512 200000 5261 ns/op 97.31 MB/s
+Search_Success1_Cached_PCRE/1K 100000 10302 ns/op 99.40 MB/s
+Search_Success1_Cached_PCRE/2K 50000 20828 ns/op 98.33 MB/s
+Search_Success1_Cached_PCRE/4K 50000 41370 ns/op 99.01 MB/s
+Search_Success1_Cached_PCRE/8K 20000 84354 ns/op 97.11 MB/s
+Search_Success1_Cached_PCRE/16K 10000 170170 ns/op 96.28 MB/s
+Search_Success1_Cached_PCRE/32K 5000 342755 ns/op 95.60 MB/s
+Search_Success1_Cached_PCRE/64K 2000 688438 ns/op 95.20 MB/s
+Search_Success1_Cached_PCRE/128K 1000 1372324 ns/op 95.51 MB/s
+Search_Success1_Cached_PCRE/256K 500 2771422 ns/op 94.59 MB/s
+Search_Success1_Cached_PCRE/512K 200 5608635 ns/op 93.48 MB/s
+Search_Success1_Cached_PCRE/1M 100 11354700 ns/op 92.35 MB/s
+Search_Success1_Cached_PCRE/2M 50 23295740 ns/op 90.02 MB/s
+Search_Success1_Cached_PCRE/4M 20 50142650 ns/op 83.65 MB/s
+Search_Success1_Cached_PCRE/8M 10 111720200 ns/op 75.09 MB/s
+Search_Success1_Cached_PCRE/16M 5 269077800 ns/op 62.35 MB/s
+Search_Success1_Cached_RE2/8 5000000 461 ns/op 17.35 MB/s
+Search_Success1_Cached_RE2/16 2000000 503 ns/op 31.76 MB/s
+Search_Success1_Cached_RE2/32 2000000 579 ns/op 55.25 MB/s
+Search_Success1_Cached_RE2/64 2000000 739 ns/op 86.50 MB/s
+Search_Success1_Cached_RE2/128 1000000 1033 ns/op 123.83 MB/s
+Search_Success1_Cached_RE2/256 1000000 1643 ns/op 155.77 MB/s
+Search_Success1_Cached_RE2/512 500000 2869 ns/op 178.40 MB/s
+Search_Success1_Cached_RE2/1K 200000 5099 ns/op 200.79 MB/s
+Search_Success1_Cached_RE2/2K 100000 10309 ns/op 198.64 MB/s
+Search_Success1_Cached_RE2/4K 100000 19360 ns/op 211.57 MB/s
+Search_Success1_Cached_RE2/8K 50000 38961 ns/op 210.26 MB/s
+Search_Success1_Cached_RE2/16K 20000 78081 ns/op 209.83 MB/s
+Search_Success1_Cached_RE2/32K 10000 154337 ns/op 212.31 MB/s
+Search_Success1_Cached_RE2/64K 5000 306992 ns/op 213.48 MB/s
+Search_Success1_Cached_RE2/128K 2000 609073 ns/op 215.20 MB/s
+Search_Success1_Cached_RE2/256K 1000 1226916 ns/op 213.66 MB/s
+Search_Success1_Cached_RE2/512K 500 2486650 ns/op 210.84 MB/s
+Search_Success1_Cached_RE2/1M 200 5026605 ns/op 208.61 MB/s
+Search_Success1_Cached_RE2/2M 100 10540280 ns/op 198.97 MB/s
+Search_Success1_Cached_RE2/4M 50 22296140 ns/op 188.12 MB/s
+Search_Success1_Cached_RE2/8M 20 51183250 ns/op 163.89 MB/s
+Search_Success1_Cached_RE2/16M 10 125691100 ns/op 133.48 MB/s
+Search_Digits_PCRE 200000 7096 ns/op
+Search_Digits_RE2 50000 37491 ns/op
+Parse_Digits_PCRE 200000 7325 ns/op
+Parse_Digits_RE2 100000 19423 ns/op
+Parse_CachedDigits_PCRE 2000000 596 ns/op
+Parse_CachedDigits_RE2 5000000 325 ns/op
+Parse_DigitDs_PCRE 200000 6459 ns/op
+Parse_DigitDs_RE2 100000 19040 ns/op
+Parse_CachedDigitDs_PCRE 2000000 591 ns/op
+Parse_CachedDigitDs_RE2 5000000 334 ns/op
+Parse_Split_PCRE 500000 4865 ns/op
+Parse_Split_RE2 50000 20898 ns/op
+Parse_CachedSplit_PCRE 5000000 424 ns/op
+Parse_CachedSplit_RE2 5000000 237 ns/op
+Parse_SplitHard_PCRE 500000 4821 ns/op
+Parse_SplitHard_RE2 50000 25920 ns/op
+Parse_CachedSplitHard_PCRE 5000000 422 ns/op
+Parse_CachedSplitHard_RE2 500000 2340 ns/op
+Parse_CachedSplitBig1_PCRE 200 5460640 ns/op
+Parse_CachedSplitBig1_RE2 2000 935880 ns/op
+Parse_CachedSplitBig2_PCRE 1000 1050260 ns/op
+Parse_CachedSplitBig2_RE2 10 100186200 ns/op
+BM_PCRE_Compile 200000 5937 ns/op
+BM_RE2_Compile 50000 22091 ns/op
+SearchPhone_CachedPCRE/8 1000000 1520 ns/op 5.26 MB/s
+SearchPhone_CachedPCRE/16 500000 2461 ns/op 6.50 MB/s
+SearchPhone_CachedPCRE/32 500000 4142 ns/op 7.72 MB/s
+SearchPhone_CachedPCRE/64 200000 7477 ns/op 8.56 MB/s
+SearchPhone_CachedPCRE/128 100000 14151 ns/op 9.04 MB/s
+SearchPhone_CachedPCRE/256 50000 27740 ns/op 9.23 MB/s
+SearchPhone_CachedPCRE/512 20000 55556 ns/op 9.22 MB/s
+SearchPhone_CachedPCRE/1K 10000 109542 ns/op 9.35 MB/s
+SearchPhone_CachedPCRE/2K 5000 213707 ns/op 9.58 MB/s
+SearchPhone_CachedPCRE/4K 5000 423086 ns/op 9.68 MB/s
+SearchPhone_CachedPCRE/8K 2000 854898 ns/op 9.58 MB/s
+SearchPhone_CachedPCRE/16K 1000 1699907 ns/op 9.64 MB/s
+SearchPhone_CachedPCRE/32K 500 3411732 ns/op 9.60 MB/s
+SearchPhone_CachedPCRE/64K 200 6718010 ns/op 9.76 MB/s
+SearchPhone_CachedPCRE/128K 100 13504430 ns/op 9.71 MB/s
+SearchPhone_CachedPCRE/256K 50 27150480 ns/op 9.66 MB/s
+SearchPhone_CachedPCRE/512K 20 54088550 ns/op 9.69 MB/s
+SearchPhone_CachedPCRE/1M 10 107855400 ns/op 9.72 MB/s
+SearchPhone_CachedPCRE/2M 5 216948400 ns/op 9.67 MB/s
+SearchPhone_CachedPCRE/4M 5 432028400 ns/op 9.71 MB/s
+SearchPhone_CachedPCRE/8M 2 867550000 ns/op 9.67 MB/s
+SearchPhone_CachedPCRE/16M 1 1732859000 ns/op 9.68 MB/s
+SearchPhone_CachedRE2/8 1000000 1253 ns/op 6.38 MB/s
+SearchPhone_CachedRE2/16 1000000 1300 ns/op 12.30 MB/s
+SearchPhone_CachedRE2/32 1000000 1379 ns/op 23.20 MB/s
+SearchPhone_CachedRE2/64 1000000 1569 ns/op 40.77 MB/s
+SearchPhone_CachedRE2/128 1000000 1875 ns/op 68.24 MB/s
+SearchPhone_CachedRE2/256 500000 2460 ns/op 104.05 MB/s
+SearchPhone_CachedRE2/512 500000 3629 ns/op 141.08 MB/s
+SearchPhone_CachedRE2/1K 200000 5971 ns/op 171.49 MB/s
+SearchPhone_CachedRE2/2K 100000 10981 ns/op 186.50 MB/s
+SearchPhone_CachedRE2/4K 50000 20502 ns/op 199.78 MB/s
+SearchPhone_CachedRE2/8K 50000 39182 ns/op 209.07 MB/s
+SearchPhone_CachedRE2/16K 20000 77462 ns/op 211.51 MB/s
+SearchPhone_CachedRE2/32K 10000 154502 ns/op 212.09 MB/s
+SearchPhone_CachedRE2/64K 5000 307476 ns/op 213.14 MB/s
+SearchPhone_CachedRE2/128K 2000 611231 ns/op 214.44 MB/s
+SearchPhone_CachedRE2/256K 1000 1224134 ns/op 214.15 MB/s
+SearchPhone_CachedRE2/512K 500 2450828 ns/op 213.92 MB/s
+SearchPhone_CachedRE2/1M 500 4939050 ns/op 212.30 MB/s
+SearchPhone_CachedRE2/2M 200 9875035 ns/op 212.37 MB/s
+SearchPhone_CachedRE2/4M 50 20061240 ns/op 209.08 MB/s
+SearchPhone_CachedRE2/8M 50 39959540 ns/op 209.93 MB/s
+SearchPhone_CachedRE2/16M 20 79246550 ns/op 211.71 MB/s
+EmptyPartialMatchPCRE 10000000 139 ns/op
+EmptyPartialMatchRE2 5000000 423 ns/op
+SimplePartialMatchPCRE 10000000 201 ns/op
+SimplePartialMatchRE2 5000000 464 ns/op
+HTTPPartialMatchPCRE 2000000 640 ns/op
+HTTPPartialMatchRE2 1000000 1026 ns/op
+SmallHTTPPartialMatchPCRE 2000000 636 ns/op
+SmallHTTPPartialMatchRE2 1000000 1023 ns/op
+DotMatchPCRE 2000000 847 ns/op
+DotMatchRE2 1000000 1055 ns/op
+ASCIIMatchPCRE 5000000 470 ns/op
+ASCIIMatchRE2 1000000 1051 ns/op
+==BENCHMARK== wreck.mtv.corp.google.com Fri Feb 26 16:59:13 PST 2010
+# Darwin wreck.mtv.corp.google.com 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:55:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_I386 i386
+# i686-apple-darwin9-g++-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5484)
+# Copyright (C) 2005 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions. There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# a94585d91e66 tip
+# obj/test/regexp_benchmark: Mach-O executable i386
+
+Search_Easy0_CachedPCRE/8 20000000 143 ns/op 55.84 MB/s
+Search_Easy0_CachedPCRE/16 10000000 185 ns/op 86.27 MB/s
+Search_Easy0_CachedPCRE/32 10000000 263 ns/op 121.33 MB/s
+Search_Easy0_CachedPCRE/64 5000000 425 ns/op 150.51 MB/s
+Search_Easy0_CachedPCRE/128 2000000 770 ns/op 166.11 MB/s
+Search_Easy0_CachedPCRE/256 1000000 1415 ns/op 180.86 MB/s
+Search_Easy0_CachedPCRE/512 500000 3141 ns/op 162.97 MB/s
+Search_Easy0_CachedPCRE/1K 500000 6420 ns/op 159.49 MB/s
+Search_Easy0_CachedPCRE/2K 200000 11854 ns/op 172.76 MB/s
+Search_Easy0_CachedPCRE/4K 100000 24056 ns/op 170.27 MB/s
+Search_Easy0_CachedPCRE/8K 50000 47702 ns/op 171.73 MB/s
+Search_Easy0_CachedPCRE/16K 20000 94800 ns/op 172.83 MB/s
+Search_Easy0_CachedPCRE/32K 10000 188696 ns/op 173.65 MB/s
+Search_Easy0_CachedPCRE/64K 5000 375648 ns/op 174.46 MB/s
+Search_Easy0_CachedPCRE/128K 2000 750820 ns/op 174.57 MB/s
+Search_Easy0_CachedPCRE/256K 1000 1501631 ns/op 174.57 MB/s
+Search_Easy0_CachedPCRE/512K 500 3041566 ns/op 172.37 MB/s
+Search_Easy0_CachedPCRE/1M 500 5961312 ns/op 175.90 MB/s
+Search_Easy0_CachedPCRE/2M 100 12012730 ns/op 174.58 MB/s
+Search_Easy0_CachedPCRE/4M 100 24195970 ns/op 173.35 MB/s
+Search_Easy0_CachedPCRE/8M 50 48470420 ns/op 173.07 MB/s
+Search_Easy0_CachedPCRE/16M 20 97183200 ns/op 172.63 MB/s
+Search_Easy0_CachedRE2/8 5000000 430 ns/op 18.58 MB/s
+Search_Easy0_CachedRE2/16 5000000 443 ns/op 36.04 MB/s
+Search_Easy0_CachedRE2/32 5000000 467 ns/op 68.43 MB/s
+Search_Easy0_CachedRE2/64 5000000 523 ns/op 122.32 MB/s
+Search_Easy0_CachedRE2/128 5000000 604 ns/op 211.72 MB/s
+Search_Easy0_CachedRE2/256 2000000 789 ns/op 324.09 MB/s
+Search_Easy0_CachedRE2/512 1000000 1166 ns/op 438.91 MB/s
+Search_Easy0_CachedRE2/1K 1000000 1984 ns/op 515.90 MB/s
+Search_Easy0_CachedRE2/2K 500000 3565 ns/op 574.46 MB/s
+Search_Easy0_CachedRE2/4K 500000 6845 ns/op 598.33 MB/s
+Search_Easy0_CachedRE2/8K 200000 13387 ns/op 611.90 MB/s
+Search_Easy0_CachedRE2/16K 100000 26446 ns/op 619.52 MB/s
+Search_Easy0_CachedRE2/32K 50000 51345 ns/op 638.19 MB/s
+Search_Easy0_CachedRE2/64K 10000 102368 ns/op 640.20 MB/s
+Search_Easy0_CachedRE2/128K 10000 203304 ns/op 644.71 MB/s
+Search_Easy0_CachedRE2/256K 5000 405765 ns/op 646.05 MB/s
+Search_Easy0_CachedRE2/512K 2000 810785 ns/op 646.64 MB/s
+Search_Easy0_CachedRE2/1M 1000 1649854 ns/op 635.56 MB/s
+Search_Easy0_CachedRE2/2M 500 3268662 ns/op 641.59 MB/s
+Search_Easy0_CachedRE2/4M 500 6628094 ns/op 632.81 MB/s
+Search_Easy0_CachedRE2/8M 100 13442320 ns/op 624.04 MB/s
+Search_Easy0_CachedRE2/16M 50 27306780 ns/op 614.40 MB/s
+Search_Easy1_CachedPCRE/8 20000000 143 ns/op 55.63 MB/s
+Search_Easy1_CachedPCRE/16 10000000 182 ns/op 87.52 MB/s
+Search_Easy1_CachedPCRE/32 10000000 265 ns/op 120.60 MB/s
+Search_Easy1_CachedPCRE/64 5000000 426 ns/op 150.14 MB/s
+Search_Easy1_CachedPCRE/128 2000000 776 ns/op 164.74 MB/s
+Search_Easy1_CachedPCRE/256 1000000 1414 ns/op 180.99 MB/s
+Search_Easy1_CachedPCRE/512 1000000 2889 ns/op 177.17 MB/s
+Search_Easy1_CachedPCRE/1K 500000 6111 ns/op 167.55 MB/s
+Search_Easy1_CachedPCRE/2K 200000 12463 ns/op 164.32 MB/s
+Search_Easy1_CachedPCRE/4K 100000 24610 ns/op 166.43 MB/s
+Search_Easy1_CachedPCRE/8K 50000 49456 ns/op 165.64 MB/s
+Search_Easy1_CachedPCRE/16K 20000 97720 ns/op 167.66 MB/s
+Search_Easy1_CachedPCRE/32K 10000 196508 ns/op 166.75 MB/s
+Search_Easy1_CachedPCRE/64K 5000 385132 ns/op 170.16 MB/s
+Search_Easy1_CachedPCRE/128K 2000 771133 ns/op 169.97 MB/s
+Search_Easy1_CachedPCRE/256K 1000 1547561 ns/op 169.39 MB/s
+Search_Easy1_CachedPCRE/512K 500 3083398 ns/op 170.04 MB/s
+Search_Easy1_CachedPCRE/1M 500 6178714 ns/op 169.71 MB/s
+Search_Easy1_CachedPCRE/2M 100 12357130 ns/op 169.71 MB/s
+Search_Easy1_CachedPCRE/4M 100 24767250 ns/op 169.35 MB/s
+Search_Easy1_CachedPCRE/8M 50 50543820 ns/op 165.97 MB/s
+Search_Easy1_CachedPCRE/16M 20 100643550 ns/op 166.70 MB/s
+Search_Easy1_CachedRE2/8 5000000 439 ns/op 18.18 MB/s
+Search_Easy1_CachedRE2/16 5000000 446 ns/op 35.87 MB/s
+Search_Easy1_CachedRE2/32 5000000 468 ns/op 68.24 MB/s
+Search_Easy1_CachedRE2/64 5000000 519 ns/op 123.23 MB/s
+Search_Easy1_CachedRE2/128 5000000 611 ns/op 209.37 MB/s
+Search_Easy1_CachedRE2/256 2000000 787 ns/op 324.89 MB/s
+Search_Easy1_CachedRE2/512 1000000 1176 ns/op 435.25 MB/s
+Search_Easy1_CachedRE2/1K 1000000 1969 ns/op 519.86 MB/s
+Search_Easy1_CachedRE2/2K 500000 3572 ns/op 573.31 MB/s
+Search_Easy1_CachedRE2/4K 500000 6911 ns/op 592.63 MB/s
+Search_Easy1_CachedRE2/8K 200000 13437 ns/op 609.63 MB/s
+Search_Easy1_CachedRE2/16K 100000 26382 ns/op 621.02 MB/s
+Search_Easy1_CachedRE2/32K 50000 52112 ns/op 628.80 MB/s
+Search_Easy1_CachedRE2/64K 10000 102128 ns/op 641.70 MB/s
+Search_Easy1_CachedRE2/128K 10000 203580 ns/op 643.84 MB/s
+Search_Easy1_CachedRE2/256K 5000 408200 ns/op 642.19 MB/s
+Search_Easy1_CachedRE2/512K 2000 816006 ns/op 642.51 MB/s
+Search_Easy1_CachedRE2/1M 1000 1630582 ns/op 643.07 MB/s
+Search_Easy1_CachedRE2/2M 500 3315480 ns/op 632.53 MB/s
+Search_Easy1_CachedRE2/4M 500 6623626 ns/op 633.23 MB/s
+Search_Easy1_CachedRE2/8M 100 13362480 ns/op 627.77 MB/s
+Search_Easy1_CachedRE2/16M 100 26699900 ns/op 628.36 MB/s
+Search_Medium_CachedPCRE/8 20000000 144 ns/op 55.23 MB/s
+Search_Medium_CachedPCRE/16 10000000 188 ns/op 85.00 MB/s
+Search_Medium_CachedPCRE/32 10000000 274 ns/op 116.45 MB/s
+Search_Medium_CachedPCRE/64 5000000 446 ns/op 143.18 MB/s
+Search_Medium_CachedPCRE/128 500000 5989 ns/op 21.37 MB/s
+Search_Medium_CachedPCRE/256 200000 11152 ns/op 22.96 MB/s
+Search_Medium_CachedPCRE/512 100000 26444 ns/op 19.36 MB/s
+Search_Medium_CachedPCRE/1K 50000 51772 ns/op 19.78 MB/s
+Search_Medium_CachedPCRE/2K 20000 83901 ns/op 24.41 MB/s
+Search_Medium_CachedPCRE/4K 10000 201033 ns/op 20.37 MB/s
+Search_Medium_CachedPCRE/8K 5000 410276 ns/op 19.97 MB/s
+Search_Medium_CachedPCRE/16K 2000 824703 ns/op 19.87 MB/s
+Search_Medium_CachedPCRE/32K 1000 1654099 ns/op 19.81 MB/s
+Search_Medium_CachedPCRE/64K 500 3345594 ns/op 19.59 MB/s
+Search_Medium_CachedPCRE/128K 500 6597588 ns/op 19.87 MB/s
+Search_Medium_CachedPCRE/256K 100 13204280 ns/op 19.85 MB/s
+Search_Medium_CachedRE2/8 5000000 447 ns/op 17.88 MB/s
+Search_Medium_CachedRE2/16 5000000 488 ns/op 32.78 MB/s
+Search_Medium_CachedRE2/32 5000000 565 ns/op 56.60 MB/s
+Search_Medium_CachedRE2/64 5000000 711 ns/op 90.00 MB/s
+Search_Medium_CachedRE2/128 1000000 1027 ns/op 124.61 MB/s
+Search_Medium_CachedRE2/256 1000000 1632 ns/op 156.77 MB/s
+Search_Medium_CachedRE2/512 1000000 2826 ns/op 181.15 MB/s
+Search_Medium_CachedRE2/1K 500000 5336 ns/op 191.89 MB/s
+Search_Medium_CachedRE2/2K 200000 10524 ns/op 194.59 MB/s
+Search_Medium_CachedRE2/4K 100000 20398 ns/op 200.80 MB/s
+Search_Medium_CachedRE2/8K 50000 38371 ns/op 213.49 MB/s
+Search_Medium_CachedRE2/16K 20000 75467 ns/op 217.10 MB/s
+Search_Medium_CachedRE2/32K 10000 150407 ns/op 217.86 MB/s
+Search_Medium_CachedRE2/64K 5000 300663 ns/op 217.97 MB/s
+Search_Medium_CachedRE2/128K 5000 600814 ns/op 218.16 MB/s
+Search_Medium_CachedRE2/256K 2000 1212538 ns/op 216.19 MB/s
+Search_Medium_CachedRE2/512K 1000 2408767 ns/op 217.66 MB/s
+Search_Medium_CachedRE2/1M 500 4816914 ns/op 217.69 MB/s
+Search_Medium_CachedRE2/2M 200 9658095 ns/op 217.14 MB/s
+Search_Medium_CachedRE2/4M 100 19816050 ns/op 211.66 MB/s
+Search_Medium_CachedRE2/8M 50 39373200 ns/op 213.05 MB/s
+Search_Medium_CachedRE2/16M 20 78759400 ns/op 213.02 MB/s
+Search_Hard_CachedPCRE/8 20000000 143 ns/op 55.68 MB/s
+Search_Hard_CachedPCRE/16 10000000 188 ns/op 84.70 MB/s
+Search_Hard_CachedPCRE/32 10000000 276 ns/op 115.88 MB/s
+Search_Hard_CachedPCRE/64 5000000 447 ns/op 143.08 MB/s
+Search_Hard_CachedPCRE/128 10000 225891 ns/op 0.57 MB/s
+Search_Hard_CachedPCRE/256 2000 869631 ns/op 0.29 MB/s
+Search_Hard_CachedPCRE/512 500 3629904 ns/op 0.14 MB/s
+Search_Hard_CachedPCRE/1K 100 14249010 ns/op 0.07 MB/s
+Search_Hard_CachedPCRE/2K 50 53816760 ns/op 0.04 MB/s
+Search_Hard_CachedPCRE/4K 10 227514600 ns/op 0.02 MB/s
+Search_Hard_CachedRE2/8 5000000 448 ns/op 17.83 MB/s
+Search_Hard_CachedRE2/16 5000000 487 ns/op 32.85 MB/s
+Search_Hard_CachedRE2/32 5000000 557 ns/op 57.41 MB/s
+Search_Hard_CachedRE2/64 5000000 699 ns/op 91.50 MB/s
+Search_Hard_CachedRE2/128 1000000 1009 ns/op 126.74 MB/s
+Search_Hard_CachedRE2/256 1000000 1604 ns/op 159.57 MB/s
+Search_Hard_CachedRE2/512 1000000 2810 ns/op 182.14 MB/s
+Search_Hard_CachedRE2/1K 500000 5294 ns/op 193.41 MB/s
+Search_Hard_CachedRE2/2K 200000 10504 ns/op 194.97 MB/s
+Search_Hard_CachedRE2/4K 100000 20510 ns/op 199.70 MB/s
+Search_Hard_CachedRE2/8K 50000 38946 ns/op 210.34 MB/s
+Search_Hard_CachedRE2/16K 20000 76344 ns/op 214.61 MB/s
+Search_Hard_CachedRE2/32K 10000 150705 ns/op 217.43 MB/s
+Search_Hard_CachedRE2/64K 5000 300904 ns/op 217.80 MB/s
+Search_Hard_CachedRE2/128K 5000 600464 ns/op 218.28 MB/s
+Search_Hard_CachedRE2/256K 2000 1210236 ns/op 216.61 MB/s
+Search_Hard_CachedRE2/512K 1000 2405366 ns/op 217.97 MB/s
+Search_Hard_CachedRE2/1M 500 4806626 ns/op 218.15 MB/s
+Search_Hard_CachedRE2/2M 200 9610875 ns/op 218.21 MB/s
+Search_Hard_CachedRE2/4M 100 19793040 ns/op 211.91 MB/s
+Search_Hard_CachedRE2/8M 50 39302500 ns/op 213.44 MB/s
+Search_Hard_CachedRE2/16M 20 78721650 ns/op 213.12 MB/s
+Search_Parens_CachedPCRE/8 10000000 204 ns/op 39.08 MB/s
+Search_Parens_CachedRE2/8 5000000 451 ns/op 17.70 MB/s
+Search_Parens_CachedRE2/16 5000000 483 ns/op 33.10 MB/s
+Search_Parens_CachedRE2/32 5000000 558 ns/op 57.28 MB/s
+Search_Parens_CachedRE2/64 5000000 707 ns/op 90.46 MB/s
+Search_Parens_CachedRE2/128 1000000 1044 ns/op 122.53 MB/s
+Search_Parens_CachedRE2/256 1000000 1624 ns/op 157.57 MB/s
+Search_Parens_CachedRE2/512 1000000 2806 ns/op 182.41 MB/s
+Search_Parens_CachedRE2/1K 500000 5191 ns/op 197.26 MB/s
+Search_Parens_CachedRE2/2K 200000 10005 ns/op 204.68 MB/s
+Search_Parens_CachedRE2/4K 100000 20406 ns/op 200.72 MB/s
+Search_Parens_CachedRE2/8K 50000 38039 ns/op 215.36 MB/s
+Search_Parens_CachedRE2/16K 50000 75328 ns/op 217.50 MB/s
+Search_Parens_CachedRE2/32K 10000 150731 ns/op 217.39 MB/s
+Search_Parens_CachedRE2/64K 5000 300916 ns/op 217.79 MB/s
+Search_Parens_CachedRE2/128K 5000 600672 ns/op 218.21 MB/s
+Search_Parens_CachedRE2/256K 2000 1200385 ns/op 218.38 MB/s
+Search_Parens_CachedRE2/512K 1000 2405773 ns/op 217.93 MB/s
+Search_Parens_CachedRE2/1M 500 4857044 ns/op 215.89 MB/s
+Search_Parens_CachedRE2/2M 200 9654535 ns/op 217.22 MB/s
+Search_Parens_CachedRE2/4M 100 19599170 ns/op 214.00 MB/s
+Search_Parens_CachedRE2/8M 50 39356100 ns/op 213.15 MB/s
+Search_Parens_CachedRE2/16M 20 78612450 ns/op 213.42 MB/s
+Search_BigFixed_CachedPCRE/8 10000000 268 ns/op 29.77 MB/s
+Search_BigFixed_CachedPCRE/16 5000000 358 ns/op 44.64 MB/s
+Search_BigFixed_CachedPCRE/32 5000000 524 ns/op 60.96 MB/s
+Search_BigFixed_CachedPCRE/64 2000000 866 ns/op 73.85 MB/s
+Search_BigFixed_CachedPCRE/128 1000000 1573 ns/op 81.36 MB/s
+Search_BigFixed_CachedPCRE/256 1000000 2932 ns/op 87.29 MB/s
+Search_BigFixed_CachedPCRE/512 500000 5603 ns/op 91.37 MB/s
+Search_BigFixed_CachedPCRE/1K 200000 10992 ns/op 93.16 MB/s
+Search_BigFixed_CachedPCRE/2K 100000 21994 ns/op 93.11 MB/s
+Search_BigFixed_CachedPCRE/4K 50000 43354 ns/op 94.48 MB/s
+Search_BigFixed_CachedPCRE/8K 20000 85192 ns/op 96.16 MB/s
+Search_BigFixed_CachedPCRE/16K 10000 173058 ns/op 94.67 MB/s
+Search_BigFixed_CachedPCRE/32K 5000 346039 ns/op 94.69 MB/s
+Search_BigFixed_CachedRE2/8 10000000 161 ns/op 49.53 MB/s
+Search_BigFixed_CachedRE2/16 5000000 477 ns/op 33.53 MB/s
+Search_BigFixed_CachedRE2/32 5000000 523 ns/op 61.15 MB/s
+Search_BigFixed_CachedRE2/64 5000000 600 ns/op 106.60 MB/s
+Search_BigFixed_CachedRE2/128 2000000 767 ns/op 166.75 MB/s
+Search_BigFixed_CachedRE2/256 1000000 1080 ns/op 236.97 MB/s
+Search_BigFixed_CachedRE2/512 1000000 1682 ns/op 304.28 MB/s
+Search_BigFixed_CachedRE2/1K 1000000 2848 ns/op 359.43 MB/s
+Search_BigFixed_CachedRE2/2K 500000 5376 ns/op 380.95 MB/s
+Search_BigFixed_CachedRE2/4K 200000 10112 ns/op 405.06 MB/s
+Search_BigFixed_CachedRE2/8K 100000 20308 ns/op 403.37 MB/s
+Search_BigFixed_CachedRE2/16K 50000 40343 ns/op 406.11 MB/s
+Search_BigFixed_CachedRE2/32K 20000 78888 ns/op 415.37 MB/s
+Search_BigFixed_CachedRE2/64K 10000 156583 ns/op 418.54 MB/s
+Search_BigFixed_CachedRE2/128K 5000 308819 ns/op 424.43 MB/s
+Search_BigFixed_CachedRE2/256K 5000 626294 ns/op 418.56 MB/s
+Search_BigFixed_CachedRE2/512K 2000 1242990 ns/op 421.80 MB/s
+Search_BigFixed_CachedRE2/1M 500 2551348 ns/op 410.99 MB/s
+Search_Success_PCRE/8 500000 3284 ns/op 2.44 MB/s
+Search_Success_PCRE/16 500000 3343 ns/op 4.79 MB/s
+Search_Success_PCRE/32 500000 3425 ns/op 9.34 MB/s
+Search_Success_PCRE/64 500000 3673 ns/op 17.42 MB/s
+Search_Success_PCRE/128 500000 4401 ns/op 29.08 MB/s
+Search_Success_PCRE/256 500000 5526 ns/op 46.32 MB/s
+Search_Success_PCRE/512 200000 8015 ns/op 63.87 MB/s
+Search_Success_PCRE/1K 200000 13062 ns/op 78.39 MB/s
+Search_Success_PCRE/2K 100000 23200 ns/op 88.27 MB/s
+Search_Success_PCRE/4K 50000 43223 ns/op 94.76 MB/s
+Search_Success_PCRE/8K 20000 85092 ns/op 96.27 MB/s
+Search_Success_PCRE/16K 10000 169823 ns/op 96.48 MB/s
+Search_Success_PCRE/32K 5000 343536 ns/op 95.38 MB/s
+Search_Success_PCRE/64K 5000 677599 ns/op 96.72 MB/s
+Search_Success_PCRE/128K 2000 1350767 ns/op 97.04 MB/s
+Search_Success_PCRE/256K 1000 2702077 ns/op 97.02 MB/s
+Search_Success_PCRE/512K 500 5452538 ns/op 96.15 MB/s
+Search_Success_PCRE/1M 200 10893210 ns/op 96.26 MB/s
+Search_Success_PCRE/2M 100 22137760 ns/op 94.73 MB/s
+Search_Success_PCRE/4M 50 45563840 ns/op 92.05 MB/s
+Search_Success_PCRE/8M 10 108622300 ns/op 77.23 MB/s
+Search_Success_PCRE/16M 5 259894000 ns/op 64.55 MB/s
+Search_Success_RE2/8 100000 15751 ns/op 0.51 MB/s
+Search_Success_RE2/16 50000 33455 ns/op 0.48 MB/s
+Search_Success_RE2/32 50000 33825 ns/op 0.95 MB/s
+Search_Success_RE2/64 50000 34252 ns/op 1.87 MB/s
+Search_Success_RE2/128 50000 34026 ns/op 3.76 MB/s
+Search_Success_RE2/256 50000 34117 ns/op 7.50 MB/s
+Search_Success_RE2/512 50000 35615 ns/op 14.38 MB/s
+Search_Success_RE2/1K 50000 38105 ns/op 26.87 MB/s
+Search_Success_RE2/2K 50000 42071 ns/op 48.68 MB/s
+Search_Success_RE2/4K 50000 52244 ns/op 78.40 MB/s
+Search_Success_RE2/8K 50000 70924 ns/op 115.50 MB/s
+Search_Success_RE2/16K 10000 110263 ns/op 148.59 MB/s
+Search_Success_RE2/32K 10000 185668 ns/op 176.49 MB/s
+Search_Success_RE2/64K 5000 340829 ns/op 192.28 MB/s
+Search_Success_RE2/128K 5000 637700 ns/op 205.54 MB/s
+Search_Success_RE2/256K 2000 1244739 ns/op 210.60 MB/s
+Search_Success_RE2/512K 1000 2455934 ns/op 213.48 MB/s
+Search_Success_RE2/1M 500 4916210 ns/op 213.29 MB/s
+Search_Success_RE2/2M 200 9864960 ns/op 212.59 MB/s
+Search_Success_RE2/4M 50 21928160 ns/op 191.27 MB/s
+Search_Success_RE2/8M 20 50505050 ns/op 166.09 MB/s
+Search_Success_RE2/16M 10 123615800 ns/op 135.72 MB/s
+Search_Success_CachedPCRE/8 10000000 269 ns/op 29.68 MB/s
+Search_Success_CachedPCRE/16 5000000 345 ns/op 46.27 MB/s
+Search_Success_CachedPCRE/32 5000000 500 ns/op 63.90 MB/s
+Search_Success_CachedPCRE/64 2000000 810 ns/op 79.00 MB/s
+Search_Success_CachedPCRE/128 1000000 1513 ns/op 84.56 MB/s
+Search_Success_CachedPCRE/256 1000000 2844 ns/op 90.01 MB/s
+Search_Success_CachedPCRE/512 500000 5152 ns/op 99.37 MB/s
+Search_Success_CachedPCRE/1K 200000 10063 ns/op 101.76 MB/s
+Search_Success_CachedPCRE/2K 100000 20455 ns/op 100.12 MB/s
+Search_Success_CachedPCRE/4K 50000 40840 ns/op 100.29 MB/s
+Search_Success_CachedPCRE/8K 20000 82378 ns/op 99.44 MB/s
+Search_Success_CachedPCRE/16K 10000 167041 ns/op 98.08 MB/s
+Search_Success_CachedPCRE/32K 5000 335674 ns/op 97.62 MB/s
+Search_Success_CachedPCRE/64K 5000 671790 ns/op 97.55 MB/s
+Search_Success_CachedPCRE/128K 2000 1359318 ns/op 96.42 MB/s
+Search_Success_CachedPCRE/256K 1000 2694557 ns/op 97.29 MB/s
+Search_Success_CachedPCRE/512K 500 5414676 ns/op 96.83 MB/s
+Search_Success_CachedPCRE/1M 200 10888010 ns/op 96.31 MB/s
+Search_Success_CachedPCRE/2M 100 22137680 ns/op 94.73 MB/s
+Search_Success_CachedPCRE/4M 50 45685360 ns/op 91.81 MB/s
+Search_Success_CachedPCRE/8M 10 108998100 ns/op 76.96 MB/s
+Search_Success_CachedPCRE/16M 5 261873000 ns/op 64.07 MB/s
+Search_Success_CachedRE2/8 10000000 184 ns/op 43.45 MB/s
+Search_Success_CachedRE2/16 5000000 493 ns/op 32.45 MB/s
+Search_Success_CachedRE2/32 5000000 564 ns/op 56.65 MB/s
+Search_Success_CachedRE2/64 5000000 719 ns/op 88.90 MB/s
+Search_Success_CachedRE2/128 2000000 986 ns/op 129.71 MB/s
+Search_Success_CachedRE2/256 1000000 1515 ns/op 168.96 MB/s
+Search_Success_CachedRE2/512 1000000 2755 ns/op 185.79 MB/s
+Search_Success_CachedRE2/1K 500000 5393 ns/op 189.85 MB/s
+Search_Success_CachedRE2/2K 200000 10600 ns/op 193.19 MB/s
+Search_Success_CachedRE2/4K 100000 20483 ns/op 199.96 MB/s
+Search_Success_CachedRE2/8K 50000 38668 ns/op 211.85 MB/s
+Search_Success_CachedRE2/16K 20000 76366 ns/op 214.55 MB/s
+Search_Success_CachedRE2/32K 10000 150929 ns/op 217.11 MB/s
+Search_Success_CachedRE2/64K 5000 305399 ns/op 214.59 MB/s
+Search_Success_CachedRE2/128K 5000 602232 ns/op 217.64 MB/s
+Search_Success_CachedRE2/256K 2000 1205052 ns/op 217.54 MB/s
+Search_Success_CachedRE2/512K 1000 2422666 ns/op 216.41 MB/s
+Search_Success_CachedRE2/1M 500 4914886 ns/op 213.35 MB/s
+Search_Success_CachedRE2/2M 200 9935245 ns/op 211.08 MB/s
+Search_Success_CachedRE2/4M 50 21790440 ns/op 192.48 MB/s
+Search_Success_CachedRE2/8M 20 50113100 ns/op 167.39 MB/s
+Search_Success_CachedRE2/16M 10 123046100 ns/op 136.35 MB/s
+Search_Success1_PCRE/8 500000 3366 ns/op 2.38 MB/s
+Search_Success1_PCRE/16 500000 3454 ns/op 4.63 MB/s
+Search_Success1_PCRE/32 500000 3660 ns/op 8.74 MB/s
+Search_Success1_PCRE/64 500000 3867 ns/op 16.55 MB/s
+Search_Success1_PCRE/128 500000 4565 ns/op 28.04 MB/s
+Search_Success1_PCRE/256 500000 5799 ns/op 44.14 MB/s
+Search_Success1_PCRE/512 200000 8419 ns/op 60.81 MB/s
+Search_Success1_PCRE/1K 200000 13336 ns/op 76.78 MB/s
+Search_Success1_PCRE/2K 100000 23535 ns/op 87.02 MB/s
+Search_Success1_PCRE/4K 50000 43661 ns/op 93.81 MB/s
+Search_Success1_PCRE/8K 20000 86796 ns/op 94.38 MB/s
+Search_Success1_PCRE/16K 10000 168549 ns/op 97.21 MB/s
+Search_Success1_PCRE/32K 5000 335853 ns/op 97.57 MB/s
+Search_Success1_PCRE/64K 5000 677253 ns/op 96.77 MB/s
+Search_Success1_PCRE/128K 2000 1353762 ns/op 96.82 MB/s
+Search_Success1_PCRE/256K 1000 2736863 ns/op 95.78 MB/s
+Search_Success1_PCRE/512K 500 5461592 ns/op 96.00 MB/s
+Search_Success1_PCRE/1M 200 10982585 ns/op 95.48 MB/s
+Search_Success1_PCRE/2M 100 22383350 ns/op 93.69 MB/s
+Search_Success1_PCRE/4M 50 46209500 ns/op 90.77 MB/s
+Search_Success1_PCRE/8M 10 110218000 ns/op 76.11 MB/s
+Search_Success1_PCRE/16M 5 264726600 ns/op 63.38 MB/s
+Search_Success1_RE2/8 50000 46109 ns/op 0.17 MB/s
+Search_Success1_RE2/16 50000 46782 ns/op 0.34 MB/s
+Search_Success1_RE2/32 50000 46352 ns/op 0.69 MB/s
+Search_Success1_RE2/64 50000 46245 ns/op 1.38 MB/s
+Search_Success1_RE2/128 50000 46455 ns/op 2.76 MB/s
+Search_Success1_RE2/256 50000 47186 ns/op 5.43 MB/s
+Search_Success1_RE2/512 50000 48004 ns/op 10.67 MB/s
+Search_Success1_RE2/1K 50000 50252 ns/op 20.38 MB/s
+Search_Success1_RE2/2K 50000 54161 ns/op 37.81 MB/s
+Search_Success1_RE2/4K 50000 64963 ns/op 63.05 MB/s
+Search_Success1_RE2/8K 20000 82940 ns/op 98.77 MB/s
+Search_Success1_RE2/16K 10000 122743 ns/op 133.48 MB/s
+Search_Success1_RE2/32K 10000 197762 ns/op 165.69 MB/s
+Search_Success1_RE2/64K 5000 352522 ns/op 185.91 MB/s
+Search_Success1_RE2/128K 5000 658216 ns/op 199.13 MB/s
+Search_Success1_RE2/256K 2000 1258225 ns/op 208.34 MB/s
+Search_Success1_RE2/512K 1000 2478527 ns/op 211.53 MB/s
+Search_Success1_RE2/1M 500 4926770 ns/op 212.83 MB/s
+Search_Success1_RE2/2M 200 10027130 ns/op 209.15 MB/s
+Search_Success1_RE2/4M 50 21907720 ns/op 191.45 MB/s
+Search_Success1_RE2/8M 20 50590450 ns/op 165.81 MB/s
+Search_Success1_RE2/16M 10 122882000 ns/op 136.53 MB/s
+Search_Success1_Cached_PCRE/8 10000000 298 ns/op 26.77 MB/s
+Search_Success1_Cached_PCRE/16 5000000 372 ns/op 42.92 MB/s
+Search_Success1_Cached_PCRE/32 5000000 525 ns/op 60.90 MB/s
+Search_Success1_Cached_PCRE/64 2000000 837 ns/op 76.39 MB/s
+Search_Success1_Cached_PCRE/128 1000000 1472 ns/op 86.94 MB/s
+Search_Success1_Cached_PCRE/256 1000000 2741 ns/op 93.36 MB/s
+Search_Success1_Cached_PCRE/512 500000 5211 ns/op 98.24 MB/s
+Search_Success1_Cached_PCRE/1K 200000 10138 ns/op 101.00 MB/s
+Search_Success1_Cached_PCRE/2K 100000 20494 ns/op 99.93 MB/s
+Search_Success1_Cached_PCRE/4K 50000 41028 ns/op 99.83 MB/s
+Search_Success1_Cached_PCRE/8K 20000 83370 ns/op 98.26 MB/s
+Search_Success1_Cached_PCRE/16K 10000 169360 ns/op 96.74 MB/s
+Search_Success1_Cached_PCRE/32K 5000 335152 ns/op 97.77 MB/s
+Search_Success1_Cached_PCRE/64K 5000 672917 ns/op 97.39 MB/s
+Search_Success1_Cached_PCRE/128K 2000 1357874 ns/op 96.53 MB/s
+Search_Success1_Cached_PCRE/256K 1000 2691864 ns/op 97.38 MB/s
+Search_Success1_Cached_PCRE/512K 500 5409458 ns/op 96.92 MB/s
+Search_Success1_Cached_PCRE/1M 200 10914605 ns/op 96.07 MB/s
+Search_Success1_Cached_PCRE/2M 100 22352650 ns/op 93.82 MB/s
+Search_Success1_Cached_PCRE/4M 50 45584220 ns/op 92.01 MB/s
+Search_Success1_Cached_PCRE/8M 10 109049200 ns/op 76.92 MB/s
+Search_Success1_Cached_PCRE/16M 5 262203600 ns/op 63.99 MB/s
+Search_Success1_Cached_RE2/8 5000000 456 ns/op 17.54 MB/s
+Search_Success1_Cached_RE2/16 5000000 485 ns/op 32.98 MB/s
+Search_Success1_Cached_RE2/32 5000000 567 ns/op 56.44 MB/s
+Search_Success1_Cached_RE2/64 5000000 721 ns/op 88.72 MB/s
+Search_Success1_Cached_RE2/128 1000000 1008 ns/op 126.93 MB/s
+Search_Success1_Cached_RE2/256 1000000 1564 ns/op 163.65 MB/s
+Search_Success1_Cached_RE2/512 1000000 2669 ns/op 191.81 MB/s
+Search_Success1_Cached_RE2/1K 500000 5409 ns/op 189.28 MB/s
+Search_Success1_Cached_RE2/2K 200000 10523 ns/op 194.61 MB/s
+Search_Success1_Cached_RE2/4K 100000 20564 ns/op 199.18 MB/s
+Search_Success1_Cached_RE2/8K 50000 38430 ns/op 213.16 MB/s
+Search_Success1_Cached_RE2/16K 20000 76032 ns/op 215.49 MB/s
+Search_Success1_Cached_RE2/32K 10000 151271 ns/op 216.62 MB/s
+Search_Success1_Cached_RE2/64K 5000 302063 ns/op 216.96 MB/s
+Search_Success1_Cached_RE2/128K 5000 605221 ns/op 216.57 MB/s
+Search_Success1_Cached_RE2/256K 2000 1205637 ns/op 217.43 MB/s
+Search_Success1_Cached_RE2/512K 1000 2421347 ns/op 216.53 MB/s
+Search_Success1_Cached_RE2/1M 500 4865300 ns/op 215.52 MB/s
+Search_Success1_Cached_RE2/2M 200 10079725 ns/op 208.06 MB/s
+Search_Success1_Cached_RE2/4M 50 21765520 ns/op 192.70 MB/s
+Search_Success1_Cached_RE2/8M 20 50470050 ns/op 166.21 MB/s
+Search_Success1_Cached_RE2/16M 10 122714000 ns/op 136.72 MB/s
+Search_Digits_PCRE 500000 6942 ns/op
+Search_Digits_RE2 50000 36247 ns/op
+Parse_Digits_PCRE 500000 7096 ns/op
+Parse_Digits_RE2 100000 18800 ns/op
+Parse_CachedDigits_PCRE 5000000 566 ns/op
+Parse_CachedDigits_RE2 5000000 340 ns/op
+Parse_DigitDs_PCRE 500000 6292 ns/op
+Parse_DigitDs_RE2 100000 18679 ns/op
+Parse_CachedDigitDs_PCRE 5000000 569 ns/op
+Parse_CachedDigitDs_RE2 5000000 335 ns/op
+Parse_Split_PCRE 500000 4704 ns/op
+Parse_Split_RE2 100000 20487 ns/op
+Parse_CachedSplit_PCRE 5000000 422 ns/op
+Parse_CachedSplit_RE2 10000000 231 ns/op
+Parse_SplitHard_PCRE 500000 4807 ns/op
+Parse_SplitHard_RE2 100000 25767 ns/op
+Parse_CachedSplitHard_PCRE 5000000 426 ns/op
+Parse_CachedSplitHard_RE2 1000000 2295 ns/op
+Parse_CachedSplitBig1_PCRE 500 5471602 ns/op
+Parse_CachedSplitBig1_RE2 2000 922666 ns/op
+Parse_CachedSplitBig2_PCRE 2000 1036110 ns/op
+Parse_CachedSplitBig2_RE2 20 95396100 ns/op
+BM_PCRE_Compile 500000 5864 ns/op
+BM_RE2_Compile 100000 21683 ns/op
+SearchPhone_CachedPCRE/8 1000000 1487 ns/op 5.38 MB/s
+SearchPhone_CachedPCRE/16 1000000 2368 ns/op 6.75 MB/s
+SearchPhone_CachedPCRE/32 500000 4068 ns/op 7.87 MB/s
+SearchPhone_CachedPCRE/64 500000 7319 ns/op 8.74 MB/s
+SearchPhone_CachedPCRE/128 200000 14025 ns/op 9.13 MB/s
+SearchPhone_CachedPCRE/256 100000 27296 ns/op 9.38 MB/s
+SearchPhone_CachedPCRE/512 50000 53753 ns/op 9.52 MB/s
+SearchPhone_CachedPCRE/1K 10000 106767 ns/op 9.59 MB/s
+SearchPhone_CachedPCRE/2K 10000 213088 ns/op 9.61 MB/s
+SearchPhone_CachedPCRE/4K 5000 418855 ns/op 9.78 MB/s
+SearchPhone_CachedPCRE/8K 2000 838067 ns/op 9.77 MB/s
+SearchPhone_CachedPCRE/16K 1000 1680195 ns/op 9.75 MB/s
+SearchPhone_CachedPCRE/32K 500 3348730 ns/op 9.79 MB/s
+SearchPhone_CachedPCRE/64K 500 6741460 ns/op 9.72 MB/s
+SearchPhone_CachedPCRE/128K 100 13386160 ns/op 9.79 MB/s
+SearchPhone_CachedPCRE/256K 100 26777290 ns/op 9.79 MB/s
+SearchPhone_CachedPCRE/512K 50 53539340 ns/op 9.79 MB/s
+SearchPhone_CachedPCRE/1M 20 107442600 ns/op 9.76 MB/s
+SearchPhone_CachedPCRE/2M 10 215474400 ns/op 9.73 MB/s
+SearchPhone_CachedPCRE/4M 5 429385600 ns/op 9.77 MB/s
+SearchPhone_CachedPCRE/8M 5 858351200 ns/op 9.77 MB/s
+SearchPhone_CachedPCRE/16M 1 1728512000 ns/op 9.71 MB/s
+SearchPhone_CachedRE2/8 1000000 1229 ns/op 6.51 MB/s
+SearchPhone_CachedRE2/16 1000000 1267 ns/op 12.62 MB/s
+SearchPhone_CachedRE2/32 1000000 1347 ns/op 23.74 MB/s
+SearchPhone_CachedRE2/64 1000000 1534 ns/op 41.71 MB/s
+SearchPhone_CachedRE2/128 1000000 1835 ns/op 69.73 MB/s
+SearchPhone_CachedRE2/256 1000000 2481 ns/op 103.16 MB/s
+SearchPhone_CachedRE2/512 500000 3680 ns/op 139.11 MB/s
+SearchPhone_CachedRE2/1K 500000 5979 ns/op 171.26 MB/s
+SearchPhone_CachedRE2/2K 200000 11101 ns/op 184.48 MB/s
+SearchPhone_CachedRE2/4K 100000 20468 ns/op 200.11 MB/s
+SearchPhone_CachedRE2/8K 50000 39643 ns/op 206.64 MB/s
+SearchPhone_CachedRE2/16K 20000 76829 ns/op 213.25 MB/s
+SearchPhone_CachedRE2/32K 10000 151593 ns/op 216.16 MB/s
+SearchPhone_CachedRE2/64K 5000 301378 ns/op 217.45 MB/s
+SearchPhone_CachedRE2/128K 5000 601951 ns/op 217.75 MB/s
+SearchPhone_CachedRE2/256K 2000 1216569 ns/op 215.48 MB/s
+SearchPhone_CachedRE2/512K 1000 2408186 ns/op 217.71 MB/s
+SearchPhone_CachedRE2/1M 500 4819808 ns/op 217.56 MB/s
+SearchPhone_CachedRE2/2M 200 9686115 ns/op 216.51 MB/s
+SearchPhone_CachedRE2/4M 100 19783390 ns/op 212.01 MB/s
+SearchPhone_CachedRE2/8M 50 39521640 ns/op 212.25 MB/s
+SearchPhone_CachedRE2/16M 20 78231500 ns/op 214.46 MB/s
+EmptyPartialMatchPCRE 20000000 137 ns/op
+EmptyPartialMatchRE2 5000000 413 ns/op
+SimplePartialMatchPCRE 10000000 205 ns/op
+SimplePartialMatchRE2 5000000 457 ns/op
+HTTPPartialMatchPCRE 5000000 636 ns/op
+HTTPPartialMatchRE2 1000000 1005 ns/op
+SmallHTTPPartialMatchPCRE 5000000 634 ns/op
+SmallHTTPPartialMatchRE2 2000000 1009 ns/op
+DotMatchPCRE 2000000 837 ns/op
+DotMatchRE2 1000000 1043 ns/op
+ASCIIMatchPCRE 5000000 468 ns/op
+ASCIIMatchRE2 1000000 1040 ns/op
diff --git a/third_party/re2/src/benchlog/benchplot.py b/third_party/re2/src/benchlog/benchplot.py
new file mode 100755
index 000000000..104abe8e9
--- /dev/null
+++ b/third_party/re2/src/benchlog/benchplot.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+import argparse # for ArgumentParser
+import subprocess # for Popen
+import tempfile # for NamedTemporaryFile
+import os # for remove
+
+class gnuplot(object):
+
+ output = "result.png"
+
+ script = """
+ set terminal png size 1024, 768
+ set output "{}.png"
+ set title "re2 benchlog"
+ set datafile separator ";"
+ set grid x y
+ set ylabel "MB/s"
+ set autoscale
+ plot """
+
+ template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n"""
+
+ benchdata = dict()
+ tempfiles = []
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ """
+ remove all temporary files
+ """
+
+ for filename in self.tempfiles:
+ os.remove(filename)
+
+ def parse_re2_benchlog(self, filename):
+ """
+ parse the input benchlog and return a dictionary contain bench data
+ """
+
+ benchdata = self.benchdata
+
+ with open(filename) as f:
+
+ for raw in f.readlines():
+
+ data = raw.split('\t')
+
+ if len(data) == 4:
+
+ data = data[0].split('/') + data[1:]
+ data = list(map(str.strip, data))
+
+ if not benchdata.get(data[0]):
+ benchdata[data[0]] = [ data[1:] ]
+ else:
+ benchdata[data[0]].append(data[1:])
+
+ def gen_csv(self):
+ """
+ generate temporary csv files
+ """
+
+ for name, data in self.benchdata.items():
+
+ with tempfile.NamedTemporaryFile(delete=False) as f:
+
+ for index, line in enumerate(data):
+ f.write('{};{}\n'.format(index, ';'.join(line)).encode())
+
+ self.tempfiles.append(f.name)
+ self.script = self.script + self.template.format(f.name, name)
+
+ def run(self):
+ self.gen_csv()
+ script = self.script[:-3].format(self.output)
+ command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
+ command.communicate(script.encode())
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='generate plots for benchlog')
+ parser.add_argument('benchlog', type=str, help='benchlog generated by re2')
+ args = parser.parse_args()
+
+ try:
+ subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
+ except FileNotFoundError:
+ print('you can install "gnuplot" to generate plots automatically')
+ exit(1)
+
+ with gnuplot() as plot:
+ plot.output = args.benchlog
+ plot.parse_re2_benchlog(args.benchlog)
+ plot.run()
diff --git a/third_party/re2/src/benchlog/mktable b/third_party/re2/src/benchlog/mktable
new file mode 100755
index 000000000..da0659820
--- /dev/null
+++ b/third_party/re2/src/benchlog/mktable
@@ -0,0 +1,155 @@
+#!/usr/bin/perl
+# XXX
+
+# table(NAME) -- print an HTML table with one row per entry of @sys, giving
+# the PCRE and RE2 times of a single benchmark.  NAME is used as a sprintf
+# pattern: the engine name is substituted into it to form the benchmark key.
+# The stored ns/op values are divided by 1000 and shown as microseconds.
+sub table() {
+  my ($pattern) = @_;
+  my $pcre_key = sprintf($pattern, "PCRE");
+  my $re2_key = sprintf($pattern, "RE2");
+
+  print "<table border=0>\n";
+  print "<tr><th>System</th><th>PCRE</th><th>RE2</th></tr>\n";
+  for my $host (@sys) {
+    my $pcre_us = $data{$host}->{$pcre_key}->{'ns/op'} / 1000.;
+    my $re2_us = $data{$host}->{$re2_key}->{'ns/op'} / 1000.;
+    printf "<tr><td>%s</td><td>%.1f µs</td><td>%.1f µs</td></tr>\n",
+      $sysname{$host}, $pcre_us, $re2_us;
+  }
+  print "<tr height=5><td colspan=3></td></tr>\n";
+  print "</table>\n";
+}
+
+# Text-size suffixes used in benchmark names, in ascending order; graph()
+# uses the position in this list as the x coordinate.
+@sizes = qw(
+  8 16 32 64 128 256 512
+  1K 2K 4K 8K 16K 32K 64K 128K 256K 512K
+  1M 2M 4M 8M 16M
+);
+
+# Color value written after "color" in each engine's jgraph curve line.
+%color = (PCRE => "0.7 0 0", RE2 => "0 0 1");
+
+# Count of graphs produced so far; graph() pre-increments it to number its
+# output files.
+$ngraph = 0;
+
+# graph(NAME) -- draw one PCRE-vs-RE2 throughput chart and print an <img> tag
+# for it.  NAME is the benchmark name prefix; the data key for each point is
+# NAME . engine . "/" . size.  A jgraph description is written to
+# regexp3g<N>.jgr, then converted to .eps with jgraph and to .png with gs.
+# Only the data of the system "wreck" is plotted.
+# Note: $ymax, %lastx, %lasty, $i, $x1 and $y1 are not declared with "my",
+# so they are package globals.
+sub graph() {
+ my ($name) = @_;
+
+ my $sys = "wreck";
+ # Each call gets the next file number from the global counter.
+ my $base = sprintf("regexp3g%d", ++$ngraph);
+
+ open(JGR, ">$base.jgr") || die "open >$base.jgr: $!";
+ printf JGR "bbox -20 -12 392 95\n";
+ printf JGR "newgraph clip x_translate 0.25 y_translate 0.25\n";
+ # Track the largest MB/s value and, per engine, the last point plotted.
+ $ymax = 0;
+ %lastx = ();
+ %lasty = ();
+ foreach my $who ("PCRE", "RE2") {
+ printf JGR "newcurve pts\n";
+ # x is the index into @sizes, y the MB/s figure; sizes with no MB/s
+ # value for this benchmark are skipped.
+ for(my $i=0; $i<@sizes; $i++) {
+ my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]);
+ my $val = $data{$sys}->{$key}->{'MB/s'};
+ next if !defined($val);
+ if($val > $ymax) {
+ $ymax = $val;
+ }
+ $lastx{$who} = $i;
+ $lasty{$who} = $val;
+ printf JGR "$i %f (* %s *)\n", $val, $key;
+ }
+ my $color = $color{$who};
+ printf JGR "marktype none color $color linethickness 2 linetype solid label : $who\n";
+ }
+ my $n = @sizes;
+ printf JGR "xaxis min -1 max $n size 5 label : text size (bytes)\n";
+ printf JGR " no_auto_hash_marks hash_labels fontsize 9\n";
+ # Label every third size on the x axis.
+ for($i=0; $i<@sizes; $i+=3) {
+ printf JGR " hash_at $i hash_label at $i : $sizes[$i]\n";
+ }
+ # Pick the y-axis maximum: take the largest power of ten not above $ymax
+ # (at least 1), then the smallest multiple of it (2x..10x) that exceeds
+ # $ymax.
+ my $y = 1;
+ while(10*$y <= $ymax) {
+ $y = 10*$y;
+ }
+ for($i=2; $i<=10; $i++) {
+ if($i*$y > $ymax) {
+ $y = $i*$y;
+ last;
+ }
+ }
+ # Put each engine's name slightly to the right of its last plotted point.
+ # Points lower than 5% of the axis height are labelled at the 5% line with
+ # "vjb" instead of "vjc".
+ foreach my $who ("PCRE", "RE2") {
+ $x1 = $lastx{$who};
+ $y1 = $lasty{$who};
+ $x1 *= 1.01;
+ my $v = "vjc";
+ if($y1 < 0.05 * $y) {
+ $v = "vjb";
+ $y1 = 0.05 * $y;
+ }
+ printf JGR "newstring x $x1 y $y1 hjl $v : $who\n";
+ }
+ printf JGR "yaxis min 0 max $y size 1 label : speed (MB/s)\n";
+ printf JGR " hash_labels fontsize 9\n";
+ # printf JGR "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n";
+
+ # JGR is not closed before jgraph reads the file.
+ # NOTE(review): this relies on Perl flushing open output handles before
+ # system() forks -- confirm for the Perl version in use.
+ # The exit status of both commands is ignored.
+ system("jgraph $base.jgr >$base.eps"); # die "system: $!";
+ system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 -sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps");
+
+ printf "<img src=$base.png>\n"
+
+}
+
+# skip() -- read input lines up to and including the next one that starts
+# with "<!-- -->".  That marker line is printed; the lines before it are
+# discarded.
+sub skip() {
+  while(<>) {
+    next unless /^<!-- -->/;
+    print;
+    last;
+  }
+}
+
+# Systems whose logs are read from benchlog.<name>, in table row order.
+@sys = qw(r70 c2 wreck mini);
+
+# Description printed in the "System" column for each entry of @sys.
+%sysname = (
+  r70 => "AMD Opteron 8214 HE, 2.2 GHz",
+  c2 => "Intel Core2 Duo E7200, 2.53 GHz",
+  wreck => "Intel Xeon 5150, 2.66 GHz (Mac Pro)",
+  mini => "Intel Core2 T5600, 1.83 GHz (Mac Mini)",
+);
+
+# Dispatch table: directive name found in the input => generator to call.
+%func = (table => \&table, graph => \&graph);
+
+# Read benchlog.<sys> for every system in @sys into %data, so that
+# $data{sys}->{benchmark name} is a hash with the keys "name", "iter",
+# "ns/op" and, when the line carries a throughput figure, "MB/s".
+# Lines that do not look like "<name> <iterations> <n> ns/op" are ignored.
+foreach my $sys (@sys) {
+  # Three-argument open with a lexical handle: the file name can never be
+  # taken as an open mode, and no global bareword handle is left behind.
+  open(my $log, '<', "benchlog.$sys") || die "open benchlog.$sys: $!";
+  my %sysdat;
+  while(my $line = <$log>) {
+    next unless $line =~ /^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/;
+    # Copy the captures before the next match overwrites them.
+    my %row = ("name" => $1, "iter" => $2, "ns/op" => $3);
+    if($line =~ /([\d.]+) MB\/s/) {
+      $row{"MB/s"} = $1;
+    }
+    $sysdat{$row{"name"}} = \%row;
+  }
+  close $log;
+  $data{$sys} = \%sysdat;
+}
+
+# Filter the input to standard output.  Every line is echoed.  A line that
+# starts with "<!-- benchlog FUNC -->" or "<!-- benchlog FUNC ARG -->" makes
+# the script call $func{FUNC} (with ARG when present) and then skip() the
+# old generated content up to the next "<!-- -->" marker line.
+while(<>) {
+  print;
+  next unless /^<!-- benchlog (\w+)(?: ([%\w]+))? -->/;
+  my ($which, $arg) = ($1, $2);
+  if(defined($arg)) {
+    $func{$which}($arg);
+  } else {
+    $func{$which}();
+  }
+  skip();
+}
+
diff --git a/third_party/re2/src/doc/mksyntaxgo b/third_party/re2/src/doc/mksyntaxgo
new file mode 100755
index 000000000..d30d28146
--- /dev/null
+++ b/third_party/re2/src/doc/mksyntaxgo
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Regenerate $GOROOT/src/regexp/syntax/doc.go from syntax.txt: copy the text
+# into place, then rewrite the copy with the sam script that follows.
+
+set -e
+# Abort with a clear message when GOROOT is unset or empty, instead of
+# silently targeting /src/regexp/syntax/doc.go.
+out="${GOROOT:?GOROOT must point at a Go source tree}/src/regexp/syntax/doc.go"
+# Quote the path so a GOROOT containing spaces is not word-split.
+cp syntax.txt "$out"
+sam -d "$out" <<'!'
+,x g/NOT SUPPORTED/d
+/^Unicode character class/,$d
+,s/[«»]//g
+,x g/^Possessive repetitions:/d
+,x g/\\C/d
+,x g/Flag syntax/d
+,s/.=(true|false)/flag &/g
+,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/
+,s/\n\n\n+/\n\n/g
+,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
+1,2c
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
+
+/*
+Package syntax parses regular expressions into parse trees and compiles
+parse trees into programs. Most clients of regular expressions will use the
+facilities of package regexp (such as Compile and Match) instead of this package.
+
+Syntax
+
+The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
+Parts of the syntax can be disabled by passing alternate flags to Parse.
+
+.
+$a
+Unicode character classes are those in unicode.Categories and unicode.Scripts.
+*/
+package syntax
+.
+w
+q
+!
diff --git a/third_party/re2/src/doc/mksyntaxhtml b/third_party/re2/src/doc/mksyntaxhtml
new file mode 100755
index 000000000..0292ea00a
--- /dev/null
+++ b/third_party/re2/src/doc/mksyntaxhtml
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Generate syntax.html from syntax.txt: copy the text, then rewrite the copy
+# in place with the sam script that follows.
+
+# Stop on the first failure. Without this, a failed cp would let sam
+# re-process a stale syntax.html left over from an earlier run.
+set -e
+
+cp syntax.txt syntax.html
+sam -d syntax.html <<'!'
+,s/\&/\&amp;/g
+,s/</\&lt;/g
+,s/>/\&gt;/g
+,s!== (([^()]|\([^()]*\))*)!≡ <code>\1</code>!g
+,s!«!<code>!g
+,s!»!</code>!g
+,s! vim$! <font size=-2>VIM</font>!g
+,s! pcre$! <font size=-2>PCRE</font>!g
+,s! perl$! <font size=-2>PERL</font>!g
+,x g/NOT SUPPORTED/ s!^[^ ]+!<font color=#808080>&</font>!
+,s!NOT SUPPORTED!!g
+,s!(^[^ ]+) (.*)\n!<tr><td><code>\1</code></td><td>\2</td></tr>\n!g
+,s!.*:$!<b>&</b>!g
+,s!^$!<tr><td></td></tr>!g
+,x v/<tr>/ s!.*!<tr><td colspan=2>&</td></tr>!
+1,2c
+<html>
+<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+<title>RE2 regular expression syntax reference</title>
+</head>
+<body>
+<h1>RE2 regular expression syntax reference</h1>
+
+<table border=0 cellpadding=2 cellspacing=2>
+<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
+<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
+<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
+.
+$a
+</table>
+</body>
+</html>
+.
+w
+q
+!
diff --git a/third_party/re2/src/doc/mksyntaxwiki b/third_party/re2/src/doc/mksyntaxwiki
new file mode 100755
index 000000000..930b3896e
--- /dev/null
+++ b/third_party/re2/src/doc/mksyntaxwiki
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+# Generate syntax.wiki from syntax.txt: copy the text, then rewrite the copy
+# in place with the sam script that follows.
+
+# Stop on the first failure. Without this, a failed cp would let sam
+# re-process a stale syntax.wiki left over from an earlier run.
+set -e
+
+cp syntax.txt syntax.wiki
+sam -d syntax.wiki <<'!'
+,s!`!`````!g
+,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g
+,s!«!`!g
+,s!»!`!g
+,s! vim$! <font size="1">VIM</font>!g
+,s! pcre$! <font size="1">PCRE</font>!g
+,s! perl$! <font size="1">PERL</font>!g
+,s!(^[^ ]+) (.*)\n!`\1` \2\n!g
+,x g/NOT SUPPORTED/ s!^[^ ]+!<font color="#808080">&</font>!
+,s!NOT SUPPORTED!<font size="1">(&)</font>!g
+,s!(^[^ ]+) (.*)\n!<tr><td>\1</td><td>\2</td></tr>\n!g
+,s!.*:$!<b>&</b>!g
+,s!^$!<tr><td></td></tr>!g
+,x v/<tr>/ s!.*!<tr><td colspan="2">&</td></tr>!
+1,2c
+#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth
+
+<wiki:comment>
+GENERATED BY mksyntaxwiki. DO NOT EDIT
+</wiki:comment>
+
+<table border="0" cellpadding="2" cellspacing="2">
+<tr><td colspan="2">This page lists the regular expression syntax accepted by RE2.</td></tr>
+<tr><td colspan="2">It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
+<tr><td colspan="2">Grayed out expressions are not supported by RE2.</td></tr>
+.
+$a
+</table>
+.
+w
+q
+!
diff --git a/third_party/re2/src/doc/syntax.html b/third_party/re2/src/doc/syntax.html
new file mode 100644
index 000000000..6cbda140e
--- /dev/null
+++ b/third_party/re2/src/doc/syntax.html
@@ -0,0 +1,477 @@
+<html>
+<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+<title>RE2 regular expression syntax reference</title>
+</head>
+<body>
+<h1>RE2 regular expression syntax reference</h1>
+
+<table border=0 cellpadding=2 cellspacing=2>
+<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
+<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
+<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Single characters:</b></td></tr>
+<tr><td><code>.</code></td><td>any character, possibly including newline (s=true)</td></tr>
+<tr><td><code>[xyz]</code></td><td>character class</td></tr>
+<tr><td><code>[^xyz]</code></td><td>negated character class</td></tr>
+<tr><td><code>\d</code></td><td>Perl character class</td></tr>
+<tr><td><code>\D</code></td><td>negated Perl character class</td></tr>
+<tr><td><code>[[:alpha:]]</code></td><td>ASCII character class</td></tr>
+<tr><td><code>[[:^alpha:]]</code></td><td>negated ASCII character class</td></tr>
+<tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr>
+<tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr>
+<tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr>
+<tr><td><code>\P{Greek}</code></td><td>negated Unicode character class</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Composites:</b></td></tr>
+<tr><td><code>xy</code></td><td><code>x</code> followed by <code>y</code></td></tr>
+<tr><td><code>x|y</code></td><td><code>x</code> or <code>y</code> (prefer <code>x</code>)</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Repetitions:</b></td></tr>
+<tr><td><code>x*</code></td><td>zero or more <code>x</code>, prefer more</td></tr>
+<tr><td><code>x+</code></td><td>one or more <code>x</code>, prefer more</td></tr>
+<tr><td><code>x?</code></td><td>zero or one <code>x</code>, prefer one</td></tr>
+<tr><td><code>x{n,m}</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer more</td></tr>
+<tr><td><code>x{n,}</code></td><td><code>n</code> or more <code>x</code>, prefer more</td></tr>
+<tr><td><code>x{n}</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
+<tr><td><code>x*?</code></td><td>zero or more <code>x</code>, prefer fewer</td></tr>
+<tr><td><code>x+?</code></td><td>one or more <code>x</code>, prefer fewer</td></tr>
+<tr><td><code>x??</code></td><td>zero or one <code>x</code>, prefer zero</td></tr>
+<tr><td><code>x{n,m}?</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer fewer</td></tr>
+<tr><td><code>x{n,}?</code></td><td><code>n</code> or more <code>x</code>, prefer fewer</td></tr>
+<tr><td><code>x{n}?</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
+<tr><td><code><font color=#808080>x{}</font></code></td><td>(≡ <code>x*</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>x{-}</font></code></td><td>(≡ <code>x*?</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2>Implementation restriction: The counting forms <code>x{n,m}</code>, <code>x{n,}</code>, and <code>x{n}</code></td></tr>
+<tr><td colspan=2>reject forms that create a minimum or maximum repetition count above 1000.</td></tr>
+<tr><td colspan=2>Unlimited repetitions are not subject to this restriction.</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
+<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
+<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
+<tr><td><code><font color=#808080>x?+</font></code></td><td>zero or one <code>x</code>, possessive </td></tr>
+<tr><td><code><font color=#808080>x{n,m}+</font></code></td><td><code>n</code> or ... or <code>m</code> <code>x</code>, possessive </td></tr>
+<tr><td><code><font color=#808080>x{n,}+</font></code></td><td><code>n</code> or more <code>x</code>, possessive </td></tr>
+<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Grouping:</b></td></tr>
+<tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
+<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
+<tr><td><code>(?&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
+<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
+<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
+<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
+<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
+<tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr>
+<tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr>
+<tr><td><code><font color=#808080>(?&gt;re)</font></code></td><td>possessive match of <code>re</code> </td></tr>
+<tr><td><code><font color=#808080>re@&gt;</font></code></td><td>possessive match of <code>re</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>%(re)</font></code></td><td>non-capturing group <font size=-2>VIM</font></td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Flags:</b></td></tr>
+<tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr>
+<tr><td><code>m</code></td><td>multi-line mode: <code>^</code> and <code>$</code> match begin/end line in addition to begin/end text (default false)</td></tr>
+<tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr>
+<tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr>
+<tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Empty strings:</b></td></tr>
+<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
+<tr><td><code>$</code></td><td>at end of text (like <code>\z</code> not <code>\Z</code>) or line (<code>m</code>=true)</td></tr>
+<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
+<tr><td><code>\b</code></td><td>at ASCII word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the other)</td></tr>
+<tr><td><code>\B</code></td><td>not at ASCII word boundary</td></tr>
+<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
+<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
+<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
+<tr><td><code>\z</code></td><td>at end of text</td></tr>
+<tr><td><code><font color=#808080>(?=re)</font></code></td><td>before text matching <code>re</code> </td></tr>
+<tr><td><code><font color=#808080>(?!re)</font></code></td><td>before text not matching <code>re</code> </td></tr>
+<tr><td><code><font color=#808080>(?&lt;=re)</font></code></td><td>after text matching <code>re</code> </td></tr>
+<tr><td><code><font color=#808080>(?&lt;!re)</font></code></td><td>after text not matching <code>re</code> </td></tr>
+<tr><td><code><font color=#808080>re&amp;</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>re@=</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>re@!</font></code></td><td>before text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>re@&lt;=</font></code></td><td>after text matching <code>re</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>re@&lt;!</font></code></td><td>after text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\zs</font></code></td><td>sets start of match (= \K) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\ze</font></code></td><td>sets end of match <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%^</font></code></td><td>beginning of file <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%$</font></code></td><td>end of file <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%V</font></code></td><td>on screen <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%#</font></code></td><td>cursor position <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%'m</font></code></td><td>mark <code>m</code> position <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%23l</font></code></td><td>in line 23 <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%23c</font></code></td><td>in column 23 <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%23v</font></code></td><td>in virtual column 23 <font size=-2>VIM</font></td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Escape sequences:</b></td></tr>
+<tr><td><code>\a</code></td><td>bell (≡ <code>\007</code>)</td></tr>
+<tr><td><code>\f</code></td><td>form feed (≡ <code>\014</code>)</td></tr>
+<tr><td><code>\t</code></td><td>horizontal tab (≡ <code>\011</code>)</td></tr>
+<tr><td><code>\n</code></td><td>newline (≡ <code>\012</code>)</td></tr>
+<tr><td><code>\r</code></td><td>carriage return (≡ <code>\015</code>)</td></tr>
+<tr><td><code>\v</code></td><td>vertical tab character (≡ <code>\013</code>)</td></tr>
+<tr><td><code>\*</code></td><td>literal <code>*</code>, for any punctuation character <code>*</code></td></tr>
+<tr><td><code>\123</code></td><td>octal character code (up to three digits)</td></tr>
+<tr><td><code>\x7F</code></td><td>hex character code (exactly two digits)</td></tr>
+<tr><td><code>\x{10FFFF}</code></td><td>hex character code</td></tr>
+<tr><td><code>\C</code></td><td>match a single byte even in UTF-8 mode</td></tr>
+<tr><td><code>\Q...\E</code></td><td>literal text <code>...</code> even if <code>...</code> has punctuation</td></tr>
+<tr><td></td></tr>
+<tr><td><code><font color=#808080>\1</font></code></td><td>backreference </td></tr>
+<tr><td><code><font color=#808080>\b</font></code></td><td>backspace (use <code>\010</code>)</td></tr>
+<tr><td><code><font color=#808080>\cK</font></code></td><td>control char ^K (use <code>\001</code> etc)</td></tr>
+<tr><td><code><font color=#808080>\e</font></code></td><td>escape (use <code>\033</code>)</td></tr>
+<tr><td><code><font color=#808080>\g1</font></code></td><td>backreference </td></tr>
+<tr><td><code><font color=#808080>\g{1}</font></code></td><td>backreference </td></tr>
+<tr><td><code><font color=#808080>\g{+1}</font></code></td><td>backreference </td></tr>
+<tr><td><code><font color=#808080>\g{-1}</font></code></td><td>backreference </td></tr>
+<tr><td><code><font color=#808080>\g{name}</font></code></td><td>named backreference </td></tr>
+<tr><td><code><font color=#808080>\g&lt;name&gt;</font></code></td><td>subroutine call </td></tr>
+<tr><td><code><font color=#808080>\g'name'</font></code></td><td>subroutine call </td></tr>
+<tr><td><code><font color=#808080>\k&lt;name&gt;</font></code></td><td>named backreference </td></tr>
+<tr><td><code><font color=#808080>\k'name'</font></code></td><td>named backreference </td></tr>
+<tr><td><code><font color=#808080>\lX</font></code></td><td>lowercase <code>X</code> </td></tr>
+<tr><td><code><font color=#808080>\ux</font></code></td><td>uppercase <code>x</code> </td></tr>
+<tr><td><code><font color=#808080>\L...\E</font></code></td><td>lowercase text <code>...</code> </td></tr>
+<tr><td><code><font color=#808080>\K</font></code></td><td>reset beginning of <code>$0</code> </td></tr>
+<tr><td><code><font color=#808080>\N{name}</font></code></td><td>named Unicode character </td></tr>
+<tr><td><code><font color=#808080>\R</font></code></td><td>line break </td></tr>
+<tr><td><code><font color=#808080>\U...\E</font></code></td><td>upper case text <code>...</code> </td></tr>
+<tr><td><code><font color=#808080>\X</font></code></td><td>extended Unicode sequence </td></tr>
+<tr><td></td></tr>
+<tr><td><code><font color=#808080>\%d123</font></code></td><td>decimal character 123 <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%xFF</font></code></td><td>hex character FF <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%o123</font></code></td><td>octal character 123 <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%u1234</font></code></td><td>Unicode character 0x1234 <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\%U12345678</font></code></td><td>Unicode character 0x12345678 <font size=-2>VIM</font></td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Character class elements:</b></td></tr>
+<tr><td><code>x</code></td><td>single character</td></tr>
+<tr><td><code>A-Z</code></td><td>character range (inclusive)</td></tr>
+<tr><td><code>\d</code></td><td>Perl character class</td></tr>
+<tr><td><code>[:foo:]</code></td><td>ASCII character class <code>foo</code></td></tr>
+<tr><td><code>\p{Foo}</code></td><td>Unicode character class <code>Foo</code></td></tr>
+<tr><td><code>\pF</code></td><td>Unicode character class <code>F</code> (one-letter name)</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Named character classes as character class elements:</b></td></tr>
+<tr><td><code>[\d]</code></td><td>digits (≡ <code>\d</code>)</td></tr>
+<tr><td><code>[^\d]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
+<tr><td><code>[\D]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
+<tr><td><code>[^\D]</code></td><td>not not digits (≡ <code>\d</code>)</td></tr>
+<tr><td><code>[[:name:]]</code></td><td>named ASCII class inside character class (≡ <code>[:name:]</code>)</td></tr>
+<tr><td><code>[^[:name:]]</code></td><td>named ASCII class inside negated character class (≡ <code>[:^name:]</code>)</td></tr>
+<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
+<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Perl character classes (all ASCII-only):</b></td></tr>
+<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
+<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
+<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
+<tr><td><code>\S</code></td><td>not whitespace (≡ <code>[^\t\n\f\r ]</code>)</td></tr>
+<tr><td><code>\w</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
+<tr><td><code>\W</code></td><td>not word characters (≡ <code>[^0-9A-Za-z_]</code>)</td></tr>
+<tr><td></td></tr>
+<tr><td><code><font color=#808080>\h</font></code></td><td>horizontal space </td></tr>
+<tr><td><code><font color=#808080>\H</font></code></td><td>not horizontal space </td></tr>
+<tr><td><code><font color=#808080>\v</font></code></td><td>vertical space </td></tr>
+<tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>ASCII character classes:</b></td></tr>
+<tr><td><code>[[:alnum:]]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr>
+<tr><td><code>[[:alpha:]]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr>
+<tr><td><code>[[:ascii:]]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr>
+<tr><td><code>[[:blank:]]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr>
+<tr><td><code>[[:cntrl:]]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr>
+<tr><td><code>[[:digit:]]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
+<tr><td><code>[[:graph:]]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&amp;'()*+,\-./:;&lt;=&gt;?@[\\\]^_`{|}~]</code>)</td></tr>
+<tr><td><code>[[:lower:]]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr>
+<tr><td><code>[[:print:]]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr>
+<tr><td><code>[[:punct:]]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr>
+<tr><td><code>[[:space:]]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr>
+<tr><td><code>[[:upper:]]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr>
+<tr><td><code>[[:word:]]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
+<tr><td><code>[[:xdigit:]]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr>
+<tr><td><code>C</code></td><td>other</td></tr>
+<tr><td><code>Cc</code></td><td>control</td></tr>
+<tr><td><code>Cf</code></td><td>format</td></tr>
+<tr><td><code><font color=#808080>Cn</font></code></td><td>unassigned code points </td></tr>
+<tr><td><code>Co</code></td><td>private use</td></tr>
+<tr><td><code>Cs</code></td><td>surrogate</td></tr>
+<tr><td><code>L</code></td><td>letter</td></tr>
+<tr><td><code><font color=#808080>LC</font></code></td><td>cased letter </td></tr>
+<tr><td><code><font color=#808080>L&amp;</font></code></td><td>cased letter </td></tr>
+<tr><td><code>Ll</code></td><td>lowercase letter</td></tr>
+<tr><td><code>Lm</code></td><td>modifier letter</td></tr>
+<tr><td><code>Lo</code></td><td>other letter</td></tr>
+<tr><td><code>Lt</code></td><td>titlecase letter</td></tr>
+<tr><td><code>Lu</code></td><td>uppercase letter</td></tr>
+<tr><td><code>M</code></td><td>mark</td></tr>
+<tr><td><code>Mc</code></td><td>spacing mark</td></tr>
+<tr><td><code>Me</code></td><td>enclosing mark</td></tr>
+<tr><td><code>Mn</code></td><td>non-spacing mark</td></tr>
+<tr><td><code>N</code></td><td>number</td></tr>
+<tr><td><code>Nd</code></td><td>decimal number</td></tr>
+<tr><td><code>Nl</code></td><td>letter number</td></tr>
+<tr><td><code>No</code></td><td>other number</td></tr>
+<tr><td><code>P</code></td><td>punctuation</td></tr>
+<tr><td><code>Pc</code></td><td>connector punctuation</td></tr>
+<tr><td><code>Pd</code></td><td>dash punctuation</td></tr>
+<tr><td><code>Pe</code></td><td>close punctuation</td></tr>
+<tr><td><code>Pf</code></td><td>final punctuation</td></tr>
+<tr><td><code>Pi</code></td><td>initial punctuation</td></tr>
+<tr><td><code>Po</code></td><td>other punctuation</td></tr>
+<tr><td><code>Ps</code></td><td>open punctuation</td></tr>
+<tr><td><code>S</code></td><td>symbol</td></tr>
+<tr><td><code>Sc</code></td><td>currency symbol</td></tr>
+<tr><td><code>Sk</code></td><td>modifier symbol</td></tr>
+<tr><td><code>Sm</code></td><td>math symbol</td></tr>
+<tr><td><code>So</code></td><td>other symbol</td></tr>
+<tr><td><code>Z</code></td><td>separator</td></tr>
+<tr><td><code>Zl</code></td><td>line separator</td></tr>
+<tr><td><code>Zp</code></td><td>paragraph separator</td></tr>
+<tr><td><code>Zs</code></td><td>space separator</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
+<tr><td colspan=2>Adlam</td></tr>
+<tr><td colspan=2>Ahom</td></tr>
+<tr><td colspan=2>Anatolian_Hieroglyphs</td></tr>
+<tr><td colspan=2>Arabic</td></tr>
+<tr><td colspan=2>Armenian</td></tr>
+<tr><td colspan=2>Avestan</td></tr>
+<tr><td colspan=2>Balinese</td></tr>
+<tr><td colspan=2>Bamum</td></tr>
+<tr><td colspan=2>Bassa_Vah</td></tr>
+<tr><td colspan=2>Batak</td></tr>
+<tr><td colspan=2>Bengali</td></tr>
+<tr><td colspan=2>Bhaiksuki</td></tr>
+<tr><td colspan=2>Bopomofo</td></tr>
+<tr><td colspan=2>Brahmi</td></tr>
+<tr><td colspan=2>Braille</td></tr>
+<tr><td colspan=2>Buginese</td></tr>
+<tr><td colspan=2>Buhid</td></tr>
+<tr><td colspan=2>Canadian_Aboriginal</td></tr>
+<tr><td colspan=2>Carian</td></tr>
+<tr><td colspan=2>Caucasian_Albanian</td></tr>
+<tr><td colspan=2>Chakma</td></tr>
+<tr><td colspan=2>Cham</td></tr>
+<tr><td colspan=2>Cherokee</td></tr>
+<tr><td colspan=2>Chorasmian</td></tr>
+<tr><td colspan=2>Common</td></tr>
+<tr><td colspan=2>Coptic</td></tr>
+<tr><td colspan=2>Cuneiform</td></tr>
+<tr><td colspan=2>Cypriot</td></tr>
+<tr><td colspan=2>Cypro_Minoan</td></tr>
+<tr><td colspan=2>Cyrillic</td></tr>
+<tr><td colspan=2>Deseret</td></tr>
+<tr><td colspan=2>Devanagari</td></tr>
+<tr><td colspan=2>Dives_Akuru</td></tr>
+<tr><td colspan=2>Dogra</td></tr>
+<tr><td colspan=2>Duployan</td></tr>
+<tr><td colspan=2>Egyptian_Hieroglyphs</td></tr>
+<tr><td colspan=2>Elbasan</td></tr>
+<tr><td colspan=2>Elymaic</td></tr>
+<tr><td colspan=2>Ethiopic</td></tr>
+<tr><td colspan=2>Georgian</td></tr>
+<tr><td colspan=2>Glagolitic</td></tr>
+<tr><td colspan=2>Gothic</td></tr>
+<tr><td colspan=2>Grantha</td></tr>
+<tr><td colspan=2>Greek</td></tr>
+<tr><td colspan=2>Gujarati</td></tr>
+<tr><td colspan=2>Gunjala_Gondi</td></tr>
+<tr><td colspan=2>Gurmukhi</td></tr>
+<tr><td colspan=2>Han</td></tr>
+<tr><td colspan=2>Hangul</td></tr>
+<tr><td colspan=2>Hanifi_Rohingya</td></tr>
+<tr><td colspan=2>Hanunoo</td></tr>
+<tr><td colspan=2>Hatran</td></tr>
+<tr><td colspan=2>Hebrew</td></tr>
+<tr><td colspan=2>Hiragana</td></tr>
+<tr><td colspan=2>Imperial_Aramaic</td></tr>
+<tr><td colspan=2>Inherited</td></tr>
+<tr><td colspan=2>Inscriptional_Pahlavi</td></tr>
+<tr><td colspan=2>Inscriptional_Parthian</td></tr>
+<tr><td colspan=2>Javanese</td></tr>
+<tr><td colspan=2>Kaithi</td></tr>
+<tr><td colspan=2>Kannada</td></tr>
+<tr><td colspan=2>Katakana</td></tr>
+<tr><td colspan=2>Kawi</td></tr>
+<tr><td colspan=2>Kayah_Li</td></tr>
+<tr><td colspan=2>Kharoshthi</td></tr>
+<tr><td colspan=2>Khitan_Small_Script</td></tr>
+<tr><td colspan=2>Khmer</td></tr>
+<tr><td colspan=2>Khojki</td></tr>
+<tr><td colspan=2>Khudawadi</td></tr>
+<tr><td colspan=2>Lao</td></tr>
+<tr><td colspan=2>Latin</td></tr>
+<tr><td colspan=2>Lepcha</td></tr>
+<tr><td colspan=2>Limbu</td></tr>
+<tr><td colspan=2>Linear_A</td></tr>
+<tr><td colspan=2>Linear_B</td></tr>
+<tr><td colspan=2>Lisu</td></tr>
+<tr><td colspan=2>Lycian</td></tr>
+<tr><td colspan=2>Lydian</td></tr>
+<tr><td colspan=2>Mahajani</td></tr>
+<tr><td colspan=2>Makasar</td></tr>
+<tr><td colspan=2>Malayalam</td></tr>
+<tr><td colspan=2>Mandaic</td></tr>
+<tr><td colspan=2>Manichaean</td></tr>
+<tr><td colspan=2>Marchen</td></tr>
+<tr><td colspan=2>Masaram_Gondi</td></tr>
+<tr><td colspan=2>Medefaidrin</td></tr>
+<tr><td colspan=2>Meetei_Mayek</td></tr>
+<tr><td colspan=2>Mende_Kikakui</td></tr>
+<tr><td colspan=2>Meroitic_Cursive</td></tr>
+<tr><td colspan=2>Meroitic_Hieroglyphs</td></tr>
+<tr><td colspan=2>Miao</td></tr>
+<tr><td colspan=2>Modi</td></tr>
+<tr><td colspan=2>Mongolian</td></tr>
+<tr><td colspan=2>Mro</td></tr>
+<tr><td colspan=2>Multani</td></tr>
+<tr><td colspan=2>Myanmar</td></tr>
+<tr><td colspan=2>Nabataean</td></tr>
+<tr><td colspan=2>Nag_Mundari</td></tr>
+<tr><td colspan=2>Nandinagari</td></tr>
+<tr><td colspan=2>New_Tai_Lue</td></tr>
+<tr><td colspan=2>Newa</td></tr>
+<tr><td colspan=2>Nko</td></tr>
+<tr><td colspan=2>Nushu</td></tr>
+<tr><td colspan=2>Nyiakeng_Puachue_Hmong</td></tr>
+<tr><td colspan=2>Ogham</td></tr>
+<tr><td colspan=2>Ol_Chiki</td></tr>
+<tr><td colspan=2>Old_Hungarian</td></tr>
+<tr><td colspan=2>Old_Italic</td></tr>
+<tr><td colspan=2>Old_North_Arabian</td></tr>
+<tr><td colspan=2>Old_Permic</td></tr>
+<tr><td colspan=2>Old_Persian</td></tr>
+<tr><td colspan=2>Old_Sogdian</td></tr>
+<tr><td colspan=2>Old_South_Arabian</td></tr>
+<tr><td colspan=2>Old_Turkic</td></tr>
+<tr><td colspan=2>Old_Uyghur</td></tr>
+<tr><td colspan=2>Oriya</td></tr>
+<tr><td colspan=2>Osage</td></tr>
+<tr><td colspan=2>Osmanya</td></tr>
+<tr><td colspan=2>Pahawh_Hmong</td></tr>
+<tr><td colspan=2>Palmyrene</td></tr>
+<tr><td colspan=2>Pau_Cin_Hau</td></tr>
+<tr><td colspan=2>Phags_Pa</td></tr>
+<tr><td colspan=2>Phoenician</td></tr>
+<tr><td colspan=2>Psalter_Pahlavi</td></tr>
+<tr><td colspan=2>Rejang</td></tr>
+<tr><td colspan=2>Runic</td></tr>
+<tr><td colspan=2>Samaritan</td></tr>
+<tr><td colspan=2>Saurashtra</td></tr>
+<tr><td colspan=2>Sharada</td></tr>
+<tr><td colspan=2>Shavian</td></tr>
+<tr><td colspan=2>Siddham</td></tr>
+<tr><td colspan=2>SignWriting</td></tr>
+<tr><td colspan=2>Sinhala</td></tr>
+<tr><td colspan=2>Sogdian</td></tr>
+<tr><td colspan=2>Sora_Sompeng</td></tr>
+<tr><td colspan=2>Soyombo</td></tr>
+<tr><td colspan=2>Sundanese</td></tr>
+<tr><td colspan=2>Syloti_Nagri</td></tr>
+<tr><td colspan=2>Syriac</td></tr>
+<tr><td colspan=2>Tagalog</td></tr>
+<tr><td colspan=2>Tagbanwa</td></tr>
+<tr><td colspan=2>Tai_Le</td></tr>
+<tr><td colspan=2>Tai_Tham</td></tr>
+<tr><td colspan=2>Tai_Viet</td></tr>
+<tr><td colspan=2>Takri</td></tr>
+<tr><td colspan=2>Tamil</td></tr>
+<tr><td colspan=2>Tangsa</td></tr>
+<tr><td colspan=2>Tangut</td></tr>
+<tr><td colspan=2>Telugu</td></tr>
+<tr><td colspan=2>Thaana</td></tr>
+<tr><td colspan=2>Thai</td></tr>
+<tr><td colspan=2>Tibetan</td></tr>
+<tr><td colspan=2>Tifinagh</td></tr>
+<tr><td colspan=2>Tirhuta</td></tr>
+<tr><td colspan=2>Toto</td></tr>
+<tr><td colspan=2>Ugaritic</td></tr>
+<tr><td colspan=2>Vai</td></tr>
+<tr><td colspan=2>Vithkuqi</td></tr>
+<tr><td colspan=2>Wancho</td></tr>
+<tr><td colspan=2>Warang_Citi</td></tr>
+<tr><td colspan=2>Yezidi</td></tr>
+<tr><td colspan=2>Yi</td></tr>
+<tr><td colspan=2>Zanabazar_Square</td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Vim character classes:</b></td></tr>
+<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\I</font></code></td><td><code>\i</code> except digits <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\k</font></code></td><td>keyword character <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\K</font></code></td><td><code>\k</code> except digits <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\f</font></code></td><td>file name character <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\F</font></code></td><td><code>\f</code> except digits <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\p</font></code></td><td>printable character <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\P</font></code></td><td><code>\p</code> except digits <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\s</font></code></td><td>whitespace character (≡ <code>[ \t]</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\S</font></code></td><td>non-white space character (≡ <code>[^ \t]</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code>\D</code></td><td>not <code>\d</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\x</font></code></td><td>hex digits (≡ <code>[0-9A-Fa-f]</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\X</font></code></td><td>not <code>\x</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\o</font></code></td><td>octal digits (≡ <code>[0-7]</code>) <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\O</font></code></td><td>not <code>\o</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code>\w</code></td><td>word character <font size=-2>VIM</font></td></tr>
+<tr><td><code>\W</code></td><td>not <code>\w</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\h</font></code></td><td>head of word character <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\H</font></code></td><td>not <code>\h</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\a</font></code></td><td>alphabetic <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\A</font></code></td><td>not <code>\a</code> <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\l</font></code></td><td>lowercase <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\L</font></code></td><td>not lowercase <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\u</font></code></td><td>uppercase <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\U</font></code></td><td>not uppercase <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\_x</font></code></td><td><code>\x</code> plus newline, for any <code>x</code> <font size=-2>VIM</font></td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Vim flags:</b></td></tr>
+<tr><td><code><font color=#808080>\c</font></code></td><td>ignore case <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\C</font></code></td><td>match case <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\m</font></code></td><td>magic <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\M</font></code></td><td>nomagic <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\v</font></code></td><td>verymagic <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\V</font></code></td><td>verynomagic <font size=-2>VIM</font></td></tr>
+<tr><td><code><font color=#808080>\Z</font></code></td><td>ignore differences in Unicode combining characters <font size=-2>VIM</font></td></tr>
+<tr><td></td></tr>
+<tr><td colspan=2><b>Magic:</b></td></tr>
+<tr><td><code><font color=#808080>(?{code})</font></code></td><td>arbitrary Perl code <font size=-2>PERL</font></td></tr>
+<tr><td><code><font color=#808080>(??{code})</font></code></td><td>postponed arbitrary Perl code <font size=-2>PERL</font></td></tr>
+<tr><td><code><font color=#808080>(?n)</font></code></td><td>recursive call to regexp capturing group <code>n</code> </td></tr>
+<tr><td><code><font color=#808080>(?+n)</font></code></td><td>recursive call to relative group <code>+n</code> </td></tr>
+<tr><td><code><font color=#808080>(?-n)</font></code></td><td>recursive call to relative group <code>-n</code> </td></tr>
+<tr><td><code><font color=#808080>(?C)</font></code></td><td>PCRE callout <font size=-2>PCRE</font></td></tr>
+<tr><td><code><font color=#808080>(?R)</font></code></td><td>recursive call to entire regexp (≡ <code>(?0)</code>) </td></tr>
+<tr><td><code><font color=#808080>(?&amp;name)</font></code></td><td>recursive call to named group </td></tr>
+<tr><td><code><font color=#808080>(?P=name)</font></code></td><td>named backreference </td></tr>
+<tr><td><code><font color=#808080>(?P&gt;name)</font></code></td><td>recursive call to named group </td></tr>
+<tr><td><code><font color=#808080>(?(cond)true|false)</font></code></td><td>conditional branch </td></tr>
+<tr><td><code><font color=#808080>(?(cond)true)</font></code></td><td>conditional branch </td></tr>
+<tr><td><code><font color=#808080>(*ACCEPT)</font></code></td><td>make regexps more like Prolog </td></tr>
+<tr><td><code><font color=#808080>(*COMMIT)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*F)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*FAIL)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*MARK)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*PRUNE)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*SKIP)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*THEN)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*ANY)</font></code></td><td>set newline convention </td></tr>
+<tr><td><code><font color=#808080>(*ANYCRLF)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*CR)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*CRLF)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*LF)</font></code></td><td></td></tr>
+<tr><td><code><font color=#808080>(*BSR_ANYCRLF)</font></code></td><td>set \R convention <font size=-2>PCRE</font></td></tr>
+<tr><td><code><font color=#808080>(*BSR_UNICODE)</font></code></td><td> <font size=-2>PCRE</font></td></tr>
+<tr><td></td></tr>
+</table>
+</body>
+</html>
diff --git a/third_party/re2/src/doc/syntax.txt b/third_party/re2/src/doc/syntax.txt
new file mode 100644
index 000000000..6070efd96
--- /dev/null
+++ b/third_party/re2/src/doc/syntax.txt
@@ -0,0 +1,463 @@
+RE2 regular expression syntax reference
+---------------------------------------
+
+Single characters:
+. any character, possibly including newline (s=true)
+[xyz] character class
+[^xyz] negated character class
+\d Perl character class
+\D negated Perl character class
+[[:alpha:]] ASCII character class
+[[:^alpha:]] negated ASCII character class
+\pN Unicode character class (one-letter name)
+\p{Greek} Unicode character class
+\PN negated Unicode character class (one-letter name)
+\P{Greek} negated Unicode character class
+
+Composites:
+xy «x» followed by «y»
+x|y «x» or «y» (prefer «x»)
+
+Repetitions:
+x* zero or more «x», prefer more
+x+ one or more «x», prefer more
+x? zero or one «x», prefer one
+x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more
+x{n,} «n» or more «x», prefer more
+x{n} exactly «n» «x»
+x*? zero or more «x», prefer fewer
+x+? one or more «x», prefer fewer
+x?? zero or one «x», prefer zero
+x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer
+x{n,}? «n» or more «x», prefer fewer
+x{n}? exactly «n» «x»
+x{} (== x*) NOT SUPPORTED vim
+x{-} (== x*?) NOT SUPPORTED vim
+x{-n} (== x{n}?) NOT SUPPORTED vim
+x= (== x?) NOT SUPPORTED vim
+
+Implementation restriction: The counting forms «x{n,m}», «x{n,}», and «x{n}»
+reject forms that create a minimum or maximum repetition count above 1000.
+Unlimited repetitions are not subject to this restriction.
+
+Possessive repetitions:
+x*+ zero or more «x», possessive NOT SUPPORTED
+x++ one or more «x», possessive NOT SUPPORTED
+x?+ zero or one «x», possessive NOT SUPPORTED
+x{n,m}+ «n» or ... or «m» «x», possessive NOT SUPPORTED
+x{n,}+ «n» or more «x», possessive NOT SUPPORTED
+x{n}+ exactly «n» «x», possessive NOT SUPPORTED
+
+Grouping:
+(re) numbered capturing group (submatch)
+(?P<name>re) named & numbered capturing group (submatch)
+(?<name>re) named & numbered capturing group (submatch)
+(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
+(?:re) non-capturing group
+(?flags) set flags within current group; non-capturing
+(?flags:re) set flags during re; non-capturing
+(?#text) comment NOT SUPPORTED
+(?|x|y|z) branch numbering reset NOT SUPPORTED
+(?>re) possessive match of «re» NOT SUPPORTED
+re@> possessive match of «re» NOT SUPPORTED vim
+%(re) non-capturing group NOT SUPPORTED vim
+
+Flags:
+i case-insensitive (default false)
+m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false)
+s let «.» match «\n» (default false)
+U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false)
+Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»).
+
+Empty strings:
+^ at beginning of text or line («m»=true)
+$ at end of text (like «\z» not «\Z») or line («m»=true)
+\A at beginning of text
+\b at ASCII word boundary («\w» on one side and «\W», «\A», or «\z» on the other)
+\B not at ASCII word boundary
+\G at beginning of subtext being searched NOT SUPPORTED pcre
+\G at end of last match NOT SUPPORTED perl
+\Z at end of text, or before newline at end of text NOT SUPPORTED
+\z at end of text
+(?=re) before text matching «re» NOT SUPPORTED
+(?!re) before text not matching «re» NOT SUPPORTED
+(?<=re) after text matching «re» NOT SUPPORTED
+(?<!re) after text not matching «re» NOT SUPPORTED
+re& before text matching «re» NOT SUPPORTED vim
+re@= before text matching «re» NOT SUPPORTED vim
+re@! before text not matching «re» NOT SUPPORTED vim
+re@<= after text matching «re» NOT SUPPORTED vim
+re@<! after text not matching «re» NOT SUPPORTED vim
+\zs sets start of match (= \K) NOT SUPPORTED vim
+\ze sets end of match NOT SUPPORTED vim
+\%^ beginning of file NOT SUPPORTED vim
+\%$ end of file NOT SUPPORTED vim
+\%V on screen NOT SUPPORTED vim
+\%# cursor position NOT SUPPORTED vim
+\%'m mark «m» position NOT SUPPORTED vim
+\%23l in line 23 NOT SUPPORTED vim
+\%23c in column 23 NOT SUPPORTED vim
+\%23v in virtual column 23 NOT SUPPORTED vim
+
+Escape sequences:
+\a bell (== \007)
+\f form feed (== \014)
+\t horizontal tab (== \011)
+\n newline (== \012)
+\r carriage return (== \015)
+\v vertical tab character (== \013)
+\* literal «*», for any punctuation character «*»
+\123 octal character code (up to three digits)
+\x7F hex character code (exactly two digits)
+\x{10FFFF} hex character code
+\C match a single byte even in UTF-8 mode
+\Q...\E literal text «...» even if «...» has punctuation
+
+\1 backreference NOT SUPPORTED
+\b backspace NOT SUPPORTED (use «\010»)
+\cK control char ^K NOT SUPPORTED (use «\001» etc)
+\e escape NOT SUPPORTED (use «\033»)
+\g1 backreference NOT SUPPORTED
+\g{1} backreference NOT SUPPORTED
+\g{+1} backreference NOT SUPPORTED
+\g{-1} backreference NOT SUPPORTED
+\g{name} named backreference NOT SUPPORTED
+\g<name> subroutine call NOT SUPPORTED
+\g'name' subroutine call NOT SUPPORTED
+\k<name> named backreference NOT SUPPORTED
+\k'name' named backreference NOT SUPPORTED
+\lX lowercase «X» NOT SUPPORTED
+\ux uppercase «x» NOT SUPPORTED
+\L...\E lowercase text «...» NOT SUPPORTED
+\K reset beginning of «$0» NOT SUPPORTED
+\N{name} named Unicode character NOT SUPPORTED
+\R line break NOT SUPPORTED
+\U...\E upper case text «...» NOT SUPPORTED
+\X extended Unicode sequence NOT SUPPORTED
+
+\%d123 decimal character 123 NOT SUPPORTED vim
+\%xFF hex character FF NOT SUPPORTED vim
+\%o123 octal character 123 NOT SUPPORTED vim
+\%u1234 Unicode character 0x1234 NOT SUPPORTED vim
+\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim
+
+Character class elements:
+x single character
+A-Z character range (inclusive)
+\d Perl character class
+[:foo:] ASCII character class «foo»
+\p{Foo} Unicode character class «Foo»
+\pF Unicode character class «F» (one-letter name)
+
+Named character classes as character class elements:
+[\d] digits (== \d)
+[^\d] not digits (== \D)
+[\D] not digits (== \D)
+[^\D] not not digits (== \d)
+[[:name:]] named ASCII class inside character class (== [:name:])
+[^[:name:]] named ASCII class inside negated character class (== [:^name:])
+[\p{Name}] named Unicode property inside character class (== \p{Name})
+[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
+
+Perl character classes (all ASCII-only):
+\d digits (== [0-9])
+\D not digits (== [^0-9])
+\s whitespace (== [\t\n\f\r ])
+\S not whitespace (== [^\t\n\f\r ])
+\w word characters (== [0-9A-Za-z_])
+\W not word characters (== [^0-9A-Za-z_])
+
+\h horizontal space NOT SUPPORTED
+\H not horizontal space NOT SUPPORTED
+\v vertical space NOT SUPPORTED
+\V not vertical space NOT SUPPORTED
+
+ASCII character classes:
+[[:alnum:]] alphanumeric (== [0-9A-Za-z])
+[[:alpha:]] alphabetic (== [A-Za-z])
+[[:ascii:]] ASCII (== [\x00-\x7F])
+[[:blank:]] blank (== [\t ])
+[[:cntrl:]] control (== [\x00-\x1F\x7F])
+[[:digit:]] digits (== [0-9])
+[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
+[[:lower:]] lower case (== [a-z])
+[[:print:]] printable (== [ -~] == [ [:graph:]])
+[[:punct:]] punctuation (== [!-/:-@[-`{-~])
+[[:space:]] whitespace (== [\t\n\v\f\r ])
+[[:upper:]] upper case (== [A-Z])
+[[:word:]] word characters (== [0-9A-Za-z_])
+[[:xdigit:]] hex digit (== [0-9A-Fa-f])
+
+Unicode character class names--general category:
+C other
+Cc control
+Cf format
+Cn unassigned code points NOT SUPPORTED
+Co private use
+Cs surrogate
+L letter
+LC cased letter NOT SUPPORTED
+L& cased letter NOT SUPPORTED
+Ll lowercase letter
+Lm modifier letter
+Lo other letter
+Lt titlecase letter
+Lu uppercase letter
+M mark
+Mc spacing mark
+Me enclosing mark
+Mn non-spacing mark
+N number
+Nd decimal number
+Nl letter number
+No other number
+P punctuation
+Pc connector punctuation
+Pd dash punctuation
+Pe close punctuation
+Pf final punctuation
+Pi initial punctuation
+Po other punctuation
+Ps open punctuation
+S symbol
+Sc currency symbol
+Sk modifier symbol
+Sm math symbol
+So other symbol
+Z separator
+Zl line separator
+Zp paragraph separator
+Zs space separator
+
+Unicode character class names--scripts:
+Adlam
+Ahom
+Anatolian_Hieroglyphs
+Arabic
+Armenian
+Avestan
+Balinese
+Bamum
+Bassa_Vah
+Batak
+Bengali
+Bhaiksuki
+Bopomofo
+Brahmi
+Braille
+Buginese
+Buhid
+Canadian_Aboriginal
+Carian
+Caucasian_Albanian
+Chakma
+Cham
+Cherokee
+Chorasmian
+Common
+Coptic
+Cuneiform
+Cypriot
+Cypro_Minoan
+Cyrillic
+Deseret
+Devanagari
+Dives_Akuru
+Dogra
+Duployan
+Egyptian_Hieroglyphs
+Elbasan
+Elymaic
+Ethiopic
+Georgian
+Glagolitic
+Gothic
+Grantha
+Greek
+Gujarati
+Gunjala_Gondi
+Gurmukhi
+Han
+Hangul
+Hanifi_Rohingya
+Hanunoo
+Hatran
+Hebrew
+Hiragana
+Imperial_Aramaic
+Inherited
+Inscriptional_Pahlavi
+Inscriptional_Parthian
+Javanese
+Kaithi
+Kannada
+Katakana
+Kawi
+Kayah_Li
+Kharoshthi
+Khitan_Small_Script
+Khmer
+Khojki
+Khudawadi
+Lao
+Latin
+Lepcha
+Limbu
+Linear_A
+Linear_B
+Lisu
+Lycian
+Lydian
+Mahajani
+Makasar
+Malayalam
+Mandaic
+Manichaean
+Marchen
+Masaram_Gondi
+Medefaidrin
+Meetei_Mayek
+Mende_Kikakui
+Meroitic_Cursive
+Meroitic_Hieroglyphs
+Miao
+Modi
+Mongolian
+Mro
+Multani
+Myanmar
+Nabataean
+Nag_Mundari
+Nandinagari
+New_Tai_Lue
+Newa
+Nko
+Nushu
+Nyiakeng_Puachue_Hmong
+Ogham
+Ol_Chiki
+Old_Hungarian
+Old_Italic
+Old_North_Arabian
+Old_Permic
+Old_Persian
+Old_Sogdian
+Old_South_Arabian
+Old_Turkic
+Old_Uyghur
+Oriya
+Osage
+Osmanya
+Pahawh_Hmong
+Palmyrene
+Pau_Cin_Hau
+Phags_Pa
+Phoenician
+Psalter_Pahlavi
+Rejang
+Runic
+Samaritan
+Saurashtra
+Sharada
+Shavian
+Siddham
+SignWriting
+Sinhala
+Sogdian
+Sora_Sompeng
+Soyombo
+Sundanese
+Syloti_Nagri
+Syriac
+Tagalog
+Tagbanwa
+Tai_Le
+Tai_Tham
+Tai_Viet
+Takri
+Tamil
+Tangsa
+Tangut
+Telugu
+Thaana
+Thai
+Tibetan
+Tifinagh
+Tirhuta
+Toto
+Ugaritic
+Vai
+Vithkuqi
+Wancho
+Warang_Citi
+Yezidi
+Yi
+Zanabazar_Square
+
+Vim character classes:
+\i identifier character NOT SUPPORTED vim
+\I «\i» except digits NOT SUPPORTED vim
+\k keyword character NOT SUPPORTED vim
+\K «\k» except digits NOT SUPPORTED vim
+\f file name character NOT SUPPORTED vim
+\F «\f» except digits NOT SUPPORTED vim
+\p printable character NOT SUPPORTED vim
+\P «\p» except digits NOT SUPPORTED vim
+\s whitespace character (== [ \t]) NOT SUPPORTED vim
+\S non-white space character (== [^ \t]) NOT SUPPORTED vim
+\d digits (== [0-9]) vim
+\D not «\d» vim
+\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim
+\X not «\x» NOT SUPPORTED vim
+\o octal digits (== [0-7]) NOT SUPPORTED vim
+\O not «\o» NOT SUPPORTED vim
+\w word character vim
+\W not «\w» vim
+\h head of word character NOT SUPPORTED vim
+\H not «\h» NOT SUPPORTED vim
+\a alphabetic NOT SUPPORTED vim
+\A not «\a» NOT SUPPORTED vim
+\l lowercase NOT SUPPORTED vim
+\L not lowercase NOT SUPPORTED vim
+\u uppercase NOT SUPPORTED vim
+\U not uppercase NOT SUPPORTED vim
+\_x «\x» plus newline, for any «x» NOT SUPPORTED vim
+
+Vim flags:
+\c ignore case NOT SUPPORTED vim
+\C match case NOT SUPPORTED vim
+\m magic NOT SUPPORTED vim
+\M nomagic NOT SUPPORTED vim
+\v verymagic NOT SUPPORTED vim
+\V verynomagic NOT SUPPORTED vim
+\Z ignore differences in Unicode combining characters NOT SUPPORTED vim
+
+Magic:
+(?{code}) arbitrary Perl code NOT SUPPORTED perl
+(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl
+(?n) recursive call to regexp capturing group «n» NOT SUPPORTED
+(?+n) recursive call to relative group «+n» NOT SUPPORTED
+(?-n) recursive call to relative group «-n» NOT SUPPORTED
+(?C) PCRE callout NOT SUPPORTED pcre
+(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED
+(?&name) recursive call to named group NOT SUPPORTED
+(?P=name) named backreference NOT SUPPORTED
+(?P>name) recursive call to named group NOT SUPPORTED
+(?(cond)true|false) conditional branch NOT SUPPORTED
+(?(cond)true) conditional branch NOT SUPPORTED
+(*ACCEPT) make regexps more like Prolog NOT SUPPORTED
+(*COMMIT) NOT SUPPORTED
+(*F) NOT SUPPORTED
+(*FAIL) NOT SUPPORTED
+(*MARK) NOT SUPPORTED
+(*PRUNE) NOT SUPPORTED
+(*SKIP) NOT SUPPORTED
+(*THEN) NOT SUPPORTED
+(*ANY) set newline convention NOT SUPPORTED
+(*ANYCRLF) NOT SUPPORTED
+(*CR) NOT SUPPORTED
+(*CRLF) NOT SUPPORTED
+(*LF) NOT SUPPORTED
+(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre
+(*BSR_UNICODE) NOT SUPPORTED pcre
+
diff --git a/third_party/re2/src/lib/git/commit-msg.hook b/third_party/re2/src/lib/git/commit-msg.hook
new file mode 100755
index 000000000..985016b5b
--- /dev/null
+++ b/third_party/re2/src/lib/git/commit-msg.hook
@@ -0,0 +1,104 @@
+#!/bin/sh
+# From Gerrit Code Review 2.2.1
+#
+# Part of Gerrit Code Review (http://code.google.com/p/gerrit/)
+#
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+CHANGE_ID_AFTER="Bug|Issue"
+MSG="$1"
+
+# Check for, and add if missing, a unique Change-Id
+#
+# Reads the commit message file named by $MSG. Returns without changing it
+# when the cleaned message is empty or when a Change-Id line is already
+# present; otherwise rewrites the file with a "Change-Id: I<id>" footer line.
+# Leaves the cleaned message in $clean_message, which _gen_ChangeIdInput
+# reads when the id is generated.
+add_ChangeId() {
+ # Cleaned message: stop at the first "diff --git a/" line (that line is
+ # emptied), drop Signed-off-by and "#" lines, then pass through
+ # git stripspace.
+ clean_message=`sed -e '
+ /^diff --git a\/.*/{
+ s///
+ q
+ }
+ /^Signed-off-by:/d
+ /^#/d
+ ' "$MSG" | git stripspace`
+ # Nothing left after cleaning: leave the message file alone.
+ if test -z "$clean_message"
+ then
+ return
+ fi
+
+ # A Change-Id line (matched case-insensitively) already exists.
+ if grep -i '^Change-Id:' "$MSG" >/dev/null
+ then
+ return
+ fi
+
+ id=`_gen_ChangeId`
+ # The perl script below removes the diff part and "#" lines from the
+ # message, scans backwards from the last line to find where the footer
+ # starts, skips the leading footer lines that match $CHANGE_ID_AFTER,
+ # inserts the Change-Id line at that position and writes the result back
+ # to $MSG.
+ perl -e '
+ $MSG = shift;
+ $id = shift;
+ $CHANGE_ID_AFTER = shift;
+
+ undef $/;
+ open(I, $MSG); $_ = <I>; close I;
+ s|^diff --git a/.*||ms;
+ s|^#.*$||mg;
+ exit unless $_;
+
+ @message = split /\n/;
+ $haveFooter = 0;
+ $startFooter = @message;
+ for($line = @message - 1; $line >= 0; $line--) {
+ $_ = $message[$line];
+
+ if (/^[a-zA-Z0-9-]+:/ && !m,^[a-z0-9-]+://,) {
+ $haveFooter++;
+ next;
+ }
+ next if /^[ []/;
+ $startFooter = $line if ($haveFooter && /^\r?$/);
+ last;
+ }
+
+ @footer = @message[$startFooter+1..@message];
+ @message = @message[0..$startFooter];
+ push(@footer, "") unless @footer;
+
+ for ($line = 0; $line < @footer; $line++) {
+ $_ = $footer[$line];
+ next if /^($CHANGE_ID_AFTER):/i;
+ last;
+ }
+ splice(@footer, $line, 0, "Change-Id: I$id");
+
+ $_ = join("\n", @message, @footer);
+ open(O, ">$MSG"); print O; close O;
+ ' "$MSG" "$id" "$CHANGE_ID_AFTER"
+}
+_gen_ChangeIdInput() {
+ echo "tree `git write-tree`"
+ if parent=`git rev-parse HEAD^0 2>/dev/null`
+ then
+ echo "parent $parent"
+ fi
+ echo "author `git var GIT_AUTHOR_IDENT`"
+ echo "committer `git var GIT_COMMITTER_IDENT`"
+ echo
+ printf '%s' "$clean_message"
+}
+# Feeds the output of _gen_ChangeIdInput to git hash-object as a commit
+# object read from stdin and prints the resulting object id.
+_gen_ChangeId() {
+  _gen_ChangeIdInput | git hash-object -t commit --stdin
+}
+
+
+add_ChangeId
diff --git a/third_party/re2/src/libre2.symbols b/third_party/re2/src/libre2.symbols
new file mode 100644
index 000000000..0cab3d94b
--- /dev/null
+++ b/third_party/re2/src/libre2.symbols
@@ -0,0 +1,16 @@
+{
+ global:
+ # re2::RE2*
+ _ZN3re23RE2*;
+ _ZNK3re23RE2*;
+ # re2::operator<<*
+ _ZN3re2ls*;
+ # re2::FilteredRE2*
+ _ZN3re211FilteredRE2*;
+ _ZNK3re211FilteredRE2*;
+ # re2::re2_internal*
+ _ZN3re212re2_internal*;
+ _ZNK3re212re2_internal*;
+ local:
+ *;
+};
diff --git a/third_party/re2/src/libre2.symbols.darwin b/third_party/re2/src/libre2.symbols.darwin
new file mode 100644
index 000000000..754f45cd2
--- /dev/null
+++ b/third_party/re2/src/libre2.symbols.darwin
@@ -0,0 +1,12 @@
+# Linker doesn't like these unmangled:
+# re2::RE2*
+__ZN3re23RE2*
+__ZNK3re23RE2*
+# re2::operator<<*
+__ZN3re2ls*
+# re2::FilteredRE2*
+__ZN3re211FilteredRE2*
+__ZNK3re211FilteredRE2*
+# re2::re2_internal*
+__ZN3re212re2_internal*
+__ZNK3re212re2_internal*
diff --git a/third_party/re2/src/python/BUILD.bazel b/third_party/re2/src/python/BUILD.bazel
new file mode 100644
index 000000000..a05fb6ec7
--- /dev/null
+++ b/third_party/re2/src/python/BUILD.bazel
@@ -0,0 +1,36 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) BUILD file for RE2 Python.
+
+load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
+load("@rules_python//python:defs.bzl", "py_library", "py_test")
+
+# pybind11 extension target _re2, built from _re2.cc and depending on the
+# top-level RE2 library and Abseil strings.
+pybind_extension(
+ name = "_re2",
+ srcs = ["_re2.cc"],
+ deps = [
+ "//:re2",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+# Python library target for re2.py; the _re2.so extension is attached as data
+# and the package directory is added to the import path.
+py_library(
+ name = "re2",
+ srcs = ["re2.py"],
+ data = [":_re2.so"],
+ imports = ["."],
+ visibility = ["//visibility:public"],
+)
+
+# Small-sized test target running re2_test.py against the :re2 library, with
+# absltest and parameterized from abseil-py.
+py_test(
+ name = "re2_test",
+ size = "small",
+ srcs = ["re2_test.py"],
+ deps = [
+ ":re2",
+ "@abseil-py//absl/testing:absltest",
+ "@abseil-py//absl/testing:parameterized",
+ ],
+)
diff --git a/third_party/re2/src/python/LICENSE b/third_party/re2/src/python/LICENSE
new file mode 120000
index 000000000..ea5b60640
--- /dev/null
+++ b/third_party/re2/src/python/LICENSE
@@ -0,0 +1 @@
+../LICENSE \ No newline at end of file
diff --git a/third_party/re2/src/python/README b/third_party/re2/src/python/README
new file mode 100644
index 000000000..782378f6e
--- /dev/null
+++ b/third_party/re2/src/python/README
@@ -0,0 +1 @@
+Building requires Python 3 and pybind11 to be installed on your system.
diff --git a/third_party/re2/src/python/_re2.cc b/third_party/re2/src/python/_re2.cc
new file mode 100644
index 000000000..8564f8a4f
--- /dev/null
+++ b/third_party/re2/src/python/_re2.cc
@@ -0,0 +1,338 @@
+// Copyright 2019 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include "absl/strings/string_view.h"
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+#include "re2/set.h"
+
+#ifdef _WIN32
+#include <basetsd.h>
+#define ssize_t SSIZE_T
+#endif
+
+namespace re2_python {
+
+// This is conventional.
+namespace py = pybind11;
+
+// Views the memory described by a py::buffer_info as an absl::string_view,
+// without copying it. With pybind11, a py::buffer is merely a py::object
+// that supports the buffer interface/protocol; the actual bytes are only
+// reachable through an explicitly requested py::buffer_info. Under the hood
+// the py::buffer_info manages a reference count to the py::buffer, so it
+// must be constructed and subsequently destructed while holding the GIL.
+static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
+  return absl::string_view(reinterpret_cast<char*>(bytes.ptr), bytes.size);
+}
+
+// Returns the number of bytes in the UTF-8 sequence that starts with the
+// byte at ptr, looked up from the high four bits of that byte: 0x0-0xB give
+// 1, 0xC-0xD give 2, 0xE gives 3 and 0xF gives 4.
+static inline int OneCharLen(const char* ptr) {
+  static const char kLenByHighNibble[] = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4";
+  const int high_nibble = (*ptr & 0xFF) >> 4;
+  return kLenByHighNibble[high_nibble];
+}
+
+// Converts a length counted in characters into a length counted in bytes,
+// measured from byte offset pos. Needed when Python has encoded str to bytes
+// and str offsets have to become bytes offsets. Stops early at the end of
+// the text. Assumes that text is valid UTF-8.
+ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
+  auto info = buffer.request();
+  auto text = FromBytes(info);
+  const char* const begin = text.data() + pos;
+  const char* const limit = text.data() + text.size();
+  const char* cursor = begin;
+  for (; cursor < limit && len > 0; --len) {
+    cursor += OneCharLen(cursor);
+  }
+  return cursor - begin;
+}
+
+// Counts the characters that start between byte offsets pos and endpos.
+// Needed when Python has decoded bytes to str and bytes offsets have to
+// become str offsets. Assumes that text is valid UTF-8.
+ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
+  auto info = buffer.request();
+  auto text = FromBytes(info);
+  const char* const limit = text.data() + endpos;
+  ssize_t count = 0;
+  for (const char* cursor = text.data() + pos; cursor < limit;
+       cursor += OneCharLen(cursor)) {
+    ++count;
+  }
+  return count;
+}
+
+// Factory registered as the Python constructor of RE2: views the bytes
+// exposed by buffer as the pattern and builds an RE2 from it with the given
+// options.
+std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
+                                 const RE2::Options& options) {
+  auto info = buffer.request();
+  return std::make_unique<RE2>(FromBytes(info), options);
+}
+
+py::bytes RE2ErrorShim(const RE2& self) {
+ // Return std::string as bytes. That is, without decoding to str.
+ return self.error();
+}
+
+// Copies the (name, index) entries of self.NamedCapturingGroups() into a
+// vector, carrying each name as bytes.
+std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
+    const RE2& self) {
+  std::vector<std::pair<py::bytes, int>> named;
+  // Capacity is sized by the total number of capturing groups.
+  named.reserve(self.NumberOfCapturingGroups());
+  for (const auto& entry : self.NamedCapturingGroups()) {
+    named.emplace_back(entry.first, entry.second);
+  }
+  return named;
+}
+
+// Returns the histogram filled in by self.ProgramFanout() by value, in place
+// of the out-parameter used by the C++ API.
+std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
+  std::vector<int> fanout;
+  self.ProgramFanout(&fanout);
+  return fanout;
+}
+
+// Returns the histogram filled in by self.ReverseProgramFanout() by value,
+// in place of the out-parameter used by the C++ API.
+std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
+  std::vector<int> fanout;
+  self.ReverseProgramFanout(&fanout);
+  return fanout;
+}
+
+// Calls self.PossibleMatchRange() with the given maxlen and returns its
+// result together with the min and max strings that it filled in. Both
+// strings are returned as bytes, that is, without decoding to str.
+std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
+    const RE2& self, int maxlen) {
+  std::string lower, upper;
+  const bool ok = self.PossibleMatchRange(&lower, &upper, maxlen);
+  return std::make_tuple(ok, py::bytes(lower), py::bytes(upper));
+}
+
+// Runs self.Match() over the bytes exposed by buffer, between pos and endpos
+// and with the given anchor, and returns one (begin, end) span per group,
+// group 0 (the overall match) included. Spans are offsets relative to the
+// start of the text; a group whose string_view has null data is reported as
+// (-1, -1), which is also what every group becomes when Match() fails.
+std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
+ RE2::Anchor anchor,
+ py::buffer buffer,
+ ssize_t pos,
+ ssize_t endpos) {
+ auto bytes = buffer.request();
+ auto text = FromBytes(bytes);
+ const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
+ std::vector<absl::string_view> groups;
+ groups.resize(num_groups);
+ // The GIL is released from here until release_gil is destroyed; the code
+ // below only works on string_views and integer offsets.
+ py::gil_scoped_release release_gil;
+ if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
+ // Ensure that groups are null before converting to spans!
+ for (auto& it : groups) {
+ it = absl::string_view();
+ }
+ }
+ std::vector<std::pair<ssize_t, ssize_t>> spans;
+ spans.reserve(num_groups);
+ for (const auto& it : groups) {
+ if (it.data() == NULL) {
+ spans.emplace_back(-1, -1);
+ } else {
+ // Begin and end offsets of this group within text.
+ spans.emplace_back(it.data() - text.data(),
+ it.data() - text.data() + it.size());
+ }
+ }
+ return spans;
+}
+
+// Applies RE2::QuoteMeta() to the bytes exposed by buffer and returns the
+// resulting std::string as bytes, that is, without decoding to str.
+py::bytes RE2QuoteMetaShim(py::buffer buffer) {
+  auto info = buffer.request();
+  return py::bytes(RE2::QuoteMeta(FromBytes(info)));
+}
+
+// Wrapper around RE2::Set that takes its patterns and text through the
+// buffer protocol.
+class Set {
+ public:
+  Set(RE2::Anchor anchor, const RE2::Options& options)
+      : set_(options, anchor) {}
+
+  ~Set() = default;
+
+  // Neither copying nor moving is supported.
+  Set(const Set&) = delete;
+  Set& operator=(const Set&) = delete;
+
+  // Adds the pattern held in buffer and returns the index reported by
+  // RE2::Set::Add(), which is -1 on error.
+  int Add(py::buffer buffer) {
+    auto info = buffer.request();
+    return set_.Add(FromBytes(info), /*error=*/NULL);
+  }
+
+  // Forwards to RE2::Set::Compile(); compiling can fail.
+  bool Compile() {
+    const bool compiled = set_.Compile();
+    return compiled;
+  }
+
+  // Matches the text held in buffer against the set and returns the indices
+  // collected by RE2::Set::Match(). The GIL is released while matching.
+  std::vector<int> Match(py::buffer buffer) const {
+    auto info = buffer.request();
+    auto text = FromBytes(info);
+    std::vector<int> hits;
+    py::gil_scoped_release release_gil;
+    set_.Match(text, &hits);
+    return hits;
+  }
+
+ private:
+  RE2::Set set_;
+};
+
+// Wrapper around re2::FilteredRE2 that takes its patterns and text through
+// the buffer protocol. Compile() additionally builds an RE2::Set from the
+// atoms produced by the filter, and Match() uses that set to find which
+// atoms occur in the text.
+class Filter {
+ public:
+  Filter() = default;
+  ~Filter() = default;
+
+  // Not copyable or movable.
+  Filter(const Filter&) = delete;
+  Filter& operator=(const Filter&) = delete;
+
+  // Adds the pattern held in buffer with the given options. Returns the
+  // index written by FilteredRE2::Add(), or -1 if it left the index alone.
+  int Add(py::buffer buffer, const RE2::Options& options) {
+    auto bytes = buffer.request();
+    auto pattern = FromBytes(bytes);
+    int index = -1;  // not clobbered on error
+    filter_.Add(pattern, options, &index);
+    return index;
+  }
+
+  // Compiles the filter, then builds a literal, case-insensitive,
+  // unanchored set from its atoms. Returns whether compiling that set
+  // succeeded.
+  bool Compile() {
+    std::vector<std::string> atoms;
+    filter_.Compile(&atoms);
+    RE2::Options options;
+    options.set_literal(true);
+    options.set_case_sensitive(false);
+    set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
+    for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
+      if (set_->Add(atoms[i], /*error=*/nullptr) != i) {
+        // Should never happen: the atom is a literal!
+        py::pybind11_fail("set_->Add() failed");
+      }
+    }
+    // Compiling can fail.
+    return set_->Compile();
+  }
+
+  // Matches the text held in buffer. The atom set is matched first; then,
+  // depending on potential, either FilteredRE2::AllPotentials() or
+  // FilteredRE2::AllMatches() fills in the returned indices. Fails with a
+  // pybind11 error when called before Compile(). The GIL is released while
+  // matching.
+  std::vector<int> Match(py::buffer buffer, bool potential) const {
+    if (set_ == nullptr) {
+      // Compile() has not created the atom set yet; report the misuse
+      // instead of dereferencing a null pointer below.
+      py::pybind11_fail("Match() called before compiling");
+    }
+    auto bytes = buffer.request();
+    auto text = FromBytes(bytes);
+    std::vector<int> atoms;
+    py::gil_scoped_release release_gil;
+    set_->Match(text, &atoms);
+    std::vector<int> matches;
+    if (potential) {
+      filter_.AllPotentials(atoms, &matches);
+    } else {
+      filter_.AllMatches(text, atoms, &matches);
+    }
+    return matches;
+  }
+
+  // Returns the RE2 that the filter holds at index.
+  const RE2& GetRE2(int index) const {
+    return filter_.GetRE2(index);
+  }
+
+ private:
+  re2::FilteredRE2 filter_;
+  std::unique_ptr<RE2::Set> set_;  // null until Compile() has been called
+};
+
+// Defines the _re2 extension module: the two offset-conversion helpers plus
+// the RE2, RE2.Anchor, RE2.Options, RE2.Options.Encoding, Set and Filter
+// types, together with their enum values, properties and methods.
+PYBIND11_MODULE(_re2, module) {
+ module.def("CharLenToBytes", &CharLenToBytes);
+ module.def("BytesToCharLen", &BytesToCharLen);
+
+ // CLASSES
+ // class RE2
+ // enum Anchor
+ // class Options
+ // enum Encoding
+ // class Set
+ // class Filter
+ py::class_<RE2> re2(module, "RE2");
+ py::enum_<RE2::Anchor> anchor(re2, "Anchor");
+ py::class_<RE2::Options> options(re2, "Options");
+ py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
+ py::class_<Set> set(module, "Set");
+ py::class_<Filter> filter(module, "Filter");
+
+ // Enum values.
+ anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
+ anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
+ anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);
+
+ encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
+ encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);
+
+ // Each RE2::Options getter/setter pair becomes a read/write property.
+ options.def(py::init<>())
+ .def_property("max_mem", //
+ &RE2::Options::max_mem, //
+ &RE2::Options::set_max_mem) //
+ .def_property("encoding", //
+ &RE2::Options::encoding, //
+ &RE2::Options::set_encoding) //
+ .def_property("posix_syntax", //
+ &RE2::Options::posix_syntax, //
+ &RE2::Options::set_posix_syntax) //
+ .def_property("longest_match", //
+ &RE2::Options::longest_match, //
+ &RE2::Options::set_longest_match) //
+ .def_property("log_errors", //
+ &RE2::Options::log_errors, //
+ &RE2::Options::set_log_errors) //
+ .def_property("literal", //
+ &RE2::Options::literal, //
+ &RE2::Options::set_literal) //
+ .def_property("never_nl", //
+ &RE2::Options::never_nl, //
+ &RE2::Options::set_never_nl) //
+ .def_property("dot_nl", //
+ &RE2::Options::dot_nl, //
+ &RE2::Options::set_dot_nl) //
+ .def_property("never_capture", //
+ &RE2::Options::never_capture, //
+ &RE2::Options::set_never_capture) //
+ .def_property("case_sensitive", //
+ &RE2::Options::case_sensitive, //
+ &RE2::Options::set_case_sensitive) //
+ .def_property("perl_classes", //
+ &RE2::Options::perl_classes, //
+ &RE2::Options::set_perl_classes) //
+ .def_property("word_boundary", //
+ &RE2::Options::word_boundary, //
+ &RE2::Options::set_word_boundary) //
+ .def_property("one_line", //
+ &RE2::Options::one_line, //
+ &RE2::Options::set_one_line); //
+
+ // RE2 methods; the *Shim functions adapt signatures that use buffers,
+ // out-parameters or std::string results.
+ re2.def(py::init(&RE2InitShim))
+ .def("ok", &RE2::ok)
+ .def("error", &RE2ErrorShim)
+ .def("options", &RE2::options)
+ .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
+ .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
+ .def("ProgramSize", &RE2::ProgramSize)
+ .def("ReverseProgramSize", &RE2::ReverseProgramSize)
+ .def("ProgramFanout", &RE2ProgramFanoutShim)
+ .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
+ .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
+ .def("Match", &RE2MatchShim)
+ .def_static("QuoteMeta", &RE2QuoteMetaShim);
+
+ set.def(py::init<RE2::Anchor, const RE2::Options&>())
+ .def("Add", &Set::Add)
+ .def("Compile", &Set::Compile)
+ .def("Match", &Set::Match);
+
+ // GetRE2 returns a reference into the Filter, hence reference_internal.
+ filter.def(py::init<>())
+ .def("Add", &Filter::Add)
+ .def("Compile", &Filter::Compile)
+ .def("Match", &Filter::Match)
+ .def("GetRE2", &Filter::GetRE2,
+ py::return_value_policy::reference_internal);
+}
+
+} // namespace re2_python
diff --git a/third_party/re2/src/python/re2.py b/third_party/re2/src/python/re2.py
new file mode 100644
index 000000000..8a6d98539
--- /dev/null
+++ b/third_party/re2/src/python/re2.py
@@ -0,0 +1,582 @@
+# Copyright 2019 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+r"""A drop-in replacement for the re module.
+
+It uses RE2 under the hood, of course, so various PCRE features
+(e.g. backreferences, look-around assertions) are not supported.
+See https://github.com/google/re2/wiki/Syntax for the canonical
+reference, but known syntactic "gotchas" relative to Python are:
+
+ * PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
+ but calls it \Z. You must rewrite \Z to \z in pattern strings.
+
+Known differences between this module's API and the re module's API:
+
+ * The error class does not provide any error information as attributes.
+ * The Options class replaces the re module's flags with RE2's options as
+ gettable/settable properties. Please see re2.h for their documentation.
+ * The pattern string and the input string do not have to be the same type.
+ Any str will be encoded to UTF-8.
+ * The pattern string cannot be str if the options specify Latin-1 encoding.
+
+This module's LRU cache contains a maximum of 128 regular expression objects.
+Each regular expression object's underlying RE2 object uses a maximum of 8MiB
+of memory (by default). Hence, this module's LRU cache uses a maximum of 1GiB
+of memory (by default), but in most cases, it should use much less than that.
+"""
+
+import codecs
+import functools
+import itertools
+
+import _re2
+
+
+class error(Exception):
+ """Exception raised by this module.
+
+ Unlike re.error, it does not provide any error information as attributes.
+ """
+ pass
+
+
+class Options(_re2.RE2.Options):
+ """RE2's options, exposed as gettable/settable properties."""
+
+ __slots__ = ()
+
+ # The option property names, in the fixed order that compile() and
+ # _Regexp._make() use to flatten an Options object into a tuple of values
+ # and to rebuild an Options object from such a tuple.
+ NAMES = (
+ 'max_mem',
+ 'encoding',
+ 'posix_syntax',
+ 'longest_match',
+ 'log_errors',
+ 'literal',
+ 'never_nl',
+ 'dot_nl',
+ 'never_capture',
+ 'case_sensitive',
+ 'perl_classes',
+ 'word_boundary',
+ 'one_line',
+ )
+
+
+def compile(pattern, options=None):
+  """Returns the (cached) regular expression object for pattern.
+
+  Args:
+    pattern: a str or bytes pattern, or an already compiled regular
+      expression object, in which case options must not be specified.
+    options: an Options object; defaults to Options().
+
+  Raises:
+    error: if options are given along with an already compiled pattern.
+  """
+  already_compiled = isinstance(pattern, _Regexp)
+  if already_compiled and options:
+    raise error('pattern is already compiled, so '
+                'options may not be specified')
+  if already_compiled:
+    pattern = pattern._pattern
+  if not options:
+    options = Options()
+  # Flatten the options into a tuple of values for the cached factory.
+  values = tuple([getattr(options, name) for name in Options.NAMES])
+  return _Regexp._make(pattern, values)
+
+
+def search(pattern, text, options=None):
+  """Compiles pattern with options and returns the result of search(text)."""
+  regexp = compile(pattern, options=options)
+  return regexp.search(text)
+
+
+def match(pattern, text, options=None):
+  """Compiles pattern with options and returns the result of match(text)."""
+  regexp = compile(pattern, options=options)
+  return regexp.match(text)
+
+
+def fullmatch(pattern, text, options=None):
+  """Compiles pattern with options and returns the result of fullmatch(text)."""
+  regexp = compile(pattern, options=options)
+  return regexp.fullmatch(text)
+
+
+def finditer(pattern, text, options=None):
+  """Compiles pattern with options and returns the result of finditer(text)."""
+  regexp = compile(pattern, options=options)
+  return regexp.finditer(text)
+
+
+def findall(pattern, text, options=None):
+  """Compiles pattern with options and returns the result of findall(text)."""
+  regexp = compile(pattern, options=options)
+  return regexp.findall(text)
+
+
+def split(pattern, text, maxsplit=0, options=None):
+  """Compiles pattern with options and returns split(text, maxsplit)."""
+  regexp = compile(pattern, options=options)
+  return regexp.split(text, maxsplit)
+
+
+def subn(pattern, repl, text, count=0, options=None):
+  """Compiles pattern with options and returns subn(repl, text, count)."""
+  regexp = compile(pattern, options=options)
+  return regexp.subn(repl, text, count)
+
+
+def sub(pattern, repl, text, count=0, options=None):
+  """Compiles pattern with options and returns sub(repl, text, count)."""
+  regexp = compile(pattern, options=options)
+  return regexp.sub(repl, text, count)
+
+
+def _encode(t):
+  """Returns t encoded as UTF-8."""
+  return t.encode('utf-8')
+
+
+def _decode(b):
+  """Returns b decoded from UTF-8."""
+  return b.decode('utf-8')
+
+
+def escape(pattern):
+  """Returns pattern with RE2::QuoteMeta() applied to it.
+
+  The result has the same string type as pattern: a str is encoded to UTF-8
+  for quoting and decoded again afterwards, whereas anything else is passed
+  to QuoteMeta() unchanged.
+  """
+  if not isinstance(pattern, str):
+    return _re2.RE2.QuoteMeta(pattern)
+  quoted = _re2.RE2.QuoteMeta(_encode(pattern))
+  return _decode(quoted)
+
+
+def purge():
+  """Clears the cache of compiled regular expression objects."""
+  cleared = _Regexp._make.cache_clear()
+  return cleared
+
+
+_Anchor = _re2.RE2.Anchor
+_NULL_SPAN = (-1, -1)
+
+
+class _Regexp(object):
+  """A compiled regular expression backed by an underlying RE2 object.
+
+  Instances are normally obtained from compile(), which goes through the
+  LRU-cached _make() factory.
+  """
+
+  __slots__ = ('_pattern', '_regexp')
+
+  @classmethod
+  @functools.lru_cache(typed=True)
+  def _make(cls, pattern, values):
+    """Returns the cached instance for pattern and the given option values.
+
+    Args:
+      pattern: the pattern, as str or bytes.
+      values: a tuple holding one value per name in Options.NAMES, in order.
+    """
+    options = Options()
+    for name, value in zip(Options.NAMES, values):
+      setattr(options, name, value)
+    return cls(pattern, options)
+
+  def __init__(self, pattern, options):
+    """Compiles pattern with options.
+
+    Raises:
+      error: if pattern is str but options specify Latin-1 encoding, or if
+        RE2 reports that the pattern failed to compile.
+    """
+    self._pattern = pattern
+    if isinstance(self._pattern, str):
+      if options.encoding == Options.Encoding.LATIN1:
+        raise error('string type of pattern is str, but '
+                    'encoding specified in options is LATIN1')
+      encoded_pattern = _encode(self._pattern)
+      self._regexp = _re2.RE2(encoded_pattern, options)
+    else:
+      self._regexp = _re2.RE2(self._pattern, options)
+    if not self._regexp.ok():
+      raise error(self._regexp.error())
+
+  def __getstate__(self):
+    """Returns the pattern and a dict of option values for pickling."""
+    options = {name: getattr(self.options, name) for name in Options.NAMES}
+    return self._pattern, options
+
+  def __setstate__(self, state):
+    """Restores from __getstate__() output, reusing the cached instance."""
+    pattern, options = state
+    values = tuple(options[name] for name in Options.NAMES)
+    other = _Regexp._make(pattern, values)
+    self._pattern = other._pattern
+    self._regexp = other._regexp
+
+  def _match(self, anchor, text, pos=None, endpos=None):
+    """Yields a _Match for each successive match of text[pos:endpos].
+
+    pos and endpos are clamped to [0, len(text)]; nothing is yielded if pos
+    ends up greater than endpos. When text is str, matching is performed on
+    its UTF-8 encoding and the resulting byte spans are converted back to str
+    offsets before being handed to _Match.
+    """
+    pos = 0 if pos is None else max(0, min(pos, len(text)))
+    endpos = len(text) if endpos is None else max(0, min(endpos, len(text)))
+    if pos > endpos:
+      return
+    if isinstance(text, str):
+      encoded_text = _encode(text)
+      encoded_pos = _re2.CharLenToBytes(encoded_text, 0, pos)
+      if endpos == len(text):
+        # This is the common case.
+        encoded_endpos = len(encoded_text)
+      else:
+        encoded_endpos = encoded_pos + _re2.CharLenToBytes(
+            encoded_text, encoded_pos, endpos - pos)
+      decoded_offsets = {0: 0}
+      last_offset = 0
+      while True:
+        spans = self._regexp.Match(anchor, encoded_text, encoded_pos,
+                                   encoded_endpos)
+        if spans[0] == _NULL_SPAN:
+          break
+
+        # This algorithm is linear in the length of encoded_text. Specifically,
+        # no matter how many groups there are for a given regular expression or
+        # how many iterations through the loop there are for a given generator,
+        # this algorithm uses a single, straightforward pass over encoded_text.
+        offsets = sorted(set(itertools.chain(*spans)))
+        if offsets[0] == -1:
+          offsets = offsets[1:]
+        # Discard the rest of the items because they are useless now - and we
+        # could accumulate one item per str offset in the pathological case!
+        decoded_offsets = {last_offset: decoded_offsets[last_offset]}
+        for offset in offsets:
+          decoded_offsets[offset] = (
+              decoded_offsets[last_offset] +
+              _re2.BytesToCharLen(encoded_text, last_offset, offset))
+          last_offset = offset
+
+        def decode(span):
+          if span == _NULL_SPAN:
+            return span
+          return decoded_offsets[span[0]], decoded_offsets[span[1]]
+
+        decoded_spans = [decode(span) for span in spans]
+        yield _Match(self, text, pos, endpos, decoded_spans)
+        if encoded_pos == encoded_endpos:
+          break
+        elif encoded_pos == spans[0][1]:
+          # We matched the empty string at encoded_pos and would be stuck, so
+          # in order to make forward progress, increment the str offset.
+          encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1)
+        else:
+          encoded_pos = spans[0][1]
+    else:
+      while True:
+        spans = self._regexp.Match(anchor, text, pos, endpos)
+        if spans[0] == _NULL_SPAN:
+          break
+        yield _Match(self, text, pos, endpos, spans)
+        if pos == endpos:
+          break
+        elif pos == spans[0][1]:
+          # We matched the empty string at pos and would be stuck, so in order
+          # to make forward progress, increment the bytes offset.
+          pos += 1
+        else:
+          pos = spans[0][1]
+
+  def search(self, text, pos=None, endpos=None):
+    """Returns the first unanchored match in text, or None."""
+    return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None)
+
+  def match(self, text, pos=None, endpos=None):
+    """Returns the match anchored at the start of text, or None."""
+    return next(self._match(_Anchor.ANCHOR_START, text, pos, endpos), None)
+
+  def fullmatch(self, text, pos=None, endpos=None):
+    """Returns the match anchored at both ends of text, or None."""
+    return next(self._match(_Anchor.ANCHOR_BOTH, text, pos, endpos), None)
+
+  def finditer(self, text, pos=None, endpos=None):
+    """Returns a generator of successive unanchored matches in text."""
+    return self._match(_Anchor.UNANCHORED, text, pos, endpos)
+
+  def findall(self, text, pos=None, endpos=None):
+    """Returns a list with one item per match in text.
+
+    Each item is the whole match if the pattern has no groups, the single
+    group if it has exactly one, or else a tuple of all groups. Groups that
+    did not participate are reported as an empty str/bytes.
+    """
+    empty = type(text)()
+    # The group count is fixed for a compiled pattern, so ask the underlying
+    # RE2 object once instead of once or twice per match.
+    groups = self.groups
+    items = []
+    for match in self.finditer(text, pos, endpos):
+      if not groups:
+        item = match.group()
+      elif groups == 1:
+        item = match.groups(default=empty)[0]
+      else:
+        item = match.groups(default=empty)
+      items.append(item)
+    return items
+
+  def _split(self, cb, text, maxsplit=0):
+    """Cuts text at up to maxsplit matches; shared by split() and subn().
+
+    Args:
+      cb: called with each match; returns the list of pieces to insert in
+        place of that match.
+      text: the input to cut up.
+      maxsplit: the maximum number of matches to use; zero means no limit
+        and a negative value means that text is returned untouched.
+
+    Returns:
+      A (pieces, numsplit) tuple, numsplit being the number of matches used.
+    """
+    if maxsplit < 0:
+      return [text], 0
+    elif maxsplit > 0:
+      matchiter = itertools.islice(self.finditer(text), maxsplit)
+    else:
+      matchiter = self.finditer(text)
+    pieces = []
+    end = 0
+    numsplit = 0
+    for match in matchiter:
+      pieces.append(text[end:match.start()])
+      pieces.extend(cb(match))
+      end = match.end()
+      numsplit += 1
+    pieces.append(text[end:])
+    return pieces, numsplit
+
+  def split(self, text, maxsplit=0):
+    """Splits text at matches; the text of any groups is included as well."""
+    # As in findall(), query the group count only once.
+    groups = self.groups
+    cb = lambda match: [match[group] for group in range(1, groups + 1)]
+    pieces, _ = self._split(cb, text, maxsplit)
+    return pieces
+
+  def subn(self, repl, text, count=0):
+    """Replaces matches in text; returns (new_text, number_of_replacements).
+
+    repl is either a callable taking the match and returning the replacement,
+    or a template that is expanded with Match.expand().
+    """
+    cb = lambda match: [repl(match) if callable(repl) else match.expand(repl)]
+    empty = type(text)()
+    pieces, numsplit = self._split(cb, text, count)
+    joined_pieces = empty.join(pieces)
+    return joined_pieces, numsplit
+
+  def sub(self, repl, text, count=0):
+    """Like subn(), but returns only the new text."""
+    joined_pieces, _ = self.subn(repl, text, count)
+    return joined_pieces
+
+  @property
+  def pattern(self):
+    """The pattern this object was compiled from, exactly as it was given."""
+    return self._pattern
+
+  @property
+  def options(self):
+    """The options of the underlying RE2 object."""
+    return self._regexp.options()
+
+  @property
+  def groups(self):
+    """The number of capturing groups in the pattern."""
+    return self._regexp.NumberOfCapturingGroups()
+
+  @property
+  def groupindex(self):
+    """A dict mapping group names to group numbers.
+
+    The names are str if the pattern is str; otherwise, they are returned as
+    provided by the underlying RE2 object.
+    """
+    groups = self._regexp.NamedCapturingGroups()
+    if isinstance(self._pattern, str):
+      decoded_groups = [(_decode(group), index) for group, index in groups]
+      return dict(decoded_groups)
+    else:
+      return dict(groups)
+
+  @property
+  def programsize(self):
+    """The result of RE2::ProgramSize()."""
+    return self._regexp.ProgramSize()
+
+  @property
+  def reverseprogramsize(self):
+    """The result of RE2::ReverseProgramSize()."""
+    return self._regexp.ReverseProgramSize()
+
+  @property
+  def programfanout(self):
+    """The result of RE2::ProgramFanout()."""
+    return self._regexp.ProgramFanout()
+
+  @property
+  def reverseprogramfanout(self):
+    """The result of RE2::ReverseProgramFanout()."""
+    return self._regexp.ReverseProgramFanout()
+
+  def possiblematchrange(self, maxlen):
+    """Returns the (min, max) pair computed by RE2::PossibleMatchRange().
+
+    Raises:
+      error: if the range could not be computed.
+    """
+    # Locals are deliberately not called min/max: that would shadow builtins.
+    ok, range_min, range_max = self._regexp.PossibleMatchRange(maxlen)
+    if not ok:
+      raise error('failed to compute match range')
+    return range_min, range_max
+
+
+class _Match(object):
+  """The result of a successful match, modelled on the re module's Match.
+
+  Offsets reported by start(), end() and span() are in units of the string
+  type of the text that was matched.
+  """
+
+  __slots__ = ('_regexp', '_text', '_pos', '_endpos', '_spans')
+
+  def __init__(self, regexp, text, pos, endpos, spans):
+    """Stores the matching context.
+
+    Args:
+      regexp: the _Regexp that produced this match.
+      text: the text that was matched against.
+      pos: the (clamped) start offset of the matching operation.
+      endpos: the (clamped) end offset of the matching operation.
+      spans: one (start, end) pair for group 0 and for each capturing group;
+        groups that did not participate have the span (-1, -1).
+    """
+    self._regexp = regexp
+    self._text = text
+    self._pos = pos
+    self._endpos = endpos
+    self._spans = spans
+
+  # Python prioritises three-digit octal numbers over group escapes.
+  # For example, \100 should not be handled the same way as \g<10>0.
+  _OCTAL_RE = compile('\\\\[0-7][0-7][0-7]')
+
+  # Python supports \1 through \99 (inclusive) and \g<...> syntax.
+  _GROUP_RE = compile('\\\\[1-9][0-9]?|\\\\g<\\w+>')
+
+  @classmethod
+  @functools.lru_cache(typed=True)
+  def _split(cls, template):
+    """Splits template into alternating literal and group-reference pieces.
+
+    Pieces at even indexes are literal text (possibly empty and possibly
+    containing escapes other than group references); pieces at odd indexes
+    are group references such as \\1 or \\g<name>. The returned list is
+    cached, so callers must not modify it.
+    """
+    if isinstance(template, str):
+      backslash = '\\'
+    else:
+      backslash = b'\\'
+    empty = type(template)()
+    pieces = [empty]
+    index = template.find(backslash)
+    while index != -1:
+      piece, template = template[:index], template[index:]
+      pieces[-1] += piece
+      octal_match = cls._OCTAL_RE.match(template)
+      group_match = cls._GROUP_RE.match(template)
+      if (not octal_match) and group_match:
+        index = group_match.end()
+        piece, template = template[:index], template[index:]
+        pieces.extend((piece, empty))
+      else:
+        # 2 isn't enough for \o, \x, \N, \u and \U escapes, but none of those
+        # should contain backslashes, so break them here and then fix them at
+        # the beginning of the next loop iteration or right before returning.
+        index = 2
+        piece, template = template[:index], template[index:]
+        pieces[-1] += piece
+      index = template.find(backslash)
+    pieces[-1] += template
+    return pieces
+
+  def expand(self, template):
+    """Returns template with escapes and group references substituted.
+
+    Groups that did not participate in the match expand to the empty string.
+    """
+    if isinstance(template, str):
+      unescape = codecs.unicode_escape_decode
+    else:
+      unescape = codecs.escape_decode
+    empty = type(template)()
+    # Make a copy so that we don't clobber the cached pieces!
+    pieces = list(self._split(template))
+    for index, piece in enumerate(pieces):
+      if not index % 2:
+        pieces[index], _ = unescape(piece)
+      else:
+        if len(piece) <= 3:  # \1 through \99 (inclusive)
+          group = int(piece[1:])
+        else:  # \g<...>
+          group = piece[3:-1]
+          try:
+            group = int(group)
+          except ValueError:
+            pass
+        pieces[index] = self.__getitem__(group) or empty
+    joined_pieces = empty.join(pieces)
+    return joined_pieces
+
+  def _index(self, group):
+    """Returns the validated group number for a group number or group name.
+
+    Raises:
+      IndexError: if group is an unknown name or an out-of-range number.
+    """
+    if not isinstance(group, int):
+      try:
+        group = self._regexp.groupindex[group]
+      except KeyError:
+        raise IndexError('bad group name')
+    if not 0 <= group <= self._regexp.groups:
+      raise IndexError('bad group index')
+    return group
+
+  def __getitem__(self, group):
+    """Returns the text of group (a number or a name), or None if unmatched."""
+    span = self._spans[self._index(group)]
+    if span == _NULL_SPAN:
+      return None
+    return self._text[span[0]:span[1]]
+
+  def group(self, *groups):
+    """Returns one group's text, or a tuple if several groups are requested.
+
+    With no arguments, returns the whole match.
+    """
+    if not groups:
+      groups = (0,)
+    items = (self.__getitem__(group) for group in groups)
+    return next(items) if len(groups) == 1 else tuple(items)
+
+  def groups(self, default=None):
+    """Returns a tuple of all capturing groups, default replacing None."""
+    items = []
+    for group in range(1, self._regexp.groups + 1):
+      item = self.__getitem__(group)
+      items.append(default if item is None else item)
+    return tuple(items)
+
+  def groupdict(self, default=None):
+    """Returns a dict of all named groups, default replacing None."""
+    items = []
+    for group, index in self._regexp.groupindex.items():
+      item = self.__getitem__(index)
+      items.append((group, default) if item is None else (group, item))
+    return dict(items)
+
+  # As in the re module, start(), end() and span() accept a group name as
+  # well as a group number; both are resolved the same way as in group().
+
+  def start(self, group=0):
+    """Returns the start offset of group, or -1 if it did not participate."""
+    return self._spans[self._index(group)][0]
+
+  def end(self, group=0):
+    """Returns the end offset of group, or -1 if it did not participate."""
+    return self._spans[self._index(group)][1]
+
+  def span(self, group=0):
+    """Returns the (start, end) pair of group; (-1, -1) if unmatched."""
+    return self._spans[self._index(group)]
+
+  @property
+  def re(self):
+    """The regular expression object that produced this match."""
+    return self._regexp
+
+  @property
+  def string(self):
+    """The text that was matched against."""
+    return self._text
+
+  @property
+  def pos(self):
+    """The start offset of the matching operation."""
+    return self._pos
+
+  @property
+  def endpos(self):
+    """The end offset of the matching operation."""
+    return self._endpos
+
+  @property
+  def lastindex(self):
+    """The number of the last matched capturing group, or None."""
+    max_end = -1
+    max_group = None
+    # We look for the rightmost right parenthesis by keeping the first group
+    # that ends at max_end because that is the leftmost/outermost group when
+    # there are nested groups!
+    for group in range(1, self._regexp.groups + 1):
+      end = self._spans[group][1]
+      if max_end < end:
+        max_end = end
+        max_group = group
+    return max_group
+
+  @property
+  def lastgroup(self):
+    """The name of the last matched capturing group, or None."""
+    max_group = self.lastindex
+    if not max_group:
+      return None
+    for group, index in self._regexp.groupindex.items():
+      if max_group == index:
+        return group
+    return None
+
+
+class Set(object):
+  """A Pythonic wrapper around RE2::Set."""
+
+  # A one-element tuple needs its trailing comma: ('_set') is just a string.
+  __slots__ = ('_set',)
+
+  def __init__(self, anchor, options=None):
+    """Creates an empty set with the given anchor; options default to Options()."""
+    options = options or Options()
+    self._set = _re2.Set(anchor, options)
+
+  @classmethod
+  def SearchSet(cls, options=None):
+    """Returns a new unanchored set."""
+    return cls(_Anchor.UNANCHORED, options=options)
+
+  @classmethod
+  def MatchSet(cls, options=None):
+    """Returns a new set that is anchored at the start."""
+    return cls(_Anchor.ANCHOR_START, options=options)
+
+  @classmethod
+  def FullMatchSet(cls, options=None):
+    """Returns a new set that is anchored at both ends."""
+    return cls(_Anchor.ANCHOR_BOTH, options=options)
+
+  def Add(self, pattern):
+    """Adds pattern (str is encoded to UTF-8) and returns its index.
+
+    Raises:
+      error: if the pattern could not be added.
+    """
+    if isinstance(pattern, str):
+      encoded_pattern = _encode(pattern)
+      index = self._set.Add(encoded_pattern)
+    else:
+      index = self._set.Add(pattern)
+    if index == -1:
+      raise error('failed to add %r to Set' % pattern)
+    return index
+
+  def Compile(self):
+    """Compiles the set; raises error on failure."""
+    if not self._set.Compile():
+      raise error('failed to compile Set')
+
+  def Match(self, text):
+    """Returns the indexes of the patterns matching text, or None if none do."""
+    if isinstance(text, str):
+      encoded_text = _encode(text)
+      matches = self._set.Match(encoded_text)
+    else:
+      matches = self._set.Match(text)
+    return matches or None
+
+
+class Filter(object):
+  """A Pythonic wrapper around FilteredRE2."""
+
+  __slots__ = ('_filter', '_patterns')
+
+  def __init__(self):
+    """Creates an empty filter."""
+    self._filter = _re2.Filter()
+    # The patterns as originally given, indexed like the filter's own entries.
+    self._patterns = []
+
+  def Add(self, pattern, options=None):
+    """Adds pattern (str is encoded to UTF-8) and returns its index.
+
+    Raises:
+      error: if the pattern could not be added.
+    """
+    if not options:
+      options = Options()
+    raw_pattern = _encode(pattern) if isinstance(pattern, str) else pattern
+    index = self._filter.Add(raw_pattern, options)
+    if index == -1:
+      raise error('failed to add %r to Filter' % pattern)
+    self._patterns.append(pattern)
+    return index
+
+  def Compile(self):
+    """Compiles the filter; raises error on failure."""
+    compiled = self._filter.Compile()
+    if not compiled:
+      raise error('failed to compile Filter')
+
+  def Match(self, text, potential=False):
+    """Returns the indexes reported for text, or None if there are none.
+
+    The potential flag is passed through to the underlying filter as is.
+    """
+    raw_text = _encode(text) if isinstance(text, str) else text
+    matches = self._filter.Match(raw_text, potential)
+    return matches or None
+
+  def re(self, index):
+    """Returns a regular expression object for the pattern at index.
+
+    The object wraps the RE2 object owned by the filter instead of compiling
+    the pattern again, so it is built without going through __init__().
+
+    Raises:
+      IndexError: if index is out of range.
+    """
+    if not 0 <= index < len(self._patterns):
+      raise IndexError('bad index')
+    regexp = object.__new__(_Regexp)
+    regexp._pattern = self._patterns[index]
+    regexp._regexp = self._filter.GetRE2(index)
+    return regexp
diff --git a/third_party/re2/src/python/re2_test.py b/third_party/re2/src/python/re2_test.py
new file mode 100644
index 000000000..86aa9ae51
--- /dev/null
+++ b/third_party/re2/src/python/re2_test.py
@@ -0,0 +1,482 @@
+# Copyright 2019 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+"""Tests for google3.third_party.re2.python.re2."""
+
+import collections
+import pickle
+import re
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import re2
+
+
+class OptionsTest(parameterized.TestCase):
+  """Checks that every option named in re2.Options.NAMES can be set and read."""
+
+  @parameterized.parameters(*re2.Options.NAMES)
+  def test_option(self, name):
+    options = re2.Options()
+    old_value = getattr(options, name)
+    # Pick a value that differs from the default, based on the option's type.
+    # bool has to be checked before int because bool is a subclass of int.
+    if isinstance(old_value, re2.Options.Encoding):
+      members = type(old_value).__members__.values()
+      new_value = next(v for v in members if v != old_value)
+    elif isinstance(old_value, bool):
+      new_value = not old_value
+    elif isinstance(old_value, int):
+      new_value = old_value + 1
+    else:
+      raise TypeError(
+          'option {!r}: {!r} {!r}'.format(name, type(old_value), old_value))
+    setattr(options, name, new_value)
+    self.assertEqual(new_value, getattr(options, name))
+
+
+class Re2CompileTest(parameterized.TestCase):
+  """Contains tests that apply to the re2 module only.
+
+  We disagree with Python on the string types of group names,
+  so there is no point attempting to verify consistency.
+  """
+
+  @parameterized.parameters(
+      (u'(foo*)(?P<bar>qux+)', 2, [(u'bar', 2)]),
+      (b'(foo*)(?P<bar>qux+)', 2, [(b'bar', 2)]),
+      (u'(foo*)(?P<中文>qux+)', 2, [(u'中文', 2)]),
+  )
+  def test_compile(self, pattern, expected_groups, expected_groupindex):
+    regexp = re2.compile(pattern)
+    # Compiling the same pattern again, or the compiled object itself, must
+    # hand back the very same cached object.
+    self.assertIs(regexp, re2.compile(pattern))  # cached
+    self.assertIs(regexp, re2.compile(regexp))  # cached
+    with self.assertRaisesRegex(re2.error,
+                                ('pattern is already compiled, so '
+                                 'options may not be specified')):
+      options = re2.Options()
+      options.log_errors = not options.log_errors
+      re2.compile(regexp, options=options)
+    self.assertIsNotNone(regexp.options)
+    self.assertEqual(expected_groups, regexp.groups)
+    self.assertDictEqual(dict(expected_groupindex), regexp.groupindex)
+
+  def test_compile_with_options(self):
+    # A tiny memory budget makes compilation of a large pattern fail.
+    options = re2.Options()
+    options.max_mem = 100
+    with self.assertRaisesRegex(re2.error, 'pattern too large'):
+      re2.compile('.{1000}', options=options)
+
+  def test_programsize_reverseprogramsize(self):
+    regexp = re2.compile('a+b')
+    self.assertEqual(7, regexp.programsize)
+    self.assertEqual(7, regexp.reverseprogramsize)
+
+  def test_programfanout_reverseprogramfanout(self):
+    regexp = re2.compile('a+b')
+    self.assertListEqual([1, 1], regexp.programfanout)
+    self.assertListEqual([3], regexp.reverseprogramfanout)
+
+  @parameterized.parameters(
+      (u'abc', 0, None),
+      (b'abc', 0, None),
+      (u'abc', 10, (b'abc', b'abc')),
+      (b'abc', 10, (b'abc', b'abc')),
+      (u'ab*c', 10, (b'ab', b'ac')),
+      (b'ab*c', 10, (b'ab', b'ac')),
+      (u'ab+c', 10, (b'abb', b'abc')),
+      (b'ab+c', 10, (b'abb', b'abc')),
+      (u'ab?c', 10, (b'abc', b'ac')),
+      (b'ab?c', 10, (b'abc', b'ac')),
+      (u'.*', 10, (b'', b'\xf4\xbf\xbf\xc0')),
+      (b'.*', 10, None),
+      (u'\\C*', 10, None),
+      (b'\\C*', 10, None),
+  )
+  def test_possiblematchrange(self, pattern, maxlen, expected_min_max):
+    # For brevity, the string type of pattern determines the encoding.
+    # It would otherwise be possible to have bytes with UTF8, but as per
+    # the module docstring, it isn't permitted to have str with LATIN1.
+    options = re2.Options()
+    pattern_is_str = isinstance(pattern, str)
+    options.encoding = (
+        re2.Options.Encoding.UTF8
+        if pattern_is_str else re2.Options.Encoding.LATIN1)
+    regexp = re2.compile(pattern, options=options)
+    if not expected_min_max:
+      with self.assertRaisesRegex(re2.error, 'failed to compute match range'):
+        regexp.possiblematchrange(maxlen)
+    else:
+      self.assertEqual(expected_min_max, regexp.possiblematchrange(maxlen))
+
+
+# One shared test case: a pattern, an input text, the spans expected for group
+# 0 and for each capturing group when the pattern matches, and three flags
+# saying whether search(), match() and fullmatch() respectively are expected
+# to succeed on that input. Every case is listed once as str and once as bytes.
+Params = collections.namedtuple(
+ 'Params', ('pattern', 'text', 'spans', 'search', 'match', 'fullmatch'))
+
+PARAMS = [
+ Params(u'\\d+', u'Hello, world.', None, False, False, False),
+ Params(b'\\d+', b'Hello, world.', None, False, False, False),
+ Params(u'\\s+', u'Hello, world.', [(6, 7)], True, False, False),
+ Params(b'\\s+', b'Hello, world.', [(6, 7)], True, False, False),
+ Params(u'\\w+', u'Hello, world.', [(0, 5)], True, True, False),
+ Params(b'\\w+', b'Hello, world.', [(0, 5)], True, True, False),
+ Params(u'(\\d+)?', u'Hello, world.', [(0, 0), (-1, -1)], True, True, False),
+ Params(b'(\\d+)?', b'Hello, world.', [(0, 0), (-1, -1)], True, True, False),
+ Params(u'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?',
+ u'youtube_ads', [(0, 7), (-1, -1)], True, True, False),
+ Params(b'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?',
+ b'youtube_ads', [(0, 7), (-1, -1)], True, True, False),
+]
+
+
+def upper(match):
+  """Replacement callback: returns the whole match converted to upper case."""
+  whole_match = match.group()
+  return whole_match.upper()
+
+
+class ReRegexpTest(parameterized.TestCase):
+  """Contains tests that apply to the re and re2 modules.
+
+  Subclasses override MODULE in order to run the same tests against another
+  implementation.
+  """
+
+  MODULE = re
+
+  @parameterized.parameters((p.pattern,) for p in PARAMS)
+  def test_pickle(self, pattern):
+    regexp = self.MODULE.compile(pattern)
+    pickled = pickle.dumps(regexp)
+    rick = pickle.loads(pickled)
+    self.assertEqual(regexp.pattern, rick.pattern)
+
+  @parameterized.parameters(
+      (p.pattern, p.text, (p.spans if p.search else None)) for p in PARAMS)
+  def test_search(self, pattern, text, expected_spans):
+    match = self.MODULE.search(pattern, text)
+    if expected_spans is None:
+      self.assertIsNone(match)
+      return
+    spans = [match.span(group) for group in range(match.re.groups + 1)]
+    self.assertListEqual(expected_spans, spans)
+
+  def test_search_with_pos_and_endpos(self):
+    regexp = self.MODULE.compile(u'.+')  # empty string NOT allowed
+    text = u'I \u2665 RE2!'
+    # Note that len(text) is the position of the empty string at the end of
+    # text, so range() stops at len(text) + 1 in order to include len(text).
+    for pos in range(len(text) + 1):
+      for endpos in range(pos, len(text) + 1):
+        match = regexp.search(text, pos=pos, endpos=endpos)
+        if pos == endpos:
+          self.assertIsNone(match)
+          continue
+        self.assertEqual(pos, match.pos)
+        self.assertEqual(endpos, match.endpos)
+        self.assertEqual(pos, match.start())
+        self.assertEqual(endpos, match.end())
+        self.assertTupleEqual((pos, endpos), match.span())
+
+  def test_search_with_bogus_pos_and_endpos(self):
+    regexp = self.MODULE.compile(u'.*')  # empty string allowed
+    text = u'I \u2665 RE2!'
+
+    # Out-of-range values are clamped to the bounds of text.
+    match = regexp.search(text, pos=-100)
+    self.assertEqual(0, match.pos)
+    match = regexp.search(text, pos=100)
+    self.assertEqual(8, match.pos)
+
+    match = regexp.search(text, endpos=-100)
+    self.assertEqual(0, match.endpos)
+    match = regexp.search(text, endpos=100)
+    self.assertEqual(8, match.endpos)
+
+    match = regexp.search(text, pos=100, endpos=-100)
+    self.assertIsNone(match)
+
+  @parameterized.parameters(
+      (p.pattern, p.text, (p.spans if p.match else None)) for p in PARAMS)
+  def test_match(self, pattern, text, expected_spans):
+    match = self.MODULE.match(pattern, text)
+    if expected_spans is None:
+      self.assertIsNone(match)
+      return
+    spans = [match.span(group) for group in range(match.re.groups + 1)]
+    self.assertListEqual(expected_spans, spans)
+
+  @parameterized.parameters(
+      (p.pattern, p.text, (p.spans if p.fullmatch else None)) for p in PARAMS)
+  def test_fullmatch(self, pattern, text, expected_spans):
+    match = self.MODULE.fullmatch(pattern, text)
+    if expected_spans is None:
+      self.assertIsNone(match)
+      return
+    spans = [match.span(group) for group in range(match.re.groups + 1)]
+    self.assertListEqual(expected_spans, spans)
+
+  @parameterized.parameters(
+      (u'', u'', [(0, 0)]),
+      (b'', b'', [(0, 0)]),
+      (u'', u'x', [(0, 0), (1, 1)]),
+      (b'', b'x', [(0, 0), (1, 1)]),
+      (u'', u'xy', [(0, 0), (1, 1), (2, 2)]),
+      (b'', b'xy', [(0, 0), (1, 1), (2, 2)]),
+      (u'.', u'xy', [(0, 1), (1, 2)]),
+      (b'.', b'xy', [(0, 1), (1, 2)]),
+      (u'x', u'xy', [(0, 1)]),
+      (b'x', b'xy', [(0, 1)]),
+      (u'y', u'xy', [(1, 2)]),
+      (b'y', b'xy', [(1, 2)]),
+      (u'z', u'xy', []),
+      (b'z', b'xy', []),
+      (u'\\w*', u'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12),
+                                   (13, 13)]),
+      (b'\\w*', b'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12),
+                                   (13, 13)]),
+  )
+  def test_finditer(self, pattern, text, expected_matches):
+    found = self.MODULE.finditer(pattern, text)
+    matches = [match.span() for match in found]
+    self.assertListEqual(expected_matches, matches)
+
+  @parameterized.parameters(
+      (u'\\w\\w+', u'Hello, world.', [u'Hello', u'world']),
+      (b'\\w\\w+', b'Hello, world.', [b'Hello', b'world']),
+      (u'(\\w)\\w+', u'Hello, world.', [u'H', u'w']),
+      (b'(\\w)\\w+', b'Hello, world.', [b'H', b'w']),
+      (u'(\\w)(\\w+)', u'Hello, world.', [(u'H', u'ello'), (u'w', u'orld')]),
+      (b'(\\w)(\\w+)', b'Hello, world.', [(b'H', b'ello'), (b'w', b'orld')]),
+      (u'(\\w)(\\w+)?', u'Hello, w.', [(u'H', u'ello'), (u'w', u'')]),
+      (b'(\\w)(\\w+)?', b'Hello, w.', [(b'H', b'ello'), (b'w', b'')]),
+  )
+  def test_findall(self, pattern, text, expected_matches):
+    matches = self.MODULE.findall(pattern, text)
+    self.assertListEqual(expected_matches, matches)
+
+  @parameterized.parameters(
+      (u'\\W+', u'Hello, world.', -1, [u'Hello, world.']),
+      (b'\\W+', b'Hello, world.', -1, [b'Hello, world.']),
+      (u'\\W+', u'Hello, world.', 0, [u'Hello', u'world', u'']),
+      (b'\\W+', b'Hello, world.', 0, [b'Hello', b'world', b'']),
+      (u'\\W+', u'Hello, world.', 1, [u'Hello', u'world.']),
+      (b'\\W+', b'Hello, world.', 1, [b'Hello', b'world.']),
+      (u'(\\W+)', u'Hello, world.', -1, [u'Hello, world.']),
+      (b'(\\W+)', b'Hello, world.', -1, [b'Hello, world.']),
+      (u'(\\W+)', u'Hello, world.', 0, [u'Hello', u', ', u'world', u'.', u'']),
+      (b'(\\W+)', b'Hello, world.', 0, [b'Hello', b', ', b'world', b'.', b'']),
+      (u'(\\W+)', u'Hello, world.', 1, [u'Hello', u', ', u'world.']),
+      (b'(\\W+)', b'Hello, world.', 1, [b'Hello', b', ', b'world.']),
+  )
+  def test_split(self, pattern, text, maxsplit, expected_pieces):
+    pieces = self.MODULE.split(pattern, text, maxsplit)
+    self.assertListEqual(expected_pieces, pieces)
+
+  @parameterized.parameters(
+      (u'\\w+', upper, u'Hello, world.', -1, u'Hello, world.', 0),
+      (b'\\w+', upper, b'Hello, world.', -1, b'Hello, world.', 0),
+      (u'\\w+', upper, u'Hello, world.', 0, u'HELLO, WORLD.', 2),
+      (b'\\w+', upper, b'Hello, world.', 0, b'HELLO, WORLD.', 2),
+      (u'\\w+', upper, u'Hello, world.', 1, u'HELLO, world.', 1),
+      (b'\\w+', upper, b'Hello, world.', 1, b'HELLO, world.', 1),
+      (u'\\w+', u'MEEP', u'Hello, world.', -1, u'Hello, world.', 0),
+      (b'\\w+', b'MEEP', b'Hello, world.', -1, b'Hello, world.', 0),
+      (u'\\w+', u'MEEP', u'Hello, world.', 0, u'MEEP, MEEP.', 2),
+      (b'\\w+', b'MEEP', b'Hello, world.', 0, b'MEEP, MEEP.', 2),
+      (u'\\w+', u'MEEP', u'Hello, world.', 1, u'MEEP, world.', 1),
+      (b'\\w+', b'MEEP', b'Hello, world.', 1, b'MEEP, world.', 1),
+      (u'\\\\', u'\\\\\\\\', u'Hello,\\world.', 0, u'Hello,\\\\world.', 1),
+      (b'\\\\', b'\\\\\\\\', b'Hello,\\world.', 0, b'Hello,\\\\world.', 1),
+  )
+  def test_subn_sub(self, pattern, repl, text, count, expected_joined_pieces,
+                    expected_numsplit):
+    # subn() reports the text and the number of replacements ...
+    joined_pieces, numsplit = self.MODULE.subn(pattern, repl, text, count)
+    self.assertEqual(expected_joined_pieces, joined_pieces)
+    self.assertEqual(expected_numsplit, numsplit)
+
+    # ... whereas sub() reports only the text.
+    joined_pieces = self.MODULE.sub(pattern, repl, text, count)
+    self.assertEqual(expected_joined_pieces, joined_pieces)
+
+
+class Re2RegexpTest(ReRegexpTest):
+  """Runs the ReRegexpTest cases against re2 and adds re2-only tests."""
+
+  MODULE = re2
+
+  def test_compile_with_latin1_encoding(self):
+    latin1_options = re2.Options()
+    latin1_options.encoding = re2.Options.Encoding.LATIN1
+    with self.assertRaisesRegex(re2.error,
+                                ('string type of pattern is str, but '
+                                 'encoding specified in options is LATIN1')):
+      re2.compile(u'.?', options=latin1_options)
+
+    # ... whereas this is fine, of course.
+    re2.compile(b'.?', options=latin1_options)
+
+  @parameterized.parameters(
+      (u'\\p{Lo}', u'\u0ca0_\u0ca0', [(0, 1), (2, 3)]),
+      (b'\\p{Lo}', b'\xe0\xb2\xa0_\xe0\xb2\xa0', [(0, 3), (4, 7)]),
+  )
+  def test_finditer_with_utf8(self, pattern, text, expected_matches):
+    # Spans are in characters for str input and in bytes for bytes input.
+    found = self.MODULE.finditer(pattern, text)
+    matches = [match.span() for match in found]
+    self.assertListEqual(expected_matches, matches)
+
+  def test_purge(self):
+    re2.compile('Goodbye, world.')
+    size_before = re2._Regexp._make.cache_info().currsize
+    self.assertGreater(size_before, 0)
+    re2.purge()
+    size_after = re2._Regexp._make.cache_info().currsize
+    self.assertEqual(size_after, 0)
+
+
+class Re2EscapeTest(parameterized.TestCase):
+  """Contains tests that apply to the re2 module only.
+
+  We disagree with Python on the escaping of some characters,
+  so there is no point attempting to verify consistency.
+  """
+
+  @parameterized.parameters(
+      (u'a*b+c?', u'a\\*b\\+c\\?'),
+      (b'a*b+c?', b'a\\*b\\+c\\?'),
+  )
+  def test_escape(self, pattern, expected_escaped):
+    # The escaped result has the same string type as the input.
+    self.assertEqual(expected_escaped, re2.escape(pattern))
+
+
+class ReMatchTest(parameterized.TestCase):
+  """Contains tests that apply to the re and re2 modules.
+
+  Subclasses override MODULE in order to run the same tests against another
+  implementation.
+  """
+
+  MODULE = re
+
+  def test_expand(self):
+    pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
+    text = u'I \u2665 RE2!\n'
+    match = self.MODULE.search(pattern, text)
+
+    # Numbered, \g<number> and \g<name> references are all equivalent.
+    self.assertEqual(u'\u2665\n!', match.expand(u'\\1\\n\\2'))
+    self.assertEqual(u'\u2665\n!', match.expand(u'\\g<1>\\n\\g<2>'))
+    self.assertEqual(u'\u2665\n!', match.expand(u'\\g<S>\\n\\g<P>'))
+    # Escaped backslashes must not be mistaken for group references.
+    self.assertEqual(u'\\1\\2\n\u2665!', match.expand(u'\\\\1\\\\2\\n\\1\\2'))
+
+  def test_expand_with_octal(self):
+    pattern = u'()()()()()()()()()(\\w+)'
+    text = u'Hello, world.'
+    match = self.MODULE.search(pattern, text)
+
+    self.assertEqual(u'Hello\n', match.expand(u'\\g<0>\\n'))
+    self.assertEqual(u'Hello\n', match.expand(u'\\g<10>\\n'))
+
+    self.assertEqual(u'\x00\n', match.expand(u'\\0\\n'))
+    self.assertEqual(u'\x00\n', match.expand(u'\\00\\n'))
+    self.assertEqual(u'\x00\n', match.expand(u'\\000\\n'))
+    self.assertEqual(u'\x000\n', match.expand(u'\\0000\\n'))
+
+    # Three octal digits take priority over a two-digit group reference.
+    self.assertEqual(u'\n', match.expand(u'\\1\\n'))
+    self.assertEqual(u'Hello\n', match.expand(u'\\10\\n'))
+    self.assertEqual(u'@\n', match.expand(u'\\100\\n'))
+    self.assertEqual(u'@0\n', match.expand(u'\\1000\\n'))
+
+  def test_getitem_group_groups_groupdict(self):
+    pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
+    text = u'Hello, world.\nI \u2665 RE2!\nGoodbye, world.\n'
+    match = self.MODULE.search(pattern, text)
+
+    # Subscripting, by number and by name.
+    self.assertEqual(u'\u2665 RE2!', match[0])
+    self.assertEqual(u'\u2665', match[1])
+    self.assertEqual(u'!', match[2])
+    self.assertEqual(u'\u2665', match[u'S'])
+    self.assertEqual(u'!', match[u'P'])
+
+    # group() with zero arguments or one argument.
+    self.assertEqual(u'\u2665 RE2!', match.group())
+    self.assertEqual(u'\u2665 RE2!', match.group(0))
+    self.assertEqual(u'\u2665', match.group(1))
+    self.assertEqual(u'!', match.group(2))
+    self.assertEqual(u'\u2665', match.group(u'S'))
+    self.assertEqual(u'!', match.group(u'P'))
+
+    # group() with several arguments, groups() and groupdict().
+    self.assertTupleEqual((u'\u2665', u'!'), match.group(1, 2))
+    self.assertTupleEqual((u'\u2665', u'!'), match.group(u'S', u'P'))
+    self.assertTupleEqual((u'\u2665', u'!'), match.groups())
+    self.assertDictEqual({u'S': u'\u2665', u'P': u'!'}, match.groupdict())
+
+  def test_bogus_group_start_end_and_span(self):
+    pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
+    text = u'I \u2665 RE2!\n'
+    match = self.MODULE.search(pattern, text)
+
+    self.assertRaises(IndexError, match.group, -1)
+    self.assertRaises(IndexError, match.group, 3)
+    self.assertRaises(IndexError, match.group, 'X')
+
+    self.assertRaises(IndexError, match.start, -1)
+    self.assertRaises(IndexError, match.start, 3)
+
+    self.assertRaises(IndexError, match.end, -1)
+    self.assertRaises(IndexError, match.end, 3)
+
+    self.assertRaises(IndexError, match.span, -1)
+    self.assertRaises(IndexError, match.span, 3)
+
+  @parameterized.parameters(
+      (u'((a)(b))((c)(d))', u'foo bar qux', None, None),
+      (u'(?P<one>(a)(b))((c)(d))', u'foo abcd qux', 4, None),
+      (u'(?P<one>(a)(b))(?P<four>(c)(d))', u'foo abcd qux', 4, 'four'),
+  )
+  def test_lastindex_lastgroup(self, pattern, text, expected_lastindex,
+                               expected_lastgroup):
+    match = self.MODULE.search(pattern, text)
+    if expected_lastindex is None:
+      self.assertIsNone(match)
+      return
+    self.assertEqual(expected_lastindex, match.lastindex)
+    self.assertEqual(expected_lastgroup, match.lastgroup)
+
+
+class Re2MatchTest(ReMatchTest):
+  """Runs the ReMatchTest cases against the re2 module."""
+
+  MODULE = re2
+
+
+class SetTest(absltest.TestCase):
+  """Tests re2.Set with each of its three anchoring modes.
+
+  Order-insensitive comparisons use assertCountEqual(), which is the Python 3
+  name of what Python 2 called assertItemsEqual().
+  """
+
+  def test_search(self):
+    s = re2.Set.SearchSet()
+    self.assertEqual(0, s.Add('\\d+'))
+    self.assertEqual(1, s.Add('\\s+'))
+    self.assertEqual(2, s.Add('\\w+'))
+    # A pattern that fails to parse must be rejected.
+    self.assertRaises(re2.error, s.Add, '(MEEP')
+    s.Compile()
+    self.assertCountEqual([1, 2], s.Match('Hello, world.'))
+
+  def test_match(self):
+    s = re2.Set.MatchSet()
+    self.assertEqual(0, s.Add('\\d+'))
+    self.assertEqual(1, s.Add('\\s+'))
+    self.assertEqual(2, s.Add('\\w+'))
+    self.assertRaises(re2.error, s.Add, '(MEEP')
+    s.Compile()
+    self.assertCountEqual([2], s.Match('Hello, world.'))
+
+  def test_fullmatch(self):
+    s = re2.Set.FullMatchSet()
+    self.assertEqual(0, s.Add('\\d+'))
+    self.assertEqual(1, s.Add('\\s+'))
+    self.assertEqual(2, s.Add('\\w+'))
+    self.assertRaises(re2.error, s.Add, '(MEEP')
+    s.Compile()
+    # No matching pattern is reported as None instead of as an empty list.
+    self.assertIsNone(s.Match('Hello, world.'))
+
+
+class FilterTest(absltest.TestCase):
+  """Tests re2.Filter.
+
+  Order-insensitive comparisons use assertCountEqual(), which is the Python 3
+  name of what Python 2 called assertItemsEqual().
+  """
+
+  def test_match(self):
+    f = re2.Filter()
+    self.assertEqual(0, f.Add('Hello, \\w+\\.'))
+    self.assertEqual(1, f.Add('\\w+, world\\.'))
+    self.assertEqual(2, f.Add('Goodbye, \\w+\\.'))
+    # A pattern that fails to parse must be rejected.
+    self.assertRaises(re2.error, f.Add, '(MEEP')
+    f.Compile()
+    self.assertCountEqual([0, 1], f.Match('Hello, world.', potential=True))
+    self.assertCountEqual([0, 1], f.Match('HELLO, WORLD.', potential=True))
+    self.assertCountEqual([0, 1], f.Match('Hello, world.'))
+    self.assertIsNone(f.Match('HELLO, WORLD.'))
+
+    self.assertRaises(IndexError, f.re, -1)
+    self.assertRaises(IndexError, f.re, 3)
+    self.assertEqual('Goodbye, \\w+\\.', f.re(2).pattern)
+    # Verify whether the underlying RE2 object is usable.
+    self.assertEqual(0, f.re(2).groups)
+
+
+if __name__ == '__main__':
+ absltest.main()
diff --git a/third_party/re2/src/python/setup.py b/third_party/re2/src/python/setup.py
new file mode 100644
index 000000000..3bd11edb7
--- /dev/null
+++ b/third_party/re2/src/python/setup.py
@@ -0,0 +1,117 @@
+# Copyright 2019 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import os
+import setuptools
+import setuptools.command.build_ext
+import shutil
+import sys
+
+# Plain-text package description; passed to setuptools.setup() further down
+# as long_description (with content type 'text/plain').
+long_description = r"""A drop-in replacement for the re module.
+
+It uses RE2 under the hood, of course, so various PCRE features
+(e.g. backreferences, look-around assertions) are not supported.
+See https://github.com/google/re2/wiki/Syntax for the canonical
+reference, but known syntactic "gotchas" relative to Python are:
+
+ * PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
+ but calls it \Z. You must rewrite \Z to \z in pattern strings.
+
+Known differences between this module's API and the re module's API:
+
+ * The error class does not provide any error information as attributes.
+ * The Options class replaces the re module's flags with RE2's options as
+ gettable/settable properties. Please see re2.h for their documentation.
+ * The pattern string and the input string do not have to be the same type.
+ Any str will be encoded to UTF-8.
+ * The pattern string cannot be str if the options specify Latin-1 encoding.
+
+Known issues with regard to building the C++ extension:
+
+ * Building requires RE2 to be installed on your system.
+ On Debian, for example, install the libre2-dev package.
+ * Building requires pybind11 to be installed on your system OR venv.
+ On Debian, for example, install the pybind11-dev package.
+ For a venv, install the pybind11 package from PyPI.
+ * Building on macOS is known to work, but has been known to fail.
+ For example, the system Python may not know which compiler flags
+ to set when building bindings for software installed by Homebrew;
+ see https://docs.brew.sh/Homebrew-and-Python#brewed-python-modules.
+ * Building on Windows has not been tested yet and will probably fail.
+"""
+
+
+class BuildExt(setuptools.command.build_ext.build_ext):
+
+ def build_extension(self, ext):
+ if 'GITHUB_ACTIONS' not in os.environ:
+ return super().build_extension(ext)
+
+ # For @pybind11_bazel's `python_configure()`.
+ os.environ['PYTHON_BIN_PATH'] = sys.executable
+
+ cmd = ['bazel', 'build']
+ try:
+ cmd.append(f'--cpu={os.environ["BAZEL_CPU"].lower()}')
+ except KeyError:
+ pass
+ cmd += ['--compilation_mode=opt', '--', ':all']
+ self.spawn(cmd)
+
+ # This ensures that f'_re2.{importlib.machinery.EXTENSION_SUFFIXES[0]}'
+ # is the filename in the destination directory, which is what's needed.
+ shutil.copyfile('../bazel-bin/python/_re2.so',
+ self.get_ext_fullpath(ext.name))
+
+ cmd = ['bazel', 'clean', '--expunge']
+ self.spawn(cmd)
+
+
+def options():
+ bdist_wheel = {}
+ try:
+ bdist_wheel['plat_name'] = os.environ['PLAT_NAME']
+ except KeyError:
+ pass
+ return {'bdist_wheel': bdist_wheel}
+
+
+def include_dirs():
+ """Yields pybind11's include directory, or nothing if pybind11 is missing.
+
+ This is a generator; ModuleNotFoundError raised inside the try block is
+ swallowed, so the result is then simply empty.
+ """
+ try:
+ import pybind11
+ yield pybind11.get_include()
+ except ModuleNotFoundError:
+ pass
+
+
+# The _re2 extension: built from _re2.cc, linked against the re2 library,
+# with whatever include directories include_dirs() yields.
+ext_module = setuptools.Extension(
+ name='_re2',
+ sources=['_re2.cc'],
+ include_dirs=list(include_dirs()),
+ libraries=['re2'],
+ extra_compile_args=['-fvisibility=hidden'],
+)
+
+# Package definition. BuildExt is registered as the build_ext command and
+# options() supplies the per-command options.
+setuptools.setup(
+ name='google-re2',
+ version='1.1',
+ description='RE2 Python bindings',
+ long_description=long_description,
+ long_description_content_type='text/plain',
+ author='The RE2 Authors',
+ author_email='re2-dev@googlegroups.com',
+ url='https://github.com/google/re2',
+ py_modules=['re2'],
+ ext_modules=[ext_module],
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'License :: OSI Approved :: BSD License',
+ 'Programming Language :: C++',
+ 'Programming Language :: Python :: 3.8',
+ ],
+ options=options(),
+ cmdclass={'build_ext': BuildExt},
+ python_requires='~=3.8',
+)
diff --git a/third_party/re2/src/re2.pc.in b/third_party/re2/src/re2.pc.in
new file mode 100644
index 000000000..c6182d828
--- /dev/null
+++ b/third_party/re2/src/re2.pc.in
@@ -0,0 +1,9 @@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: re2
+Description: RE2 is a fast, safe, thread-friendly regular expression engine.
+Requires: @REQUIRES@
+Version: @SONAME@.0.0
+Cflags: -pthread -I${includedir}
+Libs: -pthread -L${libdir} -lre2
diff --git a/third_party/re2/src/re2/bitmap256.cc b/third_party/re2/src/re2/bitmap256.cc
new file mode 100644
index 000000000..f6fbca304
--- /dev/null
+++ b/third_party/re2/src/re2/bitmap256.cc
@@ -0,0 +1,44 @@
+// Copyright 2023 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/bitmap256.h"
+
+#include <stdint.h>
+
+#include "absl/base/macros.h"
+#include "util/logging.h"
+
+namespace re2 {
+
+int Bitmap256::FindNextSetBit(int c) const {
+  DCHECK_GE(c, 0);
+  DCHECK_LE(c, 255);
+
+  // Start with the word that holds bit c, ignoring the bits below c.
+  const int first = c / 64;
+  const uint64_t masked = words_[first] & (~uint64_t{0} << (c % 64));
+  if (masked != 0)
+    return (first * 64) + FindLSBSet(masked);
+
+  // Then scan the remaining words in ascending order.
+  for (int i = first + 1; i < 4; i++) {
+    if (words_[i] != 0)
+      return (i * 64) + FindLSBSet(words_[i]);
+  }
+
+  // No bit with index >= c is set.
+  return -1;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/bitmap256.h b/third_party/re2/src/re2/bitmap256.h
new file mode 100644
index 000000000..293b31d85
--- /dev/null
+++ b/third_party/re2/src/re2/bitmap256.h
@@ -0,0 +1,86 @@
+// Copyright 2016 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_BITMAP256_H_
+#define RE2_BITMAP256_H_
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#include <stdint.h>
+#include <string.h>
+
+#include "util/logging.h"
+
+namespace re2 {
+
+// A fixed-size set of 256 bits stored as four 64-bit words.
+class Bitmap256 {
+ public:
+  // Constructs a bitmap with every bit cleared.
+  Bitmap256() {
+    Clear();
+  }
+
+  // Resets every bit to zero.
+  void Clear() {
+    memset(words_, 0, sizeof words_);
+  }
+
+  // Returns whether the bit with index c is set.
+  bool Test(int c) const {
+    DCHECK_GE(c, 0);
+    DCHECK_LE(c, 255);
+
+    const uint64_t mask = uint64_t{1} << (c % 64);
+    return (words_[c / 64] & mask) != 0;
+  }
+
+  // Turns on the bit with index c.
+  void Set(int c) {
+    DCHECK_GE(c, 0);
+    DCHECK_LE(c, 255);
+
+    const uint64_t mask = uint64_t{1} << (c % 64);
+    words_[c / 64] |= mask;
+  }
+
+  // Returns the index of the first set bit at or after index c,
+  // or -1 when there is none.
+  int FindNextSetBit(int c) const;
+
+ private:
+  // Returns the index of the lowest set bit of n, which must be non-zero.
+  static int FindLSBSet(uint64_t n) {
+    DCHECK_NE(n, 0);
+#if defined(__GNUC__)
+    return __builtin_ctzll(n);
+#elif defined(_MSC_VER) && defined(_M_X64)
+    unsigned long index;
+    _BitScanForward64(&index, n);
+    return static_cast<int>(index);
+#elif defined(_MSC_VER) && defined(_M_IX86)
+    // Only the 32-bit scan is used here: try the low half, then the high half.
+    unsigned long index;
+    const uint32_t low = static_cast<uint32_t>(n);
+    if (low != 0) {
+      _BitScanForward(&index, low);
+      return static_cast<int>(index);
+    }
+    _BitScanForward(&index, static_cast<uint32_t>(n >> 32));
+    return static_cast<int>(index) + 32;
+#else
+    // Portable fallback: shift left by 32, 16, ..., 1. Whenever bits
+    // survive the shift, keep the shifted value and lower the answer.
+    int pos = 63;
+    for (int shift = 32; shift != 0; shift /= 2) {
+      const uint64_t shifted = n << shift;
+      if (shifted != 0) {
+        n = shifted;
+        pos -= shift;
+      }
+    }
+    return pos;
+#endif
+  }
+
+  uint64_t words_[4];
+};
+
+} // namespace re2
+
+#endif // RE2_BITMAP256_H_
diff --git a/third_party/re2/src/re2/bitstate.cc b/third_party/re2/src/re2/bitstate.cc
new file mode 100644
index 000000000..38a0b87cc
--- /dev/null
+++ b/third_party/re2/src/re2/bitstate.cc
@@ -0,0 +1,381 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+
+// Prog::SearchBitState is a regular expression search with submatch
+// tracking for small regular expressions and texts. Similarly to
+// testing/backtrack.cc, it allocates a bitmap with (count of
+// lists) * (length of text) bits to make sure it never explores the
+// same (instruction list, character position) multiple times. This
+// limits the search to run in time linear in the length of the text.
+//
+// Unlike testing/backtrack.cc, SearchBitState is not recursive
+// on the text.
+//
+// SearchBitState is a fast replacement for the NFA code on small
+// regexps and texts when SearchOnePass cannot be used.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <limits>
+#include <utility>
+
+#include "util/logging.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One entry on BitState's job stack.
+struct Job {
+ int id; // instruction id to resume at; negative means "undo a Capture"
+ int rle; // run length encoding
+ const char* p; // text position (or saved capture value when id < 0)
+};
+
+// State for a single bit-state search over one Prog.
+// Not copyable; Search may be called only once per instance.
+class BitState {
+ public:
+ explicit BitState(Prog* prog);
+
+ // The usual Search prototype.
+ // Can only call Search once per BitState.
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool longest, absl::string_view* submatch, int nsubmatch);
+
+ private:
+ // Marks the (list of id, p) pair as visited; false if it already was.
+ inline bool ShouldVisit(int id, const char* p);
+ // Pushes (id, p) onto the job stack, growing it if necessary.
+ void Push(int id, const char* p);
+ // Doubles the size of the job stack.
+ void GrowStack();
+ // Runs the search from instruction id at text position p.
+ bool TrySearch(int id, const char* p);
+
+ // Search parameters
+ Prog* prog_; // program being run
+ absl::string_view text_; // text being searched
+ absl::string_view context_; // greater context of text being searched
+ bool anchored_; // whether search is anchored at text.begin()
+ bool longest_; // whether search wants leftmost-longest match
+ bool endmatch_; // whether match must end at text.end()
+ absl::string_view* submatch_; // submatches to fill in
+ int nsubmatch_; // # of submatches to fill in
+
+ // Search state
+ static constexpr int kVisitedBits = 64;
+ PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
+ PODArray<const char*> cap_; // capture registers
+ PODArray<Job> job_; // stack of text positions to explore
+ int njob_; // stack size
+
+ BitState(const BitState&) = delete;
+ BitState& operator=(const BitState&) = delete;
+};
+
+// Binds the searcher to prog and zeroes the search parameters; the real
+// values are filled in by Search().
+BitState::BitState(Prog* prog)
+    : prog_(prog),
+      anchored_(false),
+      longest_(false),
+      endmatch_(false),
+      submatch_(nullptr),
+      nsubmatch_(0),
+      njob_(0) {}
+
+// Decides whether the search should explore the (list ID, p) pair, where
+// the list ID is looked up from id, which *must* be a list head.
+// The first query for a pair records it as visited and returns true;
+// every later query for the same pair returns false.
+bool BitState::ShouldVisit(int id, const char* p) {
+  const int width = static_cast<int>(text_.size()+1);
+  const int offset = static_cast<int>(p-text_.data());
+  const int bit = prog_->list_heads()[id] * width + offset;
+
+  uint64_t& word = visited_[bit/kVisitedBits];
+  const uint64_t mask = uint64_t{1} << (bit & (kVisitedBits-1));
+  if (word & mask)
+    return false;
+  word |= mask;
+  return true;
+}
+
+// Replaces the job stack with one twice as large,
+// carrying over the njob_ entries currently in use.
+void BitState::GrowStack() {
+  PODArray<Job> bigger(2*job_.size());
+  memmove(bigger.data(), job_.data(), njob_*sizeof job_[0]);
+  job_ = std::move(bigger);
+}
+
+// Pushes (id, p) onto the job stack, growing the stack when it is full.
+// A push that continues the run held by the top entry (same id, next
+// position) is folded into that entry's rle count instead of taking a slot.
+void BitState::Push(int id, const char* p) {
+  if (njob_ >= job_.size()) {
+    GrowStack();
+    if (njob_ >= job_.size()) {
+      LOG(DFATAL) << "GrowStack() failed: "
+                  << "njob_ = " << njob_ << ", "
+                  << "job_.size() = " << job_.size();
+      return;
+    }
+  }
+
+  // A negative id undoes a Capture, so such jobs are never merged.
+  const bool mergeable = id >= 0 && njob_ > 0;
+  if (mergeable) {
+    Job& last = job_[njob_-1];
+    const bool extends_run = id == last.id &&
+                             p == last.p + last.rle + 1 &&
+                             last.rle < std::numeric_limits<int>::max();
+    if (extends_run) {
+      ++last.rle;
+      return;
+    }
+  }
+
+  Job& slot = job_[njob_++];
+  slot.id = id;
+  slot.rle = 0;
+  slot.p = p;
+}
+
+// Try a search from instruction id0 in state p0.
+// Return whether it succeeded.
+//
+// Works off an explicit job stack (job_/njob_) rather than recursing.
+// When a match is found and nsubmatch_ > 0, submatch_[] is filled in from
+// cap_. With longest_ set, the loop keeps going after a match in the hope
+// of finding a longer one.
+bool BitState::TrySearch(int id0, const char* p0) {
+ bool matched = false;
+ const char* end = text_.data() + text_.size();
+ njob_ = 0;
+ // Push() no longer checks ShouldVisit(),
+ // so we must perform the check ourselves.
+ if (ShouldVisit(id0, p0))
+ Push(id0, p0);
+ while (njob_ > 0) {
+ // Pop job off stack.
+ --njob_;
+ int id = job_[njob_].id;
+ // Held by reference so that --rle below updates the stack entry itself.
+ int& rle = job_[njob_].rle;
+ const char* p = job_[njob_].p;
+
+ if (id < 0) {
+ // Undo the Capture.
+ cap_[prog_->inst(-id)->cap()] = p;
+ continue;
+ }
+
+ // An entry with rle > 0 stands for several consecutive positions (see
+ // Push): take the last one now and leave the rest on the stack.
+ if (rle > 0) {
+ p += rle;
+ // Revivify job on stack.
+ --rle;
+ ++njob_;
+ }
+
+ Loop:
+ // Visit id, p.
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
+ return false;
+
+ case kInstFail:
+ break;
+
+ case kInstAltMatch:
+ if (ip->greedy(prog_)) {
+ // out1 is the Match instruction.
+ id = ip->out1();
+ p = end;
+ goto Loop;
+ }
+ if (longest_) {
+ // ip must be non-greedy...
+ // out is the Match instruction.
+ id = ip->out();
+ p = end;
+ goto Loop;
+ }
+ goto Next;
+
+ case kInstByteRange: {
+ // c stays -1 when p is at the end of the text (no byte to read).
+ int c = -1;
+ if (p < end)
+ c = *p & 0xFF;
+ if (!ip->Matches(c))
+ goto Next;
+
+ if (ip->hint() != 0)
+ Push(id+ip->hint(), p); // try the next when we're done
+ id = ip->out();
+ p++;
+ goto CheckAndLoop;
+ }
+
+ case kInstCapture:
+ if (!ip->last())
+ Push(id+1, p); // try the next when we're done
+
+ // Capture indices outside cap_ are not recorded.
+ if (0 <= ip->cap() && ip->cap() < cap_.size()) {
+ // Capture p to register, but save old value first.
+ Push(-id, cap_[ip->cap()]); // undo when we're done
+ cap_[ip->cap()] = p;
+ }
+
+ id = ip->out();
+ goto CheckAndLoop;
+
+ case kInstEmptyWidth:
+ // Give up on this instruction if it requires a flag not set at p.
+ if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+ goto Next;
+
+ if (!ip->last())
+ Push(id+1, p); // try the next when we're done
+ id = ip->out();
+ goto CheckAndLoop;
+
+ case kInstNop:
+ if (!ip->last())
+ Push(id+1, p); // try the next when we're done
+ id = ip->out();
+
+ // Shared tail for the cases above that move on to a new list head.
+ CheckAndLoop:
+ // Sanity check: id is the head of its list, which must
+ // be the case if id-1 is the last of *its* list. :)
+ DCHECK(id == 0 || prog_->inst(id-1)->last());
+ if (ShouldVisit(id, p))
+ goto Loop;
+ break;
+
+ case kInstMatch: {
+ if (endmatch_ && p != end)
+ goto Next;
+
+ // We found a match. If the caller doesn't care
+ // where the match is, no point going further.
+ if (nsubmatch_ == 0)
+ return true;
+
+ // Record best match so far.
+ // Only need to check end point, because this entire
+ // call is only considering one start position.
+ matched = true;
+ cap_[1] = p;
+ if (submatch_[0].data() == NULL ||
+ (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
+ for (int i = 0; i < nsubmatch_; i++)
+ submatch_[i] = absl::string_view(
+ cap_[2 * i],
+ static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
+ }
+
+ // If going for first match, we're done.
+ if (!longest_)
+ return true;
+
+ // If we used the entire text, no longer match is possible.
+ if (p == end)
+ return true;
+
+ // Otherwise, continue on in hope of a longer match.
+ // Note the absence of the ShouldVisit() check here
+ // due to execution remaining in the same list.
+ Next:
+ if (!ip->last()) {
+ id++;
+ goto Loop;
+ }
+ break;
+ }
+ }
+ }
+ return matched;
+}
+
+// Search text (within context) for prog_.
+//
+// Returns false straight away when prog_ is anchored at the start or end
+// and text does not begin or end where context does. Otherwise sets up the
+// visited bitmap, capture registers and job stack, then runs TrySearch
+// either once (anchored) or from each candidate start position.
+bool BitState::Search(absl::string_view text, absl::string_view context,
+ bool anchored, bool longest, absl::string_view* submatch,
+ int nsubmatch) {
+ // Search parameters.
+ text_ = text;
+ context_ = context;
+ if (context_.data() == NULL)
+ context_ = text;
+ if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
+ return false;
+ if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
+ return false;
+ anchored_ = anchored || prog_->anchor_start();
+ longest_ = longest || prog_->anchor_end();
+ endmatch_ = prog_->anchor_end();
+ submatch_ = submatch;
+ nsubmatch_ = nsubmatch;
+ for (int i = 0; i < nsubmatch_; i++)
+ submatch_[i] = absl::string_view();
+
+ // Allocate scratch space.
+ // One bit per (list, text position) pair, rounded up to whole words.
+ int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
+ nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
+ visited_ = PODArray<uint64_t>(nvisited);
+ memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
+
+ // Two registers per submatch; never fewer than two, because TrySearch
+ // writes cap_[1] and this function writes cap_[0].
+ int ncap = 2*nsubmatch;
+ if (ncap < 2)
+ ncap = 2;
+ cap_ = PODArray<const char*>(ncap);
+ memset(cap_.data(), 0, ncap*sizeof cap_[0]);
+
+ // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
+ job_ = PODArray<Job>(64);
+
+ // Anchored search must start at text.begin().
+ if (anchored_) {
+ cap_[0] = text.data();
+ return TrySearch(prog_->start(), text.data());
+ }
+
+ // Unanchored search, starting from each possible text position.
+ // Notice that we have to try the empty string at the end of
+ // the text, so the loop condition is p <= text.end(), not p < text.end().
+ // This looks like it's quadratic in the size of the text,
+ // but we are not clearing visited_ between calls to TrySearch,
+ // so no work is duplicated and it ends up still being linear.
+ const char* etext = text.data() + text.size();
+ for (const char* p = text.data(); p <= etext; p++) {
+ // Try to use prefix accel (e.g. memchr) to skip ahead.
+ if (p < etext && prog_->can_prefix_accel()) {
+ p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
+ // A NULL result is treated as "try only the end of the text".
+ if (p == NULL)
+ p = etext;
+ }
+
+ cap_[0] = p;
+ if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
+ return true;
+ // Avoid invoking undefined behavior (arithmetic on a null pointer)
+ // by simply not continuing the loop.
+ if (p == NULL)
+ break;
+ }
+ return false;
+}
+
+// Bit-state search: runs a BitState over text for this Prog.
+bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
+                          Anchor anchor, MatchKind kind,
+                          absl::string_view* match, int nmatch) {
+  // A full match is run as an anchored longest match whose end is then
+  // compared with the end of text, so match[0] has to exist.
+  const bool full = kind == kFullMatch;
+  absl::string_view whole;
+  if (full) {
+    anchor = kAnchored;
+    if (nmatch < 1) {
+      match = &whole;
+      nmatch = 1;
+    }
+  }
+
+  // Run the search.
+  BitState state(this);
+  const bool found = state.Search(text, context, anchor == kAnchored,
+                                  kind != kFirstMatch, match, nmatch);
+  if (!found)
+    return false;
+  return !(full && EndPtr(match[0]) != EndPtr(text));
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/compile.cc b/third_party/re2/src/re2/compile.cc
new file mode 100644
index 000000000..aa798872e
--- /dev/null
+++ b/third_party/re2/src/re2/compile.cc
@@ -0,0 +1,1262 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compile regular expression to Prog.
+//
+// Prog and Inst are defined in prog.h.
+// This file's external interface is just Regexp::CompileToProg.
+// The Compiler class defined in this file is private.
+
+#include <stdint.h>
+#include <string.h>
+#include <utility>
+
+#include "absl/base/macros.h"
+#include "absl/container/flat_hash_map.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// List of pointers to Inst* that need to be filled in (patched).
+// Because the Inst* haven't been filled in yet,
+// we can use the Inst* word to hold the list's "next" pointer.
+// It's kind of sleazy, but it works well in practice.
+// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
+//
+// Because the out and out1 fields in Inst are no longer pointers,
+// we can't use pointers directly here either. Instead, head refers
+// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
+// head == 0 represents the NULL list. This is okay because instruction #0
+// is always the fail instruction, which never appears on a list.
+struct PatchList {
+  // Builds the one-element patch list holding only p.
+  static PatchList Mk(uint32_t p) {
+    PatchList single = {p, p};
+    return single;
+  }
+
+  // Walks l and stores p into every slot on it.
+  // The list is consumed: the caller must not use it afterwards.
+  static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
+    uint32_t cursor = l.head;
+    while (cursor != 0) {
+      Prog::Inst* slot = &inst0[cursor >> 1];
+      // The low bit picks out1 (set) or out (clear). The slot currently
+      // holds the link to the next entry, so read it before overwriting.
+      if ((cursor & 1) != 0) {
+        cursor = slot->out1();
+        slot->out1_ = p;
+      } else {
+        cursor = slot->out();
+        slot->set_out(p);
+      }
+    }
+  }
+
+  // Joins l1 and l2 into a single list by linking l1's tail slot to
+  // l2's head. An empty operand yields the other list unchanged.
+  static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
+    if (l1.head == 0)
+      return l2;
+    if (l2.head == 0)
+      return l1;
+
+    Prog::Inst* last = &inst0[l1.tail >> 1];
+    if ((l1.tail & 1) != 0)
+      last->out1_ = l2.head;
+    else
+      last->set_out(l2.head);
+
+    PatchList joined = {l1.head, l2.tail};
+    return joined;
+  }
+
+  uint32_t head;
+  uint32_t tail;  // for constant-time append
+};
+
+// The empty patch list: head == 0 denotes the NULL list.
+static const PatchList kNullPatchList = {0, 0};
+
+// Compiled program fragment.
+struct Frag {
+ uint32_t begin; // index of the first instruction; 0 marks "no match"
+ PatchList end; // slots still waiting to be patched with a target
+ bool nullable; // presumably: whether the fragment can match the empty string
+
+ // The default fragment is the unmatchable one.
+ Frag() : begin(0), end(kNullPatchList), nullable(false) {}
+ Frag(uint32_t begin, PatchList end, bool nullable)
+ : begin(begin), end(end), nullable(nullable) {}
+};
+
+// Input encodings.
+// Enumerators start at 1, so 0 is not a valid Encoding value.
+enum Encoding {
+ kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
+ kEncodingLatin1, // Latin-1 (0-FF)
+};
+
+// Compiles a Regexp into a Prog. It is a Regexp::Walker over Frag values:
+// each visited node yields the program fragment for that subexpression.
+// Not copyable.
+class Compiler : public Regexp::Walker<Frag> {
+ public:
+ explicit Compiler();
+ ~Compiler();
+
+ // Compiles Regexp to a new Prog.
+ // Caller is responsible for deleting Prog when finished with it.
+ // If reversed is true, compiles for walking over the input
+ // string backward (reverses all concatenations).
+ static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
+
+ // Compiles alternation of all the re to a new Prog.
+ // Each re has a match with an id equal to its index in the vector.
+ static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
+
+ // Interface for Regexp::Walker, which helps traverse the Regexp.
+ // The walk is purely post-recursive: given the machines for the
+ // children, PostVisit combines them to create the machine for
+ // the current node. The child_args are Frags.
+ // The Compiler traverses the Regexp parse tree, visiting
+ // each node in depth-first order. It invokes PreVisit before
+ // visiting the node's children and PostVisit after visiting
+ // the children.
+ Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
+ Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
+ int nchild_args);
+ Frag ShortVisit(Regexp* re, Frag parent_arg);
+ Frag Copy(Frag arg);
+
+ // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
+ Frag Plus(Frag a, bool nongreedy);
+ Frag Star(Frag a, bool nongreedy);
+ Frag Quest(Frag a, bool nongreedy);
+
+ // Given fragment a, returns (a) capturing as \n.
+ Frag Capture(Frag a, int n);
+
+ // Given fragments a and b, returns ab; a|b
+ Frag Cat(Frag a, Frag b);
+ Frag Alt(Frag a, Frag b);
+
+ // Returns a fragment that can't match anything.
+ Frag NoMatch();
+
+ // Returns a fragment that matches the empty string.
+ Frag Match(int32_t id);
+
+ // Returns a no-op fragment.
+ Frag Nop();
+
+ // Returns a fragment matching the byte range lo-hi.
+ Frag ByteRange(int lo, int hi, bool foldcase);
+
+ // Returns a fragment matching an empty-width special op.
+ Frag EmptyWidth(EmptyOp op);
+
+ // Adds n instructions to the program.
+ // Returns the index of the first one.
+ // Returns -1 if no more instructions are available.
+ int AllocInst(int n);
+
+ // Rune range compiler.
+
+ // Begins a new alternation.
+ void BeginRange();
+
+ // Adds a fragment matching the rune range lo-hi.
+ void AddRuneRange(Rune lo, Rune hi, bool foldcase);
+ void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
+ void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
+ void Add_80_10ffff();
+
+ // New suffix that matches the byte range lo-hi, then goes to next.
+ // The Cached variant reuses an existing suffix with the same
+ // (lo, hi, foldcase, next) when rune_cache_ already holds one.
+ int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
+ int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
+
+ // Returns true iff the suffix is cached.
+ bool IsCachedRuneByteSuffix(int id);
+
+ // Adds a suffix to alternation.
+ void AddSuffix(int id);
+
+ // Adds a suffix to the trie starting from the given root node.
+ // Returns zero iff allocating an instruction fails. Otherwise, returns
+ // the current root node, which might be different from what was given.
+ int AddSuffixRecursive(int root, int id);
+
+ // Finds the trie node for the given suffix. Returns a Frag in order to
+ // distinguish between pointing at the root node directly (end.head == 0)
+ // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
+ Frag FindByteRange(int root, int id);
+
+ // Compares two ByteRanges and returns true iff they are equal.
+ bool ByteRangeEqual(int id1, int id2);
+
+ // Returns the alternation of all the added suffixes.
+ Frag EndRange();
+
+ // Single rune.
+ Frag Literal(Rune r, bool foldcase);
+
+ void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
+ Prog* Finish(Regexp* re);
+
+ // Returns .* where dot = any byte
+ Frag DotStar();
+
+ private:
+ Prog* prog_; // Program being built.
+ bool failed_; // Did we give up compiling?
+ Encoding encoding_; // Input encoding
+ bool reversed_; // Should program run backward over text?
+
+ PODArray<Prog::Inst> inst_; // Instruction storage; grown by AllocInst.
+ int ninst_; // Number of instructions used.
+ int max_ninst_; // Maximum number of instructions.
+
+ int64_t max_mem_; // Total memory budget.
+
+ // Suffix cache for the rune range compiler: maps a key built from
+ // (lo, hi, foldcase, next) to the instruction id of that suffix.
+ absl::flat_hash_map<uint64_t, int> rune_cache_;
+ // Fragment accumulated by the rune range compiler.
+ Frag rune_range_;
+
+ RE2::Anchor anchor_; // anchor mode for RE2::Set
+
+ Compiler(const Compiler&) = delete;
+ Compiler& operator=(const Compiler&) = delete;
+};
+
+// Creates the Prog to be filled in and allocates instruction #0,
+// the fail instruction.
+Compiler::Compiler() {
+  max_mem_ = 0;
+  reversed_ = false;
+  encoding_ = kEncodingUTF8;
+  failed_ = false;
+  prog_ = new Prog();
+
+  // Raise the instruction limit to one just long enough for AllocInst
+  // to hand out the fail instruction, then drop it back to zero.
+  ninst_ = 0;
+  max_ninst_ = 1;
+  const int fail_id = AllocInst(1);
+  inst_[fail_id].InitFail();
+  max_ninst_ = 0;  // Caller must change
+}
+
+// Deletes prog_, the Prog allocated by the constructor.
+Compiler::~Compiler() {
+ delete prog_;
+}
+
+// Reserves n consecutive instructions and returns the index of the first.
+// Returns -1 and marks the compilation as failed when it has already
+// failed or when the request would exceed max_ninst_.
+int Compiler::AllocInst(int n) {
+  if (failed_ || ninst_ + n > max_ninst_) {
+    failed_ = true;
+    return -1;
+  }
+
+  const int needed = ninst_ + n;
+  if (needed > inst_.size()) {
+    // Grow by doubling (from at least 8), copy the used instructions
+    // and zero everything after them.
+    int capacity = inst_.size();
+    if (capacity == 0)
+      capacity = 8;
+    while (capacity < needed)
+      capacity *= 2;
+    PODArray<Prog::Inst> grown(capacity);
+    if (inst_.data() != NULL)
+      memmove(grown.data(), inst_.data(), ninst_*sizeof inst_[0]);
+    memset(grown.data() + ninst_, 0, (capacity - ninst_)*sizeof inst_[0]);
+    inst_ = std::move(grown);
+  }
+
+  const int first = ninst_;
+  ninst_ = needed;
+  return first;
+}
+
+// These routines are somewhat hard to visualize in text --
+// see http://swtch.com/~rsc/regexp/regexp1.html for
+// pictures explaining what is going on here.
+
+// Returns an unmatchable fragment.
+// This is the default-constructed Frag, whose begin is 0.
+Frag Compiler::NoMatch() {
+ return Frag();
+}
+
+// Is a an unmatchable fragment?
+// A fragment counts as unmatchable when its begin is 0.
+static bool IsNoMatch(Frag a) {
+ return a.begin == 0;
+}
+
+// Builds the fragment for the concatenation ab.
+// The result is unmatchable if either operand is.
+Frag Compiler::Cat(Frag a, Frag b) {
+  if (IsNoMatch(a) || IsNoMatch(b))
+    return NoMatch();
+
+  // Elide no-op: when a is a single, still unpatched Nop, point it at b
+  // (in case something already refers to a) and hand back b itself.
+  Prog::Inst* head = &inst_[a.begin];
+  const bool lone_nop = head->opcode() == kInstNop &&
+                        a.end.head == (a.begin << 1) &&
+                        head->out() == 0;
+  if (lone_nop) {
+    PatchList::Patch(inst_.data(), a.end, b.begin);
+    return b;
+  }
+
+  // To run backward over string, reverse all concatenations:
+  // b then leads and a follows.
+  const Frag& lead = reversed_ ? b : a;
+  const Frag& follow = reversed_ ? a : b;
+  PatchList::Patch(inst_.data(), lead.end, follow.begin);
+  return Frag(lead.begin, follow.end, lead.nullable && follow.nullable);
+}
+
+// Builds the fragment for the alternation a|b.
+Frag Compiler::Alt(Frag a, Frag b) {
+  // An unmatchable side contributes nothing; this is convenient for
+  // callers that build up alternations in a loop.
+  if (IsNoMatch(a))
+    return b;
+  if (IsNoMatch(b))
+    return a;
+
+  const int alt = AllocInst(1);
+  if (alt < 0)
+    return NoMatch();
+
+  inst_[alt].InitAlt(a.begin, b.begin);
+  const PatchList ends = PatchList::Append(inst_.data(), a.end, b.end);
+  return Frag(alt, ends, a.nullable || b.nullable);
+}
+
+// When capturing submatches in like-Perl mode, a kInstAlt Inst
+// treats out_ as the first choice, out1_ as the second.
+//
+// For *, +, and ?, if out_ causes another repetition,
+// then the operator is greedy. If out1_ is the repetition
+// (and out_ moves forward), then the operator is non-greedy.
+
+// Builds the fragment for a+, or for a+? when nongreedy is set.
+Frag Compiler::Plus(Frag a, bool nongreedy) {
+  const int alt = AllocInst(1);
+  if (alt < 0)
+    return NoMatch();
+
+  // The greedy form tries another repetition first (out) and leaves out1
+  // dangling; the non-greedy form is the mirror image.
+  PatchList dangling;
+  if (!nongreedy) {
+    inst_[alt].InitAlt(a.begin, 0);
+    dangling = PatchList::Mk((alt << 1) | 1);
+  } else {
+    inst_[alt].InitAlt(0, a.begin);
+    dangling = PatchList::Mk(alt << 1);
+  }
+
+  // Every exit of a leads to the Alt; execution still enters at a.
+  PatchList::Patch(inst_.data(), a.end, alt);
+  return Frag(a.begin, dangling, a.nullable);
+}
+
+// Builds the fragment for a*, or for a*? when nongreedy is set.
+Frag Compiler::Star(Frag a, bool nongreedy) {
+  // When the subexpression is nullable, one Alt isn't enough to guarantee
+  // correct priority ordering within the transitive closure. The simplest
+  // solution is to handle it as (a+)? instead, which adds the second Alt.
+  if (a.nullable)
+    return Quest(Plus(a, nongreedy), nongreedy);
+
+  const int alt = AllocInst(1);
+  if (alt < 0)
+    return NoMatch();
+
+  PatchList dangling;
+  if (!nongreedy) {
+    inst_[alt].InitAlt(a.begin, 0);
+    dangling = PatchList::Mk((alt << 1) | 1);
+  } else {
+    inst_[alt].InitAlt(0, a.begin);
+    dangling = PatchList::Mk(alt << 1);
+  }
+
+  // Every exit of a loops back to the Alt, which is also the entry point.
+  PatchList::Patch(inst_.data(), a.end, alt);
+  return Frag(alt, dangling, true);
+}
+
+// Builds the fragment for a?, or for a?? when nongreedy is set.
+// An unmatchable a yields a plain no-op.
+Frag Compiler::Quest(Frag a, bool nongreedy) {
+  if (IsNoMatch(a))
+    return Nop();
+
+  const int alt = AllocInst(1);
+  if (alt < 0)
+    return NoMatch();
+
+  PatchList skip;
+  if (!nongreedy) {
+    inst_[alt].InitAlt(a.begin, 0);
+    skip = PatchList::Mk((alt << 1) | 1);
+  } else {
+    inst_[alt].InitAlt(0, a.begin);
+    skip = PatchList::Mk(alt << 1);
+  }
+
+  // Both the skipping branch and the exits of a remain to be patched.
+  const PatchList ends = PatchList::Append(inst_.data(), skip, a.end);
+  return Frag(alt, ends, true);
+}
+
+// Builds a one-instruction fragment that matches a byte in lo-hi,
+// with its out left to be patched.
+Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
+  const int inst_id = AllocInst(1);
+  if (inst_id < 0)
+    return NoMatch();
+
+  inst_[inst_id].InitByteRange(lo, hi, foldcase, 0);
+  const PatchList pending = PatchList::Mk(inst_id << 1);
+  return Frag(inst_id, pending, false);
+}
+
+// Builds a one-instruction no-op fragment. Sometimes unavoidable.
+Frag Compiler::Nop() {
+  const int inst_id = AllocInst(1);
+  if (inst_id < 0)
+    return NoMatch();
+
+  inst_[inst_id].InitNop(0);
+  const PatchList pending = PatchList::Mk(inst_id << 1);
+  return Frag(inst_id, pending, true);
+}
+
+// Builds a one-instruction fragment that signals a match with match_id.
+// It has nothing left to patch.
+Frag Compiler::Match(int32_t match_id) {
+  const int inst_id = AllocInst(1);
+  if (inst_id < 0)
+    return NoMatch();
+
+  inst_[inst_id].InitMatch(match_id);
+  return Frag(inst_id, kNullPatchList, false);
+}
+
+// Builds a one-instruction fragment for an empty-width op (like ^ or $),
+// with its out left to be patched.
+Frag Compiler::EmptyWidth(EmptyOp empty) {
+  const int inst_id = AllocInst(1);
+  if (inst_id < 0)
+    return NoMatch();
+
+  inst_[inst_id].InitEmptyWidth(empty, 0);
+  const PatchList pending = PatchList::Mk(inst_id << 1);
+  return Frag(inst_id, pending, true);
+}
+
+// Wraps fragment a in capturing parens for group n: capture 2*n is
+// recorded before a and capture 2*n+1 after it.
+Frag Compiler::Capture(Frag a, int n) {
+  if (IsNoMatch(a))
+    return NoMatch();
+
+  const int open = AllocInst(2);
+  if (open < 0)
+    return NoMatch();
+  const int close = open + 1;
+
+  inst_[open].InitCapture(2*n, a.begin);
+  inst_[close].InitCapture(2*n+1, 0);
+  PatchList::Patch(inst_.data(), a.end, close);
+
+  return Frag(open, PatchList::Mk(close << 1), a.nullable);
+}
+
+// A Rune is a name for a Unicode code point.
+// Returns the largest Rune that a UTF-8 sequence of len bytes can encode
+// (len < UTFmax).
+static int MaxRune(int len) {
+  // A one-byte sequence carries 7 Rune bits. Otherwise the first byte
+  // carries 8-(len+1) bits and each of the len-1 remaining bytes carries 6.
+  const int bits = (len == 1) ? 7 : 8-(len+1) + 6*(len-1);
+  return (1<<bits) - 1;
+}
+
+// The rune range compiler caches common suffix fragments,
+// which are very common in UTF-8 (e.g., [80-bf]).
+// The fragment suffixes are identified by their start
+// instructions. NULL denotes the eventual end match.
+// The Frag accumulates in rune_range_. Caching common
+// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
+// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
+
+// Starts a fresh rune range: resets the accumulated fragment to empty
+// and forgets all cached suffixes.
+void Compiler::BeginRange() {
+  rune_range_.end = kNullPatchList;
+  rune_range_.begin = 0;
+  rune_cache_.clear();
+}
+
+int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
+                                     int next) {  // never consults rune_cache_
+  Frag range = ByteRange(lo, hi, foldcase);
+  if (next == 0) {  // no successor given: queue the exit on rune_range_.end
+    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, range.end);
+  } else {
+    PatchList::Patch(inst_.data(), range.end, next);  // link straight to next
+  }
+  return range.begin;
+}
+
+static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
+                                 int next) {  // packs the four fields into one key
+  uint64_t key = static_cast<uint64_t>(next) << 17;  // bits 17 and up: next
+  key |= static_cast<uint64_t>(lo) << 9;             // bits 9-16: lo
+  key |= static_cast<uint64_t>(hi) << 1;             // bits 1-8: hi
+  return key | static_cast<uint64_t>(foldcase);      // bit 0: foldcase
+}
+
+int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
+                                   int next) {  // memoized through rune_cache_
+  const uint64_t cache_key = MakeRuneCacheKey(lo, hi, foldcase, next);
+  const auto hit = rune_cache_.find(cache_key);
+  if (hit != rune_cache_.end()) return hit->second;
+  // Miss: build the suffix and remember it under the same key.
+  const int fresh = UncachedRuneByteSuffix(lo, hi, foldcase, next);
+  rune_cache_[cache_key] = fresh;
+  return fresh;
+}
+
+bool Compiler::IsCachedRuneByteSuffix(int id) {
+  // Rebuild the cache key from the instruction's own fields and look it up.
+  // The lookup is by key, so a hit shows that an entry with these fields exists.
+  const uint64_t probe = MakeRuneCacheKey(
+      inst_[id].lo_, inst_[id].hi_,
+      inst_[id].foldcase() != 0,
+      inst_[id].out());
+  return rune_cache_.find(probe) != rune_cache_.end();
+}
+
+// Adds the suffix that starts at instruction id as one more alternative of
+// the range being accumulated in rune_range_. Does nothing once failed_ is
+// set. On allocation failure rune_range_.begin is reset to 0.
+void Compiler::AddSuffix(int id) {
+  if (failed_) return;
+
+  auto& head = rune_range_.begin;
+  if (head == 0) {
+    head = id;  // first suffix: it is the whole range so far
+  } else if (encoding_ == kEncodingUTF8) {
+    // Build a trie in order to reduce fanout.
+    head = AddSuffixRecursive(head, id);
+  } else {
+    // Any other encoding: put an Alt in front of the range and the suffix.
+    const int branch = AllocInst(1);
+    if (branch < 0) {
+      head = 0;
+    } else {
+      inst_[branch].InitAlt(head, id);
+      head = branch;
+    }
+  }
+}
+
+int Compiler::AddSuffixRecursive(int root, int id) {  // Merges suffix id into the trie at root; returns the trie's root, or 0 on failure.
+ DCHECK(inst_[root].opcode() == kInstAlt ||
+ inst_[root].opcode() == kInstByteRange);
+
+ Frag f = FindByteRange(root, id);  // search under root for a byte range equal to id's
+ if (IsNoMatch(f)) {  // nothing to share: id becomes a new alternative beside root
+ int alt = AllocInst(1);
+ if (alt < 0)
+ return 0;
+ inst_[alt].InitAlt(root, id);
+ return alt;
+ }
+
+ int br;  // the existing byte range instruction that equals id's head
+ if (f.end.head == 0)  // null patch list: the match is root itself
+ br = root;
+ else if (f.end.head&1)  // low bit set: matched through out1 of the Alt at f.begin
+ br = inst_[f.begin].out1();
+ else  // low bit clear: matched through out of the Alt at f.begin
+ br = inst_[f.begin].out();
+
+ if (IsCachedRuneByteSuffix(br)) {
+ // We can't fiddle with cached suffixes, so make a clone of the head.
+ int byterange = AllocInst(1);
+ if (byterange < 0)
+ return 0;
+ inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
+ inst_[br].foldcase(), inst_[br].out());
+
+ // Ensure that the parent points to the clone, not to the original.
+ // Note that this could leave the head unreachable except via the cache.
+ br = byterange;
+ if (f.end.head == 0)
+ root = br;
+ else if (f.end.head&1)
+ inst_[f.begin].out1_ = br;
+ else
+ inst_[f.begin].set_out(br);
+ }
+
+ int out = inst_[id].out();  // remainder of the new suffix; read before id may be freed below
+ if (!IsCachedRuneByteSuffix(id)) {
+ // The head should be the instruction most recently allocated, so free it
+ // instead of leaving it unreachable.
+ DCHECK_EQ(id, ninst_-1);
+ inst_[id].out_opcode_ = 0;
+ inst_[id].out1_ = 0;
+ ninst_--;
+ }
+
+ out = AddSuffixRecursive(inst_[br].out(), out);  // merge the remainder one level further down
+ if (out == 0)  // the recursive call failed; propagate
+ return 0;
+
+ inst_[br].set_out(out);
+ return root;
+}
+
+bool Compiler::ByteRangeEqual(int id1, int id2) {  // compares lo, hi and foldcase only
+  auto& first = inst_[id1];
+  auto& second = inst_[id2];
+  return first.lo() == second.lo() && first.hi() == second.hi() && first.foldcase() == second.foldcase();
+}
+
+Frag Compiler::FindByteRange(int root, int id) {  // Looks under root for a byte range equal to id's; NoMatch() if none is found.
+ if (inst_[root].opcode() == kInstByteRange) {
+ if (ByteRangeEqual(root, id))
+ return Frag(root, kNullPatchList, false);  // root itself is the match
+ else
+ return NoMatch();
+ }
+
+ while (inst_[root].opcode() == kInstAlt) {
+ int out1 = inst_[root].out1();
+ if (ByteRangeEqual(out1, id))
+ return Frag(root, PatchList::Mk((root << 1) | 1), false);  // found at out1 of root
+
+ // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
+ // what we're looking for, then we can stop immediately. Unfortunately, we
+ // can't short-circuit the search in reverse mode.
+ if (!reversed_)
+ return NoMatch();
+
+ int out = inst_[root].out();
+ if (inst_[out].opcode() == kInstAlt)
+ root = out;  // keep walking down the chain of Alts
+ else if (ByteRangeEqual(out, id))
+ return Frag(root, PatchList::Mk(root << 1), false);  // found at out of root
+ else
+ return NoMatch();
+ }
+
+ LOG(DFATAL) << "should never happen";  // root was neither a ByteRange nor an Alt
+ return NoMatch();
+}
+
+Frag Compiler::EndRange() {  // Returns a copy of the fragment accumulated in rune_range_.
+ return rune_range_;
+}
+
+// Converts rune range lo-hi into a fragment that recognizes
+// the bytes that would make up those runes in the current
+// encoding (Latin 1 or UTF-8).
+// This lets the machine work byte-by-byte even when
+// using multibyte encodings.
+
+void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
+  // Pick the routine for the text encoding. Latin-1 has a dedicated one;
+  // UTF-8 and any other encoding value go through the UTF-8 routine.
+  const bool latin1 = (encoding_ == kEncodingLatin1);
+  if (latin1) {
+    AddRuneRangeLatin1(lo, hi, foldcase);
+    return;
+  }
+
+  AddRuneRangeUTF8(lo, hi, foldcase);
+}
+
+void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
+  // Latin-1 is easy: runes *are* bytes.
+  const bool nothing_to_add = lo > hi || lo > 0xFF;  // empty, or wholly above FF
+  if (nothing_to_add) return;
+  const Rune top = (hi > 0xFF) ? 0xFF : hi;  // clip the upper bound to FF
+  const uint8_t first = static_cast<uint8_t>(lo);
+  const uint8_t last = static_cast<uint8_t>(top);
+  AddSuffix(UncachedRuneByteSuffix(first, last, foldcase, 0));
+}
+
+void Compiler::Add_80_10ffff() {  // Adds the simplified byte sequences for runes 80-10FFFF to the current range.
+ // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
+ // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
+ // permitting overlong encodings in E0 and F0 sequences and code points
+ // over 10FFFF in F4 sequences, the size of the bytecode and the number
+ // of equivalence classes are reduced significantly.
+ int id;
+ if (reversed_) {
+ // Prefix factoring matters, but we don't have to handle it here
+ // because the rune range trie logic takes care of that already.
+ id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);  // 2-byte: leading byte is built first...
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);  // ...and the continuation byte leads into it
+ AddSuffix(id);
+
+ id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);  // 3-byte: leading byte, then two continuations
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ AddSuffix(id);
+
+ id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);  // 4-byte: leading byte, then three continuations
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ AddSuffix(id);
+ } else {
+ // Suffix factoring matters - and we do have to handle it here.
+ int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);  // last continuation byte, shared by all three lengths
+ id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
+ AddSuffix(id);
+
+ int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);  // continuation byte that leads into cont1
+ id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
+ AddSuffix(id);
+
+ int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);  // continuation byte that leads into cont2
+ id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
+ AddSuffix(id);
+ }
+}
+
+void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {  // Adds the UTF-8 byte sequences for runes lo-hi to the current range; splits recursively.
+ if (lo > hi)
+ return;
+
+ // Pick off 80-10FFFF as a common special case.
+ if (lo == 0x80 && hi == 0x10ffff) {
+ Add_80_10ffff();
+ return;
+ }
+
+ // Split range into same-length sized ranges.
+ for (int i = 1; i < UTFmax; i++) {
+ Rune max = MaxRune(i);  // largest rune encodable in i bytes
+ if (lo <= max && max < hi) {
+ AddRuneRangeUTF8(lo, max, foldcase);
+ AddRuneRangeUTF8(max+1, hi, foldcase);
+ return;
+ }
+ }
+
+ // ASCII range is always a special case.
+ if (hi < Runeself) {
+ AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
+ static_cast<uint8_t>(hi), foldcase, 0));
+ return;
+ }
+
+ // Split range into sections that agree on leading bytes.
+ for (int i = 1; i < UTFmax; i++) {
+ uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
+ if ((lo & ~m) != (hi & ~m)) {
+ if ((lo & m) != 0) {  // lo is not at the start of its section: peel off lo..lo|m
+ AddRuneRangeUTF8(lo, lo|m, foldcase);
+ AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
+ return;
+ }
+ if ((hi & m) != m) {  // hi is not at the end of its section: peel off hi&~m..hi
+ AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
+ AddRuneRangeUTF8(hi&~m, hi, foldcase);
+ return;
+ }
+ }
+ }
+
+ // Finally. Generate byte matching equivalent for lo-hi.
+ uint8_t ulo[UTFmax], uhi[UTFmax];
+ int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
+ int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
+ (void)m; // USED(m)
+ DCHECK_EQ(n, m);
+
+ // The logic below encodes this thinking:
+ //
+ // 1. When we have built the whole suffix, we know that it cannot
+ // possibly be a suffix of anything longer: in forward mode, nothing
+ // else can occur before the leading byte; in reverse mode, nothing
+ // else can occur after the last continuation byte or else the leading
+ // byte would have to change. Thus, there is no benefit to caching
+ // the first byte of the suffix whereas there is a cost involved in
+ // cloning it if it begins a common prefix, which is fairly likely.
+ //
+ // 2. Conversely, the last byte of the suffix cannot possibly be a
+ // prefix of anything because next == 0, so we will never want to
+ // clone it, but it is fairly likely to be a common suffix. Perhaps
+ // more so in reverse mode than in forward mode because the former is
+ // "converging" towards lower entropy, but caching is still worthwhile
+ // for the latter in cases such as 80-BF.
+ //
+ // 3. Handling the bytes between the first and the last is less
+ // straightforward and, again, the approach depends on whether we are
+ // "converging" towards lower entropy: in forward mode, a single byte
+ // is unlikely to be part of a common suffix whereas a byte range
+ // is more likely so; in reverse mode, a byte range is unlikely to
+ // be part of a common suffix whereas a single byte is more likely
+ // so. The same benefit versus cost argument applies here.
+ int id = 0;  // successor of the next byte to emit; 0 until the first byte is built
+ if (reversed_) {
+ for (int i = 0; i < n; i++) {
+ // In reverse UTF-8 mode: cache the leading byte; don't cache the last
+ // continuation byte; cache anything else iff it's a single byte (XX-XX).
+ if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
+ id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+ else
+ id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+ }
+ } else {
+ for (int i = n-1; i >= 0; i--) {
+ // In forward UTF-8 mode: don't cache the leading byte; cache the last
+ // continuation byte; cache anything else iff it's a byte range (XX-YY).
+ if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
+ id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+ else
+ id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+ }
+ }
+ AddSuffix(id);
+}
+
+// Should not be called. If it is, marks the compilation as failed and returns NoMatch().
+Frag Compiler::Copy(Frag arg) {
+ // We're using WalkExponential; there should be no copying.
+ failed_ = true;
+ LOG(DFATAL) << "Compiler::Copy called!";
+ return NoMatch();
+}
+
+// Visits a node quickly; called once WalkExponential has
+// decided to cut this walk short. Marks the compilation as failed.
+Frag Compiler::ShortVisit(Regexp* re, Frag) {
+ failed_ = true;  // Compile() and CompileSet() return NULL once this is set
+ return NoMatch();
+}
+
+// Called before traversing a node's children during the walk. Only sets *stop; never clears it.
+Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
+ // Cut off walk if we've already failed.
+ if (failed_)
+ *stop = true;
+
+ return Frag(); // not used by caller
+}
+
+// Builds the fragment that matches rune r in the current encoding.
+Frag Compiler::Literal(Rune r, bool foldcase) {
+  if (encoding_ == kEncodingLatin1)
+    return ByteRange(r, r, foldcase);
+  if (encoding_ != kEncodingUTF8)
+    return Frag();  // unrecognized encoding: default-constructed Frag
+
+  // UTF-8 from here on.
+  if (r < Runeself)  // Make common case fast.
+    return ByteRange(r, r, foldcase);
+
+  // Encode r and chain one exact-byte range per encoded byte; the
+  // foldcase flag is not applied to these bytes.
+  uint8_t encoded[UTFmax];
+  const int nbytes = runetochar(reinterpret_cast<char*>(encoded), &r);
+  Frag result = ByteRange(encoded[0], encoded[0], false);
+  for (int pos = 1; pos < nbytes; ++pos)
+    result = Cat(result, ByteRange(encoded[pos], encoded[pos], false));
+  return result;
+}
+
+// Called after traversing the node's children during the walk.
+// Given their frags (child_frags[0..nchild_frags)), build and return the frag for this re.
+Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
+ int nchild_frags) {
+ // If a child failed, don't bother going forward, especially
+ // since the child_frags might contain Frags with NULLs in them.
+ if (failed_)
+ return NoMatch();
+
+ // Given the child fragments, return the fragment for this node.
+ switch (re->op()) {
+ case kRegexpRepeat:
+ // Should not see; code at bottom of function will print error
+ break;
+
+ case kRegexpNoMatch:
+ return NoMatch();
+
+ case kRegexpEmptyMatch:
+ return Nop();
+
+ case kRegexpHaveMatch: {
+ Frag f = Match(re->match_id());
+ if (anchor_ == RE2::ANCHOR_BOTH) {
+ // Append \z or else the subexpression will effectively be unanchored.
+ // Complemented by the UNANCHORED case in CompileSet().
+ f = Cat(EmptyWidth(kEmptyEndText), f);
+ }
+ return f;
+ }
+
+ case kRegexpConcat: {
+ Frag f = child_frags[0];
+ for (int i = 1; i < nchild_frags; i++)
+ f = Cat(f, child_frags[i]);
+ return f;
+ }
+
+ case kRegexpAlternate: {
+ Frag f = child_frags[0];
+ for (int i = 1; i < nchild_frags; i++)
+ f = Alt(f, child_frags[i]);
+ return f;
+ }
+
+ case kRegexpStar:
+ return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
+
+ case kRegexpPlus:
+ return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
+
+ case kRegexpQuest:
+ return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
+
+ case kRegexpLiteral:
+ return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
+
+ case kRegexpLiteralString: {
+ // Concatenation of literals.
+ if (re->nrunes() == 0)
+ return Nop();
+ Frag f;
+ for (int i = 0; i < re->nrunes(); i++) {
+ Frag f1 = Literal(re->runes()[i],
+ (re->parse_flags()&Regexp::FoldCase) != 0);
+ if (i == 0)
+ f = f1;
+ else
+ f = Cat(f, f1);
+ }
+ return f;
+ }
+
+ case kRegexpAnyChar:  // compiled as the rune range 0-Runemax in the current encoding
+ BeginRange();
+ AddRuneRange(0, Runemax, false);
+ return EndRange();
+
+ case kRegexpAnyByte:
+ return ByteRange(0x00, 0xFF, false);
+
+ case kRegexpCharClass: {
+ CharClass* cc = re->cc();
+ if (cc->empty()) {
+ // This can't happen.
+ failed_ = true;
+ LOG(DFATAL) << "No ranges in char class";
+ return NoMatch();
+ }
+
+ // ASCII case-folding optimization: if the char class
+ // behaves the same on A-Z as it does on a-z,
+ // discard any ranges wholly contained in A-Z
+ // and mark the other ranges as foldascii.
+ // This reduces the size of a program for
+ // (?i)abc from 3 insts per letter to 1 per letter.
+ bool foldascii = cc->FoldsASCII();
+
+ // Character class is just a big OR of the different
+ // character ranges in the class.
+ BeginRange();
+ for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
+ // ASCII case-folding optimization (see above).
+ if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
+ continue;
+
+ // If this range contains all of A-Za-z or none of it,
+ // the fold flag is unnecessary; don't bother.
+ bool fold = foldascii;
+ if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
+ ('Z' < i->lo && i->hi < 'a'))
+ fold = false;
+
+ AddRuneRange(i->lo, i->hi, fold);
+ }
+ return EndRange();
+ }
+
+ case kRegexpCapture:
+ // If this is a non-capturing parenthesis -- (?:foo) --
+ // just use the inner expression.
+ if (re->cap() < 0)
+ return child_frags[0];
+ return Capture(child_frags[0], re->cap());
+
+ case kRegexpBeginLine:  // the four line/text anchors are swapped when reversed_
+ return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
+
+ case kRegexpEndLine:
+ return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
+
+ case kRegexpBeginText:
+ return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
+
+ case kRegexpEndText:
+ return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
+
+ case kRegexpWordBoundary:
+ return EmptyWidth(kEmptyWordBoundary);
+
+ case kRegexpNoWordBoundary:
+ return EmptyWidth(kEmptyNonWordBoundary);
+ }
+ failed_ = true;  // reached via the kRegexpRepeat break or an op with no case above
+ LOG(DFATAL) << "Missing case in Compiler: " << re->op();
+ return NoMatch();
+}
+
+// Is this regexp required to start at the beginning of the text?
+// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
+// but handles (\A(a|b)). Could use the Walker to write a more exact one.
+static bool IsAnchorStart(Regexp** pre, int depth) {  // On true, *pre is replaced by a rebuilt regexp without the leading anchor.
+ Regexp* re = *pre;
+ Regexp* sub;
+ // The depth limit makes sure that we don't overflow
+ // the stack on a deeply nested regexp. As the comment
+ // above says, IsAnchorStart is conservative, so returning
+ // a false negative is okay. The exact limit is somewhat arbitrary.
+ if (re == NULL || depth >= 4)
+ return false;
+ switch (re->op()) {
+ default:
+ break;
+ case kRegexpConcat:
+ if (re->nsub() > 0) {
+ sub = re->sub()[0]->Incref();  // only the first element can carry the anchor
+ if (IsAnchorStart(&sub, depth+1)) {
+ PODArray<Regexp*> subcopy(re->nsub());
+ subcopy[0] = sub; // already have reference
+ for (int i = 1; i < re->nsub(); i++)
+ subcopy[i] = re->sub()[i]->Incref();
+ *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
+ re->Decref();  // drop the reference to the regexp that *pre used to hold
+ return true;
+ }
+ sub->Decref();  // no anchor found: release the reference taken above
+ }
+ break;
+ case kRegexpCapture:
+ sub = re->sub()[0]->Incref();
+ if (IsAnchorStart(&sub, depth+1)) {
+ *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
+ re->Decref();
+ return true;
+ }
+ sub->Decref();
+ break;
+ case kRegexpBeginText:
+ *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());  // the anchor becomes an empty literal string
+ re->Decref();
+ return true;
+ }
+ return false;
+}
+
+// Is this regexp required to end at the end of the text?
+// Only approximate; can return false for complicated regexps like (a\z|b\z),
+// but handles ((a|b)\z). Could use the Walker to write a more exact one.
+static bool IsAnchorEnd(Regexp** pre, int depth) {  // On true, *pre is replaced by a rebuilt regexp without the trailing anchor.
+ Regexp* re = *pre;
+ Regexp* sub;
+ // The depth limit makes sure that we don't overflow
+ // the stack on a deeply nested regexp. As the comment
+ // above says, IsAnchorEnd is conservative, so returning
+ // a false negative is okay. The exact limit is somewhat arbitrary.
+ if (re == NULL || depth >= 4)
+ return false;
+ switch (re->op()) {
+ default:
+ break;
+ case kRegexpConcat:
+ if (re->nsub() > 0) {
+ sub = re->sub()[re->nsub() - 1]->Incref();  // only the last element can carry the anchor
+ if (IsAnchorEnd(&sub, depth+1)) {
+ PODArray<Regexp*> subcopy(re->nsub());
+ subcopy[re->nsub() - 1] = sub; // already have reference
+ for (int i = 0; i < re->nsub() - 1; i++)
+ subcopy[i] = re->sub()[i]->Incref();
+ *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
+ re->Decref();  // drop the reference to the regexp that *pre used to hold
+ return true;
+ }
+ sub->Decref();  // no anchor found: release the reference taken above
+ }
+ break;
+ case kRegexpCapture:
+ sub = re->sub()[0]->Incref();
+ if (IsAnchorEnd(&sub, depth+1)) {
+ *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
+ re->Decref();
+ return true;
+ }
+ sub->Decref();
+ break;
+ case kRegexpEndText:
+ *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());  // the anchor becomes an empty literal string
+ re->Decref();
+ return true;
+ }
+ return false;
+}
+
+void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
+ RE2::Anchor anchor) {  // Records encoding, memory budget, instruction limit and anchor.
+ if (flags & Regexp::Latin1)  // encoding_ is left unchanged when the flag is absent
+ encoding_ = kEncodingLatin1;
+ max_mem_ = max_mem;
+ if (max_mem <= 0) {  // non-positive budget: use a fixed instruction limit
+ max_ninst_ = 100000; // more than enough
+ } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
+ // No room for anything.
+ max_ninst_ = 0;
+ } else {
+ int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);  // instructions that fit after the Prog itself
+ // Limit instruction count so that inst->id() fits nicely in an int.
+ // SparseArray also assumes that the indices (inst->id()) are ints.
+ // The call to WalkExponential uses 2*max_ninst_ below,
+ // and other places in the code use 2 or 3 * prog->size().
+ // Limiting to 2^24 should avoid overflow in those places.
+ // (The point of allowing more than 32 bits of memory is to
+ // have plenty of room for the DFA states, not to use it up
+ // on the program.)
+ if (m >= 1<<24)
+ m = 1<<24;
+ // Inst imposes its own limit (currently bigger than 2^24 but be safe).
+ if (m > Prog::Inst::kMaxInst)
+ m = Prog::Inst::kMaxInst;
+ max_ninst_ = static_cast<int>(m);
+ }
+ anchor_ = anchor;
+}
+
+// Compiles re, returning program, or NULL if simplification or compilation fails.
+// Caller is responsible for deleting prog_.
+// If reversed is true, compiles a program that expects
+// to run over the input string backward (reverses all concatenations).
+// The reversed flag is also recorded in the returned program.
+Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
+ Compiler c;
+ c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
+ c.reversed_ = reversed;
+
+ // Simplify to remove things like counted repetitions
+ // and character classes like \d.
+ Regexp* sre = re->Simplify();
+ if (sre == NULL)
+ return NULL;
+
+ // Record whether prog is anchored, removing the anchors.
+ // (They get in the way of other optimizations.)
+ bool is_anchor_start = IsAnchorStart(&sre, 0);  // may replace sre with a rebuilt regexp
+ bool is_anchor_end = IsAnchorEnd(&sre, 0);  // likewise
+
+ // Generate fragment for entire regexp.
+ Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
+ sre->Decref();
+ if (c.failed_)
+ return NULL;
+
+ // Success! Finish by putting Match node at end, and record start.
+ // Turn off c.reversed_ (if it is set) to force the remaining concatenations
+ // to behave normally.
+ c.reversed_ = false;
+ all = c.Cat(all, c.Match(0));
+
+ c.prog_->set_reversed(reversed);
+ if (c.prog_->reversed()) {  // a reversed program swaps the two anchor flags
+ c.prog_->set_anchor_start(is_anchor_end);
+ c.prog_->set_anchor_end(is_anchor_start);
+ } else {
+ c.prog_->set_anchor_start(is_anchor_start);
+ c.prog_->set_anchor_end(is_anchor_end);
+ }
+
+ c.prog_->set_start(all.begin);
+ if (!c.prog_->anchor_start()) {
+ // Also create unanchored version, which starts with a .*? loop.
+ all = c.Cat(c.DotStar(), all);
+ }
+ c.prog_->set_start_unanchored(all.begin);  // same as start() when the program is anchored at the start
+
+ // Hand ownership of prog_ to caller.
+ return c.Finish(re);
+}
+
+Prog* Compiler::Finish(Regexp* re) {  // Finalizes prog_ and hands it to the caller; returns NULL if failed_ is set.
+ if (failed_)
+ return NULL;
+
+ if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
+ // No possible matches; keep Fail instruction only.
+ ninst_ = 1;
+ }
+
+ // Hand off the array to Prog.
+ prog_->inst_ = std::move(inst_);
+ prog_->size_ = ninst_;
+
+ prog_->Optimize();
+ prog_->Flatten();
+ prog_->ComputeByteMap();
+
+ if (!prog_->reversed()) {  // prefix acceleration is configured for forward programs only
+ std::string prefix;
+ bool prefix_foldcase;
+ if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
+ prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
+ }
+
+ // Record remaining memory for DFA.
+ if (max_mem_ <= 0) {  // non-positive budget: give the DFA a fixed 1<<20
+ prog_->set_dfa_mem(1<<20);
+ } else {
+ int64_t m = max_mem_ - sizeof(Prog);
+ m -= prog_->size_*sizeof(Prog::Inst); // account for inst_
+ if (prog_->CanBitState())
+ m -= prog_->size_*sizeof(uint16_t); // account for list_heads_
+ if (m < 0)
+ m = 0;
+ prog_->set_dfa_mem(m);
+ }
+
+ Prog* p = prog_;
+ prog_ = NULL;  // this Compiler no longer refers to the program
+ return p;
+}
+
+// Converts Regexp to Prog: calls Compiler::Compile with reversed == false.
+Prog* Regexp::CompileToProg(int64_t max_mem) {
+ return Compiler::Compile(this, false, max_mem);
+}
+
+Prog* Regexp::CompileToReverseProg(int64_t max_mem) {  // Same as CompileToProg, but with reversed == true.
+ return Compiler::Compile(this, true, max_mem);
+}
+
+Frag Compiler::DotStar() {  // Star over the byte range 00-FF, with the nongreedy argument set to true.
+ return Star(ByteRange(0x00, 0xff, false), true);
+}
+
+// Compiles RE set to Prog. Returns NULL if simplification, compilation or the DFA probe fails.
+Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
+ Compiler c;
+ c.Setup(re->parse_flags(), max_mem, anchor);
+
+ Regexp* sre = re->Simplify();
+ if (sre == NULL)
+ return NULL;
+
+ Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
+ sre->Decref();
+ if (c.failed_)
+ return NULL;
+
+ c.prog_->set_anchor_start(true);  // both anchor flags are always set for a set program
+ c.prog_->set_anchor_end(true);
+
+ if (anchor == RE2::UNANCHORED) {
+ // Prepend .* or else the expression will effectively be anchored.
+ // Complemented by the ANCHOR_BOTH case in PostVisit().
+ all = c.Cat(c.DotStar(), all);
+ }
+ c.prog_->set_start(all.begin);
+ c.prog_->set_start_unanchored(all.begin);  // both start points are the same instruction
+
+ Prog* prog = c.Finish(re);
+ if (prog == NULL)
+ return NULL;
+
+ // Make sure DFA has enough memory to operate,
+ // since we're not going to fall back to the NFA.
+ bool dfa_failed = false;
+ absl::string_view sp = "hello, world";  // probe text for the trial search below
+ prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
+ NULL, &dfa_failed, NULL);
+ if (dfa_failed) {
+ delete prog;
+ return NULL;
+ }
+
+ return prog;
+}
+
+Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {  // Forwards to Compiler::CompileSet.
+ return Compiler::CompileSet(re, anchor, max_mem);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/dfa.cc b/third_party/re2/src/re2/dfa.cc
new file mode 100644
index 000000000..e35fcb281
--- /dev/null
+++ b/third_party/re2/src/re2/dfa.cc
@@ -0,0 +1,2132 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A DFA (deterministic finite automaton)-based regular expression search.
+//
+// The DFA search has two main parts: the construction of the automaton,
+// which is represented by a graph of State structures, and the execution
+// of the automaton over a given input string.
+//
+// The basic idea is that the State graph is constructed so that the
+// execution can simply start with a state s, and then for each byte c in
+// the input string, execute "s = s->next[c]", checking at each point whether
+// the current s represents a matching state.
+//
+// The simple explanation just given does convey the essence of this code,
+// but it omits the details of how the State graph gets constructed as well
+// as some performance-driven optimizations to the execution of the automaton.
+// All these details are explained in the comments for the code following
+// the definition of class DFA.
+//
+// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <algorithm>
+#include <atomic>
+#include <deque>
+#include <new>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/base/call_once.h"
+#include "absl/base/macros.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
+#include "util/logging.h"
+#include "util/strutil.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/sparse_set.h"
+
+// Silence "zero-sized array in struct/union" warning for DFA::State::next_.
+#ifdef _MSC_VER
+#pragma warning(disable: 4200)
+#endif
+
+namespace re2 {
+
+// Controls whether the DFA should bail out early if the NFA would be faster.
+static bool dfa_should_bail_when_slow = true;  // file-local; true unless the testing hook below changes it
+
+void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {  // Test hook: overwrites the flag above.
+ dfa_should_bail_when_slow = b;
+}
+
+// Changing this to true compiles in prints that trace execution of the DFA.
+// Generates a lot of output -- only useful for debugging.
+static const bool ExtraDebug = false;  // a constant, so it can only be changed by editing this line
+
+// A DFA implementation of a regular expression program.
+// Since this is entirely a forward declaration mandated by C++,
+// some of the comments here are better understood after reading
+// the comments in the sections that follow the DFA definition.
+class DFA {
+ public:
+ DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem);
+ ~DFA();
+ bool ok() const { return !init_failed_; }
+ Prog::MatchKind kind() { return kind_; }
+
+ // Searches for the regular expression in text, which is considered
+ // as a subsection of context for the purposes of interpreting flags
+ // like ^ and $ and \A and \z.
+ // Returns whether a match was found.
+ // If a match is found, sets *ep to the end point of the best match in text.
+ // If "anchored", the match must begin at the start of text.
+ // If "want_earliest_match", the match that ends first is used, not
+ // necessarily the best one.
+ // If "run_forward" is true, the DFA runs from text.begin() to text.end().
+ // If it is false, the DFA runs from text.end() to text.begin(),
+ // returning the leftmost end of the match instead of the rightmost one.
+ // If the DFA cannot complete the search (for example, if it is out of
+ // memory), it sets *failed and returns false.
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool want_earliest_match, bool run_forward, bool* failed,
+ const char** ep, SparseSet* matches);
+
+ // Builds out all states for the entire DFA.
+ // If cb is not empty, it receives one callback per state built.
+ // Returns the number of states built.
+ // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
+ int BuildAllStates(const Prog::DFAStateCallback& cb);
+
+ // Computes min and max for matching strings. Won't return strings
+ // bigger than maxlen.
+ bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
+
+ // These data structures are logically private, but C++ makes it too
+ // difficult to mark them as such.
+ class RWLocker;
+ class StateSaver;
+ class Workq;
+
+ // A single DFA state. The DFA is represented as a graph of these
+ // States, linked by the next_ pointers. If in state s and reading
+ // byte c, the next state should be s->next_[c].
+ struct State {
+ inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
+
+ template <typename H>
+ friend H AbslHashValue(H h, const State& a) {
+ const absl::Span<const int> ainst(a.inst_, a.ninst_);
+ return H::combine(std::move(h), a.flag_, ainst);
+ }
+
+ friend bool operator==(const State& a, const State& b) {
+ const absl::Span<const int> ainst(a.inst_, a.ninst_);
+ const absl::Span<const int> binst(b.inst_, b.ninst_);
+ return &a == &b || (a.flag_ == b.flag_ && ainst == binst);
+ }
+
+ int* inst_; // Instruction pointers in the state.
+ int ninst_; // # of inst_ pointers.
+ uint32_t flag_; // Empty string bitfield flags in effect on the way
+ // into this state, along with kFlagMatch if this
+ // is a matching state.
+
+ std::atomic<State*> next_[]; // Outgoing arrows from State,
+ // one per input byte class
+ };
+
+ enum {
+ kByteEndText = 256, // imaginary byte at end of text
+
+ kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags
+ kFlagMatch = 0x0100, // State.flag_: this is a matching state
+ kFlagLastWord = 0x0200, // State.flag_: last byte was a word char
+ kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
+ };
+
+ struct StateHash {
+ size_t operator()(const State* a) const {
+ DCHECK(a != NULL);
+ return absl::Hash<State>()(*a);
+ }
+ };
+
+ struct StateEqual {
+ bool operator()(const State* a, const State* b) const {
+ DCHECK(a != NULL);
+ DCHECK(b != NULL);
+ return *a == *b;
+ }
+ };
+
+ typedef absl::flat_hash_set<State*, StateHash, StateEqual> StateSet;
+
+ private:
+ // Make it easier to swap in a scalable reader-writer mutex.
+ using CacheMutex = absl::Mutex;
+
+ enum {
+ // Indices into start_ for unanchored searches.
+ // Add kStartAnchored for anchored searches.
+ kStartBeginText = 0, // text at beginning of context
+ kStartBeginLine = 2, // text at beginning of line
+ kStartAfterWordChar = 4, // text follows a word character
+ kStartAfterNonWordChar = 6, // text follows non-word character
+ kMaxStart = 8,
+
+ kStartAnchored = 1,
+ };
+
+ // Resets the DFA State cache, flushing all saved State* information.
+ // Releases and reacquires cache_mutex_ via cache_lock, so any
+ // State* existing before the call are not valid after the call.
+ // Use a StateSaver to preserve important states across the call.
+ // cache_mutex_.r <= L < mutex_
+ // After: cache_mutex_.w <= L < mutex_
+ void ResetCache(RWLocker* cache_lock);
+
+ // Looks up and returns the State corresponding to a Workq.
+ // L >= mutex_
+ State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
+
+ // Looks up and returns a State matching the inst, ninst, and flag.
+ // L >= mutex_
+ State* CachedState(int* inst, int ninst, uint32_t flag);
+
+ // Clear the cache entirely.
+ // Must hold cache_mutex_.w or be in destructor.
+ void ClearCache();
+
+ // Converts a State into a Workq: the opposite of WorkqToCachedState.
+ // L >= mutex_
+ void StateToWorkq(State* s, Workq* q);
+
+ // Runs a State on a given byte, returning the next state.
+ State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_
+ State* RunStateOnByte(State*, int); // L >= mutex_
+
+ // Runs a Workq on a given byte followed by a set of empty-string flags,
+ // producing a new Workq in nq. If a match instruction is encountered,
+ // sets *ismatch to true.
+ // L >= mutex_
+ void RunWorkqOnByte(Workq* q, Workq* nq,
+ int c, uint32_t flag, bool* ismatch);
+
+ // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
+ // L >= mutex_
+ void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag);
+
+ // Adds the instruction id to the Workq, following empty arrows
+ // according to flag.
+ // L >= mutex_
+ void AddToQueue(Workq* q, int id, uint32_t flag);
+
+ // For debugging, returns a text representation of State.
+ static std::string DumpState(State* state);
+
+ // For debugging, returns a text representation of a Workq.
+ static std::string DumpWorkq(Workq* q);
+
+ // Inputs and outputs of a single search, bundled so that they can be
+ // passed around as one pointer. Not copyable.
+ struct SearchParams {
+   // The caller supplies the text, its surrounding context and the cache
+   // lock; every other field starts out as false or NULL.
+   SearchParams(absl::string_view text, absl::string_view context,
+                RWLocker* cache_lock)
+       : text(text), context(context), cache_lock(cache_lock) {}
+
+   absl::string_view text;
+   absl::string_view context;
+   bool anchored = false;
+   bool can_prefix_accel = false;
+   bool want_earliest_match = false;
+   bool run_forward = false;
+   State* start = NULL;
+   RWLocker* cache_lock;
+   bool failed = false;    // "out" parameter: whether search gave up
+   const char* ep = NULL;  // "out" parameter: end pointer for match
+   SparseSet* matches = NULL;
+
+  private:
+   SearchParams(const SearchParams&) = delete;
+   SearchParams& operator=(const SearchParams&) = delete;
+ };
+
+ // AnalyzeSearch examines the parameters to Search before each search in
+ // order to determine the state in which to start. A StartInfo holds one
+ // such start state; it is NULL until it has been filled in.
+ struct StartInfo {
+   std::atomic<State*> start;
+
+   StartInfo() : start(NULL) {}
+ };
+
+ // Fills in params->start and params->can_prefix_accel using
+ // the other search parameters. Returns true on success,
+ // false on failure.
+ // cache_mutex_.r <= L < mutex_
+ bool AnalyzeSearch(SearchParams* params);
+ bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
+ uint32_t flags);
+
+ // The generic search loop, inlined to create specialized versions.
+ // cache_mutex_.r <= L < mutex_
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
+ template <bool can_prefix_accel,
+ bool want_earliest_match,
+ bool run_forward>
+ inline bool InlinedSearchLoop(SearchParams* params);
+
+ // The specialized versions of InlinedSearchLoop. The three letters
+ // at the end of each name denote the true/false values used for the
+ // three template parameters of InlinedSearchLoop (can_prefix_accel,
+ // want_earliest_match and run_forward).
+ // cache_mutex_.r <= L < mutex_
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
+ bool SearchFFF(SearchParams* params);
+ bool SearchFFT(SearchParams* params);
+ bool SearchFTF(SearchParams* params);
+ bool SearchFTT(SearchParams* params);
+ bool SearchTFF(SearchParams* params);
+ bool SearchTFT(SearchParams* params);
+ bool SearchTTF(SearchParams* params);
+ bool SearchTTT(SearchParams* params);
+
+ // The main search loop: calls an appropriate specialized version of
+ // InlinedSearchLoop.
+ // cache_mutex_.r <= L < mutex_
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
+ bool FastSearchLoop(SearchParams* params);
+
+
+ // Translates c through the program's bytemap. The pseudo-byte kByteEndText
+ // is not in the bytemap; it gets the value bytemap_range() instead.
+ int ByteMap(int c) {
+   return (c == kByteEndText) ? prog_->bytemap_range() : prog_->bytemap()[c];
+ }
+
+ // Constant after initialization.
+ Prog* prog_; // The regular expression program to run.
+ Prog::MatchKind kind_; // The kind of DFA.
+ bool init_failed_; // initialization failed (out of memory)
+
+ absl::Mutex mutex_; // mutex_ >= cache_mutex_.r
+
+ // Scratch areas, protected by mutex_.
+ Workq* q0_; // Two pre-allocated work queues.
+ Workq* q1_;
+ PODArray<int> stack_; // Pre-allocated stack for AddToQueue
+
+ // State* cache. Many threads use and add to the cache simultaneously,
+ // holding cache_mutex_ for reading and mutex_ (above) when adding.
+ // If the cache fills and needs to be discarded, the discarding is done
+ // while holding cache_mutex_ for writing, to avoid interrupting other
+ // readers. Any State* pointers are only valid while cache_mutex_
+ // is held.
+ CacheMutex cache_mutex_;
+ int64_t mem_budget_; // Total memory budget for all States.
+ int64_t state_budget_; // Amount of memory remaining for new States.
+ StateSet state_cache_; // All States computed so far.
+ StartInfo start_[kMaxStart];
+
+ DFA(const DFA&) = delete;
+ DFA& operator=(const DFA&) = delete;
+};
+
+// Views an untyped pointer as a pointer to (const) unsigned bytes.
+static inline const uint8_t* BytePtr(const void* v) {
+  const uint8_t* bytes = static_cast<const uint8_t*>(v);
+  return bytes;
+}
+
+// Work queues
+
+// Marks separate thread groups of different priority
+// in the work queue when in leftmost-longest matching mode.
+#define Mark (-1)
+
+// Separates the match IDs from the instructions in inst_.
+// Used only for "many match" DFA states.
+#define MatchSep (-2)
+
+// The DFA's work queue: a sparse set of program instruction ids.
+// In leftmost-longest mode, marks split the queue into sections that
+// started executing at different locations in the string (earlier
+// locations first).
+class DFA::Workq : public SparseSet {
+ public:
+  // n is the number of normal slots and maxmark the number of mark slots.
+  // Mark ids are handed out starting at n.
+  Workq(int n, int maxmark)
+      : SparseSet(n + maxmark),
+        n_(n),
+        maxmark_(maxmark),
+        nextmark_(n),
+        last_was_mark_(true) {}
+
+  // Ids at or above n_ denote marks rather than instructions.
+  bool is_mark(int i) { return i >= n_; }
+
+  int maxmark() { return maxmark_; }
+
+  // Empties the set and rewinds the mark id counter.
+  // last_was_mark_ is left as it is.
+  void clear() {
+    SparseSet::clear();
+    nextmark_ = n_;
+  }
+
+  // Inserts the next mark id unless last_was_mark_ is set.
+  // NOTE(review): the flag is assigned false (not true) here, so within this
+  // class it is only true between construction and the first insertion;
+  // confirm that this is intended before changing it.
+  void mark() {
+    if (!last_was_mark_) {
+      last_was_mark_ = false;
+      SparseSet::insert_new(nextmark_++);
+    }
+  }
+
+  // Total number of slots: normal slots plus mark slots.
+  int size() {
+    return n_ + maxmark_;
+  }
+
+  // Inserts id only if it is not yet present.
+  void insert(int id) {
+    if (!contains(id))
+      insert_new(id);
+  }
+
+  // Inserts id, which the caller knows is not yet present.
+  void insert_new(int id) {
+    last_was_mark_ = false;
+    SparseSet::insert_new(id);
+  }
+
+ private:
+  int n_;               // size excluding marks
+  int maxmark_;         // maximum number of marks
+  int nextmark_;        // id of next mark
+  bool last_was_mark_;  // last inserted was mark
+
+  Workq(const Workq&) = delete;
+  Workq& operator=(const Workq&) = delete;
+};
+
+// Builds a DFA for prog using the given match kind and a total memory
+// budget of max_mem bytes. If the budget cannot cover the fixed overhead
+// plus room for about 20 states, init_failed_ is set and the work queues
+// are left unallocated (NULL).
+DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
+    : prog_(prog),
+      kind_(kind),
+      init_failed_(false),
+      q0_(NULL),
+      q1_(NULL),
+      mem_budget_(max_mem),
+      // Give state_budget_ a defined value even on the early-return path
+      // below, which is taken before it is assigned.
+      state_budget_(0) {
+  if (ExtraDebug)
+    absl::FPrintF(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored());
+  int nmark = 0;
+  if (kind_ == Prog::kLongestMatch)
+    nmark = prog_->size();
+  // See DFA::AddToQueue() for why this is so.
+  int nstack = prog_->inst_count(kInstCapture) +
+               prog_->inst_count(kInstEmptyWidth) +
+               prog_->inst_count(kInstNop) +
+               nmark + 1;  // + 1 for start inst
+
+  // Account for space needed for DFA, q0, q1, stack.
+  mem_budget_ -= sizeof(DFA);
+  mem_budget_ -= (prog_->size() + nmark) *
+                 (sizeof(int)+sizeof(int)) * 2;  // q0, q1
+  mem_budget_ -= nstack * sizeof(int);  // stack
+  if (mem_budget_ < 0) {
+    init_failed_ = true;
+    return;
+  }
+
+  // Whatever is left after the fixed overhead is available for States;
+  // ResetCache() restores mem_budget_ to this value.
+  state_budget_ = mem_budget_;
+
+  // Make sure there is a reasonable amount of working room left.
+  // At minimum, the search requires room for two states in order
+  // to limp along, restarting frequently. We'll get better performance
+  // if there is room for a larger number of states, say 20.
+  // Note that a state stores list heads only, so we use the program
+  // list count for the upper bound, not the program size.
+  int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
+  int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
+                      (prog_->list_count()+nmark)*sizeof(int);
+  if (state_budget_ < 20*one_state) {
+    init_failed_ = true;
+    return;
+  }
+
+  q0_ = new Workq(prog_->size(), nmark);
+  q1_ = new Workq(prog_->size(), nmark);
+  stack_ = PODArray<int>(nstack);
+}
+
+// Frees every cached State and then the two scratch work queues.
+DFA::~DFA() {
+  ClearCache();
+  delete q1_;
+  delete q0_;
+}
+
+// In the DFA state graph, s->next[c] == NULL means that the
+// state has not yet been computed and needs to be. We need
+// a different special value to signal that s->next[c] is a
+// state that can never lead to a match (and thus the search
+// can be called off). Hence DeadState.
+#define DeadState reinterpret_cast<State*>(1)
+
+// Signals that the rest of the string matches no matter what it is.
+#define FullMatchState reinterpret_cast<State*>(2)
+
+#define SpecialStateMax FullMatchState
+
+// Debugging printouts
+
+// Debugging aid: renders the work queue as its ids in iteration order,
+// with "," between neighbouring ids and "|" wherever a mark occurs.
+std::string DFA::DumpWorkq(Workq* q) {
+  std::string out;
+  bool need_comma = false;
+  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
+    if (q->is_mark(*it)) {
+      out += "|";
+      need_comma = false;
+      continue;
+    }
+    out += absl::StrFormat("%s%d", need_comma ? "," : "", *it);
+    need_comma = true;
+  }
+  return out;
+}
+
+// Debugging aid: renders a State as text. The special values print as
+// "_" (NULL), "X" (DeadState) and "*" (FullMatchState). Any other state
+// prints as its address, its inst_ entries ("|" for Mark, "||" for
+// MatchSep, "," between neighbouring ids) and finally its flag word.
+std::string DFA::DumpState(State* state) {
+  if (state == NULL)
+    return "_";
+  if (state == DeadState)
+    return "X";
+  if (state == FullMatchState)
+    return "*";
+
+  std::string out = absl::StrFormat("(%p)", state);
+  bool need_comma = false;
+  for (int i = 0; i < state->ninst_; i++) {
+    const int id = state->inst_[i];
+    if (id == Mark) {
+      out += "|";
+      need_comma = false;
+      continue;
+    }
+    if (id == MatchSep) {
+      out += "||";
+      need_comma = false;
+      continue;
+    }
+    out += absl::StrFormat("%s%d", need_comma ? "," : "", id);
+    need_comma = true;
+  }
+  out += absl::StrFormat(" flag=%#x", state->flag_);
+  return out;
+}
+
+//////////////////////////////////////////////////////////////////////
+//
+// DFA state graph construction.
+//
+// The DFA state graph is a heavily-linked collection of State* structures.
+// The state_cache_ is a set of all the State structures ever allocated,
+// so that if the same state is reached by two different paths,
+// the same State structure can be used. This reduces allocation
+// requirements and also avoids duplication of effort across the two
+// identical states.
+//
+// A State is defined by an ordered list of instruction ids and a flag word.
+//
+// The choice of an ordered list of instructions differs from a typical
+// textbook DFA implementation, which would use an unordered set.
+// Textbook descriptions, however, only care about whether
+// the DFA matches, not where it matches in the text. To decide where the
+// DFA matches, we need to mimic the behavior of the dominant backtracking
+// implementations like PCRE, which try one possible regular expression
+// execution, then another, then another, stopping when one of them succeeds.
+// The DFA execution tries these many executions in parallel, representing
+// each by an instruction id. These ids are ordered in the State.inst_
+// list in the same order that the executions would happen in a backtracking
+// search: if a match is found during execution of inst_[2], inst_[i] for i>=3
+// can be discarded.
+//
+// Textbooks also typically do not consider context-aware empty string operators
+// like ^ or $. These are handled by the flag word, which specifies the set
+// of empty-string operators that should be matched when executing at the
+// current text position. These flag bits are defined in prog.h.
+// The flag word also contains two DFA-specific bits: kFlagMatch if the state
+// is a matching state (one that reached a kInstMatch in the program)
+// and kFlagLastWord if the last processed byte was a word character, for the
+// implementation of \B and \b.
+//
+// The flag word also contains, shifted up 16 bits, the bits looked for by
+// any kInstEmptyWidth instructions in the state. These provide a useful
+// summary indicating when new flags might be useful.
+//
+// The permanent representation of a State's instruction ids is just an array,
+// but while a state is being analyzed, these instruction ids are represented
+// as a Workq, which is an array that allows iteration in insertion order.
+
+// NOTE(rsc): The choice of State construction determines whether the DFA
+// mimics backtracking implementations (so-called leftmost first matching) or
+// traditional DFA implementations (so-called leftmost longest matching as
+// prescribed by POSIX). This implementation chooses to mimic the
+// backtracking implementations, because we want to replace PCRE. To get
+// POSIX behavior, the states would need to be considered not as a simple
+// ordered list of instruction ids, but as a list of unordered sets of instruction
+// ids. A match by a state in one set would inhibit the running of sets
+// farther down the list but not other instruction ids in the same set. Each
+// set would correspond to matches beginning at a given point in the string.
+// This is implemented by separating different sets with Mark entries.
+
+// Looks in the State cache for a State matching q, flag.
+// If one is found, returns it. If one is not found, allocates one,
+// inserts it in the cache, and returns it.
+// If mq is not null, MatchSep and the match IDs in mq will be appended
+// to the State.
+// May instead return one of the special values FullMatchState or DeadState,
+// or whatever CachedState() returns (which is NULL when out of memory).
+DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
+ //mutex_.AssertHeld();
+
+ // Construct array of instruction ids for the new state.
+ // In some cases, kInstAltMatch may trigger an upgrade to FullMatchState.
+ // Otherwise, "compress" q down to list heads for storage; StateToWorkq()
+ // will "decompress" it for computation by exploring from each list head.
+ //
+ // Historically, only kInstByteRange, kInstEmptyWidth and kInstMatch were
+ // useful to keep, but it turned out that kInstAlt was necessary to keep:
+ //
+ // > [*] kInstAlt would seem useless to record in a state, since
+ // > we've already followed both its arrows and saved all the
+ // > interesting states we can reach from there. The problem
+ // > is that one of the empty-width instructions might lead
+ // > back to the same kInstAlt (if an empty-width operator is starred),
+ // > producing a different evaluation order depending on whether
+ // > we keep the kInstAlt to begin with. Sigh.
+ // > A specific case that this affects is /(^|a)+/ matching "a".
+ // > If we don't save the kInstAlt, we will match the whole "a" (0,1)
+ // > but in fact the correct leftmost-first match is the leading "" (0,0).
+ //
+ // Recall that flattening transformed the Prog from "tree" form to "list"
+ // form: in the former, kInstAlt existed explicitly... and abundantly; in
+ // the latter, it's implied between the instructions that compose a list.
+ // Thus, because the information wasn't lost, the bug doesn't remanifest.
+ PODArray<int> inst(q->size()); // q->size() counts normal plus mark slots
+ int n = 0; // number of entries of inst filled in so far
+ uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions
+ bool sawmatch = false; // whether queue contains guaranteed kInstMatch
+ bool sawmark = false; // whether queue contains a Mark
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q), flag);
+ for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
+ int id = *it;
+ // Once a guaranteed match has been seen, stop immediately in first-match
+ // mode; in the other modes, stop at the next mark.
+ if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
+ break;
+ if (q->is_mark(id)) {
+ // Record a Mark only after an instruction id, which drops leading
+ // marks and collapses runs of marks into one.
+ if (n > 0 && inst[n-1] != Mark) {
+ sawmark = true;
+ inst[n++] = Mark;
+ }
+ continue;
+ }
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ case kInstAltMatch:
+ // This state will continue to a match no matter what
+ // the rest of the input is. If it is the highest priority match
+ // being considered, return the special FullMatchState
+ // to indicate that it's all matches from here out.
+ if (kind_ != Prog::kManyMatch &&
+ (kind_ != Prog::kFirstMatch ||
+ (it == q->begin() && ip->greedy(prog_))) &&
+ (kind_ != Prog::kLongestMatch || !sawmark) &&
+ (flag & kFlagMatch)) {
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " -> FullMatchState\n");
+ return FullMatchState;
+ }
+ ABSL_FALLTHROUGH_INTENDED;
+ default:
+ // Record iff id is the head of its list, which must
+ // be the case if id-1 is the last of *its* list. :)
+ if (prog_->inst(id-1)->last())
+ inst[n++] = *it;
+ if (ip->opcode() == kInstEmptyWidth)
+ needflags |= ip->empty();
+ if (ip->opcode() == kInstMatch && !prog_->anchor_end())
+ sawmatch = true;
+ break;
+ }
+ }
+ DCHECK_LE(n, q->size());
+ // A Mark at the very end separates nothing, so drop it.
+ if (n > 0 && inst[n-1] == Mark)
+ n--;
+
+ // If there are no empty-width instructions waiting to execute,
+ // then the extra flag bits will not be used, so there is no
+ // point in saving them. (Discarding them reduces the number
+ // of distinct states.)
+ if (needflags == 0)
+ flag &= kFlagMatch;
+
+ // NOTE(rsc): The code above cannot do flag &= needflags,
+ // because if the right flags were present to pass the current
+ // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
+ // might be reached that in turn need different flags.
+ // The only sure thing is that if there are no kInstEmptyWidth
+ // instructions at all, no flags will be needed.
+ // We could do the extra work to figure out the full set of
+ // possibly needed flags by exploring past the kInstEmptyWidth
+ // instructions, but the check above -- are any flags needed
+ // at all? -- handles the most common case. More fine-grained
+ // analysis can only be justified by measurements showing that
+ // too many redundant states are being allocated.
+
+ // If there are no Insts in the list, it's a dead state,
+ // which is useful to signal with a special pointer so that
+ // the execution loop can stop early. This is only okay
+ // if the state is *not* a matching state.
+ if (n == 0 && flag == 0) {
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " -> DeadState\n");
+ return DeadState;
+ }
+
+ // If we're in longest match mode, the state is a sequence of
+ // unordered state sets separated by Marks. Sort each set
+ // to canonicalize, to reduce the number of distinct sets stored.
+ if (kind_ == Prog::kLongestMatch) {
+ int* ip = inst.data();
+ int* ep = ip + n;
+ while (ip < ep) {
+ // Find the end of the current set: the next Mark or the end of inst.
+ int* markp = ip;
+ while (markp < ep && *markp != Mark)
+ markp++;
+ std::sort(ip, markp);
+ // Step over the Mark itself, if there was one.
+ if (markp < ep)
+ markp++;
+ ip = markp;
+ }
+ }
+
+ // If we're in many match mode, canonicalize for similar reasons:
+ // we have an unordered set of states (i.e. we don't have Marks)
+ // and sorting will reduce the number of distinct sets stored.
+ if (kind_ == Prog::kManyMatch) {
+ int* ip = inst.data();
+ int* ep = ip + n;
+ std::sort(ip, ep);
+ }
+
+ // Append MatchSep and the match IDs in mq if necessary.
+ if (mq != NULL) {
+ inst[n++] = MatchSep;
+ for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) {
+ int id = *i;
+ Prog::Inst* ip = prog_->inst(id);
+ if (ip->opcode() == kInstMatch)
+ inst[n++] = ip->match_id();
+ }
+ }
+
+ // Save the needed empty-width flags in the top bits for use later.
+ flag |= needflags << kFlagNeedShift;
+
+ State* state = CachedState(inst.data(), n, flag);
+ return state;
+}
+
+// Looks in the State cache for a State matching inst, ninst, flag.
+// If one is found, returns it. If one is not found, allocates one,
+// inserts it in the cache, and returns it.
+// Returns NULL, after setting mem_budget_ to -1, if the remaining memory
+// budget cannot cover a new State.
+DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
+ //mutex_.AssertHeld();
+
+ // Look in the cache for a pre-existing state.
+ // We have to initialise the struct like this because otherwise
+ // MSVC will complain about the flexible array member. :(
+ State state;
+ state.inst_ = inst;
+ state.ninst_ = ninst;
+ state.flag_ = flag;
+ StateSet::iterator it = state_cache_.find(&state);
+ if (it != state_cache_.end()) {
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " -cached-> %s\n", DumpState(*it));
+ return *it;
+ }
+
+ // Must have enough memory for new state.
+ // In addition to what we're going to allocate,
+ // the state cache hash table seems to incur about 18 bytes per
+ // State*. Worst case for non-small sets is it being half full, where each
+ // value present takes up 1 byte hash sample plus the pointer itself.
+ const int kStateCacheOverhead = 18;
+ int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
+ int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>);
+ int instmem = ninst*sizeof(int);
+ if (mem_budget_ < mem + instmem + kStateCacheOverhead) {
+ mem_budget_ = -1;
+ return NULL;
+ }
+ mem_budget_ -= mem + instmem + kStateCacheOverhead;
+
+ // Allocate new state along with room for next_ and inst_.
+ // inst_ is stored separately since it's colder; this also
+ // means that the States for a given DFA are the same size
+ // class, so the allocator can hopefully pack them better.
+ // DFA::ClearCache() releases both allocations.
+ char* space = std::allocator<char>().allocate(mem);
+ State* s = new (space) State;
+ (void) new (s->next_) std::atomic<State*>[nnext];
+ // Work around an unfortunate bug in older versions of libstdc++.
+ // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
+ for (int i = 0; i < nnext; i++)
+ (void) new (s->next_ + i) std::atomic<State*>(NULL);
+ s->inst_ = std::allocator<int>().allocate(ninst);
+ (void) new (s->inst_) int[ninst];
+ memmove(s->inst_, inst, instmem);
+ s->ninst_ = ninst;
+ s->flag_ = flag;
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " -> %s\n", DumpState(s));
+
+ // Put state in cache and return it.
+ state_cache_.insert(s);
+ return s;
+}
+
+// Frees every State held in the cache and then empties the cache.
+// Must hold cache_mutex_.w or be in destructor.
+void DFA::ClearCache() {
+  // Size of the blob of memory that DFA::CachedState() allocated for each
+  // State. We recompute it in order to benefit from sized delete where
+  // possible.
+  const int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
+  const int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>);
+
+  const StateSet::iterator last = state_cache_.end();
+  for (StateSet::iterator it = state_cache_.begin(); it != last;) {
+    State* doomed = *it;
+    // Step past the entry before releasing what it points to.
+    ++it;
+    // The instruction array is stored separately from the State itself.
+    std::allocator<int>().deallocate(doomed->inst_, doomed->ninst_);
+    std::allocator<char>().deallocate(reinterpret_cast<char*>(doomed), mem);
+  }
+  state_cache_.clear();
+}
+
+// Rebuilds the work queue q from the instruction list stored in state s.
+// Each stored id is explored with AddToQueue(), each Mark becomes a mark
+// in q, and MatchSep ends the walk.
+void DFA::StateToWorkq(State* s, Workq* q) {
+  q->clear();
+  const uint32_t emptyflags = s->flag_ & kFlagEmptyMask;
+  for (int i = 0; i < s->ninst_; i++) {
+    const int id = s->inst_[i];
+    if (id == MatchSep)
+      break;  // Nothing after this is an instruction!
+    if (id == Mark) {
+      q->mark();
+      continue;
+    }
+    // Explore from the head of the list.
+    AddToQueue(q, id, emptyflags);
+  }
+}
+
+// Adds the instruction id to the work queue q, following empty arrows
+// according to flag. Passing Mark as id adds a mark to q instead.
+// Uses stack_ as scratch space.
+void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
+
+ // Use stack_ to hold our stack of instructions yet to process.
+ // It was preallocated as follows:
+ // one entry per Capture;
+ // one entry per EmptyWidth; and
+ // one entry per Nop.
+ // This reflects the maximum number of stack pushes that each can
+ // perform. (Each instruction can be processed at most once.)
+ // When using marks, we also added nmark == prog_->size().
+ // (Otherwise, nmark == 0.)
+ int* stk = stack_.data();
+ int nstk = 0;
+
+ stk[nstk++] = id;
+ while (nstk > 0) {
+ DCHECK_LE(nstk, stack_.size());
+ id = stk[--nstk];
+
+ // Jumping to Loop processes a new id right away, without going
+ // through the stack.
+ Loop:
+ if (id == Mark) {
+ q->mark();
+ continue;
+ }
+
+ // Instruction id 0 is never added to the queue.
+ if (id == 0)
+ continue;
+
+ // If ip is already on the queue, nothing to do.
+ // Otherwise add it. We don't actually keep all the
+ // ones that get added, but adding all of them here
+ // increases the likelihood of q->contains(id),
+ // reducing the amount of duplicated work.
+ if (q->contains(id))
+ continue;
+ q->insert_new(id);
+
+ // Process instruction.
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstByteRange: // just save these on the queue
+ case kInstMatch:
+ // Unless this is the last instruction of its list, move on to the
+ // next instruction in the list.
+ if (ip->last())
+ break;
+ id = id+1;
+ goto Loop;
+
+ case kInstCapture: // DFA treats captures as no-ops.
+ case kInstNop:
+ // Come back to the rest of the list later; follow out() first.
+ if (!ip->last())
+ stk[nstk++] = id+1;
+
+ // If this instruction is the [00-FF]* loop at the beginning of
+ // a leftmost-longest unanchored search, separate with a Mark so
+ // that future threads (which will start farther to the right in
+ // the input string) are lower priority than current threads.
+ if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
+ id == prog_->start_unanchored() && id != prog_->start())
+ stk[nstk++] = Mark;
+ id = ip->out();
+ goto Loop;
+
+ case kInstAltMatch:
+ DCHECK(!ip->last());
+ id = id+1;
+ goto Loop;
+
+ case kInstEmptyWidth:
+ if (!ip->last())
+ stk[nstk++] = id+1;
+
+ // Continue on if we have all the right flag bits.
+ if (ip->empty() & ~flag)
+ break;
+ id = ip->out();
+ goto Loop;
+ }
+ }
+}
+
+// Running of work queues. In the work queue, order matters:
+// the queue is sorted in priority order. If instruction i comes before j,
+// then the instructions that i produces during the run must come before
+// the ones that j produces. In order to keep this invariant, all the
+// work queue runners have to take an old queue to process and then
+// also a new queue to fill in. It's not acceptable to add to the end of
+// an existing queue, because new instructions will not end up in the
+// correct position.
+
+// Fills newq with the result of running oldq on the empty strings that
+// flag indicates. For example, flag == kEmptyBeginLine|kEmptyEndLine
+// means to match both ^ and $. Callers must pass all flags at once:
+// processing ^ and $ together is not the same as processing only ^ and
+// then only $. The two-step sequence won't match ^$^$^$, whereas the
+// simultaneous one will (and that is the behavior exhibited by existing
+// implementations).
+void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
+  newq->clear();
+  for (Workq::iterator it = oldq->begin(); it != oldq->end(); ++it) {
+    // Marks in oldq are carried over to newq as marks.
+    const int id = oldq->is_mark(*it) ? Mark : *it;
+    AddToQueue(newq, id, flag);
+  }
+}
+
+// Runs the work queue, processing the single byte c followed by any empty
+// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine,
+// means to match c$. Sets the bool *ismatch to true if the end of the
+// regular expression program has been reached (the regexp has matched).
+// *ismatch is never set to false here and is read when a mark is reached,
+// so the caller must initialize it.
+void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
+ int c, uint32_t flag, bool* ismatch) {
+ //mutex_.AssertHeld();
+
+ newq->clear();
+ for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
+ if (oldq->is_mark(*i)) {
+ // A match found before this mark ends the run; the sections of
+ // oldq after the mark are not processed.
+ if (*ismatch)
+ return;
+ newq->mark();
+ continue;
+ }
+ int id = *i;
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstFail: // never succeeds
+ case kInstCapture: // already followed
+ case kInstNop: // already followed
+ case kInstAltMatch: // already followed
+ case kInstEmptyWidth: // already followed
+ break;
+
+ case kInstByteRange: // can follow if c is in range
+ if (!ip->Matches(c))
+ break;
+ AddToQueue(newq, ip->out(), flag);
+ if (ip->hint() != 0) {
+ // We have a hint, but we must cancel out the
+ // increment that will occur after the break.
+ i += ip->hint() - 1;
+ } else {
+ // We have no hint, so we must find the end
+ // of the current list and then skip to it.
+ Prog::Inst* ip0 = ip;
+ while (!ip->last())
+ ++ip;
+ i += ip - ip0;
+ }
+ break;
+
+ case kInstMatch:
+ // With an end anchor, a match only counts at the end of the text,
+ // except in many-match mode.
+ if (prog_->anchor_end() && c != kByteEndText &&
+ kind_ != Prog::kManyMatch)
+ break;
+ *ismatch = true;
+ if (kind_ == Prog::kFirstMatch) {
+ // Can stop processing work queue since we found a match.
+ return;
+ }
+ break;
+ }
+ }
+
+ // Note that the early returns above skip this debugging printout.
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "%s on %d[%#x] -> %s [%d]\n",
+ DumpWorkq(oldq), c, flag, DumpWorkq(newq), *ismatch);
+}
+
+// Locking wrapper around RunStateOnByte(): processes input byte c in
+// state and returns the new state. Caller does not hold mutex.
+DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
+  // Holding mutex_ keeps only one RunStateOnByte going
+  // even if the DFA is being run by multiple threads.
+  absl::MutexLock lock(&mutex_);
+  State* next = RunStateOnByte(state, c);
+  return next;
+}
+
+// Processes input byte c in state, returning new state.
+// c is either a byte value or kByteEndText.
+// The result is also recorded in state->next_ so that later calls (and
+// readers of next_) find it without recomputing.
+// Returns NULL if state is NULL or DeadState (after logging DFATAL); the
+// result of WorkqToCachedState() is returned as is, so it may also be NULL.
+DFA::State* DFA::RunStateOnByte(State* state, int c) {
+ //mutex_.AssertHeld();
+
+ // Special states are small constants cast to State*, not real States.
+ if (state <= SpecialStateMax) {
+ if (state == FullMatchState) {
+ // It is convenient for routines like PossibleMatchRange
+ // if we implement RunStateOnByte for FullMatchState:
+ // once you get into this state you never get out,
+ // so it's pretty easy.
+ return FullMatchState;
+ }
+ if (state == DeadState) {
+ LOG(DFATAL) << "DeadState in RunStateOnByte";
+ return NULL;
+ }
+ if (state == NULL) {
+ LOG(DFATAL) << "NULL state in RunStateOnByte";
+ return NULL;
+ }
+ LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
+ return NULL;
+ }
+
+ // If someone else already computed this, return it.
+ State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed);
+ if (ns != NULL)
+ return ns;
+
+ // Convert state into Workq.
+ StateToWorkq(state, q0_);
+
+ // Flags marking the kinds of empty-width things (^ $ etc)
+ // around this byte. Before the byte we have the flags recorded
+ // in the State structure itself. After the byte we have
+ // nothing yet (but that will change: read on).
+ uint32_t needflag = state->flag_ >> kFlagNeedShift;
+ uint32_t beforeflag = state->flag_ & kFlagEmptyMask;
+ uint32_t oldbeforeflag = beforeflag;
+ uint32_t afterflag = 0;
+
+ if (c == '\n') {
+ // Insert implicit $ and ^ around \n
+ beforeflag |= kEmptyEndLine;
+ afterflag |= kEmptyBeginLine;
+ }
+
+ if (c == kByteEndText) {
+ // Insert implicit $ and \z before the fake "end text" byte.
+ beforeflag |= kEmptyEndLine | kEmptyEndText;
+ }
+
+ // The state flag kFlagLastWord says whether the last
+ // byte processed was a word character. Use that info to
+ // insert empty-width (non-)word boundaries.
+ bool islastword = (state->flag_ & kFlagLastWord) != 0;
+ bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c));
+ if (isword == islastword)
+ beforeflag |= kEmptyNonWordBoundary;
+ else
+ beforeflag |= kEmptyWordBoundary;
+
+ // Okay, finally ready to run.
+ // Only useful to rerun on empty string if there are new, useful flags.
+ if (beforeflag & ~oldbeforeflag & needflag) {
+ RunWorkqOnEmptyString(q0_, q1_, beforeflag);
+ using std::swap;
+ swap(q0_, q1_);
+ }
+ bool ismatch = false;
+ RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
+ using std::swap;
+ // After this swap, q0_ is the queue that RunWorkqOnByte just filled and
+ // q1_ is the queue it ran on.
+ swap(q0_, q1_);
+
+ // Save afterflag along with ismatch and isword in new state.
+ uint32_t flag = afterflag;
+ if (ismatch)
+ flag |= kFlagMatch;
+ if (isword)
+ flag |= kFlagLastWord;
+
+ // In many-match mode, a matching state also records the match IDs found
+ // in q1_ (see the mq parameter of WorkqToCachedState).
+ if (ismatch && kind_ == Prog::kManyMatch)
+ ns = WorkqToCachedState(q0_, q1_, flag);
+ else
+ ns = WorkqToCachedState(q0_, NULL, flag);
+
+ // Flush ns before linking to it.
+ // Write barrier before updating state->next_ so that the
+ // main search loop can proceed without any locking, for speed.
+ // (Otherwise it would need one mutex operation per input byte.)
+ state->next_[ByteMap(c)].store(ns, std::memory_order_release);
+ return ns;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// DFA cache reset.
+
+// Reader-writer lock helper.
+//
+// The DFA uses a reader-writer mutex to protect the state graph itself.
+// Traversing the state graph requires holding the mutex for reading,
+// and discarding the state graph and starting over requires holding the
+// lock for writing. If a search needs to expand the graph but is out
+// of memory, it will need to drop its read lock and then acquire the
+// write lock. Since it cannot then atomically downgrade from write lock
+// to read lock, it runs the rest of the search holding the write lock.
+// (This probably helps avoid repeated contention, but really the decision
+// is forced by the Mutex interface.) It's a bit complicated to keep
+// track of whether the lock is held for reading or writing and thread
+// that through the search, so instead we encapsulate it in the RWLocker
+// and pass that around.
+
+// Holds a CacheMutex for its whole lifetime, initially for reading and,
+// once LockForWriting has been called, for writing. Not copyable.
+class DFA::RWLocker {
+ public:
+ // Acquires *mu for reading.
+ explicit RWLocker(CacheMutex* mu);
+ // Releases the lock in whichever mode (reading or writing) it is held.
+ ~RWLocker();
+
+ // If the lock is only held for reading right now,
+ // drop the read lock and re-acquire for writing.
+ // Subsequent calls to LockForWriting are no-ops.
+ // Notice that the lock is *released* temporarily.
+ void LockForWriting();
+
+ private:
+ CacheMutex* mu_; // the mutex being held
+ bool writing_; // true once the lock is held for writing
+
+ RWLocker(const RWLocker&) = delete;
+ RWLocker& operator=(const RWLocker&) = delete;
+};
+
+// Starts out holding mu for reading.
+DFA::RWLocker::RWLocker(CacheMutex* mu)
+    : mu_(mu),
+      writing_(false) {
+  mu_->ReaderLock();
+}
+
+// Trades the read lock for the write lock; does nothing if the write lock
+// is already held. The mutex is not held between the two calls below.
+// Marked as ABSL_NO_THREAD_SAFETY_ANALYSIS because the annotations
+// don't support lock upgrade.
+void DFA::RWLocker::LockForWriting() ABSL_NO_THREAD_SAFETY_ANALYSIS {
+  if (writing_)
+    return;
+  mu_->ReaderUnlock();
+  mu_->WriterLock();
+  writing_ = true;
+}
+
+// Releases the mutex with the unlock call matching the current mode.
+DFA::RWLocker::~RWLocker() {
+  if (writing_) {
+    mu_->WriterUnlock();
+  } else {
+    mu_->ReaderUnlock();
+  }
+}
+
+
+// When the DFA's State cache fills, we discard all the states in the
+// cache and start over. Many threads can be using and adding to the
+// cache at the same time, so we synchronize using the cache_mutex_
+// to keep from stepping on other threads. Specifically, all the
+// threads using the current cache hold cache_mutex_ for reading.
+// When a thread decides to flush the cache, it drops cache_mutex_
+// and then re-acquires it for writing. That ensures there are no
+// other threads accessing the cache anymore. The rest of the search
+// runs holding cache_mutex_ for writing, avoiding any contention
+// with or cache pollution caused by other threads.
+
+// Discards every cached State and restores the memory budget.
+// cache_lock is switched to (or stays in) write mode.
+void DFA::ResetCache(RWLocker* cache_lock) {
+  // Exclusive use of the cache is needed before it can be cleared.
+  cache_lock->LockForWriting();
+
+  // Tell the hook about the reset: the state budget and the size
+  // of the cache just before it is cleared.
+  hooks::GetDFAStateCacheResetHook()({
+      state_budget_,
+      state_cache_.size(),
+  });
+
+  // Forget the cached start states before clearing the cache.
+  int i = 0;
+  while (i < kMaxStart) {
+    start_[i].start.store(NULL, std::memory_order_relaxed);
+    i++;
+  }
+
+  // Drop the states themselves, then reset the memory budget.
+  ClearCache();
+  mem_budget_ = state_budget_;
+}
+
+// Typically, a couple States do need to be preserved across a cache
+// reset, like the State at the current point in the search.
+// The StateSaver class helps keep States across cache resets.
+// It makes a copy of the state's guts outside the cache (before the reset)
+// and then can be asked, after the reset, to recreate the State
+// in the new cache. For example, in a DFA method ("this" is a DFA):
+//
+// StateSaver saver(this, s);
+// ResetCache(cache_lock);
+// s = saver.Restore();
+//
+// The saver should always have room in the cache to re-create the state,
+// because resetting the cache locks out all other threads, and the cache
+// is known to have room for at least a couple states (otherwise the DFA
+// constructor fails).
+
+class DFA::StateSaver {
+ public:
+  // Records what is needed to recreate state in dfa's cache later.
+  explicit StateSaver(DFA* dfa, State* state);
+
+  // Not copyable: the saver owns its copy of the instruction array.
+  StateSaver(const StateSaver&) = delete;
+  StateSaver& operator=(const StateSaver&) = delete;
+
+  ~StateSaver();
+
+  // Returns a state equivalent to the one handed to the constructor,
+  // recreating it in the cache. Returns NULL if the cache has filled;
+  // because the DFA guarantees room in the cache for a couple states,
+  // that should never happen when called right after ResetCache.
+  State* Restore();
+
+ private:
+  DFA* dfa_;         // the DFA to use
+  int* inst_;        // saved copy of the State's inst_ (NULL if special)
+  int ninst_;        // number of entries in inst_
+  uint32_t flag_;    // saved copy of the State's flag_
+  bool is_special_;  // whether original state was special
+  State* special_;   // if is_special_, the original state
+};
+
+// Saves state. A special state (state <= SpecialStateMax) is simply
+// remembered by pointer; for any other state, its flag_ and a private
+// copy of its inst_ array are kept.
+DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
+  dfa_ = dfa;
+  is_special_ = (state <= SpecialStateMax);
+  if (is_special_) {
+    special_ = state;
+    inst_ = NULL;
+    ninst_ = 0;
+    flag_ = 0;
+  } else {
+    special_ = NULL;
+    flag_ = state->flag_;
+    ninst_ = state->ninst_;
+    inst_ = new int[ninst_];
+    memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
+  }
+}
+
+// Frees the saved instruction copy; special states never made one.
+DFA::StateSaver::~StateSaver() {
+  if (is_special_)
+    return;
+  delete[] inst_;
+}
+
+// Returns the saved state: the original pointer for a special state,
+// otherwise the result of looking the saved contents up in (or adding
+// them to) the DFA's cache. Logs DFATAL and returns NULL on failure.
+DFA::State* DFA::StateSaver::Restore() {
+  if (is_special_)
+    return special_;
+
+  // CachedState is called with the DFA's mutex_ held.
+  absl::MutexLock l(&dfa_->mutex_);
+  State* restored = dfa_->CachedState(inst_, ninst_, flag_);
+  if (restored == NULL)
+    LOG(DFATAL) << "StateSaver failed to restore state.";
+  return restored;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+//
+// DFA execution.
+//
+// The basic search loop is easy: start in a state s and then for each
+// byte c in the input, s = s->next[c].
+//
+// This simple description omits a few efficiency-driven complications.
+//
+// First, the State graph is constructed incrementally: it is possible
+// that s->next[c] is null, indicating that that state has not been
+// fully explored. In this case, RunStateOnByte must be invoked to
+// determine the next state, which is cached in s->next[c] to save
+// future effort. An alternative reason for s->next[c] to be null is
+// that the DFA has reached a so-called "dead state", in which any match
+// is no longer possible. In this case RunStateOnByte will return NULL
+// and the processing of the string can stop early.
+//
+// Second, a 256-element pointer array for s->next_ makes each State
+// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[]
+// maps from bytes to "byte classes" and then next_ only needs to have
+// as many pointers as there are byte classes. A byte class is simply a
+// range of bytes that the regexp never distinguishes between.
+// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
+// 'a', 'b' to 'c', and 'c'+1 to 0xFF. The bytemap slows us a little bit
+// but in exchange we typically cut the size of a State (and thus our
+// memory footprint) by about 5-10x. The comments still refer to
+// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
+//
+// Third, it is common for a DFA for an unanchored match to begin in a
+// state in which only one particular byte value can take the DFA to a
+// different state. That is, s->next[c] != s for only one c. In this
+// situation, the DFA can do better than executing the simple loop.
+// Instead, it can call memchr to search very quickly for the byte c.
+// Whether the start state has this property is determined during a
+// pre-compilation pass and the "can_prefix_accel" argument is set.
+//
+// Fourth, the desired behavior is to search for the leftmost-best match
+// (approximately, the same one that Perl would find), which is not
+// necessarily the match ending earliest in the string. Each time a
+// match is found, it must be noted, but the DFA must continue on in
+// hope of finding a higher-priority match. In some cases, the caller only
+// cares whether there is any match at all, not which one is found.
+// The "want_earliest_match" flag causes the search to stop at the first
+// match found.
+//
+// Fifth, one algorithm that uses the DFA needs it to run over the
+// input string backward, beginning at the end and ending at the beginning.
+// Passing false for the "run_forward" flag causes the DFA to run backward.
+//
+// The checks for these last three cases, which in a naive implementation
+// would be performed once per input byte, slow the general loop enough
+// to merit specialized versions of the search loop for each of the
+// eight possible settings of the three booleans. Rather than write
+// eight different functions, we write one general implementation and then
+// inline it to create the specialized ones.
+//
+// Note that matches are delayed by one byte, to make it easier to
+// accommodate match conditions depending on the next input byte (like $ and \b).
+// When s->next[c]->IsMatch(), it means that there is a match ending just
+// *before* byte c.
+
+// The generic search loop. Searches params->text for a match. Returns true
+// on a match and false otherwise; on completion params->ep is the end of the
+// chosen match (NULL if none). Sets params->failed if the search gave up.
+// The template bools mirror the same-named fields in params; making them
+// template arguments lets the compiler specialize the loop per combination.
+template <bool can_prefix_accel,
+ bool want_earliest_match,
+ bool run_forward>
+inline bool DFA::InlinedSearchLoop(SearchParams* params) {
+ State* start = params->start;
+ const uint8_t* bp = BytePtr(params->text.data()); // start of text
+ const uint8_t* p = bp; // text scanning point
+ const uint8_t* ep = BytePtr(params->text.data() +
+ params->text.size()); // end of text
+ const uint8_t* resetp = NULL; // p at last cache reset
+ if (!run_forward) {
+ using std::swap;
+ swap(p, ep); // running backward: scan from the end of text toward its start
+ }
+
+ const uint8_t* bytemap = prog_->bytemap();
+ const uint8_t* lastmatch = NULL; // most recent matching position in text
+ bool matched = false;
+
+ State* s = start;
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "@stx: %s\n", DumpState(s));
+
+ if (s->IsMatch()) { // the start state itself is a match, ending at the scan start
+ matched = true;
+ lastmatch = p;
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "match @stx! [%s]\n", DumpState(s));
+ if (params->matches != NULL) {
+ for (int i = s->ninst_ - 1; i >= 0; i--) { // walk inst_ from the back, up to the MatchSep marker
+ int id = s->inst_[i];
+ if (id == MatchSep)
+ break;
+ params->matches->insert(id);
+ }
+ }
+ if (want_earliest_match) { // the first match found is good enough
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return true;
+ }
+ }
+
+ while (p != ep) {
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "@%d: %s\n", p - bp, DumpState(s));
+
+ if (can_prefix_accel && s == start) {
+ // In start state, only way out is to find the prefix,
+ // so we use prefix accel (e.g. memchr) to skip ahead.
+ // If not found, we can skip to the end of the string.
+ p = BytePtr(prog_->PrefixAccel(p, ep - p));
+ if (p == NULL) {
+ p = ep;
+ break;
+ }
+ }
+
+ int c;
+ if (run_forward)
+ c = *p++;
+ else
+ c = *--p; // backward scan consumes the byte just before p
+
+ // Note that multiple threads might be consulting
+ // s->next_[bytemap[c]] simultaneously.
+ // RunStateOnByte takes care of the appropriate locking,
+ // including a memory barrier so that the unlocked access
+ // (sometimes known as "double-checked locking") is safe.
+ // The alternative would be either one DFA per thread
+ // or one mutex operation per input byte.
+ //
+ // ns == DeadState means the state is known to be dead
+ // (no more matches are possible).
+ // ns == NULL means the state has not yet been computed
+ // (need to call RunStateOnByteUnlocked).
+ // RunStateOnByte returns ns == NULL if it is out of memory.
+ // ns == FullMatchState means the rest of the string matches.
+ //
+ // Okay to use bytemap[] not ByteMap() here, because
+ // c is known to be an actual byte and not kByteEndText.
+
+ State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire);
+ if (ns == NULL) {
+ ns = RunStateOnByteUnlocked(s, c);
+ if (ns == NULL) {
+ // After we reset the cache, we hold cache_mutex exclusively,
+ // so if resetp != NULL, it means we filled the DFA state
+ // cache with this search alone (without any other threads).
+ // Benchmarks show that doing a state computation on every
+ // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
+ // same at about 2 MB/s. Unless we're processing an average
+ // of 10 bytes per state computation, fail so that RE2 can
+ // fall back to the NFA. However, RE2::Set cannot fall back,
+ // so we just have to keep on keeping on in that case.
+ if (dfa_should_bail_when_slow && resetp != NULL &&
+ static_cast<size_t>(p - resetp) < 10*state_cache_.size() &&
+ kind_ != Prog::kManyMatch) {
+ params->failed = true;
+ return false;
+ }
+ resetp = p; // remember where this reset happened for the next bail-out check
+
+ // Prepare to save start and s across the reset.
+ StateSaver save_start(this, start);
+ StateSaver save_s(this, s);
+
+ // Discard all the States in the cache.
+ ResetCache(params->cache_lock);
+
+ // Restore start and s so we can continue.
+ if ((start = save_start.Restore()) == NULL ||
+ (s = save_s.Restore()) == NULL) {
+ // Restore already did LOG(DFATAL).
+ params->failed = true;
+ return false;
+ }
+ ns = RunStateOnByteUnlocked(s, c);
+ if (ns == NULL) {
+ LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
+ params->failed = true;
+ return false;
+ }
+ }
+ }
+ if (ns <= SpecialStateMax) { // ns is DeadState or FullMatchState
+ if (ns == DeadState) {
+ params->ep = reinterpret_cast<const char*>(lastmatch); // report the last match seen, if any
+ return matched;
+ }
+ // FullMatchState
+ params->ep = reinterpret_cast<const char*>(ep); // the match extends to the scan end
+ return true;
+ }
+
+ s = ns;
+ if (s->IsMatch()) {
+ matched = true;
+ // The DFA notices the match one byte late,
+ // so adjust p before using it in the match.
+ if (run_forward)
+ lastmatch = p - 1;
+ else
+ lastmatch = p + 1;
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "match @%d! [%s]\n", lastmatch - bp, DumpState(s));
+ if (params->matches != NULL) {
+ for (int i = s->ninst_ - 1; i >= 0; i--) { // same back-to-front walk as for the start state
+ int id = s->inst_[i];
+ if (id == MatchSep)
+ break;
+ params->matches->insert(id);
+ }
+ }
+ if (want_earliest_match) {
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return true;
+ }
+ }
+ }
+
+ // Process one more byte to see if it triggers a match.
+ // (Remember, matches are delayed one byte.)
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "@etx: %s\n", DumpState(s));
+
+ int lastbyte;
+ if (run_forward) {
+ if (EndPtr(params->text) == EndPtr(params->context)) // text ends where context ends
+ lastbyte = kByteEndText;
+ else
+ lastbyte = EndPtr(params->text)[0] & 0xFF; // the context byte just past text
+ } else {
+ if (BeginPtr(params->text) == BeginPtr(params->context)) // text begins where context begins
+ lastbyte = kByteEndText;
+ else
+ lastbyte = BeginPtr(params->text)[-1] & 0xFF; // the context byte just before text
+ }
+
+ State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
+ if (ns == NULL) {
+ ns = RunStateOnByteUnlocked(s, lastbyte);
+ if (ns == NULL) {
+ StateSaver save_s(this, s); // only s is needed from here on, so start is not saved
+ ResetCache(params->cache_lock);
+ if ((s = save_s.Restore()) == NULL) {
+ params->failed = true;
+ return false;
+ }
+ ns = RunStateOnByteUnlocked(s, lastbyte);
+ if (ns == NULL) {
+ LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
+ params->failed = true;
+ return false;
+ }
+ }
+ }
+ if (ns <= SpecialStateMax) { // ns is DeadState or FullMatchState
+ if (ns == DeadState) {
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return matched;
+ }
+ // FullMatchState
+ params->ep = reinterpret_cast<const char*>(ep);
+ return true;
+ }
+
+ s = ns;
+ if (s->IsMatch()) {
+ matched = true;
+ lastmatch = p; // p == ep here, so the match ends at the scan end
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "match @etx! [%s]\n", DumpState(s));
+ if (params->matches != NULL) {
+ for (int i = s->ninst_ - 1; i >= 0; i--) {
+ int id = s->inst_[i];
+ if (id == MatchSep)
+ break;
+ params->matches->insert(id);
+ }
+ }
+ }
+
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return matched;
+}
+
+// Inline specializations of the general loop; the three letters (T/F) give can_prefix_accel, want_earliest_match, run_forward.
+bool DFA::SearchFFF(SearchParams* params) { // no prefix accel, not earliest, backward
+ return InlinedSearchLoop<false, false, false>(params);
+}
+bool DFA::SearchFFT(SearchParams* params) { // no prefix accel, not earliest, forward
+ return InlinedSearchLoop<false, false, true>(params);
+}
+bool DFA::SearchFTF(SearchParams* params) { // no prefix accel, earliest match, backward
+ return InlinedSearchLoop<false, true, false>(params);
+}
+bool DFA::SearchFTT(SearchParams* params) { // no prefix accel, earliest match, forward
+ return InlinedSearchLoop<false, true, true>(params);
+}
+bool DFA::SearchTFF(SearchParams* params) { // prefix accel, not earliest, backward
+ return InlinedSearchLoop<true, false, false>(params);
+}
+bool DFA::SearchTFT(SearchParams* params) { // prefix accel, not earliest, forward
+ return InlinedSearchLoop<true, false, true>(params);
+}
+bool DFA::SearchTTF(SearchParams* params) { // prefix accel, earliest match, backward
+ return InlinedSearchLoop<true, true, false>(params);
+}
+bool DFA::SearchTTT(SearchParams* params) { // prefix accel, earliest match, forward
+ return InlinedSearchLoop<true, true, true>(params);
+}
+
+// Dispatches to the InlinedSearchLoop specialization that matches the
+// three boolean settings in params, for performance.
+bool DFA::FastSearchLoop(SearchParams* params) {
+  // The Search* methods are private, so this table cannot be
+  // declared at top level.
+  using SearchFn = bool (DFA::*)(SearchParams*);
+  static SearchFn Searches[] = {
+      &DFA::SearchFFF,  // 0
+      &DFA::SearchFFT,  // 1
+      &DFA::SearchFTF,  // 2
+      &DFA::SearchFTT,  // 3
+      &DFA::SearchTFF,  // 4
+      &DFA::SearchTFT,  // 5
+      &DFA::SearchTTF,  // 6
+      &DFA::SearchTTT,  // 7
+  };
+
+  // Table position: can_prefix_accel counts 4, want_earliest_match
+  // counts 2 and run_forward counts 1.
+  const int index = 1 * params->run_forward +
+                    2 * params->want_earliest_match +
+                    4 * params->can_prefix_accel;
+  SearchFn search = Searches[index];
+  return (this->*search)(params);
+}
+
+
+// The discussion of DFA execution above ignored the question of how
+// to determine the initial state for the search loop. There are two
+// factors that influence the choice of start state.
+//
+// The first factor is whether the search is anchored or not.
+// The regexp program (Prog*) itself has
+// two different entry points: one for anchored searches and one for
+// unanchored searches. (The unanchored version starts with a leading ".*?"
+// and then jumps to the anchored one.)
+//
+// The second factor is where text appears in the larger context, which
+// determines which empty-string operators can be matched at the beginning
+// of execution. If text is at the very beginning of context, \A and ^ match.
+// Otherwise if text is at the beginning of a line, then ^ matches.
+// Otherwise it matters whether the character before text is a word character
+// or a non-word character.
+//
+// The two cases (unanchored vs not) and four cases (empty-string flags)
+// combine to make the eight cases recorded in the DFA's start_[] array of
+// cached StartInfos (kStartBeginText, kStartBeginLine, kStartAfterWordChar,
+// kStartAfterNonWordChar, each optionally OR'ed with kStartAnchored).
+// The start state for each is filled in the first time it is used for a search.
+
+// Examines params->text, params->context, params->anchored and
+// params->run_forward to pick the start state for the DFA search loop.
+// Fills in params->start and params->can_prefix_accel; returns false (and sets params->failed) on failure.
+bool DFA::AnalyzeSearch(SearchParams* params) {
+ absl::string_view text = params->text;
+ absl::string_view context = params->context;
+
+ // Sanity check: make sure that text lies within context.
+ if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
+ LOG(DFATAL) << "context does not contain text";
+ params->start = DeadState; // DFA::Search reports "no match" for a DeadState start
+ return true;
+ }
+
+ // Determine correct search type.
+ int start;
+ uint32_t flags;
+ if (params->run_forward) { // forward: classify by the byte just before text
+ if (BeginPtr(text) == BeginPtr(context)) {
+ start = kStartBeginText;
+ flags = kEmptyBeginText|kEmptyBeginLine;
+ } else if (BeginPtr(text)[-1] == '\n') {
+ start = kStartBeginLine;
+ flags = kEmptyBeginLine;
+ } else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) {
+ start = kStartAfterWordChar;
+ flags = kFlagLastWord;
+ } else {
+ start = kStartAfterNonWordChar;
+ flags = 0;
+ }
+ } else { // backward: the scan begins at the end of text, so classify by the byte just after text
+ if (EndPtr(text) == EndPtr(context)) {
+ start = kStartBeginText;
+ flags = kEmptyBeginText|kEmptyBeginLine;
+ } else if (EndPtr(text)[0] == '\n') {
+ start = kStartBeginLine;
+ flags = kEmptyBeginLine;
+ } else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) {
+ start = kStartAfterWordChar;
+ flags = kFlagLastWord;
+ } else {
+ start = kStartAfterNonWordChar;
+ flags = 0;
+ }
+ }
+ if (params->anchored)
+ start |= kStartAnchored; // anchored searches use their own start_[] slots
+ StartInfo* info = &start_[start];
+
+ // Try once without cache_lock for writing.
+ // Try again after resetting the cache
+ // (ResetCache will relock cache_lock for writing).
+ if (!AnalyzeSearchHelper(params, info, flags)) {
+ ResetCache(params->cache_lock);
+ if (!AnalyzeSearchHelper(params, info, flags)) {
+ params->failed = true;
+ LOG(DFATAL) << "Failed to analyze start state.";
+ return false;
+ }
+ }
+
+ params->start = info->start.load(std::memory_order_acquire);
+
+ // Even if we could prefix accel, we cannot do so when anchored and,
+ // less obviously, we cannot do so when we are going to need flags.
+ // This trick works only when there is a single byte that leads to a
+ // different state!
+ if (prog_->can_prefix_accel() &&
+ !params->anchored &&
+ params->start > SpecialStateMax &&
+ params->start->flag_ >> kFlagNeedShift == 0)
+ params->can_prefix_accel = true;
+
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
+ params->anchored, params->run_forward, flags,
+ DumpState(params->start), params->can_prefix_accel);
+
+ return true;
+}
+
+// Makes sure info->start holds the start state for flags, computing
+// and caching it if it is not there yet. Returns true on success,
+// false if the state could not be put in the cache.
+bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
+                              uint32_t flags) {
+  // Quick check without the mutex: already filled in?
+  if (info->start.load(std::memory_order_acquire) != NULL)
+    return true;
+
+  // Check again with mutex_ held before doing the work.
+  absl::MutexLock l(&mutex_);
+  if (info->start.load(std::memory_order_relaxed) != NULL)
+    return true;
+
+  // Build the start state from the program's entry point
+  // (anchored or unanchored) and the empty-string flags.
+  q0_->clear();
+  AddToQueue(q0_,
+             params->anchored ? prog_->start() : prog_->start_unanchored(),
+             flags);
+  State* computed = WorkqToCachedState(q0_, NULL, flags);
+  if (computed == NULL)
+    return false;
+
+  // Synchronize with "quick check" above.
+  info->start.store(computed, std::memory_order_release);
+  return true;
+}
+
+// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
+// Returns whether a match was found and stores its end in *epp
+// (NULL if there is none). *failed is set to true, and false is
+// returned, when the DFA is not usable or the search gave up.
+bool DFA::Search(absl::string_view text, absl::string_view context,
+                 bool anchored, bool want_earliest_match, bool run_forward,
+                 bool* failed, const char** epp, SparseSet* matches) {
+  *epp = NULL;
+  *failed = !ok();
+  if (*failed)
+    return false;
+
+  if (ExtraDebug) {
+    absl::FPrintF(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored());
+    absl::FPrintF(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
+                  text, anchored, want_earliest_match, run_forward, kind_);
+  }
+
+  // The cache lock is held (for reading, at first) until this returns.
+  RWLocker l(&cache_mutex_);
+  SearchParams params(text, context, &l);
+  params.anchored = anchored;
+  params.want_earliest_match = want_earliest_match;
+  params.run_forward = run_forward;
+  // matches should be null except when using RE2::Set.
+  DCHECK(matches == NULL || kind_ == Prog::kManyMatch);
+  params.matches = matches;
+
+  if (!AnalyzeSearch(&params)) {
+    *failed = true;
+    return false;
+  }
+
+  State* const start = params.start;
+  if (start == DeadState)
+    return false;
+  if (start == FullMatchState) {
+    // Everything matches: the reported end is where the scan starts
+    // when run_forward == want_earliest_match, else the far end of text.
+    *epp = (run_forward == want_earliest_match)
+               ? text.data()
+               : text.data() + text.size();
+    return true;
+  }
+  if (ExtraDebug)
+    absl::FPrintF(stderr, "start %s\n", DumpState(start));
+
+  const bool found = FastSearchLoop(&params);
+  if (params.failed) {
+    *failed = true;
+    return false;
+  }
+  *epp = params.ep;
+  return found;
+}
+
+// Returns the DFA for kind, constructing it on first use.
+//
+// Memory split:
+// For a forward DFA, half the memory goes to each DFA.
+// However, if it is a "many match" DFA, then there is
+// no counterpart with which the memory must be shared.
+//
+// For a reverse DFA, all the memory goes to the
+// "longest match" DFA, because RE2 never does reverse
+// "first match" searches.
+//
+// Note that kFirstMatch and kManyMatch share dfa_first_once_ and
+// dfa_first_, so only the first of the two kinds to be requested
+// ever constructs the DFA stored there.
+DFA* Prog::GetDFA(MatchKind kind) {
+  if (kind == kFirstMatch) {
+    absl::call_once(dfa_first_once_, [](Prog* prog) {
+      prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
+    }, this);
+    return dfa_first_;
+  }
+
+  if (kind == kManyMatch) {
+    absl::call_once(dfa_first_once_, [](Prog* prog) {
+      prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_);
+    }, this);
+    return dfa_first_;
+  }
+
+  // Every other kind uses the longest-match DFA.
+  absl::call_once(dfa_longest_once_, [](Prog* prog) {
+    if (prog->reversed_)
+      prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_);
+    else
+      prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2);
+  }, this);
+  return dfa_longest_;
+}
+
+// Destroys dfa.
+void Prog::DeleteDFA(DFA* dfa) { delete dfa; }
+
+// Executes the regexp program to search in text,
+// which itself is inside the larger context. (As a convenience,
+// passing a NULL context is equivalent to passing text.)
+// Returns true if a match is found, false if not. Sets *failed to true (and
+// returns false) if the DFA search itself failed. On a match with non-NULL
+// match0, *match0 runs from text.begin() to the match end (reversed program:
+// from the match end to text.end()); the DFA can't track where a match began.
+//
+// This is the only external interface (class DFA only exists in this file).
+//
+bool Prog::SearchDFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match0,
+ bool* failed, SparseSet* matches) {
+ *failed = false;
+
+ if (context.data() == NULL)
+ context = text;
+ bool caret = anchor_start();
+ bool dollar = anchor_end();
+ if (reversed_) {
+ using std::swap;
+ swap(caret, dollar); // for a reversed program the two anchors trade places
+ }
+ if (caret && BeginPtr(context) != BeginPtr(text)) // anchored at the beginning, but text starts after context does
+ return false;
+ if (dollar && EndPtr(context) != EndPtr(text)) // anchored at the end, but text stops before context does
+ return false;
+
+ // Handle full match by running an anchored longest match
+ // and then checking if it covers all of text.
+ bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
+ bool endmatch = false;
+ if (kind == kManyMatch) {
+ // This is split out in order to avoid clobbering kind.
+ } else if (kind == kFullMatch || anchor_end()) {
+ endmatch = true;
+ kind = kLongestMatch;
+ }
+
+ // If the caller doesn't care where the match is (just whether one exists),
+ // then we can stop at the very first match we find, the so-called
+ // "earliest match".
+ bool want_earliest_match = false;
+ if (kind == kManyMatch) {
+ // This is split out in order to avoid clobbering kind.
+ if (matches == NULL) {
+ want_earliest_match = true;
+ }
+ } else if (match0 == NULL && !endmatch) {
+ want_earliest_match = true;
+ kind = kLongestMatch;
+ }
+
+ DFA* dfa = GetDFA(kind);
+ const char* ep;
+ bool matched = dfa->Search(text, context, anchored,
+ want_earliest_match, !reversed_,
+ failed, &ep, matches);
+ if (*failed) {
+ hooks::GetDFASearchFailureHook()({
+ // Nothing yet...
+ });
+ return false;
+ }
+ if (!matched)
+ return false;
+ if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) // the match must reach the far end of text
+ return false;
+
+ // If caller cares, record the boundary of the match.
+ // We only know where it ends, so use the boundary of text
+ // as the beginning.
+ if (match0) {
+ if (reversed_)
+ *match0 =
+ absl::string_view(ep, static_cast<size_t>(text.data() + text.size() - ep));
+ else
+ *match0 =
+ absl::string_view(text.data(), static_cast<size_t>(ep - text.data()));
+ }
+ return true;
+}
+
+// Build out all states in DFA, calling cb (if set) once per expanded state. Returns number of states.
+int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
+ if (!ok())
+ return 0;
+
+ // Pick out start state for unanchored search
+ // at beginning of text.
+ RWLocker l(&cache_mutex_);
+ SearchParams params(absl::string_view(), absl::string_view(), &l);
+ params.anchored = false;
+ if (!AnalyzeSearch(&params) ||
+ params.start == NULL ||
+ params.start == DeadState)
+ return 0;
+
+ // Add start state to work queue.
+ // Note that any State* that we handle here must point into the cache,
+ // so we can simply depend on pointer-as-a-number hashing and equality.
+ absl::flat_hash_map<State*, int> m; // state -> index in order of discovery
+ std::deque<State*> q; // states still to be expanded
+ m.emplace(params.start, static_cast<int>(m.size()));
+ q.push_back(params.start);
+
+ // Compute the input bytes needed to cover all of the next pointers.
+ int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
+ std::vector<int> input(nnext);
+ for (int c = 0; c < 256; c++) {
+ int b = prog_->bytemap()[c];
+ while (c < 256-1 && prog_->bytemap()[c+1] == b) // skip to the last byte of this run of equal byte classes
+ c++;
+ input[b] = c; // a representative byte for byte class b
+ }
+ input[prog_->bytemap_range()] = kByteEndText;
+
+ // Scratch space for the output.
+ std::vector<int> output(nnext);
+
+ // Flood to expand every state.
+ bool oom = false;
+ while (!q.empty()) {
+ State* s = q.front();
+ q.pop_front();
+ for (int c : input) {
+ State* ns = RunStateOnByteUnlocked(s, c);
+ if (ns == NULL) { // could not compute the next state
+ oom = true;
+ break;
+ }
+ if (ns == DeadState) {
+ output[ByteMap(c)] = -1; // -1 stands for a transition to the dead state
+ continue;
+ }
+ if (m.find(ns) == m.end()) {
+ m.emplace(ns, static_cast<int>(m.size()));
+ q.push_back(ns);
+ }
+ output[ByteMap(c)] = m[ns];
+ }
+ if (cb)
+ cb(oom ? NULL : output.data(), // NULL is passed instead of the table when expansion failed
+ s == FullMatchState || s->IsMatch());
+ if (oom)
+ break;
+ }
+
+ return static_cast<int>(m.size());
+}
+
+// Builds out every state of the DFA for kind, passing cb through to
+// DFA::BuildAllStates. Returns the number of states.
+int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
+  DFA* dfa = GetDFA(kind);
+  return dfa->BuildAllStates(cb);
+}
+
+// Computes min and max for matching string.
+// Won't return strings bigger than maxlen.
+// Returns false when no useful range can be reported: the DFA is not
+// usable or ran out of memory, every string matches, or there is no
+// maximum string.
+bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
+  if (!ok())
+    return false;
+
+  // NOTE: if future users of PossibleMatchRange want more precision when
+  // presented with infinitely repeated elements, consider making this a
+  // parameter to PossibleMatchRange.
+  // It is a constant, so it is declared const rather than as mutable
+  // function-local static state.
+  static const int kMaxEltRepetitions = 0;
+
+  // Keep track of the number of times we've visited states previously. We only
+  // revisit a given state if it's part of a repeated group, so if the value
+  // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
+  // |*max| to |PrefixSuccessor(*max)|.
+  //
+  // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
+  // tradition, implicitly insert a '0' value at first use. We take advantage
+  // of that property below.
+  absl::flat_hash_map<State*, int> previously_visited_states;
+
+  // Pick out start state for anchored search at beginning of text.
+  RWLocker l(&cache_mutex_);
+  SearchParams params(absl::string_view(), absl::string_view(), &l);
+  params.anchored = true;
+  if (!AnalyzeSearch(&params))
+    return false;
+  if (params.start == DeadState) {  // No matching strings
+    *min = "";
+    *max = "";
+    return true;
+  }
+  if (params.start == FullMatchState)  // Every string matches: no max
+    return false;
+
+  // The DFA is essentially a big graph rooted at params.start,
+  // and paths in the graph correspond to accepted strings.
+  // Each node in the graph has potentially 256+1 arrows
+  // coming out, one for each byte plus the magic end of
+  // text character kByteEndText.
+
+  // To find the smallest possible prefix of an accepted
+  // string, we just walk the graph preferring to follow
+  // arrows with the lowest bytes possible. To find the
+  // largest possible prefix, we follow the largest bytes
+  // possible.
+
+  // The test for whether there is an arrow from s on byte j is
+  //    ns = RunStateOnByte(s, j);
+  //    if (ns == NULL)
+  //      return false;
+  //    if (ns == FullMatchState ||
+  //        (ns > SpecialStateMax && ns->ninst_ > 0))
+  // The RunStateOnByte call asks the DFA to build out the graph.
+  // It returns NULL only if the DFA has run out of memory,
+  // in which case we can't be sure of anything.
+  // The second check sees whether there was graph built
+  // and whether it is interesting graph. Nodes might have
+  // ns->ninst_ == 0 if they exist only to represent the fact
+  // that a match was found on the previous byte.
+
+  // Build minimum prefix.
+  State* s = params.start;
+  min->clear();
+  // RunStateOnByte (unlike RunStateOnByteUnlocked) is called with
+  // mutex_ already held, so take it once for both walks.
+  absl::MutexLock lock(&mutex_);
+  for (int i = 0; i < maxlen; i++) {
+    if (previously_visited_states[s] > kMaxEltRepetitions)
+      break;
+    previously_visited_states[s]++;
+
+    // Stop if min is a match.
+    State* ns = RunStateOnByte(s, kByteEndText);
+    if (ns == NULL)  // DFA out of memory
+      return false;
+    if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
+      break;
+
+    // Try to extend the string with low bytes.
+    bool extended = false;
+    for (int j = 0; j < 256; j++) {
+      ns = RunStateOnByte(s, j);
+      if (ns == NULL)  // DFA out of memory
+        return false;
+      if (ns == FullMatchState ||
+          (ns > SpecialStateMax && ns->ninst_ > 0)) {
+        extended = true;
+        min->append(1, static_cast<char>(j));
+        s = ns;
+        break;
+      }
+    }
+    if (!extended)
+      break;
+  }
+
+  // Build maximum prefix.
+  previously_visited_states.clear();
+  s = params.start;
+  max->clear();
+  for (int i = 0; i < maxlen; i++) {
+    if (previously_visited_states[s] > kMaxEltRepetitions)
+      break;
+    previously_visited_states[s]++;
+
+    // Try to extend the string with high bytes.
+    bool extended = false;
+    for (int j = 255; j >= 0; j--) {
+      State* ns = RunStateOnByte(s, j);
+      if (ns == NULL)  // DFA out of memory
+        return false;
+      if (ns == FullMatchState ||
+          (ns > SpecialStateMax && ns->ninst_ > 0)) {
+        extended = true;
+        max->append(1, static_cast<char>(j));
+        s = ns;
+        break;
+      }
+    }
+    if (!extended) {
+      // Done, no need for PrefixSuccessor.
+      return true;
+    }
+  }
+
+  // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
+  PrefixSuccessor(max);
+
+  // If there are no bytes left, we have no way to say "there is no maximum
+  // string". We could make the interface more complicated and be able to
+  // return "there is no maximum but here is a minimum", but that seems like
+  // overkill -- the most common no-max case is all possible strings, so not
+  // telling the caller that the empty string is the minimum match isn't a
+  // great loss.
+  if (max->empty())
+    return false;
+
+  return true;
+}
+
+// PossibleMatchRange for a Prog: forwards to the longest-match DFA.
+bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
+  // Have to use dfa_longest_ to get all strings for full matches.
+  // For example, (a|aa) never matches aa in first-match mode.
+  DFA* dfa = GetDFA(kLongestMatch);
+  return dfa->PossibleMatchRange(min, max, maxlen);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/filtered_re2.cc b/third_party/re2/src/re2/filtered_re2.cc
new file mode 100644
index 000000000..49cf68601
--- /dev/null
+++ b/third_party/re2/src/re2/filtered_re2.cc
@@ -0,0 +1,134 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/filtered_re2.h"
+
+#include <stddef.h>
+#include <string>
+#include <utility>
+
+#include "util/logging.h"
+#include "re2/prefilter.h"
+#include "re2/prefilter_tree.h"
+
+namespace re2 {
+
+// Creates an empty, not-yet-compiled FilteredRE2 that owns a
+// default-constructed PrefilterTree.
+FilteredRE2::FilteredRE2()
+    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
+
+// Creates an empty, not-yet-compiled FilteredRE2 whose PrefilterTree is
+// constructed with min_atom_len.
+FilteredRE2::FilteredRE2(int min_atom_len)
+    : compiled_(false), prefilter_tree_(new PrefilterTree(min_atom_len)) {}
+
+// re2_vec_ holds owning raw pointers, so every RE2 is deleted here.
+FilteredRE2::~FilteredRE2() {
+  for (RE2* owned : re2_vec_)
+    delete owned;
+}
+
+// Move constructor: takes over the regexps, the compiled flag and the
+// prefilter tree of `other`.
+FilteredRE2::FilteredRE2(FilteredRE2&& other)
+    : re2_vec_(std::move(other.re2_vec_)),
+      compiled_(other.compiled_),
+      prefilter_tree_(std::move(other.prefilter_tree_)) {
+  // Put `other` back into an empty, uncompiled state with a fresh
+  // default-constructed PrefilterTree.
+  other.re2_vec_.clear();
+  other.re2_vec_.shrink_to_fit();
+  other.compiled_ = false;
+  other.prefilter_tree_.reset(new PrefilterTree());
+}
+
+// Move assignment. Destroys the current contents of *this and then
+// move-constructs *this in place from `other`; `other` ends up in the
+// state the move constructor leaves its argument in. Returns *this.
+FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
+  // Self-move must be a no-op. Without this guard the destructor call
+  // below would delete every RE2 in re2_vec_ and end the lifetime of
+  // *this, and the placement new would then move-construct from that
+  // already-destroyed object.
+  if (this == &other)
+    return *this;
+  this->~FilteredRE2();
+  (void) new (this) FilteredRE2(std::move(other));
+  return *this;
+}
+
+RE2::ErrorCode FilteredRE2::Add(absl::string_view pattern,
+ const RE2::Options& options, int* id) {
+ RE2* re = new RE2(pattern, options);
+ RE2::ErrorCode code = re->error_code();
+
+ if (!re->ok()) {
+ if (options.log_errors()) {
+ LOG(ERROR) << "Couldn't compile regular expression, skipping: "
+ << pattern << " due to error " << re->error();
+ }
+ delete re;
+ } else {
+ *id = static_cast<int>(re2_vec_.size());
+ re2_vec_.push_back(re);
+ }
+
+ return code;
+}
+
+// Builds the prefilter tree from every regexp added so far and writes
+// the resulting atoms into *atoms. Logs an error and does nothing if
+// Compile was already called or if no regexp has been added.
+void FilteredRE2::Compile(std::vector<std::string>* atoms) {
+  if (compiled_) {
+    LOG(ERROR) << "Compile called already.";
+    return;
+  }
+  if (re2_vec_.empty()) {
+    LOG(ERROR) << "Compile called before Add.";
+    return;
+  }
+
+  // One prefilter per regexp, added in re2_vec_ order.
+  for (RE2* re : re2_vec_)
+    prefilter_tree_->Add(Prefilter::FromRE2(re));
+
+  atoms->clear();
+  prefilter_tree_->Compile(atoms);
+  compiled_ = true;
+}
+
+// Tries every regexp in re2_vec_ order, without any prefiltering, and
+// returns the index of the first one that partially matches `text`,
+// or -1 if none does.
+int FilteredRE2::SlowFirstMatch(absl::string_view text) const {
+  const size_t count = re2_vec_.size();
+  size_t idx = 0;
+  while (idx < count) {
+    if (RE2::PartialMatch(text, *re2_vec_[idx]))
+      return static_cast<int>(idx);
+    ++idx;
+  }
+  return -1;
+}
+
+// Asks the prefilter tree which regexps pass given the matched `atoms`,
+// then returns the id of the first of those that partially matches
+// `text`, or -1 if none does. Compile must have been called first;
+// otherwise this logs at DFATAL and returns -1.
+int FilteredRE2::FirstMatch(absl::string_view text,
+                            const std::vector<int>& atoms) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "FirstMatch called before Compile.";
+    return -1;
+  }
+
+  std::vector<int> candidates;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
+  for (int regexp_id : candidates) {
+    if (RE2::PartialMatch(text, *re2_vec_[regexp_id]))
+      return regexp_id;
+  }
+  return -1;
+}
+
+// Clears *matching_regexps, then appends the id of every regexp that
+// passes the prefilter for `atoms` and also partially matches `text`.
+// Returns true if at least one id was appended.
+bool FilteredRE2::AllMatches(absl::string_view text,
+                             const std::vector<int>& atoms,
+                             std::vector<int>* matching_regexps) const {
+  matching_regexps->clear();
+
+  std::vector<int> candidates;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
+  for (int regexp_id : candidates) {
+    if (RE2::PartialMatch(text, *re2_vec_[regexp_id]))
+      matching_regexps->push_back(regexp_id);
+  }
+  return !matching_regexps->empty();
+}
+
+// Forwards to PrefilterTree::RegexpsGivenStrings: *potential_regexps
+// receives the regexps that pass the prefilter for `atoms`. No regexp
+// is actually run against any text here.
+void FilteredRE2::AllPotentials(const std::vector<int>& atoms,
+                                std::vector<int>* potential_regexps) const {
+  prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
+}
+
+// Thin wrapper around PrefilterTree::RegexpsGivenStrings with the same
+// arguments.
+void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
+                                      std::vector<int>* passed_regexps) {
+  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
+}
+
+// Thin wrapper around PrefilterTree::PrintPrefilter for `regexpid`.
+void FilteredRE2::PrintPrefilter(int regexpid) {
+  prefilter_tree_->PrintPrefilter(regexpid);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/filtered_re2.h b/third_party/re2/src/re2/filtered_re2.h
new file mode 100644
index 000000000..a9abd6919
--- /dev/null
+++ b/third_party/re2/src/re2/filtered_re2.h
@@ -0,0 +1,115 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_FILTERED_RE2_H_
+#define RE2_FILTERED_RE2_H_
+
+// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
+// It provides a prefilter mechanism that helps in cutting down the
+// number of regexps that need to be actually searched.
+//
+// By design, it does not include a string matching engine. This is to
+// allow the user of the class to use their favorite string matching
+// engine. The overall flow is: Add all the regexps using Add, then
+// Compile the FilteredRE2. Compile returns strings that need to be
+// matched. Note that the returned strings are lowercased and distinct.
+// For applying regexps to a search text, the caller does the string
+// matching using the returned strings. When doing the string match,
+// note that the caller has to do that in a case-insensitive way or
+// on a lowercased version of the search text. Then call FirstMatch
+// or AllMatches with a vector of indices of strings that were found
+// in the text to get the actual regexp matches.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+class PrefilterTree;
+
+class FilteredRE2 {
+ public:
+ FilteredRE2();
+ explicit FilteredRE2(int min_atom_len);
+ ~FilteredRE2();
+
+ // Not copyable.
+ FilteredRE2(const FilteredRE2&) = delete;
+ FilteredRE2& operator=(const FilteredRE2&) = delete;
+ // Movable.
+ FilteredRE2(FilteredRE2&& other);
+ FilteredRE2& operator=(FilteredRE2&& other);
+
+ // Uses RE2 constructor to create a RE2 object (re). Returns
+ // re->error_code(). If error_code is other than NoError, then re is
+ // deleted and not added to re2_vec_, and *id is left unmodified.
+ RE2::ErrorCode Add(absl::string_view pattern,
+ const RE2::Options& options,
+ int* id);
+
+ // Prepares the regexps added by Add for filtering. Returns a set
+ // of strings that the caller should check for in candidate texts.
+ // The returned strings are lowercased and distinct. When doing
+ // string matching, it should be performed in a case-insensitive
+ // way or the search text should be lowercased first. Call after
+ // all Add calls are done.
+ void Compile(std::vector<std::string>* strings_to_match);
+
+ // Returns the index of the first matching regexp.
+ // Returns -1 on no match. Can be called prior to Compile.
+ // Does not do any filtering: simply tries to Match the
+ // regexps in a loop.
+ int SlowFirstMatch(absl::string_view text) const;
+
+ // Returns the index of the first matching regexp.
+ // Returns -1 on no match. Compile has to be called before
+ // calling this.
+ int FirstMatch(absl::string_view text,
+ const std::vector<int>& atoms) const;
+
+ // Stores the indices of all matching regexps in matching_regexps, after
+ // first clearing it. Returns true if at least one regexp matched.
+ bool AllMatches(absl::string_view text,
+ const std::vector<int>& atoms,
+ std::vector<int>* matching_regexps) const;
+
+ // Returns the indices of all potentially matching regexps after first
+ // clearing potential_regexps.
+ // A regexp is potentially matching if it passes the filter.
+ // If a regexp passes the filter it may still not match.
+ // A regexp that does not pass the filter is guaranteed to not match.
+ void AllPotentials(const std::vector<int>& atoms,
+ std::vector<int>* potential_regexps) const;
+
+ // The number of regexps added.
+ int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
+
+ // Get the individual RE2 objects. regexpid is not range-checked.
+ const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
+
+ private:
+ // Print the prefilter for regexpid (forwards to the PrefilterTree).
+ void PrintPrefilter(int regexpid);
+
+ // Useful for testing and debugging.
+ void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
+ std::vector<int>* passed_regexps);
+
+ // All the regexps in the FilteredRE2. Owned: deleted in the destructor.
+ std::vector<RE2*> re2_vec_;
+
+ // Has the FilteredRE2 been compiled using Compile()?
+ bool compiled_;
+
+ // An AND-OR tree of string atoms used for filtering regexps.
+ std::unique_ptr<PrefilterTree> prefilter_tree_;
+};
+
+} // namespace re2
+
+#endif // RE2_FILTERED_RE2_H_
diff --git a/third_party/re2/src/re2/fuzzing/re2_fuzzer.cc b/third_party/re2/src/re2/fuzzing/re2_fuzzer.cc
new file mode 100644
index 000000000..9a7af08a7
--- /dev/null
+++ b/third_party/re2/src/re2/fuzzing/re2_fuzzer.cc
@@ -0,0 +1,282 @@
+// Copyright 2016 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <fuzzer/FuzzedDataProvider.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/set.h"
+#include "re2/walker-inl.h"
+
+// NOT static, NOT signed.
+uint8_t dummy = 0;
+
+// Walks kRegexpConcat and kRegexpAlternate subexpressions
+// to determine their maximum length.
+//
+// A kRegexpConcat or kRegexpAlternate node yields the larger of its own
+// child count and the largest value reported by its children; every
+// other kind of node yields -1.
+class SubexpressionWalker : public re2::Regexp::Walker<int> {
+ public:
+  SubexpressionWalker() = default;
+  ~SubexpressionWalker() override = default;
+
+  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
+                int* child_args, int nchild_args) override {
+    const bool is_sequence = re->op() == re2::kRegexpConcat ||
+                             re->op() == re2::kRegexpAlternate;
+    if (!is_sequence)
+      return -1;
+
+    // Start from this node's own width, then fold in the children.
+    int widest = nchild_args;
+    for (int* child = child_args; child != child_args + nchild_args; ++child)
+      widest = std::max(widest, *child);
+    return widest;
+  }
+
+  // Should never be called: we use Walk(), not WalkExponential().
+  int ShortVisit(re2::Regexp* re, int parent_arg) override {
+    return parent_arg;
+  }
+
+ private:
+  SubexpressionWalker(const SubexpressionWalker&) = delete;
+  SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
+};
+
+// Walks substrings (i.e. kRegexpLiteralString subexpressions)
+// to determine their maximum length... in runes, but avoiding
+// overheads due to UTF-8 encoding is worthwhile when fuzzing.
+//
+// A kRegexpLiteralString node yields its rune count; concatenation,
+// alternation, repetition and capture nodes yield the largest value
+// reported by their children (or -1); every other node yields -1.
+class SubstringWalker : public re2::Regexp::Walker<int> {
+ public:
+  SubstringWalker() = default;
+  ~SubstringWalker() override = default;
+
+  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
+                int* child_args, int nchild_args) override {
+    switch (re->op()) {
+      case re2::kRegexpLiteralString:
+        return re->nrunes();
+
+      // These operators pass their children's maximum upwards.
+      case re2::kRegexpConcat:
+      case re2::kRegexpAlternate:
+      case re2::kRegexpStar:
+      case re2::kRegexpPlus:
+      case re2::kRegexpQuest:
+      case re2::kRegexpRepeat:
+      case re2::kRegexpCapture:
+        break;
+
+      default:
+        return -1;
+    }
+
+    int longest = -1;
+    for (int* child = child_args; child != child_args + nchild_args; ++child)
+      longest = std::max(longest, *child);
+    return longest;
+  }
+
+  // Should never be called: we use Walk(), not WalkExponential().
+  int ShortVisit(re2::Regexp* re, int parent_arg) override {
+    return parent_arg;
+  }
+
+ private:
+  SubstringWalker(const SubstringWalker&) = delete;
+  SubstringWalker& operator=(const SubstringWalker&) = delete;
+};
+
+void TestOneInput(absl::string_view pattern, const RE2::Options& options,
+ RE2::Anchor anchor, absl::string_view text) { // Drives RE2, RE2::Set and FilteredRE2 with one pattern/text pair.
+ // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
+ // Otherwise, we will waste time on inputs that have long runs of various
+ // character classes. The fuzzer has shown itself to be easily capable of
+ // generating such patterns that fall within the other limits, but result
+ // in timeouts nonetheless. The marginal cost is high - even more so when
+ // counted repetition is involved - whereas the marginal benefit is zero.
+ // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
+ // three-element character classes when case-insensitive and using UTF-8.
+ // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
+ int char_class = 0;
+ int backslash_p = 0; // very expensive, so handle specially
+ for (size_t i = 0; i < pattern.size(); i++) {
+ if (pattern[i] == '.' ||
+ pattern[i] == 'k' || pattern[i] == 'K' ||
+ pattern[i] == 's' || pattern[i] == 'S')
+ char_class++;
+ if (pattern[i] != '\\')
+ continue;
+ i++; // step onto the character that follows the backslash
+ if (i >= pattern.size())
+ break; // trailing backslash: nothing follows it
+ if (pattern[i] == 'p' || pattern[i] == 'P' ||
+ pattern[i] == 'd' || pattern[i] == 'D' ||
+ pattern[i] == 's' || pattern[i] == 'S' ||
+ pattern[i] == 'w' || pattern[i] == 'W')
+ char_class++;
+ if (pattern[i] == 'p' || pattern[i] == 'P')
+ backslash_p++;
+ }
+ if (char_class > 9) // more than nine (potential) character classes: skip
+ return;
+ if (backslash_p > 1) // more than one \p or \P escape: skip
+ return;
+
+ // Iterate just once when fuzzing. Otherwise, we easily get bogged down
+ // and coverage is unlikely to improve despite significant expense.
+ RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
+ // The default is 1000. Even 100 turned out to be too generous
+ // for fuzzing, empirically speaking, so let's try 10 instead.
+ re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
+
+ RE2 re(pattern, options);
+ if (!re.ok()) // pattern failed to compile: nothing to exercise
+ return;
+
+ // Don't waste time fuzzing programs with large subexpressions.
+ // They can cause bug reports due to fuzzer timeouts. And they
+ // aren't interesting for fuzzing purposes.
+ if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
+ return;
+
+ // Don't waste time fuzzing programs with large substrings.
+ // They can cause bug reports due to fuzzer timeouts when they
+ // are repetitions (e.g. hundreds of NUL bytes) and matching is
+ // unanchored. And they aren't interesting for fuzzing purposes.
+ if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
+ return;
+
+ // Don't waste time fuzzing high-size programs.
+ // They can cause bug reports due to fuzzer timeouts.
+ int size = re.ProgramSize();
+ if (size > 9999)
+ return;
+ int rsize = re.ReverseProgramSize();
+ if (rsize > 9999)
+ return;
+
+ // Don't waste time fuzzing high-fanout programs.
+ // They can cause bug reports due to fuzzer timeouts.
+ std::vector<int> histogram; // required by the API; its contents are not read here
+ int fanout = re.ProgramFanout(&histogram);
+ if (fanout > 9)
+ return;
+ int rfanout = re.ReverseProgramFanout(&histogram);
+ if (rfanout > 9)
+ return;
+
+ if (re.NumberOfCapturingGroups() == 0) {
+ // Avoid early return due to too many arguments.
+ absl::string_view sp = text;
+ RE2::FullMatch(sp, re);
+ RE2::PartialMatch(sp, re);
+ RE2::Consume(&sp, re);
+ sp = text; // Reset.
+ RE2::FindAndConsume(&sp, re);
+ } else {
+ // Okay, we have at least one capturing group...
+ // Try conversion for variously typed arguments.
+ absl::string_view sp = text;
+ short s;
+ RE2::FullMatch(sp, re, &s);
+ long l;
+ RE2::PartialMatch(sp, re, &l);
+ float f;
+ RE2::Consume(&sp, re, &f);
+ sp = text; // Reset.
+ double d;
+ RE2::FindAndConsume(&sp, re, &d);
+ }
+
+ std::string s = std::string(text);
+ RE2::Replace(&s, re, ""); // replacement results are discarded; only the calls matter
+ s = std::string(text); // Reset.
+ RE2::GlobalReplace(&s, re, "");
+
+ std::string min, max;
+ re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
+
+ // Exercise some other API functionality.
+ dummy += re.NamedCapturingGroups().size(); // results go into the global `dummy`; presumably so the calls are not optimized away
+ dummy += re.CapturingGroupNames().size();
+ dummy += RE2::QuoteMeta(pattern).size();
+ dummy += re.Regexp()->ToString().size();
+
+ RE2::Set set(options, anchor);
+ int index = set.Add(pattern, /*error=*/NULL); // -1 on error
+ if (index != -1 && set.Compile()) {
+ std::vector<int> matches;
+ set.Match(text, &matches);
+ }
+
+ re2::FilteredRE2 filter;
+ index = -1; // not clobbered on error
+ filter.Add(pattern, options, &index);
+ if (index != -1) {
+ std::vector<std::string> atoms;
+ filter.Compile(&atoms);
+ // Pretend that all atoms match, which
+ // triggers the AND-OR tree maximally.
+ std::vector<int> matched_atoms;
+ matched_atoms.reserve(atoms.size());
+ for (size_t i = 0; i < atoms.size(); ++i)
+ matched_atoms.push_back(static_cast<int>(i));
+ std::vector<int> matches;
+ filter.AllMatches(text, matched_atoms, &matches);
+ }
+}
+
+// Entry point for libFuzzer.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // An input larger than 4 KiB probably isn't interesting. (This limit
+ // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
+ if (size == 0 || size > 4096) // empty and oversized inputs are ignored
+ return 0;
+
+ FuzzedDataProvider fdp(data, size);
+
+ // The convention here is that fdp.ConsumeBool() returning false sets
+ // the default value whereas returning true sets the alternate value:
+ // most options default to false and so can be set directly; encoding
+ // defaults to UTF-8; case_sensitive defaults to true. We do NOT want
+ // to log errors. max_mem is 64 MiB because we can afford to use more
+ // RAM in exchange for (hopefully) faster fuzzing.
+ RE2::Options options;
+ options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
+ : RE2::Options::EncodingUTF8);
+ options.set_posix_syntax(fdp.ConsumeBool());
+ options.set_longest_match(fdp.ConsumeBool());
+ options.set_log_errors(false);
+ options.set_max_mem(64 << 20); // 64 MiB
+ options.set_literal(fdp.ConsumeBool());
+ options.set_never_nl(fdp.ConsumeBool());
+ options.set_dot_nl(fdp.ConsumeBool());
+ options.set_never_capture(fdp.ConsumeBool());
+ options.set_case_sensitive(!fdp.ConsumeBool()); // negated so that a false draw keeps the default (true)
+ options.set_perl_classes(fdp.ConsumeBool());
+ options.set_word_boundary(fdp.ConsumeBool());
+ options.set_one_line(fdp.ConsumeBool());
+
+ // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
+ // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
+ RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
+ RE2::UNANCHORED,
+ RE2::ANCHOR_START,
+ RE2::ANCHOR_BOTH,
+ });
+
+ std::string pattern = fdp.ConsumeRandomLengthString(999); // pattern is consumed before text
+ std::string text = fdp.ConsumeRandomLengthString(999);
+
+ TestOneInput(pattern, options, anchor, text);
+ return 0; // always 0, whatever TestOneInput did
+}
diff --git a/third_party/re2/src/re2/make_perl_groups.pl b/third_party/re2/src/re2/make_perl_groups.pl
new file mode 100755
index 000000000..ed0d509dc
--- /dev/null
+++ b/third_party/re2/src/re2/make_perl_groups.pl
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Generate table entries giving character ranges
+# for POSIX/Perl character classes. Rather than
+# figure out what the definition is, it is easier to ask
+# Perl about each letter from 0-128 and write down
+# its answer.
+
+@posixclasses = (
+ "[:alnum:]",
+ "[:alpha:]",
+ "[:ascii:]",
+ "[:blank:]",
+ "[:cntrl:]",
+ "[:digit:]",
+ "[:graph:]",
+ "[:lower:]",
+ "[:print:]",
+ "[:punct:]",
+ "[:space:]",
+ "[:upper:]",
+ "[:word:]",
+ "[:xdigit:]",
+);
+
+@perlclasses = (
+ "\\d",
+ "\\s",
+ "\\w",
+);
+
+%overrides = (
+ # Prior to Perl 5.18, \s did not match vertical tab.
+ # RE2 preserves that original behaviour.
+ "\\s:11" => 0,
+);
+
+sub ComputeClass($) { # Returns a list of [start, end] code ranges matched by the class $cname.
+ my ($cname) = @_;
+ my @ranges;
+ my $regexp = qr/[$cname]/;
+ my $start = -1; # start of the range currently open, or -1 if none
+ for (my $i=0; $i<=129; $i++) {
+ if ($i == 129) { $i = 256; } # final sentinel pass: fails the <= 128 test below and ends the loop
+ if ($i <= 128 && ($overrides{"$cname:$i"} // chr($i) =~ $regexp)) { # a defined override wins over Perl's own answer
+ if ($start < 0) {
+ $start = $i;
+ }
+ } else {
+ if ($start >= 0) {
+ push @ranges, [$start, $i-1]; # close the open range at the previous code
+ }
+ $start = -1;
+ }
+ }
+ return @ranges;
+}
+
+# Prints the C++ URange16 table named code$cnum for the class $cname,
+# one { lo, hi } row per entry of @ranges. Returns two UGroup initializer
+# strings that both refer to that table: one for the class itself
+# (sign +1) and one for its negation (sign -1).
+sub PrintClass($$@) {
+  my ($cnum, $cname, @ranges) = @_;
+  print "static const URange16 code${cnum}[] = { /* $cname */\n";
+  for (my $i=0; $i<@ranges; $i++) {
+    my @a = @{$ranges[$i]};
+    printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1];
+  }
+  print "};\n";
+  my $n = @ranges;
+  # Double the backslashes so the name survives inside a C++ string literal.
+  my $escname = $cname;
+  $escname =~ s/\\/\\\\/g;
+  # Keep $negname lexical so that it does not leak into package scope.
+  my $negname = $escname;
+  if ($negname =~ /:/) {
+    # POSIX-style name: insert ^ after the first colon.
+    $negname =~ s/:/:^/;
+  } else {
+    # Perl-style escape: the negated class is the upper-cased letter.
+    $negname =~ y/a-z/A-Z/;
+  }
+  return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }";
+}
+
+my $cnum = 0;
+
+sub PrintClasses($@) { # Prints the range tables for @classes, then the ${pname}_groups array and its count.
+ my ($pname, @classes) = @_;
+ my @entries;
+ foreach my $cname (@classes) {
+ my @ranges = ComputeClass($cname);
+ push @entries, PrintClass(++$cnum, $cname, @ranges); # file-level $cnum keeps codeN names distinct across calls
+ }
+ print "const UGroup ${pname}_groups[] = {\n";
+ foreach my $e (@entries) {
+ print "\t$e,\n";
+ }
+ print "};\n";
+ my $count = @entries; # two entries per class: the class and its negation
+ print "const int num_${pname}_groups = $count;\n";
+}
+
+print <<EOF;
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+EOF
+
+PrintClasses("perl", @perlclasses);
+PrintClasses("posix", @posixclasses);
+
+print <<EOF;
+
+} // namespace re2
+EOF
diff --git a/third_party/re2/src/re2/make_unicode_casefold.py b/third_party/re2/src/re2/make_unicode_casefold.py
new file mode 100755
index 000000000..803adbd42
--- /dev/null
+++ b/third_party/re2/src/re2/make_unicode_casefold.py
@@ -0,0 +1,151 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# See unicode_casefold.h for description of case folding tables.
+
+"""Generate C++ table for Unicode case folding."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unicode
+
+_header = """
+// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
+// make_unicode_casefold.py >unicode_casefold.cc
+
+#include "re2/unicode_casefold.h"
+
+namespace re2 {
+
+"""
+
+_trailer = """
+
+} // namespace re2
+
+"""
+
+def _Delta(a, b):
+ """Compute the delta for b - a. Even/odd and odd/even
+ are handled specially, as described in unicode_casefold.h."""
+ if a+1 == b: # b is the code point right after a
+ if a%2 == 0:
+ return 'EvenOdd'
+ else:
+ return 'OddEven'
+ if a == b+1: # b is the code point right before a
+ if a%2 == 0:
+ return 'OddEven'
+ else:
+ return 'EvenOdd'
+ return b - a # any other distance is returned as a plain integer
+
+def _AddDelta(a, delta):
+ """Return a + delta, handling EvenOdd and OddEven specially."""
+ if type(delta) == int:
+ return a+delta
+ if delta == 'EvenOdd': # even code points step up by one, odd ones step down
+ if a%2 == 0:
+ return a+1
+ else:
+ return a-1
+ if delta == 'OddEven': # odd code points step up by one, even ones step down
+ if a%2 == 1:
+ return a+1
+ else:
+ return a-1
+ print("Bad Delta:", delta, file=sys.stderr) # reached for any other delta value
+ raise unicode.Error("Bad Delta")
+
+def _MakeRanges(pairs):
+ """Turn a list like [(65,97), (66, 98), ..., (90,122)]
+ into [(65, 90, +32)]."""
+ ranges = [] # each entry is a mutable [lo, hi, delta] list
+ last = -100 # previous a seen; -100 so the first pair never looks adjacent
+
+ def evenodd(last, a, b, r): # extend r to a if a directly follows last and obeys r's delta
+ if a != last+1 or b != _AddDelta(a, r[2]):
+ return False
+ r[1] = a
+ return True
+
+ def evenoddpair(last, a, b, r): # extend r to a if a is two past last; marks r's delta with a 'Skip' suffix
+ if a != last+2:
+ return False
+ delta = r[2]
+ d = delta
+ if type(delta) is not str: # integer deltas never get the 'Skip' form
+ return False
+ if delta.endswith('Skip'):
+ d = delta[:-4] # strip 'Skip' to recover the base delta
+ else:
+ delta = d + 'Skip'
+ if b != _AddDelta(a, d):
+ return False
+ r[1] = a
+ r[2] = delta
+ return True
+
+ for a, b in pairs:
+ if ranges and evenodd(last, a, b, ranges[-1]):
+ pass
+ elif ranges and evenoddpair(last, a, b, ranges[-1]):
+ pass
+ else:
+ ranges.append([a, a, _Delta(a, b)]) # start a new single-element range
+ last = a
+ return ranges
+
+# The maximum size of a case-folding group.
+# Case folding is implemented in parse.cc by a recursive process
+# with a recursion depth equal to the size of the largest
+# case-folding group, so it is important that this bound be small.
+# The current tables have no group bigger than 4.
+# If there are ever groups bigger than 10 or so, it will be
+# time to rework the code in parse.cc.
+MaxCasefoldGroup = 4
+
+def main(): # Prints the casefold and tolower tables as C++ source on stdout.
+ lowergroups, casegroups = unicode.CaseGroups()
+ foldpairs = []
+ seen = {}
+ for c in casegroups:
+ if len(c) > MaxCasefoldGroup:
+ raise unicode.Error("casefold group too long: %s" % (c,))
+ for i in range(len(c)): # c[i-1] is c[-1] when i == 0, so the pairs wrap around the group
+ if c[i-1] in seen:
+ raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
+ seen[c[i-1]] = True
+ foldpairs.append([c[i-1], c[i]])
+
+ lowerpairs = []
+ for lower, group in lowergroups.items():
+ for g in group:
+ if g != lower: # the lower-case form itself needs no mapping
+ lowerpairs.append([g, lower])
+
+ def printpairs(name, foldpairs): # sorts foldpairs in place, then prints the unicode_<name> table and its size
+ foldpairs.sort()
+ foldranges = _MakeRanges(foldpairs)
+ print("// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)))
+ print("const CaseFold unicode_%s[] = {" % (name,))
+ for lo, hi, delta in foldranges:
+ print("\t{ %d, %d, %s }," % (lo, hi, delta))
+ print("};")
+ print("const int num_unicode_%s = %d;" % (name, len(foldranges)))
+ print("")
+
+ print(_header)
+ printpairs("casefold", foldpairs)
+ printpairs("tolower", lowerpairs)
+ print(_trailer)
+
+if __name__ == '__main__':
+ main()
diff --git a/third_party/re2/src/re2/make_unicode_groups.py b/third_party/re2/src/re2/make_unicode_groups.py
new file mode 100755
index 000000000..cbe822ad9
--- /dev/null
+++ b/third_party/re2/src/re2/make_unicode_groups.py
@@ -0,0 +1,117 @@
+#!/usr/bin/python3
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Generate C++ tables for Unicode Script and Category groups."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unicode
+
+_header = """
+// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
+// make_unicode_groups.py >unicode_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+"""
+
+_trailer = """
+
+} // namespace re2
+
+"""
+
+n16 = 0
+n32 = 0
+
+def MakeRanges(codes):
+ """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
+ ranges = []
+ last = -100 # previous code seen; -100 so the first code always opens a range
+ for c in codes:
+ if c == last+1:
+ ranges[-1][1] = c # consecutive code: extend the current range
+ else:
+ ranges.append([c, c])
+ last = c
+ return ranges
+
+def PrintRanges(type, name, ranges):
+ """Print the ranges as an array of type named name."""
+ print("static const %s %s[] = {" % (type, name))
+ for lo, hi in ranges:
+ print("\t{ %d, %d }," % (lo, hi)) # one { lo, hi } row per range
+ print("};")
+
+# def PrintCodes(type, name, codes):
+# """Print the codes as an array of type named name."""
+# print("static %s %s[] = {" % (type, name))
+# for c in codes:
+# print("\t%d," % (c,))
+# print("};")
+
+def PrintGroup(name, codes):
+ """Print the data structures for the group of codes.
+ Return a UGroup literal for the group."""
+
+ # See unicode_groups.h for a description of the data structure.
+
+ # Split codes into 16-bit ranges and 32-bit ranges.
+ range16 = MakeRanges([c for c in codes if c < 65536])
+ range32 = MakeRanges([c for c in codes if c >= 65536])
+
+ # Pull singleton ranges out of range16.
+ # code16 = [lo for lo, hi in range16 if lo == hi]
+ # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]
+
+ global n16
+ global n32
+ n16 += len(range16) # module-level totals, reported by main()
+ n32 += len(range32)
+
+ ugroup = "{ \"%s\", +1" % (name,)
+ # if len(code16) > 0:
+ # PrintCodes("uint16_t", name+"_code16", code16)
+ # ugroup += ", %s_code16, %d" % (name, len(code16))
+ # else:
+ # ugroup += ", 0, 0"
+ if len(range16) > 0:
+ PrintRanges("URange16", name+"_range16", range16)
+ ugroup += ", %s_range16, %d" % (name, len(range16))
+ else:
+ ugroup += ", 0, 0" # no table of this width: null pointer and zero count
+ if len(range32) > 0:
+ PrintRanges("URange32", name+"_range32", range32)
+ ugroup += ", %s_range32, %d" % (name, len(range32))
+ else:
+ ugroup += ", 0, 0"
+ ugroup += " }"
+ return ugroup
+
+def main(): # Prints the Unicode category and script group tables as C++ source on stdout.
+ categories = unicode.Categories()
+ scripts = unicode.Scripts()
+ print(_header)
+ ugroups = []
+ for name in sorted(categories):
+ ugroups.append(PrintGroup(name, categories[name])) # prints the range tables as a side effect
+ for name in sorted(scripts):
+ ugroups.append(PrintGroup(name, scripts[name]))
+ print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
+ print("const UGroup unicode_groups[] = {")
+ ugroups.sort() # categories and scripts are merged into a single sorted table
+ for ug in ugroups:
+ print("\t%s," % (ug,))
+ print("};")
+ print("const int num_unicode_groups = %d;" % (len(ugroups),))
+ print(_trailer)
+
+if __name__ == '__main__':
+ main()
diff --git a/third_party/re2/src/re2/mimics_pcre.cc b/third_party/re2/src/re2/mimics_pcre.cc
new file mode 100644
index 000000000..ac0c69d7e
--- /dev/null
+++ b/third_party/re2/src/re2/mimics_pcre.cc
@@ -0,0 +1,196 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Determine whether this library should match PCRE exactly
+// for a particular Regexp. (If so, the testing framework can
+// check that it does.)
+//
+// This library matches PCRE except in these cases:
+// * the regexp contains a repetition of an empty string,
+// like (a*)* or (a*)+. In this case, PCRE will treat
+// the repetition sequence as ending with an empty string,
+// while this library does not.
+// * Perl and PCRE differ on whether \v matches \n.
+// For historical reasons, this library implements the Perl behavior.
+// * Perl and PCRE allow $ in one-line mode to match either the very
+// end of the text or just before a \n at the end of the text.
+// This library requires it to match only the end of the text.
+// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
+// match the end of the text if the last character is a \n.
+// This library does allow it.
+//
+// Regexp::MimicsPCRE checks for any of these conditions.
+
+#include "util/logging.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Returns whether re might match an empty string.
+static bool CanBeEmptyString(Regexp *re);
+
+// Walker class to compute whether library handles a regexp
+// exactly as PCRE would. See comment at top for conditions.
+// The per-node decision is made in PostVisit.
+
+class PCREWalker : public Regexp::Walker<bool> {
+ public:
+  PCREWalker() = default;
+
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                 bool* child_args, int nchild_args) override;
+
+  // Should never be called: we use Walk(), not WalkExponential().
+  // Reports the unexpected call (except in fuzzing builds) and hands back
+  // the argument it was given.
+  bool ShortVisit(Regexp* re, bool a) override {
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    LOG(DFATAL) << "PCREWalker::ShortVisit called";
+#endif
+    return a;
+  }
+
+ private:
+  PCREWalker(const PCREWalker&) = delete;
+  PCREWalker& operator=(const PCREWalker&) = delete;
+};
+
+// Called once all of re's children have been visited; child_args holds
+// their results, i.e. whether this library mimics PCRE for each of those
+// subexpressions. Returns whether it also does so for re itself.
+bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                           bool* child_args, int nchild_args) {
+  // A single failing child is enough to fail the parent.
+  for (bool* child = child_args; child != child_args + nchild_args; ++child) {
+    if (!*child)
+      return false;
+  }
+
+  // The children are fine; check this node's own operator.
+  switch (re->op()) {
+    // Repetition of something that can match the empty string.
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      return !CanBeEmptyString(re->sub()[0]);
+
+    // Same, but only for unbounded counted repetition.
+    case kRegexpRepeat:
+      return !(re->max() == -1 && CanBeEmptyString(re->sub()[0]));
+
+    // \v
+    case kRegexpLiteral:
+      return re->rune() != '\v';
+
+    // $ in single-line mode.
+    case kRegexpEndText:
+    case kRegexpEmptyMatch:
+      return !(re->parse_flags() & Regexp::WasDollar);
+
+    // ^ in multi-line mode. There is no condition to test here:
+    // in single-line mode ^ becomes kRegexpBeginText.
+    case kRegexpBeginLine:
+      return false;
+
+    default:
+      break;
+  }
+
+  // Nothing objectionable found.
+  return true;
+}
+
+// Returns whether this regexp's behavior will mimic PCRE's exactly,
+// as decided by a PCREWalker walk that starts from `true`.
+bool Regexp::MimicsPCRE() {
+  PCREWalker walker;
+  const bool mimics = walker.Walk(this, true);
+  return mimics;
+}
+
+
+// Walker class to compute whether a Regexp can match an empty string.
+// Overestimating is acceptable. For example, \b\B cannot match an empty
+// string, because \b and \B are mutually exclusive, but this walker is
+// not that smart and will say it can. Spurious empty strings
+// will reduce the number of regexps we sanity check against PCRE,
+// but they won't break anything.
+
+class EmptyStringWalker : public Regexp::Walker<bool> {
+ public:
+  EmptyStringWalker() = default;
+
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                 bool* child_args, int nchild_args) override;
+
+  // Should never be called: we use Walk(), not WalkExponential().
+  // Reports the unexpected call (except in fuzzing builds) and hands back
+  // the argument it was given.
+  bool ShortVisit(Regexp* re, bool a) override {
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+#endif
+    return a;
+  }
+
+ private:
+  EmptyStringWalker(const EmptyStringWalker&) = delete;
+  EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
+};
+
+// Called once re's children have been visited. child_args holds the
+// value each child's PostVisit returned, i.e. whether that child can
+// match an empty string. Returns whether re itself can match an empty
+// string.
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                                  bool* child_args, int nchild_args) {
+  switch (re->op()) {
+    // Never empty.
+    case kRegexpNoMatch:
+    case kRegexpLiteral:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpCharClass:
+    case kRegexpLiteralString:
+      return false;
+
+    // Always empty (for the assertions: whenever they match at all),
+    // or able to match zero repetitions.
+    case kRegexpEmptyMatch:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpNoWordBoundary:
+    case kRegexpWordBoundary:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+    case kRegexpStar:
+    case kRegexpQuest:
+    case kRegexpHaveMatch:
+      return true;
+
+    // Empty only if every child can be empty.
+    case kRegexpConcat: {
+      bool all_empty = true;
+      for (int i = 0; all_empty && i < nchild_args; i++)
+        all_empty = child_args[i];
+      return all_empty;
+    }
+
+    // Empty if at least one child can be empty.
+    case kRegexpAlternate: {
+      bool any_empty = false;
+      for (int i = 0; !any_empty && i < nchild_args; i++)
+        any_empty = child_args[i];
+      return any_empty;
+    }
+
+    // Exactly as empty as the single child.
+    case kRegexpPlus:
+    case kRegexpCapture:
+      return child_args[0];
+
+    // Empty if the child can be, or if zero repetitions are allowed.
+    case kRegexpRepeat:
+      return child_args[0] || re->min() == 0;
+  }
+  return false;
+}
+
+// Returns whether re can match an empty string, as decided by an
+// EmptyStringWalker walk that starts from `true`.
+static bool CanBeEmptyString(Regexp* re) {
+  EmptyStringWalker walker;
+  const bool can_be_empty = walker.Walk(re, true);
+  return can_be_empty;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/nfa.cc b/third_party/re2/src/re2/nfa.cc
new file mode 100644
index 000000000..a655884d7
--- /dev/null
+++ b/third_party/re2/src/re2/nfa.cc
@@ -0,0 +1,710 @@
+// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc.
+//
+// Prog::SearchNFA, an NFA search.
+// This is an actual NFA like the theorists talk about,
+// not the pseudo-NFA found in backtracking regexp implementations.
+//
+// IMPLEMENTATION
+//
+// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
+// which is a variant of the one described in Thompson's 1968 CACM paper.
+// See http://swtch.com/~rsc/regexp/ for various history. The main feature
+// over the DFA implementation is that it tracks submatch boundaries.
+//
+// When the choice of submatch boundaries is ambiguous, this particular
+// implementation makes the same choices that traditional backtracking
+// implementations (in particular, Perl and PCRE) do.
+// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
+// time in the length of the input.
+//
+// Like Thompson's original machine and like the DFA implementation, this
+// implementation notices a match only once it is one byte past it.
+
+#include <stdio.h>
+#include <string.h>
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/sparse_array.h"
+#include "re2/sparse_set.h"
+
+namespace re2 {
+
+static const bool ExtraDebug = false;
+
+class NFA {
+ public:
+ NFA(Prog* prog);
+ ~NFA();
+
+ // Searches for a matching string.
+ // * If anchored is true, only considers matches starting at offset.
+ // Otherwise finds leftmost match at or after offset.
+ // * If longest is true, returns the longest match starting
+ // at the chosen start point. Otherwise returns the so-called
+ // left-biased match, the one traditional backtracking engines
+ // (like Perl and PCRE) find.
+ // Records submatch boundaries in submatch[1..nsubmatch-1].
+ // Submatch[0] is the entire match. When there is a choice in
+ // which text matches each subexpression, the submatch boundaries
+ // are chosen to match what a backtracking implementation would choose.
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool longest, absl::string_view* submatch, int nsubmatch);
+
+ private:
+ struct Thread {
+ union {
+ int ref;
+ Thread* next; // when on free list
+ };
+ const char** capture;
+ };
+
+ // State for explicit stack in AddToThreadq.
+ struct AddState {
+ int id; // Inst to process
+ Thread* t; // if not null, set t0 = t before processing id
+ };
+
+ // Threadq is a list of threads. The list is sorted by the order
+ // in which Perl would explore that particular state -- the earlier
+ // choices appear earlier in the list.
+ typedef SparseArray<Thread*> Threadq;
+
+ inline Thread* AllocThread();
+ inline Thread* Incref(Thread* t);
+ inline void Decref(Thread* t);
+
+ // Follows all empty arrows from id0 and enqueues all the states reached.
+ // Enqueues only the ByteRange instructions that match byte c.
+ // context is used (with p) for evaluating empty-width specials.
+ // p is the current input position, and t0 is the current thread.
+ void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
+ const char* p, Thread* t0);
+
+ // Run runq on byte c, appending new states to nextq.
+ // Updates matched_ and match_ as new, better matches are found.
+ // context is used (with p) for evaluating empty-width specials.
+ // p is the position of byte c in the input string for AddToThreadq;
+ // p-1 will be used when processing Match instructions.
+ // Frees all the threads on runq.
+ // If there is a shortcut to the end, returns that shortcut.
+ int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
+ const char* p);
+
+ // Returns text version of capture information, for debugging.
+ std::string FormatCapture(const char** capture);
+
+ void CopyCapture(const char** dst, const char** src) {
+ memmove(dst, src, ncapture_*sizeof src[0]);
+ }
+
+ Prog* prog_; // underlying program
+ int start_; // start instruction in program
+ int ncapture_; // number of submatches to track
+ bool longest_; // whether searching for longest match
+ bool endmatch_; // whether match must end at text.end()
+ const char* btext_; // beginning of text (for FormatCapture)
+ const char* etext_; // end of text (for endmatch_)
+ Threadq q0_, q1_; // pre-allocated for Search.
+ PODArray<AddState> stack_; // pre-allocated for AddToThreadq
+ std::deque<Thread> arena_; // thread arena
+ Thread* freelist_; // thread freelist
+ const char** match_; // best match so far
+ bool matched_; // any match so far?
+
+ NFA(const NFA&) = delete;
+ NFA& operator=(const NFA&) = delete;
+};
+
+NFA::NFA(Prog* prog) {
+ prog_ = prog;
+ start_ = prog_->start();
+ ncapture_ = 0;
+ longest_ = false;
+ endmatch_ = false;
+ btext_ = NULL;
+ etext_ = NULL;
+ q0_.resize(prog_->size());
+ q1_.resize(prog_->size());
+ // See NFA::AddToThreadq() for why this is so.
+ int nstack = 2*prog_->inst_count(kInstCapture) +
+ prog_->inst_count(kInstEmptyWidth) +
+ prog_->inst_count(kInstNop) + 1; // + 1 for start inst
+ stack_ = PODArray<AddState>(nstack);
+ freelist_ = NULL;
+ match_ = NULL;
+ matched_ = false;
+}
+
+NFA::~NFA() {
+ delete[] match_;
+ for (const Thread& t : arena_)
+ delete[] t.capture;
+}
+
+NFA::Thread* NFA::AllocThread() {
+ Thread* t = freelist_;
+ if (t != NULL) {
+ freelist_ = t->next;
+ t->ref = 1;
+ // We don't need to touch t->capture because
+ // the caller will immediately overwrite it.
+ return t;
+ }
+ arena_.emplace_back();
+ t = &arena_.back();
+ t->ref = 1;
+ t->capture = new const char*[ncapture_];
+ return t;
+}
+
+NFA::Thread* NFA::Incref(Thread* t) {
+ DCHECK(t != NULL);
+ t->ref++;
+ return t;
+}
+
+void NFA::Decref(Thread* t) {
+ DCHECK(t != NULL);
+ t->ref--;
+ if (t->ref > 0)
+ return;
+ DCHECK_EQ(t->ref, 0);
+ t->next = freelist_;
+ freelist_ = t;
+}
+
+// Follows all empty arrows from id0 and enqueues all the states reached.
+// Enqueues only the ByteRange instructions that match byte c.
+// context is used (with p) for evaluating empty-width specials.
+// p is the current input position, and t0 is the current thread.
+void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
+ const char* p, Thread* t0) {
+ if (id0 == 0)
+ return;
+
+ // Use stack_ to hold our stack of instructions yet to process.
+ // It was preallocated as follows:
+ // two entries per Capture;
+ // one entry per EmptyWidth; and
+ // one entry per Nop.
+ // This reflects the maximum number of stack pushes that each can
+ // perform. (Each instruction can be processed at most once.)
+ AddState* stk = stack_.data();
+ int nstk = 0;
+
+ stk[nstk++] = {id0, NULL};
+ while (nstk > 0) {
+ DCHECK_LE(nstk, stack_.size());
+ AddState a = stk[--nstk];
+
+ Loop:
+ if (a.t != NULL) {
+ // t0 was a thread that we allocated and copied in order to
+ // record the capture, so we must now decref it.
+ Decref(t0);
+ t0 = a.t;
+ }
+
+ int id = a.id;
+ if (id == 0)
+ continue;
+ if (q->has_index(id)) {
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " [%d%s]\n", id, FormatCapture(t0->capture));
+ continue;
+ }
+
+ // Create entry in q no matter what. We might fill it in below,
+ // or we might not. Even if not, it is necessary to have it,
+ // so that we don't revisit id0 during the recursion.
+ q->set_new(id, NULL);
+ Thread** tp = &q->get_existing(id);
+ int j;
+ Thread* t;
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
+ break;
+
+ case kInstFail:
+ break;
+
+ case kInstAltMatch:
+ // Save state; will pick up at next byte.
+ t = Incref(t0);
+ *tp = t;
+
+ DCHECK(!ip->last());
+ a = {id+1, NULL};
+ goto Loop;
+
+ case kInstNop:
+ if (!ip->last())
+ stk[nstk++] = {id+1, NULL};
+
+ // Continue on.
+ a = {ip->out(), NULL};
+ goto Loop;
+
+ case kInstCapture:
+ if (!ip->last())
+ stk[nstk++] = {id+1, NULL};
+
+ if ((j=ip->cap()) < ncapture_) {
+ // Push a dummy whose only job is to restore t0
+ // once we finish exploring this possibility.
+ stk[nstk++] = {0, t0};
+
+ // Record capture.
+ t = AllocThread();
+ CopyCapture(t->capture, t0->capture);
+ t->capture[j] = p;
+ t0 = t;
+ }
+ a = {ip->out(), NULL};
+ goto Loop;
+
+ case kInstByteRange:
+ if (!ip->Matches(c))
+ goto Next;
+
+ // Save state; will pick up at next byte.
+ t = Incref(t0);
+ *tp = t;
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture));
+
+ if (ip->hint() == 0)
+ break;
+ a = {id+ip->hint(), NULL};
+ goto Loop;
+
+ case kInstMatch:
+ // Save state; will pick up at next byte.
+ t = Incref(t0);
+ *tp = t;
+ if (ExtraDebug)
+ absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture));
+
+ Next:
+ if (ip->last())
+ break;
+ a = {id+1, NULL};
+ goto Loop;
+
+ case kInstEmptyWidth:
+ if (!ip->last())
+ stk[nstk++] = {id+1, NULL};
+
+ // Continue on if we have all the right flag bits.
+ if (ip->empty() & ~Prog::EmptyFlags(context, p))
+ break;
+ a = {ip->out(), NULL};
+ goto Loop;
+ }
+ }
+}
+
+// Run runq on byte c, appending new states to nextq.
+// Updates matched_ and match_ as new, better matches are found.
+// context is used (with p) for evaluating empty-width specials.
+// p is the position of byte c in the input string for AddToThreadq;
+// p-1 will be used when processing Match instructions.
+// Frees all the threads on runq.
+// If there is a shortcut to the end, returns that shortcut.
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
+ const char* p) {
+ nextq->clear();
+
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+ Thread* t = i->value();
+ if (t == NULL)
+ continue;
+
+ if (longest_) {
+ // Can skip any threads started after our current best match.
+ if (matched_ && match_[0] < t->capture[0]) {
+ Decref(t);
+ continue;
+ }
+ }
+
+ int id = i->index();
+ Prog::Inst* ip = prog_->inst(id);
+
+ switch (ip->opcode()) {
+ default:
+ // Should only see the values handled below.
+ LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
+ break;
+
+ case kInstByteRange:
+ AddToThreadq(nextq, ip->out(), c, context, p, t);
+ break;
+
+ case kInstAltMatch:
+ if (i != runq->begin())
+ break;
+ // The match is ours if we want it.
+ if (ip->greedy(prog_) || longest_) {
+ CopyCapture(match_, t->capture);
+ matched_ = true;
+
+ Decref(t);
+ for (++i; i != runq->end(); ++i) {
+ if (i->value() != NULL)
+ Decref(i->value());
+ }
+ runq->clear();
+ if (ip->greedy(prog_))
+ return ip->out1();
+ return ip->out();
+ }
+ break;
+
+ case kInstMatch: {
+ // Avoid invoking undefined behavior (arithmetic on a null pointer)
+ // by storing p instead of p-1. (What would the latter even mean?!)
+ // This complements the special case in NFA::Search().
+ if (p == NULL) {
+ CopyCapture(match_, t->capture);
+ match_[1] = p;
+ matched_ = true;
+ break;
+ }
+
+ if (endmatch_ && p-1 != etext_)
+ break;
+
+ if (longest_) {
+ // Leftmost-longest mode: save this match only if
+ // it is either farther to the left or at the same
+ // point but longer than an existing match.
+ if (!matched_ || t->capture[0] < match_[0] ||
+ (t->capture[0] == match_[0] && p-1 > match_[1])) {
+ CopyCapture(match_, t->capture);
+ match_[1] = p-1;
+ matched_ = true;
+ }
+ } else {
+ // Leftmost-biased mode: this match is by definition
+ // better than what we've already found (see next line).
+ CopyCapture(match_, t->capture);
+ match_[1] = p-1;
+ matched_ = true;
+
+ // Cut off the threads that can only find matches
+ // worse than the one we just found: don't run the
+ // rest of the current Threadq.
+ Decref(t);
+ for (++i; i != runq->end(); ++i) {
+ if (i->value() != NULL)
+ Decref(i->value());
+ }
+ runq->clear();
+ return 0;
+ }
+ break;
+ }
+ }
+ Decref(t);
+ }
+ runq->clear();
+ return 0;
+}
+
+std::string NFA::FormatCapture(const char** capture) {
+ std::string s;
+ for (int i = 0; i < ncapture_; i+=2) {
+ if (capture[i] == NULL)
+ s += "(?,?)";
+ else if (capture[i+1] == NULL)
+ s += absl::StrFormat("(%d,?)",
+ capture[i] - btext_);
+ else
+ s += absl::StrFormat("(%d,%d)",
+ capture[i] - btext_,
+ capture[i+1] - btext_);
+ }
+ return s;
+}
+
+bool NFA::Search(absl::string_view text, absl::string_view context,
+ bool anchored, bool longest, absl::string_view* submatch,
+ int nsubmatch) {
+ if (start_ == 0)
+ return false;
+
+ if (context.data() == NULL)
+ context = text;
+
+ // Sanity check: make sure that text lies within context.
+ if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
+ LOG(DFATAL) << "context does not contain text";
+ return false;
+ }
+
+ if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
+ return false;
+ if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
+ return false;
+ anchored |= prog_->anchor_start();
+ if (prog_->anchor_end()) {
+ longest = true;
+ endmatch_ = true;
+ }
+
+ if (nsubmatch < 0) {
+ LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
+ return false;
+ }
+
+ // Save search parameters.
+ ncapture_ = 2*nsubmatch;
+ longest_ = longest;
+
+ if (nsubmatch == 0) {
+ // We need to maintain match[0], both to distinguish the
+ // longest match (if longest is true) and also to tell
+ // whether we've seen any matches at all.
+ ncapture_ = 2;
+ }
+
+ match_ = new const char*[ncapture_];
+ memset(match_, 0, ncapture_*sizeof match_[0]);
+ matched_ = false;
+
+ // For debugging prints.
+ btext_ = context.data();
+ // For convenience.
+ etext_ = text.data() + text.size();
+
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
+ text, context, anchored, longest);
+
+ // Set up search.
+ Threadq* runq = &q0_;
+ Threadq* nextq = &q1_;
+ runq->clear();
+ nextq->clear();
+
+ // Loop over the text, stepping the machine.
+ for (const char* p = text.data();; p++) {
+ if (ExtraDebug) {
+ int c = 0;
+ if (p == btext_)
+ c = '^';
+ else if (p > etext_)
+ c = '$';
+ else if (p < etext_)
+ c = p[0] & 0xFF;
+
+ absl::FPrintF(stderr, "%c:", c);
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+ Thread* t = i->value();
+ if (t == NULL)
+ continue;
+ absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture));
+ }
+ absl::FPrintF(stderr, "\n");
+ }
+
+ // This is a no-op the first time around the loop because runq is empty.
+ int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
+ DCHECK_EQ(runq->size(), 0);
+ using std::swap;
+ swap(nextq, runq);
+ nextq->clear();
+ if (id != 0) {
+ // We're done: full match ahead.
+ p = etext_;
+ for (;;) {
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
+ break;
+
+ case kInstCapture:
+ if (ip->cap() < ncapture_)
+ match_[ip->cap()] = p;
+ id = ip->out();
+ continue;
+
+ case kInstNop:
+ id = ip->out();
+ continue;
+
+ case kInstMatch:
+ match_[1] = p;
+ matched_ = true;
+ break;
+ }
+ break;
+ }
+ break;
+ }
+
+ if (p > etext_)
+ break;
+
+ // Start a new thread if there have not been any matches.
+ // (No point in starting a new thread if there have been
+ // matches, since it would be to the right of the match
+ // we already found.)
+ if (!matched_ && (!anchored || p == text.data())) {
+ // Try to use prefix accel (e.g. memchr) to skip ahead.
+ // The search must be unanchored and there must be zero
+ // possible matches already.
+ if (!anchored && runq->size() == 0 &&
+ p < etext_ && prog_->can_prefix_accel()) {
+ p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
+ if (p == NULL)
+ p = etext_;
+ }
+
+ Thread* t = AllocThread();
+ CopyCapture(t->capture, match_);
+ t->capture[0] = p;
+ AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
+ t);
+ Decref(t);
+ }
+
+ // If all the threads have died, stop early.
+ if (runq->size() == 0) {
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "dead\n");
+ break;
+ }
+
+ // Avoid invoking undefined behavior (arithmetic on a null pointer)
+ // by simply not continuing the loop.
+ // This complements the special case in NFA::Step().
+ if (p == NULL) {
+ (void) Step(runq, nextq, -1, context, p);
+ DCHECK_EQ(runq->size(), 0);
+ using std::swap;
+ swap(nextq, runq);
+ nextq->clear();
+ break;
+ }
+ }
+
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+ if (i->value() != NULL)
+ Decref(i->value());
+ }
+
+ if (matched_) {
+ for (int i = 0; i < nsubmatch; i++)
+ submatch[i] = absl::string_view(
+ match_[2 * i],
+ static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
+ if (ExtraDebug)
+ absl::FPrintF(stderr, "match (%d,%d)\n",
+ match_[0] - btext_,
+ match_[1] - btext_);
+ return true;
+ }
+ return false;
+}
+
+bool Prog::SearchNFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch) {
+ if (ExtraDebug)
+ Dump();
+
+ NFA nfa(this);
+ absl::string_view sp;
+ if (kind == kFullMatch) {
+ anchor = kAnchored;
+ if (nmatch == 0) {
+ match = &sp;
+ nmatch = 1;
+ }
+ }
+ if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
+ return false;
+ if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
+ return false;
+ return true;
+}
+
+// For each instruction i in the program reachable from the start, compute the
+// number of instructions reachable from i by following only empty transitions
+// and record that count as fanout[i].
+//
+// fanout holds the results and is also the work queue for the outer iteration.
+// reachable holds the reached nodes for the inner iteration.
+void Prog::Fanout(SparseArray<int>* fanout) {
+ DCHECK_EQ(fanout->max_size(), size());
+ SparseSet reachable(size());
+ fanout->clear();
+ fanout->set_new(start(), 0);
+ for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
+ int* count = &i->value();
+ reachable.clear();
+ reachable.insert(i->index());
+ for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
+ int id = *j;
+ Prog::Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
+ break;
+
+ case kInstByteRange:
+ if (!ip->last())
+ reachable.insert(id+1);
+
+ (*count)++;
+ if (!fanout->has_index(ip->out())) {
+ fanout->set_new(ip->out(), 0);
+ }
+ break;
+
+ case kInstAltMatch:
+ DCHECK(!ip->last());
+ reachable.insert(id+1);
+ break;
+
+ case kInstCapture:
+ case kInstEmptyWidth:
+ case kInstNop:
+ if (!ip->last())
+ reachable.insert(id+1);
+
+ reachable.insert(ip->out());
+ break;
+
+ case kInstMatch:
+ if (!ip->last())
+ reachable.insert(id+1);
+ break;
+
+ case kInstFail:
+ break;
+ }
+ }
+ }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/onepass.cc b/third_party/re2/src/re2/onepass.cc
new file mode 100644
index 000000000..7931cf911
--- /dev/null
+++ b/third_party/re2/src/re2/onepass.cc
@@ -0,0 +1,621 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc.
+//
+// Prog::SearchOnePass is an efficient implementation of
+// regular expression search with submatch tracking for
+// what I call "one-pass regular expressions". (An alternate
+// name might be "backtracking-free regular expressions".)
+//
+// One-pass regular expressions have the property that
+// at each input byte during an anchored match, there may be
+// multiple alternatives but only one can proceed for any
+// given input byte.
+//
+// For example, the regexp /x*yx*/ is one-pass: you read
+// x's until a y, then you read the y, then you keep reading x's.
+// At no point do you have to guess what to do or back up
+// and try a different guess.
+//
+// On the other hand, /x*x/ is not one-pass: when you're
+// looking at an input "x", it's not clear whether you should
+// use it to extend the x* or as the final x.
+//
+// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
+// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
+//
+// A simple intuition for identifying one-pass regular expressions
+// is that it's always immediately obvious when a repetition ends.
+// It must also be immediately obvious which branch of an | to take:
+//
+// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
+//
+// The NFA-based search in nfa.cc does some bookkeeping to
+// avoid the need for backtracking and its associated exponential blowup.
+// But if we have a one-pass regular expression, there is no
+// possibility of backtracking, so there is no need for the
+// extra bookkeeping. Hence, this code.
+//
+// On a one-pass regular expression, the NFA code in nfa.cc
+// runs at about 1/20 of the backtracking-based PCRE speed.
+// In contrast, the code in this file runs at about the same
+// speed as PCRE.
+//
+// One-pass regular expressions get used a lot when RE is
+// used for parsing simple strings, so it pays off to
+// notice them and handle them efficiently.
+//
+// See also Anne Brüggemann-Klein and Derick Wood,
+// "One-unambiguous regular languages", Information and Computation 142(2).
+
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "absl/container/fixed_array.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/sparse_set.h"
+
+// Silence "zero-sized array in struct/union" warning for OneState::action.
+#ifdef _MSC_VER
+#pragma warning(disable: 4200)
+#endif
+
+namespace re2 {
+
+static const bool ExtraDebug = false;
+
+// The key insight behind this implementation is that the
+// non-determinism in an NFA for a one-pass regular expression
+// is contained. To explain what that means, first a
+// refresher about what regular expression programs look like
+// and how the usual NFA execution runs.
+//
+// In a regular expression program, only the kInstByteRange
+// instruction processes an input byte c and moves on to the
+// next byte in the string (it does so if c is in the given range).
+// The kInstByteRange instructions correspond to literal characters
+// and character classes in the regular expression.
+//
+// The kInstAlt instructions are used as wiring to connect the
+// kInstByteRange instructions together in interesting ways when
+// implementing | + and *.
+// The kInstAlt instruction forks execution, like a goto that
+// jumps to ip->out() and ip->out1() in parallel. Each of the
+// resulting computation paths is called a thread.
+//
+// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
+// are interesting in their own right but like kInstAlt they don't
+// advance the input pointer. Only kInstByteRange does.
+//
+// The automaton execution in nfa.cc runs all the possible
+// threads of execution in lock-step over the input. To process
+// a particular byte, each thread gets run until it either dies
+// or finds a kInstByteRange instruction matching the byte.
+// If the latter happens, the thread stops just past the
+// kInstByteRange instruction (at ip->out()) and waits for
+// the other threads to finish processing the input byte.
+// Then, once all the threads have processed that input byte,
+// the whole process repeats. The kInstAlt state instruction
+// might create new threads during input processing, but no
+// matter what, all the threads stop after a kInstByteRange
+// and wait for the other threads to "catch up".
+// Running in lock step like this ensures that the NFA reads
+// the input string only once.
+//
+// Each thread maintains its own set of capture registers
+// (the string positions at which it executed the kInstCapture
+// instructions corresponding to capturing parentheses in the
+// regular expression). Repeated copying of the capture registers
+// is the main performance bottleneck in the NFA implementation.
+//
+// A regular expression program is "one-pass" if, no matter what
+// the input string, there is only one thread that makes it
+// past a kInstByteRange instruction at each input byte. This means
+// that there is in some sense only one active thread throughout
+// the execution. Other threads might be created during the
+// processing of an input byte, but they are ephemeral: only one
+// thread is left to start processing the next input byte.
+// This is what I meant above when I said the non-determinism
+// was "contained".
+//
+// To execute a one-pass regular expression program, we can build
+// a DFA (no non-determinism) that has at most as many states as
+// the NFA (compare this to the possibly exponential number of states
+// in the general case). Each state records, for each possible
+// input byte, the next state along with the conditions required
+// before entering that state -- empty-width flags that must be true
+// and capture operations that must be performed. It also records
+// the set of conditions required to finish a match at that
+// point in the input rather than process the next byte.
+
+// A state in the one-pass NFA - just an array of actions indexed
+// by the bytemap_[] of the next input byte. (The bytemap
+// maps next input bytes into equivalence classes, to reduce
+// the memory footprint.)
+struct OneState {
+ uint32_t matchcond; // conditions to match right now.
+ uint32_t action[];
+};
+
+// The uint32_t conditions in the action are a combination of
+// condition and capture bits and the next state. The bottom 16 bits
+// are the condition and capture bits, and the top 16 are the index of
+// the next state.
+//
+// Bits 0-5 are the empty-width flags from prog.h.
+// Bit 6 is kMatchWins, which means the match takes
+// priority over moving to next in a first-match search.
+// The remaining bits mark capture registers that should
+// be set to the current input position. The capture bits
+// start at index 2, since the search loop can take care of
+// cap[0], cap[1] (the overall match position).
+// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
+// No input position can satisfy both kEmptyWordBoundary
+// and kEmptyNonWordBoundary, so we can use that as a sentinel
+// instead of needing an extra bit.
+
+static const int kIndexShift = 16; // number of bits below index
+static const int kEmptyShift = 6; // number of empty flags in prog.h
+static const int kRealCapShift = kEmptyShift + 1;
+static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
+
+// Parameters used to skip over cap[0], cap[1].
+static const int kCapShift = kRealCapShift - 2;
+static const int kMaxCap = kRealMaxCap + 2;
+
+static const uint32_t kMatchWins = 1 << kEmptyShift;
+static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
+
+static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
+
+// Check, at compile time, that prog.h agrees with math above.
+// This function is never called.
+void OnePass_Checks() {
+ static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
+ "kEmptyShift disagrees with kEmptyAllFlags");
+ // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
+ static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
+ "kMaxCap disagrees with kMaxOnePassCapture");
+}
+
+static bool Satisfy(uint32_t cond, absl::string_view context, const char* p) {
+ uint32_t satisfied = Prog::EmptyFlags(context, p);
+ if (cond & kEmptyAllFlags & ~satisfied)
+ return false;
+ return true;
+}
+
+// Apply the capture bits in cond, saving p to the appropriate
+// locations in cap[].
+static void ApplyCaptures(uint32_t cond, const char* p,
+ const char** cap, int ncap) {
+ for (int i = 2; i < ncap; i++)
+ if (cond & (1 << kCapShift << i))
+ cap[i] = p;
+}
+
+// Computes the OneState* for the given nodeindex.
+static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
+ int nodeindex) {
+ return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
+}
+
+bool Prog::SearchOnePass(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind,
+ absl::string_view* match, int nmatch) {
+ if (anchor != kAnchored && kind != kFullMatch) {
+ LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
+ return false;
+ }
+
+ // Make sure we have at least cap[1],
+ // because we use it to tell if we matched.
+ int ncap = 2*nmatch;
+ if (ncap < 2)
+ ncap = 2;
+
+ const char* cap[kMaxCap];
+ for (int i = 0; i < ncap; i++)
+ cap[i] = NULL;
+
+ const char* matchcap[kMaxCap];
+ for (int i = 0; i < ncap; i++)
+ matchcap[i] = NULL;
+
+ if (context.data() == NULL)
+ context = text;
+ if (anchor_start() && BeginPtr(context) != BeginPtr(text))
+ return false;
+ if (anchor_end() && EndPtr(context) != EndPtr(text))
+ return false;
+ if (anchor_end())
+ kind = kFullMatch;
+
+ uint8_t* nodes = onepass_nodes_.data();
+ int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
+ // start() is always mapped to the zeroth OneState.
+ OneState* state = IndexToNode(nodes, statesize, 0);
+ uint8_t* bytemap = bytemap_;
+ const char* bp = text.data();
+ const char* ep = text.data() + text.size();
+ const char* p;
+ bool matched = false;
+ matchcap[0] = bp;
+ cap[0] = bp;
+ uint32_t nextmatchcond = state->matchcond;
+ for (p = bp; p < ep; p++) {
+ int c = bytemap[*p & 0xFF];
+ uint32_t matchcond = nextmatchcond;
+ uint32_t cond = state->action[c];
+
+ // Determine whether we can reach act->next.
+ // If so, advance state and nextmatchcond.
+ if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
+ uint32_t nextindex = cond >> kIndexShift;
+ state = IndexToNode(nodes, statesize, nextindex);
+ nextmatchcond = state->matchcond;
+ } else {
+ state = NULL;
+ nextmatchcond = kImpossible;
+ }
+
+ // This code section is carefully tuned.
+ // The goto sequence is about 10% faster than the
+ // obvious rewrite as a large if statement in the
+ // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
+
+ // Saving the match capture registers is expensive.
+ // Is this intermediate match worth thinking about?
+
+ // Not if we want a full match.
+ if (kind == kFullMatch)
+ goto skipmatch;
+
+ // Not if it's impossible.
+ if (matchcond == kImpossible)
+ goto skipmatch;
+
+ // Not if the possible match is beaten by the certain
+ // match at the next byte. When this test is useless
+ // (e.g., HTTPPartialMatchRE2) it slows the loop by
+ // about 10%, but when it avoids work (e.g., DotMatchRE2),
+ // it cuts the loop execution by about 45%.
+ if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
+ goto skipmatch;
+
+ // Finally, the match conditions must be satisfied.
+ if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
+ for (int i = 2; i < 2*nmatch; i++)
+ matchcap[i] = cap[i];
+ if (nmatch > 1 && (matchcond & kCapMask))
+ ApplyCaptures(matchcond, p, matchcap, ncap);
+ matchcap[1] = p;
+ matched = true;
+
+ // If we're in longest match mode, we have to keep
+ // going and see if we find a longer match.
+ // In first match mode, we can stop if the match
+ // takes priority over the next state for this input byte.
+ // That bit is per-input byte and thus in cond, not matchcond.
+ if (kind == kFirstMatch && (cond & kMatchWins))
+ goto done;
+ }
+
+ skipmatch:
+ if (state == NULL)
+ goto done;
+ if ((cond & kCapMask) && nmatch > 1)
+ ApplyCaptures(cond, p, cap, ncap);
+ }
+
+ // Look for match at end of input.
+ {
+ uint32_t matchcond = state->matchcond;
+ if (matchcond != kImpossible &&
+ ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
+ if (nmatch > 1 && (matchcond & kCapMask))
+ ApplyCaptures(matchcond, p, cap, ncap);
+ for (int i = 2; i < ncap; i++)
+ matchcap[i] = cap[i];
+ matchcap[1] = p;
+ matched = true;
+ }
+ }
+
+done:
+ if (!matched)
+ return false;
+ for (int i = 0; i < nmatch; i++)
+ match[i] = absl::string_view(
+ matchcap[2 * i],
+ static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
+ return true;
+}
+
+// Analysis to determine whether a given regexp program is one-pass.
+
+// Work queue of instruction ids.
+using Instq = SparseSet;
+
+// Tries to put instruction id on *q.
+//   id == 0       : inserts nothing but reports success (pretends to add).
+//   id already on : inserts nothing and returns false.
+//   otherwise     : inserts id and returns true.
+static bool AddQ(Instq *q, int id) {
+  if (id != 0) {
+    if (q->contains(id))
+      return false;
+    q->insert(id);
+  }
+  return true;
+}
+
+struct InstCond {  // One entry of the manual flood stack used by Prog::IsOnePass.
+ int id;  // instruction id still to be explored
+ uint32_t cond;  // condition bits accumulated on the path leading to id
+};
+
+// Returns whether this is a one-pass program; that is,
+// returns whether it is safe to use SearchOnePass on this program.
+// These conditions must be true for any instruction ip:
+//
+// (1) for any other Inst nip, there is at most one input-free
+// path from ip to nip.
+// (2) there is at most one kInstByteRange instruction reachable from
+// ip that matches any particular byte c.
+// (3) there is at most one input-free path from ip to a kInstMatch
+// instruction.
+//
+// This is actually just a conservative approximation: it might
+// return false when the answer is true, when kInstEmptyWidth
+// instructions are involved.
+// Constructs and saves corresponding one-pass NFA (in onepass_nodes_) on success; the answer is cached via did_onepass_.
+bool Prog::IsOnePass() {
+ if (did_onepass_)  // analysis already ran: report its outcome
+ return onepass_nodes_.data() != NULL;
+ did_onepass_ = true;
+
+ if (start() == 0) // no match
+ return false;
+
+ // Steal memory for the one-pass NFA from the overall DFA budget.
+ // Willing to use at most 1/4 of the DFA budget (heuristic).
+ // Limit max node count to 65000 as a conservative estimate to
+ // avoid overflowing 16-bit node index in encoding.
+ int maxnodes = 2 + inst_count(kInstByteRange);
+ int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
+ if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
+ return false;
+
+ // Flood the graph starting at the start state, and check
+ // that in each reachable state, each possible byte leads
+ // to a unique next state.
+ int stacksize = inst_count(kInstCapture) +
+ inst_count(kInstEmptyWidth) +
+ inst_count(kInstNop) + 1; // + 1 for start inst
+ absl::FixedArray<InstCond, 64> stack_storage(stacksize);
+ InstCond* stack = stack_storage.data();
+
+ int size = this->size();
+ absl::FixedArray<int, 128> nodebyid_storage(size, -1); // indexed by ip
+ int* nodebyid = nodebyid_storage.data();
+
+ // Originally, nodes was a uint8_t[maxnodes*statesize], but that was
+ // unnecessarily optimistic: why allocate a large amount of memory
+ // upfront for a large program when it is unlikely to be one-pass?
+ absl::InlinedVector<uint8_t, 2048> nodes;
+
+ Instq tovisit(size), workq(size);
+ AddQ(&tovisit, start());
+ nodebyid[start()] = 0;  // the start instruction owns node 0
+ int nalloc = 1;
+ nodes.insert(nodes.end(), statesize, 0);  // zero-filled storage for node 0
+ for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
+ int id = *it;
+ int nodeindex = nodebyid[id];
+ OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
+
+ // Flood graph using manual stack, filling in actions as found.
+ // Default is none.
+ for (int b = 0; b < bytemap_range_; b++)
+ node->action[b] = kImpossible;
+ node->matchcond = kImpossible;
+
+ workq.clear();
+ bool matched = false;
+ int nstack = 0;
+ stack[nstack].id = id;
+ stack[nstack++].cond = 0;
+ while (nstack > 0) {
+ int id = stack[--nstack].id;  // NOTE: shadows the outer id for the rest of this loop body
+ uint32_t cond = stack[nstack].cond;
+
+ Loop:
+ Prog::Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstAltMatch:
+ // TODO(rsc): Ignoring kInstAltMatch optimization.
+ // Should implement it in this engine, but it's subtle.
+ DCHECK(!ip->last());
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ id = id+1;
+ goto Loop;
+
+ case kInstByteRange: {
+ int nextindex = nodebyid[ip->out()];
+ if (nextindex == -1) {  // target instruction has no node yet
+ if (nalloc >= maxnodes) {
+ if (ExtraDebug)
+ LOG(ERROR) << absl::StrFormat(
+ "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
+ goto fail;
+ }
+ nextindex = nalloc;
+ AddQ(&tovisit, ip->out());
+ nodebyid[ip->out()] = nalloc;
+ nalloc++;
+ nodes.insert(nodes.end(), statesize, 0);
+ // Update node because it might have been invalidated.
+ node = IndexToNode(nodes.data(), statesize, nodeindex);
+ }
+ for (int c = ip->lo(); c <= ip->hi(); c++) {
+ int b = bytemap_[c];
+ // Skip any bytes immediately after c that are also in b.
+ while (c < 256-1 && bytemap_[c+1] == b)
+ c++;
+ uint32_t act = node->action[b];
+ uint32_t newact = (nextindex << kIndexShift) | cond;
+ if (matched)
+ newact |= kMatchWins;
+ if ((act & kImpossible) == kImpossible) {  // byte class b not assigned yet
+ node->action[b] = newact;
+ } else if (act != newact) {
+ if (ExtraDebug)
+ LOG(ERROR) << absl::StrFormat(
+ "Not OnePass: conflict on byte %#x at state %d", c, *it);
+ goto fail;
+ }
+ }
+ if (ip->foldcase()) {  // repeat the assignment for the upper-case counterparts
+ Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
+ Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
+ for (int c = lo; c <= hi; c++) {
+ int b = bytemap_[c];
+ // Skip any bytes immediately after c that are also in b.
+ while (c < 256-1 && bytemap_[c+1] == b)
+ c++;
+ uint32_t act = node->action[b];
+ uint32_t newact = (nextindex << kIndexShift) | cond;
+ if (matched)
+ newact |= kMatchWins;
+ if ((act & kImpossible) == kImpossible) {  // byte class b not assigned yet
+ node->action[b] = newact;
+ } else if (act != newact) {
+ if (ExtraDebug)
+ LOG(ERROR) << absl::StrFormat(
+ "Not OnePass: conflict on byte %#x at state %d", c, *it);
+ goto fail;
+ }
+ }
+ }
+
+ if (ip->last())
+ break;
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ id = id+1;
+ goto Loop;
+ }
+
+ case kInstCapture:
+ case kInstEmptyWidth:
+ case kInstNop:
+ if (!ip->last()) {  // remember the next instruction (id+1) with the current cond
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ stack[nstack].id = id+1;
+ stack[nstack++].cond = cond;
+ }
+
+ if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
+ cond |= (1 << kCapShift) << ip->cap();
+ if (ip->opcode() == kInstEmptyWidth)
+ cond |= ip->empty();
+
+ // kInstCapture and kInstNop always proceed to ip->out().
+ // kInstEmptyWidth only sometimes proceeds to ip->out(),
+ // but as a conservative approximation we assume it always does.
+ // We could be a little more precise by looking at what c
+ // is, but that seems like overkill.
+
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, ip->out())) {
+ if (ExtraDebug)
+ LOG(ERROR) << absl::StrFormat(
+ "Not OnePass: multiple paths %d -> %d", *it, ip->out());
+ goto fail;
+ }
+ id = ip->out();
+ goto Loop;
+
+ case kInstMatch:
+ if (matched) {
+ // (3) is violated
+ if (ExtraDebug)
+ LOG(ERROR) << absl::StrFormat(
+ "Not OnePass: multiple matches from %d", *it);
+ goto fail;
+ }
+ matched = true;
+ node->matchcond = cond;
+
+ if (ip->last())
+ break;
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ id = id+1;
+ goto Loop;
+
+ case kInstFail:
+ break;
+ }
+ }
+ }
+
+ if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
+ LOG(ERROR) << "bytemap:\n" << DumpByteMap();
+ LOG(ERROR) << "prog:\n" << Dump();
+
+ std::map<int, int> idmap;  // node index -> instruction id
+ for (int i = 0; i < size; i++)
+ if (nodebyid[i] != -1)
+ idmap[nodebyid[i]] = i;
+
+ std::string dump;
+ for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
+ int id = *it;
+ int nodeindex = nodebyid[id];
+ if (nodeindex == -1)
+ continue;
+ OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
+ dump += absl::StrFormat("node %d id=%d: matchcond=%#x\n",
+ nodeindex, id, node->matchcond);
+ for (int i = 0; i < bytemap_range_; i++) {
+ if ((node->action[i] & kImpossible) == kImpossible)
+ continue;
+ dump += absl::StrFormat(" %d cond %#x -> %d id=%d\n",
+ i, node->action[i] & 0xFFFF,
+ node->action[i] >> kIndexShift,
+ idmap[node->action[i] >> kIndexShift]);
+ }
+ }
+ LOG(ERROR) << "nodes:\n" << dump;
+ }
+
+ dfa_mem_ -= nalloc*statesize;  // charge the nodes against the DFA budget
+ onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
+ memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
+ return true;
+
+fail:
+ return false;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/parse.cc b/third_party/re2/src/re2/parse.cc
new file mode 100644
index 000000000..655cb9a27
--- /dev/null
+++ b/third_party/re2/src/re2/parse.cc
@@ -0,0 +1,2479 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression parser.
+
+// The parser is a simple precedence-based parser with a
+// manual stack. The parsing work is done by the methods
+// of the ParseState class. The Regexp::Parse function is
+// essentially just a lexer that calls the ParseState method
+// for each token.
+
+// The parser recognizes POSIX extended regular expressions
+// excluding backreferences, collating elements, and collating
+// classes. It also allows the empty string as a regular expression
+// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
+// See regexp.h for rationale.
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "absl/base/macros.h"
+#include "absl/strings/ascii.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/pod_array.h"
+#include "re2/regexp.h"
+#include "re2/unicode_casefold.h"
+#include "re2/unicode_groups.h"
+#include "re2/walker-inl.h"
+
+#if defined(RE2_USE_ICU)
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/utypes.h"
+#endif
+
+namespace re2 {
+
+// Largest repeat count the parser accepts; PushRepetition rejects any
+// min or max above this value.
+static int maximum_repeat_count = 1000;
+
+// Replaces the repeat-count limit with i.
+void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
+  maximum_repeat_count = i;
+}
+
+// Regular expression parse state.
+// The list of parsed regexps so far is maintained as a vector of
+// Regexp pointers called the stack. Left parenthesis and vertical
+// bar markers are also placed on the stack, as Regexps with
+// non-standard opcodes.
+// Scanning a left parenthesis causes the parser to push a left parenthesis
+// marker on the stack.
+// Scanning a vertical bar causes the parser to pop the stack until it finds a
+// vertical bar or left parenthesis marker (not popping the marker),
+// concatenate all the popped results, and push them back on
+// the stack (DoConcatenation).
+// Scanning a right parenthesis causes the parser to act as though it
+// has seen a vertical bar, which then leaves the top of the stack in the
+// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
+// The parser pops all this off the stack and creates an alternation of the
+// regexps (DoAlternation).
+
+class Regexp::ParseState {
+ public:
+ ParseState(ParseFlags flags, absl::string_view whole_regexp,
+ RegexpStatus* status);
+ ~ParseState();
+
+ ParseFlags flags() { return flags_; }
+ int rune_max() { return rune_max_; }
+
+ // Parse methods. All public methods return a bool saying
+ // whether parsing should continue. If a method returns
+ // false, it has set fields in *status_, and the parser
+ // should return NULL.
+
+ // Pushes the given regular expression onto the stack.
+ // Could check for too much memory used here.
+ bool PushRegexp(Regexp* re);
+
+ // Pushes the literal rune r onto the stack.
+ bool PushLiteral(Rune r);
+
+ // Pushes a regexp with the given op (and no args) onto the stack.
+ bool PushSimpleOp(RegexpOp op);
+
+ // Pushes a ^ onto the stack.
+ bool PushCaret();
+
+ // Pushes a \b (word == true) or \B (word == false) onto the stack.
+ bool PushWordBoundary(bool word);
+
+ // Pushes a $ onto the stack.
+ bool PushDollar();
+
+ // Pushes a . onto the stack.
+ bool PushDot();
+
+ // Pushes a repeat operator regexp onto the stack.
+ // A valid argument for the operator must already be on the stack.
+ // s is the name of the operator, for use in error messages.
+ bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy);
+
+ // Pushes a repetition regexp onto the stack.
+ // A valid argument for the operator must already be on the stack; s is reported as the error argument on failure.
+ bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy);
+
+ // Checks whether a particular regexp op is a marker.
+ bool IsMarker(RegexpOp op);
+
+ // Processes a left parenthesis in the input.
+ // Pushes a marker onto the stack.
+ bool DoLeftParen(absl::string_view name);
+ bool DoLeftParenNoCapture();  // same, but the marker gets cap_ == -1
+
+ // Processes a vertical bar in the input.
+ bool DoVerticalBar();
+
+ // Processes a right parenthesis in the input.
+ bool DoRightParen();
+
+ // Processes the end of input, returning the final regexp (NULL after setting *status_ on error).
+ Regexp* DoFinish();
+
+ // Finishes the regexp if necessary, preparing it for use
+ // in a more complicated expression.
+ // If it is a CharClassBuilder, converts into a CharClass.
+ Regexp* FinishRegexp(Regexp*);
+
+ // These routines don't manipulate the parse stack
+ // directly, but they do need to look at flags_.
+ // ParseCharClass also manipulates the internals of Regexp
+ // while creating *out_re.
+
+ // Parse a character class into *out_re.
+ // Removes parsed text from s.
+ bool ParseCharClass(absl::string_view* s, Regexp** out_re,
+ RegexpStatus* status);
+
+ // Parse a character class character into *rp.
+ // Removes parsed text from s.
+ bool ParseCCCharacter(absl::string_view* s, Rune* rp,
+ absl::string_view whole_class,
+ RegexpStatus* status);
+
+ // Parse a character class range into rr.
+ // Removes parsed text from s.
+ bool ParseCCRange(absl::string_view* s, RuneRange* rr,
+ absl::string_view whole_class,
+ RegexpStatus* status);
+
+ // Parse a Perl flag set or non-capturing group from s.
+ bool ParsePerlFlags(absl::string_view* s);
+
+ // Finishes the current concatenation,
+ // collapsing it into a single regexp on the stack.
+ void DoConcatenation();
+
+ // Finishes the current alternation,
+ // collapsing it to a single regexp on the stack.
+ void DoAlternation();
+
+ // Generalized DoAlternation/DoConcatenation.
+ void DoCollapse(RegexpOp op);
+
+ // Maybe concatenate Literals into LiteralString.
+ bool MaybeConcatString(int r, ParseFlags flags);
+
+private:
+ ParseFlags flags_;
+ absl::string_view whole_regexp_;  // used as the error argument for paren errors
+ RegexpStatus* status_;
+ Regexp* stacktop_;  // top of the parse stack; entries are linked through down_
+ int ncap_; // number of capturing parens seen
+ int rune_max_; // maximum char value for this encoding
+
+ ParseState(const ParseState&) = delete;
+ ParseState& operator=(const ParseState&) = delete;
+};
+
+// Pseudo-operators - only on parse stack. IsMarker() tests op >= kLeftParen, so kVerticalBar must stay the larger value.
+const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
+const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
+
+// Starts with an empty parse stack and no captures seen.
+// rune_max_ is 0xFF when the Latin1 flag is set and Runemax otherwise.
+Regexp::ParseState::ParseState(ParseFlags flags,
+                               absl::string_view whole_regexp,
+                               RegexpStatus* status)
+    : flags_(flags),
+      whole_regexp_(whole_regexp),
+      status_(status),
+      stacktop_(NULL),
+      ncap_(0),
+      rune_max_((flags_ & Latin1) ? 0xFF : Runemax) {
+}
+
+// Walks the parse stack from the top, unlinking and releasing each entry.
+Regexp::ParseState::~ParseState() {
+  Regexp* re = stacktop_;
+  while (re != NULL) {
+    Regexp* below = re->down_;
+    re->down_ = NULL;
+    // A left-paren marker's name string is deleted before the marker
+    // itself is released.
+    if (re->op() == kLeftParen)
+      delete re->name_;
+    re->Decref();
+    re = below;
+  }
+}
+
+// Detaches re from the parse stack so it can be used as a piece of a
+// larger expression.  A character class that still carries a
+// CharClassBuilder has it converted into a CharClass, after which the
+// builder is deleted.  A NULL re is passed through unchanged.
+Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
+  if (re != NULL) {
+    re->down_ = NULL;
+    if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
+      CharClassBuilder* builder = re->ccb_;
+      re->ccb_ = NULL;
+      re->cc_ = builder->GetCharClass();
+      delete builder;
+    }
+  }
+  return re;
+}
+
+// Pushes re onto the parse stack.  Always returns true.
+// Could check for too much memory used here.
+bool Regexp::ParseState::PushRegexp(Regexp* re) {
+  MaybeConcatString(-1, NoParseFlags);
+
+  // A character class holding a single rune is replaced by a plain
+  // literal.  This is a common idiom for escaping single characters
+  // (e.g., [.] instead of \.), and some analysis does better with fewer
+  // character classes.  Likewise a two-rune class such as [Aa] becomes
+  // the literal 'a' with FoldCase set.
+  if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
+    CharClassBuilder* ccb = re->ccb_;
+    ccb->RemoveAbove(rune_max_);
+    if (ccb->size() == 1) {
+      const Rune only = ccb->begin()->lo;
+      re->Decref();
+      re = new Regexp(kRegexpLiteral, flags_);
+      re->rune_ = only;
+    } else if (ccb->size() == 2) {
+      const Rune first = ccb->begin()->lo;
+      if ('A' <= first && first <= 'Z' &&
+          ccb->Contains(first + 'a' - 'A')) {
+        re->Decref();
+        re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
+        re->rune_ = first + 'a' - 'A';
+      }
+    }
+  }
+
+  // Markers never get their simple_ bit computed.
+  if (!IsMarker(re->op()))
+    re->simple_ = re->ComputeSimple();
+
+  // Link re in as the new top of the stack.
+  re->down_ = stacktop_;
+  stacktop_ = re;
+  return true;
+}
+
+// Binary-searches the n-entry folding table f for rune r.
+// Returns the entry whose [lo, hi] range contains r.  Failing that,
+// returns the entry with the smallest lo greater than r, or NULL when
+// no such entry exists.
+const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) {
+  const CaseFold* const table_end = f + n;
+  const CaseFold* base = f;
+  int remaining = n;
+
+  while (remaining > 0) {
+    const int mid = remaining / 2;
+    const CaseFold* probe = &base[mid];
+    if (r < probe->lo) {
+      // Keep searching the lower half.
+      remaining = mid;
+    } else if (r <= probe->hi) {
+      return probe;
+    } else {
+      // Keep searching strictly above the probe.
+      base += mid + 1;
+      remaining -= mid + 1;
+    }
+  }
+
+  // Nothing contains r.  base now marks where such an entry would have
+  // been, i.e. the next entry after r unless it ran off the table.
+  return base < table_end ? base : NULL;
+}
+
+// Applies fold entry f to rune r and returns the folded rune.
+//   EvenOdd / OddEven         : toggles r with its even/odd partner.
+//   EvenOddSkip / OddEvenSkip : the same, but only for every other rune
+//                               counted from f->lo; the rest are unchanged.
+//   anything else             : adds f->delta to r.
+Rune ApplyFold(const CaseFold* f, Rune r) {
+  const auto delta = f->delta;
+
+  const bool skipping = delta == EvenOddSkip || delta == OddEvenSkip;
+  if (skipping && (r - f->lo) % 2)
+    return r;
+
+  if (delta == EvenOdd || delta == EvenOddSkip)  // even <-> odd
+    return (r % 2 == 0) ? r + 1 : r - 1;
+
+  if (delta == OddEven || delta == OddEvenSkip)  // odd <-> even
+    return (r % 2 == 1) ? r + 1 : r - 1;
+
+  return r + delta;
+}
+
+// Returns the next Rune in r's folding cycle (see unicode_casefold.h),
+// or r itself when no folding entry covers it.
+// Examples:
+//   CycleFoldRune('A') = 'a'
+//   CycleFoldRune('a') = 'A'
+//
+//   CycleFoldRune('K') = 'k'
+//   CycleFoldRune('k') = 0x212A (Kelvin)
+//   CycleFoldRune(0x212A) = 'K'
+//
+//   CycleFoldRune('?') = '?'
+Rune CycleFoldRune(Rune r) {
+  const CaseFold* fold =
+      LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
+  // LookupCaseFold may hand back the next entry above r; that one does
+  // not apply to r.
+  const bool covers_r = fold != NULL && fold->lo <= r;
+  return covers_r ? ApplyFold(fold, r) : r;
+}
+
+// Adds lo-hi to cc together with every fold-equivalent rune.
+// When lo-hi is already present the fold-equivalents are assumed to be
+// present as well, so nothing more is done.
+static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
+  // Each rune in a fold cycle triggers one level of recursion.  Cycles
+  // are small (none longer than four in the current Unicode tables, and
+  // make_unicode_casefold.py checks their length); depth is a second
+  // line of defence.
+  if (depth > 10) {
+    LOG(DFATAL) << "AddFoldedRange recurses too much.";
+    return;
+  }
+
+  if (!cc->AddRange(lo, hi))  // already there: nothing left to do
+    return;
+
+  Rune cur = lo;
+  while (cur <= hi) {
+    const CaseFold* fold =
+        LookupCaseFold(unicode_casefold, num_unicode_casefold, cur);
+    if (fold == NULL)  // neither cur nor anything above it folds
+      return;
+    if (cur < fold->lo) {  // skip ahead to the next rune that folds
+      cur = fold->lo;
+      continue;
+    }
+
+    // Fold the part of the range covered by this entry, then add the
+    // folded range (and its own folds) recursively.
+    Rune from = cur;
+    Rune to = std::min<Rune>(hi, fold->hi);
+    if (fold->delta == EvenOdd) {
+      if (from % 2 == 1)
+        from--;
+      if (to % 2 == 0)
+        to++;
+    } else if (fold->delta == OddEven) {
+      if (from % 2 == 0)
+        from--;
+      if (to % 2 == 1)
+        to++;
+    } else {
+      from += fold->delta;
+      to += fold->delta;
+    }
+    AddFoldedRange(cc, from, to, depth + 1);
+
+    // Continue just past this entry.
+    cur = fold->hi + 1;
+  }
+}
+
+// Pushes the literal rune r onto the stack.
+bool Regexp::ParseState::PushLiteral(Rune r) {
+  // Case-insensitive literal whose rune really folds: push a character
+  // class holding every rune in r's folding cycle.
+  if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
+    Regexp* cls = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+    cls->ccb_ = new CharClassBuilder;
+    for (Rune cur = r;;) {
+      // Under NeverNL a newline is left out of the class.
+      if (!((flags_ & NeverNL) && cur == '\n'))
+        cls->ccb_->AddRange(cur, cur);
+      cur = CycleFoldRune(cur);
+      if (cur == r)
+        break;
+    }
+    return PushRegexp(cls);
+  }
+
+  // Under NeverNL a literal newline can never match.
+  if ((flags_ & NeverNL) && r == '\n')
+    return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
+
+  // Plain literal: first try to merge it into a preceding string.
+  if (MaybeConcatString(r, flags_))
+    return true;
+
+  Regexp* lit = new Regexp(kRegexpLiteral, flags_);
+  lit->rune_ = r;
+  return PushRegexp(lit);
+}
+
+// Pushes a ^ onto the stack: begin-of-text under OneLine,
+// begin-of-line otherwise.
+bool Regexp::ParseState::PushCaret() {
+  return PushSimpleOp((flags_ & OneLine) ? kRegexpBeginText
+                                         : kRegexpBeginLine);
+}
+
+// Pushes a \b (word == true) or \B (word == false) onto the stack.
+bool Regexp::ParseState::PushWordBoundary(bool word) {
+  return PushSimpleOp(word ? kRegexpWordBoundary : kRegexpNoWordBoundary);
+}
+
+// Pushes a $ onto the stack: end-of-line unless OneLine is set.
+bool Regexp::ParseState::PushDollar() {
+  if (!(flags_ & OneLine))
+    return PushSimpleOp(kRegexpEndLine);
+
+  // Clumsy marker so that MimicsPCRE() can tell whether this
+  // kRegexpEndText was a $ and not a \z: WasDollar is set only while
+  // the op is being pushed, then the previous flags are restored.
+  const Regexp::ParseFlags saved = flags_;
+  flags_ = flags_ | WasDollar;
+  const bool ok = PushSimpleOp(kRegexpEndText);
+  flags_ = saved;
+  return ok;
+}
+
+// Pushes a . onto the stack.
+bool Regexp::ParseState::PushDot() {
+  if (!(flags_ & DotNL) || (flags_ & NeverNL)) {
+    // Dot must not match newline: rewrite . into [^\n].
+    Regexp* cls = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+    cls->ccb_ = new CharClassBuilder;
+    cls->ccb_->AddRange(0, '\n' - 1);
+    cls->ccb_->AddRange('\n' + 1, rune_max_);
+    return PushRegexp(cls);
+  }
+  return PushSimpleOp(kRegexpAnyChar);
+}
+
+// Pushes a fresh argument-less regexp with opcode op, created with the
+// current flags.
+bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
+  return PushRegexp(new Regexp(op, flags_));
+}
+
+// Pushes a repeat operator regexp onto the stack.
+// A valid argument for the operator must already be on the stack;
+// otherwise kRegexpRepeatArgument is reported and false is returned.
+// s is the text of the operator, for use in error messages.
+bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
+                                      bool nongreedy) {
+  if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
+    status_->set_code(kRegexpRepeatArgument);
+    status_->set_error_arg(s);
+    return false;
+  }
+
+  const Regexp::ParseFlags fl = nongreedy ? (flags_ ^ NonGreedy) : flags_;
+
+  if (fl == stacktop_->parse_flags()) {
+    const RegexpOp top_op = stacktop_->op();
+
+    // Squash **, ++ and ??.  Regexp::Star() et al. handle this too, but
+    // they're mostly for use during simplification, not during parsing.
+    if (op == top_op)
+      return true;
+
+    // Squash *+, *?, +*, +?, ?* and ?+.  They all squash to *; op is
+    // known to be a repeat, so only the top of the stack needs checking.
+    if (top_op == kRegexpStar ||
+        top_op == kRegexpPlus ||
+        top_op == kRegexpQuest) {
+      stacktop_->op_ = kRegexpStar;
+      return true;
+    }
+  }
+
+  // Wrap the current top of stack in a new repeat node that takes its
+  // place on the stack.
+  Regexp* rep = new Regexp(op, fl);
+  rep->AllocSub(1);
+  rep->down_ = stacktop_->down_;
+  rep->sub()[0] = FinishRegexp(stacktop_);
+  rep->simple_ = rep->ComputeSimple();
+  stacktop_ = rep;
+  return true;
+}
+
+// RepetitionWalker reports whether the repetition regexp is valid.
+// Valid means that the combination of the top-level repetition
+// and any inner repetitions does not exceed n copies of the
+// innermost thing.
+// This rewalks the regexp tree and is called for every repetition,
+// so we have to worry about inducing quadratic behavior in the parser.
+// We avoid this by only using RepetitionWalker when min or max >= 2.
+// In that case the depth of any >= 2 nesting can only get to 9 without
+// triggering a parse error, so each subtree can only be rewalked 9 times.
+class RepetitionWalker : public Regexp::Walker<int> {
+ public:
+ RepetitionWalker() {}
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);  // divides the budget by a repeat's count
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args);  // returns the minimum of pre_arg and the child results
+ virtual int ShortVisit(Regexp* re, int parent_arg);  // not expected to be called; returns 0
+
+ private:
+ RepetitionWalker(const RepetitionWalker&) = delete;
+ RepetitionWalker& operator=(const RepetitionWalker&) = delete;
+};
+
+// Computes the budget handed down to re's children.  For a repeat node
+// the incoming budget is divided by its max (or by its min when max is
+// negative), provided that count is positive; every other node passes
+// the budget through unchanged.
+int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
+  if (re->op() != kRegexpRepeat)
+    return parent_arg;
+  const int upper = re->max();
+  const int count = (upper < 0) ? re->min() : upper;
+  return (count > 0) ? parent_arg / count : parent_arg;
+}
+
+// Returns the smallest value among pre_arg and the results of the
+// children.
+int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
+                                int* child_args, int nchild_args) {
+  int lowest = pre_arg;
+  for (int i = 0; i < nchild_args; i++)
+    lowest = std::min(lowest, child_args[i]);
+  return lowest;
+}
+
+// Not expected to run, because the walker is driven through Walk()
+// rather than WalkExponential().  Logs a DFATAL (except in fuzzing
+// builds) and returns 0.
+int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
+#endif
+  return 0;
+}
+
+// Pushes a repetition regexp {min,max} onto the stack (max == -1 is
+// accepted without an upper-bound check against min).
+// A valid argument for the operator must already be on the stack.
+// On failure sets the status code, records s as the error argument and
+// returns false.
+bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
+                                        bool nongreedy) {
+  const bool inverted = max != -1 && max < min;
+  const bool over_limit =
+      min > maximum_repeat_count || max > maximum_repeat_count;
+  if (inverted || over_limit) {
+    status_->set_code(kRegexpRepeatSize);
+    status_->set_error_arg(s);
+    return false;
+  }
+  if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
+    status_->set_code(kRegexpRepeatArgument);
+    status_->set_error_arg(s);
+    return false;
+  }
+
+  // Wrap the current top of stack in a repeat node that takes its place.
+  const Regexp::ParseFlags fl = nongreedy ? (flags_ ^ NonGreedy) : flags_;
+  Regexp* rep = new Regexp(kRegexpRepeat, fl);
+  rep->min_ = min;
+  rep->max_ = max;
+  rep->AllocSub(1);
+  rep->down_ = stacktop_->down_;
+  rep->sub()[0] = FinishRegexp(stacktop_);
+  rep->simple_ = rep->ComputeSimple();
+  stacktop_ = rep;
+
+  // Counts below 2 cannot multiply, so the nesting check is skipped.
+  if (min < 2 && max < 2)
+    return true;
+
+  RepetitionWalker w;
+  if (w.Walk(stacktop_, maximum_repeat_count) != 0)
+    return true;
+
+  // Nested repetitions exhausted the budget.
+  status_->set_code(kRegexpRepeatSize);
+  status_->set_error_arg(s);
+  return false;
+}
+
+// Reports whether op is one of the parse-stack pseudo-operators, i.e.
+// not below kLeftParen.
+bool Regexp::ParseState::IsMarker(RegexpOp op) {
+  return !(op < kLeftParen);
+}
+
+// Processes a capturing left parenthesis in the input.
+// Pushes a marker carrying the next capture index and, when name has
+// non-NULL data, a heap-allocated copy of the name.
+bool Regexp::ParseState::DoLeftParen(absl::string_view name) {
+  Regexp* marker = new Regexp(kLeftParen, flags_);
+  ncap_++;
+  marker->cap_ = ncap_;
+  if (name.data() != NULL)
+    marker->name_ = new std::string(name);
+  return PushRegexp(marker);
+}
+
+// Pushes a left-paren marker whose cap_ is -1, i.e. a non-capturing one.
+bool Regexp::ParseState::DoLeftParenNoCapture() {
+  Regexp* marker = new Regexp(kLeftParen, flags_);
+  marker->cap_ = -1;
+  return PushRegexp(marker);
+}
+
+// Processes a vertical bar in the input.  Always returns true unless
+// pushing a new marker fails.
+bool Regexp::ParseState::DoVerticalBar() {
+  MaybeConcatString(-1, NoParseFlags);
+  DoConcatenation();
+
+  // Below a vertical bar is a list to alternate; above it is a list to
+  // concatenate.  The concatenation was just done, so either the result
+  // is swapped below an existing bar or a new bar is pushed.
+  Regexp* top = stacktop_;
+  Regexp* bar = (top != NULL) ? top->down_ : NULL;
+  if (bar == NULL || bar->op() != kVerticalBar)
+    return PushSimpleOp(kVerticalBar);
+
+  Regexp* below = bar->down_;
+  if (below != NULL) {
+    // When AnyChar sits directly above or below the bar, let it subsume
+    // the other side if that is a Literal, CharClass or AnyChar.
+    const bool top_any = top->op() == kRegexpAnyChar;
+    const bool below_any = below->op() == kRegexpAnyChar;
+
+    if (below_any &&
+        (top->op() == kRegexpLiteral ||
+         top->op() == kRegexpCharClass ||
+         top_any)) {
+      // Discard the new branch.
+      stacktop_ = bar;
+      top->Decref();
+      return true;
+    }
+    if (top_any &&
+        (below->op() == kRegexpLiteral ||
+         below->op() == kRegexpCharClass ||
+         below_any)) {
+      // Put the new branch in the old one's place and discard the old.
+      top->down_ = below->down_;
+      bar->down_ = top;
+      stacktop_ = bar;
+      below->Decref();
+      return true;
+    }
+  }
+
+  // Swap the new branch below the vertical bar.
+  top->down_ = bar->down_;
+  bar->down_ = top;
+  stacktop_ = bar;
+  return true;
+}
+
+// Processes a right parenthesis in the input.
+// Reports kRegexpUnexpectedParen and returns false when the stack does
+// not hold a left-paren marker directly below the finished regexp.
+bool Regexp::ParseState::DoRightParen() {
+  // Finish the current concatenation and alternation.
+  DoAlternation();
+
+  // Expected stack shape: LeftParen regexp.
+  Regexp* body = stacktop_;
+  Regexp* paren = (body != NULL) ? body->down_ : NULL;
+  if (paren == NULL || paren->op() != kLeftParen) {
+    status_->set_code(kRegexpUnexpectedParen);
+    status_->set_error_arg(whole_regexp_);
+    return false;
+  }
+
+  // Pop both entries; each is either reused or released below.
+  stacktop_ = paren->down_;
+
+  // Restore flags from when the paren opened.
+  flags_ = paren->parse_flags();
+
+  if (paren->cap_ <= 0) {
+    // Non-capturing group: drop the marker and push the body as is.
+    paren->Decref();
+    return PushRegexp(body);
+  }
+
+  // Capturing group: turn the marker itself into the capture node
+  // (cap_ is already set).
+  paren->op_ = kRegexpCapture;
+  paren->AllocSub(1);
+  paren->sub()[0] = FinishRegexp(body);
+  paren->simple_ = paren->ComputeSimple();
+  return PushRegexp(paren);
+}
+
+// Processes the end of input and returns the final regexp.
+// If more than one entry remains on the stack, reports
+// kRegexpMissingParen and returns NULL.
+Regexp* Regexp::ParseState::DoFinish() {
+  DoAlternation();
+
+  Regexp* result = stacktop_;
+  const bool leftover = result != NULL && result->down_ != NULL;
+  if (leftover) {
+    status_->set_code(kRegexpMissingParen);
+    status_->set_error_arg(whole_regexp_);
+    return NULL;
+  }
+
+  stacktop_ = NULL;
+  return FinishRegexp(result);
+}
+
+// Returns the leading regexp that re starts with: the first
+// subexpression of a concatenation with two or more parts, otherwise re
+// itself.  Returns NULL when that leading piece is an empty match.
+// The returned Regexp* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Regexp* Regexp::LeadingRegexp(Regexp* re) {
+  Regexp* lead = re;
+  if (re->op() == kRegexpConcat && re->nsub() >= 2)
+    lead = re->sub()[0];
+  return lead->op() == kRegexpEmptyMatch ? NULL : lead;
+}
+
+// Removes LeadingRegexp(re) from re and returns what's left.
+// Consumes the reference to re and may edit it in place.
+// If caller wants to hold on to LeadingRegexp(re),
+// must have already Incref'ed it.
+Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
+  if (re->op() == kRegexpEmptyMatch)
+    return re;
+
+  if (re->op() != kRegexpConcat || re->nsub() < 2) {
+    // re is its own leading regexp: what remains is an empty match
+    // carrying the same parse flags.
+    const Regexp::ParseFlags pf = re->parse_flags();
+    re->Decref();
+    return new Regexp(kRegexpEmptyMatch, pf);
+  }
+
+  Regexp** sub = re->sub();
+  if (sub[0]->op() == kRegexpEmptyMatch)
+    return re;
+  sub[0]->Decref();
+  sub[0] = NULL;
+
+  if (re->nsub() > 2) {
+    // 3 or more -> 2 or more: slide the remaining parts down.
+    re->nsub_--;
+    memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+    return re;
+  }
+
+  // Exactly two parts: collapse the concatenation to the survivor.
+  Regexp* rest = sub[1];
+  sub[1] = NULL;
+  re->Decref();
+  return rest;
+}
+
+// Returns the leading string that re starts with and stores its length
+// in *nrune; *flags receives the FoldCase bit of the piece examined.
+// Returns NULL with *nrune == 0 when re does not start with a literal
+// or literal string.
+// The returned Rune* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Rune* Regexp::LeadingString(Regexp* re, int* nrune,
+                            Regexp::ParseFlags* flags) {
+  // Descend through first subexpressions of nested concatenations.
+  while (re->op() == kRegexpConcat && re->nsub() > 0)
+    re = re->sub()[0];
+
+  *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
+
+  switch (re->op()) {
+    case kRegexpLiteral:
+      *nrune = 1;
+      return &re->rune_;
+
+    case kRegexpLiteralString:
+      *nrune = re->nrunes_;
+      return re->runes_;
+
+    default:
+      *nrune = 0;
+      return NULL;
+  }
+}
+
+// Removes the first n leading runes from the beginning of re; the leading string is found by following first operands of concatenations.
+// Edits re in place. Note that a single-rune literal is removed entirely without consulting n.
+void Regexp::RemoveLeadingString(Regexp* re, int n) {
+ // Chase down concats to find first string.
+ // For regexps generated by parser, nested concats are
+ // flattened except when doing so would overflow the 16-bit
+ // limit on the size of a concatenation, so we should never
+ // see more than two here.
+ Regexp* stk[4]; // concatenations passed on the way down, outermost first
+ size_t d = 0; // number of entries recorded in stk
+ while (re->op() == kRegexpConcat) {
+ if (d < ABSL_ARRAYSIZE(stk)) // deeper concats are still descended but not recorded
+ stk[d++] = re;
+ re = re->sub()[0];
+ }
+
+ // Remove leading string from re.
+ if (re->op() == kRegexpLiteral) {
+ re->rune_ = 0;
+ re->op_ = kRegexpEmptyMatch;
+ } else if (re->op() == kRegexpLiteralString) {
+ if (n >= re->nrunes_) { // the whole string goes: node becomes an empty match
+ delete[] re->runes_;
+ re->runes_ = NULL;
+ re->nrunes_ = 0;
+ re->op_ = kRegexpEmptyMatch;
+ } else if (n == re->nrunes_ - 1) { // exactly one rune survives: demote to a plain literal
+ Rune rune = re->runes_[re->nrunes_ - 1];
+ delete[] re->runes_;
+ re->runes_ = NULL;
+ re->nrunes_ = 0;
+ re->rune_ = rune;
+ re->op_ = kRegexpLiteral;
+ } else { // two or more runes survive: shift them to the front of the array
+ re->nrunes_ -= n;
+ memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
+ }
+ }
+
+ // If re is now empty, concatenations might simplify too. Walk back up from the innermost recorded concat.
+ while (d > 0) {
+ re = stk[--d];
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch) {
+ sub[0]->Decref();
+ sub[0] = NULL;
+ // Delete first element of concat.
+ switch (re->nsub()) {
+ case 0:
+ case 1:
+ // Impossible.
+ LOG(DFATAL) << "Concat of " << re->nsub();
+ re->submany_ = NULL;
+ re->op_ = kRegexpEmptyMatch;
+ break;
+
+ case 2: {
+ // Replace re with sub[1].
+ Regexp* old = sub[1];
+ sub[1] = NULL;
+ re->Swap(old);
+ old->Decref();
+ break;
+ }
+
+ default:
+ // Slide down: three or more operands become two or more.
+ re->nsub_--;
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+ break;
+ }
+ }
+ }
+}
+
+// A Splice records the outcome of one iteration of one round of factoring
+// an alternation:
+//  - the factored prefix, or the merged character class;
+//  - the span of subexpressions of the alternation that will be "spliced"
+//    (i.e. removed and replaced);
+//  - for a factored prefix only, how many suffixes remain after any further
+//    factoring performed on them. A merged character class has no suffixes,
+//    so that field is ignored for it.
+struct Splice {
+  Splice(Regexp* factored, Regexp** span, int span_len)
+      : prefix(factored), sub(span), nsub(span_len), nsuffix(-1) {}
+
+  Regexp* prefix;  // factored prefix or merged character class
+  Regexp** sub;    // start of the span to be replaced
+  int nsub;        // length of that span
+  int nsuffix;     // number of suffixes; -1 until it has been set
+};
+
+// Named so because it is used to implement an explicit stack, a Frame is: the
+// span of subexpressions of the alternation to be factored; the current round
+// of factoring; any Splices computed; and, for a factored prefix, the index of
+// the next Splice whose suffixes are to be factored (i.e. in another Frame).
+struct Frame {
+  Frame(Regexp** sub, int nsub)
+      : sub(sub),
+        nsub(nsub),
+        round(0),
+        spliceidx(0) {}
+
+  Regexp** sub;                 // start of the span being factored
+  int nsub;                     // length of that span
+  int round;                    // 0 before the first round has been started
+  std::vector<Splice> splices;  // Splices computed by the current round
+  // Index into splices of the next Splice to factor. Initialised in the
+  // constructor so that a fresh Frame never holds an indeterminate value.
+  int spliceidx;
+};
+
+// Bundled into a class for friend access to Regexp without needing to declare
+// (or define) Splice in regexp.h. Each round appends the Splices it computes to *splices.
+class FactorAlternationImpl {
+ public:
+ static void Round1(Regexp** sub, int nsub, // round 1: factor out common literal prefixes
+ Regexp::ParseFlags flags,
+ std::vector<Splice>* splices);
+ static void Round2(Regexp** sub, int nsub, // round 2: factor out common simple leading regexps
+ Regexp::ParseFlags flags,
+ std::vector<Splice>* splices);
+ static void Round3(Regexp** sub, int nsub, // round 3: merge runs of literals and/or character classes
+ Regexp::ParseFlags flags,
+ std::vector<Splice>* splices);
+};
+
+// Factors common prefixes from alternation.
+// For example,
+// ABC|ABD|AEF|BCX|BCY
+// simplifies to
+// A(B(C|D)|EF)|BC(X|Y)
+// and thence to
+// A(B[CD]|EF)|BC[XY]
+//
+// Rewrites sub to contain simplified list to alternate and returns
+// the new length of sub. Adjusts reference counts accordingly
+// (incoming sub[i] decremented, outgoing sub[i] incremented).
+int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
+ std::vector<Frame> stk; // explicit stack standing in for recursion on suffixes
+ stk.emplace_back(sub, nsub); // bottom Frame covers the whole alternation
+
+ for (;;) {
+ auto& sub = stk.back().sub; // these references alias the top Frame's fields
+ auto& nsub = stk.back().nsub; // (sub and nsub shadow the function parameters)
+ auto& round = stk.back().round;
+ auto& splices = stk.back().splices;
+ auto& spliceidx = stk.back().spliceidx; // only read below when splices is non-empty
+
+ if (splices.empty()) {
+ // Advance to the next round of factoring. Note that this covers
+ // the initialised state: when splices is empty and round is 0.
+ round++;
+ } else if (spliceidx < static_cast<int>(splices.size())) {
+ // We have at least one more Splice to factor. Recurse logically.
+ stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
+ continue; // the push may have invalidated the references; rebind them at the loop top
+ } else {
+ // We have no more Splices to factor. Apply them.
+ auto iter = splices.begin();
+ int out = 0; // write position; entries are compacted towards the front of sub
+ for (int i = 0; i < nsub; ) {
+ // Copy until we reach where the next Splice begins.
+ while (sub + i < iter->sub)
+ sub[out++] = sub[i++];
+ switch (round) {
+ case 1:
+ case 2: {
+ // Assemble the Splice prefix and the suffixes.
+ Regexp* re[2];
+ re[0] = iter->prefix;
+ re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags); // nsuffix was stored when the child Frame was popped
+ sub[out++] = Regexp::Concat(re, 2, flags);
+ i += iter->nsub;
+ break;
+ }
+ case 3:
+ // Just use the Splice prefix.
+ sub[out++] = iter->prefix;
+ i += iter->nsub;
+ break;
+ default:
+ LOG(DFATAL) << "unknown round: " << round;
+ break;
+ }
+ // If we are done, copy until the end of sub.
+ if (++iter == splices.end()) {
+ while (i < nsub)
+ sub[out++] = sub[i++];
+ }
+ }
+ splices.clear();
+ nsub = out;
+ // Advance to the next round of factoring.
+ round++;
+ }
+
+ switch (round) {
+ case 1:
+ FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
+ break;
+ case 2:
+ FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
+ break;
+ case 3:
+ FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
+ break;
+ case 4: // all three rounds are finished for this Frame
+ if (stk.size() == 1) {
+ // We are at the top of the stack. Just return.
+ return nsub;
+ } else {
+ // Pop the stack and set the number of suffixes.
+ // (Note that references will be invalidated!)
+ int nsuffix = nsub; // copy out before pop_back destroys the Frame that nsub refers to
+ stk.pop_back();
+ stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
+ ++stk.back().spliceidx; // the parent moves on to its next Splice
+ continue;
+ }
+ default:
+ LOG(DFATAL) << "unknown round: " << round;
+ break;
+ }
+
+ // Set spliceidx depending on whether we have Splices to factor.
+ if (splices.empty() || round == 3) { // round 3 Splices are applied directly, without recursing
+ spliceidx = static_cast<int>(splices.size());
+ } else {
+ spliceidx = 0;
+ }
+ }
+}
+
+void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
+ Regexp::ParseFlags flags,
+ std::vector<Splice>* splices) {
+ // Round 1: Factor out common literal prefixes. (The flags parameter is not used in this round.)
+ int start = 0;
+ Rune* rune = NULL;
+ int nrune = 0;
+ Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
+ for (int i = 0; i <= nsub; i++) { // one extra iteration (i == nsub) flushes the final run
+ // Invariant: sub[start:i] consists of regexps that all
+ // begin with rune[0:nrune].
+ Rune* rune_i = NULL;
+ int nrune_i = 0;
+ Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
+ if (i < nsub) {
+ rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
+ if (runeflags_i == runeflags) { // only strings with the same FoldCase setting can share a prefix
+ int same = 0;
+ while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
+ same++;
+ if (same > 0) {
+ // Matches at least one rune in current range. Keep going around.
+ nrune = same; // the common prefix can only shrink as the run grows
+ continue;
+ }
+ }
+ }
+
+ // Found end of a run with common leading literal string:
+ // sub[start:i] all begin with rune[0:nrune],
+ // but sub[i] does not even begin with rune[0].
+ if (i == start) {
+ // Nothing to do - first iteration.
+ } else if (i == start+1) {
+ // Just one: don't bother factoring.
+ } else {
+ Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); // built before the removals below edit the nodes that rune points into
+ for (int j = start; j < i; j++)
+ Regexp::RemoveLeadingString(sub[j], nrune);
+ splices->emplace_back(prefix, sub + start, i - start);
+ }
+
+ // Prepare for next iteration (if there is one).
+ if (i < nsub) {
+ start = i;
+ rune = rune_i;
+ nrune = nrune_i;
+ runeflags = runeflags_i;
+ }
+ }
+}
+
+void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
+ Regexp::ParseFlags flags,
+ std::vector<Splice>* splices) {
+ // Round 2: Factor out common simple prefixes,
+ // just the first piece of each concatenation.
+ // This will be good enough a lot of the time.
+ //
+ // Complex subexpressions (e.g. involving quantifiers)
+ // are not safe to factor because that collapses their
+ // distinct paths through the automaton, which affects
+ // correctness in some cases. (The flags parameter is not used in this round.)
+ int start = 0;
+ Regexp* first = NULL;
+ for (int i = 0; i <= nsub; i++) { // one extra iteration (i == nsub) flushes the final run
+ // Invariant: sub[start:i] consists of regexps that all
+ // begin with first.
+ Regexp* first_i = NULL;
+ if (i < nsub) {
+ first_i = Regexp::LeadingRegexp(sub[i]);
+ if (first != NULL &&
+ // first must be an empty-width op
+ // OR a char class, any char or any byte
+ // OR a fixed repeat of a literal, char class, any char or any byte.
+ (first->op() == kRegexpBeginLine ||
+ first->op() == kRegexpEndLine ||
+ first->op() == kRegexpWordBoundary ||
+ first->op() == kRegexpNoWordBoundary ||
+ first->op() == kRegexpBeginText ||
+ first->op() == kRegexpEndText ||
+ first->op() == kRegexpCharClass ||
+ first->op() == kRegexpAnyChar ||
+ first->op() == kRegexpAnyByte ||
+ (first->op() == kRegexpRepeat &&
+ first->min() == first->max() &&
+ (first->sub()[0]->op() == kRegexpLiteral ||
+ first->sub()[0]->op() == kRegexpCharClass ||
+ first->sub()[0]->op() == kRegexpAnyChar ||
+ first->sub()[0]->op() == kRegexpAnyByte))) &&
+ Regexp::Equal(first, first_i))
+ continue; // sub[i] extends the current run
+ }
+
+ // Found end of a run with common leading regexp:
+ // sub[start:i] all begin with first,
+ // but sub[i] does not.
+ if (i == start) {
+ // Nothing to do - first iteration.
+ } else if (i == start+1) {
+ // Just one: don't bother factoring.
+ } else {
+ Regexp* prefix = first->Incref(); // take a reference before the removals below release the leading pieces
+ for (int j = start; j < i; j++)
+ sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
+ splices->emplace_back(prefix, sub + start, i - start);
+ }
+
+ // Prepare for next iteration (if there is one).
+ if (i < nsub) {
+ start = i;
+ first = first_i;
+ }
+ }
+}
+
+void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
+ Regexp::ParseFlags flags,
+ std::vector<Splice>* splices) {
+ // Round 3: Merge runs of literals and/or character classes.
+ int start = 0;
+ Regexp* first = NULL;
+ for (int i = 0; i <= nsub; i++) { // one extra iteration (i == nsub) flushes the final run
+ // Invariant: sub[start:i] consists of regexps that all
+ // are either literals (i.e. runes) or character classes.
+ Regexp* first_i = NULL;
+ if (i < nsub) {
+ first_i = sub[i];
+ if (first != NULL &&
+ (first->op() == kRegexpLiteral ||
+ first->op() == kRegexpCharClass) &&
+ (first_i->op() == kRegexpLiteral ||
+ first_i->op() == kRegexpCharClass))
+ continue; // sub[i] extends the current run
+ }
+
+ // Found end of a run of Literal/CharClass:
+ // sub[start:i] all are either one or the other,
+ // but sub[i] is not.
+ if (i == start) {
+ // Nothing to do - first iteration.
+ } else if (i == start+1) {
+ // Just one: don't bother factoring.
+ } else {
+ CharClassBuilder ccb; // accumulates the union of every member of the run
+ for (int j = start; j < i; j++) {
+ Regexp* re = sub[j];
+ if (re->op() == kRegexpCharClass) {
+ CharClass* cc = re->cc();
+ for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+ ccb.AddRange(it->lo, it->hi);
+ } else if (re->op() == kRegexpLiteral) {
+ ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); // honours the literal's own flags (e.g. FoldCase)
+ } else {
+ LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
+ << re->ToString();
+ }
+ re->Decref(); // the merged class replaces this entry
+ }
+ Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); // NOTE(review): ParseCharClass clears FoldCase on its class nodes; confirm whether flags should be masked here too
+ splices->emplace_back(re, sub + start, i - start);
+ }
+
+ // Prepare for next iteration (if there is one).
+ if (i < nsub) {
+ start = i;
+ first = first_i;
+ }
+ }
+}
+
+// Collapse the regexps on top of the stack, down to the
+// first marker, into a new op node (op == kRegexpAlternate
+// or op == kRegexpConcat). The new node replaces them on the stack.
+void Regexp::ParseState::DoCollapse(RegexpOp op) {
+ // Scan backward to marker, counting children of composite.
+ int n = 0;
+ Regexp* next = NULL;
+ Regexp* sub;
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
+ next = sub->down_;
+ if (sub->op_ == op)
+ n += sub->nsub_; // a nested node of the same op will be flattened, so count its children
+ else
+ n++;
+ }
+
+ // If there's just one child, leave it alone.
+ // (Concat of one thing is that one thing; alternate of one thing is same.)
+ if (stacktop_ != NULL && stacktop_->down_ == next) // next is what lies below the scanned span (the marker, or NULL)
+ return;
+
+ // Construct op (alternation or concatenation), flattening op of op.
+ PODArray<Regexp*> subs(n);
+ next = NULL;
+ int i = n; // subs is filled back to front because the stack is walked top-down
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
+ next = sub->down_;
+ if (sub->op_ == op) {
+ Regexp** sub_subs = sub->sub();
+ for (int k = sub->nsub_ - 1; k >= 0; k--)
+ subs[--i] = sub_subs[k]->Incref(); // take our own reference before the flattened node is released
+ sub->Decref();
+ } else {
+ subs[--i] = FinishRegexp(sub);
+ }
+ }
+
+ Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true); // NOTE(review): the final argument presumably enables factoring - confirm against ConcatOrAlternate
+ re->simple_ = re->ComputeSimple();
+ re->down_ = next; // link the new node above the marker (or the stack bottom)
+ stacktop_ = re;
+}
+
+// Completes the concatenation currently being built, leaving it
+// collapsed into a single regexp on the stack.
+void Regexp::ParseState::DoConcatenation() {
+  Regexp* top = stacktop_;
+  const bool nothing_to_concat = (top == NULL) || IsMarker(top->op());
+  if (nothing_to_concat) {
+    // Special case: an empty concatenation is represented by an
+    // empty-match node.
+    PushRegexp(new Regexp(kRegexpEmptyMatch, flags_));
+  }
+  DoCollapse(kRegexpConcat);
+}
+
+// Completes the alternation currently being built, leaving it
+// collapsed into a single regexp on the stack.
+void Regexp::ParseState::DoAlternation() {
+  DoVerticalBar();
+
+  // The stack top is now a kVerticalBar marker: pop it and release it.
+  Regexp* marker = stacktop_;
+  stacktop_ = marker->down_;
+  marker->Decref();
+
+  DoCollapse(kRegexpAlternate);
+}
+
+// Incremental conversion of concatenated literals into strings.
+// If top two elements on stack are both literal or string,
+// collapse into single string.
+// Don't walk down the stack -- the parser calls this frequently
+// enough that below the bottom two is known to be collapsed.
+// Only called when another regexp is about to be pushed
+// on the stack, so that the topmost literal is not being considered.
+// (Otherwise ab* would turn into (ab)*.)
+// If r >= 0, consider pushing a literal r (with the given flags) on the stack.
+// Return whether that happened; if it did, the caller must not push r again.
+bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
+ Regexp* re1; // top of the stack
+ Regexp* re2; // the element just below it
+ if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
+ return false;
+
+ if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
+ return false;
+ if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
+ return false;
+ if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) // only merge pieces with the same case-folding
+ return false;
+
+ if (re2->op_ == kRegexpLiteral) {
+ // convert into string
+ Rune rune = re2->rune_; // saved first: the fields assigned below may share storage with rune_ (TODO confirm)
+ re2->op_ = kRegexpLiteralString;
+ re2->nrunes_ = 0;
+ re2->runes_ = NULL;
+ re2->AddRuneToString(rune);
+ }
+
+ // push re1 into re2.
+ if (re1->op_ == kRegexpLiteral) {
+ re2->AddRuneToString(re1->rune_);
+ } else {
+ for (int i = 0; i < re1->nrunes_; i++)
+ re2->AddRuneToString(re1->runes_[i]);
+ re1->nrunes_ = 0;
+ delete[] re1->runes_;
+ re1->runes_ = NULL;
+ }
+
+ // reuse re1 if possible: it stays on top of the stack as the new literal r
+ if (r >= 0) {
+ re1->op_ = kRegexpLiteral;
+ re1->rune_ = r;
+ re1->parse_flags_ = static_cast<uint16_t>(flags);
+ return true;
+ }
+
+ stacktop_ = re2; // nothing to push: drop the emptied re1 from the stack
+ re1->Decref();
+ return false;
+}
+
+// Lexing routines.
+
+// Reads a decimal integer from the front of *s and stores it in *np.
+// On success, *s is left spanning the remainder of the string.
+// Fails if *s does not start with a digit, if the number has a leading
+// zero, or if it has more than nine digits (in which case some digits
+// may already have been removed from *s).
+static bool ParseInteger(absl::string_view* s, int* np) {
+  const auto is_digit_at = [s](size_t pos) {
+    return pos < s->size() && absl::ascii_isdigit((*s)[pos] & 0xFF);
+  };
+
+  if (!is_digit_at(0))
+    return false;
+  // Disallow leading zeros.
+  if ((*s)[0] == '0' && is_digit_at(1))
+    return false;
+
+  int value = 0;
+  while (is_digit_at(0)) {
+    // Avoid overflow.
+    if (value >= 100000000)
+      return false;
+    const int digit = ((*s)[0] & 0xFF) - '0';
+    value = value * 10 + digit;
+    s->remove_prefix(1);  // digit
+  }
+  *np = value;
+  return true;
+}
+
+// Tries to parse a repetition suffix such as {1,2}, {2} or {2,}.
+// On success, stores the bounds in *lo and *hi and advances *sp past
+// the suffix. For the open-ended form {2,}, *hi is set to -1.
+// {,2} is NOT a valid suffix.
+// "Maybe" because a failure here is not a regexp parse failure: the
+// caller carries on, so *sp is only written when this returns true.
+static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) {
+  absl::string_view rest = *sp;  // scratch copy, committed only on success
+  if (rest.empty() || rest[0] != '{')
+    return false;
+  rest.remove_prefix(1);  // '{'
+
+  if (!ParseInteger(&rest, lo) || rest.empty())
+    return false;
+
+  if (rest[0] != ',') {
+    // {2} means exactly two
+    *hi = *lo;
+  } else {
+    rest.remove_prefix(1);  // ','
+    if (rest.empty())
+      return false;
+    if (rest[0] == '}') {
+      // {2,} means at least 2
+      *hi = -1;
+    } else if (!ParseInteger(&rest, hi)) {
+      // {2,4} means 2, 3, or 4, but the upper bound did not parse.
+      return false;
+    }
+  }
+
+  if (rest.empty() || rest[0] != '}')
+    return false;
+  rest.remove_prefix(1);  // '}'
+  *sp = rest;
+  return true;
+}
+
+// Removes the next Rune from the string_view and stores it in *r.
+// Returns number of bytes removed from sp, or -1 on a decoding error (never 0); on error sp is left unchanged.
+// Behaves as though there is a terminating NUL at the end of sp.
+// Argument order is backwards from usual Google style
+// but consistent with chartorune. On error, status (if non-NULL) is set to kRegexpBadUTF8 with an empty argument.
+static int StringViewToRune(Rune* r, absl::string_view* sp,
+ RegexpStatus* status) {
+ // fullrune() takes int, not size_t. However, it just looks
+ // at the leading byte and treats any length >= 4 the same.
+ if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
+ int n = chartorune(r, sp->data());
+ // Some copies of chartorune have a bug that accepts
+ // encodings of values in (10FFFF, 1FFFFF] as valid.
+ // Those values break the character class algorithm,
+ // which assumes Runemax is the largest rune.
+ if (*r > Runemax) {
+ n = 1; // rewrite the result into the decoding-error form tested just below
+ *r = Runeerror;
+ }
+ if (!(n == 1 && *r == Runeerror)) { // no decoding error
+ sp->remove_prefix(n);
+ return n;
+ }
+ }
+
+ if (status != NULL) { // callers may pass NULL when they do not need the error recorded
+ status->set_code(kRegexpBadUTF8);
+ status->set_error_arg(absl::string_view());
+ }
+ return -1;
+}
+
+// Reports whether s consists entirely of valid UTF-8.
+// If it does not, status is set to kRegexpBadUTF8.
+static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) {
+  Rune scratch;
+  for (absl::string_view rest = s; !rest.empty(); ) {
+    // Each successful call removes one rune from the front of rest.
+    if (StringViewToRune(&scratch, &rest, status) < 0)
+      return false;
+  }
+  return true;
+}
+
+// Reports whether c is an ASCII hexadecimal digit (0-9, A-F or a-f).
+// Returns 1 if so and 0 otherwise.
+static int IsHex(int c) {
+  if ('0' <= c && c <= '9')
+    return 1;
+  if ('A' <= c && c <= 'F')
+    return 1;
+  return 'a' <= c && c <= 'f';
+}
+
+// Maps an ASCII hexadecimal digit to its numeric value (0 through 15).
+// Any other input is logged as a DFATAL error and yields 0.
+static int UnHex(int c) {
+  const bool is_decimal = '0' <= c && c <= '9';
+  const bool is_upper = 'A' <= c && c <= 'F';
+  const bool is_lower = 'a' <= c && c <= 'f';
+
+  if (is_decimal)
+    return c - '0';
+  if (is_upper)
+    return c - 'A' + 10;
+  if (is_lower)
+    return c - 'a' + 10;
+
+  LOG(DFATAL) << "Bad hex digit " << c;
+  return 0;
+}
+
+// Parse an escape sequence (e.g., \n, \{). Returns false and fills in status on failure.
+// Sets *s to span the remainder of the string.
+// Sets *rp to the named character. Octal and \x{...} values greater than rune_max are rejected.
+static bool ParseEscape(absl::string_view* s, Rune* rp,
+ RegexpStatus* status, int rune_max) {
+ const char* begin = s->data(); // start of the escape, kept for the BadEscape error argument
+ if (s->empty() || (*s)[0] != '\\') {
+ // Should not happen - caller always checks.
+ status->set_code(kRegexpInternalError);
+ status->set_error_arg(absl::string_view());
+ return false;
+ }
+ if (s->size() == 1) {
+ status->set_code(kRegexpTrailingBackslash);
+ status->set_error_arg(absl::string_view());
+ return false;
+ }
+ Rune c, c1;
+ s->remove_prefix(1); // backslash
+ if (StringViewToRune(&c, s, status) < 0)
+ return false;
+ int code;
+ switch (c) {
+ default:
+ if (c < Runeself && !absl::ascii_isalnum(c)) {
+ // Escaped non-word characters are always themselves.
+ // PCRE is not quite so rigorous: it accepts things like
+ // \q, but we don't. We once rejected \_, but too many
+ // programs and people insist on using it, so allow \_.
+ *rp = c;
+ return true;
+ }
+ goto BadEscape;
+
+ // Octal escapes.
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ // Single non-zero octal digit is a backreference; not supported.
+ if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
+ goto BadEscape;
+ ABSL_FALLTHROUGH_INTENDED;
+ case '0':
+ // consume up to three octal digits; already have one.
+ code = c - '0';
+ if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
+ code = code * 8 + c - '0';
+ s->remove_prefix(1); // digit
+ if (!s->empty()) {
+ c = (*s)[0];
+ if ('0' <= c && c <= '7') {
+ code = code * 8 + c - '0';
+ s->remove_prefix(1); // digit
+ }
+ }
+ }
+ if (code > rune_max)
+ goto BadEscape;
+ *rp = code;
+ return true;
+
+ // Hexadecimal escapes
+ case 'x':
+ if (s->empty())
+ goto BadEscape;
+ if (StringViewToRune(&c, s, status) < 0)
+ return false;
+ if (c == '{') {
+ // Any number of digits in braces.
+ // The string_view s advances as we consume the string, so that
+ // the whole thing gets shown in the error message.
+ // Perl accepts any text at all; it ignores all text
+ // after the first non-hex digit. We require only hex digits,
+ // and at least one.
+ if (StringViewToRune(&c, s, status) < 0)
+ return false;
+ int nhex = 0;
+ code = 0;
+ while (IsHex(c)) {
+ nhex++;
+ code = code * 16 + UnHex(c);
+ if (code > rune_max) // checked on every digit, so code stays bounded
+ goto BadEscape;
+ if (s->empty())
+ goto BadEscape;
+ if (StringViewToRune(&c, s, status) < 0)
+ return false;
+ }
+ if (c != '}' || nhex == 0)
+ goto BadEscape;
+ *rp = code;
+ return true;
+ }
+ // Easy case: two hex digits. (c already holds the first one.)
+ if (s->empty())
+ goto BadEscape;
+ if (StringViewToRune(&c1, s, status) < 0)
+ return false;
+ if (!IsHex(c) || !IsHex(c1))
+ goto BadEscape;
+ *rp = UnHex(c) * 16 + UnHex(c1);
+ return true;
+
+ // C escapes.
+ case 'n':
+ *rp = '\n';
+ return true;
+ case 'r':
+ *rp = '\r';
+ return true;
+ case 't':
+ *rp = '\t';
+ return true;
+
+ // Less common C escapes.
+ case 'a':
+ *rp = '\a';
+ return true;
+ case 'f':
+ *rp = '\f';
+ return true;
+ case 'v':
+ *rp = '\v';
+ return true;
+
+ // This code is disabled to avoid misparsing
+ // the Perl word-boundary \b as a backspace
+ // when in POSIX regexp mode. Surprisingly,
+ // in Perl, \b means word-boundary but [\b]
+ // means backspace. We don't support that:
+ // if you want a backspace embed a literal
+ // backspace character or use \x08.
+ //
+ // case 'b':
+ // *rp = '\b';
+ // return true;
+ }
+
+BadEscape:
+ // Unrecognized escape sequence. The error argument is the text consumed so far, starting at the backslash.
+ status->set_code(kRegexpBadEscape);
+ status->set_error_arg(
+ absl::string_view(begin, static_cast<size_t>(s->data() - begin)));
+ return false;
+}
+
+// Adds the range [lo, hi] to the character class, leaving out newline
+// when the flags ask for that, and adding fold-equivalent characters
+// when the flags ask for case folding.
+void CharClassBuilder::AddRangeFlags(
+    Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
+  // Newline is dropped unless ClassNL is set, and always under NeverNL.
+  const bool drop_newline = (parse_flags & Regexp::NeverNL) ||
+                            !(parse_flags & Regexp::ClassNL);
+  const bool spans_newline = lo <= '\n' && '\n' <= hi;
+
+  if (drop_newline && spans_newline) {
+    // Add whichever sides of '\n' are non-empty, and nothing else.
+    if (lo < '\n')
+      AddRangeFlags(lo, '\n' - 1, parse_flags);
+    if (hi > '\n')
+      AddRangeFlags('\n' + 1, hi, parse_flags);
+    return;
+  }
+
+  if (parse_flags & Regexp::FoldCase) {
+    // Folding case: bring in the fold-equivalent characters too.
+    AddFoldedRange(this, lo, hi, 0);
+    return;
+  }
+  AddRange(lo, hi);
+}
+
+// Searches the first ngroups entries of groups for one whose name
+// equals name. Returns the matching entry, or NULL if there is none.
+static const UGroup* LookupGroup(absl::string_view name,
+                                 const UGroup* groups, int ngroups) {
+  // Plain linear scan.
+  int idx = 0;
+  while (idx < ngroups) {
+    const UGroup& candidate = groups[idx];
+    if (name == absl::string_view(candidate.name))
+      return &candidate;
+    ++idx;
+  }
+  return NULL;
+}
+
+// Look for a POSIX group with the given name (e.g., "[:^alpha:]"). The name includes the surrounding "[:" and ":]".
+static const UGroup* LookupPosixGroup(absl::string_view name) {
+ return LookupGroup(name, posix_groups, num_posix_groups); // NULL when no POSIX group has that name
+}
+
+static const UGroup* LookupPerlGroup(absl::string_view name) { // looks name up in the Perl groups table; callers pass the backslash too (e.g. "\\d")
+ return LookupGroup(name, perl_groups, num_perl_groups); // NULL when no Perl group has that name
+}
+
+#if !defined(RE2_USE_ICU)
+// Fake UGroup containing all Runes, split into a 16-bit range table and a 32-bit range table.
+static URange16 any16[] = { { 0, 65535 } }; // runes 0 through 0xFFFF
+static URange32 any32[] = { { 65536, Runemax } }; // runes 0x10000 through Runemax
+static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; // presumably {name, sign, r16, nr16, r32, nr32} - confirm against the UGroup declaration
+
+// Finds the Unicode group with the given name (e.g., "Han").
+// The name "Any" is special-cased to a group that contains every rune.
+// Returns NULL if the name is not recognised.
+static const UGroup* LookupUnicodeGroup(absl::string_view name) {
+  const bool wants_everything = (name == absl::string_view("Any"));
+  if (!wants_everything)
+    return LookupGroup(name, unicode_groups, num_unicode_groups);
+  return &anygroup;
+}
+#endif
+
+// Add a UGroup or its negation to the character class: sign == +1 adds g's ranges, any other sign adds their complement.
+static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
+ Regexp::ParseFlags parse_flags) {
+ if (sign == +1) {
+ for (int i = 0; i < g->nr16; i++) {
+ cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
+ }
+ for (int i = 0; i < g->nr32; i++) {
+ cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
+ }
+ } else {
+ if (parse_flags & Regexp::FoldCase) {
+ // Normally adding a case-folded group means
+ // adding all the extra fold-equivalent runes too.
+ // But if we're adding the negation of the group,
+ // we have to exclude all the runes that are fold-equivalent
+ // to what's already missing. Too hard, so do in two steps.
+ CharClassBuilder ccb1; // step 1: build the positive, case-folded group here
+ AddUGroup(&ccb1, g, +1, parse_flags);
+ // If the flags say to take out \n, put it in, so that negating will take it out.
+ // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
+ bool cutnl = !(parse_flags & Regexp::ClassNL) ||
+ (parse_flags & Regexp::NeverNL);
+ if (cutnl) {
+ ccb1.AddRange('\n', '\n');
+ }
+ ccb1.Negate(); // step 2: negate it and merge the result into cc
+ cc->AddCharClass(&ccb1);
+ return;
+ }
+ int next = 0; // lowest rune not yet accounted for; the gaps between g's ranges are what get added
+ for (int i = 0; i < g->nr16; i++) {
+ if (next < g->r16[i].lo)
+ cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
+ next = g->r16[i].hi + 1;
+ }
+ for (int i = 0; i < g->nr32; i++) {
+ if (next < g->r32[i].lo)
+ cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
+ next = g->r32[i].hi + 1;
+ }
+ if (next <= Runemax) // whatever lies above the last range
+ cc->AddRangeFlags(next, Runemax, parse_flags);
+ }
+}
+
+// Tries to parse a Perl character class escape at the front of *s.
+// Only the Perl character classes (\d \s \w \D \S \W) are recognised,
+// not the Perl empty-string classes (\b \B \A \Z \z).
+// On success, returns the corresponding UGroup and advances *s past
+// the escape. Otherwise returns NULL and leaves *s exactly as it was.
+const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
+                                     Regexp::ParseFlags parse_flags) {
+  const bool enabled = (parse_flags & Regexp::PerlClasses) != 0;
+  const bool looks_like_escape = s->size() >= 2 && (*s)[0] == '\\';
+  if (!enabled || !looks_like_escape)
+    return NULL;
+
+  // Perl group names are all ASCII, so two bytes (the backslash and one
+  // letter) are enough; there is no need for StringViewToRune here.
+  const absl::string_view candidate(s->data(), 2);
+  const UGroup* group = LookupPerlGroup(candidate);
+  if (group != NULL)
+    s->remove_prefix(candidate.size());
+  return group;
+}
+
+enum ParseStatus { // result of the "maybe parse" helpers below
+ kParseOk, // Did some parsing; the input was consumed.
+ kParseError, // Found an error; the RegexpStatus has been filled in.
+ kParseNothing, // Decided not to parse; the caller should try something else.
+};
+
+// Maybe parses a Unicode character group like \p{Han}, \pL or \P{Han}
+// (the capital-P form is a negated group; a leading '^' inside the name,
+// as in \p{^Han}, flips the sign once more).
+// Returns kParseNothing, leaving *s untouched, when the UnicodeGroups flag
+// is off or *s does not begin with \p or \P. Otherwise either adds the
+// group to cc, advances *s past the sequence and returns kParseOk, or
+// records the failure in status and returns kParseError.
+ParseStatus ParseUnicodeGroup(absl::string_view* s,
+                              Regexp::ParseFlags parse_flags,
+                              CharClassBuilder* cc, RegexpStatus* status) {
+  // Decide whether to parse.
+  if (!(parse_flags & Regexp::UnicodeGroups))
+    return kParseNothing;
+  if (s->size() < 2 || (*s)[0] != '\\')
+    return kParseNothing;
+  Rune c = (*s)[1];
+  if (c != 'p' && c != 'P')
+    return kParseNothing;
+
+  // Committed to parse.  Results:
+  int sign = +1;  // -1 = negated char class
+  if (c == 'P')
+    sign = -sign;
+  absl::string_view seq = *s;  // \p{Han} or \pL
+  absl::string_view name;  // Han or L
+  s->remove_prefix(2);  // '\\', 'p'
+
+  if (s->empty()) {
+    // \p or \P with nothing after it: there is no group name at all.
+    // Report it as a bad range naming the sequence, not as bad UTF-8.
+    status->set_code(kRegexpBadCharRange);
+    status->set_error_arg(seq);
+    return kParseError;
+  }
+  // StringViewToRune reports failure with a negative return value (never
+  // zero), and has already filled in status when it does.
+  if (StringViewToRune(&c, s, status) < 0)
+    return kParseError;
+  if (c != '{') {
+    // Name is the bit of string we just skipped over for c.
+    const char* p = seq.data() + 2;
+    name = absl::string_view(p, static_cast<size_t>(s->data() - p));
+  } else {
+    // Name is in braces. Look for closing }
+    size_t end = s->find('}', 0);
+    if (end == absl::string_view::npos) {
+      if (!IsValidUTF8(seq, status))
+        return kParseError;
+      status->set_code(kRegexpBadCharRange);
+      status->set_error_arg(seq);
+      return kParseError;
+    }
+    name = absl::string_view(s->data(), end);  // without '}'
+    s->remove_prefix(end + 1);  // with '}'
+    if (!IsValidUTF8(name, status))
+      return kParseError;
+  }
+
+  // Chop seq where s now begins.
+  seq = absl::string_view(seq.data(), static_cast<size_t>(s->data() - seq.data()));
+
+  if (!name.empty() && name[0] == '^') {
+    sign = -sign;
+    name.remove_prefix(1);  // '^'
+  }
+
+#if !defined(RE2_USE_ICU)
+  // Look up the group in the RE2 Unicode data.
+  const UGroup* g = LookupUnicodeGroup(name);
+  if (g == NULL) {
+    status->set_code(kRegexpBadCharRange);
+    status->set_error_arg(seq);
+    return kParseError;
+  }
+
+  AddUGroup(cc, g, sign, parse_flags);
+#else
+  // Look up the group in the ICU Unicode data. Because ICU provides full
+  // Unicode properties support, this could be more than a lookup by name.
+  ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
+      std::string("\\p{") + std::string(name) + std::string("}"));
+  UErrorCode uerr = U_ZERO_ERROR;
+  ::icu::UnicodeSet uset(ustr, uerr);
+  if (U_FAILURE(uerr)) {
+    status->set_code(kRegexpBadCharRange);
+    status->set_error_arg(seq);
+    return kParseError;
+  }
+
+  // Convert the UnicodeSet to a URange32 and UGroup that we can add.
+  int nr = uset.getRangeCount();
+  PODArray<URange32> r(nr);
+  for (int i = 0; i < nr; i++) {
+    r[i].lo = uset.getRangeStart(i);
+    r[i].hi = uset.getRangeEnd(i);
+  }
+  UGroup g = {"", +1, 0, 0, r.data(), nr};
+  AddUGroup(cc, &g, sign, parse_flags);
+#endif
+
+  return kParseOk;
+}
+
+// Parses a character class name like [:alnum:]. Returns kParseNothing, leaving *s untouched, if *s does not hold a complete [:...:] form.
+// Sets *s to span the remainder of the string.
+// Adds the ranges corresponding to the class to cc. An unknown name is a kRegexpBadCharRange error.
+static ParseStatus ParseCCName(absl::string_view* s,
+ Regexp::ParseFlags parse_flags,
+ CharClassBuilder* cc, RegexpStatus* status) {
+ // Check begins with [:
+ const char* p = s->data();
+ const char* ep = s->data() + s->size();
+ if (ep - p < 2 || p[0] != '[' || p[1] != ':')
+ return kParseNothing;
+
+ // Look for closing :].
+ const char* q;
+ for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) // the q <= ep-2 bound keeps the *(q+1) read inside the string
+ ;
+
+ // If no closing :], then ignore.
+ if (q > ep-2)
+ return kParseNothing;
+
+ // Got it. Check that it's valid.
+ q += 2; // step over the closing ":]" so that name covers the whole bracketed form
+ absl::string_view name(p, static_cast<size_t>(q - p));
+
+ const UGroup* g = LookupPosixGroup(name);
+ if (g == NULL) {
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(name);
+ return kParseError;
+ }
+
+ s->remove_prefix(name.size());
+ AddUGroup(cc, g, g->sign, parse_flags); // the sign comes from the table entry (negated forms such as [:^alpha:] are separate entries)
+ return kParseOk;
+}
+
+// Reads one character from inside a character class.
+// Fewer characters are special here than in the rest of the regexp.
+// On success, stores the character in *rp and leaves *s spanning the
+// remainder of the string. On failure, fills in status; running out
+// of input is reported as kRegexpMissingBracket for whole_class.
+bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
+                                          absl::string_view whole_class,
+                                          RegexpStatus* status) {
+  if (s->empty()) {
+    status->set_code(kRegexpMissingBracket);
+    status->set_error_arg(whole_class);
+    return false;
+  }
+
+  // Regular escape sequences are accepted even though many characters
+  // need no escaping in this context.
+  const bool is_escape = (*s)[0] == '\\';
+  if (is_escape)
+    return ParseEscape(s, rp, status, rune_max_);
+
+  // Anything else is taken as the next rune.
+  const int consumed = StringViewToRune(rp, s, status);
+  return consumed >= 0;
+}
+
+// Reads one character class character or, when that character is
+// followed by a hyphen and an upper bound, one character class range.
+// For a single character, rr->lo == rr->hi.
+// On success, stores the bounds in *rr and leaves *s spanning the
+// remainder of the string. A range whose upper bound is below its lower
+// bound is reported as kRegexpBadCharRange.
+bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
+                                      absl::string_view whole_class,
+                                      RegexpStatus* status) {
+  const char* const range_begin = s->data();
+  if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
+    return false;
+
+  // [a-] means (a|-), so a hyphen directly before the final ] does not
+  // start a range.
+  const bool has_upper_bound =
+      s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']';
+  if (!has_upper_bound) {
+    rr->hi = rr->lo;
+    return true;
+  }
+
+  s->remove_prefix(1);  // '-'
+  if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
+    return false;
+  if (rr->hi >= rr->lo)
+    return true;
+
+  // Backwards range: report everything consumed since range_begin.
+  status->set_code(kRegexpBadCharRange);
+  status->set_error_arg(absl::string_view(
+      range_begin, static_cast<size_t>(s->data() - range_begin)));
+  return false;
+}
+
+ // Parses a possibly-negated character class expression like [^abx-z[:digit:]].
+ // Sets *s to span the remainder of the string.
+ // Sets *out_re to the regexp for the class. On error, returns false and drops its reference to the partly built class.
+ bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
+ RegexpStatus* status) {
+ absl::string_view whole_class = *s; // Kept for error reporting.
+ if (s->empty() || (*s)[0] != '[') {
+ // Caller checked this.
+ status->set_code(kRegexpInternalError);
+ status->set_error_arg(absl::string_view());
+ return false;
+ }
+ bool negated = false;
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ s->remove_prefix(1); // '['
+ if (!s->empty() && (*s)[0] == '^') {
+ s->remove_prefix(1); // '^'
+ negated = true;
+ if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
+ // If NL can't match implicitly, then pretend
+ // negated classes include a leading \n.
+ re->ccb_->AddRange('\n', '\n');
+ }
+ }
+ bool first = true; // ] is okay as first char in class
+ while (!s->empty() && ((*s)[0] != ']' || first)) {
+ // - is only okay unescaped as first or last in class.
+ // Except that Perl allows - anywhere.
+ if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
+ (s->size() == 1 || (*s)[1] != ']')) {
+ absl::string_view t = *s;
+ t.remove_prefix(1); // '-'
+ Rune r;
+ int n = StringViewToRune(&r, &t, status);
+ if (n < 0) {
+ re->Decref();
+ return false;
+ }
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(absl::string_view(s->data(), 1+n)); // The '-' plus the n bytes of the rune after it.
+ re->Decref();
+ return false;
+ }
+ first = false;
+
+ // Look for [:alnum:] etc.
+ if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
+ switch (ParseCCName(s, flags_, re->ccb_, status)) {
+ case kParseOk:
+ continue;
+ case kParseError:
+ re->Decref();
+ return false;
+ case kParseNothing:
+ break;
+ }
+ }
+
+ // Look for Unicode character group like \p{Han}
+ if (s->size() > 2 &&
+ (*s)[0] == '\\' &&
+ ((*s)[1] == 'p' || (*s)[1] == 'P')) {
+ switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
+ case kParseOk:
+ continue;
+ case kParseError:
+ re->Decref();
+ return false;
+ case kParseNothing:
+ break;
+ }
+ }
+
+ // Look for Perl character class symbols (extension).
+ const UGroup* g = MaybeParsePerlCCEscape(s, flags_);
+ if (g != NULL) {
+ AddUGroup(re->ccb_, g, g->sign, flags_);
+ continue;
+ }
+
+ // Otherwise assume single character or simple range.
+ RuneRange rr;
+ if (!ParseCCRange(s, &rr, whole_class, status)) {
+ re->Decref();
+ return false;
+ }
+ // AddRangeFlags is usually called in response to a class like
+ // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
+ // Regexp::ClassNL is set. In an explicit range or singleton
+ // like we just parsed, we do not filter \n out, so set ClassNL
+ // in the flags.
+ re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
+ }
+ if (s->empty()) { // The loop ended without finding the closing ']'.
+ status->set_code(kRegexpMissingBracket);
+ status->set_error_arg(whole_class);
+ re->Decref();
+ return false;
+ }
+ s->remove_prefix(1); // ']'
+
+ if (negated)
+ re->ccb_->Negate();
+
+ *out_re = re;
+ return true;
+ }
+
+ // Reports whether name may be used as a capture group name: it must be
+ // non-empty and every rune in it must belong to the permitted categories.
+ static bool IsValidCaptureName(absl::string_view name) {
+ if (name.empty())
+ return false;
+
+ // The original rule was effectively [0-9A-Za-z_]+, which matched Python 2
+ // apart from leaving the first character unrestricted. Python 3 also
+ // admits non-ASCII Unicode characters, so the accepted set is the union
+ // of the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and Pc categories, still with
+ // no special rule for the first character. No Unicode normalization
+ // (e.g. NFKC) is applied: Python 3 normalizes identifiers but seemingly
+ // not capture names, and this code will not start normalizing even if
+ // Python later does so for capture names.
+ // The class is built once, on first use.
+ static const CharClass* const allowed = []() {
+ CharClassBuilder builder;
+ for (absl::string_view category :
+ {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
+ AddUGroup(&builder,
+ LookupGroup(category, unicode_groups, num_unicode_groups),
+ +1, Regexp::NoParseFlags);
+ return builder.GetCharClass();
+ }();
+
+ // Decode rune by rune; a decoding failure or a rune outside the class rejects the name.
+ for (absl::string_view rest = name; !rest.empty(); ) {
+ Rune rune;
+ if (StringViewToRune(&rune, &rest, NULL) < 0 || !allowed->Contains(rune))
+ return false;
+ }
+ return true;
+ }
+
+ // Parses a Perl flag setting or non-capturing group or both,
+ // like (?i) or (?: or (?i:. Also handles named captures (?P<name> and (?<name>.
+ // Removes the parsed text from s and updates parse state. The caller must check that s begins with "(?".
+ // Returns true on success. If the Perl flag is not
+ // well-formed or not supported, sets status_ and returns false.
+ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
+ absl::string_view t = *s;
+
+ // Caller is supposed to check this.
+ if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
+ status_->set_code(kRegexpInternalError);
+ LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
+ return false;
+ }
+
+ // Check for named captures, first introduced in Python's regexp library.
+ // As usual, there are three slightly different syntaxes:
+ //
+ // (?P<name>expr) the original, introduced by Python
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
+ //
+ // Perl 5.10 gave in and implemented the Python version too,
+ // but they claim that the last two are the preferred forms.
+ // PCRE and languages based on it (specifically, PHP and Ruby)
+ // support all three as well. EcmaScript 4 uses only the Python form.
+ //
+ // In both the open source world (via Code Search) and the
+ // Google source tree, (?P<name>expr) and (?<name>expr) are the
+ // dominant forms of named captures and both are supported here; (?'name'expr) is not.
+ if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
+ (t.size() > 3 && t[2] == '<')) {
+ // Pull out name.
+ size_t begin = t[2] == 'P' ? 4 : 3;
+ size_t end = t.find('>', begin);
+ if (end == absl::string_view::npos) {
+ if (!IsValidUTF8(t, status_))
+ return false;
+ status_->set_code(kRegexpBadNamedCapture);
+ status_->set_error_arg(t);
+ return false;
+ }
+
+ absl::string_view capture(t.data(), end+1); // "(?P<name>" or "(?<name>", used as the error argument.
+ absl::string_view name(t.data()+begin, end-begin);
+ if (!IsValidUTF8(name, status_))
+ return false;
+ if (!IsValidCaptureName(name)) {
+ status_->set_code(kRegexpBadNamedCapture);
+ status_->set_error_arg(capture);
+ return false;
+ }
+
+ if (!DoLeftParen(name)) {
+ // DoLeftParen's failure set status_.
+ return false;
+ }
+
+ s->remove_prefix(capture.size());
+ return true;
+ }
+
+ t.remove_prefix(2); // "(?"
+
+ bool negated = false;
+ bool sawflags = false;
+ int nflags = flags_; // Working copy; committed to flags_ only on success.
+ Rune c;
+ for (bool done = false; !done; ) {
+ if (t.empty())
+ goto BadPerlOp;
+ if (StringViewToRune(&c, &t, status_) < 0)
+ return false;
+ switch (c) {
+ default:
+ goto BadPerlOp;
+
+ // Parse flags.
+ case 'i':
+ sawflags = true;
+ if (negated)
+ nflags &= ~FoldCase;
+ else
+ nflags |= FoldCase;
+ break;
+
+ case 'm': // opposite of our OneLine
+ sawflags = true;
+ if (negated)
+ nflags |= OneLine;
+ else
+ nflags &= ~OneLine;
+ break;
+
+ case 's':
+ sawflags = true;
+ if (negated)
+ nflags &= ~DotNL;
+ else
+ nflags |= DotNL;
+ break;
+
+ case 'U':
+ sawflags = true;
+ if (negated)
+ nflags &= ~NonGreedy;
+ else
+ nflags |= NonGreedy;
+ break;
+
+ // Negation; a second '-' is an error.
+ case '-':
+ if (negated)
+ goto BadPerlOp;
+ negated = true;
+ sawflags = false; // At least one flag must follow the '-' (checked after the loop).
+ break;
+
+ // Open new group.
+ case ':':
+ if (!DoLeftParenNoCapture()) {
+ // DoLeftParenNoCapture's failure set status_.
+ return false;
+ }
+ done = true;
+ break;
+
+ // Finish flags.
+ case ')':
+ done = true;
+ break;
+ }
+ }
+
+ if (negated && !sawflags)
+ goto BadPerlOp;
+
+ flags_ = static_cast<Regexp::ParseFlags>(nflags);
+ *s = t;
+ return true;
+
+ BadPerlOp:
+ status_->set_code(kRegexpBadPerlOp);
+ status_->set_error_arg(
+ absl::string_view(s->data(), static_cast<size_t>(t.data() - s->data()))); // Everything consumed so far, starting at "(?".
+ return false;
+ }
+
+ // Re-encodes latin1 (taken to be Latin-1 bytes) as UTF-8 into *utf,
+ // replacing whatever *utf held before.
+ // EncodingUtils::EncodeLatin1AsUTF8 is not used because it is
+ // deprecated and because it rejects code points 0x80-0x9F.
+ void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) {
+ utf->clear();
+ for (char byte : latin1) {
+ // Mask to 0..255 so that a signed char never yields a negative rune.
+ Rune rune = byte & 0xFF;
+ char encoded[UTFmax];
+ int len = runetochar(encoded, &rune);
+ utf->append(encoded, len);
+ }
+ }
+
+ // Parses the regular expression given by s,
+ // returning the corresponding Regexp tree.
+ // The caller must Decref the return value when done with it.
+ // Returns NULL on error; status may be NULL if the caller does not want error details.
+ Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
+ RegexpStatus* status) {
+ // Make status non-NULL (easier on everyone else).
+ RegexpStatus xstatus;
+ if (status == NULL)
+ status = &xstatus;
+
+ ParseState ps(global_flags, s, status);
+ absl::string_view t = s;
+
+ // Convert regexp to UTF-8 (easier on the rest of the parser).
+ if (global_flags & Latin1) {
+ std::string* tmp = new std::string;
+ ConvertLatin1ToUTF8(t, tmp);
+ status->set_tmp(tmp); // Hands tmp to status; presumably status then owns it -- confirm in RegexpStatus.
+ t = *tmp;
+ }
+
+ if (global_flags & Literal) {
+ // Special parse loop for literal string.
+ while (!t.empty()) {
+ Rune r;
+ if (StringViewToRune(&r, &t, status) < 0)
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ }
+ return ps.DoFinish();
+ }
+
+ absl::string_view lastunary = absl::string_view(); // Text of the repetition operator parsed in the previous iteration, if any.
+ while (!t.empty()) {
+ absl::string_view isunary = absl::string_view(); // Set when this iteration parses a repetition operator.
+ switch (t[0]) {
+ default: {
+ Rune r;
+ if (StringViewToRune(&r, &t, status) < 0)
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ break;
+ }
+
+ case '(':
+ // "(?" introduces Perl escape.
+ if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
+ // Flag changes and non-capturing groups.
+ if (!ps.ParsePerlFlags(&t))
+ return NULL;
+ break;
+ }
+ if (ps.flags() & NeverCapture) {
+ if (!ps.DoLeftParenNoCapture())
+ return NULL;
+ } else {
+ if (!ps.DoLeftParen(absl::string_view()))
+ return NULL;
+ }
+ t.remove_prefix(1); // '('
+ break;
+
+ case '|':
+ if (!ps.DoVerticalBar())
+ return NULL;
+ t.remove_prefix(1); // '|'
+ break;
+
+ case ')':
+ if (!ps.DoRightParen())
+ return NULL;
+ t.remove_prefix(1); // ')'
+ break;
+
+ case '^': // Beginning of line.
+ if (!ps.PushCaret())
+ return NULL;
+ t.remove_prefix(1); // '^'
+ break;
+
+ case '$': // End of line.
+ if (!ps.PushDollar())
+ return NULL;
+ t.remove_prefix(1); // '$'
+ break;
+
+ case '.': // Any character (possibly except newline).
+ if (!ps.PushDot())
+ return NULL;
+ t.remove_prefix(1); // '.'
+ break;
+
+ case '[': { // Character class.
+ Regexp* re;
+ if (!ps.ParseCharClass(&t, &re, status))
+ return NULL;
+ if (!ps.PushRegexp(re))
+ return NULL;
+ break;
+ }
+
+ case '*': { // Zero or more.
+ RegexpOp op;
+ op = kRegexpStar;
+ goto Rep;
+ case '+': // One or more.
+ op = kRegexpPlus;
+ goto Rep;
+ case '?': // Zero or one.
+ op = kRegexpQuest;
+ goto Rep;
+ Rep:
+ absl::string_view opstr = t;
+ bool nongreedy = false;
+ t.remove_prefix(1); // '*' or '+' or '?'
+ if (ps.flags() & PerlX) {
+ if (!t.empty() && t[0] == '?') {
+ nongreedy = true;
+ t.remove_prefix(1); // '?'
+ }
+ if (!lastunary.empty()) {
+ // In Perl it is not allowed to stack repetition operators:
+ // a** is a syntax error, not a double-star.
+ // (and a++ means something else entirely, which we don't support!)
+ status->set_code(kRegexpRepeatOp);
+ status->set_error_arg(absl::string_view(
+ lastunary.data(),
+ static_cast<size_t>(t.data() - lastunary.data())));
+ return NULL;
+ }
+ }
+ opstr = absl::string_view(opstr.data(),
+ static_cast<size_t>(t.data() - opstr.data())); // Trim opstr to just the operator text consumed.
+ if (!ps.PushRepeatOp(op, opstr, nongreedy))
+ return NULL;
+ isunary = opstr;
+ break;
+ }
+
+ case '{': { // Counted repetition.
+ int lo, hi;
+ absl::string_view opstr = t;
+ if (!MaybeParseRepetition(&t, &lo, &hi)) {
+ // Treat like a literal.
+ if (!ps.PushLiteral('{'))
+ return NULL;
+ t.remove_prefix(1); // '{'
+ break;
+ }
+ bool nongreedy = false;
+ if (ps.flags() & PerlX) {
+ if (!t.empty() && t[0] == '?') {
+ nongreedy = true;
+ t.remove_prefix(1); // '?'
+ }
+ if (!lastunary.empty()) {
+ // Not allowed to stack repetition operators.
+ status->set_code(kRegexpRepeatOp);
+ status->set_error_arg(absl::string_view(
+ lastunary.data(),
+ static_cast<size_t>(t.data() - lastunary.data())));
+ return NULL;
+ }
+ }
+ opstr = absl::string_view(opstr.data(),
+ static_cast<size_t>(t.data() - opstr.data())); // Trim opstr to just the operator text consumed.
+ if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
+ return NULL;
+ isunary = opstr;
+ break;
+ }
+
+ case '\\': { // Escaped character or Perl sequence.
+ // \b and \B: word boundary or not
+ if ((ps.flags() & Regexp::PerlB) &&
+ t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
+ if (!ps.PushWordBoundary(t[1] == 'b'))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'b'
+ break;
+ }
+
+ if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
+ if (t[1] == 'A') {
+ if (!ps.PushSimpleOp(kRegexpBeginText))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'A'
+ break;
+ }
+ if (t[1] == 'z') {
+ if (!ps.PushSimpleOp(kRegexpEndText))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'z'
+ break;
+ }
+ // Do not recognize \Z, because this library can't
+ // implement the exact Perl/PCRE semantics.
+ // (This library treats "(?-m)$" as \z, even though
+ // in Perl and PCRE it is equivalent to \Z.)
+
+ if (t[1] == 'C') { // \C: any byte [sic]
+ if (!ps.PushSimpleOp(kRegexpAnyByte))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'C'
+ break;
+ }
+
+ if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
+ t.remove_prefix(2); // '\\', 'Q'
+ while (!t.empty()) { // A missing \E is not an error: the literals simply run to the end of input.
+ if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
+ t.remove_prefix(2); // '\\', 'E'
+ break;
+ }
+ Rune r;
+ if (StringViewToRune(&r, &t, status) < 0)
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ }
+ break;
+ }
+ }
+
+ if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
+ case kParseOk:
+ if (!ps.PushRegexp(re))
+ return NULL;
+ goto Break2;
+ case kParseError:
+ re->Decref();
+ return NULL;
+ case kParseNothing:
+ re->Decref();
+ break;
+ }
+ }
+
+ const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags());
+ if (g != NULL) {
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ AddUGroup(re->ccb_, g, g->sign, ps.flags());
+ if (!ps.PushRegexp(re))
+ return NULL;
+ break;
+ }
+
+ Rune r;
+ if (!ParseEscape(&t, &r, status, ps.rune_max()))
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ break;
+ }
+ }
+ Break2: // Jump target for leaving the outer switch from inside the nested one.
+ lastunary = isunary;
+ }
+ return ps.DoFinish();
+ }
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/perl_groups.cc b/third_party/re2/src/re2/perl_groups.cc
new file mode 100644
index 000000000..468744458
--- /dev/null
+++ b/third_party/re2/src/re2/perl_groups.cc
@@ -0,0 +1,119 @@
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+static const URange16 code1[] = { /* \d */
+ { 0x30, 0x39 },
+};
+static const URange16 code2[] = { /* \s */
+ { 0x9, 0xa },
+ { 0xc, 0xd },
+ { 0x20, 0x20 },
+};
+static const URange16 code3[] = { /* \w */
+ { 0x30, 0x39 },
+ { 0x41, 0x5a },
+ { 0x5f, 0x5f },
+ { 0x61, 0x7a },
+};
+const UGroup perl_groups[] = {
+ { "\\d", +1, code1, 1, 0, 0 },
+ { "\\D", -1, code1, 1, 0, 0 },
+ { "\\s", +1, code2, 3, 0, 0 },
+ { "\\S", -1, code2, 3, 0, 0 },
+ { "\\w", +1, code3, 4, 0, 0 },
+ { "\\W", -1, code3, 4, 0, 0 },
+};
+const int num_perl_groups = 6;
+static const URange16 code4[] = { /* [:alnum:] */
+ { 0x30, 0x39 },
+ { 0x41, 0x5a },
+ { 0x61, 0x7a },
+};
+static const URange16 code5[] = { /* [:alpha:] */
+ { 0x41, 0x5a },
+ { 0x61, 0x7a },
+};
+static const URange16 code6[] = { /* [:ascii:] */
+ { 0x0, 0x7f },
+};
+static const URange16 code7[] = { /* [:blank:] */
+ { 0x9, 0x9 },
+ { 0x20, 0x20 },
+};
+static const URange16 code8[] = { /* [:cntrl:] */
+ { 0x0, 0x1f },
+ { 0x7f, 0x7f },
+};
+static const URange16 code9[] = { /* [:digit:] */
+ { 0x30, 0x39 },
+};
+static const URange16 code10[] = { /* [:graph:] */
+ { 0x21, 0x7e },
+};
+static const URange16 code11[] = { /* [:lower:] */
+ { 0x61, 0x7a },
+};
+static const URange16 code12[] = { /* [:print:] */
+ { 0x20, 0x7e },
+};
+static const URange16 code13[] = { /* [:punct:] */
+ { 0x21, 0x2f },
+ { 0x3a, 0x40 },
+ { 0x5b, 0x60 },
+ { 0x7b, 0x7e },
+};
+static const URange16 code14[] = { /* [:space:] */
+ { 0x9, 0xd },
+ { 0x20, 0x20 },
+};
+static const URange16 code15[] = { /* [:upper:] */
+ { 0x41, 0x5a },
+};
+static const URange16 code16[] = { /* [:word:] */
+ { 0x30, 0x39 },
+ { 0x41, 0x5a },
+ { 0x5f, 0x5f },
+ { 0x61, 0x7a },
+};
+static const URange16 code17[] = { /* [:xdigit:] */
+ { 0x30, 0x39 },
+ { 0x41, 0x46 },
+ { 0x61, 0x66 },
+};
+const UGroup posix_groups[] = {
+ { "[:alnum:]", +1, code4, 3, 0, 0 },
+ { "[:^alnum:]", -1, code4, 3, 0, 0 },
+ { "[:alpha:]", +1, code5, 2, 0, 0 },
+ { "[:^alpha:]", -1, code5, 2, 0, 0 },
+ { "[:ascii:]", +1, code6, 1, 0, 0 },
+ { "[:^ascii:]", -1, code6, 1, 0, 0 },
+ { "[:blank:]", +1, code7, 2, 0, 0 },
+ { "[:^blank:]", -1, code7, 2, 0, 0 },
+ { "[:cntrl:]", +1, code8, 2, 0, 0 },
+ { "[:^cntrl:]", -1, code8, 2, 0, 0 },
+ { "[:digit:]", +1, code9, 1, 0, 0 },
+ { "[:^digit:]", -1, code9, 1, 0, 0 },
+ { "[:graph:]", +1, code10, 1, 0, 0 },
+ { "[:^graph:]", -1, code10, 1, 0, 0 },
+ { "[:lower:]", +1, code11, 1, 0, 0 },
+ { "[:^lower:]", -1, code11, 1, 0, 0 },
+ { "[:print:]", +1, code12, 1, 0, 0 },
+ { "[:^print:]", -1, code12, 1, 0, 0 },
+ { "[:punct:]", +1, code13, 4, 0, 0 },
+ { "[:^punct:]", -1, code13, 4, 0, 0 },
+ { "[:space:]", +1, code14, 2, 0, 0 },
+ { "[:^space:]", -1, code14, 2, 0, 0 },
+ { "[:upper:]", +1, code15, 1, 0, 0 },
+ { "[:^upper:]", -1, code15, 1, 0, 0 },
+ { "[:word:]", +1, code16, 4, 0, 0 },
+ { "[:^word:]", -1, code16, 4, 0, 0 },
+ { "[:xdigit:]", +1, code17, 3, 0, 0 },
+ { "[:^xdigit:]", -1, code17, 3, 0, 0 },
+};
+const int num_posix_groups = 28;
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/pod_array.h b/third_party/re2/src/re2/pod_array.h
new file mode 100644
index 000000000..f234e976f
--- /dev/null
+++ b/third_party/re2/src/re2/pod_array.h
@@ -0,0 +1,55 @@
+// Copyright 2018 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_POD_ARRAY_H_
+#define RE2_POD_ARRAY_H_
+
+#include <memory>
+#include <type_traits>
+
+namespace re2 {
+
+ template <typename T>
+ class PODArray { // Fixed-length heap array of T; the length is stored in the deleter.
+ public:
+ static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value,
+ "T must be POD");
+
+ PODArray() // Empty array: null pointer, size() == 0.
+ : ptr_() {}
+ explicit PODArray(int len) // Allocates room for len elements; nothing here initializes them.
+ : ptr_(std::allocator<T>().allocate(len), Deleter(len)) {}
+
+ T* data() const { // Pointer to the first element (null for a default-constructed array).
+ return ptr_.get();
+ }
+
+ int size() const { // The len given at construction, or 0.
+ return ptr_.get_deleter().len_;
+ }
+
+ T& operator[](int pos) const { // No bounds checking is done here.
+ return ptr_[pos];
+ }
+
+ private:
+ struct Deleter { // Remembers the length because deallocate() must be given it.
+ Deleter()
+ : len_(0) {}
+ explicit Deleter(int len)
+ : len_(len) {}
+
+ void operator()(T* ptr) const {
+ std::allocator<T>().deallocate(ptr, len_);
+ }
+
+ int len_;
+ };
+
+ std::unique_ptr<T[], Deleter> ptr_;
+ };
+
+} // namespace re2
+
+#endif // RE2_POD_ARRAY_H_
diff --git a/third_party/re2/src/re2/prefilter.cc b/third_party/re2/src/re2/prefilter.cc
new file mode 100644
index 000000000..3c7886f83
--- /dev/null
+++ b/third_party/re2/src/re2/prefilter.cc
@@ -0,0 +1,709 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/prefilter.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/re2.h"
+#include "re2/unicode_casefold.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+static const bool ExtraDebug = false;
+
+ // Constructs a Prefilter node of kind op. Only AND and OR nodes get a
+ // child vector; for every other kind subs_ stays NULL.
+ Prefilter::Prefilter(Op op) {
+ op_ = op;
+ const bool has_children = (op_ == AND || op_ == OR);
+ subs_ = has_children ? new std::vector<Prefilter*> : NULL;
+ }
+
+ // Destroys a Prefilter along with every child still held in subs_.
+ Prefilter::~Prefilter() {
+ if (subs_ == NULL)
+ return;
+ for (Prefilter* child : *subs_)
+ delete child;
+ delete subs_;
+ subs_ = NULL;
+ }
+
+ // Simplifies an AND/OR node that has zero or one children; other nodes are returned as is. The caller must use the returned pointer, because this may have been deleted.
+ Prefilter* Prefilter::Simplify() {
+ if (op_ != AND && op_ != OR) {
+ return this;
+ }
+
+ // Nothing left in the AND/OR.
+ if (subs_->empty()) {
+ if (op_ == AND)
+ op_ = ALL; // AND of nothing is true
+ else
+ op_ = NONE; // OR of nothing is false
+
+ return this;
+ }
+
+ // Just one subnode: throw away wrapper.
+ if (subs_->size() == 1) {
+ Prefilter* a = (*subs_)[0];
+ subs_->clear(); // Detach the child so the destructor below does not delete it.
+ delete this;
+ return a->Simplify();
+ }
+
+ return this;
+ }
+
+ // Combines two Prefilters together to create an "op" (AND or OR).
+ // The passed Prefilters will be part of the returned Prefilter or deleted.
+ // Does lots of work to avoid creating unnecessarily complicated structures.
+ Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
+ // Collapse empty or single-child AND/OR wrappers in a and b first.
+ a = a->Simplify();
+ b = b->Simplify();
+
+ // Canonicalize: a->op <= b->op.
+ if (a->op() > b->op()) {
+ Prefilter* t = a;
+ a = b;
+ b = t;
+ }
+
+ // Trivial cases.
+ // ALL AND b = b
+ // NONE OR b = b
+ // ALL OR b = ALL
+ // NONE AND b = NONE
+ // Don't need to look at b, because of canonicalization above.
+ // ALL and NONE are smallest opcodes.
+ if (a->op() == ALL || a->op() == NONE) {
+ if ((a->op() == ALL && op == AND) ||
+ (a->op() == NONE && op == OR)) {
+ delete a;
+ return b;
+ } else {
+ delete b;
+ return a;
+ }
+ }
+
+ // If a and b match op, merge their contents.
+ if (a->op() == op && b->op() == op) {
+ for (size_t i = 0; i < b->subs()->size(); i++) {
+ Prefilter* bb = (*b->subs())[i];
+ a->subs()->push_back(bb);
+ }
+ b->subs()->clear(); // The children now belong to a; keep b's destructor from deleting them.
+ delete b;
+ return a;
+ }
+
+ // If a already has the same op as the op that is under construction
+ // add in b (similarly if b already has the same op, add in a).
+ if (b->op() == op) {
+ Prefilter* t = a;
+ a = b;
+ b = t;
+ }
+ if (a->op() == op) {
+ a->subs()->push_back(b);
+ return a;
+ }
+
+ // Otherwise wrap a and b in a new op node.
+ Prefilter* c = new Prefilter(op);
+ c->subs()->push_back(a);
+ c->subs()->push_back(b);
+ return c;
+ }
+
+ Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { // AND of a and b; both are consumed by AndOr.
+ return AndOr(AND, a, b);
+ }
+
+ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { // OR of a and b; both are consumed by AndOr.
+ return AndOr(OR, a, b);
+ }
+
+ void Prefilter::SimplifyStringSet(SSet* ss) {
+ // Removes redundant strings from *ss, in place. For example, if
+ // we know "ab" is a required string, then it doesn't help at all to
+ // know that "abc" is also a required string, so delete "abc". This
+ // is because, when we are performing a string search to filter
+ // regexps, matching "ab" will already allow this regexp to be a
+ // candidate for match, so further matching "abc" is redundant.
+ // Note that we must ignore "" because find() would find it at the
+ // start of everything and thus we would end up erasing everything.
+ //
+ // The SSet sorts strings by length, then lexicographically. Note that
+ // smaller strings appear first and all strings must be unique. These
+ // observations let us skip string comparisons when possible.
+ SSIter i = ss->begin();
+ if (i != ss->end() && i->empty()) { // "" sorts first if present; skip it.
+ ++i;
+ }
+ for (; i != ss->end(); ++i) {
+ SSIter j = i;
+ ++j;
+ while (j != ss->end()) {
+ if (j->size() > i->size() && j->find(*i) != std::string::npos) {
+ j = ss->erase(j); // erase() returns the iterator after the removed element.
+ continue;
+ }
+ ++j;
+ }
+ }
+ }
+
+ // Builds the OR of one ATOM per string remaining in *ss after redundant
+ // strings are removed from it. An empty set yields a NONE node.
+ Prefilter* Prefilter::OrStrings(SSet* ss) {
+ SimplifyStringSet(ss);
+ Prefilter* result = new Prefilter(NONE);
+ for (const auto& str : *ss)
+ result = Or(result, FromString(str));
+ return result;
+ }
+
+ // Lowercases r. Runes below Runeself are handled arithmetically (A-Z only);
+ // the rest are looked up in the unicode_tolower fold table, and a rune
+ // with no applicable entry is returned unchanged.
+ static Rune ToLowerRune(Rune r) {
+ if (r >= Runeself) {
+ const CaseFold* fold =
+ LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
+ const bool applies = (fold != NULL && r >= fold->lo);
+ return applies ? ApplyFold(fold, r) : r;
+ }
+
+ const bool is_ascii_upper = ('A' <= r && r <= 'Z');
+ if (is_ascii_upper)
+ r += 'a' - 'A';
+ return r;
+ }
+
+ // Lowercases r for Latin-1 input: only A-Z are changed; every other
+ // value is returned as is.
+ static Rune ToLowerRuneLatin1(Rune r) {
+ const bool is_ascii_upper = (r >= 'A' && r <= 'Z');
+ if (!is_ascii_upper)
+ return r;
+ r += 'a' - 'A';
+ return r;
+ }
+
+ Prefilter* Prefilter::FromString(const std::string& str) { // Returns a new ATOM node holding a copy of str.
+ Prefilter* m = new Prefilter(Prefilter::ATOM);
+ m->atom_ = str;
+ return m;
+ }
+
+ // Information about a regexp used during computation of Prefilter.
+ // Can be thought of as information about the set of strings matching
+ // the given regular expression.
+ class Prefilter::Info {
+ public:
+ Info();
+ ~Info();
+
+ // More constructors. They consume their Info* arguments (deleting them, or returning one of them as the result).
+ static Info* Alt(Info* a, Info* b);
+ static Info* Concat(Info* a, Info* b);
+ static Info* And(Info* a, Info* b);
+ static Info* Star(Info* a);
+ static Info* Plus(Info* a);
+ static Info* Quest(Info* a);
+ static Info* EmptyString();
+ static Info* NoMatch();
+ static Info* AnyCharOrAnyByte();
+ static Info* CClass(CharClass* cc, bool latin1);
+ static Info* Literal(Rune r);
+ static Info* LiteralLatin1(Rune r);
+ static Info* AnyMatch();
+
+ // Format Info as a string.
+ std::string ToString();
+
+ // Caller takes ownership of the Prefilter. Afterwards this Info is no longer exact and holds no match.
+ Prefilter* TakeMatch();
+
+ SSet& exact() { return exact_; }
+
+ bool is_exact() const { return is_exact_; }
+
+ class Walker;
+
+ private:
+ SSet exact_;
+
+ // When is_exact_ is true, the strings that match
+ // are placed in exact_. When it is no longer an exact
+ // set of strings that match this RE, then is_exact_
+ // is false and the match_ contains the required match
+ // criteria.
+ bool is_exact_;
+
+ // Accumulated Prefilter query that any
+ // match for this regexp is guaranteed to match. Deleted by the destructor unless taken via TakeMatch().
+ Prefilter* match_;
+ };
+
+
+ Prefilter::Info::Info() // Starts inexact with no match; the static factories fill in the rest.
+ : is_exact_(false),
+ match_(NULL) {
+ }
+
+ Prefilter::Info::~Info() { // Deletes match_, which is NULL if TakeMatch() already handed it out.
+ delete match_;
+ }
+
+ Prefilter* Prefilter::Info::TakeMatch() { // Returns match_ and gives up ownership of it.
+ if (is_exact_) { // Turn the exact string set into an OR of atoms first.
+ match_ = Prefilter::OrStrings(&exact_);
+ is_exact_ = false;
+ }
+ Prefilter* m = match_;
+ match_ = NULL; // So that the destructor does not delete what the caller now owns.
+ return m;
+ }
+
+ // Formats an Info as a string: the comma-separated exact strings when the
+ // Info is exact, otherwise the debug string of match_ (or "" if there is none).
+ std::string Prefilter::Info::ToString() {
+ if (!is_exact_) {
+ if (match_)
+ return match_->DebugString();
+ return "";
+ }
+
+ std::string joined;
+ const char* separator = "";
+ for (const auto& str : exact_) {
+ joined += separator;
+ joined += str;
+ separator = ",";
+ }
+ return joined;
+ }
+
+ // Inserts into *dst the concatenation of every string in a with every
+ // string in b. Existing contents of *dst are kept.
+ void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
+ for (const auto& left : a) {
+ for (const auto& right : b) {
+ dst->insert(left + right);
+ }
+ }
+ }
+
+ // Concats a and b. Requires that both are exact sets (a may be NULL, in which case b is returned as is).
+ // Forms an exact set that is a crossproduct of a and b; a and b are deleted.
+ Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
+ if (a == NULL)
+ return b;
+ DCHECK(a->is_exact_);
+ DCHECK(b && b->is_exact_);
+ Info *ab = new Info();
+
+ CrossProduct(a->exact_, b->exact_, &ab->exact_);
+ ab->is_exact_ = true;
+
+ delete a;
+ delete b;
+ return ab;
+ }
+
+ // Constructs an inexact Info for ab given a and b.
+ // Used only when a or b is not exact or when the
+ // exact cross product is likely to be too big. If either argument is NULL, the other is returned as is.
+ Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
+ if (a == NULL)
+ return b;
+ if (b == NULL)
+ return a;
+
+ Info *ab = new Info();
+
+ ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
+ ab->is_exact_ = false;
+ delete a;
+ delete b;
+ return ab;
+ }
+
+ // Constructs Info for a|b given a and b. Both a and b are deleted.
+ Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
+ Info *ab = new Info();
+
+ if (a->is_exact_ && b->is_exact_) {
+ // Avoid string copies by moving the larger exact_ set into
+ // ab directly, then merge in the smaller set.
+ if (a->exact_.size() < b->exact_.size()) {
+ using std::swap;
+ swap(a, b);
+ }
+ ab->exact_ = std::move(a->exact_);
+ ab->exact_.insert(b->exact_.begin(), b->exact_.end());
+ ab->is_exact_ = true;
+ } else {
+ // Either a or b has is_exact_ = false. If the other
+ // one has is_exact_ = true, we move it to match_ and
+ // then create an OR of a,b. The resulting Info has
+ // is_exact_ = false.
+ ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
+ ab->is_exact_ = false;
+ }
+
+ delete a;
+ delete b;
+ return ab;
+ }
+
+ // Constructs Info for a? given a. The result is an inexact ALL match regardless of a, which is deleted.
+ Prefilter::Info* Prefilter::Info::Quest(Info *a) {
+ Info *ab = new Info();
+
+ ab->is_exact_ = false;
+ ab->match_ = new Prefilter(ALL);
+ delete a;
+ return ab;
+ }
+
+ // Constructs Info for a* given a.
+ // Same as a? -- the result is produced by Quest(a).
+ Prefilter::Info* Prefilter::Info::Star(Info *a) {
+ return Quest(a);
+ }
+
+ // Constructs Info for a+ given a. If a was exact set, it isn't
+ // anymore: the result is inexact and carries the match taken from a, which is deleted.
+ Prefilter::Info* Prefilter::Info::Plus(Info *a) {
+ Info *ab = new Info();
+
+ ab->match_ = a->TakeMatch();
+ ab->is_exact_ = false;
+
+ delete a;
+ return ab;
+ }
+
+ // Returns the bytes that runetochar produces for r, as a std::string.
+ static std::string RuneToString(Rune r) {
+ char encoded[UTFmax];
+ const int length = runetochar(encoded, &r);
+ std::string result(encoded, length);
+ return result;
+ }
+
+ // Returns a one-byte string holding the low 8 bits of r.
+ static std::string RuneToStringLatin1(Rune r) {
+ const char byte = r & 0xff;
+ return std::string(1, byte);
+ }
+
+ // Constructs an exact Info for literal rune r: the set holds the single string encoding of r after lowercasing.
+ Prefilter::Info* Prefilter::Info::Literal(Rune r) {
+ Info* info = new Info();
+ info->exact_.insert(RuneToString(ToLowerRune(r)));
+ info->is_exact_ = true;
+ return info;
+ }
+
+ // Constructs an exact Info for literal rune r of a Latin1 encoded string: one single-byte string, with A-Z lowercased.
+ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
+ Info* info = new Info();
+ info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
+ info->is_exact_ = true;
+ return info;
+ }
+
+ // Constructs Info for dot (any character) or \C (any byte): an inexact Info whose match is ALL.
+ Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->match_ = new Prefilter(ALL);
+ return info;
+ }
+
+ // Constructs Prefilter::Info for no possible match: an inexact Info whose match is NONE.
+ Prefilter::Info* Prefilter::Info::NoMatch() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->match_ = new Prefilter(NONE);
+ return info;
+ }
+
+ // Constructs Prefilter::Info for any possible match.
+ // This Prefilter::Info is valid for any regular expression,
+ // since it makes no assertions whatsoever about the
+ // strings being matched. It is an inexact Info whose match is ALL.
+ Prefilter::Info* Prefilter::Info::AnyMatch() {
+ Prefilter::Info *info = new Prefilter::Info();
+ info->match_ = new Prefilter(ALL);
+ return info;
+ }
+
+ // Constructs Prefilter::Info for just the empty string: an exact Info whose set holds only "".
+ Prefilter::Info* Prefilter::Info::EmptyString() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->is_exact_ = true;
+ info->exact_.insert("");
+ return info;
+ }
+
+ // Constructs Prefilter::Info for a character class: an exact set with one lowercased string per member, or an ALL match when cc->size() exceeds 10.
+ typedef CharClass::iterator CCIter;
+ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
+ bool latin1) {
+ if (ExtraDebug) {
+ LOG(ERROR) << "CharClassInfo:";
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
+ LOG(ERROR) << " " << i->lo << "-" << i->hi;
+ }
+
+ // If the class is too large, it's okay to overestimate.
+ if (cc->size() > 10)
+ return AnyCharOrAnyByte();
+
+ Prefilter::Info *a = new Prefilter::Info();
+ for (CCIter i = cc->begin(); i != cc->end(); ++i) // Each i is a [lo, hi] range of runes.
+ for (Rune r = i->lo; r <= i->hi; r++) {
+ if (latin1) {
+ a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
+ } else {
+ a->exact_.insert(RuneToString(ToLowerRune(r)));
+ }
+ }
+
+
+ a->is_exact_ = true;
+
+ if (ExtraDebug)
+ LOG(ERROR) << " = " << a->ToString();
+
+ return a;
+ }
+
+ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { // Regexp walker that produces a Prefilter::Info* per node.
+ public:
+ Walker(bool latin1) : latin1_(latin1) {}
+
+ virtual Info* PostVisit( // Builds the Info for re from the Infos of its children.
+ Regexp* re, Info* parent_arg,
+ Info* pre_arg,
+ Info** child_args, int nchild_args);
+
+ virtual Info* ShortVisit( // Returns AnyMatch() without examining re.
+ Regexp* re,
+ Info* parent_arg);
+
+ bool latin1() { return latin1_; }
+ private:
+ bool latin1_; // Selects the Latin1 variants of the literal constructors.
+
+ Walker(const Walker&) = delete;
+ Walker& operator=(const Walker&) = delete;
+ };
+
+// Walks re and returns the Info computed for its root, or NULL when the
+// walker reports that it stopped early.
+Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
+  if (ExtraDebug)
+    LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
+
+  const bool use_latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
+  Prefilter::Info::Walker walker(use_latin1);
+  Prefilter::Info* root = walker.WalkExponential(re, NULL, 100000);
+  if (!walker.stopped_early())
+    return root;
+
+  // The result of an interrupted walk is discarded.
+  delete root;
+  return NULL;
+}
+
+// Ignores both arguments and answers AnyMatch(), i.e. makes no claim
+// about the strings matched by re.
+Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
+    Regexp* re, Prefilter::Info* parent_arg) {
+  Prefilter::Info* unconstrained = AnyMatch();
+  return unconstrained;
+}
+
+// Constructs the Prefilter::Info for the given regular expression.
+// Assumes re is simplified.
+//
+// child_args[0..nchild_args) holds the Info already computed for each
+// child of re; parent_arg and pre_arg are not used here. The returned
+// Info is built by dispatching on re->op().
+// NOTE(review): Concat/And/Alt/Star/Quest/Plus appear to consume the Info
+// pointers passed to them -- confirm ownership against their definitions.
+Prefilter::Info* Prefilter::Info::Walker::PostVisit(
+ Regexp* re, Prefilter::Info* parent_arg,
+ Prefilter::Info* pre_arg, Prefilter::Info** child_args,
+ int nchild_args) {
+ Prefilter::Info *info;
+ switch (re->op()) {
+ // Unknown ops and kRegexpRepeat are not expected here (re is assumed to
+ // be simplified); they are reported and treated like the empty string.
+ default:
+ case kRegexpRepeat:
+ info = EmptyString();
+ LOG(DFATAL) << "Bad regexp op " << re->op();
+ break;
+
+ case kRegexpNoMatch:
+ info = NoMatch();
+ break;
+
+ // These ops match the empty string:
+ case kRegexpEmptyMatch: // anywhere
+ case kRegexpBeginLine: // at beginning of line
+ case kRegexpEndLine: // at end of line
+ case kRegexpBeginText: // at beginning of text
+ case kRegexpEndText: // at end of text
+ case kRegexpWordBoundary: // at word boundary
+ case kRegexpNoWordBoundary: // not at word boundary
+ info = EmptyString();
+ break;
+
+ // A single rune: use the Latin-1 helper when the walker is in Latin-1 mode.
+ case kRegexpLiteral:
+ if (latin1()) {
+ info = LiteralLatin1(re->rune());
+ }
+ else {
+ info = Literal(re->rune());
+ }
+ break;
+
+ // A literal string is handled as the concatenation of its runes.
+ // A string with zero runes yields NoMatch().
+ case kRegexpLiteralString:
+ if (re->nrunes() == 0) {
+ info = NoMatch();
+ break;
+ }
+ if (latin1()) {
+ info = LiteralLatin1(re->runes()[0]);
+ for (int i = 1; i < re->nrunes(); i++) {
+ info = Concat(info, LiteralLatin1(re->runes()[i]));
+ }
+ } else {
+ info = Literal(re->runes()[0]);
+ for (int i = 1; i < re->nrunes(); i++) {
+ info = Concat(info, Literal(re->runes()[i]));
+ }
+ }
+ break;
+
+ case kRegexpConcat: {
+ // Accumulate in info.
+ // Exact is concat of recent contiguous exact nodes.
+ info = NULL;
+ Info* exact = NULL;
+ for (int i = 0; i < nchild_args; i++) {
+ Info* ci = child_args[i]; // child info
+ // A run of exact children ends at the first non-exact child, or when
+ // the product of the two exact-set sizes would exceed 16.
+ if (!ci->is_exact() ||
+ (exact && ci->exact().size() * exact->exact().size() > 16)) {
+ // Exact run is over.
+ info = And(info, exact);
+ exact = NULL;
+ // Add this child's info.
+ info = And(info, ci);
+ } else {
+ // Append to exact run.
+ exact = Concat(exact, ci);
+ }
+ }
+ // Fold in whatever exact run is still pending after the last child.
+ info = And(info, exact);
+ }
+ break;
+
+ // Alternation: fold the children's Infos together with Alt.
+ case kRegexpAlternate:
+ info = child_args[0];
+ for (int i = 1; i < nchild_args; i++)
+ info = Alt(info, child_args[i]);
+ break;
+
+ case kRegexpStar:
+ info = Star(child_args[0]);
+ break;
+
+ case kRegexpQuest:
+ info = Quest(child_args[0]);
+ break;
+
+ case kRegexpPlus:
+ info = Plus(child_args[0]);
+ break;
+
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ // Claim nothing, except that it's not empty.
+ info = AnyCharOrAnyByte();
+ break;
+
+ case kRegexpCharClass:
+ info = CClass(re->cc(), latin1());
+ break;
+
+ case kRegexpCapture:
+ // These don't affect the set of matching strings.
+ info = child_args[0];
+ break;
+ }
+
+ if (ExtraDebug)
+ LOG(ERROR) << "BuildInfo " << re->ToString()
+ << ": " << (info ? info->ToString() : "");
+
+ return info;
+}
+
+
+// Simplifies re, builds the Info for the simplified form and returns the
+// prefilter taken from that Info. Returns NULL if re is NULL or if either
+// Simplify() or BuildInfo() yields NULL.
+Prefilter* Prefilter::FromRegexp(Regexp* re) {
+  if (re == NULL)
+    return NULL;
+
+  Regexp* simplified = re->Simplify();
+  if (simplified == NULL)
+    return NULL;
+
+  // The simplified regexp is only needed while the Info is being built.
+  Prefilter::Info* built = BuildInfo(simplified);
+  simplified->Decref();
+  if (built == NULL)
+    return NULL;
+
+  Prefilter* match = built->TakeMatch();
+  delete built;
+  return match;
+}
+
+// Renders this prefilter as text: atoms verbatim, AND children separated
+// by spaces, OR children separated by '|' inside parentheses.
+std::string Prefilter::DebugString() const {
+  switch (op_) {
+    case NONE:
+      return "*no-matches*";
+
+    case ATOM:
+      return atom_;
+
+    case ALL:
+      return "";
+
+    case AND:
+    case OR: {
+      const bool is_or = (op_ == OR);
+      std::string text = is_or ? "(" : "";
+      for (size_t i = 0; i < subs_->size(); i++) {
+        if (i != 0)
+          text += is_or ? "|" : " ";
+        const Prefilter* child = (*subs_)[i];
+        if (child != NULL)
+          text += child->DebugString();
+        else
+          text += "<nil>";
+      }
+      if (is_or)
+        text += ")";
+      return text;
+    }
+
+    default:
+      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
+      return absl::StrFormat("op%d", op_);
+  }
+}
+
+// Builds the prefilter for re2's parsed regexp. Returns NULL when re2 is
+// NULL, when it has no regexp, or when FromRegexp() returns NULL.
+Prefilter* Prefilter::FromRE2(const RE2* re2) {
+  if (re2 != NULL) {
+    Regexp* parsed = re2->Regexp();
+    if (parsed != NULL)
+      return FromRegexp(parsed);
+  }
+  return NULL;
+}
+
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/prefilter.h b/third_party/re2/src/re2/prefilter.h
new file mode 100644
index 000000000..018691dcd
--- /dev/null
+++ b/third_party/re2/src/re2/prefilter.h
@@ -0,0 +1,167 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_PREFILTER_H_
+#define RE2_PREFILTER_H_
+
+// Prefilter is the class used to extract string guards from regexps.
+// Rather than using Prefilter class directly, use FilteredRE2.
+// See filtered_re2.h
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "util/logging.h"
+
+namespace re2 {
+
+class RE2;
+
+class Regexp;
+
+// A node of the AND/OR tree of string guards extracted from a regexp.
+// Leaves are ATOM (a required string), ALL or NONE; inner nodes are AND/OR
+// over subs().
+class Prefilter {
+  // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
+ public:
+  enum Op {
+    ALL = 0,  // Everything matches
+    NONE,     // Nothing matches
+    ATOM,     // The string atom() must match
+    AND,      // All in subs() must match
+    OR,       // One of subs() must match
+  };
+
+  explicit Prefilter(Op op);
+  ~Prefilter();
+
+  // Kind of this node. Const-qualified so that it can also be read
+  // through a const Prefilter.
+  Op op() const { return op_; }
+  const std::string& atom() const { return atom_; }
+  void set_unique_id(int id) { unique_id_ = id; }
+  int unique_id() const { return unique_id_; }
+
+  // The children of the Prefilter node.
+  std::vector<Prefilter*>* subs() {
+    DCHECK(op_ == AND || op_ == OR);
+    return subs_;
+  }
+
+  // Set the children vector. Prefilter takes ownership of subs and
+  // subs_ will be deleted when Prefilter is deleted.
+  void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
+
+  // Given a RE2, return a Prefilter. The caller takes ownership of
+  // the Prefilter and should deallocate it. Returns NULL if Prefilter
+  // cannot be formed.
+  static Prefilter* FromRE2(const RE2* re2);
+
+  // Returns a readable debug string of the prefilter.
+  std::string DebugString() const;
+
+ private:
+  // Hashes the op plus, for ATOM, the atom string and, for AND/OR, the
+  // number of children and each child's unique id. Must stay consistent
+  // with operator== below.
+  template <typename H>
+  friend H AbslHashValue(H h, const Prefilter& a) {
+    h = H::combine(std::move(h), a.op_);
+    if (a.op_ == ATOM) {
+      h = H::combine(std::move(h), a.atom_);
+    } else if (a.op_ == AND || a.op_ == OR) {
+      h = H::combine(std::move(h), a.subs_->size());
+      for (size_t i = 0; i < a.subs_->size(); ++i) {
+        h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_);
+      }
+    }
+    return h;
+  }
+
+  // Structural equality: same op and, for ATOM, the same atom string or,
+  // for AND/OR, the same sequence of child unique ids. Children are
+  // compared by id only, not recursively.
+  friend bool operator==(const Prefilter& a, const Prefilter& b) {
+    if (&a == &b) {
+      return true;
+    }
+    if (a.op_ != b.op_) {
+      return false;
+    }
+    if (a.op_ == ATOM) {
+      if (a.atom_ != b.atom_) {
+        return false;
+      }
+    } else if (a.op_ == AND || a.op_ == OR) {
+      if (a.subs_->size() != b.subs_->size()) {
+        return false;
+      }
+      for (size_t i = 0; i < a.subs_->size(); ++i) {
+        if ((*a.subs_)[i]->unique_id_ != (*b.subs_)[i]->unique_id_) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // A comparator used to store exact strings. We compare by length,
+  // then lexicographically. This ordering makes it easier to reduce the
+  // set of strings in SimplifyStringSet.
+  struct LengthThenLex {
+    bool operator()(const std::string& a, const std::string& b) const {
+      return (a.size() < b.size()) || (a.size() == b.size() && a < b);
+    }
+  };
+
+  class Info;
+
+  using SSet = std::set<std::string, LengthThenLex>;
+  using SSIter = SSet::iterator;
+  using ConstSSIter = SSet::const_iterator;
+
+  // Combines two prefilters together to create an AND. The passed
+  // Prefilters will be part of the returned Prefilter or deleted.
+  static Prefilter* And(Prefilter* a, Prefilter* b);
+
+  // Combines two prefilters together to create an OR. The passed
+  // Prefilters will be part of the returned Prefilter or deleted.
+  static Prefilter* Or(Prefilter* a, Prefilter* b);
+
+  // Generalized And/Or
+  static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
+
+  static Prefilter* FromRegexp(Regexp* a);
+
+  static Prefilter* FromString(const std::string& str);
+
+  static Prefilter* OrStrings(SSet* ss);
+
+  static Info* BuildInfo(Regexp* re);
+
+  Prefilter* Simplify();
+
+  // Removes redundant strings from the set. A string is redundant if
+  // any of the other strings appear as a substring. The empty string
+  // is a special case, which is ignored.
+  static void SimplifyStringSet(SSet* ss);
+
+  // Adds the cross-product of a and b to dst.
+  // (For each string i in a and j in b, add i+j.)
+  static void CrossProduct(const SSet& a, const SSet& b, SSet* dst);
+
+  // Kind of Prefilter.
+  Op op_;
+
+  // Sub-matches for AND or OR Prefilter.
+  std::vector<Prefilter*>* subs_;
+
+  // Actual string to match in leaf node.
+  std::string atom_;
+
+  // If different prefilters have the same string atom, or if they are
+  // structurally the same (e.g., OR of same atom strings) they are
+  // considered the same unique nodes. This is the id for each unique
+  // node. PrefilterTree assigns it through set_unique_id(); a duplicate
+  // node receives the id of the node it duplicates.
+  int unique_id_;
+
+  Prefilter(const Prefilter&) = delete;
+  Prefilter& operator=(const Prefilter&) = delete;
+};
+
+} // namespace re2
+
+#endif // RE2_PREFILTER_H_
diff --git a/third_party/re2/src/re2/prefilter_tree.cc b/third_party/re2/src/re2/prefilter_tree.cc
new file mode 100644
index 000000000..3afb241c9
--- /dev/null
+++ b/third_party/re2/src/re2/prefilter_tree.cc
@@ -0,0 +1,374 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/prefilter_tree.h"
+
+#include <stddef.h>
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "re2/prefilter.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+static const bool ExtraDebug = false;
+
+// Default tree: not yet compiled, minimum atom length of 3.
+PrefilterTree::PrefilterTree()
+    : PrefilterTree(3) {
+}
+
+// Tree that is not yet compiled and uses the given minimum atom length.
+PrefilterTree::PrefilterTree(int min_atom_len)
+    : compiled_(false),
+      min_atom_len_(min_atom_len) {
+}
+
+// Deletes every prefilter that was stored by Add (NULL entries are fine
+// to delete).
+PrefilterTree::~PrefilterTree() {
+  for (Prefilter* prefilter : prefilter_vec_)
+    delete prefilter;
+}
+
+// Registers the prefilter for the next regexp. The tree owns every
+// prefilter handed to Add: stored ones are deleted by the destructor and
+// ones that KeepNode() rejects are deleted here, with NULL stored in their
+// place so that the index in prefilter_vec_ still equals the regexp id.
+void PrefilterTree::Add(Prefilter* prefilter) {
+  if (compiled_) {
+    LOG(DFATAL) << "Add called after Compile.";
+    // The prefilter is not stored on this path, so release it here;
+    // otherwise it would leak in builds where DFATAL does not abort.
+    delete prefilter;
+    return;
+  }
+  if (prefilter != NULL && !KeepNode(prefilter)) {
+    delete prefilter;
+    prefilter = NULL;
+  }
+
+  prefilter_vec_.push_back(prefilter);
+}
+
+// Assigns unique ids to all added prefilters and fills atom_vec with the
+// atoms to match. May only take effect once.
+void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
+  if (compiled_) {
+    LOG(DFATAL) << "Compile called already.";
+    return;
+  }
+
+  // Some legacy users of PrefilterTree call Compile() before
+  // adding any regexps and expect Compile() to have no effect.
+  if (!prefilter_vec_.empty()) {
+    compiled_ = true;
+
+    NodeSet unique_nodes;
+    AssignUniqueIds(&unique_nodes, atom_vec);
+    if (ExtraDebug)
+      PrintDebugInfo(&unique_nodes);
+  }
+}
+
+// Looks node up in nodes and returns the stored node that compares equal
+// to it, or NULL when there is none.
+Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
+  NodeSet::const_iterator found = nodes->find(node);
+  if (found == nodes->end())
+    return NULL;
+  return *found;
+}
+
+// Decides whether node is worth keeping in the tree.
+//  - ALL / NONE: never kept.
+//  - ATOM: kept when the atom has at least min_atom_len_ characters.
+//  - AND: children that are not kept are deleted and removed in place;
+//         the node is kept if at least one child remains.
+//  - OR: kept only if every child is kept.
+bool PrefilterTree::KeepNode(Prefilter* node) const {
+  if (node == NULL)
+    return false;
+
+  switch (node->op()) {
+    case Prefilter::ALL:
+    case Prefilter::NONE:
+      return false;
+
+    case Prefilter::ATOM:
+      return node->atom().size() >= static_cast<size_t>(min_atom_len_);
+
+    case Prefilter::AND: {
+      std::vector<Prefilter*>* children = node->subs();
+      int kept = 0;
+      for (size_t i = 0; i < children->size(); i++) {
+        Prefilter* child = (*children)[i];
+        if (KeepNode(child)) {
+          // Compact the surviving children towards the front.
+          (*children)[kept] = child;
+          kept++;
+        } else {
+          delete child;
+        }
+      }
+
+      children->resize(kept);
+      return kept > 0;
+    }
+
+    case Prefilter::OR: {
+      std::vector<Prefilter*>* children = node->subs();
+      for (size_t i = 0; i < children->size(); i++) {
+        if (!KeepNode((*children)[i]))
+          return false;
+      }
+      return true;
+    }
+
+    default:
+      LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
+      return false;
+  }
+}
+
+// Assigns a unique id to every structurally distinct node reachable from
+// prefilter_vec_, fills atom_vec / atom_index_to_id_ with the distinct
+// atoms, builds entries_ (parents, regexps, propagate_up_at_count) and
+// finally prunes some child->parent edges of AND nodes.
+//   nodes:    set used to detect structurally equal nodes; it receives
+//             one canonical node per unique id.
+//   atom_vec: cleared, then filled with one string per unique ATOM node.
+void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
+ std::vector<std::string>* atom_vec) {
+ atom_vec->clear();
+
+ // Build vector of all filter nodes, sorted topologically
+ // from top to bottom in v.
+ std::vector<Prefilter*> v;
+
+ // Add the top level nodes of each regexp prefilter.
+ for (size_t i = 0; i < prefilter_vec_.size(); i++) {
+ Prefilter* f = prefilter_vec_[i];
+ if (f == NULL)
+ unfiltered_.push_back(static_cast<int>(i));
+
+ // We push NULL also on to v, so that we maintain the
+ // mapping of index==regexpid for level=0 prefilter nodes.
+ v.push_back(f);
+ }
+
+ // Now add all the descendant nodes.
+ // v grows while it is being scanned, so children of children are
+ // picked up by the same loop.
+ for (size_t i = 0; i < v.size(); i++) {
+ Prefilter* f = v[i];
+ if (f == NULL)
+ continue;
+ if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
+ const std::vector<Prefilter*>& subs = *f->subs();
+ for (size_t j = 0; j < subs.size(); j++)
+ v.push_back(subs[j]);
+ }
+ }
+
+ // Identify unique nodes.
+ // v is walked back to front, so descendants receive their ids before
+ // the nodes that contain them.
+ int unique_id = 0;
+ for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
+ Prefilter *node = v[i];
+ if (node == NULL)
+ continue;
+ node->set_unique_id(-1);
+ Prefilter* canonical = CanonicalNode(nodes, node);
+ if (canonical == NULL) {
+ // Any further nodes that have the same atom/subs
+ // will find this node as the canonical node.
+ nodes->emplace(node);
+ if (node->op() == Prefilter::ATOM) {
+ atom_vec->push_back(node->atom());
+ atom_index_to_id_.push_back(unique_id);
+ }
+ node->set_unique_id(unique_id++);
+ } else {
+ // Duplicate of an already registered node: share its id.
+ node->set_unique_id(canonical->unique_id());
+ }
+ }
+ entries_.resize(unique_id);
+
+ // Fill the entries.
+ // Only canonical nodes contribute; duplicates are skipped.
+ for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
+ Prefilter* prefilter = v[i];
+ if (prefilter == NULL)
+ continue;
+ if (CanonicalNode(nodes, prefilter) != prefilter)
+ continue;
+ int id = prefilter->unique_id();
+ switch (prefilter->op()) {
+ default:
+ LOG(DFATAL) << "Unexpected op: " << prefilter->op();
+ return;
+
+ case Prefilter::ATOM:
+ entries_[id].propagate_up_at_count = 1;
+ break;
+
+ case Prefilter::OR:
+ case Prefilter::AND: {
+ // For each child, we append our id to the child's list of
+ // parent ids... unless we happen to have done so already.
+ // The number of appends is the number of unique children,
+ // which allows correct upward propagation from AND nodes.
+ int up_count = 0;
+ for (size_t j = 0; j < prefilter->subs()->size(); j++) {
+ int child_id = (*prefilter->subs())[j]->unique_id();
+ std::vector<int>& parents = entries_[child_id].parents;
+ if (parents.empty() || parents.back() != id) {
+ parents.push_back(id);
+ up_count++;
+ }
+ }
+ entries_[id].propagate_up_at_count =
+ prefilter->op() == Prefilter::AND ? up_count : 1;
+ break;
+ }
+ }
+ }
+
+ // For top level nodes, populate regexp id.
+ for (size_t i = 0; i < prefilter_vec_.size(); i++) {
+ if (prefilter_vec_[i] == NULL)
+ continue;
+ int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
+ DCHECK_LE(0, id);
+ Entry* entry = &entries_[id];
+ entry->regexps.push_back(static_cast<int>(i));
+ }
+
+ // Lastly, using probability-based heuristics, we identify nodes
+ // that trigger too many parents and then we try to prune edges.
+ // We use logarithms below to avoid the likelihood of underflow.
+ // NOTE(review): if every regexp is unfiltered the argument below is 0;
+ // in that case v holds no AND nodes, so the value is never used.
+ double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
+ // Hoisted this above the loop so that we don't thrash the heap.
+ std::vector<std::pair<size_t, int>> entries_by_num_edges;
+ for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
+ Prefilter* prefilter = v[i];
+ // Pruning applies only to AND nodes because it "just" reduces
+ // precision; applied to OR nodes, it would break correctness.
+ if (prefilter == NULL || prefilter->op() != Prefilter::AND)
+ continue;
+ if (CanonicalNode(nodes, prefilter) != prefilter)
+ continue;
+ int id = prefilter->unique_id();
+
+ // Sort the current node's children by the numbers of parents.
+ entries_by_num_edges.clear();
+ for (size_t j = 0; j < prefilter->subs()->size(); j++) {
+ int child_id = (*prefilter->subs())[j]->unique_id();
+ const std::vector<int>& parents = entries_[child_id].parents;
+ entries_by_num_edges.emplace_back(parents.size(), child_id);
+ }
+ std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
+
+ // A running estimate of how many regexps will be triggered by
+ // pruning the remaining children's edges to the current node.
+ // Our nominal target is one, so the threshold is log(1) == 0;
+ // pruning occurs iff the child has more than nine edges left.
+ double log_num_triggered = log_num_regexps;
+ for (const auto& pair : entries_by_num_edges) {
+ int child_id = pair.second;
+ std::vector<int>& parents = entries_[child_id].parents;
+ if (log_num_triggered > 0.) {
+ log_num_triggered += std::log(parents.size());
+ log_num_triggered -= log_num_regexps;
+ } else if (parents.size() > 9) {
+ // Drop the edge child -> this AND node and lower the number of
+ // children the AND node waits for accordingly.
+ auto it = std::find(parents.begin(), parents.end(), id);
+ if (it != parents.end()) {
+ parents.erase(it);
+ entries_[id].propagate_up_at_count--;
+ }
+ }
+ }
+ }
+}
+
+// Functions for triggering during search.
+// Given the indices (into the atom vector returned by Compile) of the
+// atoms that matched, fills regexps with the sorted ids of the regexps
+// that have to be run. Regexps without a prefilter are always included.
+// Atom indices outside the range handed out by Compile are reported via
+// LOG(DFATAL) and skipped instead of being used as an array index.
+void PrefilterTree::RegexpsGivenStrings(
+    const std::vector<int>& matched_atoms,
+    std::vector<int>* regexps) const {
+  regexps->clear();
+  if (!compiled_) {
+    // Some legacy users of PrefilterTree call Compile() before
+    // adding any regexps and expect Compile() to have no effect.
+    // This kludge is a counterpart to that kludge.
+    if (prefilter_vec_.empty())
+      return;
+
+    // Without a compiled tree nothing can be filtered: report everything.
+    LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
+    for (size_t i = 0; i < prefilter_vec_.size(); i++)
+      regexps->push_back(static_cast<int>(i));
+  } else {
+    IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
+    std::vector<int> matched_atom_ids;
+    matched_atom_ids.reserve(matched_atoms.size());
+    for (int atom_index : matched_atoms) {
+      // The indices come from the caller; never index out of bounds.
+      if (atom_index < 0 ||
+          static_cast<size_t>(atom_index) >= atom_index_to_id_.size()) {
+        LOG(DFATAL) << "RegexpsGivenStrings: bad atom index " << atom_index;
+        continue;
+      }
+      matched_atom_ids.push_back(atom_index_to_id_[atom_index]);
+    }
+    PropagateMatch(matched_atom_ids, &regexps_map);
+    for (IntMap::const_iterator it = regexps_map.begin();
+         it != regexps_map.end();
+         ++it)
+      regexps->push_back(it->index());
+
+    regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
+  }
+  std::sort(regexps->begin(), regexps->end());
+}
+
+// Starting from the entries of the matched atoms, walks up the parent
+// edges and marks in regexps every regexp attached to a triggered entry.
+// A parent whose propagate_up_at_count exceeds 1 is triggered only once
+// that many of its children have triggered it.
+void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
+                                   IntMap* regexps) const {
+  const int num_entries = static_cast<int>(entries_.size());
+  IntMap count(num_entries);
+  IntMap work(num_entries);
+  for (int atom_id : atom_ids)
+    work.set(atom_id, 1);
+
+  // Entries added to work inside the loop are visited by this same loop.
+  for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
+    const Entry& current = entries_[it->index()];
+
+    // Record regexps triggered.
+    for (int regexp_id : current.regexps)
+      regexps->set(regexp_id, 1);
+
+    // Pass trigger up to parents.
+    for (int parent_id : current.parents) {
+      const Entry& parent = entries_[parent_id];
+      if (parent.propagate_up_at_count > 1) {
+        // Delay until all the children have succeeded.
+        int seen;
+        if (!count.has_index(parent_id)) {
+          seen = 1;
+          count.set_new(parent_id, seen);
+        } else {
+          seen = count.get_existing(parent_id) + 1;
+          count.set_existing(parent_id, seen);
+        }
+        if (seen < parent.propagate_up_at_count)
+          continue;
+      }
+      // Trigger the parent.
+      work.set(parent_id, 1);
+    }
+  }
+}
+
+// Debugging help.
+// Logs the prefilter stored for regexp number regexpid.
+// An out-of-range id, or a regexp whose prefilter slot is NULL (Add stores
+// NULL for prefilters it does not keep), is reported instead of being
+// dereferenced.
+void PrefilterTree::PrintPrefilter(int regexpid) {
+  if (regexpid < 0 ||
+      static_cast<size_t>(regexpid) >= prefilter_vec_.size()) {
+    LOG(ERROR) << "PrintPrefilter: bad regexp id " << regexpid;
+    return;
+  }
+  Prefilter* prefilter = prefilter_vec_[regexpid];
+  if (prefilter == NULL) {
+    LOG(ERROR) << "PrintPrefilter: regexp " << regexpid
+               << " has no prefilter";
+    return;
+  }
+  LOG(ERROR) << DebugNodeString(prefilter);
+}
+
+// Logs the number of unique atoms and nodes, then each entry's parent and
+// regexp counts followed by its parent ids, then the id of every node in
+// nodes.
+void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
+  LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
+  LOG(ERROR) << "#Unique Nodes: " << entries_.size();
+
+  size_t entry_id = 0;
+  for (const Entry& entry : entries_) {
+    LOG(ERROR) << "EntryId: " << entry_id
+               << " N: " << entry.parents.size()
+               << " R: " << entry.regexps.size();
+    for (int parent_id : entry.parents)
+      LOG(ERROR) << parent_id;
+    ++entry_id;
+  }
+
+  LOG(ERROR) << "Set:";
+  for (Prefilter* unique_node : *nodes)
+    LOG(ERROR) << "NodeId: " << unique_node->unique_id();
+}
+
+// Recursively renders node: an ATOM as its string, any other node as
+// AND(...) or OR(...) listing "<child unique id>:<child rendering>" for
+// each child, separated by commas.
+std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
+  if (node->op() == Prefilter::ATOM) {
+    DCHECK(!node->atom().empty());
+    return node->atom();
+  }
+
+  // Adding the operation disambiguates AND and OR nodes.
+  std::string text = node->op() == Prefilter::AND ? "AND" : "OR";
+  text += "(";
+  for (size_t i = 0; i < node->subs()->size(); i++) {
+    if (i != 0)
+      text += ',';
+    Prefilter* child = (*node->subs())[i];
+    text += absl::StrFormat("%d", child->unique_id());
+    text += ":";
+    text += DebugNodeString(child);
+  }
+  text += ")";
+  return text;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/prefilter_tree.h b/third_party/re2/src/re2/prefilter_tree.h
new file mode 100644
index 000000000..71e7a294f
--- /dev/null
+++ b/third_party/re2/src/re2/prefilter_tree.h
@@ -0,0 +1,152 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_PREFILTER_TREE_H_
+#define RE2_PREFILTER_TREE_H_
+
+// The PrefilterTree class is used to form an AND-OR tree of strings
+// that would trigger each regexp. The 'prefilter' of each regexp is
+// added to PrefilterTree, and then PrefilterTree is used to find all
+// the unique strings across the prefilters. During search, by using
+// matches from a string matching engine, PrefilterTree deduces the
+// set of regexps that are to be triggered. The 'string matching
+// engine' itself is outside of this class, and the caller can use any
+// favorite engine. PrefilterTree provides a set of strings (called
+// atoms) that the user of this class should use to do the string
+// matching.
+
+#include <string>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "re2/prefilter.h"
+#include "re2/sparse_array.h"
+#include "util/logging.h"
+
+namespace re2 {
+
+// Collects the prefilters of a set of regexps, shares structurally equal
+// nodes between them and, given the atoms found in a text, reports which
+// regexps may match.
+class PrefilterTree {
+ public:
+ // The default constructor uses a minimum atom length of 3.
+ PrefilterTree();
+ explicit PrefilterTree(int min_atom_len);
+ ~PrefilterTree();
+
+ // Adds the prefilter for the next regexp. Note that we assume that
+ // Add is called sequentially for all regexps. All Add calls
+ // must precede Compile.
+ void Add(Prefilter* prefilter);
+
+ // Compile returns a vector of strings in atom_vec.
+ // Call this after all the prefilters are added through Add.
+ // No calls to Add after Compile are allowed.
+ // The caller should use the returned set of strings to do string matching.
+ // Each time a string matches, the corresponding index then has to be
+ // passed to RegexpsGivenStrings below.
+ void Compile(std::vector<std::string>* atom_vec);
+
+ // Given the indices of the atoms that matched, returns the indexes
+ // of regexps that should be searched. The matched_atoms should
+ // contain all the ids of string atoms that were found to match the
+ // content. The caller can use any string match engine to perform
+ // this function. This function is thread safe.
+ void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
+ std::vector<int>* regexps) const;
+
+ // Print debug prefilter. Also prints unique ids associated with
+ // nodes of the prefilter of the regexp.
+ void PrintPrefilter(int regexpid);
+
+ private:
+ using IntMap = SparseArray<int>;
+
+ // Hashes a node through its AbslHashValue overload.
+ struct PrefilterHash {
+ size_t operator()(const Prefilter* a) const {
+ DCHECK(a != NULL);
+ return absl::Hash<Prefilter>()(*a);
+ }
+ };
+
+ // Compares two nodes through Prefilter's operator==.
+ struct PrefilterEqual {
+ bool operator()(const Prefilter* a, const Prefilter* b) const {
+ DCHECK(a != NULL);
+ DCHECK(b != NULL);
+ return *a == *b;
+ }
+ };
+
+ // Set of canonical nodes, keyed by node structure rather than address.
+ using NodeSet =
+ absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>;
+
+ // Each unique node has a corresponding Entry that helps in
+ // passing the matching trigger information along the tree.
+ struct Entry {
+ public:
+ // How many children should match before this node triggers the
+ // parent. For an atom and an OR node, this is 1 and for an AND
+ // node, it is the number of unique children.
+ int propagate_up_at_count;
+
+ // When this node is ready to trigger the parent, what are the indices
+ // of the parent nodes to trigger. The reason there may be more than
+ // one is because of sharing. For example (abc | def) and (xyz | def)
+ // are two different nodes, but they share the atom 'def'. So when
+ // 'def' matches, it triggers two parents, corresponding to the two
+ // different OR nodes.
+ std::vector<int> parents;
+
+ // When this node is ready to trigger the parent, what are the
+ // regexps that are triggered.
+ std::vector<int> regexps;
+ };
+
+ // Returns true if the prefilter node should be kept.
+ bool KeepNode(Prefilter* node) const;
+
+ // This function assigns unique ids to various parts of the
+ // prefilter, by looking at if these nodes are already in the
+ // PrefilterTree.
+ void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec);
+
+ // Given the matching atoms, find the regexps to be triggered.
+ void PropagateMatch(const std::vector<int>& atom_ids,
+ IntMap* regexps) const;
+
+ // Returns the prefilter node that has the same atom/subs as this
+ // node. For the canonical node, returns node. Assumes that the
+ // children of node have already been assigned unique ids.
+ Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node);
+
+ // Recursively constructs a readable prefilter string.
+ std::string DebugNodeString(Prefilter* node) const;
+
+ // Used for debugging.
+ void PrintDebugInfo(NodeSet* nodes);
+
+ // These are all the nodes formed by Compile. Essentially, there is
+ // one node for each unique atom and each unique AND/OR node.
+ std::vector<Entry> entries_;
+
+ // indices of regexps that always pass through the filter (since we
+ // found no required literals in these regexps).
+ std::vector<int> unfiltered_;
+
+ // vector of Prefilter for all regexps.
+ std::vector<Prefilter*> prefilter_vec_;
+
+ // Atom index in returned strings to entry id mapping.
+ std::vector<int> atom_index_to_id_;
+
+ // Has the prefilter tree been compiled.
+ bool compiled_;
+
+ // Strings less than this length are not stored as atoms.
+ const int min_atom_len_;
+
+ PrefilterTree(const PrefilterTree&) = delete;
+ PrefilterTree& operator=(const PrefilterTree&) = delete;
+};
+
+} // namespace
+
+#endif // RE2_PREFILTER_TREE_H_
diff --git a/third_party/re2/src/re2/prog.cc b/third_party/re2/src/re2/prog.cc
new file mode 100644
index 000000000..6cadcfa83
--- /dev/null
+++ b/third_party/re2/src/re2/prog.cc
@@ -0,0 +1,1174 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compiled regular expression representation.
+// Tested by compile_test.cc
+
+#include "re2/prog.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#endif
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "re2/bitmap256.h"
+
+namespace re2 {
+
+// Constructors per Inst opcode
+
+// Turns a still-blank instruction (out_opcode_ == 0) into an Alt with
+// the two successors out and out1.
+void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstAlt);
+ out1_ = out1;
+}
+
+// Turns a still-blank instruction into a ByteRange with successor out.
+// Only the low 8 bits of lo and hi and the lowest bit of foldcase are
+// stored.
+void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstByteRange);
+ lo_ = lo & 0xFF;
+ hi_ = hi & 0xFF;
+ hint_foldcase_ = foldcase&1;
+}
+
+// Turns a still-blank instruction into a Capture that records cap and
+// continues at out.
+void Prog::Inst::InitCapture(int cap, uint32_t out) {
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstCapture);
+ cap_ = cap;
+}
+
+// Turns a still-blank instruction into an EmptyWidth that stores the
+// given empty-width op and continues at out.
+void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstEmptyWidth);
+ empty_ = empty;
+}
+
+// Turns a still-blank instruction into a Match carrying match id `id`.
+void Prog::Inst::InitMatch(int32_t id) {
+ DCHECK_EQ(out_opcode_, 0);
+ set_opcode(kInstMatch);
+ match_id_ = id;
+}
+
+// Turns a still-blank instruction (out_opcode_ == 0) into a Nop whose
+// successor is out.
+void Prog::Inst::InitNop(uint32_t out) {
+  DCHECK_EQ(out_opcode_, 0);
+  // Store the successor together with the opcode, as the other Init*
+  // helpers that take `out` do, so that the argument is not silently
+  // ignored.
+  set_out_opcode(out, kInstNop);
+}
+
+// Turns a still-blank instruction into a Fail.
+void Prog::Inst::InitFail() {
+ DCHECK_EQ(out_opcode_, 0);
+ set_opcode(kInstFail);
+}
+
+// Returns a one-line textual rendering of this instruction; the layout
+// depends on the opcode. Unknown opcodes are rendered as "opcode N".
+std::string Prog::Inst::Dump() {
+  switch (opcode()) {
+    case kInstFail:
+      return absl::StrFormat("fail");
+
+    case kInstNop:
+      return absl::StrFormat("nop -> %d", out());
+
+    case kInstMatch:
+      return absl::StrFormat("match! %d", match_id());
+
+    case kInstEmptyWidth:
+      return absl::StrFormat("emptywidth %#x -> %d",
+                             static_cast<int>(empty_), out());
+
+    case kInstCapture:
+      return absl::StrFormat("capture %d -> %d", cap_, out());
+
+    case kInstByteRange:
+      return absl::StrFormat("byte%s [%02x-%02x] %d -> %d",
+                             foldcase() ? "/i" : "",
+                             lo_, hi_, hint(), out());
+
+    case kInstAltMatch:
+      return absl::StrFormat("altmatch -> %d | %d", out(), out1_);
+
+    case kInstAlt:
+      return absl::StrFormat("alt -> %d | %d", out(), out1_);
+
+    default:
+      return absl::StrFormat("opcode %d", static_cast<int>(opcode()));
+  }
+}
+
+// Creates an empty program: every flag is false, every counter, size and
+// start index is 0, and both DFA pointers are NULL.
+Prog::Prog()
+ : anchor_start_(false),
+ anchor_end_(false),
+ reversed_(false),
+ did_flatten_(false),
+ did_onepass_(false),
+ start_(0),
+ start_unanchored_(0),
+ size_(0),
+ bytemap_range_(0),
+ prefix_foldcase_(false),
+ prefix_size_(0),
+ list_count_(0),
+ bit_state_text_max_size_(0),
+ dfa_mem_(0),
+ dfa_first_(NULL),
+ dfa_longest_(NULL) {
+}
+
+// Releases both DFAs and, when prefix_foldcase_ is set, the prefix_dfa_
+// array.
+// NOTE(review): presumably prefix_dfa_ is only allocated when
+// prefix_foldcase_ is true -- confirm against the code that sets it.
+Prog::~Prog() {
+ DeleteDFA(dfa_longest_);
+ DeleteDFA(dfa_first_);
+ if (prefix_foldcase_)
+ delete[] prefix_dfa_;
+}
+
+typedef SparseSet Workq;
+
+// Inserts id into q, except for id 0, which is never queued.
+static inline void AddToQueue(Workq* q, int id) {
+  if (id == 0)
+    return;
+  q->insert(id);
+}
+
+// Dumps every instruction reachable from the ids already in q, one per
+// line as "<id>. <dump>". Successors are appended to q while it is being
+// scanned, so the same loop reaches them too.
+static std::string ProgToString(Prog* prog, Workq* q) {
+  std::string text;
+  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
+    const int id = *it;
+    Prog::Inst* inst = prog->inst(id);
+    text += absl::StrFormat("%d. %s\n", id, inst->Dump());
+    AddToQueue(q, inst->out());
+    switch (inst->opcode()) {
+      case kInstAlt:
+      case kInstAltMatch:
+        // Only these two opcodes have a second successor.
+        AddToQueue(q, inst->out1());
+        break;
+      default:
+        break;
+    }
+  }
+  return text;
+}
+
+// Dumps the instructions with ids in [start, prog->size()), one per line.
+// The id is followed by '.' when the instruction's last() flag is set and
+// by '+' otherwise.
+static std::string FlattenedProgToString(Prog* prog, int start) {
+  std::string text;
+  const int end = prog->size();
+  for (int id = start; id < end; id++) {
+    Prog::Inst* inst = prog->inst(id);
+    text += inst->last()
+                ? absl::StrFormat("%d. %s\n", id, inst->Dump())
+                : absl::StrFormat("%d+ %s\n", id, inst->Dump());
+  }
+  return text;
+}
+
+// Dumps the program starting at start_, using the flattened layout if the
+// program has been flattened and the reachability walk otherwise.
+std::string Prog::Dump() {
+  if (!did_flatten_) {
+    Workq reachable(size_);
+    AddToQueue(&reachable, start_);
+    return ProgToString(this, &reachable);
+  }
+  return FlattenedProgToString(this, start_);
+}
+
+// Same as Dump(), but starting at start_unanchored_.
+std::string Prog::DumpUnanchored() {
+  if (!did_flatten_) {
+    Workq reachable(size_);
+    AddToQueue(&reachable, start_unanchored_);
+    return ProgToString(this, &reachable);
+  }
+  return FlattenedProgToString(this, start_unanchored_);
+}
+
+// Renders bytemap_ as one line per maximal run of consecutive bytes that
+// map to the same value: "[lo-hi] -> value".
+std::string Prog::DumpByteMap() {
+  std::string text;
+  int next = 0;
+  while (next < 256) {
+    const int lo = next;
+    const int value = bytemap_[lo];
+    // Extend the run while the following byte maps to the same value.
+    int hi = lo;
+    while (hi < 256-1 && bytemap_[hi+1] == value)
+      hi++;
+    text += absl::StrFormat("[%02x-%02x] -> %d\n", lo, hi, value);
+    next = hi + 1;
+  }
+  return text;
+}
+
+// Is ip a guaranteed match at end of text, perhaps after some capturing?
+// Follows Capture and Nop instructions; answers true on reaching a Match
+// and false on any other opcode.
+static bool IsMatch(Prog* prog, Prog::Inst* ip) {
+  Prog::Inst* cur = ip;
+  while (true) {
+    switch (cur->opcode()) {
+      case kInstMatch:
+        return true;
+
+      case kInstCapture:
+      case kInstNop:
+        // Transparent for this purpose: keep following the chain.
+        cur = prog->inst(cur->out());
+        break;
+
+      case kInstAlt:
+      case kInstAltMatch:
+      case kInstByteRange:
+      case kInstFail:
+      case kInstEmptyWidth:
+        return false;
+
+      default:
+        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << cur->opcode();
+        return false;
+    }
+  }
+}
+
+// Peep-hole optimizer.
+// Runs two passes over the instructions reachable from start_:
+//  1. redirect every out / out1 edge past chains of Nop instructions;
+//  2. rewrite Alt instructions of the "any byte loop or match" shape
+//     into AltMatch.
+// Both passes use q as a work queue that grows while it is scanned.
+void Prog::Optimize() {
+ Workq q(size_);
+
+ // Eliminate nops. Most are taken out during compilation
+ // but a few are hard to avoid.
+ q.clear();
+ AddToQueue(&q, start_);
+ for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
+ int id = *i;
+
+ Inst* ip = inst(id);
+ int j = ip->out();
+ Inst* jp;
+ // Skip forward while the target is a Nop; stop at target 0.
+ while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+ j = jp->out();
+ }
+ ip->set_out(j);
+ AddToQueue(&q, ip->out());
+
+ // Alt has a second successor that gets the same treatment.
+ if (ip->opcode() == kInstAlt) {
+ j = ip->out1();
+ while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+ j = jp->out();
+ }
+ ip->out1_ = j;
+ AddToQueue(&q, ip->out1());
+ }
+ }
+
+ // Insert kInstAltMatch instructions
+ // Look for
+ // ip: Alt -> j | k
+ // j: ByteRange [00-FF] -> ip
+ // k: Match
+ // or the reverse (the above is the greedy one).
+ // Rewrite Alt to AltMatch.
+ q.clear();
+ AddToQueue(&q, start_);
+ for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
+ int id = *i;
+ Inst* ip = inst(id);
+ AddToQueue(&q, ip->out());
+ if (ip->opcode() == kInstAlt)
+ AddToQueue(&q, ip->out1());
+
+ if (ip->opcode() == kInstAlt) {
+ Inst* j = inst(ip->out());
+ Inst* k = inst(ip->out1());
+ // Greedy form: first branch loops over any byte, second one matches.
+ if (j->opcode() == kInstByteRange && j->out() == id &&
+ j->lo() == 0x00 && j->hi() == 0xFF &&
+ IsMatch(this, k)) {
+ ip->set_opcode(kInstAltMatch);
+ continue;
+ }
+ // Reversed form: first branch matches, second loops over any byte.
+ if (IsMatch(this, j) &&
+ k->opcode() == kInstByteRange && k->out() == id &&
+ k->lo() == 0x00 && k->hi() == 0xFF) {
+ ip->set_opcode(kInstAltMatch);
+ }
+ }
+ }
+}
+
+// Returns the kEmpty* flags that hold at position p in text. Exactly one of
+// kEmptyWordBoundary and kEmptyNonWordBoundary is always included.
+uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) {
+  const char* const text_begin = text.data();
+  const char* const text_end = text.data() + text.size();
+  const bool at_begin = (p == text_begin);
+  const bool at_end = (p == text_end);
+  uint32_t flags = 0;
+
+  // ^ and \A
+  if (at_begin)
+    flags |= kEmptyBeginText | kEmptyBeginLine;
+  else if (p[-1] == '\n')
+    flags |= kEmptyBeginLine;
+
+  // $ and \z
+  if (at_end)
+    flags |= kEmptyEndText | kEmptyEndLine;
+  else if (p < text_end && p[0] == '\n')
+    flags |= kEmptyEndLine;
+
+  // \b and \B
+  // The edges of the text count as non-word characters, so a boundary exists
+  // exactly when the "wordness" on the two sides of p differs.
+  const bool word_before = !at_begin && IsWordChar(p[-1]);
+  const bool word_after = !at_end && IsWordChar(p[0]);
+  if (word_before != word_after)
+    flags |= kEmptyWordBoundary;
+  else
+    flags |= kEmptyNonWordBoundary;
+
+  return flags;
+}
+
+// ByteMapBuilder implements a coloring algorithm.
+//
+// The first phase is a series of "mark and merge" batches: we mark one or more
+// [lo-hi] ranges, then merge them into our internal state. Batching is not for
+// performance; rather, it means that the ranges are treated indistinguishably.
+//
+// Internally, the ranges are represented using a bitmap that stores the splits
+// and a vector that stores the colors; both of them are indexed by the ranges'
+// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
+// hi (if not already split), then recolor each range in between. The color map
+// (i.e. from the old color to the new color) is maintained for the lifetime of
+// the batch and so underpins this somewhat obscure approach to set operations.
+//
+// The second phase builds the bytemap from our internal state: we recolor each
+// range, then store the new color (which is now the byte class) in each of the
+// corresponding array elements. Finally, we output the number of byte classes.
+class ByteMapBuilder {
+ public:
+  // Initial state: the [0-255] range has color 256.
+  // This will avoid problems during the second phase,
+  // in which we assign byte classes numbered from 0.
+  ByteMapBuilder() : nextcolor_(257) {
+    splits_.Set(255);
+    colors_[255] = 256;
+  }
+
+  // Queues the [lo-hi] range for the current batch.
+  void Mark(int lo, int hi);
+  // Applies the queued ranges to the internal state and ends the batch.
+  void Merge();
+  // Writes the byte class of every byte into bytemap and the number of byte
+  // classes into *bytemap_range.
+  void Build(uint8_t* bytemap, int* bytemap_range);
+
+ private:
+  // Maps oldcolor to its replacement color, allocating one if necessary.
+  int Recolor(int oldcolor);
+
+  Bitmap256 splits_;                            // bit set at each range's last byte
+  int colors_[256];                             // color of each range, indexed by its last byte
+  int nextcolor_;                               // next color that Recolor() will hand out
+  std::vector<std::pair<int, int>> colormap_;   // (old color, new color) pairs
+  std::vector<std::pair<int, int>> ranges_;     // (lo, hi) pairs marked but not yet merged
+
+  ByteMapBuilder(const ByteMapBuilder&) = delete;
+  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
+};
+
+// Queues the [lo-hi] range for the next Merge(). Both bounds must lie within
+// [0, 255] and lo must not exceed hi (checked in debug builds only).
+void ByteMapBuilder::Mark(int lo, int hi) {
+  DCHECK_GE(lo, 0);
+  DCHECK_GE(hi, 0);
+  DCHECK_LE(lo, 255);
+  DCHECK_LE(hi, 255);
+  DCHECK_LE(lo, hi);
+
+  // Ignore any [0-255] ranges. They cause us to recolor every range, which
+  // has no effect on the eventual result and is therefore a waste of time.
+  const bool covers_every_byte = (lo == 0 && hi == 255);
+  if (!covers_every_byte)
+    ranges_.emplace_back(lo, hi);
+}
+
+// Merges every range queued by Mark() into splits_/colors_, then resets the
+// per-batch state (colormap_ and ranges_).
+void ByteMapBuilder::Merge() {
+  // Makes sure that some range ends at `pos`. A newly created range takes
+  // the color of the next range boundary above it.
+  auto split_at = [this](int pos) {
+    if (splits_.Test(pos))
+      return;
+    splits_.Set(pos);
+    colors_[pos] = colors_[splits_.FindNextSetBit(pos+1)];
+  };
+
+  for (const std::pair<int, int>& range : ranges_) {
+    const int lo = range.first-1;
+    const int hi = range.second;
+
+    if (lo >= 0)
+      split_at(lo);
+    split_at(hi);
+
+    // Recolor each range from the one starting at lo+1 through the one that
+    // ends at hi.
+    for (int c = lo+1; c < 256; ) {
+      const int next = splits_.FindNextSetBit(c);
+      colors_[next] = Recolor(colors_[next]);
+      if (next == hi)
+        break;
+      c = next+1;
+    }
+  }
+  colormap_.clear();
+  ranges_.clear();
+}
+
+// Fills bytemap[0..255] with byte classes and stores the number of classes
+// in *bytemap_range.
+void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
+  // Assign byte classes numbered from 0.
+  nextcolor_ = 0;
+
+  for (int c = 0; c < 256; ) {
+    // `last` is the final byte of the range that contains c.
+    const int last = splits_.FindNextSetBit(c);
+    const uint8_t cls = static_cast<uint8_t>(Recolor(colors_[last]));
+    for (; c <= last; c++)
+      bytemap[c] = cls;
+  }
+
+  *bytemap_range = nextcolor_;
+}
+
+// Returns the color that replaces oldcolor, recording a new mapping in
+// colormap_ (and consuming nextcolor_) when none exists yet.
+int ByteMapBuilder::Recolor(int oldcolor) {
+  // Yes, this is a linear search. There can be at most 256
+  // colors and there will typically be far fewer than that.
+  // Also, we need to consider keys *and* values in order to
+  // avoid recoloring a given range more than once per batch.
+  for (const std::pair<int, int>& kv : colormap_) {
+    if (kv.first == oldcolor || kv.second == oldcolor)
+      return kv.second;
+  }
+  const int newcolor = nextcolor_++;
+  colormap_.emplace_back(oldcolor, newcolor);
+  return newcolor;
+}
+
+// Computes bytemap_ and bytemap_range_ from the ByteRange and EmptyWidth
+// instructions of the program.
+void Prog::ComputeByteMap() {
+  // Fill in bytemap with byte classes for the program.
+  // Ranges of bytes that are treated indistinguishably
+  // will be mapped to a single byte class.
+  ByteMapBuilder builder;
+
+  // Each kind of boundary only needs to be marked once.
+  bool marked_line_boundaries = false;  // ^ and $
+  bool marked_word_boundaries = false;  // \b and \B
+
+  for (int id = 0; id < size(); id++) {
+    Inst* ip = inst(id);
+    const auto op = ip->opcode();
+
+    if (op == kInstByteRange) {
+      const int lo = ip->lo();
+      const int hi = ip->hi();
+      builder.Mark(lo, hi);
+      if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
+        // Clamp to [a-z], then mark the corresponding uppercase range.
+        const int foldlo = std::max<int>(lo, 'a');
+        const int foldhi = std::min<int>(hi, 'z');
+        if (foldlo <= foldhi)
+          builder.Mark(foldlo + ('A' - 'a'), foldhi + ('A' - 'a'));
+      }
+      // If this Inst is not the last Inst in its list AND the next Inst is
+      // also a ByteRange AND the Insts have the same out, defer the merge.
+      const bool defer_merge = !ip->last() &&
+                               inst(id+1)->opcode() == kInstByteRange &&
+                               ip->out() == inst(id+1)->out();
+      if (!defer_merge)
+        builder.Merge();
+    } else if (op == kInstEmptyWidth) {
+      if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
+          !marked_line_boundaries) {
+        builder.Mark('\n', '\n');
+        builder.Merge();
+        marked_line_boundaries = true;
+      }
+      if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
+          !marked_word_boundaries) {
+        // We require two batches here: the first for ranges that are word
+        // characters, the second for ranges that are not word characters.
+        for (bool isword : {true, false}) {
+          int i = 0;
+          while (i < 256) {
+            // [i, j) is a maximal run of bytes with the same word-ness.
+            const bool run_is_word = Prog::IsWordChar(static_cast<uint8_t>(i));
+            int j = i + 1;
+            while (j < 256 &&
+                   Prog::IsWordChar(static_cast<uint8_t>(j)) == run_is_word)
+              j++;
+            if (run_is_word == isword)
+              builder.Mark(i, j - 1);
+            i = j;
+          }
+          builder.Merge();
+        }
+        marked_word_boundaries = true;
+      }
+    }
+  }
+
+  builder.Build(bytemap_, &bytemap_range_);
+
+  if ((0)) {  // For debugging, use trivial bytemap.
+    LOG(ERROR) << "Using trivial bytemap.";
+    for (int i = 0; i < 256; i++)
+      bytemap_[i] = static_cast<uint8_t>(i);
+    bytemap_range_ = 256;
+  }
+}
+
+// Prog::Flatten() implements a graph rewriting algorithm.
+//
+// The overall process is similar to epsilon removal, but retains some epsilon
+// transitions: those from Capture and EmptyWidth instructions; and those from
+// nullable subexpressions. (The latter avoids quadratic blowup in transitions
+// in the worst case.) It might be best thought of as Alt instruction elision.
+//
+// In conceptual terms, it divides the Prog into "trees" of instructions, then
+// traverses the "trees" in order to produce "lists" of instructions. A "tree"
+// is one or more instructions that grow from one "root" instruction to one or
+// more "leaf" instructions; if a "tree" has exactly one instruction, then the
+// "root" is also the "leaf". In most cases, a "root" is the successor of some
+// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
+// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
+// EmptyWidth or Match instruction. However, this is insufficient for handling
+// nested nullable subexpressions correctly, so in some cases, a "root" is the
+// dominator of the instructions reachable from some "successor root" (i.e. it
+// has an unreachable predecessor) and is considered a "dominator root". Since
+// only Alt instructions can be "dominator roots" (other instructions would be
+// "leaves"), only Alt instructions are required to be marked as predecessors.
+//
+// Dividing the Prog into "trees" comprises two passes: marking the "successor
+// roots" and the predecessors; and marking the "dominator roots". Sorting the
+// "successor roots" by their bytecode offsets enables iteration in order from
+// greatest to least during the second pass; by working backwards in this case
+// and flooding the graph no further than "leaves" and already marked "roots",
+// it becomes possible to mark "dominator roots" without doing excessive work.
+//
+// Traversing the "trees" is just iterating over the "roots" in order of their
+// marking and flooding the graph no further than "leaves" and "roots". When a
+// "leaf" is reached, the instruction is copied with its successor remapped to
+// its "root" number. When a "root" is reached, a Nop instruction is generated
+// with its successor remapped similarly. As each "list" is produced, its last
+// instruction is marked as such. After all of the "lists" have been produced,
+// a pass over their instructions remaps their successors to bytecode offsets.
+void Prog::Flatten() {
+  // Flattening happens at most once per Prog; later calls are no-ops.
+  if (did_flatten_)
+    return;
+  did_flatten_ = true;
+
+  // Scratch structures.  It's important that these are reused by functions
+  // that we call in loops because they would thrash the heap otherwise.
+  SparseSet reachable(size());
+  std::vector<int> stk;
+  stk.reserve(size());
+
+  // First pass: Marks "successor roots" and predecessors.
+  // Builds the mapping from inst-ids to root-ids.
+  SparseArray<int> rootmap(size());
+  SparseArray<int> predmap(size());
+  std::vector<std::vector<int>> predvec;
+  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
+
+  // Second pass: Marks "dominator roots".
+  // The loop walks the sorted copy backwards and stops before
+  // sorted.begin(), so the first sorted entry is never passed to
+  // MarkDominator().
+  SparseArray<int> sorted(rootmap);
+  std::sort(sorted.begin(), sorted.end(), sorted.less);
+  for (SparseArray<int>::const_iterator i = sorted.end() - 1;
+       i != sorted.begin();
+       --i) {
+    if (i->index() != start_unanchored() && i->index() != start())
+      MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
+  }
+
+  // Third pass: Emits "lists". Remaps outs to root-ids.
+  // Builds the mapping from root-ids to flat-ids.
+  std::vector<int> flatmap(rootmap.size());
+  std::vector<Inst> flat;
+  flat.reserve(size());
+  for (SparseArray<int>::const_iterator i = rootmap.begin();
+       i != rootmap.end();
+       ++i) {
+    flatmap[i->value()] = static_cast<int>(flat.size());
+    EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
+    flat.back().set_last();
+    // We have the bounds of the "list", so this is the
+    // most convenient point at which to compute hints.
+    ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
+  }
+
+  list_count_ = static_cast<int>(flatmap.size());
+  for (int i = 0; i < kNumInst; i++)
+    inst_count_[i] = 0;
+
+  // Fourth pass: Remaps outs to flat-ids.
+  // Counts instructions by opcode.
+  for (int id = 0; id < static_cast<int>(flat.size()); id++) {
+    Inst* ip = &flat[id];
+    if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
+      ip->set_out(flatmap[ip->out()]);
+    inst_count_[ip->opcode()]++;
+  }
+
+#if !defined(NDEBUG)
+  // Address a `-Wunused-but-set-variable' warning from Clang 13.x.
+  size_t total = 0;
+  for (int i = 0; i < kNumInst; i++)
+    total += inst_count_[i];
+  CHECK_EQ(total, flat.size());
+#endif
+
+  // Remap start_unanchored and start.
+  // MarkSuccessors() hands out root-ids in marking order: 0 for the Fail
+  // instruction, then start_unanchored, then start (if distinct).
+  if (start_unanchored() == 0) {
+    DCHECK_EQ(start(), 0);
+  } else if (start_unanchored() == start()) {
+    set_start_unanchored(flatmap[1]);
+    set_start(flatmap[1]);
+  } else {
+    set_start_unanchored(flatmap[1]);
+    set_start(flatmap[2]);
+  }
+
+  // Finally, replace the old instructions with the new instructions.
+  size_ = static_cast<int>(flat.size());
+  inst_ = PODArray<Inst>(size_);
+  memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
+
+  // Populate the list heads for BitState.
+  // 512 instructions limits the memory footprint to 1KiB.
+  if (size_ <= 512) {
+    list_heads_ = PODArray<uint16_t>(size_);
+    // 0xFF makes it more obvious if we try to look up a non-head.
+    memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
+    for (int i = 0; i < list_count_; ++i)
+      list_heads_[flatmap[i]] = i;
+  }
+
+  // BitState allocates a bitmap of size list_count_ * (text.size()+1)
+  // for tracking pairs of possibilities that it has already explored.
+  const size_t kBitStateBitmapMaxSize = 256*1024;  // max size in bits
+  // The quotient is the number of text positions that fit in the bitmap.
+  // It is zero once list_count_ exceeds the bitmap size; subtracting one
+  // from that unsigned zero would wrap around to SIZE_MAX, so clamp to 0.
+  const size_t max_positions = kBitStateBitmapMaxSize / list_count_;
+  bit_state_text_max_size_ = max_positions > 0 ? max_positions - 1 : 0;
+}
+
+void Prog::MarkSuccessors(SparseArray<int>* rootmap,  // out: inst-id -> root-id, numbered in marking order
+ SparseArray<int>* predmap,  // out: inst-id -> index into *predvec
+ std::vector<std::vector<int>>* predvec,  // out: for each target, the ids of Alt/AltMatch instructions pointing at it
+ SparseSet* reachable, std::vector<int>* stk) {  // scratch: visited set and work stack, both cleared below
+ // Mark the kInstFail instruction.
+ rootmap->set_new(0, rootmap->size());  // root-ids are handed out as the current size of rootmap
+
+ // Mark the start_unanchored and start instructions.
+ if (!rootmap->has_index(start_unanchored()))
+ rootmap->set_new(start_unanchored(), rootmap->size());
+ if (!rootmap->has_index(start()))
+ rootmap->set_new(start(), rootmap->size());
+
+ reachable->clear();
+ stk->clear();
+ stk->push_back(start_unanchored());  // the traversal is seeded with start_unanchored only
+ while (!stk->empty()) {
+ int id = stk->back();
+ stk->pop_back();
+ Loop:  // jumped to in order to follow a successor directly, without pushing it on the stack
+ if (reachable->contains(id))
+ continue;  // already visited: take the next id from the stack
+ reachable->insert_new(id);
+
+ Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstAltMatch:
+ case kInstAlt:
+ // Mark this instruction as a predecessor of each out.
+ for (int out : {ip->out(), ip->out1()}) {
+ if (!predmap->has_index(out)) {
+ predmap->set_new(out, static_cast<int>(predvec->size()));  // first predecessor seen for out: allocate its list
+ predvec->emplace_back();
+ }
+ (*predvec)[predmap->get_existing(out)].emplace_back(id);
+ }
+ stk->push_back(ip->out1());  // visit out1 later...
+ id = ip->out();  // ...and out right away
+ goto Loop;
+
+ case kInstByteRange:
+ case kInstCapture:
+ case kInstEmptyWidth:
+ // Mark the out of this instruction as a "root".
+ if (!rootmap->has_index(ip->out()))
+ rootmap->set_new(ip->out(), rootmap->size());
+ id = ip->out();
+ goto Loop;
+
+ case kInstNop:
+ id = ip->out();  // Nops are followed without marking anything
+ goto Loop;
+
+ case kInstMatch:
+ case kInstFail:
+ break;  // no successors to follow
+ }
+ }
+}
+
+void Prog::MarkDominator(int root, SparseArray<int>* rootmap,  // root: inst-id to flood from; rootmap: inst-id -> root-id (may gain entries)
+ SparseArray<int>* predmap,  // inst-id -> index into *predvec
+ std::vector<std::vector<int>>* predvec,  // lists of predecessor ids, as indexed by predmap
+ SparseSet* reachable, std::vector<int>* stk) {  // scratch: visited set and work stack, both cleared below
+ reachable->clear();
+ stk->clear();
+ stk->push_back(root);
+ while (!stk->empty()) {
+ int id = stk->back();
+ stk->pop_back();
+ Loop:  // jumped to in order to follow a successor directly, without pushing it on the stack
+ if (reachable->contains(id))
+ continue;
+ reachable->insert_new(id);  // note: another root is recorded as reachable even though it is not expanded
+
+ if (id != root && rootmap->has_index(id)) {
+ // We reached another "tree" via epsilon transition.
+ continue;
+ }
+
+ Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstAltMatch:
+ case kInstAlt:
+ stk->push_back(ip->out1());  // visit out1 later...
+ id = ip->out();  // ...and out right away
+ goto Loop;
+
+ case kInstByteRange:
+ case kInstCapture:
+ case kInstEmptyWidth:
+ break;  // the flood does not continue past these
+
+ case kInstNop:
+ id = ip->out();
+ goto Loop;
+
+ case kInstMatch:
+ case kInstFail:
+ break;
+ }
+ }
+
+ for (SparseSet::const_iterator i = reachable->begin();  // examine everything the flood above reached
+ i != reachable->end();
+ ++i) {
+ int id = *i;
+ if (predmap->has_index(id)) {
+ for (int pred : (*predvec)[predmap->get_existing(id)]) {
+ if (!reachable->contains(pred)) {
+ // id has a predecessor that cannot be reached from root!
+ // Therefore, id must be a "root" too - mark it as such.
+ if (!rootmap->has_index(id))
+ rootmap->set_new(id, rootmap->size());
+ }
+ }
+ }
+ }
+}
+
+void Prog::EmitList(int root, SparseArray<int>* rootmap,  // root: inst-id whose "list" is emitted; rootmap: inst-id -> root-id
+ std::vector<Inst>* flat,  // out: emitted instructions are appended here
+ SparseSet* reachable, std::vector<int>* stk) {  // scratch: visited set and work stack, both cleared below
+ reachable->clear();
+ stk->clear();
+ stk->push_back(root);
+ while (!stk->empty()) {
+ int id = stk->back();
+ stk->pop_back();
+ Loop:  // jumped to in order to follow a successor directly, without pushing it on the stack
+ if (reachable->contains(id))
+ continue;
+ reachable->insert_new(id);
+
+ if (id != root && rootmap->has_index(id)) {
+ // We reached another "tree" via epsilon transition. Emit a kInstNop
+ // instruction so that the Prog does not become quadratically larger.
+ flat->emplace_back();
+ flat->back().set_opcode(kInstNop);
+ flat->back().set_out(rootmap->get_existing(id));  // out holds a root-id at this stage
+ continue;
+ }
+
+ Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstAltMatch:
+ flat->emplace_back();
+ flat->back().set_opcode(kInstAltMatch);
+ flat->back().set_out(static_cast<int>(flat->size()));  // index of the slot right after this AltMatch
+ flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;  // index of the slot after that one
+ ABSL_FALLTHROUGH_INTENDED;
+
+ case kInstAlt:
+ stk->push_back(ip->out1());  // visit out1 later...
+ id = ip->out();  // ...and out right away
+ goto Loop;
+
+ case kInstByteRange:
+ case kInstCapture:
+ case kInstEmptyWidth:
+ flat->emplace_back();
+ memmove(&flat->back(), ip, sizeof *ip);  // copy the instruction as-is, then fix up out below
+ flat->back().set_out(rootmap->get_existing(ip->out()));  // out holds a root-id at this stage
+ break;
+
+ case kInstNop:
+ id = ip->out();  // Nops inside a "tree" are skipped, not emitted
+ goto Loop;
+
+ case kInstMatch:
+ case kInstFail:
+ flat->emplace_back();
+ memmove(&flat->back(), ip, sizeof *ip);  // copied unchanged
+ break;
+ }
+ }
+}
+
+// For each ByteRange instruction in [begin, end), computes a hint to execution
+// engines: the delta to the next instruction (in flat) worth exploring iff the
+// current instruction matched.
+//
+// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
+// colors are instructions and recoloring ranges precisely identifies conflicts
+// between instructions. Iterating backwards over [begin, end) is guaranteed to
+// identify the nearest conflict (if any) with only linear complexity.
+void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {  // processes (*flat)[begin, end) in place
+ Bitmap256 splits;  // bit set at the last byte of each range
+ int colors[256];  // color (an instruction id) of each range, indexed by its last byte
+
+ bool dirty = false;  // true once splits may contain bits other than 255
+ for (int id = end; id >= begin; --id) {  // starts at end itself, which acts as a sentinel and is never dereferenced
+ if (id == end ||
+ (*flat)[id].opcode() != kInstByteRange) {
+ if (dirty) {
+ dirty = false;
+ splits.Clear();
+ }
+ splits.Set(255);
+ colors[255] = id;
+ // At this point, the [0-255] range is colored with id.
+ // Thus, hints cannot point beyond id; and if id == end,
+ // hints that would have pointed to id will be 0 instead.
+ continue;
+ }
+ dirty = true;
+
+ // We recolor the [lo-hi] range with id. Note that first ratchets backwards
+ // from end to the nearest conflict (if any) during recoloring.
+ int first = end;
+ auto Recolor = [&](int lo, int hi) {  // captures splits, colors, first and id by reference
+ // Like ByteMapBuilder, we split at lo-1 and at hi.
+ --lo;
+
+ if (0 <= lo && !splits.Test(lo)) {
+ splits.Set(lo);
+ int next = splits.FindNextSetBit(lo+1);
+ colors[lo] = colors[next];  // the new range keeps the color of the range it was split from
+ }
+ if (!splits.Test(hi)) {
+ splits.Set(hi);
+ int next = splits.FindNextSetBit(hi+1);
+ colors[hi] = colors[next];
+ }
+
+ int c = lo+1;
+ while (c < 256) {
+ int next = splits.FindNextSetBit(c);
+ // Ratchet backwards...
+ first = std::min(first, colors[next]);
+ // Recolor with id - because it's the new nearest conflict!
+ colors[next] = id;
+ if (next == hi)
+ break;
+ c = next+1;
+ }
+ };
+
+ Inst* ip = &(*flat)[id];
+ int lo = ip->lo();
+ int hi = ip->hi();
+ Recolor(lo, hi);
+ if (ip->foldcase() && lo <= 'z' && hi >= 'a') {  // case folding: also recolor the uppercase counterpart
+ int foldlo = lo;
+ int foldhi = hi;
+ if (foldlo < 'a')
+ foldlo = 'a';
+ if (foldhi > 'z')
+ foldhi = 'z';
+ if (foldlo <= foldhi) {
+ foldlo += 'A' - 'a';
+ foldhi += 'A' - 'a';
+ Recolor(foldlo, foldhi);
+ }
+ }
+
+ if (first != end) {  // first == end leaves the hint bits untouched
+ uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));  // delta to the conflicting instruction, capped at 32767
+ ip->hint_foldcase_ |= hint<<1;  // the hint is stored above bit 0 of hint_foldcase_
+ }
+ }
+}
+
+// The final state will always be this, which frees up a register for the hot
+// loop and thus avoids the spilling that can occur when building with Clang.
+static const size_t kShiftDFAFinal = 9;  // also used by ConfigurePrefixAccel() to cap the number of prefix bytes
+
+// This function takes the prefix as std::string (i.e. not const std::string&
+// as normal) because it's going to clobber it, so a temporary is convenient.
+static uint64_t* BuildShiftDFA(std::string prefix) {  // returns a 256-entry table allocated with new[]
+ // This constant is for convenience now and also for correctness later when
+ // we clobber the prefix, but still need to know how long it was initially.
+ const size_t size = prefix.size();
+
+ // Construct the NFA.
+ // The table is indexed by input byte; each element is a bitfield of states
+ // reachable by the input byte. Given a bitfield of the current states, the
+ // bitfield of states reachable from those is - for this specific purpose -
+ // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives
+ // the bitfield of the next states reached by stepping over the input byte.
+ // Credits for this technique: the Hyperscan paper by Geoff Langdale et al.
+ uint16_t nfa[256]{};
+ for (size_t i = 0; i < size; ++i) {
+ uint8_t b = prefix[i];
+ nfa[b] |= 1 << (i+1);  // byte b can move the NFA into state i+1
+ }
+ // This is the `\C*?` for unanchored search.
+ for (int b = 0; b < 256; ++b)
+ nfa[b] |= 1;
+
+ // This maps from DFA state to NFA states; the reverse mapping is used when
+ // recording transitions and gets implemented with plain old linear search.
+ // The "Shift DFA" technique limits this to ten states when using uint64_t;
+ // to allow for the initial state, we use at most nine bytes of the prefix.
+ // That same limit is also why uint16_t is sufficient for the NFA bitfield.
+ uint16_t states[kShiftDFAFinal+1]{};
+ states[0] = 1;
+ for (size_t dcurr = 0; dcurr < size; ++dcurr) {
+ uint8_t b = prefix[dcurr];
+ uint16_t ncurr = states[dcurr];
+ uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
+ size_t dnext = dcurr+1;
+ if (dnext == size)
+ dnext = kShiftDFAFinal;  // the state after the last prefix byte is always stored in the final slot
+ states[dnext] = nnext;
+ }
+
+ // Sort and unique the bytes of the prefix to avoid repeating work while we
+ // record transitions. This clobbers the prefix, but it's no longer needed.
+ std::sort(prefix.begin(), prefix.end());
+ prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());
+
+ // Construct the DFA.
+ // The table is indexed by input byte; each element is effectively a packed
+ // array of uint6_t; each array value will be multiplied by six in order to
+ // avoid having to do so later in the hot loop as well as masking/shifting.
+ // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen.
+ uint64_t* dfa = new uint64_t[256]{};  // zero-initialised
+ // Record a transition from each state for each of the bytes of the prefix.
+ // Note that all other input bytes go back to the initial state by default.
+ for (size_t dcurr = 0; dcurr < size; ++dcurr) {
+ for (uint8_t b : prefix) {  // b is a copy, so modifying it below does not touch prefix
+ uint16_t ncurr = states[dcurr];
+ uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
+ size_t dnext = 0;
+ while (states[dnext] != nnext)  // linear search for the DFA state with this NFA state set; has no bounds check
+ ++dnext;
+ dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
+ // Convert ASCII letters to uppercase and record the extra transitions.
+ // Note that ASCII letters are guaranteed to be lowercase at this point
+ // because that's how the parser normalises them. #FunFact: 'k' and 's'
+ // match U+212A and U+017F, respectively, so they won't occur here when
+ // using UTF-8 encoding because the parser will emit character classes.
+ if ('a' <= b && b <= 'z') {
+ b -= 'a' - 'A';
+ dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
+ }
+ }
+ }
+ // This lets the final state "saturate", which will matter for performance:
+ // in the hot loop, we check for a match only at the end of each iteration,
+ // so we must keep signalling the match until we get around to checking it.
+ for (int b = 0; b < 256; ++b)
+ dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);
+
+ return dfa;
+}
+
+// Records the prefix information used for prefix acceleration: the Shift DFA
+// table for a case-folded prefix, otherwise the first byte of the prefix
+// and, unless the prefix is exactly one byte long, its last byte.
+void Prog::ConfigurePrefixAccel(const std::string& prefix,
+                                bool prefix_foldcase) {
+  prefix_foldcase_ = prefix_foldcase;
+  prefix_size_ = prefix.size();
+
+  if (prefix_foldcase_) {
+    // Use PrefixAccel_ShiftDFA().
+    // ... and no more than nine bytes of the prefix. (See above for details.)
+    prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
+    prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
+    return;
+  }
+
+  // A one-byte prefix uses memchr(3) and needs only the front byte;
+  // anything else uses PrefixAccel_FrontAndBack() and needs both.
+  prefix_front_ = prefix.front();
+  if (prefix_size_ != 1)
+    prefix_back_ = prefix.back();
+}
+
+const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {  // scans data[0, size) using prefix_dfa_; NULL when the final state is never reached
+ if (size < prefix_size_)
+ return NULL;
+
+ uint64_t curr = 0;  // only the low six bits (curr & 63) are ever used: the shift amount for the next lookup
+
+ // At the time of writing, rough benchmarks on a Broadwell machine showed
+ // that this unroll factor (i.e. eight) achieves a speedup factor of two.
+ if (size >= 8) {
+ const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
+ const uint8_t* endp = p + (size&~7);  // end of the last whole 8-byte block
+ do {
+ uint8_t b0 = p[0];
+ uint8_t b1 = p[1];
+ uint8_t b2 = p[2];
+ uint8_t b3 = p[3];
+ uint8_t b4 = p[4];
+ uint8_t b5 = p[5];
+ uint8_t b6 = p[6];
+ uint8_t b7 = p[7];
+
+ uint64_t next0 = prefix_dfa_[b0];
+ uint64_t next1 = prefix_dfa_[b1];
+ uint64_t next2 = prefix_dfa_[b2];
+ uint64_t next3 = prefix_dfa_[b3];
+ uint64_t next4 = prefix_dfa_[b4];
+ uint64_t next5 = prefix_dfa_[b5];
+ uint64_t next6 = prefix_dfa_[b6];
+ uint64_t next7 = prefix_dfa_[b7];
+
+ uint64_t curr0 = next0 >> (curr & 63);  // each step shifts the byte's table entry by the previous result's low six bits
+ uint64_t curr1 = next1 >> (curr0 & 63);
+ uint64_t curr2 = next2 >> (curr1 & 63);
+ uint64_t curr3 = next3 >> (curr2 & 63);
+ uint64_t curr4 = next4 >> (curr3 & 63);
+ uint64_t curr5 = next5 >> (curr4 & 63);
+ uint64_t curr6 = next6 >> (curr5 & 63);
+ uint64_t curr7 = next7 >> (curr6 & 63);
+
+ if ((curr7 & 63) == kShiftDFAFinal * 6) {  // checked once per block, after all eight steps
+ // At the time of writing, using the same masking subexpressions from
+ // the preceding lines caused Clang to clutter the hot loop computing
+ // them - even though they aren't actually needed for shifting! Hence
+ // these rewritten conditions, which achieve a speedup factor of two.
+ if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;  // first step whose low six bits equal curr7's
+ if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
+ if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
+ if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
+ if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
+ if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
+ if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
+ if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;  // always true
+ }
+
+ curr = curr7;
+ p += 8;
+ } while (p != endp);
+ data = p;
+ size = size&7;  // the 0..7 bytes left over are handled one at a time below
+ }
+
+ const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
+ const uint8_t* endp = p + size;
+ while (p != endp) {
+ uint8_t b = *p++;
+ uint64_t next = prefix_dfa_[b];
+ curr = next >> (curr & 63);
+ if ((curr & 63) == kShiftDFAFinal * 6)
+ return p-prefix_size_;  // p is already one past the byte that reached the final state
+ }
+ return NULL;
+}
+
+#if defined(__AVX2__)
+// Finds the least significant non-zero bit in n.
+static int FindLSBSet(uint32_t n) {  // returns a bit index; n must be non-zero (checked in debug builds only)
+ DCHECK_NE(n, 0);
+#if defined(__GNUC__)
+ return __builtin_ctz(n);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+ unsigned long c;
+ _BitScanForward(&c, n);
+ return static_cast<int>(c);
+#else
+ int c = 31;  // portable fallback: narrow the position down in steps of 16, 8, 4, 2 and 1
+ for (int shift = 1 << 4; shift != 0; shift >>= 1) {
+ uint32_t word = n << shift;
+ if (word != 0) {  // a set bit survives the shift, so the lowest set bit is at least `shift` positions further down
+ n = word;
+ c -= shift;
+ }
+ }
+ return c;
+#endif
+}
+#endif
+
+const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {  // finds a position with prefix_front_ there and prefix_back_ prefix_size_-1 bytes later, or NULL
+ DCHECK_GE(prefix_size_, 2);
+ if (size < prefix_size_)
+ return NULL;
+ // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
+ // This also means that probing for prefix_back_ doesn't go out of bounds.
+ size -= prefix_size_-1;
+
+#if defined(__AVX2__)
+ // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
+ if (size >= sizeof(__m256i)) {
+ const __m256i* fp = reinterpret_cast<const __m256i*>(
+ reinterpret_cast<const char*>(data));
+ const __m256i* bp = reinterpret_cast<const __m256i*>(
+ reinterpret_cast<const char*>(data) + prefix_size_-1);  // same window, shifted to where the back byte would be
+ const __m256i* endfp = fp + size/sizeof(__m256i);
+ const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
+ const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
+ do {
+ const __m256i f_loadu = _mm256_loadu_si256(fp++);
+ const __m256i b_loadu = _mm256_loadu_si256(bp++);
+ const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
+ const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
+ const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
+ if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero.
+ const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
+ const int fb_movemask = _mm256_movemask_epi8(fb_and);
+ const int fb_ctz = FindLSBSet(fb_movemask);  // index of the first lane where both comparisons matched
+ return reinterpret_cast<const char*>(fp-1) + fb_ctz;  // fp was already advanced, hence fp-1
+ }
+ } while (fp != endfp);
+ data = fp;
+ size = size%sizeof(__m256i);  // the remainder is handled by the memchr loop below
+ }
+#endif
+
+ const char* p0 = reinterpret_cast<const char*>(data);
+ for (const char* p = p0;; p++) {  // p++ steps past a front-byte hit whose back byte did not match
+ DCHECK_GE(size, static_cast<size_t>(p-p0));
+ p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
+ if (p == NULL || p[prefix_size_-1] == prefix_back_)
+ return p;  // NULL when there are no more candidates, otherwise the candidate position
+ }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/prog.h b/third_party/re2/src/re2/prog.h
new file mode 100644
index 000000000..41923f314
--- /dev/null
+++ b/third_party/re2/src/re2/prog.h
@@ -0,0 +1,466 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_PROG_H_
+#define RE2_PROG_H_
+
+// Compiled representation of regular expressions.
+// See regexp.h for the Regexp class, which represents a regular
+// expression symbolically.
+
+#include <stdint.h>
+#include <functional>
+#include <string>
+#include <vector>
+#include <type_traits>
+
+#include "absl/base/call_once.h"
+#include "absl/strings/string_view.h"
+#include "util/logging.h"
+#include "re2/pod_array.h"
+#include "re2/re2.h"
+#include "re2/sparse_array.h"
+#include "re2/sparse_set.h"
+
+namespace re2 {
+
+// Opcodes for Inst.
+// Inst::opcode() reads the opcode from the low 3 bits of out_opcode_, so the
+// enumerators below must stay within the range 0-7.
+enum InstOp {
+ kInstAlt = 0, // choose between out_ and out1_
+ kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
+ kInstByteRange, // next (possibly case-folded) byte must be in [lo_, hi_]
+ kInstCapture, // capturing parenthesis number cap_
+ kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
+ kInstMatch, // found a match!
+ kInstNop, // no-op; occasionally unavoidable
+ kInstFail, // never match; occasionally unavoidable
+ kNumInst, // number of opcodes; sizes Prog::inst_count_
+};
+
+// Bit flags for empty-width specials.
+// Each flag occupies its own bit, so flags can be combined with bitwise OR
+// (Inst::empty_ holds such a combination).
+enum EmptyOp {
+ kEmptyBeginLine = 1<<0, // ^ - beginning of line
+ kEmptyEndLine = 1<<1, // $ - end of line
+ kEmptyBeginText = 1<<2, // \A - beginning of text
+ kEmptyEndText = 1<<3, // \z - end of text
+ kEmptyWordBoundary = 1<<4, // \b - word boundary
+ kEmptyNonWordBoundary = 1<<5, // \B - not \b
+ kEmptyAllFlags = (1<<6)-1, // mask covering all six flags above
+};
+
+class DFA;
+class Regexp;
+
+// Compiled form of regexp program.
+class Prog {
+ public:
+ Prog();
+ ~Prog();
+
+ // Single instruction in regexp program.
+ // An Inst is two 32-bit words: out_opcode_, which packs the opcode, the
+ // "last" bit and the index of the next instruction, and a union holding
+ // the opcode-specific argument.
+ class Inst {
+ public:
+ // See the assertion below for why this is so.
+ Inst() = default;
+
+ // Copyable.
+ Inst(const Inst&) = default;
+ Inst& operator=(const Inst&) = default;
+
+ // Constructors per opcode
+ void InitAlt(uint32_t out, uint32_t out1);
+ void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
+ void InitCapture(int cap, uint32_t out);
+ void InitEmptyWidth(EmptyOp empty, uint32_t out);
+ void InitMatch(int id);
+ void InitNop(uint32_t out);
+ void InitFail();
+
+ // Getters
+ // id() is this instruction's index within p's instruction array.
+ // opcode(), last() and out() unpack the three fields of out_opcode_.
+ // The opcode-specific getters DCHECK that the opcode matches the union
+ // member being read.
+ int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
+ InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
+ int last() { return (out_opcode_>>3)&1; }
+ int out() { return out_opcode_>>4; }
+ int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
+ int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
+ int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
+ int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
+ int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
+ int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
+ int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
+ EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
+
+ // For a kInstAltMatch: returns true if out() is a kInstByteRange, either
+ // directly or via a single intervening kInstNop.
+ bool greedy(Prog* p) {
+ DCHECK_EQ(opcode(), kInstAltMatch);
+ return p->inst(out())->opcode() == kInstByteRange ||
+ (p->inst(out())->opcode() == kInstNop &&
+ p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
+ }
+
+ // Does this inst (a kInstByteRange) match c?
+ // With foldcase set, ASCII 'A'-'Z' is mapped to 'a'-'z' before the
+ // range check; no other bytes are folded.
+ inline bool Matches(int c) {
+ DCHECK_EQ(opcode(), kInstByteRange);
+ if (foldcase() && 'A' <= c && c <= 'Z')
+ c += 'a' - 'A';
+ return lo_ <= c && c <= hi_;
+ }
+
+ // Returns string representation for debugging.
+ std::string Dump();
+
+ // Maximum instruction id.
+ // (Must fit in out_opcode_. PatchList/last steal another bit.)
+ static const int kMaxInst = (1<<28) - 1;
+
+ private:
+ // Each setter below rebuilds out_opcode_ from its three fields, replacing
+ // only the field(s) named and carrying the others over unchanged.
+ void set_opcode(InstOp opcode) {
+ out_opcode_ = (out()<<4) | (last()<<3) | opcode;
+ }
+
+ void set_last() {
+ out_opcode_ = (out()<<4) | (1<<3) | opcode();
+ }
+
+ void set_out(int out) {
+ out_opcode_ = (out<<4) | (last()<<3) | opcode();
+ }
+
+ // Replaces both out and opcode; the "last" bit is preserved.
+ void set_out_opcode(int out, InstOp opcode) {
+ out_opcode_ = (out<<4) | (last()<<3) | opcode;
+ }
+
+ uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
+ union { // additional instruction arguments:
+ uint32_t out1_; // opcode == kInstAlt
+ // alternate next instruction
+
+ int32_t cap_; // opcode == kInstCapture
+ // Index of capture register (holds text
+ // position recorded by capturing parentheses).
+ // For \n (the submatch for the nth parentheses),
+ // the left parenthesis captures into register 2*n
+ // and the right one captures into register 2*n+1.
+
+ int32_t match_id_; // opcode == kInstMatch
+ // Match ID to identify this match (for re2::Set).
+
+ struct { // opcode == kInstByteRange
+ uint8_t lo_; // byte range is lo_-hi_ inclusive
+ uint8_t hi_; //
+ uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
+ // hint to execution engines: the delta to the
+ // next instruction (in the current list) worth
+ // exploring iff this instruction matched; 0
+ // means there are no remaining possibilities,
+ // which is most likely for character classes.
+ // foldcase: A-Z -> a-z before checking range.
+ };
+
+ EmptyOp empty_; // opcode == kInstEmptyWidth
+ // empty_ is bitwise OR of kEmpty* flags above.
+ };
+
+ friend class Compiler;
+ friend struct PatchList;
+ friend class Prog;
+ };
+
+ // Inst must be trivial so that we can freely clear it with memset(3).
+ // Arrays of Inst are initialised by copying the initial elements with
+ // memmove(3) and then clearing any remaining elements with memset(3).
+ static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
+
+ // Whether to anchor the search.
+ enum Anchor {
+ kUnanchored, // match anywhere
+ kAnchored, // match only starting at beginning of text
+ };
+
+ // Kind of match to look for (for anchor != kFullMatch)
+ //
+ // kLongestMatch mode finds the overall longest
+ // match but still makes its submatch choices the way
+ // Perl would, not in the way prescribed by POSIX.
+ // The POSIX rules are much more expensive to implement,
+ // and no one has needed them.
+ //
+ // kFullMatch is not strictly necessary -- we could use
+ // kLongestMatch and then check the length of the match -- but
+ // the matching code can run faster if it knows to consider only
+ // full matches.
+ enum MatchKind {
+ kFirstMatch, // like Perl, PCRE
+ kLongestMatch, // like egrep or POSIX
+ kFullMatch, // match only entire text; implies anchor==kAnchored
+ kManyMatch // for SearchDFA, records set of matches
+ };
+
+ // Trivial accessors for the private state declared at the bottom of the
+ // class; see the comments on the corresponding members for their meaning.
+ // inst(id) returns a pointer into the instruction array.
+ Inst *inst(int id) { return &inst_[id]; }
+ int start() { return start_; }
+ void set_start(int start) { start_ = start; }
+ int start_unanchored() { return start_unanchored_; }
+ void set_start_unanchored(int start) { start_unanchored_ = start; }
+ int size() { return size_; }
+ bool reversed() { return reversed_; }
+ void set_reversed(bool reversed) { reversed_ = reversed; }
+ int list_count() { return list_count_; }
+ int inst_count(InstOp op) { return inst_count_[op]; }
+ // Raw pointer to the list heads array (see CanBitState()).
+ uint16_t* list_heads() { return list_heads_.data(); }
+ size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
+ int64_t dfa_mem() { return dfa_mem_; }
+ void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
+ bool anchor_start() { return anchor_start_; }
+ void set_anchor_start(bool b) { anchor_start_ = b; }
+ bool anchor_end() { return anchor_end_; }
+ void set_anchor_end(bool b) { anchor_end_ = b; }
+ int bytemap_range() { return bytemap_range_; }
+ const uint8_t* bytemap() { return bytemap_; }
+ // True when a non-empty prefix is recorded (prefix_size_ != 0).
+ bool can_prefix_accel() { return prefix_size_ != 0; }
+
+ // Skips ahead to the first likely occurrence of the prefix within
+ // data[0, size). Returns a pointer to its first byte, or NULL if there
+ // is no such occurrence. Requires can_prefix_accel().
+ const void* PrefixAccel(const void* data, size_t size) {
+   DCHECK(can_prefix_accel());
+   // A case-insensitive prefix is searched for with the Shift DFA.
+   if (prefix_foldcase_)
+     return PrefixAccel_ShiftDFA(data, size);
+   // A one-byte prefix needs nothing more than memchr(3).
+   if (prefix_size_ == 1)
+     return memchr(data, prefix_front_, size);
+   // Otherwise, look for the first and last bytes of the prefix together.
+   return PrefixAccel_FrontAndBack(data, size);
+ }
+
+ // Configures prefix accel using the analysis performed during compilation.
+ void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
+
+ // An implementation of prefix accel that uses prefix_dfa_ to perform
+ // case-insensitive search.
+ const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
+
+ // An implementation of prefix accel that looks for prefix_front_ and
+ // prefix_back_ to return fewer false positives than memchr(3) alone.
+ const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
+
+ // Returns string representation of program for debugging.
+ std::string Dump();
+ std::string DumpUnanchored();
+ std::string DumpByteMap();
+
+ // Returns the set of kEmpty flags that are in effect at
+ // position p within context.
+ static uint32_t EmptyFlags(absl::string_view context, const char* p);
+
+ // Returns whether byte c is a word character: ASCII only.
+ // Used by the implementation of \b and \B.
+ // This is not right for Unicode, but:
+ // - it's hard to get right in a byte-at-a-time matching world
+ // (the DFA has only one-byte lookahead).
+ // - even if the lookahead were possible, the Progs would be huge.
+ // This crude approximation is the same one PCRE uses.
+ static bool IsWordChar(uint8_t c) {
+   // Word characters are exactly [A-Za-z0-9_].
+   if (c == '_')
+     return true;
+   const bool is_upper = 'A' <= c && c <= 'Z';
+   const bool is_lower = 'a' <= c && c <= 'z';
+   const bool is_digit = '0' <= c && c <= '9';
+   return is_upper || is_lower || is_digit;
+ }
+
+ // Execution engines. They all search for the regexp (run the prog)
+ // in text, which is in the larger context (used for ^ $ \b etc).
+ // Anchor and kind control the kind of search.
+ // Returns true if match found, false if not.
+ // If match found, fills match[0..nmatch-1] with submatch info.
+ // match[0] is overall match, match[1] is first set of parens, etc.
+ // If a particular submatch is not matched during the regexp match,
+ // it is set to NULL.
+ //
+ // Matching text == absl::string_view() is treated as any other empty
+ // string, but note that on return, it will not be possible to distinguish
+ // submatches that matched that empty string from submatches that didn't
+ // match anything. Either way, match[i] == NULL.
+
+ // Search using NFA: can find submatches but kind of slow.
+ bool SearchNFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch);
+
+ // Search using DFA: much faster than NFA but only finds
+ // end of match and can use a lot more memory.
+ // Returns whether a match was found.
+ // If the DFA runs out of memory, sets *failed to true and returns false.
+ // If matches != NULL and kind == kManyMatch and there is a match,
+ // SearchDFA fills matches with the match IDs of the final matching state.
+ bool SearchDFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match0,
+ bool* failed, SparseSet* matches);
+
+ // The callback issued after building each DFA state with BuildEntireDFA().
+ // If next is null, then the memory budget has been exhausted and building
+ // will halt. Otherwise, the state has been built and next points to an array
+ // of bytemap_range()+1 slots holding the next states as per the bytemap and
+ // kByteEndText. The number of the state is implied by the callback sequence:
+ // the first callback is for state 0, the second callback is for state 1, ...
+ // match indicates whether the state is a matching state.
+ using DFAStateCallback = std::function<void(const int* next, bool match)>;
+
+ // Build the entire DFA for the given match kind.
+ // Usually the DFA is built out incrementally, as needed, which
+ // avoids lots of unnecessary work.
+ // If cb is not empty, it receives one callback per state built.
+ // Returns the number of states built.
+ // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
+ int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
+
+ // Compute bytemap.
+ void ComputeByteMap();
+
+ // Run peep-hole optimizer on program.
+ void Optimize();
+
+ // One-pass NFA: only correct if IsOnePass() is true,
+ // but much faster than NFA (competitive with PCRE)
+ // for those expressions.
+ bool IsOnePass();
+ bool SearchOnePass(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch);
+
+ // Bit-state backtracking. Fast on small cases but uses memory
+ // proportional to the product of the list count and the text size.
+ bool CanBitState() {
+   // Usable only when the list heads array has been populated.
+   return list_heads_.data() != nullptr;
+ }
+ bool SearchBitState(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch);
+
+ static const int kMaxOnePassCapture = 5; // $0 through $4
+
+ // Backtracking search: the gold standard against which the other
+ // implementations are checked. FOR TESTING ONLY.
+ // It allocates a ton of memory to avoid running forever.
+ // It is also recursive, so can't use in production (will overflow stacks).
+ // The name "Unsafe" here is supposed to be a flag that
+ // you should not be using this function.
+ bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind,
+ absl::string_view* match, int nmatch);
+
+ // Computes range for any strings matching regexp. The min and max can in
+ // some cases be arbitrarily precise, so the caller gets to specify the
+ // maximum desired length of string returned.
+ //
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
+ // string s that is an anchored match for this regexp satisfies
+ // min <= s && s <= max.
+ //
+ // Note that PossibleMatchRange() will only consider the first copy of an
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
+ // do not compile down to infinite repetitions.
+ //
+ // Returns true on success, false on error.
+ bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
+
+ // Outputs the program fanout into the given sparse array.
+ void Fanout(SparseArray<int>* fanout);
+
+ // Compiles a collection of regexps to Prog. Each regexp will have
+ // its own Match instruction recording the index in the output vector.
+ static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
+
+ // Flattens the Prog from "tree" form to "list" form. This is an in-place
+ // operation in the sense that the old instructions are lost.
+ void Flatten();
+
+ // Walks the Prog; the "successor roots" or predecessors of the reachable
+ // instructions are marked in rootmap or predmap/predvec, respectively.
+ // reachable and stk are preallocated scratch structures.
+ void MarkSuccessors(SparseArray<int>* rootmap,
+ SparseArray<int>* predmap,
+ std::vector<std::vector<int>>* predvec,
+ SparseSet* reachable, std::vector<int>* stk);
+
+ // Walks the Prog from the given "root" instruction; the "dominator root"
+ // of the reachable instructions (if such exists) is marked in rootmap.
+ // reachable and stk are preallocated scratch structures.
+ void MarkDominator(int root, SparseArray<int>* rootmap,
+ SparseArray<int>* predmap,
+ std::vector<std::vector<int>>* predvec,
+ SparseSet* reachable, std::vector<int>* stk);
+
+ // Walks the Prog from the given "root" instruction; the reachable
+ // instructions are emitted in "list" form and appended to flat.
+ // reachable and stk are preallocated scratch structures.
+ void EmitList(int root, SparseArray<int>* rootmap,
+ std::vector<Inst>* flat,
+ SparseSet* reachable, std::vector<int>* stk);
+
+ // Computes hints for ByteRange instructions in [begin, end).
+ void ComputeHints(std::vector<Inst>* flat, int begin, int end);
+
+ // Controls whether the DFA should bail out early if the NFA would be faster.
+ // FOR TESTING ONLY.
+ static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
+
+ private:
+ friend class Compiler;
+
+ DFA* GetDFA(MatchKind kind);
+ void DeleteDFA(DFA* dfa);
+
+ bool anchor_start_; // regexp has explicit start anchor
+ bool anchor_end_; // regexp has explicit end anchor
+ bool reversed_; // whether program runs backward over input
+ bool did_flatten_; // has Flatten been called?
+ bool did_onepass_; // has IsOnePass been called?
+
+ int start_; // entry point for program
+ int start_unanchored_; // unanchored entry point for program
+ int size_; // number of instructions
+ int bytemap_range_; // bytemap_[x] < bytemap_range_
+
+ bool prefix_foldcase_; // whether prefix is case-insensitive
+ size_t prefix_size_; // size of prefix (0 if no prefix)
+ union {
+ uint64_t* prefix_dfa_; // "Shift DFA" for prefix
+ struct {
+ int prefix_front_; // first byte of prefix
+ int prefix_back_; // last byte of prefix
+ };
+ };
+
+ int list_count_; // count of lists (see above)
+ int inst_count_[kNumInst]; // count of instructions by opcode
+ PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
+ // not populated if size_ is overly large
+ size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size()
+
+ PODArray<Inst> inst_; // pointer to instruction array
+ PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
+
+ int64_t dfa_mem_; // Maximum memory for DFAs.
+ DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
+ DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
+
+ uint8_t bytemap_[256]; // map from input bytes to byte classes
+
+ absl::once_flag dfa_first_once_;
+ absl::once_flag dfa_longest_once_;
+
+ Prog(const Prog&) = delete;
+ Prog& operator=(const Prog&) = delete;
+};
+
+// std::string_view in MSVC has iterators that aren't just pointers and
+// that don't allow comparisons between different objects - not even if
+// those objects are views into the same string! Thus, we provide these
+// conversion functions for convenience.
+
+// Returns a pointer to the first byte of s.
+static inline const char* BeginPtr(absl::string_view s) {
+ return s.data();
+}
+// Returns a pointer one past the last byte of s.
+static inline const char* EndPtr(absl::string_view s) {
+ return s.data() + s.size();
+}
+
+} // namespace re2
+
+#endif // RE2_PROG_H_
diff --git a/third_party/re2/src/re2/re2.cc b/third_party/re2/src/re2/re2.cc
new file mode 100644
index 000000000..61d9d1f0c
--- /dev/null
+++ b/third_party/re2/src/re2/re2.cc
@@ -0,0 +1,1345 @@
+// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression interface RE2.
+//
+// Originally the PCRE C++ wrapper, but adapted to use
+// the new automata-based regular expression engines.
+
+#include "re2/re2.h"
+
+#include <ctype.h>
+#include <errno.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+#include <atomic>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/base/macros.h"
+#include "absl/container/fixed_array.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "util/strutil.h"
+#include "util/utf.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/sparse_array.h"
+
+namespace re2 {
+
+// Controls the maximum count permitted by GlobalReplace(); -1 is unlimited.
+// This is a plain file-scope int shared by every RE2 object.
+static int maximum_global_replace_count = -1;
+
+// Overrides the limit above with i. As the name says, this exists for
+// fuzzing only.
+void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) {
+ maximum_global_replace_count = i;
+}
+
+// Maximum number of args we can set
+static const int kMaxArgs = 16;
+// Size of the submatch arrays used by Replace(), GlobalReplace() and
+// Extract(): one slot for the overall match plus one per arg.
+static const int kVecSize = 1+kMaxArgs;
+
+// Out-of-class definition of the static member; no initializer here.
+const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
+
+// Builds the Options corresponding to a canned option:
+// - RE2::Latin1 selects Latin-1 encoding (everything else uses UTF-8);
+// - RE2::POSIX turns on both posix_syntax and longest_match;
+// - RE2::Quiet turns off error logging.
+// All remaining options get the fixed defaults listed below.
+RE2::Options::Options(RE2::CannedOptions opt)
+ : max_mem_(kDefaultMaxMem),
+ encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
+ posix_syntax_(opt == RE2::POSIX),
+ longest_match_(opt == RE2::POSIX),
+ log_errors_(opt != RE2::Quiet),
+ literal_(false),
+ never_nl_(false),
+ dot_nl_(false),
+ never_capture_(false),
+ case_sensitive_(true),
+ perl_classes_(false),
+ word_boundary_(false),
+ one_line_(false) {
+}
+
+// Shared empty objects that can be handed out as const references.
+// The storage is a statically allocated, suitably aligned byte buffer;
+// the objects themselves are constructed lazily (under a once flag in
+// RE2::Init()). That avoids global constructors as well as the false
+// positives (thanks, Valgrind) about memory leaks at program termination.
+struct EmptyStorage {
+  std::string empty_string;
+  std::map<std::string, int> empty_named_groups;
+  std::map<int, std::string> empty_group_names;
+};
+alignas(EmptyStorage) static char empty_storage[sizeof(EmptyStorage)];
+
+// Views the raw buffer as the EmptyStorage object that lives in it.
+static inline EmptyStorage* empty_storage_object() {
+  return reinterpret_cast<EmptyStorage*>(empty_storage);
+}
+
+// The shared empty string.
+static inline std::string* empty_string() {
+  return &empty_storage_object()->empty_string;
+}
+
+// The shared empty name -> index map.
+static inline std::map<std::string, int>* empty_named_groups() {
+  return &empty_storage_object()->empty_named_groups;
+}
+
+// The shared empty index -> name map.
+static inline std::map<int, std::string>* empty_group_names() {
+  return &empty_storage_object()->empty_group_names;
+}
+
+// Converts from Regexp error code to RE2 error code.
+// Maybe some day they will diverge. In any event, this
+// hides the existence of Regexp from RE2 users.
+static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
+ switch (code) {
+ case re2::kRegexpSuccess:
+ return RE2::NoError;
+ case re2::kRegexpInternalError:
+ return RE2::ErrorInternal;
+ case re2::kRegexpBadEscape:
+ return RE2::ErrorBadEscape;
+ case re2::kRegexpBadCharClass:
+ return RE2::ErrorBadCharClass;
+ case re2::kRegexpBadCharRange:
+ return RE2::ErrorBadCharRange;
+ case re2::kRegexpMissingBracket:
+ return RE2::ErrorMissingBracket;
+ case re2::kRegexpMissingParen:
+ return RE2::ErrorMissingParen;
+ case re2::kRegexpUnexpectedParen:
+ return RE2::ErrorUnexpectedParen;
+ case re2::kRegexpTrailingBackslash:
+ return RE2::ErrorTrailingBackslash;
+ case re2::kRegexpRepeatArgument:
+ return RE2::ErrorRepeatArgument;
+ case re2::kRegexpRepeatSize:
+ return RE2::ErrorRepeatSize;
+ case re2::kRegexpRepeatOp:
+ return RE2::ErrorRepeatOp;
+ case re2::kRegexpBadPerlOp:
+ return RE2::ErrorBadPerlOp;
+ case re2::kRegexpBadUTF8:
+ return RE2::ErrorBadUTF8;
+ case re2::kRegexpBadNamedCapture:
+ return RE2::ErrorBadNamedCapture;
+ }
+ return RE2::ErrorInternal;
+}
+
+// Returns pattern in a form suitable for log messages: unchanged when it is
+// at most 100 bytes long, otherwise its first 100 bytes followed by "...".
+static std::string trunc(absl::string_view pattern) {
+  static const size_t kMaxLoggedSize = 100;
+  // Use <= rather than <: a pattern of exactly kMaxLoggedSize bytes fits in
+  // full, so it must not get the "..." marker that signals truncation.
+  if (pattern.size() <= kMaxLoggedSize)
+    return std::string(pattern);
+  return std::string(pattern.substr(0, kMaxLoggedSize)) + "...";
+}
+
+
+// Constructors. Each one simply forwards to Init(); the first three use
+// DefaultOptions and differ only in the type the pattern is passed as.
+RE2::RE2(const char* pattern) {
+ Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const std::string& pattern) {
+ Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(absl::string_view pattern) {
+ Init(pattern, DefaultOptions);
+}
+
+// As above, but with caller-supplied options.
+RE2::RE2(absl::string_view pattern, const Options& options) {
+ Init(pattern, options);
+}
+
+// Translates these options into the equivalent set of Regexp parse flags.
+// Regexp::ClassNL is always set; every other flag depends on one option.
+int RE2::Options::ParseFlags() const {
+  int flags = Regexp::ClassNL;
+
+  // Encoding: UTF-8 needs no flag, Latin-1 has its own flag and anything
+  // else is reported (if logging is enabled) and otherwise ignored.
+  if (encoding() == RE2::Options::EncodingLatin1) {
+    flags |= Regexp::Latin1;
+  } else if (encoding() != RE2::Options::EncodingUTF8) {
+    if (log_errors())
+      LOG(ERROR) << "Unknown encoding " << encoding();
+  }
+
+  // Adds bits to flags when the associated option is switched on.
+  const auto add_if = [&flags](bool enabled, int bits) {
+    if (enabled)
+      flags |= bits;
+  };
+
+  add_if(!posix_syntax(), Regexp::LikePerl);
+  add_if(literal(), Regexp::Literal);
+  add_if(never_nl(), Regexp::NeverNL);
+  add_if(dot_nl(), Regexp::DotNL);
+  add_if(never_capture(), Regexp::NeverCapture);
+  add_if(!case_sensitive(), Regexp::FoldCase);
+  add_if(perl_classes(), Regexp::PerlClasses);
+  add_if(word_boundary(), Regexp::PerlB);
+  add_if(one_line(), Regexp::OneLine);
+
+  return flags;
+}
+
+// Common constructor body: parses pattern according to options and compiles
+// the forward Prog. On failure it records error_, error_code_ (and, for
+// parse errors, error_arg_) and returns early, leaving prog_ NULL.
+void RE2::Init(absl::string_view pattern, const Options& options) {
+ // Construct the shared empty objects exactly once, in the static buffer.
+ static absl::once_flag empty_once;
+ absl::call_once(empty_once, []() {
+ (void) new (empty_storage) EmptyStorage;
+ });
+
+ // Put every member into a well-defined "nothing compiled yet" state.
+ // error_ and error_arg_ point at the shared empty string until an actual
+ // error message has to be allocated.
+ pattern_ = new std::string(pattern);
+ options_.Copy(options);
+ entire_regexp_ = NULL;
+ suffix_regexp_ = NULL;
+ error_ = empty_string();
+ error_arg_ = empty_string();
+
+ num_captures_ = -1;
+ error_code_ = NoError;
+ longest_match_ = options_.longest_match();
+ is_one_pass_ = false;
+ prefix_foldcase_ = false;
+ prefix_.clear();
+ prog_ = NULL;
+
+ // These three are computed lazily: see ReverseProg(),
+ // NamedCapturingGroups() and CapturingGroupNames().
+ rprog_ = NULL;
+ named_groups_ = NULL;
+ group_names_ = NULL;
+
+ RegexpStatus status;
+ entire_regexp_ = Regexp::Parse(
+ *pattern_,
+ static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
+ &status);
+ if (entire_regexp_ == NULL) {
+ // Parse error: keep the details so that callers can inspect them.
+ if (options_.log_errors()) {
+ LOG(ERROR) << "Error parsing '" << trunc(*pattern_) << "': "
+ << status.Text();
+ }
+ error_ = new std::string(status.Text());
+ error_code_ = RegexpErrorToRE2(status.code());
+ error_arg_ = new std::string(status.error_arg());
+ return;
+ }
+
+ // If RequiredPrefix() succeeds, prefix_ and prefix_foldcase_ are filled in
+ // and only the suffix it hands back is compiled below. Otherwise the whole
+ // regexp is compiled; Incref() gives suffix_regexp_ its own reference,
+ // which the destructor releases separately from entire_regexp_.
+ bool foldcase;
+ re2::Regexp* suffix;
+ if (entire_regexp_->RequiredPrefix(&prefix_, &foldcase, &suffix)) {
+ prefix_foldcase_ = foldcase;
+ suffix_regexp_ = suffix;
+ }
+ else {
+ suffix_regexp_ = entire_regexp_->Incref();
+ }
+
+ // Two thirds of the memory goes to the forward Prog,
+ // one third to the reverse prog, because the forward
+ // Prog has two DFAs but the reverse prog has one.
+ prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
+ if (prog_ == NULL) {
+ if (options_.log_errors())
+ LOG(ERROR) << "Error compiling '" << trunc(*pattern_) << "'";
+ error_ = new std::string("pattern too large - compile failed");
+ error_code_ = RE2::ErrorPatternTooLarge;
+ return;
+ }
+
+ // We used to compute this lazily, but it's used during the
+ // typical control flow for a match call, so we now compute
+ // it eagerly, which avoids the overhead of absl::once_flag.
+ num_captures_ = suffix_regexp_->NumCaptures();
+
+ // Could delay this until the first match call that
+ // cares about submatch information, but the one-pass
+ // machine's memory gets cut from the DFA memory budget,
+ // and that is harder to do if the DFA has already
+ // been built.
+ is_one_pass_ = prog_->IsOnePass();
+}
+
+// Returns the reverse Prog, compiling it on first use. The result is NULL
+// if that compilation failed.
+re2::Prog* RE2::ReverseProg() const {
+  absl::call_once(rprog_once_, [](const RE2* re) {
+    // The reverse Prog gets one third of the memory budget.
+    const auto budget = re->options_.max_mem() / 3;
+    re->rprog_ = re->suffix_regexp_->CompileToReverseProg(budget);
+    if (re->rprog_ == NULL && re->options_.log_errors()) {
+      LOG(ERROR) << "Error reverse compiling '" << trunc(*re->pattern_)
+                 << "'";
+    }
+    // On failure we deliberately leave error_ and error_code_ alone:
+    // failing to compile the reverse Prog is not a showstopper because
+    // falling back to NFA execution is fine. More importantly, an RE2
+    // object is supposed to be logically immutable: whatever ok() would
+    // have returned after Init() completed, it should continue to return
+    // that no matter what ReverseProg() does.
+  }, this);
+  return rprog_;
+}
+
+// Releases everything allocated by Init() and by the lazy accessors.
+// group_names_, named_groups_, error_arg_ and error_ may point at the
+// shared empty objects, which are not owned by this RE2 and so must not
+// be deleted.
+RE2::~RE2() {
+ if (group_names_ != empty_group_names())
+ delete group_names_;
+ if (named_groups_ != empty_named_groups())
+ delete named_groups_;
+ delete rprog_;
+ delete prog_;
+ if (error_arg_ != empty_string())
+ delete error_arg_;
+ if (error_ != empty_string())
+ delete error_;
+ // Regexp objects are reference-counted: drop our references.
+ if (suffix_regexp_)
+ suffix_regexp_->Decref();
+ if (entire_regexp_)
+ entire_regexp_->Decref();
+ delete pattern_;
+}
+
+// Returns the size of the forward Prog, or -1 if there is no forward Prog.
+int RE2::ProgramSize() const {
+  return (prog_ != NULL) ? prog_->size() : -1;
+}
+
+// Returns the size of the reverse Prog, or -1 if either the forward Prog
+// or the reverse Prog is unavailable.
+int RE2::ReverseProgramSize() const {
+  // Without a forward Prog, the reverse Prog is not even attempted.
+  Prog* const reverse = (prog_ != NULL) ? ReverseProg() : NULL;
+  return (reverse != NULL) ? reverse->size() : -1;
+}
+
+// Finds the most significant non-zero bit in n.
+// Returns its zero-based index (0 for the lowest bit, 31 for the highest).
+// n must not be zero.
+static int FindMSBSet(uint32_t n) {
+ DCHECK_NE(n, 0);
+#if defined(__GNUC__)
+ // For a leading-zero count in [0, 31], 31 ^ count equals 31 - count.
+ return 31 ^ __builtin_clz(n);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+ unsigned long c;
+ _BitScanReverse(&c, n);
+ return static_cast<int>(c);
+#else
+ // Portable fallback: binary search for the top bit using shifts of
+ // 16, 8, 4, 2 and 1, accumulating the shifts that left a non-zero value.
+ int c = 0;
+ for (int shift = 1 << 4; shift != 0; shift >>= 1) {
+ uint32_t word = n >> shift;
+ if (word != 0) {
+ n = word;
+ c += shift;
+ }
+ }
+ return c;
+#endif
+}
+
+// Builds a histogram of the fanout values of prog. A non-zero value v is
+// counted in bucket ceil(log2(v)); zero values are ignored. If histogram
+// is not NULL, it receives the buckets up to and including the largest
+// one used. Returns the index of that largest bucket, or -1 if no bucket
+// was used.
+static int Fanout(Prog* prog, std::vector<int>* histogram) {
+  SparseArray<int> fanout(prog->size());
+  prog->Fanout(&fanout);
+
+  int buckets[32] = {};
+  int used = 0;
+  for (auto it = fanout.begin(); it != fanout.end(); ++it) {
+    if (it->value() == 0)
+      continue;
+    const uint32_t value = it->value();
+    // Floor of log2(value), rounded up unless value is a power of two.
+    int bucket = FindMSBSet(value);
+    if ((value & (value - 1)) != 0)
+      ++bucket;
+    ++buckets[bucket];
+    used = std::max(used, bucket + 1);
+  }
+
+  if (histogram != NULL)
+    histogram->assign(buckets, buckets + used);
+  return used - 1;
+}
+
+// Computes the fanout histogram of the forward Prog (see Fanout() above).
+// Returns -1 if there is no forward Prog.
+int RE2::ProgramFanout(std::vector<int>* histogram) const {
+  return (prog_ != NULL) ? Fanout(prog_, histogram) : -1;
+}
+
+// Computes the fanout histogram of the reverse Prog (see Fanout() above).
+// Returns -1 if either the forward Prog or the reverse Prog is unavailable.
+int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
+  // Without a forward Prog, the reverse Prog is not even attempted.
+  Prog* const reverse = (prog_ != NULL) ? ReverseProg() : NULL;
+  if (reverse == NULL)
+    return -1;
+  return Fanout(reverse, histogram);
+}
+
+// Returns named_groups_, computing it if needed.
+// The map is computed at most once (guarded by named_groups_once_). When
+// there is no suffix_regexp_, or NamedCaptures() returns NULL, the shared
+// empty map is used, so the returned reference is always valid.
+const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
+ absl::call_once(named_groups_once_, [](const RE2* re) {
+ if (re->suffix_regexp_ != NULL)
+ re->named_groups_ = re->suffix_regexp_->NamedCaptures();
+ if (re->named_groups_ == NULL)
+ re->named_groups_ = empty_named_groups();
+ }, this);
+ return *named_groups_;
+}
+
+// Returns group_names_, computing it if needed.
+// The map is computed at most once (guarded by group_names_once_). When
+// there is no suffix_regexp_, or CaptureNames() returns NULL, the shared
+// empty map is used, so the returned reference is always valid.
+const std::map<int, std::string>& RE2::CapturingGroupNames() const {
+ absl::call_once(group_names_once_, [](const RE2* re) {
+ if (re->suffix_regexp_ != NULL)
+ re->group_names_ = re->suffix_regexp_->CaptureNames();
+ if (re->group_names_ == NULL)
+ re->group_names_ = empty_group_names();
+ }, this);
+ return *group_names_;
+}
+
+/***** Convenience interfaces *****/
+
+// Matches re against text anchored at both ends, storing submatches
+// through the n entries of args. The consumed length is not requested.
+bool RE2::FullMatchN(absl::string_view text, const RE2& re,
+                     const Arg* const args[], int n) {
+  size_t* const no_consumed = nullptr;
+  return re.DoMatch(text, ANCHOR_BOTH, no_consumed, args, n);
+}
+
+// Matches re against text without anchoring, storing submatches through
+// the n entries of args. The consumed length is not requested.
+bool RE2::PartialMatchN(absl::string_view text, const RE2& re,
+                        const Arg* const args[], int n) {
+  size_t* const no_consumed = nullptr;
+  return re.DoMatch(text, UNANCHORED, no_consumed, args, n);
+}
+
+// Matches re at the start of *input. On success, removes the consumed
+// text from the front of *input and returns true; otherwise leaves
+// *input untouched and returns false.
+bool RE2::ConsumeN(absl::string_view* input, const RE2& re,
+                   const Arg* const args[], int n) {
+  size_t consumed;
+  if (!re.DoMatch(*input, ANCHOR_START, &consumed, args, n))
+    return false;
+  input->remove_prefix(consumed);
+  return true;
+}
+
+// Matches re anywhere in *input. On success, removes the consumed text
+// from the front of *input and returns true; otherwise leaves *input
+// untouched and returns false.
+bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re,
+                          const Arg* const args[], int n) {
+  size_t consumed;
+  if (!re.DoMatch(*input, UNANCHORED, &consumed, args, n))
+    return false;
+  input->remove_prefix(consumed);
+  return true;
+}
+
+// Replaces the first match of re in *str with rewrite. Returns true if
+// a replacement was made. Returns false, leaving *str unchanged, if
+// rewrite needs more submatches than re (or the local array) provides,
+// if re does not match, or if the rewrite itself fails.
+bool RE2::Replace(std::string* str,
+                  const RE2& re,
+                  absl::string_view rewrite) {
+  absl::string_view vec[kVecSize];
+  const int nvec = 1 + MaxSubmatch(rewrite);
+  if (nvec > 1 + re.NumberOfCapturingGroups() ||
+      nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
+    return false;
+  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
+    return false;
+
+  std::string replacement;
+  if (!re.Rewrite(&replacement, rewrite, vec, nvec))
+    return false;
+
+  // vec[0] is the overall match, which must lie within *str.
+  const absl::string_view& whole = vec[0];
+  DCHECK_GE(whole.data(), str->data());
+  DCHECK_LE(whole.data() + whole.size(), str->data() + str->size());
+  str->replace(whole.data() - str->data(), whole.size(), replacement);
+  return true;
+}
+
+// Replaces successive non-overlapping matches of re in *str with rewrite.
+// Returns the number of replacements made. Returns 0, leaving *str
+// unchanged, if rewrite needs more submatches than re (or the local array)
+// provides or if re does not match at all. The number of replacements is
+// capped by maximum_global_replace_count unless that is -1.
+int RE2::GlobalReplace(std::string* str,
+                       const RE2& re,
+                       absl::string_view rewrite) {
+  absl::string_view vec[kVecSize];
+  int nvec = 1 + MaxSubmatch(rewrite);
+  // This function returns a count, so "nothing replaced" is spelled 0
+  // (consistently with the count == 0 case below) rather than false.
+  if (nvec > 1 + re.NumberOfCapturingGroups())
+    return 0;
+  if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
+    return 0;
+
+  const char* p = str->data();
+  const char* ep = p + str->size();
+  const char* lastend = NULL;
+  std::string out;
+  int count = 0;
+  // p <= ep (not p < ep) so that an empty match at the very end of *str
+  // is still attempted.
+  while (p <= ep) {
+    if (maximum_global_replace_count != -1 &&
+        count >= maximum_global_replace_count)
+      break;
+    if (!re.Match(*str, static_cast<size_t>(p - str->data()),
+                  str->size(), UNANCHORED, vec, nvec))
+      break;
+    // Copy the unmatched text that precedes this match.
+    if (p < vec[0].data())
+      out.append(p, vec[0].data() - p);
+    if (vec[0].data() == lastend && vec[0].empty()) {
+      // Disallow empty match at end of last match: skip ahead.
+      //
+      // fullrune() takes int, not ptrdiff_t. However, it just looks
+      // at the leading byte and treats any length >= 4 the same.
+      if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
+          fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
+        // re is in UTF-8 mode and there is enough left of str
+        // to allow us to advance by up to UTFmax bytes.
+        Rune r;
+        int n = chartorune(&r, p);
+        // Some copies of chartorune have a bug that accepts
+        // encodings of values in (10FFFF, 1FFFFF] as valid.
+        if (r > Runemax) {
+          n = 1;
+          r = Runeerror;
+        }
+        if (!(n == 1 && r == Runeerror)) {  // no decoding error
+          out.append(p, n);
+          p += n;
+          continue;
+        }
+      }
+      // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
+      // we fell through from above and the GIGO principle applies.
+      if (p < ep)
+        out.append(p, 1);
+      p++;
+      continue;
+    }
+    re.Rewrite(&out, rewrite, vec, nvec);
+    p = vec[0].data() + vec[0].size();
+    lastend = p;
+    count++;
+  }
+
+  // Nothing was replaced: leave *str exactly as it was.
+  if (count == 0)
+    return 0;
+
+  // Copy whatever follows the last match, then publish the result.
+  if (p < ep)
+    out.append(p, ep - p);
+  using std::swap;
+  swap(out, *str);
+  return count;
+}
+
+// Finds the first match of re in text and, on success, overwrites *out with
+// rewrite expanded against that match. Returns false when rewrite refers to
+// more groups than re has (or than fit in the fixed-size submatch array),
+// when there is no match, or when the rewrite fails. *out is cleared only
+// after a match has been found.
+bool RE2::Extract(absl::string_view text,
+                  const RE2& re,
+                  absl::string_view rewrite,
+                  std::string* out) {
+  absl::string_view groups[kVecSize];
+  const int ngroups = 1 + MaxSubmatch(rewrite);
+  const bool rewrite_fits =
+      ngroups <= 1 + re.NumberOfCapturingGroups() &&
+      ngroups <= static_cast<int>(ABSL_ARRAYSIZE(groups));
+  if (!rewrite_fits ||
+      !re.Match(text, 0, text.size(), UNANCHORED, groups, ngroups))
+    return false;
+
+  out->clear();
+  return re.Rewrite(out, rewrite, groups, ngroups);
+}
+
+// Returns a copy of unquoted in which every byte is backslash-escaped unless
+// it is in [A-Za-z_0-9] or has its high bit set. NUL bytes become "\x00".
+//
+// Escaping a character that has no special meaning in a regular expression
+// is legal, so this function does not bother to distinguish. (That also makes
+// it identical to the perl function of the same name except for the
+// null-character special case; see `perldoc -f quotemeta`.)
+std::string RE2::QuoteMeta(absl::string_view unquoted) {
+  std::string quoted;
+  quoted.reserve(unquoted.size() << 1);
+
+  for (const char ch : unquoted) {
+    // Plain comparisons on purpose: using 'isalnum' here raised the
+    // benchmark time from 32ns to 58ns.
+    const bool is_lower = 'a' <= ch && ch <= 'z';
+    const bool is_upper = 'A' <= ch && ch <= 'Z';
+    const bool is_digit = '0' <= ch && ch <= '9';
+    // A byte with the high bit set is part of a UTF8 or Latin1 character and
+    // must be copied without escaping. Experimentally this is what works
+    // correctly with the regexp library.
+    const bool is_high_byte = (ch & 128) != 0;
+
+    if (is_lower || is_upper || is_digit || ch == '_' || is_high_byte) {
+      quoted += ch;
+      continue;
+    }
+    if (ch == '\0') {
+      // Special handling for null chars. This is not strictly required for
+      // RE2, but this quoting is required for other regexp libraries such as
+      // PCRE. Can't use "\\0" since the next character might be a digit.
+      quoted += "\\x00";
+      continue;
+    }
+    quoted += '\\';
+    quoted += ch;
+  }
+
+  return quoted;
+}
+
+// Computes strings *min and *max bounding what this regexp can match, using
+// at most maxlen bytes of the required literal prefix plus whatever
+// prog_->PossibleMatchRange() contributes for the remainder. Returns false,
+// with *min and *max set to "", when there is no compiled program or when
+// neither the prefix nor the program yields any information.
+bool RE2::PossibleMatchRange(std::string* min, std::string* max,
+                             int maxlen) const {
+  if (prog_ == NULL)
+    return false;
+
+  // Number of prefix bytes to use, capped at maxlen.
+  int n = static_cast<int>(prefix_.size());
+  if (maxlen < n)
+    n = maxlen;
+
+  // Seed both bounds from the prefix_ literal.
+  *min = prefix_.substr(0, n);
+  *max = prefix_.substr(0, n);
+  if (prefix_foldcase_) {
+    // prefix is ASCII lowercase; the uppercase form is the smaller bound.
+    for (int i = 0; i < n; i++) {
+      char& c = (*min)[i];
+      if ('a' <= c && c <= 'z')
+        c = static_cast<char>(c - 'a' + 'A');
+    }
+  }
+
+  // Extend the bounds using PossibleMatchRange on the regexp program.
+  const int remaining = maxlen - n;
+  std::string suffix_min, suffix_max;
+  if (remaining > 0 &&
+      prog_->PossibleMatchRange(&suffix_min, &suffix_max, remaining)) {
+    min->append(suffix_min);
+    max->append(suffix_max);
+    return true;
+  }
+
+  if (max->empty()) {
+    // Nothing useful.
+    *min = "";
+    *max = "";
+    return false;
+  }
+
+  // prog_->PossibleMatchRange has failed us,
+  // but we still have useful information from prefix_.
+  // Round up *max to allow any possible suffix.
+  PrefixSuccessor(max);
+  return true;
+}
+
+// Locale-independent, ASCII-only case-insensitive comparison of the first
+// len bytes of a and b (standard strcasecmp may be affected by the locale).
+// The string a is known to be all lowercase, so only b is folded.
+// Returns 0 if the ranges are equal, otherwise the difference between the
+// first pair of bytes that differ (a's byte minus b's folded byte).
+static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
+  for (size_t i = 0; i < len; i++) {
+    const uint8_t lhs = a[i];
+    uint8_t rhs = b[i];
+    if ('A' <= rhs && rhs <= 'Z')
+      rhs += 'a' - 'A';
+    if (lhs != rhs)
+      return lhs - rhs;
+  }
+  return 0;
+}
+
+
+/***** Actual matching and rewriting code *****/
+// Searches text[startpos, endpos) for this regexp under re_anchor; on success fills submatch[0..nsubmatch) and returns true.
+bool RE2::Match(absl::string_view text,  // whole text; also handed to every Search* call alongside the sub-range
+ size_t startpos,  // start offset into text; must be <= endpos
+ size_t endpos,  // end offset into text; must be <= text.size()
+ Anchor re_anchor,  // UNANCHORED, ANCHOR_START or ANCHOR_BOTH
+ absl::string_view* submatch,  // out: [0] is the overall match, followed by the capture groups
+ int nsubmatch) const {  // number of submatch entries to fill; 0 asks only whether there is a match
+ if (!ok()) {
+ if (options_.log_errors())
+ LOG(ERROR) << "Invalid RE2: " << *error_;
+ return false;
+ }
+
+ if (startpos > endpos || endpos > text.size()) {
+ if (options_.log_errors())
+ LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
+ << "startpos: " << startpos << ", "
+ << "endpos: " << endpos << ", "
+ << "text size: " << text.size() << "]";
+ return false;
+ }
+
+ absl::string_view subtext = text;  // narrowed below to text[startpos, endpos)
+ subtext.remove_prefix(startpos);
+ subtext.remove_suffix(text.size() - endpos);
+
+ // Use DFAs to find exact location of match, filter out non-matches.
+
+ // Don't ask for the location if we won't use it.
+ // SearchDFA can do extra optimizations in that case.
+ absl::string_view match;
+ absl::string_view* matchp = &match;
+ if (nsubmatch == 0)
+ matchp = NULL;
+
+ int ncap = 1 + NumberOfCapturingGroups();  // overall match plus every capturing group...
+ if (ncap > nsubmatch)
+ ncap = nsubmatch;  // ...but never more than the caller has room for
+
+ // If the regexp is anchored explicitly, must not be in middle of text.
+ if (prog_->anchor_start() && startpos != 0)
+ return false;
+ if (prog_->anchor_end() && endpos != text.size())
+ return false;
+
+ // If the regexp is anchored explicitly, update re_anchor
+ // so that we can potentially fall into a faster case below.
+ if (prog_->anchor_start() && prog_->anchor_end())
+ re_anchor = ANCHOR_BOTH;
+ else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
+ re_anchor = ANCHOR_START;
+
+ // Check for the required prefix, if any.
+ size_t prefixlen = 0;  // bytes of literal prefix stripped from subtext; restored into submatch[0] at the end
+ if (!prefix_.empty()) {
+ if (startpos != 0)
+ return false;
+ prefixlen = prefix_.size();
+ if (prefixlen > subtext.size())
+ return false;
+ if (prefix_foldcase_) {
+ if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
+ return false;
+ } else {
+ if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
+ return false;
+ }
+ subtext.remove_prefix(prefixlen);
+ // If there is a required prefix, the anchor must be at least ANCHOR_START.
+ if (re_anchor != ANCHOR_BOTH)
+ re_anchor = ANCHOR_START;
+ }
+
+ Prog::Anchor anchor = Prog::kUnanchored;
+ Prog::MatchKind kind =
+ longest_match_ ? Prog::kLongestMatch : Prog::kFirstMatch;
+
+ bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture;
+ bool can_bit_state = prog_->CanBitState();
+ size_t bit_state_text_max_size = prog_->bit_state_text_max_size();
+
+#ifdef RE2_HAVE_THREAD_LOCAL
+ hooks::context = this;
+#endif
+ bool dfa_failed = false;  // set by SearchDFA when it gives up; logged below as "DFA out of memory"
+ bool skipped_test = false;  // true when there is no usable DFA result, so the fallback engine must search subtext itself
+ switch (re_anchor) {
+ default:
+ LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
+ return false;
+
+ case UNANCHORED: {
+ if (prog_->anchor_end()) {
+ // This is a very special case: we don't need the forward DFA because
+ // we already know where the match must end! Instead, the reverse DFA
+ // can say whether there is a match and (optionally) where it starts.
+ Prog* prog = ReverseProg();
+ if (prog == NULL) {
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
+ Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
+ if (dfa_failed) {
+ if (options_.log_errors())
+ LOG(ERROR) << "DFA out of memory: "
+ << "pattern length " << pattern_->size() << ", "
+ << "program size " << prog->size() << ", "
+ << "list count " << prog->list_count() << ", "
+ << "bytemap range " << prog->bytemap_range();
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ return false;
+ }
+ if (matchp == NULL) // Matched. Don't care where.
+ return true;
+ break;
+ }
+
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
+ matchp, &dfa_failed, NULL)) {
+ if (dfa_failed) {
+ if (options_.log_errors())
+ LOG(ERROR) << "DFA out of memory: "
+ << "pattern length " << pattern_->size() << ", "
+ << "program size " << prog_->size() << ", "
+ << "list count " << prog_->list_count() << ", "
+ << "bytemap range " << prog_->bytemap_range();
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ return false;
+ }
+ if (matchp == NULL) // Matched. Don't care where.
+ return true;
+ // SearchDFA set match.end() but didn't know where the
+ // match started. Run the regexp backward from match.end()
+ // to find the longest possible match -- that's where it started.
+ Prog* prog = ReverseProg();
+ if (prog == NULL) {
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ if (!prog->SearchDFA(match, text, Prog::kAnchored,
+ Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
+ if (dfa_failed) {
+ if (options_.log_errors())
+ LOG(ERROR) << "DFA out of memory: "
+ << "pattern length " << pattern_->size() << ", "
+ << "program size " << prog->size() << ", "
+ << "list count " << prog->list_count() << ", "
+ << "bytemap range " << prog->bytemap_range();
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ if (options_.log_errors())
+ LOG(ERROR) << "SearchDFA inconsistency";
+ return false;
+ }
+ break;
+ }
+
+ case ANCHOR_BOTH:
+ case ANCHOR_START:
+ if (re_anchor == ANCHOR_BOTH)
+ kind = Prog::kFullMatch;
+ anchor = Prog::kAnchored;
+
+ // If only a small amount of text and need submatch
+ // information anyway and we're going to use OnePass or BitState
+ // to get it, we might as well not even bother with the DFA:
+ // OnePass or BitState will be fast enough.
+ // On tiny texts, OnePass outruns even the DFA, and
+ // it doesn't have the shared state and occasional mutex that
+ // the DFA does.
+ if (can_one_pass && text.size() <= 4096 &&
+ (ncap > 1 || text.size() <= 16)) {
+ skipped_test = true;
+ break;
+ }
+ if (can_bit_state && text.size() <= bit_state_text_max_size &&
+ ncap > 1) {
+ skipped_test = true;
+ break;
+ }
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
+ &match, &dfa_failed, NULL)) {
+ if (dfa_failed) {
+ if (options_.log_errors())
+ LOG(ERROR) << "DFA out of memory: "
+ << "pattern length " << pattern_->size() << ", "
+ << "program size " << prog_->size() << ", "
+ << "list count " << prog_->list_count() << ", "
+ << "bytemap range " << prog_->bytemap_range();
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ return false;
+ }
+ break;
+ }
+
+ if (!skipped_test && ncap <= 1) {
+ // We know exactly where it matches. That's enough.
+ if (ncap == 1)
+ submatch[0] = match;
+ } else {
+ absl::string_view subtext1;  // the range the submatch-capable engine below will search
+ if (skipped_test) {
+ // DFA ran out of memory or was skipped:
+ // need to search in entire original text.
+ subtext1 = subtext;
+ } else {
+ // DFA found the exact match location:
+ // let NFA run an anchored, full match search
+ // to find submatch locations.
+ subtext1 = match;
+ anchor = Prog::kAnchored;
+ kind = Prog::kFullMatch;
+ }
+
+ if (can_one_pass && anchor != Prog::kUnanchored) {  // engine preference: OnePass, then BitState, then NFA
+ if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
+ if (!skipped_test && options_.log_errors())  // a miss is only an inconsistency if the DFA had already reported a match
+ LOG(ERROR) << "SearchOnePass inconsistency";
+ return false;
+ }
+ } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) {
+ if (!prog_->SearchBitState(subtext1, text, anchor,
+ kind, submatch, ncap)) {
+ if (!skipped_test && options_.log_errors())
+ LOG(ERROR) << "SearchBitState inconsistency";
+ return false;
+ }
+ } else {
+ if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
+ if (!skipped_test && options_.log_errors())
+ LOG(ERROR) << "SearchNFA inconsistency";
+ return false;
+ }
+ }
+ }
+
+ // Adjust overall match for required prefix that we stripped off.
+ if (prefixlen > 0 && nsubmatch > 0)
+ submatch[0] = absl::string_view(submatch[0].data() - prefixlen,
+ submatch[0].size() + prefixlen);
+
+ // Zero submatches that don't exist in the regexp.
+ for (int i = ncap; i < nsubmatch; i++)
+ submatch[i] = absl::string_view();
+ return true;
+}
+
+// Internal matcher - like Match(), but delivers the captures through Arg
+// objects instead of string_views.
+//
+// Returns false if this RE2 is invalid, if it has fewer capturing groups
+// than n, if text does not match under re_anchor, or if any args[i]->Parse()
+// rejects its capture. When consumed is non-NULL and the match succeeds,
+// *consumed receives the offset of the end of the overall match from the
+// beginning of text.
+bool RE2::DoMatch(absl::string_view text,
+                  Anchor re_anchor,
+                  size_t* consumed,
+                  const Arg* const* args,
+                  int n) const {
+  if (!ok()) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Invalid RE2: " << *error_;
+    return false;
+  }
+
+  // RE has fewer capturing groups than number of Arg pointers passed in.
+  if (n > NumberOfCapturingGroups())
+    return false;
+
+  // Slot 0 holds the overall match; no slots at all are needed when the
+  // caller wants neither captures nor the consumed length.
+  const bool want_positions = (n != 0 || consumed != NULL);
+  const int nvec = want_positions ? n + 1 : 0;
+
+  absl::FixedArray<absl::string_view, kVecSize> storage(nvec);
+  absl::string_view* const vec = storage.data();
+
+  if (!Match(text, 0, text.size(), re_anchor, vec, nvec))
+    return false;
+
+  if (consumed != NULL)
+    *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text));
+
+  // We are not interested in results.
+  if (n == 0 || args == NULL)
+    return true;
+
+  // If we got here, we must have matched the whole pattern.
+  // Convert capture i+1 into args[i]; the first failed conversion aborts.
+  for (int i = 0; i < n; i++) {
+    const absl::string_view& capture = vec[i + 1];
+    if (!args[i]->Parse(capture.data(), capture.size())) {
+      // TODO: Should we indicate what the error was?
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Checks that the rewrite string is well-formed with respect to this
+// regular expression: every backslash must be followed by a digit or by
+// another backslash, and the highest group referenced must not exceed the
+// number of capturing groups. On failure, stores a description in *error
+// and returns false.
+bool RE2::CheckRewriteString(absl::string_view rewrite,
+                             std::string* error) const {
+  int max_token = -1;
+  const char* p = rewrite.data();
+  const char* const end = p + rewrite.size();
+  while (p < end) {
+    // Ordinary characters need no checking.
+    if (*p++ != '\\')
+      continue;
+    if (p == end) {
+      *error = "Rewrite schema error: '\\' not allowed at end.";
+      return false;
+    }
+    const int c = *p++;
+    if (c == '\\')
+      continue;
+    if (!absl::ascii_isdigit(c)) {
+      *error = "Rewrite schema error: "
+               "'\\' must be followed by a digit or '\\'.";
+      return false;
+    }
+    const int token = c - '0';
+    if (token > max_token)
+      max_token = token;
+  }
+
+  if (max_token > NumberOfCapturingGroups()) {
+    *error = absl::StrFormat(
+        "Rewrite schema requests %d matches, but the regexp only has %d "
+        "parenthesized subexpressions.",
+        max_token, NumberOfCapturingGroups());
+    return false;
+  }
+  return true;
+}
+
+// Returns the maximum submatch needed for the rewrite to be done by Replace().
+// E.g. if rewrite == "foo \\2,\\1", returns 2.
+// Returns 0 when rewrite references no group. The character following a
+// backslash is always consumed, whether or not it is a digit.
+int RE2::MaxSubmatch(absl::string_view rewrite) {
+  int max = 0;
+  const char* s = rewrite.data();
+  const char* const end = s + rewrite.size();
+  while (s < end) {
+    if (*s++ != '\\')
+      continue;
+    // A trailing backslash references nothing. Stop here instead of
+    // advancing the pointer beyond one-past-the-end of the buffer.
+    if (s == end)
+      break;
+    const int c = *s++;
+    if (absl::ascii_isdigit(c)) {
+      const int n = (c - '0');
+      if (n > max)
+        max = n;
+    }
+  }
+  return max;
+}
+
+// Append the "rewrite" string, with backslash substitutions from "vec",
+// to string "out".
+//
+// \N (N a single digit) is replaced by vec[N] and \\ by a single backslash;
+// "veclen" is the number of entries in "vec". Returns false -- possibly
+// after part of the result has already been appended to "out" -- if a
+// substitution refers to a group >= veclen, or if a backslash is followed by
+// anything other than a digit or a backslash (including nothing at all).
+bool RE2::Rewrite(std::string* out,
+                  absl::string_view rewrite,
+                  const absl::string_view* vec,
+                  int veclen) const {
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    if (*s != '\\') {
+      out->push_back(*s);
+      continue;
+    }
+    s++;
+    // -1 marks "backslash was the last character"; it is neither a digit
+    // nor a backslash, so it falls into the error branch below.
+    int c = (s < end) ? *s : -1;
+    if (absl::ascii_isdigit(c)) {
+      int n = (c - '0');
+      if (n >= veclen) {
+        if (options_.log_errors()) {
+          LOG(ERROR) << "invalid substitution \\" << n
+                     << " from " << veclen << " groups";
+        }
+        return false;
+      }
+      absl::string_view snip = vec[n];
+      if (!snip.empty())
+        out->append(snip.data(), snip.size());
+    } else if (c == '\\') {
+      out->push_back('\\');
+    } else {
+      // Stream the string_view itself: rewrite.data() is not guaranteed to
+      // be NUL-terminated, so logging it as a C string could read past the
+      // end of the buffer.
+      if (options_.log_errors())
+        LOG(ERROR) << "invalid rewrite pattern: " << rewrite;
+      return false;
+    }
+  }
+  return true;
+}
+
+/***** Parsers for various types *****/
+
+namespace re2_internal {
+
+// void* destination: there is nothing that could be stored, so parsing
+// succeeds only when no destination was supplied.
+template <>
+bool Parse(const char* str, size_t n, void* dest) {
+  // We fail if somebody asked us to store into a non-NULL void* pointer
+  const bool has_destination = (dest != NULL);
+  return !has_destination;
+}
+
+// std::string destination: copies the n bytes at str into *dest when a
+// destination is supplied. Always succeeds.
+template <>
+bool Parse(const char* str, size_t n, std::string* dest) {
+  if (dest != NULL)
+    dest->assign(str, n);
+  return true;
+}
+
+// string_view destination: points *dest at the n bytes at str (no copy)
+// when a destination is supplied. Always succeeds.
+template <>
+bool Parse(const char* str, size_t n, absl::string_view* dest) {
+  if (dest != NULL)
+    *dest = absl::string_view(str, n);
+  return true;
+}
+
+// char destination: succeeds only for a capture of exactly one byte, which
+// is stored in *dest when a destination is supplied.
+template <>
+bool Parse(const char* str, size_t n, char* dest) {
+  const bool single_byte = (n == 1);
+  if (single_byte && dest != NULL)
+    *dest = str[0];
+  return single_byte;
+}
+
+// signed char destination: succeeds only for a capture of exactly one byte,
+// which is stored in *dest when a destination is supplied.
+template <>
+bool Parse(const char* str, size_t n, signed char* dest) {
+  const bool single_byte = (n == 1);
+  if (single_byte && dest != NULL)
+    *dest = str[0];
+  return single_byte;
+}
+
+// unsigned char destination: succeeds only for a capture of exactly one
+// byte, which is stored in *dest when a destination is supplied.
+template <>
+bool Parse(const char* str, size_t n, unsigned char* dest) {
+  const bool single_byte = (n == 1);
+  if (single_byte && dest != NULL)
+    *dest = str[0];
+  return single_byte;
+}
+
+// Largest number spec that we are willing to parse
+static const int kMaxNumberLength = 32;
+
+// REQUIRES "buf" must have length at least nbuf.
+// Copies "str" into "buf" and null-terminates.
+// Overwrites *np with the new length.
+//
+// Leading whitespace is skipped when accept_spaces is true and rejected
+// otherwise. Returns "" when the input is empty, starts with disallowed
+// whitespace, consists of nothing but whitespace, or does not fit in buf.
+// On those rejection paths *np is left unchanged, so for non-empty input
+// the callers' "end != str + n" check fails and the parse is reported as
+// unsuccessful.
+static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
+                                   size_t* np, bool accept_spaces) {
+  size_t n = *np;
+  if (n == 0) return "";
+  if (absl::ascii_isspace(*str)) {
+    // We are less forgiving than the strtoxxx() routines and do not
+    // allow leading spaces. We do allow leading spaces for floats.
+    if (!accept_spaces) {
+      return "";
+    }
+    while (n > 0 && absl::ascii_isspace(*str)) {
+      n--;
+      str++;
+    }
+    // Nothing but whitespace is not a number. Without this check an empty
+    // string with *np == 0 would be handed back, which strtof()/strtod()
+    // "parse" without consuming anything and the callers would accept.
+    if (n == 0) return "";
+  }
+
+  // Although buf has a fixed maximum size, we can still handle
+  // arbitrarily large integers correctly by omitting leading zeros.
+  // (Numbers that are still too long will be out of range.)
+  // Before deciding whether str is too long,
+  // remove leading zeros with s/000+/00/.
+  // Leaving the leading two zeros in place means that
+  // we don't change 0000x123 (invalid) into 0x123 (valid).
+  // Skip over leading - before replacing.
+  bool neg = false;
+  if (n >= 1 && str[0] == '-') {
+    neg = true;
+    n--;
+    str++;
+  }
+
+  if (n >= 3 && str[0] == '0' && str[1] == '0') {
+    while (n >= 3 && str[2] == '0') {
+      n--;
+      str++;
+    }
+  }
+
+  if (neg) {  // make room in buf for -
+    n++;
+    str--;
+  }
+
+  if (n > nbuf-1) return "";
+
+  // After zero-trimming, str[0] may be a '0' rather than the original '-',
+  // so the sign is written back explicitly.
+  memmove(buf, str, n);
+  if (neg) {
+    buf[0] = '-';
+  }
+  buf[n] = '\0';
+  *np = n;
+  return buf;
+}
+
+// float destination: converts the n bytes at str with strtof(). Leading
+// whitespace is allowed. Fails on empty input, when strtof() does not
+// consume exactly the text prepared by TerminateNumber() (which returns ""
+// for input that does not fit the 200-byte buffer), or when it sets errno.
+template <>
+bool Parse(const char* str, size_t n, float* dest) {
+  if (n == 0) return false;
+  static const int kMaxLength = 200;
+  char digits[kMaxLength + 1];
+  const char* const number =
+      TerminateNumber(digits, sizeof digits, str, &n, true);
+  char* stop;
+  errno = 0;
+  const float value = strtof(number, &stop);
+  // Leftover junk, or a conversion error reported through errno.
+  if (stop != number + n || errno) return false;
+  if (dest != NULL) *dest = value;
+  return true;
+}
+
+// double destination: converts the n bytes at str with strtod(). Leading
+// whitespace is allowed. Fails on empty input, when strtod() does not
+// consume exactly the text prepared by TerminateNumber() (which returns ""
+// for input that does not fit the 200-byte buffer), or when it sets errno.
+template <>
+bool Parse(const char* str, size_t n, double* dest) {
+  if (n == 0) return false;
+  static const int kMaxLength = 200;
+  char digits[kMaxLength + 1];
+  const char* const number =
+      TerminateNumber(digits, sizeof digits, str, &n, true);
+  char* stop;
+  errno = 0;
+  const double value = strtod(number, &stop);
+  // Leftover junk, or a conversion error reported through errno.
+  if (stop != number + n || errno) return false;
+  if (dest != NULL) *dest = value;
+  return true;
+}
+
+// long destination: converts the n bytes at str with strtol() in the given
+// radix. Leading whitespace is not allowed. Fails on empty input, when
+// strtol() does not consume exactly the text prepared by TerminateNumber(),
+// or when it sets errno.
+template <>
+bool Parse(const char* str, size_t n, long* dest, int radix) {
+  if (n == 0) return false;
+  char digits[kMaxNumberLength + 1];
+  const char* const number =
+      TerminateNumber(digits, sizeof digits, str, &n, false);
+  char* stop;
+  errno = 0;
+  const long value = strtol(number, &stop, radix);
+  // Leftover junk, or a conversion error reported through errno.
+  if (stop != number + n || errno) return false;
+  if (dest != NULL) *dest = value;
+  return true;
+}
+
+// unsigned long destination: converts the n bytes at str with strtoul() in
+// the given radix. Leading whitespace and negative numbers are rejected.
+// Fails on empty input, when strtoul() does not consume exactly the text
+// prepared by TerminateNumber(), or when it sets errno.
+template <>
+bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
+  if (n == 0) return false;
+  char digits[kMaxNumberLength + 1];
+  const char* const number =
+      TerminateNumber(digits, sizeof digits, str, &n, false);
+  // strtoul() will silently accept negative numbers and parse
+  // them. This module is more strict and treats them as errors.
+  if (number[0] == '-') return false;
+
+  char* stop;
+  errno = 0;
+  const unsigned long value = strtoul(number, &stop, radix);
+  // Leftover junk, or a conversion error reported through errno.
+  if (stop != number + n || errno) return false;
+  if (dest != NULL) *dest = value;
+  return true;
+}
+
+// short destination: parses as long, then fails if the value does not
+// survive a round trip through short.
+template <>
+bool Parse(const char* str, size_t n, short* dest, int radix) {
+  long wide;
+  if (!Parse(str, n, &wide, radix)) return false;  // Could not parse
+  const short narrow = static_cast<short>(wide);
+  if (narrow != wide) return false;                // Out of range
+  if (dest != NULL) *dest = narrow;
+  return true;
+}
+
+// unsigned short destination: parses as unsigned long, then fails if the
+// value does not survive a round trip through unsigned short.
+template <>
+bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
+  unsigned long wide;
+  if (!Parse(str, n, &wide, radix)) return false;  // Could not parse
+  const unsigned short narrow = static_cast<unsigned short>(wide);
+  if (narrow != wide) return false;                // Out of range
+  if (dest != NULL) *dest = narrow;
+  return true;
+}
+
+// int destination: parses as long, then fails if the value does not survive
+// a round trip through int.
+template <>
+bool Parse(const char* str, size_t n, int* dest, int radix) {
+  long wide;
+  if (!Parse(str, n, &wide, radix)) return false;  // Could not parse
+  const int narrow = static_cast<int>(wide);
+  if (narrow != wide) return false;                // Out of range
+  if (dest != NULL) *dest = narrow;
+  return true;
+}
+
+// unsigned int destination: parses as unsigned long, then fails if the
+// value does not survive a round trip through unsigned int.
+template <>
+bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
+  unsigned long wide;
+  if (!Parse(str, n, &wide, radix)) return false;  // Could not parse
+  const unsigned int narrow = static_cast<unsigned int>(wide);
+  if (narrow != wide) return false;                // Out of range
+  if (dest != NULL) *dest = narrow;
+  return true;
+}
+
+// long long destination: converts the n bytes at str with strtoll() in the
+// given radix. Leading whitespace is not allowed. Fails on empty input, when
+// strtoll() does not consume exactly the text prepared by TerminateNumber(),
+// or when it sets errno.
+template <>
+bool Parse(const char* str, size_t n, long long* dest, int radix) {
+  if (n == 0) return false;
+  char digits[kMaxNumberLength + 1];
+  const char* const number =
+      TerminateNumber(digits, sizeof digits, str, &n, false);
+  char* stop;
+  errno = 0;
+  const long long value = strtoll(number, &stop, radix);
+  // Leftover junk, or a conversion error reported through errno.
+  if (stop != number + n || errno) return false;
+  if (dest != NULL) *dest = value;
+  return true;
+}
+
+// unsigned long long destination: converts the n bytes at str with
+// strtoull() in the given radix. Leading whitespace and negative numbers are
+// rejected. Fails on empty input, when strtoull() does not consume exactly
+// the text prepared by TerminateNumber(), or when it sets errno.
+template <>
+bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
+  if (n == 0) return false;
+  char digits[kMaxNumberLength + 1];
+  const char* const number =
+      TerminateNumber(digits, sizeof digits, str, &n, false);
+  // strtoull() will silently accept negative numbers and parse
+  // them. This module is more strict and treats them as errors.
+  if (number[0] == '-') return false;
+
+  char* stop;
+  errno = 0;
+  const unsigned long long value = strtoull(number, &stop, radix);
+  // Leftover junk, or a conversion error reported through errno.
+  if (stop != number + n || errno) return false;
+  if (dest != NULL) *dest = value;
+  return true;
+}
+
+} // namespace re2_internal
+
+namespace hooks {
+// context: per-thread pointer that RE2::Match() sets to the RE2 it is running; only exists when RE2_HAVE_THREAD_LOCAL is defined.
+#ifdef RE2_HAVE_THREAD_LOCAL
+thread_local const RE2* context = NULL;
+#endif
+// Hook<T> holds a single T* callback in a std::atomic: Store() writes it with release ordering, Load() reads it with acquire ordering.
+template <typename T>
+union Hook {
+ void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
+ T* Load() const { return cb_.load(std::memory_order_acquire); }
+
+#if !defined(__clang__) && defined(_MSC_VER)
+ // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
+ // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
+ static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
+ "std::atomic<T*> must be always lock-free");
+ T* cb_for_constinit_;  // first union member on MSVC, so the braced initializer in DEFINE_HOOK initializes this plain pointer
+#endif
+
+ std::atomic<T*> cb_;  // the member that Store() and Load() actually access
+};
+// DoNothing: the callback every hook starts out with; it accepts its argument and ignores it.
+template <typename T>
+static void DoNothing(const T&) {}
+// DEFINE_HOOK(type, name): defines a file-local name##_hook initialized to &DoNothing<type>, plus Set##type##Hook() and Get##type##Hook().
+#define DEFINE_HOOK(type, name) \
+ static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
+ void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
+ type##Callback* Get##type##Hook() { return name##_hook.Load(); }
+
+DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
+DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
+
+#undef DEFINE_HOOK
+
+} // namespace hooks
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/re2.h b/third_party/re2/src/re2/re2.h
new file mode 100644
index 000000000..68fbed1d8
--- /dev/null
+++ b/third_party/re2/src/re2/re2.h
@@ -0,0 +1,1078 @@
+// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_RE2_H_
+#define RE2_RE2_H_
+
+// C++ interface to the re2 regular-expression library.
+// RE2 supports Perl-style regular expressions (with extensions like
+// \d, \w, \s, ...).
+//
+// -----------------------------------------------------------------------
+// REGEXP SYNTAX:
+//
+// This module uses the re2 library and hence supports
+// its syntax for regular expressions, which is similar to Perl's with
+// some of the more complicated things thrown away. In particular,
+// backreferences and generalized assertions are not available, nor is \Z.
+//
+// See https://github.com/google/re2/wiki/Syntax for the syntax
+// supported by RE2, and a comparison with PCRE and PERL regexps.
+//
+// For those not familiar with Perl's regular expressions,
+// here are some examples of the most commonly used extensions:
+//
+// "hello (\\w+) world" -- \w matches a "word" character
+// "version (\\d+)" -- \d matches a digit
+// "hello\\s+world" -- \s matches any whitespace character
+// "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
+// "(?i)hello" -- (?i) turns on case-insensitive matching
+// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
+//
+// The double backslashes are needed when writing C++ string literals.
+// However, they should NOT be used when writing C++11 raw string literals:
+//
+// R"(hello (\w+) world)" -- \w matches a "word" character
+// R"(version (\d+))" -- \d matches a digit
+// R"(hello\s+world)" -- \s matches any whitespace character
+// R"(\b(\w+)\b)" -- \b matches the empty string at a word boundary
+// R"((?i)hello)" -- (?i) turns on case-insensitive matching
+// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible
+//
+// When using UTF-8 encoding, case-insensitive matching will perform
+// simple case folding, not full case folding.
+//
+// -----------------------------------------------------------------------
+// MATCHING INTERFACE:
+//
+// The "FullMatch" operation checks that supplied text matches a
+// supplied pattern exactly.
+//
+// Example: successful match
+// CHECK(RE2::FullMatch("hello", "h.*o"));
+//
+// Example: unsuccessful match (requires full match):
+// CHECK(!RE2::FullMatch("hello", "e"));
+//
+// -----------------------------------------------------------------------
+// UTF-8 AND THE MATCHING INTERFACE:
+//
+// By default, the pattern and input text are interpreted as UTF-8.
+// The RE2::Latin1 option causes them to be interpreted as Latin-1.
+//
+// Example:
+// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
+// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
+//
+// -----------------------------------------------------------------------
+// SUBMATCH EXTRACTION:
+//
+// You can supply extra pointer arguments to extract submatches.
+// On match failure, none of the pointees will have been modified.
+// On match success, the submatches will be converted (as necessary) and
+// their values will be assigned to their pointees until all conversions
+// have succeeded or one conversion has failed.
+// On conversion failure, the pointees will be in an indeterminate state
+// because the caller has no way of knowing which conversion failed.
+// However, conversion cannot fail for types like string and string_view
+// that do not inspect the submatch contents. Hence, in the common case
+// where all of the pointees are of such types, failure is always due to
+// match failure and thus none of the pointees will have been modified.
+//
+// Example: extracts "ruby" into "s" and 1234 into "i"
+// int i;
+// std::string s;
+// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
+//
+// Example: extracts "ruby" into "s" and no value into "i"
+// absl::optional<int> i;
+// std::string s;
+// CHECK(RE2::FullMatch("ruby", "(\\w+)(?::(\\d+))?", &s, &i));
+//
+// Example: fails because string cannot be stored in integer
+// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
+//
+// Example: fails because there aren't enough sub-patterns
+// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
+//
+// Example: does not try to extract any extra sub-patterns
+// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
+//
+// Example: does not try to extract into NULL
+// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
+//
+// Example: integer overflow causes failure
+// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
+//
+// NOTE(rsc): Asking for submatches slows successful matches quite a bit.
+// This may get a little faster in the future, but right now is slower
+// than PCRE. On the other hand, failed matches run *very* fast (faster
+// than PCRE), as do matches without submatch extraction.
+//
+// -----------------------------------------------------------------------
+// PARTIAL MATCHES
+//
+// You can use the "PartialMatch" operation when you want the pattern
+// to match any substring of the text.
+//
+// Example: simple search for a string:
+// CHECK(RE2::PartialMatch("hello", "ell"));
+//
+// Example: find first number in a string
+// int number;
+// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
+// CHECK_EQ(number, 100);
+//
+// -----------------------------------------------------------------------
+// PRE-COMPILED REGULAR EXPRESSIONS
+//
+// RE2 makes it easy to use any string as a regular expression, without
+// requiring a separate compilation step.
+//
+// If speed is of the essence, you can create a pre-compiled "RE2"
+// object from the pattern and use it multiple times. If you do so,
+// you can typically parse text faster than with sscanf.
+//
+// Example: precompile pattern for faster matching:
+// RE2 pattern("h.*o");
+// while (ReadLine(&str)) {
+// if (RE2::FullMatch(str, pattern)) ...;
+// }
+//
+// -----------------------------------------------------------------------
+// SCANNING TEXT INCREMENTALLY
+//
+// The "Consume" operation may be useful if you want to repeatedly
+// match regular expressions at the front of a string and skip over
+// them as they match. This requires use of the string_view type,
+// which represents a sub-range of a real string.
+//
+// Example: read lines of the form "var = value" from a string.
+// std::string contents = ...; // Fill string somehow
+// absl::string_view input(contents); // Wrap a string_view around it
+//
+// std::string var;
+// int value;
+// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
+// ...;
+// }
+//
+// Each successful call to "Consume" will set "var/value", and also
+// advance "input" so it points past the matched text. Note that if the
+// regular expression matches an empty string, input will advance
+// by 0 bytes. If the regular expression being used might match
+// an empty string, the loop body must check for this case and either
+// advance the string or break out of the loop.
+//
+// The "FindAndConsume" operation is similar to "Consume" but does not
+// anchor your match at the beginning of the string. For example, you
+// could extract all words from a string by repeatedly calling
+// RE2::FindAndConsume(&input, "(\\w+)", &word)
+//
+// -----------------------------------------------------------------------
+// USING VARIABLE NUMBER OF ARGUMENTS
+//
+// The above operations require you to know the number of arguments
+// when you write the code. This is not always possible or easy (for
+// example, the regular expression may be calculated at run time).
+// You can use the "N" version of the operations when the number of
+// match arguments is determined at run time.
+//
+// Example:
+// const RE2::Arg* args[10];
+// int n;
+// // ... populate args with pointers to RE2::Arg values ...
+// // ... set n to the number of RE2::Arg objects ...
+// bool match = RE2::FullMatchN(input, pattern, args, n);
+//
+// The last statement is equivalent to
+//
+// bool match = RE2::FullMatch(input, pattern,
+// *args[0], *args[1], ..., *args[n - 1]);
+//
+// -----------------------------------------------------------------------
+// PARSING HEX/OCTAL/C-RADIX NUMBERS
+//
+// By default, if you pass a pointer to a numeric value, the
+// corresponding text is interpreted as a base-10 number. You can
+// instead wrap the pointer with a call to one of the operators Hex(),
+// Octal(), or CRadix() to interpret the text in another base. The
+// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
+// prefixes, but defaults to base-10.
+//
+// Example:
+// int a, b, c, d;
+// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
+// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
+// will leave 64 in a, b, c, and d.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#include "absl/base/call_once.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+class Prog;
+class Regexp;
+} // namespace re2
+
+namespace re2 {
+
+// Interface for regular expression matching. Also corresponds to a
+// pre-compiled regular expression. An "RE2" object is safe for
+// concurrent use by multiple threads.
+class RE2 {
+ public:
+ // We convert user-passed pointers into special Arg objects
+ class Arg;
+ class Options;
+
+ // Defined in set.h.
+ class Set;
+
+ enum ErrorCode {
+ NoError = 0,
+
+ // Unexpected error
+ ErrorInternal,
+
+ // Parse errors
+ ErrorBadEscape, // bad escape sequence
+ ErrorBadCharClass, // bad character class
+ ErrorBadCharRange, // bad character class range
+ ErrorMissingBracket, // missing closing ]
+ ErrorMissingParen, // missing closing )
+ ErrorUnexpectedParen, // unexpected closing )
+ ErrorTrailingBackslash, // trailing \ at end of regexp
+ ErrorRepeatArgument, // repeat argument missing, e.g. "*"
+ ErrorRepeatSize, // bad repetition argument
+ ErrorRepeatOp, // bad repetition operator
+ ErrorBadPerlOp, // bad perl operator
+ ErrorBadUTF8, // invalid UTF-8 in regexp
+ ErrorBadNamedCapture, // bad named capture group
+ ErrorPatternTooLarge // pattern too large (compile failed)
+ };
+
+ // Predefined common options.
+ // If you need more complicated things, instantiate
+ // an Options object, possibly passing one of these to
+ // the Options constructor, change the settings, and pass that
+ // Options object to the RE2 constructor.
+ enum CannedOptions {
+ DefaultOptions = 0,
+ Latin1, // treat input as Latin-1 (default UTF-8)
+ POSIX, // POSIX syntax, leftmost-longest match
+ Quiet // do not log about regexp parse errors
+ };
+
+ // Need to have the const char* and const std::string& forms for implicit
+ // conversions when passing string literals to FullMatch and PartialMatch.
+ // Otherwise the absl::string_view form would be sufficient.
+ RE2(const char* pattern);
+ RE2(const std::string& pattern);
+ RE2(absl::string_view pattern);
+ RE2(absl::string_view pattern, const Options& options);
+ ~RE2();
+
+ // Not copyable.
+ // RE2 objects are expensive. You should probably use std::shared_ptr<RE2>
+ // instead. If you really must copy, RE2(first.pattern(), first.options())
+ // effectively does so: it produces a second object that mimics the first.
+ RE2(const RE2&) = delete;
+ RE2& operator=(const RE2&) = delete;
+ // Not movable.
+ // RE2 objects are thread-safe and logically immutable. You should probably
+ // use std::unique_ptr<RE2> instead. Otherwise, consider std::deque<RE2> if
+ // direct emplacement into a container is desired. If you really must move,
+ // be prepared to submit a design document along with your feature request.
+ RE2(RE2&&) = delete;
+ RE2& operator=(RE2&&) = delete;
+
+ // Returns whether RE2 was created properly.
+ bool ok() const { return error_code() == NoError; }
+
+ // The string specification for this RE2. E.g.
+ // RE2 re("ab*c?d+");
+ // re.pattern(); // "ab*c?d+"
+ const std::string& pattern() const { return *pattern_; }
+
+ // If RE2 could not be created properly, returns an error string.
+ // Else returns the empty string.
+ const std::string& error() const { return *error_; }
+
+ // If RE2 could not be created properly, returns an error code.
+ // Else returns RE2::NoError (== 0).
+ ErrorCode error_code() const { return error_code_; }
+
+ // If RE2 could not be created properly, returns the offending
+ // portion of the regexp.
+ const std::string& error_arg() const { return *error_arg_; }
+
+ // Returns the program size, a very approximate measure of a regexp's "cost".
+ // Larger numbers are more expensive than smaller numbers.
+ int ProgramSize() const;
+ int ReverseProgramSize() const;
+
+ // If histogram is not null, outputs the program fanout
+ // as a histogram bucketed by powers of 2.
+ // Returns the number of the largest non-empty bucket.
+ int ProgramFanout(std::vector<int>* histogram) const;
+ int ReverseProgramFanout(std::vector<int>* histogram) const;
+
+ // Returns the underlying Regexp; not for general use.
+ // Returns entire_regexp_ so that callers don't need
+ // to know about prefix_ and prefix_foldcase_.
+ re2::Regexp* Regexp() const { return entire_regexp_; }
+
+ /***** The array-based matching interface ******/
+
+ // The functions here have names ending in 'N' and are used to implement
+ // the functions whose names are the prefix before the 'N'. It is sometimes
+ // useful to invoke them directly, but the syntax is awkward, so the 'N'-less
+ // versions should be preferred.
+ static bool FullMatchN(absl::string_view text, const RE2& re,
+ const Arg* const args[], int n);
+ static bool PartialMatchN(absl::string_view text, const RE2& re,
+ const Arg* const args[], int n);
+ static bool ConsumeN(absl::string_view* input, const RE2& re,
+ const Arg* const args[], int n);
+ static bool FindAndConsumeN(absl::string_view* input, const RE2& re,
+ const Arg* const args[], int n);
+
+ private:
+ template <typename F, typename SP>
+ static inline bool Apply(F f, SP sp, const RE2& re) {  // overload for calls with no capture arguments
+ return f(sp, re, NULL, 0);  // no Arg array to build: pass NULL with n == 0
+ }
+
+ template <typename F, typename SP, typename... A>
+ static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {  // overload for one or more Arg objects
+ const Arg* const args[] = {&a...};  // addresses of the caller's Arg objects, in argument order
+ const int n = sizeof...(a);  // number of Arg objects in the pack
+ return f(sp, re, args, n);  // callers pass FullMatchN, PartialMatchN, ConsumeN or FindAndConsumeN as f
+ }
+
+ public:
+ // In order to allow FullMatch() et al. to be called with a varying number
+ // of arguments of varying types, we use two layers of variadic templates.
+ // The first layer constructs the temporary Arg objects. The second layer
+ // (above) constructs the array of pointers to the temporary Arg objects.
+
+ /***** The useful part: the matching interface *****/
+
+ // Matches "text" against "re". If pointer arguments are
+ // supplied, copies matched sub-patterns into them.
+ //
+ // You can pass in a "const char*" or a "std::string" for "text".
+ // You can pass in a "const char*" or a "std::string" or a "RE2" for "re".
+ //
+ // The provided pointer arguments can be pointers to any scalar numeric
+ // type, or one of:
+ // std::string (matched piece is copied to string)
+ // absl::string_view (string_view is mutated to point to matched piece)
+ // absl::optional<T> (T is a supported numeric or string type as above)
+ // T ("bool T::ParseFrom(const char*, size_t)" must exist)
+ // (void*)NULL (the corresponding matched sub-pattern is not copied)
+ //
+ // Returns true iff all of the following conditions are satisfied:
+ // a. "text" matches "re" fully - from the beginning to the end of "text".
+ // b. The number of matched sub-patterns is >= number of supplied pointers.
+ // c. The "i"th argument has a suitable type for holding the
+ // string captured as the "i"th sub-pattern. If you pass in
+ // NULL for the "i"th argument, or pass fewer arguments than
+ // number of sub-patterns, the "i"th captured sub-pattern is
+ // ignored.
+ //
+ // CAVEAT: An optional sub-pattern that does not exist in the
+ // matched string is assigned the null string. Therefore, the
+ // following returns false because the null string - absence of
+ // a string (not even the empty string) - is not a valid number:
+ //
+ // int number;
+ // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
+ //
+ // Use absl::optional<int> instead to handle this case correctly.
+ template <typename... A>
+ static bool FullMatch(absl::string_view text, const RE2& re, A&&... a) {  // a string pattern converts to a temporary RE2 for "re"
+ return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);  // one temporary Arg per argument, then FullMatchN via Apply
+ }
+
+ // Like FullMatch(), except that "re" is allowed to match a substring
+ // of "text".
+ //
+ // Returns true iff all of the following conditions are satisfied:
+ // a. "text" matches "re" partially - for some substring of "text".
+ // b. The number of matched sub-patterns is >= number of supplied pointers.
+ // c. The "i"th argument has a suitable type for holding the
+ // string captured as the "i"th sub-pattern. If you pass in
+ // NULL for the "i"th argument, or pass fewer arguments than
+ // number of sub-patterns, the "i"th captured sub-pattern is
+ // ignored.
+ template <typename... A>
+ static bool PartialMatch(absl::string_view text, const RE2& re, A&&... a) {  // a string pattern converts to a temporary RE2 for "re"
+ return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);  // one temporary Arg per argument, then PartialMatchN via Apply
+ }
+
+ // Like FullMatch() and PartialMatch(), except that "re" has to match
+ // a prefix of the text, and "input" is advanced past the matched
+ // text. Note: "input" is modified iff this routine returns true
+ // and "re" matched a non-empty substring of "input".
+ //
+ // Returns true iff all of the following conditions are satisfied:
+ // a. "input" matches "re" partially - for some prefix of "input".
+ // b. The number of matched sub-patterns is >= number of supplied pointers.
+ // c. The "i"th argument has a suitable type for holding the
+ // string captured as the "i"th sub-pattern. If you pass in
+ // NULL for the "i"th argument, or pass fewer arguments than
+ // number of sub-patterns, the "i"th captured sub-pattern is
+ // ignored.
+ template <typename... A>
+ static bool Consume(absl::string_view* input, const RE2& re, A&&... a) {  // "input" is a pointer: ConsumeN receives it unchanged
+ return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);  // one temporary Arg per argument, then ConsumeN via Apply
+ }
+
+ // Like Consume(), but does not anchor the match at the beginning of
+ // the text. That is, "re" need not start its match at the beginning
+ // of "input". For example, "FindAndConsume(&s, "(\\w+)", &word)" finds
+ // the next word in "s" and stores it in "word".
+ //
+ // Returns true iff all of the following conditions are satisfied:
+ // a. "input" matches "re" partially - for some substring of "input".
+ // b. The number of matched sub-patterns is >= number of supplied pointers.
+ // c. The "i"th argument has a suitable type for holding the
+ // string captured as the "i"th sub-pattern. If you pass in
+ // NULL for the "i"th argument, or pass fewer arguments than
+ // number of sub-patterns, the "i"th captured sub-pattern is
+ // ignored.
+ template <typename... A>
+ static bool FindAndConsume(absl::string_view* input, const RE2& re, A&&... a) {  // "input" is a pointer: FindAndConsumeN receives it unchanged
+ return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);  // one temporary Arg per argument, then FindAndConsumeN via Apply
+ }
+
+ // Replace the first match of "re" in "str" with "rewrite".
+ // Within "rewrite", backslash-escaped digits (\1 to \9) can be
+ // used to insert text matching corresponding parenthesized group
+ // from the pattern. \0 in "rewrite" refers to the entire matching
+ // text. E.g.,
+ //
+ // std::string s = "yabba dabba doo";
+ // CHECK(RE2::Replace(&s, "b+", "d"));
+ //
+ // will leave "s" containing "yada dabba doo"
+ //
+ // Returns true if the pattern matches and a replacement occurs,
+ // false otherwise.
+ static bool Replace(std::string* str,
+ const RE2& re,
+ absl::string_view rewrite);
+
+ // Like Replace(), except replaces successive non-overlapping occurrences
+ // of the pattern in the string with the rewrite. E.g.
+ //
+ // std::string s = "yabba dabba doo";
+ // CHECK(RE2::GlobalReplace(&s, "b+", "d"));
+ //
+ // will leave "s" containing "yada dada doo"
+ // Replacements are not subject to re-matching.
+ //
+ // Because GlobalReplace only replaces non-overlapping matches,
+ // replacing "ana" within "banana" makes only one replacement, not two.
+ //
+ // Returns the number of replacements made.
+ static int GlobalReplace(std::string* str,
+ const RE2& re,
+ absl::string_view rewrite);
+
+ // Like Replace, except that if the pattern matches, "rewrite"
+ // is copied into "out" with substitutions. The non-matching
+ // portions of "text" are ignored.
+ //
+ // Returns true iff a match occurred and the extraction happened
+ // successfully; if no match occurs, the string is left unaffected.
+ //
+ // REQUIRES: "text" must not alias any part of "*out".
+ static bool Extract(absl::string_view text,
+ const RE2& re,
+ absl::string_view rewrite,
+ std::string* out);
+
+ // Escapes all potentially meaningful regexp characters in
+ // 'unquoted'. The returned string, used as a regular expression,
+ // will match exactly the original string. For example,
+ // 1.5-2.0?
+ // may become:
+ // 1\.5\-2\.0\?
+ static std::string QuoteMeta(absl::string_view unquoted);
+
+ // Computes range for any strings matching regexp. The min and max can in
+ // some cases be arbitrarily precise, so the caller gets to specify the
+ // maximum desired length of string returned.
+ //
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
+ // string s that is an anchored match for this regexp satisfies
+ // min <= s && s <= max.
+ //
+ // Note that PossibleMatchRange() will only consider the first copy of an
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
+ // do not compile down to infinite repetitions.
+ //
+ // Returns true on success, false on error.
+ bool PossibleMatchRange(std::string* min, std::string* max,
+ int maxlen) const;
+
+ // Generic matching interface
+
+ // Type of match.
+ enum Anchor {
+ UNANCHORED, // No anchoring
+ ANCHOR_START, // Anchor at start only
+ ANCHOR_BOTH // Anchor at start and end
+ };
+
+ // Return the number of capturing sub-patterns, or -1 if the
+ // regexp wasn't valid on construction. The overall match ($0)
+ // does not count: if the regexp is "(a)(b)", returns 2.
+ int NumberOfCapturingGroups() const { return num_captures_; }
+
+ // Return a map from names to capturing indices.
+ // The map records the index of the leftmost group
+ // with the given name.
+ // Only valid until the re is deleted.
+ const std::map<std::string, int>& NamedCapturingGroups() const;
+
+ // Return a map from capturing indices to names.
+ // The map has no entries for unnamed groups.
+ // Only valid until the re is deleted.
+ const std::map<int, std::string>& CapturingGroupNames() const;
+
+ // General matching routine.
+ // Match against text starting at offset startpos
+ // and stopping the search at offset endpos.
+ // Returns true if match found, false if not.
+ // On a successful match, fills in submatch[] (up to nsubmatch entries)
+ // with information about submatches.
+ // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with
+ // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar",
+ // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL.
+ // Caveat: submatch[] may be clobbered even on match failure.
+ //
+ // Don't ask for more match information than you will use:
+ // runs much faster with nsubmatch == 1 than nsubmatch > 1, and
+ // runs even faster if nsubmatch == 0.
+ // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(),
+ // but will be handled correctly.
+ //
+ // Passing text == absl::string_view() will be handled like any other
+ // empty string, but note that on return, it will not be possible to tell
+ // whether submatch i matched the empty string or did not match:
+ // either way, submatch[i].data() == NULL.
+ bool Match(absl::string_view text,
+ size_t startpos,
+ size_t endpos,
+ Anchor re_anchor,
+ absl::string_view* submatch,
+ int nsubmatch) const;
+
+ // Check that the given rewrite string is suitable for use with this
+ // regular expression. It checks that:
+ // * The regular expression has enough parenthesized subexpressions
+ // to satisfy all of the \N tokens in rewrite
+ // * The rewrite string doesn't have any syntax errors. E.g.,
+ // '\' followed by anything other than a digit or '\'.
+ // A true return value guarantees that Replace() and Extract() won't
+ // fail because of a bad rewrite string.
+ bool CheckRewriteString(absl::string_view rewrite,
+ std::string* error) const;
+
+ // Returns the maximum submatch needed for the rewrite to be done by
+ // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
+ static int MaxSubmatch(absl::string_view rewrite);
+
+ // Append the "rewrite" string, with backslash substitutions from "vec",
+ // to string "out".
+ // Returns true on success. This method can fail because of a malformed
+ // rewrite string. CheckRewriteString guarantees that the rewrite will
+ // be successful.
+ bool Rewrite(std::string* out,
+ absl::string_view rewrite,
+ const absl::string_view* vec,
+ int veclen) const;
+
+ // Constructor options
+ class Options {
+ public:
+ // The options are (defaults in parentheses):
+ //
+ // encoding (EncodingUTF8) text and pattern are UTF-8; otherwise Latin-1
+ // posix_syntax (false) restrict regexps to POSIX egrep syntax
+ // longest_match (false) search for longest match, not first match
+ // log_errors (true) log syntax and execution errors to ERROR
+ // max_mem (see below) approx. max memory footprint of RE2
+ // literal (false) interpret string as literal, not regexp
+ // never_nl (false) never match \n, even if it is in regexp
+ // dot_nl (false) dot matches everything including new line
+ // never_capture (false) parse all parens as non-capturing
+ // case_sensitive (true) match is case-sensitive (regexp can override
+ // with (?i) unless in posix_syntax mode)
+ //
+ // The following options are only consulted when posix_syntax == true.
+ // When posix_syntax == false, these features are always enabled and
+ // cannot be turned off; to perform multi-line matching in that case,
+ // begin the regexp with (?m).
+ // perl_classes (false) allow Perl's \d \s \w \D \S \W
+ // word_boundary (false) allow Perl's \b \B (word boundary and not)
+ // one_line (false) ^ and $ only match beginning and end of text
+ //
+ // The max_mem option controls how much memory can be used
+ // to hold the compiled form of the regexp (the Prog) and
+ // its cached DFA graphs. Code Search placed limits on the number
+ // of Prog instructions and DFA states: 10,000 for both.
+ // In RE2, those limits would translate to about 240 KB per Prog
+ // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
+ // better job of keeping them small than Code Search did).
+ // Each RE2 has two Progs (one forward, one reverse), and each Prog
+ // can have two DFAs (one first match, one longest match).
+ // That makes 4 DFAs:
+ //
+ // forward, first-match - used for UNANCHORED or ANCHOR_START searches
+ // if opt.longest_match() == false
+ // forward, longest-match - used for all ANCHOR_BOTH searches,
+ // and the other two kinds if
+ // opt.longest_match() == true
+ // reverse, first-match - never used
+ // reverse, longest-match - used as second phase for unanchored searches
+ //
+ // The RE2 memory budget is statically divided between the two
+ // Progs and then the DFAs: two thirds to the forward Prog
+ // and one third to the reverse Prog. The forward Prog gives half
+ // of what it has left over to each of its DFAs. The reverse Prog
+ // gives it all to its longest-match DFA.
+ //
+ // Once a DFA fills its budget, it flushes its cache and starts over.
+ // If this happens too often, RE2 falls back on the NFA implementation.
+
+ // For now, make the default budget something close to Code Search.
+ static const int kDefaultMaxMem = 8<<20;
+
+ enum Encoding {
+ EncodingUTF8 = 1,
+ EncodingLatin1
+ };
+
+ Options() :
+ max_mem_(kDefaultMaxMem),
+ encoding_(EncodingUTF8),
+ posix_syntax_(false),
+ longest_match_(false),
+ log_errors_(true),
+ literal_(false),
+ never_nl_(false),
+ dot_nl_(false),
+ never_capture_(false),
+ case_sensitive_(true),
+ perl_classes_(false),
+ word_boundary_(false),
+ one_line_(false) {
+ }
+
+ /*implicit*/ Options(CannedOptions);
+
+ int64_t max_mem() const { return max_mem_; }
+ void set_max_mem(int64_t m) { max_mem_ = m; }
+
+ Encoding encoding() const { return encoding_; }
+ void set_encoding(Encoding encoding) { encoding_ = encoding; }
+
+ bool posix_syntax() const { return posix_syntax_; }
+ void set_posix_syntax(bool b) { posix_syntax_ = b; }
+
+ bool longest_match() const { return longest_match_; }
+ void set_longest_match(bool b) { longest_match_ = b; }
+
+ bool log_errors() const { return log_errors_; }
+ void set_log_errors(bool b) { log_errors_ = b; }
+
+ bool literal() const { return literal_; }
+ void set_literal(bool b) { literal_ = b; }
+
+ bool never_nl() const { return never_nl_; }
+ void set_never_nl(bool b) { never_nl_ = b; }
+
+ bool dot_nl() const { return dot_nl_; }
+ void set_dot_nl(bool b) { dot_nl_ = b; }
+
+ bool never_capture() const { return never_capture_; }
+ void set_never_capture(bool b) { never_capture_ = b; }
+
+ bool case_sensitive() const { return case_sensitive_; }
+ void set_case_sensitive(bool b) { case_sensitive_ = b; }
+
+ bool perl_classes() const { return perl_classes_; }
+ void set_perl_classes(bool b) { perl_classes_ = b; }
+
+ bool word_boundary() const { return word_boundary_; }
+ void set_word_boundary(bool b) { word_boundary_ = b; }
+
+ bool one_line() const { return one_line_; }
+ void set_one_line(bool b) { one_line_ = b; }
+
+ void Copy(const Options& src) {
+ *this = src;
+ }
+
+ int ParseFlags() const;
+
+ private:
+ int64_t max_mem_;
+ Encoding encoding_;
+ bool posix_syntax_;
+ bool longest_match_;
+ bool log_errors_;
+ bool literal_;
+ bool never_nl_;
+ bool dot_nl_;
+ bool never_capture_;
+ bool case_sensitive_;
+ bool perl_classes_;
+ bool word_boundary_;
+ bool one_line_;
+ };
+
+ // Returns the options set in the constructor.
+ const Options& options() const { return options_; }
+
+ // Argument converters; see below.
+ template <typename T>
+ static Arg CRadix(T* ptr);
+ template <typename T>
+ static Arg Hex(T* ptr);
+ template <typename T>
+ static Arg Octal(T* ptr);
+
+ // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited.
+ // FOR FUZZING ONLY.
+ static void FUZZING_ONLY_set_maximum_global_replace_count(int i);
+
+ private:
+ void Init(absl::string_view pattern, const Options& options);
+
+ bool DoMatch(absl::string_view text,
+ Anchor re_anchor,
+ size_t* consumed,
+ const Arg* const args[],
+ int n) const;
+
+ re2::Prog* ReverseProg() const;
+
+ // First cache line is relatively cold fields.
+ const std::string* pattern_; // string regular expression
+ Options options_; // option flags
+ re2::Regexp* entire_regexp_; // parsed regular expression
+ re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed
+ const std::string* error_; // error indicator (or points to empty string)
+ const std::string* error_arg_; // fragment of regexp showing error (or ditto)
+
+ // Second cache line is relatively hot fields.
+ // These are ordered oddly to pack everything.
+ int num_captures_; // number of capturing groups
+ ErrorCode error_code_ : 29; // error code (29 bits is more than enough)
+ bool longest_match_ : 1; // cached copy of options_.longest_match()
+ bool is_one_pass_ : 1; // can use prog_->SearchOnePass?
+ bool prefix_foldcase_ : 1; // prefix_ is ASCII case-insensitive
+ std::string prefix_; // required prefix (before suffix_regexp_)
+ re2::Prog* prog_; // compiled program for regexp
+
+ // Reverse Prog for DFA execution only
+ mutable re2::Prog* rprog_;
+ // Map from capture names to indices
+ mutable const std::map<std::string, int>* named_groups_;
+ // Map from capture indices to names
+ mutable const std::map<int, std::string>* group_names_;
+
+ mutable absl::once_flag rprog_once_;
+ mutable absl::once_flag named_groups_once_;
+ mutable absl::once_flag group_names_once_;
+};
+
+/***** Implementation details *****/
+
+namespace re2_internal {
+
+// Types for which the 3-ary Parse() function template has specializations.
+template <typename T> struct Parse3ary : public std::false_type {};
+template <> struct Parse3ary<void> : public std::true_type {};
+template <> struct Parse3ary<std::string> : public std::true_type {};
+template <> struct Parse3ary<absl::string_view> : public std::true_type {};
+template <> struct Parse3ary<char> : public std::true_type {};
+template <> struct Parse3ary<signed char> : public std::true_type {};
+template <> struct Parse3ary<unsigned char> : public std::true_type {};
+template <> struct Parse3ary<float> : public std::true_type {};
+template <> struct Parse3ary<double> : public std::true_type {};
+
+template <typename T>
+bool Parse(const char* str, size_t n, T* dest);
+
+// Types for which the 4-ary Parse() function template has specializations.
+template <typename T> struct Parse4ary : public std::false_type {};
+template <> struct Parse4ary<long> : public std::true_type {};
+template <> struct Parse4ary<unsigned long> : public std::true_type {};
+template <> struct Parse4ary<short> : public std::true_type {};
+template <> struct Parse4ary<unsigned short> : public std::true_type {};
+template <> struct Parse4ary<int> : public std::true_type {};
+template <> struct Parse4ary<unsigned int> : public std::true_type {};
+template <> struct Parse4ary<long long> : public std::true_type {};
+template <> struct Parse4ary<unsigned long long> : public std::true_type {};
+
+template <typename T>
+bool Parse(const char* str, size_t n, T* dest, int radix);
+
+// Support absl::optional<T> for all T with a stock parser.
+template <typename T> struct Parse3ary<absl::optional<T>> : public Parse3ary<T> {};
+template <typename T> struct Parse4ary<absl::optional<T>> : public Parse4ary<T> {};
+
+template <typename T>
+bool Parse(const char* str, size_t n, absl::optional<T>* dest) {  // 3-ary parser for absl::optional<T> destinations
+ if (str == NULL) {  // null string: there is nothing to parse
+ if (dest != NULL)
+ dest->reset();  // leave the optional empty
+ return true;  // a null string still counts as success for an optional
+ }
+ T tmp;
+ if (Parse(str, n, &tmp)) {  // parse into a temporary with the 3-ary Parse for T
+ if (dest != NULL)
+ dest->emplace(std::move(tmp));  // store the value only after a successful parse
+ return true;
+ }
+ return false;  // parse failed; *dest is not touched
+}
+
+template <typename T>
+bool Parse(const char* str, size_t n, absl::optional<T>* dest, int radix) {  // 4-ary (radix) parser for absl::optional<T> destinations
+ if (str == NULL) {  // null string: there is nothing to parse
+ if (dest != NULL)
+ dest->reset();  // leave the optional empty
+ return true;  // a null string still counts as success for an optional
+ }
+ T tmp;
+ if (Parse(str, n, &tmp, radix)) {  // parse into a temporary with the 4-ary Parse for T, same radix
+ if (dest != NULL)
+ dest->emplace(std::move(tmp));  // store the value only after a successful parse
+ return true;
+ }
+ return false;  // parse failed; *dest is not touched
+}
+
+} // namespace re2_internal
+
+class RE2::Arg {  // one capture destination: an untyped pointer plus the function that parses into it
+ private:
+ template <typename T>
+ using CanParse3ary = typename std::enable_if<  // names int only when Parse3ary<T> is true
+ re2_internal::Parse3ary<T>::value,
+ int>::type;
+
+ template <typename T>
+ using CanParse4ary = typename std::enable_if<  // names int only when Parse4ary<T> is true
+ re2_internal::Parse4ary<T>::value,
+ int>::type;
+
+#if !defined(_MSC_VER)  // the ParseFrom path is compiled out under MSVC
+ template <typename T>
+ using CanParseFrom = typename std::enable_if<  // names int only when the cast below is well-formed
+ std::is_member_function_pointer<
+ decltype(static_cast<bool (T::*)(const char*, size_t)>(  // requires bool T::ParseFrom(const char*, size_t)
+ &T::ParseFrom))>::value,
+ int>::type;
+#endif
+
+ public:
+ Arg() : Arg(nullptr) {}  // same as passing nullptr
+ Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {}  // no destination: Parse() always returns true
+
+ template <typename T, CanParse3ary<T> = 0>
+ Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary<T>) {}  // selected for types listed in Parse3ary
+
+ template <typename T, CanParse4ary<T> = 0>
+ Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary<T>) {}  // selected for types listed in Parse4ary
+
+#if !defined(_MSC_VER)
+ template <typename T, CanParseFrom<T> = 0>
+ Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom<T>) {}  // selected for types with a ParseFrom member
+#endif
+
+ typedef bool (*Parser)(const char* str, size_t n, void* dest);  // signature shared by all parse functions
+
+ template <typename T>
+ Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {}  // caller-supplied parser; used by CRadix(), Hex() and Octal()
+
+ bool Parse(const char* str, size_t n) const {
+ return (*parser_)(str, n, arg_);  // forward to the stored parser with the stored destination
+ }
+
+ private:
+ static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) {
+ return true;  // ignores its input and reports success
+ }
+
+ template <typename T>
+ static bool DoParse3ary(const char* str, size_t n, void* dest) {
+ return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest));  // recover the T* stored as void*
+ }
+
+ template <typename T>
+ static bool DoParse4ary(const char* str, size_t n, void* dest) {
+ return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 10);  // unwrapped pointers parse as base 10
+ }
+
+#if !defined(_MSC_VER)
+ template <typename T>
+ static bool DoParseFrom(const char* str, size_t n, void* dest) {
+ if (dest == NULL) return true;  // no destination object: succeed without parsing
+ return reinterpret_cast<T*>(dest)->ParseFrom(str, n);  // delegate to the user type's ParseFrom
+ }
+#endif
+
+ void* arg_;  // destination pointer, type-erased
+ Parser parser_;  // function that fills arg_ from the matched text
+};
+
+template <typename T>
+inline RE2::Arg RE2::CRadix(T* ptr) {  // wraps ptr so the matched text is parsed with a C-style radix prefix
+ return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool {  // captureless lambda, passed as the Arg::Parser
+ return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 0);  // radix argument 0, unlike the fixed 16 and 8 used by Hex() and Octal()
+ });
+}
+
+template <typename T>
+inline RE2::Arg RE2::Hex(T* ptr) {  // wraps ptr so the matched text is parsed as base 16
+ return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool {  // captureless lambda, passed as the Arg::Parser
+ return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 16);  // radix argument 16
+ });
+}
+
+template <typename T>
+inline RE2::Arg RE2::Octal(T* ptr) {  // wraps ptr so the matched text is parsed as base 8
+ return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool {  // captureless lambda, passed as the Arg::Parser
+ return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 8);  // radix argument 8
+ });
+}
+
+// Silence warnings about missing initializers for members of LazyRE2.
+#if !defined(__clang__) && defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif
+
+// Helper for writing global or static RE2s safely.
+// Write
+// static LazyRE2 re = {".*"};
+// and then use *re instead of writing
+// static RE2 re(".*");
+// The former is more careful about multithreaded
+// situations than the latter.
+//
+// N.B. This class never deletes the RE2 object that
+// it constructs: that's a feature, so that it can be used
+// for global and function static variables.
+class LazyRE2 {  // aggregate holding a pattern; the RE2 is built on first use
+ private:
+ struct NoArg {};  // empty type used only for the barrier field below
+
+ public:
+ typedef RE2 element_type; // support std::pointer_traits
+
+ // Constructor omitted to preserve braced initialization in C++98.
+
+ // Pretend to be a pointer to RE2 (never NULL due to on-demand creation):
+ RE2& operator*() const { return *get(); }
+ RE2* operator->() const { return get(); }
+
+ // Named accessor/initializer:
+ RE2* get() const {
+ absl::call_once(once_, &LazyRE2::Init, this);  // runs Init(this) once, guarded by once_
+ return ptr_;  // set by Init()
+ }
+
+ // All data fields must be public to support {"foo"} initialization.
+ const char* pattern_;  // first braced initializer: the regexp text
+ RE2::CannedOptions options_;  // optional second initializer
+ NoArg barrier_against_excess_initializers_;  // presumably rejects a third initializer, since NoArg is private - confirm
+
+ mutable RE2* ptr_;  // mutable: Init() writes it through a const LazyRE2*
+ mutable absl::once_flag once_;  // mutable: get() is const
+
+ private:
+ static void Init(const LazyRE2* lazy_re2) {
+ lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);  // no matching delete in this class
+ }
+
+ void operator=(const LazyRE2&); // disallowed
+};
+
+namespace hooks {
+
+// Most platforms support thread_local. Older versions of iOS don't support
+// thread_local, but for the sake of brevity, we lump together all versions
+// of Apple platforms that aren't macOS. If an iOS application really needs
+// the context pointee someday, we can get more specific then...
+//
+// As per https://github.com/google/re2/issues/325, thread_local support in
+// MinGW seems to be buggy. (FWIW, Abseil folks also avoid it.)
+#define RE2_HAVE_THREAD_LOCAL
+#if (defined(__APPLE__) && !(defined(TARGET_OS_OSX) && TARGET_OS_OSX)) || defined(__MINGW32__)
+#undef RE2_HAVE_THREAD_LOCAL
+#endif
+
+// A hook must not make any assumptions regarding the lifetime of the context
+// pointee beyond the current invocation of the hook. Pointers and references
+// obtained via the context pointee should be considered invalidated when the
+// hook returns. Hence, any data about the context pointee (e.g. its pattern)
+// would have to be copied in order for it to be kept for an indefinite time.
+//
+// A hook must not use RE2 for matching. Control flow reentering RE2::Match()
+// could result in infinite mutual recursion. To discourage that possibility,
+// RE2 will not maintain the context pointer correctly when used in that way.
+#ifdef RE2_HAVE_THREAD_LOCAL
+extern thread_local const RE2* context;
+#endif
+
+struct DFAStateCacheReset {
+ int64_t state_budget;
+ size_t state_cache_size;
+};
+
+struct DFASearchFailure {
+ // Nothing yet...
+};
+
+#define DECLARE_HOOK(type) \
+ using type##Callback = void(const type&); \
+ void Set##type##Hook(type##Callback* cb); \
+ type##Callback* Get##type##Hook();
+
+DECLARE_HOOK(DFAStateCacheReset)
+DECLARE_HOOK(DFASearchFailure)
+
+#undef DECLARE_HOOK
+
+} // namespace hooks
+
+} // namespace re2
+
+using re2::RE2;
+using re2::LazyRE2;
+
+#endif // RE2_RE2_H_
diff --git a/third_party/re2/src/re2/regexp.cc b/third_party/re2/src/re2/regexp.cc
new file mode 100644
index 000000000..4ea81cfcd
--- /dev/null
+++ b/third_party/re2/src/re2/regexp.cc
@@ -0,0 +1,1002 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression representation.
+// Tested by parse_test.cc
+
+#include "re2/regexp.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "absl/base/call_once.h"
+#include "absl/base/macros.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/synchronization/mutex.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/pod_array.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Constructor. Creates a node for `op` that holds one reference, has no
+// sub-expressions, and has the_union_ zero-filled.
+Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
+    : op_(static_cast<uint8_t>(op)),
+      simple_(false),
+      parse_flags_(static_cast<uint16_t>(parse_flags)),
+      ref_(1),
+      nsub_(0),
+      down_(NULL) {
+  // Clear the single-child slot, then every byte of the_union_.
+  subone_ = NULL;
+  memset(the_union_, 0, sizeof(the_union_));
+}
+
+// Destructor. Private: a Regexp is released through Decref(), never
+// through delete. The children must already be gone by the time we get
+// here: calling Decref() on them from the destructor could recurse
+// arbitrarily deep, so Decref() is required to have handled them for us.
+Regexp::~Regexp() {
+  if (nsub_ > 0)
+    LOG(DFATAL) << "Regexp not destroyed.";
+
+  // Release whatever the operator-specific fields own.
+  if (op_ == kRegexpCapture) {
+    delete name_;
+  } else if (op_ == kRegexpLiteralString) {
+    delete[] runes_;
+  } else if (op_ == kRegexpCharClass) {
+    if (cc_)
+      cc_->Delete();
+    delete ccb_;
+  }
+}
+
+// Deletes this regexp and returns true when that needs no recursion,
+// i.e. when it has no sub-expressions. Otherwise returns false and
+// leaves the object alone.
+bool Regexp::QuickDestroy() {
+  if (nsub_ != 0)
+    return false;
+  delete this;
+  return true;
+}
+
+// Similar to EmptyStorage in re2.cc.
+struct RefStorage {
+ absl::Mutex ref_mutex;  // held around every access to ref_map in this file
+ absl::flat_hash_map<Regexp*, int> ref_map;  // reference counts for Regexps whose ref_ is pinned at kMaxRef
+};
+alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];  // raw bytes; Incref() placement-news the RefStorage here on first overflow
+
+// Returns the mutex stored inside ref_storage.
+static inline absl::Mutex* ref_mutex() {
+  RefStorage* storage = reinterpret_cast<RefStorage*>(ref_storage);
+  return &storage->ref_mutex;
+}
+
+// Returns the overflow reference-count map stored inside ref_storage.
+static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
+  RefStorage* storage = reinterpret_cast<RefStorage*>(ref_storage);
+  return &storage->ref_map;
+}
+
+// Returns the current reference count, reading it from the overflow map
+// once the inline counter has reached kMaxRef.
+int Regexp::Ref() {
+  if (ref_ >= kMaxRef) {
+    absl::MutexLock l(ref_mutex());
+    return (*ref_map())[this];
+  }
+  return ref_;
+}
+
+// Adds one reference and returns this object as a convenience.
+Regexp* Regexp::Incref() {
+  // Common case: the inline counter still has room.
+  if (ref_ < kMaxRef-1) {
+    ref_++;
+    return this;
+  }
+
+  // Construct the shared overflow storage the first time it is needed.
+  static absl::once_flag ref_once;
+  absl::call_once(ref_once, []() {
+    (void) new (ref_storage) RefStorage;
+  });
+
+  // From here on the count is kept in the overflow map.
+  absl::MutexLock l(ref_mutex());
+  if (ref_ != kMaxRef) {
+    // Overflowing right now: seed the map entry and pin ref_.
+    (*ref_map())[this] = kMaxRef;
+    ref_ = kMaxRef;
+  } else {
+    // Overflowed earlier: just bump the map entry.
+    (*ref_map())[this]++;
+  }
+  return this;
+}
+
+// Drops one reference and destroys this object when the count reaches 0.
+void Regexp::Decref() {
+  if (ref_ != kMaxRef) {
+    // The inline counter holds the count.
+    if (--ref_ == 0)
+      Destroy();
+    return;
+  }
+
+  // The count is stored in the overflow map.
+  absl::MutexLock l(ref_mutex());
+  int r = (*ref_map())[this] - 1;
+  if (r >= kMaxRef) {
+    (*ref_map())[this] = r;
+    return;
+  }
+  // The count fits inline again: move it back and drop the map entry.
+  ref_ = static_cast<uint16_t>(r);
+  ref_map()->erase(this);
+}
+
+// Deletes this object; ref count has reached 0.
+void Regexp::Destroy() {
+ if (QuickDestroy())
+ return;
+
+ // Handle recursive Destroy with explicit stack
+ // to avoid arbitrarily deep recursion on process stack [sigh].
+ down_ = NULL;
+ Regexp* stack = this;  // intrusive list of nodes still to be deleted, linked through down_
+ while (stack != NULL) {
+ Regexp* re = stack;
+ stack = re->down_;  // pop
+ if (re->ref_ != 0)
+ LOG(DFATAL) << "Bad reference count " << re->ref_;
+ if (re->nsub_ > 0) {
+ Regexp** subs = re->sub();
+ for (int i = 0; i < re->nsub_; i++) {
+ Regexp* sub = subs[i];
+ if (sub == NULL)
+ continue;
+ if (sub->ref_ == kMaxRef)
+ sub->Decref();  // count lives in the overflow map; Decref() knows how to update it
+ else
+ --sub->ref_;
+ if (sub->ref_ == 0 && !sub->QuickDestroy()) {
+ sub->down_ = stack;  // push: the child has children of its own, handle it in a later iteration
+ stack = sub;
+ }
+ }
+ if (re->nsub_ > 1)
+ delete[] subs;  // NOTE(review): presumably sub() is a separately allocated array only when nsub_ > 1 - confirm in regexp.h
+ re->nsub_ = 0;  // so that ~Regexp() does not log "Regexp not destroyed."
+ }
+ delete re;
+ }
+}
+
+// Appends r to the literal string. The rune buffer starts with room for
+// 8 runes and is doubled whenever the length reaches a power of two.
+void Regexp::AddRuneToString(Rune r) {
+  DCHECK(op_ == kRegexpLiteralString);
+  if (nrunes_ == 0) {
+    // First rune: allocate the initial buffer.
+    runes_ = new Rune[8];
+  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
+    // Length hit a power of two: move the runes to a buffer twice as big.
+    Rune* bigger = new Rune[nrunes_ * 2];
+    std::copy(runes_, runes_ + nrunes_, bigger);
+    delete[] runes_;
+    runes_ = bigger;
+  }
+
+  runes_[nrunes_] = r;
+  nrunes_++;
+}
+
+// Creates a kRegexpHaveMatch node that carries match_id.
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
+  node->match_id_ = match_id;
+  return node;
+}
+
+// Wraps sub in a Star, Plus or Quest node (per op), collapsing directly
+// stacked repetition operators when both carry the same parse flags.
+Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
+  if (flags == sub->parse_flags()) {
+    const RegexpOp inner = sub->op();
+
+    // Squash **, ++ and ??.
+    if (op == inner)
+      return sub;
+
+    // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, and since
+    // op is Star/Plus/Quest only the inner operator needs checking.
+    if (inner == kRegexpStar)
+      return sub;  // Already a Star, no need to rewrite it.
+    if (inner == kRegexpPlus || inner == kRegexpQuest) {
+      // Build a Star over sub's operand and let go of sub itself.
+      Regexp* star = new Regexp(kRegexpStar, flags);
+      star->AllocSub(1);
+      star->sub()[0] = sub->sub()[0]->Incref();
+      sub->Decref();  // We didn't consume the reference after all.
+      return star;
+    }
+  }
+
+  Regexp* re = new Regexp(op, flags);
+  re->AllocSub(1);
+  re->sub()[0] = sub;
+  return re;
+}
+
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+ return StarPlusOrQuest(kRegexpPlus, sub, flags);  // may hand back sub itself or a Star when stacked operators collapse
+}
+
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
+ return StarPlusOrQuest(kRegexpStar, sub, flags);  // may hand back sub itself or a rebuilt Star when stacked operators collapse
+}
+
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
+ return StarPlusOrQuest(kRegexpQuest, sub, flags);  // may hand back sub itself or a Star when stacked operators collapse
+}
+
+// Builds a Concat or Alternate node (per op) over the nsub expressions in
+// sub. No operands yields an empty-match node (or a no-match node for
+// Alternate), and a single operand is returned unchanged. When can_factor
+// is set, an alternation is first run through FactorAlternation().
+// Operand lists longer than kMaxNsub are split into a two-level tree.
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+                                  ParseFlags flags, bool can_factor) {
+  switch (nsub) {
+    case 0:
+      return new Regexp(
+          op == kRegexpAlternate ? kRegexpNoMatch : kRegexpEmptyMatch, flags);
+    case 1:
+      return sub[0];
+    default:
+      break;
+  }
+
+  PODArray<Regexp*> scratch;  // Backs sub after factoring; must live to the end.
+  if (can_factor && op == kRegexpAlternate) {
+    // Factoring edits the array, so work on a private copy instead of
+    // stepping on the caller's.
+    scratch = PODArray<Regexp*>(nsub);
+    memmove(scratch.data(), sub, nsub * sizeof sub[0]);
+    sub = scratch.data();
+    nsub = FactorAlternation(sub, nsub, flags);
+    if (nsub == 1)
+      return sub[0];
+  }
+
+  Regexp* re = new Regexp(op, flags);
+  if (nsub <= kMaxNsub) {
+    // Everything fits in a single node.
+    re->AllocSub(nsub);
+    Regexp** subs = re->sub();
+    for (int i = 0; i < nsub; i++)
+      subs[i] = sub[i];
+    return re;
+  }
+
+  // Too many subexpressions to fit in a single Regexp.
+  // Make a two-level tree. Two levels gets us to 65535^2.
+  const int nchunks = (nsub + kMaxNsub - 1) / kMaxNsub;
+  re->AllocSub(nchunks);
+  Regexp** subs = re->sub();
+  for (int i = 0; i < nchunks; i++) {
+    // Every chunk is kMaxNsub long except possibly the last one.
+    const int start = i * kMaxNsub;
+    const int len = std::min<int>(kMaxNsub, nsub - start);
+    subs[i] = ConcatOrAlternate(op, sub + start, len, flags, false);
+  }
+  return re;
+}
+
+Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);  // false: no factoring requested
+}
+
+Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);  // true: let FactorAlternation() rewrite the operands first
+}
+
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);  // false: keep the operands exactly as given
+}
+
+// Creates a capture node with index cap around sub.
+Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
+  Regexp* group = new Regexp(kRegexpCapture, flags);
+  group->AllocSub(1);
+  group->sub()[0] = sub;
+  group->cap_ = cap;
+  return group;
+}
+
+// Creates a repeat node around sub with the given min and max counts.
+Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
+  Regexp* rep = new Regexp(kRegexpRepeat, flags);
+  rep->AllocSub(1);
+  rep->sub()[0] = sub;
+  rep->min_ = min;
+  rep->max_ = max;
+  return rep;
+}
+
+// Creates a literal node for the single rune `rune`.
+Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
+  Regexp* lit = new Regexp(kRegexpLiteral, flags);
+  lit->rune_ = rune;
+  return lit;
+}
+
+// Returns a regexp for the nrunes runes in runes: an empty-match node
+// for no runes, a single literal for one rune, and a literal string
+// otherwise.
+Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
+  if (nrunes <= 0)
+    return new Regexp(kRegexpEmptyMatch, flags);
+  if (nrunes == 1)
+    return NewLiteral(runes[0], flags);
+
+  Regexp* str = new Regexp(kRegexpLiteralString, flags);
+  for (Rune* p = runes; p != runes + nrunes; ++p)
+    str->AddRuneToString(*p);
+  return str;
+}
+
+// Creates a character-class node whose cc_ points at cc.
+Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpCharClass, flags);
+  node->cc_ = cc;
+  return node;
+}
+
+// Exchanges the complete object representations of *this and *that.
+void Regexp::Swap(Regexp* that) {
+  // Regexp is not trivially copyable, so we cannot freely copy it with
+  // memmove(3), but swapping objects like so is safe for our purposes.
+  void* const self = reinterpret_cast<void*>(this);
+  void* const other = reinterpret_cast<void*>(that);
+  char scratch[sizeof(Regexp)];
+  memmove(scratch, self, sizeof scratch);
+  memmove(self, other, sizeof scratch);
+  memmove(other, scratch, sizeof scratch);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+static bool TopEqual(Regexp* a, Regexp* b) {
+ if (a->op() != b->op())
+ return false;
+
+ switch (a->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpBeginText:
+ return true;  // nothing beyond the operator is compared for these
+
+ case kRegexpEndText:
+ // The parse flags remember whether it's \z or (?-m:$),
+ // which matters when testing against PCRE.
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+ case kRegexpLiteral:
+ return a->rune() == b->rune() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;  // XOR-and-mask: the FoldCase bit must agree
+
+ case kRegexpLiteralString:
+ return a->nrunes() == b->nrunes() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+ memcmp(a->runes(), b->runes(),
+ a->nrunes() * sizeof a->runes()[0]) == 0;
+
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ return a->nsub() == b->nsub();  // only the operand count; the operands themselves are the caller's job
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+ case kRegexpRepeat:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+ a->min() == b->min() &&
+ a->max() == b->max();
+
+ case kRegexpCapture:
+ if (a->name() == NULL || b->name() == NULL) {
+ // One pointer is null, so the other pointer should also be null.
+ return a->cap() == b->cap() && a->name() == b->name();
+ } else {
+ // Neither pointer is null, so compare the pointees for equality.
+ return a->cap() == b->cap() && *a->name() == *b->name();
+ }
+
+ case kRegexpHaveMatch:
+ return a->match_id() == b->match_id();
+
+ case kRegexpCharClass: {
+ CharClass* acc = a->cc();
+ CharClass* bcc = b->cc();
+ return acc->size() == bcc->size() &&
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+ memcmp(acc->begin(), bcc->begin(),
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;  // byte-wise comparison of the two range arrays
+ }
+ }
+
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();  // reached only for an op with no case above
+ return 0;  // converts to false
+}
+
+bool Regexp::Equal(Regexp* a, Regexp* b) {  // structural equality of two regexp trees, computed without recursion
+ if (a == NULL || b == NULL)
+ return a == b;  // equal only when both are NULL
+
+ if (!TopEqual(a, b))
+ return false;
+
+ // Fast path:
+ // return without allocating vector if there are no subregexps.
+ switch (a->op()) {
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ break;
+
+ default:
+ return true;
+ }
+
+ // Committed to doing real work.
+ // The stack (vector) has pairs of regexps waiting to
+ // be compared. The regexps are only equal if
+ // all the pairs end up being equal.
+ std::vector<Regexp*> stk;
+
+ for (;;) {
+ // Invariant: TopEqual(a, b) == true.
+ Regexp* a2;
+ Regexp* b2;
+ switch (a->op()) {
+ default:
+ break;
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ for (int i = 0; i < a->nsub(); i++) {  // TopEqual() already established a->nsub() == b->nsub()
+ a2 = a->sub()[i];
+ b2 = b->sub()[i];
+ if (!TopEqual(a2, b2))
+ return false;
+ stk.push_back(a2);  // pushed as a pair: a-side first, b-side second
+ stk.push_back(b2);
+ }
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ a2 = a->sub()[0];
+ b2 = b->sub()[0];
+ if (!TopEqual(a2, b2))
+ return false;
+ // Really:
+ // stk.push_back(a2);
+ // stk.push_back(b2);
+ // break;
+ // but faster to assign directly and loop.
+ a = a2;
+ b = b2;
+ continue;
+ }
+
+ size_t n = stk.size();
+ if (n == 0)
+ break;  // no pending pairs left: everything compared equal
+
+ DCHECK_GE(n, 2);
+ a = stk[n-2];  // pop the most recently pushed pair
+ b = stk[n-1];
+ stk.resize(n-2);
+ }
+
+ return true;
+}
+
+// Keep in sync with enum RegexpStatusCode in regexp.h
+static const char *kErrorStrings[] = {  // indexed by the numeric RegexpStatusCode value, see RegexpStatus::CodeText()
+ "no error",
+ "unexpected error",
+ "invalid escape sequence",
+ "invalid character class",
+ "invalid character class range",
+ "missing ]",
+ "missing )",
+ "unexpected )",
+ "trailing \\",
+ "no argument for repetition operator",
+ "invalid repetition size",
+ "bad repetition operator",
+ "invalid perl operator",
+ "invalid UTF-8",
+ "invalid named capture group",
+};
+
+// Returns the message for code. A value outside the kErrorStrings table
+// is reported as kRegexpInternalError.
+std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
+  const bool known = code >= 0 && code < ABSL_ARRAYSIZE(kErrorStrings);
+  if (!known)
+    code = kRegexpInternalError;
+  return kErrorStrings[code];
+}
+
+std::string RegexpStatus::Text() const {
+ if (error_arg_.empty())
+ return CodeText(code_);
+ std::string s;
+ s.append(CodeText(code_));
+ s.append(": ");
+ s.append(error_arg_.data(), error_arg_.size());
+ return s;
+}
+
+void RegexpStatus::Copy(const RegexpStatus& status) {
+ code_ = status.code_;
+ error_arg_ = status.error_arg_;
+}
+
+using Ignored = int;  // Walker<void> doesn't exist
+
+// Walker subclass that counts the capturing parens in a regexp.
+class NumCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+  NumCapturesWalker() : ncapture_(0) {}
+
+  // Number of kRegexpCapture nodes seen by PreVisit() so far.
+  int ncapture() const { return ncapture_; }
+
+  // Counts re if it is a capture; the walk value is passed through as is.
+  // Marked override so that a signature change in Regexp::Walker becomes
+  // a compile error instead of a silently unused method.
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) override {
+    if (re->op() == kRegexpCapture)
+      ncapture_++;
+    return ignored;
+  }
+
+  Ignored ShortVisit(Regexp* re, Ignored ignored) override {
+    // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
+#endif
+    return ignored;
+  }
+
+ private:
+  int ncapture_;
+
+  NumCapturesWalker(const NumCapturesWalker&) = delete;
+  NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
+};
+
+int Regexp::NumCaptures() {  // number of kRegexpCapture nodes in this regexp
+ NumCapturesWalker w;
+ w.Walk(this, 0);  // the 0 is the Ignored walk value; only the walker's counter matters
+ return w.ncapture();
+}
+
+// Walker class to build map of named capture groups and their indices.
+class NamedCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+  NamedCapturesWalker() : map_(NULL) {}
+  ~NamedCapturesWalker() { delete map_; }
+
+  // Hands the map over to the caller, who owns it from then on.
+  // Returns NULL when no named group was found.
+  std::map<std::string, int>* TakeMap() {
+    std::map<std::string, int>* m = map_;
+    map_ = NULL;
+    return m;
+  }
+
+  // Marked override so that a signature change in Regexp::Walker becomes
+  // a compile error instead of a silently unused method.
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) override {
+    if (re->op() == kRegexpCapture && re->name() != NULL) {
+      // Allocate map once we find a name.
+      if (map_ == NULL)
+        map_ = new std::map<std::string, int>;
+
+      // Record first occurrence of each name: insert() leaves an
+      // existing entry alone. (The rule is that if you have the same
+      // name multiple times, only the leftmost one counts.)
+      map_->insert({*re->name(), re->cap()});
+    }
+    return ignored;
+  }
+
+  Ignored ShortVisit(Regexp* re, Ignored ignored) override {
+    // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
+#endif
+    return ignored;
+  }
+
+ private:
+  std::map<std::string, int>* map_;  // owned until TakeMap() is called
+
+  NamedCapturesWalker(const NamedCapturesWalker&) = delete;
+  NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
+};
+
+std::map<std::string, int>* Regexp::NamedCaptures() {  // name -> index map; NULL when no group is named
+ NamedCapturesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();  // TakeMap() releases the map from the walker, so the caller receives ownership
+}
+
+// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+  CaptureNamesWalker() : map_(NULL) {}
+  ~CaptureNamesWalker() { delete map_; }
+
+  // Hands the map over to the caller, who owns it from then on.
+  // Returns NULL when no named group was found.
+  std::map<int, std::string>* TakeMap() {
+    std::map<int, std::string>* m = map_;
+    map_ = NULL;
+    return m;
+  }
+
+  // Marked override so that a signature change in Regexp::Walker becomes
+  // a compile error instead of a silently unused method.
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) override {
+    if (re->op() == kRegexpCapture && re->name() != NULL) {
+      // Allocate map once we find a name.
+      if (map_ == NULL)
+        map_ = new std::map<int, std::string>;
+
+      (*map_)[re->cap()] = *re->name();
+    }
+    return ignored;
+  }
+
+  Ignored ShortVisit(Regexp* re, Ignored ignored) override {
+    // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+#endif
+    return ignored;
+  }
+
+ private:
+  std::map<int, std::string>* map_;  // owned until TakeMap() is called
+
+  CaptureNamesWalker(const CaptureNamesWalker&) = delete;
+  CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
+};
+
+std::map<int, std::string>* Regexp::CaptureNames() {  // index -> name map; NULL when no group is named
+ CaptureNamesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();  // TakeMap() releases the map from the walker, so the caller receives ownership
+}
+
+// Writes the nrunes runes in runes to *bytes, replacing its previous
+// contents: one byte per rune when latin1 is set, otherwise whatever
+// runetochar() emits for each rune.
+void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
+                         std::string* bytes) {
+  if (!latin1) {
+    bytes->resize(nrunes * UTFmax);  // worst case
+    char* const start = &(*bytes)[0];
+    char* out = start;
+    for (int i = 0; i < nrunes; i++)
+      out += runetochar(out, &runes[i]);
+    // Trim to what was actually written and give back the slack.
+    bytes->resize(out - start);
+    bytes->shrink_to_fit();
+    return;
+  }
+
+  bytes->resize(nrunes);
+  for (int i = 0; i < nrunes; i++)
+    (*bytes)[i] = static_cast<char>(runes[i]);
+}
+
+// Determines whether regexp matches must be anchored
+// with a fixed string prefix. If so, returns the prefix and
+// the regexp that remains after the prefix. The prefix might
+// be ASCII case-insensitive. All three outputs are reset first,
+// so they hold defaults when the function returns false.
+bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
+                            Regexp** suffix) {
+  prefix->clear();
+  *foldcase = false;
+  *suffix = NULL;
+
+  // No need for a walker: the regexp must be a concatenation of
+  //   1. one or more ^ anchors
+  //   2. a literal char or string
+  //   3. the rest
+  if (op_ != kRegexpConcat)
+    return false;
+
+  int pos = 0;
+  for (; pos < nsub_; pos++) {
+    if (sub()[pos]->op_ != kRegexpBeginText)
+      break;
+  }
+  if (pos == 0 || pos >= nsub_)
+    return false;
+
+  Regexp* lit = sub()[pos++];
+  const bool single = lit->op_ == kRegexpLiteral;
+  if (!single && lit->op_ != kRegexpLiteralString)
+    return false;
+
+  if (pos >= nsub_) {
+    // Nothing follows the literal.
+    *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
+  } else {
+    // Take a reference on each remaining operand before handing them
+    // to Concat().
+    for (int j = pos; j < nsub_; j++)
+      sub()[j]->Incref();
+    *suffix = Concat(sub() + pos, nsub_ - pos, parse_flags());
+  }
+
+  const bool latin1 = (lit->parse_flags() & Latin1) != 0;
+  if (single)
+    ConvertRunesToBytes(latin1, &lit->rune_, 1, prefix);
+  else
+    ConvertRunesToBytes(latin1, lit->runes_, lit->nrunes_, prefix);
+  *foldcase = (lit->parse_flags() & FoldCase) != 0;
+  return true;
+}
+
+// Determines whether regexp matches must be unanchored
+// with a fixed string prefix. If so, returns the prefix.
+// The prefix might be ASCII case-insensitive. Both outputs are
+// reset first, so they hold defaults when the function returns false.
+bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
+  prefix->clear();
+  *foldcase = false;
+
+  // No need for a walker: the regexp must either begin with or be
+  // a literal char or string. We "see through" capturing groups,
+  // but make no effort to glue multiple prefix fragments together.
+  Regexp* re = this;
+  if (op_ == kRegexpConcat && nsub_ > 0)
+    re = sub()[0];
+  while (re->op_ == kRegexpCapture) {
+    re = re->sub()[0];
+    if (re->op_ == kRegexpConcat && re->nsub_ > 0)
+      re = re->sub()[0];
+  }
+
+  const bool single = re->op_ == kRegexpLiteral;
+  if (!single && re->op_ != kRegexpLiteralString)
+    return false;
+
+  const bool latin1 = (re->parse_flags() & Latin1) != 0;
+  if (single)
+    ConvertRunesToBytes(latin1, &re->rune_, 1, prefix);
+  else
+    ConvertRunesToBytes(latin1, re->runes_, re->nrunes_, prefix);
+  *foldcase = (re->parse_flags() & FoldCase) != 0;
+  return true;
+}
+
+// Character class builder is a balanced binary tree (STL set)
+// containing non-overlapping, non-abutting RuneRanges.
+// The less-than operator used in the tree treats two
+// ranges as equal if they overlap at all, so that
+// lookups for a particular Rune are possible.
+
+// Starts out with a rune count of zero and both ASCII letter bitmaps
+// cleared.
+CharClassBuilder::CharClassBuilder() {
+  upper_ = 0;
+  lower_ = 0;
+  nrunes_ = 0;
+}
+
+// Add lo-hi to the class; return whether class got bigger.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+ if (hi < lo)
+ return false;  // empty range: nothing to add
+
+ if (lo <= 'z' && hi >= 'A') {
+ // Overlaps some alpha, maybe not all.
+ // Update bitmaps telling which ASCII letters are in the set.
+ Rune lo1 = std::max<Rune>(lo, 'A');
+ Rune hi1 = std::min<Rune>(hi, 'Z');
+ if (lo1 <= hi1)
+ upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');  // sets bits (lo1 - 'A') through (hi1 - 'A')
+
+ lo1 = std::max<Rune>(lo, 'a');
+ hi1 = std::min<Rune>(hi, 'z');
+ if (lo1 <= hi1)
+ lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');  // sets bits (lo1 - 'a') through (hi1 - 'a')
+ }
+
+ { // Check whether lo, hi is already in the class.
+ iterator it = ranges_.find(RuneRange(lo, lo));
+ if (it != end() && it->lo <= lo && hi <= it->hi)
+ return false;
+ }
+
+ // Look for a range abutting lo on the left.
+ // If it exists, take it out and increase our range.
+ if (lo > 0) {
+ iterator it = ranges_.find(RuneRange(lo-1, lo-1));
+ if (it != end()) {
+ lo = it->lo;
+ if (it->hi > hi)
+ hi = it->hi;
+ nrunes_ -= it->hi - it->lo + 1;  // its runes are counted again when the merged range is added at the end
+ ranges_.erase(it);
+ }
+ }
+
+ // Look for a range abutting hi on the right.
+ // If it exists, take it out and increase our range.
+ if (hi < Runemax) {
+ iterator it = ranges_.find(RuneRange(hi+1, hi+1));
+ if (it != end()) {
+ hi = it->hi;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+ }
+
+ // Look for ranges between lo and hi. Take them out.
+ // This is only safe because the set has no overlapping ranges.
+ // We've already removed any ranges abutting lo and hi, so
+ // any that overlap [lo, hi] must be contained within it.
+ for (;;) {
+ iterator it = ranges_.find(RuneRange(lo, hi));
+ if (it == end())
+ break;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+
+ // Finally, add [lo, hi].
+ nrunes_ += hi - lo + 1;
+ ranges_.insert(RuneRange(lo, hi));
+ return true;
+}
+
+// Adds every range of *cc to this class.
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+  for (const auto& rr : *cc)
+    AddRange(rr.lo, rr.hi);
+}
+
+// Reports whether r falls inside one of the stored ranges.
+bool CharClassBuilder::Contains(Rune r) {
+  // A one-rune probe is looked up directly in the range set.
+  iterator hit = ranges_.find(RuneRange(r, r));
+  return hit != end();
+}
+
+// Does the character class behave the same on A-Z as on a-z? True when
+// the upper- and lower-case bitmaps agree on every bit of AlphaMask.
+bool CharClassBuilder::FoldsASCII() {
+  return (upper_ & AlphaMask) == (lower_ & AlphaMask);
+}
+
+// Returns a newly allocated builder with the same ranges, letter bitmaps
+// and rune count as this one.
+CharClassBuilder* CharClassBuilder::Copy() {
+  CharClassBuilder* dup = new CharClassBuilder;
+  dup->nrunes_ = nrunes_;
+  dup->upper_ = upper_;
+  dup->lower_ = lower_;
+  for (const auto& rr : *this)
+    dup->ranges_.insert(RuneRange(rr.lo, rr.hi));
+  return dup;
+}
+
+
+
+void CharClassBuilder::RemoveAbove(Rune r) {  // drops every rune greater than r from the class
+ if (r >= Runemax)
+ return;  // nothing lies above Runemax
+
+ if (r < 'z') {
+ if (r < 'a')
+ lower_ = 0;
+ else
+ lower_ &= AlphaMask >> ('z' - r);  // NOTE(review): presumably AlphaMask is the 26-bit letter mask, so this keeps the bits for 'a'..r - confirm in regexp.h
+ }
+
+ if (r < 'Z') {
+ if (r < 'A')
+ upper_ = 0;
+ else
+ upper_ &= AlphaMask >> ('Z' - r);  // same idea for 'A'..r
+ }
+
+ for (;;) {
+
+ iterator it = ranges_.find(RuneRange(r + 1, Runemax));  // any stored range reaching above r
+ if (it == end())
+ break;
+ RuneRange rr = *it;  // copy before erasing; the iterator is gone after erase()
+ ranges_.erase(it);
+ nrunes_ -= rr.hi - rr.lo + 1;
+ if (rr.lo <= r) {
+ rr.hi = r;  // the range straddles r: keep the part up to and including r
+ ranges_.insert(rr);
+ nrunes_ += rr.hi - rr.lo + 1;
+ }
+ }
+}
+
+void CharClassBuilder::Negate() {  // replaces the class with its complement over [0, Runemax]
+ // Build up negation and then copy in.
+ // Could edit ranges in place, but C++ won't let me.
+ std::vector<RuneRange> v;
+ v.reserve(ranges_.size() + 1);  // the loop below pushes at most one range per existing range, plus one trailing range
+
+ // In negation, first range begins at 0, unless
+ // the current class begins at 0.
+ iterator it = begin();
+ if (it == end()) {
+ v.push_back(RuneRange(0, Runemax));  // empty class: the complement is everything
+ } else {
+ int nextlo = 0;  // first rune not yet covered by the output
+ if (it->lo == 0) {
+ nextlo = it->hi + 1;
+ ++it;
+ }
+ for (; it != end(); ++it) {
+ v.push_back(RuneRange(nextlo, it->lo - 1));  // the gap in front of this range
+ nextlo = it->hi + 1;
+ }
+ if (nextlo <= Runemax)
+ v.push_back(RuneRange(nextlo, Runemax));
+ }
+
+ ranges_.clear();
+ for (size_t i = 0; i < v.size(); i++)
+ ranges_.insert(v[i]);
+
+ upper_ = AlphaMask & ~upper_;
+ lower_ = AlphaMask & ~lower_;
+ nrunes_ = Runemax+1 - nrunes_;
+}
+
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
+CharClass* CharClass::New(size_t maxranges) {
+ CharClass* cc;
+ uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];  // one block: header followed by room for maxranges ranges
+ cc = reinterpret_cast<CharClass*>(data);
+ cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);  // the range array starts right after the header
+ cc->nranges_ = 0;
+ cc->folds_ascii_ = false;
+ cc->nrunes_ = 0;
+ return cc;
+}
+
+// Frees the block as an array of uint8_t, which is how CharClass::New()
+// allocates it.
+void CharClass::Delete() {
+  delete[] reinterpret_cast<uint8_t*>(this);
+}
+
+// Returns a newly allocated class holding the complement of this one
+// over [0, Runemax].
+CharClass* CharClass::Negate() {
+  // Room for one range more than this class has.
+  CharClass* neg = CharClass::New(static_cast<size_t>(nranges_+1));
+  neg->folds_ascii_ = folds_ascii_;
+  neg->nrunes_ = Runemax + 1 - nrunes_;
+
+  int count = 0;
+  int nextlo = 0;  // first rune not yet accounted for
+  for (CharClass::iterator it = begin(); it != end(); ++it) {
+    // Emit the gap in front of this range, if there is one.
+    if (it->lo != nextlo)
+      neg->ranges_[count++] = RuneRange(nextlo, it->lo - 1);
+    nextlo = it->hi + 1;
+  }
+  if (nextlo <= Runemax)
+    neg->ranges_[count++] = RuneRange(nextlo, Runemax);
+  neg->nranges_ = count;
+  return neg;
+}
+
+// Reports whether r lies in one of the ranges, using a binary search
+// over the range array.
+bool CharClass::Contains(Rune r) const {
+  // Invariant: if r is present, its range has an index in [first, last).
+  int first = 0;
+  int last = nranges_;
+  while (first < last) {
+    const int mid = first + (last - first)/2;
+    const RuneRange& probe = ranges_[mid];
+    if (probe.hi < r)
+      first = mid + 1;
+    else if (r < probe.lo)
+      last = mid;
+    else  // probe.lo <= r && r <= probe.hi
+      return true;
+  }
+  return false;
+}
+
+// Returns a newly allocated CharClass holding this builder's ranges in
+// iteration order, along with its rune count and ASCII-folding flag.
+CharClass* CharClassBuilder::GetCharClass() {
+  CharClass* cc = CharClass::New(ranges_.size());
+  int n = 0;
+  for (const auto& rr : *this)
+    cc->ranges_[n++] = rr;
+  cc->nranges_ = n;
+  DCHECK_LE(n, static_cast<int>(ranges_.size()));
+  cc->folds_ascii_ = FoldsASCII();
+  cc->nrunes_ = nrunes_;
+  return cc;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/regexp.h b/third_party/re2/src/re2/regexp.h
new file mode 100644
index 000000000..df4989479
--- /dev/null
+++ b/third_party/re2/src/re2/regexp.h
@@ -0,0 +1,664 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_REGEXP_H_
+#define RE2_REGEXP_H_
+
+// --- SPONSORED LINK --------------------------------------------------
+// If you want to use this library for regular expression matching,
+// you should use re2/re2.h, which provides a class RE2 that
+// mimics the PCRE interface provided by PCRE's C++ wrappers.
+// This header describes the low-level interface used to implement RE2
+// and may change in backwards-incompatible ways from time to time.
+// In contrast, RE2's interface will not.
+// ---------------------------------------------------------------------
+
+// Regular expression library: parsing, execution, and manipulation
+// of regular expressions.
+//
+// Any operation that traverses the Regexp structures should be written
+// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
+// regular expressions such as x++++++++++++++++++++... might cause recursive
+// traversals to overflow the stack.
+//
+// It is the caller's responsibility to provide appropriate mutual exclusion
+// around manipulation of the regexps. RE2 does this.
+//
+// PARSING
+//
+// Regexp::Parse parses regular expressions encoded in UTF-8.
+// The default syntax is POSIX extended regular expressions,
+// with the following changes:
+//
+// 1. Backreferences (optional in POSIX EREs) are not supported.
+// (Supporting them precludes the use of DFA-based
+// matching engines.)
+//
+// 2. Collating elements and collation classes are not supported.
+// (No one has needed or wanted them.)
+//
+// The exact syntax accepted can be modified by passing flags to
+// Regexp::Parse. In particular, many of the basic Perl additions
+// are available. The flags are documented below (search for LikePerl).
+//
+// If parsed with the flag Regexp::Latin1, both the regular expression
+// and the input to the matching routines are assumed to be encoded in
+// Latin-1, not UTF-8.
+//
+// EXECUTION
+//
+// Once Regexp has parsed a regular expression, it provides methods
+// to search text using that regular expression. These methods are
+// implemented via calling out to other regular expression libraries.
+// (Let's call them the sublibraries.)
+//
+// To call a sublibrary, Regexp does not simply prepare a
+// string version of the regular expression and hand it to the
+// sublibrary. Instead, Regexp prepares, from its own parsed form, the
+// corresponding internal representation used by the sublibrary.
+// This has the drawback of needing to know the internal representation
+// used by the sublibrary, but it has two important benefits:
+//
+// 1. The syntax and meaning of regular expressions is guaranteed
+// to be that used by Regexp's parser, not the syntax expected
+// by the sublibrary. Regexp might accept a restricted or
+// expanded syntax for regular expressions as compared with
+// the sublibrary. As long as Regexp can translate from its
+// internal form into the sublibrary's, clients need not know
+// exactly which sublibrary they are using.
+//
+// 2. The sublibrary parsers are bypassed. For whatever reason,
+// sublibrary regular expression parsers often have security
+// problems. For example, plan9grep's regular expression parser
+// has a buffer overflow in its handling of large character
+// classes, and PCRE's parser has had buffer overflow problems
+// in the past. Security-team requires sandboxing of sublibrary
+// regular expression parsers. Avoiding the sublibrary parsers
+// avoids the sandbox.
+//
+// The execution methods we use now are provided by the compiled form,
+// Prog, described in prog.h
+//
+// MANIPULATION
+//
+// Unlike other regular expression libraries, Regexp makes its parsed
+// form accessible to clients, so that client code can analyze the
+// parsed regular expressions.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <map>
+#include <set>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "util/logging.h"
+#include "util/utf.h"
+
+namespace re2 {
+
+// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
+enum RegexpOp {
+ // Matches no strings.
+ kRegexpNoMatch = 1,  // Numbering starts at 1; no opcode has the value 0.
+
+ // Matches empty string.
+ kRegexpEmptyMatch,
+
+ // Matches the single rune rune_.
+ kRegexpLiteral,
+
+ // Matches the sequence of nrunes_ runes stored in runes_.
+ kRegexpLiteralString,
+
+ // Matches concatenation of sub_[0..nsub-1].
+ kRegexpConcat,
+ // Matches union of sub_[0..nsub-1].
+ kRegexpAlternate,
+
+ // Matches sub_[0] zero or more times.
+ kRegexpStar,
+ // Matches sub_[0] one or more times.
+ kRegexpPlus,
+ // Matches sub_[0] zero or one times.
+ kRegexpQuest,
+
+ // Matches sub_[0] at least min_ times, at most max_ times.
+ // max_ == -1 means no upper limit.
+ kRegexpRepeat,
+
+ // Parenthesized (capturing) subexpression. Index is cap_.
+ // Optionally, capturing name is name_.
+ kRegexpCapture,
+
+ // Matches any character.
+ kRegexpAnyChar,
+
+ // Matches any byte [sic].
+ kRegexpAnyByte,
+
+ // Matches empty string at beginning of line.
+ kRegexpBeginLine,
+ // Matches empty string at end of line.
+ kRegexpEndLine,
+
+ // Matches word boundary "\b".
+ kRegexpWordBoundary,
+ // Matches not-a-word boundary "\B".
+ kRegexpNoWordBoundary,
+
+ // Matches empty string at beginning of text.
+ kRegexpBeginText,
+ // Matches empty string at end of text.
+ kRegexpEndText,
+
+ // Matches character class given by cc_.
+ kRegexpCharClass,
+
+ // Forces match of entire expression right now,
+ // with match ID match_id_ (used by RE2::Set).
+ kRegexpHaveMatch,
+
+ kMaxRegexpOp = kRegexpHaveMatch,  // Alias for the last opcode listed above.
+};
+
+// Keep in sync with string list in regexp.cc
+enum RegexpStatusCode {
+ // No error
+ kRegexpSuccess = 0,  // The value RegexpStatus::ok() tests for.
+
+ // Unexpected error
+ kRegexpInternalError,
+
+ // Parse errors
+ kRegexpBadEscape, // bad escape sequence
+ kRegexpBadCharClass, // bad character class
+ kRegexpBadCharRange, // bad character class range
+ kRegexpMissingBracket, // missing closing ]
+ kRegexpMissingParen, // missing closing )
+ kRegexpUnexpectedParen, // unexpected closing )
+ kRegexpTrailingBackslash, // at end of regexp
+ kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
+ kRegexpRepeatSize, // bad repetition argument
+ kRegexpRepeatOp, // bad repetition operator
+ kRegexpBadPerlOp, // bad perl operator
+ kRegexpBadUTF8, // invalid UTF-8 in regexp
+ kRegexpBadNamedCapture, // bad named capture
+};
+
+// Error status for certain operations.
+class RegexpStatus {
+ public:
+ RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
+ ~RegexpStatus() { delete tmp_; }
+
+ void set_code(RegexpStatusCode code) { code_ = code; }
+ void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; }  // Stores the view only; no copy.
+ void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }  // Takes ownership of tmp; frees the previous one.
+ RegexpStatusCode code() const { return code_; }
+ absl::string_view error_arg() const { return error_arg_; }
+ bool ok() const { return code() == kRegexpSuccess; }
+
+ // Copies state from status.
+ void Copy(const RegexpStatus& status);
+
+ // Returns text equivalent of code, e.g.:
+ // "Bad character class"
+ static std::string CodeText(RegexpStatusCode code);
+
+ // Returns text describing error, e.g.:
+ // "Bad character class: [z-a]"
+ std::string Text() const;
+
+ private:
+ RegexpStatusCode code_; // Kind of error.
+ absl::string_view error_arg_; // Piece of regexp containing syntax error.
+ std::string* tmp_; // Temporary storage, possibly for error_arg_. Owned; deleted in destructor.
+
+ RegexpStatus(const RegexpStatus&) = delete;
+ RegexpStatus& operator=(const RegexpStatus&) = delete;
+};
+
+// Compiled form; see prog.h
+class Prog;
+
+struct RuneRange {
+ RuneRange() : lo(0), hi(0) { }
+ RuneRange(int l, int h) : lo(l), hi(h) { }
+ Rune lo;  // First rune of the range (inclusive).
+ Rune hi;  // Last rune of the range (inclusive).
+};
+
+// Less-than on RuneRanges treats a == b if they overlap at all.
+// This lets us look in a set to find the range covering a particular Rune.
+struct RuneRangeLess {
+  bool operator()(const RuneRange& a, const RuneRange& b) const {
+    return b.lo > a.hi;  // a orders first only if it ends before b begins.
+  }
+};
+
+class CharClassBuilder;
+
+class CharClass {
+ public:
+ void Delete();  // Frees this object; the destructor is private and not implemented.
+
+ typedef RuneRange* iterator;  // Iterates over the nranges_ entries of ranges_.
+ iterator begin() { return ranges_; }
+ iterator end() { return ranges_ + nranges_; }
+
+ int size() { return nrunes_; }  // Count of runes, not of ranges.
+ bool empty() { return nrunes_ == 0; }
+ bool full() { return nrunes_ == Runemax+1; }
+ bool FoldsASCII() { return folds_ascii_; }
+
+ bool Contains(Rune r) const;
+ CharClass* Negate();  // Returns a newly allocated complement; *this is unchanged.
+
+ private:
+ CharClass(); // not implemented
+ ~CharClass(); // not implemented
+ static CharClass* New(size_t maxranges);  // Allocates header plus room for maxranges ranges.
+
+ friend class CharClassBuilder;
+
+ bool folds_ascii_;
+ int nrunes_;
+ RuneRange *ranges_;
+ int nranges_;
+
+ CharClass(const CharClass&) = delete;
+ CharClass& operator=(const CharClass&) = delete;
+};
+
+class Regexp {
+ public:
+
+ // Flags for parsing. Can be ORed together.
+ enum ParseFlags {
+ NoParseFlags = 0,
+ FoldCase = 1<<0, // Fold case during matching (case-insensitive).
+ Literal = 1<<1, // Treat s as literal string instead of a regexp.
+ ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
+ // and [[:space:]] to match newline.
+ DotNL = 1<<3, // Allow . to match newline.
+ MatchNL = ClassNL | DotNL,
+ OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
+ // end of text, not around embedded newlines.
+ // (Perl's default)
+ Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
+ NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
+ PerlClasses = 1<<7, // Allow Perl character classes like \d.
+ PerlB = 1<<8, // Allow Perl's \b and \B.
+ PerlX = 1<<9, // Perl extensions:
+ // non-capturing parens - (?: )
+ // non-greedy operators - *? +? ?? {}?
+ // flag edits - (?i) (?-i) (?i: )
+ // i - FoldCase
+ // m - !OneLine
+ // s - DotNL
+ // U - NonGreedy
+ // line ends: \A \z
+ // \Q and \E to disable/enable metacharacters
+ // (?P<name>expr) for named captures
+ // \C to match any single byte
+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
+ // and \P{Han} for its negation.
+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions
+ // it explicitly.
+ NeverCapture = 1<<12, // Parse all parens as non-capturing.
+
+ // As close to Perl as we can get.
+ LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
+ UnicodeGroups,
+
+ // Internal use only.
+ WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
+ AllParseFlags = (1<<14)-1,  // Mask covering bits 0..13, i.e. every flag above.
+ };
+
+ // Get. No set, Regexps are logically immutable once created.
+ RegexpOp op() { return static_cast<RegexpOp>(op_); }
+ int nsub() { return nsub_; }
+ bool simple() { return simple_ != 0; }
+ ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
+ int Ref(); // For testing.
+
+ Regexp** sub() {  // With nsub_ <= 1 the slot is subone_ itself; otherwise the submany_ array.
+ if(nsub_ <= 1)
+ return &subone_;
+ else
+ return submany_;
+ }
+ // Operator-specific accessors; each DCHECKs that op_ is the matching operator.
+ int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
+ int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
+ Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
+ CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
+ int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
+ const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
+ Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
+ int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+ int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
+
+ // Increments reference count, returns object as convenience.
+ Regexp* Incref();
+
+ // Decrements reference count and deletes this object if count reaches 0.
+ void Decref();
+
+ // Parses string s to produce regular expression, returned.
+ // Caller must release return value with re->Decref().
+ // On failure, sets *status (if status != NULL) and returns NULL.
+ static Regexp* Parse(absl::string_view s, ParseFlags flags,
+ RegexpStatus* status);
+
+ // Returns a _new_ simplified version of the current regexp.
+ // Does not edit the current regexp.
+ // Caller must release return value with re->Decref().
+ // Simplified means that counted repetition has been rewritten
+ // into simpler terms and all Perl/POSIX features have been
+ // removed. The result will capture exactly the same
+ // subexpressions the original did, unless formatted with ToString.
+ Regexp* Simplify();
+ friend class CoalesceWalker;
+ friend class SimplifyWalker;
+
+ // Parses the regexp src and then simplifies it and sets *dst to the
+ // string representation of the simplified form. Returns true on success.
+ // Returns false and sets *status (if status != NULL) on parse error.
+ static bool SimplifyRegexp(absl::string_view src, ParseFlags flags,
+ std::string* dst, RegexpStatus* status);
+
+ // Returns the number of capturing groups in the regexp.
+ int NumCaptures();
+ friend class NumCapturesWalker;
+
+ // Returns a map from names to capturing group indices,
+ // or NULL if the regexp contains no named capture groups.
+ // The caller is responsible for deleting the map.
+ std::map<std::string, int>* NamedCaptures();
+
+ // Returns a map from capturing group indices to capturing group
+ // names or NULL if the regexp contains no named capture groups. The
+ // caller is responsible for deleting the map.
+ std::map<int, std::string>* CaptureNames();
+
+ // Returns a string representation of the current regexp,
+ // using as few parentheses as possible.
+ std::string ToString();
+
+ // Convenience functions. They consume the passed reference,
+ // so in many cases you should use, e.g., Plus(re->Incref(), flags).
+ // They do not consume allocated arrays like subs or runes.
+ static Regexp* Plus(Regexp* sub, ParseFlags flags);
+ static Regexp* Star(Regexp* sub, ParseFlags flags);
+ static Regexp* Quest(Regexp* sub, ParseFlags flags);
+ static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
+ static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
+ static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
+ static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
+ static Regexp* NewLiteral(Rune rune, ParseFlags flags);
+ static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
+ static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+ static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+ // Like Alternate but does not factor out common prefixes.
+ static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
+
+ // Debugging function. Returns string format for regexp
+ // that makes structure clear. Does NOT use regexp syntax.
+ std::string Dump();
+
+ // Helper traversal class, defined fully in walker-inl.h.
+ template<typename T> class Walker;
+
+ // Compile to Prog. See prog.h
+ // Reverse prog expects to be run over text backward.
+ // Construction and execution of prog will
+ // stay within approximately max_mem bytes of memory.
+ // If max_mem <= 0, a reasonable default is used.
+ Prog* CompileToProg(int64_t max_mem);
+ Prog* CompileToReverseProg(int64_t max_mem);
+
+ // Whether to expect this library to find exactly the same answer as PCRE
+ // when running this regexp. Most regexps do mimic PCRE exactly, but a few
+ // obscure cases behave differently. Technically this is more a property
+ // of the Prog than the Regexp, but the computation is much easier to do
+ // on the Regexp. See mimics_pcre.cc for the exact conditions.
+ bool MimicsPCRE();
+
+ // Benchmarking function.
+ void NullWalk();
+
+ // Whether every match of this regexp must be anchored and
+ // begin with a non-empty fixed string (perhaps after ASCII
+ // case-folding). If so, returns the prefix and the sub-regexp that
+ // follows it.
+ // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
+ // regardless of the return value.
+ bool RequiredPrefix(std::string* prefix, bool* foldcase,
+ Regexp** suffix);
+
+ // Whether every match of this regexp must be unanchored and
+ // begin with a non-empty fixed string (perhaps after ASCII
+ // case-folding). If so, returns the prefix.
+ // Callers should expect *prefix and *foldcase to be "zeroed"
+ // regardless of the return value.
+ bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
+
+ // Controls the maximum repeat count permitted by the parser.
+ // FOR FUZZING ONLY.
+ static void FUZZING_ONLY_set_maximum_repeat_count(int i);
+
+ private:
+ // Constructor allocates vectors as appropriate for operator.
+ explicit Regexp(RegexpOp op, ParseFlags parse_flags);
+
+ // Use Decref() instead of delete to release Regexps.
+ // This is private to catch deletes at compile time.
+ ~Regexp();
+ void Destroy();
+ bool QuickDestroy();
+
+ // Helpers for Parse. Listed here so they can edit Regexps.
+ class ParseState;
+
+ friend class ParseState;
+ friend bool ParseCharClass(absl::string_view* s, Regexp** out_re,
+ RegexpStatus* status);
+
+ // Helper for testing [sic].
+ friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
+ // Computes whether Regexp is already simple.
+ bool ComputeSimple();
+
+ // Constructor that generates a Star, Plus or Quest,
+ // squashing the pair if sub is also a Star, Plus or Quest.
+ static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
+
+ // Constructor that generates a concatenation or alternation,
+ // enforcing the limit on the number of subexpressions for
+ // a particular Regexp.
+ static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+ ParseFlags flags, bool can_factor);
+
+ // Returns the leading string that re starts with.
+ // The returned Rune* points into a piece of re,
+ // so it must not be used after the caller calls re->Decref().
+ static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+ // Removes the first n leading runes from the beginning of re.
+ // Edits re in place.
+ static void RemoveLeadingString(Regexp* re, int n);
+
+ // Returns the leading regexp in re's top-level concatenation.
+ // The returned Regexp* points at re or a sub-expression of re,
+ // so it must not be used after the caller calls re->Decref().
+ static Regexp* LeadingRegexp(Regexp* re);
+
+ // Removes LeadingRegexp(re) from re and returns the remainder.
+ // Might edit re in place.
+ static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+ // Simplifies an alternation of literal strings by factoring out
+ // common prefixes.
+ static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+ friend class FactorAlternationImpl;
+
+ // Is a == b? Only efficient on regexps that have not been through
+ // Simplify yet - the expansion of a kRegexpRepeat will make this
+ // take a long time. Do not call on such regexps, hence private.
+ static bool Equal(Regexp* a, Regexp* b);
+
+ // Allocate space for n sub-regexps.
+ void AllocSub(int n) {
+ DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);  // n must be representable in nsub_.
+ if (n > 1)  // For n <= 1 no array is allocated; sub() then returns &subone_.
+ submany_ = new Regexp*[n];
+ nsub_ = static_cast<uint16_t>(n);
+ }
+
+ // Add Rune to LiteralString
+ void AddRuneToString(Rune r);
+
+ // Swaps this with that, in place.
+ void Swap(Regexp *that);
+
+ // Operator. See description of operators above.
+ // uint8_t instead of RegexpOp to control space usage.
+ uint8_t op_;
+
+ // Is this regexp structure already simple
+ // (has it been returned by Simplify)?
+ // uint8_t instead of bool to control space usage.
+ uint8_t simple_;
+
+ // Flags saved from parsing and used during execution.
+ // (Only FoldCase is used.)
+ // uint16_t instead of ParseFlags to control space usage.
+ uint16_t parse_flags_;
+
+ // Reference count. Exists so that SimplifyRegexp can build
+ // regexp structures that are dags rather than trees to avoid
+ // exponential blowup in space requirements.
+ // uint16_t to control space usage.
+ // The standard regexp routines will never generate a
+ // ref greater than the maximum repeat count (kMaxRepeat),
+ // but even so, Incref and Decref consult an overflow map
+ // when ref_ reaches kMaxRef.
+ uint16_t ref_;
+ static const uint16_t kMaxRef = 0xffff;
+
+ // Subexpressions.
+ // uint16_t to control space usage.
+ // Concat and Alternate handle larger numbers of subexpressions
+ // by building concatenation or alternation trees.
+ // Other routines should call Concat or Alternate instead of
+ // filling in sub() by hand.
+ uint16_t nsub_;
+ static const uint16_t kMaxNsub = 0xffff;
+ union {
+ Regexp** submany_; // if nsub_ > 1
+ Regexp* subone_; // if nsub_ == 1
+ };
+
+ // Extra space for parse and teardown stacks.
+ Regexp* down_;
+
+ // Arguments to operator. See description of operators above.
+ union {
+ struct { // Repeat
+ int max_;
+ int min_;
+ };
+ struct { // Capture
+ int cap_;
+ std::string* name_;
+ };
+ struct { // LiteralString
+ int nrunes_;
+ Rune* runes_;
+ };
+ struct { // CharClass
+ // These two could be in separate union members,
+ // but it wouldn't save any space (there are other two-word structs)
+ // and keeping them separate avoids confusion during parsing.
+ CharClass* cc_;
+ CharClassBuilder* ccb_;
+ };
+ Rune rune_; // Literal
+ int match_id_; // HaveMatch
+ void *the_union_[2]; // as big as any other element, for memset
+ };
+
+ Regexp(const Regexp&) = delete;
+ Regexp& operator=(const Regexp&) = delete;
+};
+
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+class CharClassBuilder {
+ public:
+ CharClassBuilder();
+
+ typedef RuneRangeSet::iterator iterator;  // Iterates over the ranges held in ranges_.
+ iterator begin() { return ranges_.begin(); }
+ iterator end() { return ranges_.end(); }
+
+ int size() { return nrunes_; }  // Count of runes, not of ranges.
+ bool empty() { return nrunes_ == 0; }
+ bool full() { return nrunes_ == Runemax+1; }
+
+ bool Contains(Rune r);
+ bool FoldsASCII();
+ bool AddRange(Rune lo, Rune hi); // returns whether class changed
+ CharClassBuilder* Copy();
+ void AddCharClass(CharClassBuilder* cc);
+ void Negate();
+ void RemoveAbove(Rune r);
+ CharClass* GetCharClass();  // Returns a newly allocated CharClass holding a copy of the ranges.
+ void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
+ static const uint32_t AlphaMask = (1<<26) - 1;  // Low 26 bits set.
+ uint32_t upper_; // bitmap of A-Z
+ uint32_t lower_; // bitmap of a-z
+ int nrunes_;
+ RuneRangeSet ranges_;
+
+ CharClassBuilder(const CharClassBuilder&) = delete;
+ CharClassBuilder& operator=(const CharClassBuilder&) = delete;
+};
+
+// Bitwise ops on ParseFlags produce ParseFlags.
+inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
+                                    Regexp::ParseFlags b) {
+  const int bits = static_cast<int>(a) | static_cast<int>(b);
+  return static_cast<Regexp::ParseFlags>(bits);
+}
+
+inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
+                                    Regexp::ParseFlags b) {
+  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
+  return static_cast<Regexp::ParseFlags>(bits);
+}
+
+inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
+                                    Regexp::ParseFlags b) {
+  const int bits = static_cast<int>(a) & static_cast<int>(b);
+  return static_cast<Regexp::ParseFlags>(bits);
+}
+
+inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
+  // Mask with AllParseFlags: a value outside the enum's range would be UB.
+  const int mask = static_cast<int>(Regexp::AllParseFlags);
+  return static_cast<Regexp::ParseFlags>(~static_cast<int>(a) & mask);
+}
+
+} // namespace re2
+
+#endif // RE2_REGEXP_H_
diff --git a/third_party/re2/src/re2/set.cc b/third_party/re2/src/re2/set.cc
new file mode 100644
index 000000000..b9c918e07
--- /dev/null
+++ b/third_party/re2/src/re2/set.cc
@@ -0,0 +1,174 @@
+// Copyright 2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/set.h"
+
+#include <stddef.h>
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "util/logging.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
+    : options_(options), anchor_(anchor), compiled_(false), size_(0) {
+  // options_ is a private copy, so the caller's Options object is not
+  // modified here. Captures are forced off for every pattern added to
+  // the set; this might unblock some optimisations.
+  options_.set_never_capture(true);
+}
+
+RE2::Set::~Set() {
+  // Release the Regexp reference held by each element still in elem_.
+  for (const Elem& elem : elem_) elem.second->Decref();
+}
+
+RE2::Set::Set(Set&& other)
+    : options_(other.options_), anchor_(other.anchor_),
+      elem_(std::move(other.elem_)),
+      compiled_(other.compiled_), size_(other.size_),
+      prog_(std::move(other.prog_)) {
+  // Reset the source to an empty, uncompiled set so that its destructor
+  // has nothing left to release.
+  other.compiled_ = false;
+  other.size_ = 0;
+  other.prog_.reset();
+  other.elem_.clear();
+  other.elem_.shrink_to_fit();
+}
+
+RE2::Set& RE2::Set::operator=(Set&& other) {
+  if (this == &other) return *this;  // Self-move: destroying first would lose the state.
+  this->~Set();  // Release what *this holds, then rebuild it in place from other.
+  return *new (this) Set(std::move(other));
+}
+
+int RE2::Set::Add(absl::string_view pattern, std::string* error) {
+  if (compiled_) {
+    LOG(DFATAL) << "RE2::Set::Add() called after compiling";
+    return -1;
+  }
+
+  const Regexp::ParseFlags flags = static_cast<Regexp::ParseFlags>(
+      options_.ParseFlags());
+  RegexpStatus status;
+  re2::Regexp* parsed = Regexp::Parse(pattern, flags, &status);
+  if (parsed == NULL) {
+    if (error != NULL)
+      *error = status.Text();
+    if (options_.log_errors())
+      LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
+    return -1;
+  }
+
+  // The new element's index doubles as the match ID of its HaveMatch marker.
+  const int index = static_cast<int>(elem_.size());
+  re2::Regexp* marker = re2::Regexp::HaveMatch(index, flags);
+  re2::Regexp* combined;
+  if (parsed->op() != kRegexpConcat) {
+    re2::Regexp* pair[2] = {parsed, marker};
+    combined = re2::Regexp::Concat(pair, 2, flags);
+  } else {
+    // Already a concatenation: extend its subexpression list with the marker.
+    const int nsub = parsed->nsub();
+    PODArray<re2::Regexp*> subs(nsub + 1);
+    for (int i = 0; i < nsub; i++)
+      subs[i] = parsed->sub()[i]->Incref();
+    subs[nsub] = marker;
+    parsed->Decref();
+    combined = re2::Regexp::Concat(subs.data(), nsub + 1, flags);
+  }
+  elem_.emplace_back(std::string(pattern), combined);
+  return index;
+}
+
+bool RE2::Set::Compile() {
+  if (compiled_) {
+    LOG(DFATAL) << "RE2::Set::Compile() called more than once";
+    return false;
+  }
+  compiled_ = true;
+  size_ = static_cast<int>(elem_.size());
+
+  // Order the elements by pattern text. This is good enough for now
+  // until we have a Regexp comparison function. (Maybe someday...)
+  const auto by_pattern = [](const Elem& a, const Elem& b) -> bool {
+    return a.first < b.first;
+  };
+  std::sort(elem_.begin(), elem_.end(), by_pattern);
+
+  PODArray<re2::Regexp*> alts(size_);
+  for (int i = 0; i < size_; i++)
+    alts[i] = elem_[i].second;
+  elem_.clear();
+  elem_.shrink_to_fit();
+
+  const Regexp::ParseFlags flags = static_cast<Regexp::ParseFlags>(
+      options_.ParseFlags());
+  re2::Regexp* all = re2::Regexp::Alternate(alts.data(), size_, flags);
+
+  prog_.reset(Prog::CompileSet(all, anchor_, options_.max_mem()));
+  all->Decref();
+  return prog_ != nullptr;
+}
+
+bool RE2::Set::Match(absl::string_view text, std::vector<int>* v) const {
+  return this->Match(text, v, /*error_info=*/static_cast<ErrorInfo*>(NULL));
+}
+
+bool RE2::Set::Match(absl::string_view text, std::vector<int>* v,
+                     ErrorInfo* error_info) const {
+  // Compile() may have failed, leaving prog_ null: report that as not compiled.
+  if (!compiled_ || prog_ == nullptr) {
+    if (error_info != NULL) error_info->kind = kNotCompiled;
+    LOG(DFATAL) << "RE2::Set::Match() called before compiling";
+    return false;
+  }
+#ifdef RE2_HAVE_THREAD_LOCAL
+  hooks::context = NULL;
+#endif
+  bool dfa_failed = false;
+  std::unique_ptr<SparseSet> matches;  // Stays null when the caller passes v == NULL.
+  if (v != NULL) {
+    matches.reset(new SparseSet(size_));
+    v->clear();
+  }
+  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
+                              NULL, &dfa_failed, matches.get());
+  if (dfa_failed) {
+    if (options_.log_errors())
+      LOG(ERROR) << "DFA out of memory: "
+                 << "program size " << prog_->size() << ", "
+                 << "list count " << prog_->list_count() << ", "
+                 << "bytemap range " << prog_->bytemap_range();
+    if (error_info != NULL)
+      error_info->kind = kOutOfMemory;
+    return false;
+  }
+  if (ret == false) {  // A clean "no match" is not an error.
+    if (error_info != NULL)
+      error_info->kind = kNoError;
+    return false;
+  }
+  if (v != NULL) {
+    if (matches->empty()) {  // The DFA reported a match but recorded no index.
+      if (error_info != NULL)
+        error_info->kind = kInconsistent;
+      LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
+      return false;
+    }
+    v->assign(matches->begin(), matches->end());
+  }
+  if (error_info != NULL)
+    error_info->kind = kNoError;
+  return true;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/set.h b/third_party/re2/src/re2/set.h
new file mode 100644
index 000000000..3fe419ba3
--- /dev/null
+++ b/third_party/re2/src/re2/set.h
@@ -0,0 +1,86 @@
+// Copyright 2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_SET_H_
+#define RE2_SET_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+
+namespace re2 {
+class Prog;
+class Regexp;
+} // namespace re2
+
+namespace re2 {
+
+// An RE2::Set represents a collection of regexps that can
+// be searched for simultaneously.
+class RE2::Set {
+ public:
+ enum ErrorKind {
+ kNoError = 0,
+ kNotCompiled, // The set is not compiled.
+ kOutOfMemory, // The DFA ran out of memory.
+ kInconsistent, // The result is inconsistent. This should never happen.
+ };
+
+ struct ErrorInfo {
+ ErrorKind kind;  // Filled in by the three-argument Match().
+ };
+
+ Set(const RE2::Options& options, RE2::Anchor anchor);
+ ~Set();
+
+ // Not copyable.
+ Set(const Set&) = delete;
+ Set& operator=(const Set&) = delete;
+ // Movable.
+ Set(Set&& other);
+ Set& operator=(Set&& other);
+
+ // Adds pattern to the set using the options passed to the constructor.
+ // Returns the index that will identify the regexp in the output of Match(),
+ // or -1 if the regexp cannot be parsed.
+ // Indices are assigned in sequential order starting from 0.
+ // Errors do not increment the index; if error is not NULL, *error will hold
+ // the error message from the parser.
+ int Add(absl::string_view pattern, std::string* error);
+
+ // Compiles the set in preparation for matching.
+ // Returns false if the compiler runs out of memory.
+ // Add() must not be called again after Compile().
+ // Compile() must be called before Match().
+ bool Compile();
+
+ // Returns true if text matches at least one of the regexps in the set.
+ // Fills v (if not NULL) with the indices of the matching regexps.
+ // Callers must not expect v to be sorted.
+ bool Match(absl::string_view text, std::vector<int>* v) const;
+
+ // As above, but populates error_info (if not NULL) when none of the regexps
+ // in the set matched. This can inform callers when DFA execution fails, for
+ // example, because they might wish to handle that case differently.
+ bool Match(absl::string_view text, std::vector<int>* v,
+ ErrorInfo* error_info) const;
+
+ private:
+ typedef std::pair<std::string, re2::Regexp*> Elem;  // (pattern text, parsed regexp).
+
+ RE2::Options options_;  // Copy of the caller's options with never_capture forced on.
+ RE2::Anchor anchor_;
+ std::vector<Elem> elem_;  // Elements added so far; emptied by Compile().
+ bool compiled_;  // True once Compile() has been called, even if it failed.
+ int size_;  // Number of elements at the time of Compile().
+ std::unique_ptr<re2::Prog> prog_;  // Set by Compile(); null before that or if it failed.
+};
+
+} // namespace re2
+
+#endif // RE2_SET_H_
diff --git a/third_party/re2/src/re2/simplify.cc b/third_party/re2/src/re2/simplify.cc
new file mode 100644
index 000000000..cea100b08
--- /dev/null
+++ b/third_party/re2/src/re2/simplify.cc
@@ -0,0 +1,685 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Rewrite POSIX and other features in re
+// to use simple extended regular expression features.
+// Also sort and simplify character classes.
+
+#include <algorithm>
+#include <string>
+
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/pod_array.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Parses src with the given flags, simplifies the resulting regexp and
+// stores the string form of the simplified regexp in *dst.
+// Returns true on success. Returns false if parsing fails (status is passed
+// through to Parse()) or if simplification fails, in which case *status
+// (if status != NULL) is set to kRegexpInternalError with src as its argument.
+bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
+                            std::string* dst, RegexpStatus* status) {
+  Regexp* parsed = Parse(src, flags, status);
+  if (parsed == NULL)
+    return false;
+
+  // The parsed tree is only needed as input to Simplify().
+  Regexp* simplified = parsed->Simplify();
+  parsed->Decref();
+
+  if (simplified != NULL) {
+    *dst = simplified->ToString();
+    simplified->Decref();
+    return true;
+  }
+
+  if (status != NULL) {
+    status->set_code(kRegexpInternalError);
+    status->set_error_arg(src);
+  }
+  return false;
+}
+
+// Decides whether this Regexp is already in simplified form, trusting the
+// simple_ flags of its children to be up to date.
+bool Regexp::ComputeSimple() {
+  switch (op_) {
+    // Leaf operators are always simple.
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpLiteralString:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpEndText:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpHaveMatch:
+      return true;
+
+    // A concatenation or alternation is simple iff every piece is simple.
+    case kRegexpConcat:
+    case kRegexpAlternate: {
+      Regexp** pieces = sub();
+      return std::all_of(pieces, pieces + nsub_,
+                         [](Regexp* piece) { return piece->simple(); });
+    }
+
+    // A char class is simple unless it is empty or full.
+    case kRegexpCharClass:
+      if (ccb_ != NULL)
+        return !(ccb_->empty() || ccb_->full());
+      return !(cc_->empty() || cc_->full());
+
+    case kRegexpCapture:
+      return sub()[0]->simple();
+
+    // A star/plus/quest is simple when its operand is simple and is not
+    // itself a star/plus/quest, an empty match or a no-match.
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest: {
+      Regexp* operand = sub()[0];
+      if (!operand->simple())
+        return false;
+      const bool degenerate = operand->op_ == kRegexpStar ||
+                              operand->op_ == kRegexpPlus ||
+                              operand->op_ == kRegexpQuest ||
+                              operand->op_ == kRegexpEmptyMatch ||
+                              operand->op_ == kRegexpNoMatch;
+      return !degenerate;
+    }
+
+    // Counted repetition is never simple.
+    case kRegexpRepeat:
+      return false;
+  }
+  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
+  return false;
+}
+
+// Walker subclass used by Simplify.
+// Coalesces runs of star/plus/quest/repeat of the same literal along with any
+// occurrences of that literal into repeats of that literal. It also works for
+// char classes, any char and any byte.
+// PostVisit creates the coalesced result, which should then be simplified.
+// For example, the adjacent pieces a* and a+ of a concatenation are merged
+// into the single repeat a{1,} (see DoCoalesce for the min/max arithmetic).
+class CoalesceWalker : public Regexp::Walker<Regexp*> {
+ public:
+ CoalesceWalker() {}
+ // Builds the coalesced form of re from its already-processed children in
+ // child_args, merging adjacent coalescable children of a concatenation.
+ virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
+ Regexp** child_args, int nchild_args);
+ // Returns a new reference to re.
+ virtual Regexp* Copy(Regexp* re);
+ // Not expected to be called; returns a new reference to re.
+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+ // These functions are declared inside CoalesceWalker so that
+ // they can edit the private fields of the Regexps they construct.
+
+ // Returns true if r1 and r2 can be coalesced. In particular, ensures that
+ // the parse flags are consistent. (They will not be checked again later.)
+ static bool CanCoalesce(Regexp* r1, Regexp* r2);
+
+ // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
+ // will be empty match and the coalesced op. In other cases, where part of a
+ // literal string was removed to be coalesced, the array elements afterwards
+ // will be the coalesced op and the remainder of the literal string.
+ static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
+
+ // Not copyable.
+ CoalesceWalker(const CoalesceWalker&) = delete;
+ CoalesceWalker& operator=(const CoalesceWalker&) = delete;
+};
+
+// Walker subclass used by Simplify.
+// The simplify walk is purely post-recursive: given the simplified children,
+// PostVisit creates the simplified result.
+// The child_args are simplified Regexp*s.
+class SimplifyWalker : public Regexp::Walker<Regexp*> {
+ public:
+ SimplifyWalker() {}
+ // Stops the descent early (sets *stop) when re is already marked simple.
+ virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
+ // Builds the simplified form of re from its simplified children.
+ virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
+ Regexp** child_args, int nchild_args);
+ // Returns a new reference to re.
+ virtual Regexp* Copy(Regexp* re);
+ // Not expected to be called; returns a new reference to re.
+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+ // These functions are declared inside SimplifyWalker so that
+ // they can edit the private fields of the Regexps they construct.
+
+ // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
+ // Caller must Decref return value when done with it.
+ static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
+
+ // Simplifies the expression re{min,max} in terms of *, +, and ?.
+ // Returns a new regexp. Does not edit re. Does not consume reference to re.
+ // Caller must Decref return value when done with it.
+ static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
+ Regexp::ParseFlags parse_flags);
+
+ // Simplifies a character class by expanding any named classes
+ // into rune ranges. Does not edit re. Does not consume ref to re.
+ // Caller must Decref return value when done with it.
+ static Regexp* SimplifyCharClass(Regexp* re);
+
+ // Not copyable.
+ SimplifyWalker(const SimplifyWalker&) = delete;
+ SimplifyWalker& operator=(const SimplifyWalker&) = delete;
+};
+
+// Returns a simplified copy of this regular expression, or NULL if either
+// walk fails or stops early.
+// The result uses traditional Unix egrep features only, plus the Perl (?:)
+// non-capturing parentheses; no other POSIX or Perl additions remain.
+// It captures exactly the same subexpressions (with the same indices)
+// as the original.
+// Does not edit the current object.
+// Caller must Decref() the return value when done with it.
+Regexp* Regexp::Simplify() {
+  // Pass 1: merge adjacent repetitions of the same thing.
+  CoalesceWalker coalescer;
+  Regexp* coalesced = coalescer.Walk(this, NULL);
+  if (coalesced == NULL)
+    return NULL;
+  if (coalescer.stopped_early()) {
+    coalesced->Decref();
+    return NULL;
+  }
+
+  // Pass 2: simplify the coalesced tree, which is then no longer needed.
+  SimplifyWalker simplifier;
+  Regexp* simplified = simplifier.Walk(coalesced, NULL);
+  coalesced->Decref();
+  if (simplified != NULL && simplifier.stopped_early()) {
+    simplified->Decref();
+    simplified = NULL;
+  }
+  return simplified;
+}
+
+#define Simplify DontCallSimplify // Avoid accidental recursion
+
+// Helper for PostVisit implementations: reports whether any entry of
+// child_args differs from the corresponding entry of re->sub().
+// Returns true as soon as a difference is found, leaving child_args alone.
+// In the common case where nothing changed, calls Decref() on every
+// child_args entry and returns false, so PostVisit must return re->Incref().
+static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
+  const int count = re->nsub();
+
+  // Skip over the leading children that are unchanged.
+  int k = 0;
+  while (k < count && child_args[k] == re->sub()[k])
+    k++;
+  if (k < count)
+    return true;
+
+  // Nothing changed: drop the references held by child_args.
+  for (k = 0; k < count; k++)
+    child_args[k]->Decref();
+  return false;
+}
+
+// Copies a walk result by taking another reference to the same Regexp.
+Regexp* CoalesceWalker::Copy(Regexp* re) {
+ return re->Incref();
+}
+
+// Fallback visit: logs a DFATAL (except in fuzzing builds) and returns a new
+// reference to re unchanged.
+Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+ // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
+#endif
+ return re->Incref();
+}
+
+// Builds the coalesced version of re from its coalesced children.
+// The references held in child_args are consumed: they are either released
+// (when re itself is returned) or stored in the newly built Regexp.
+// Only concatenations have their adjacent children merged; every other op is
+// returned as is or rebuilt around the new children.
+Regexp* CoalesceWalker::PostVisit(Regexp* re,
+ Regexp* parent_arg,
+ Regexp* pre_arg,
+ Regexp** child_args,
+ int nchild_args) {
+ // Leaves have nothing to coalesce.
+ if (re->nsub() == 0)
+ return re->Incref();
+
+ if (re->op() != kRegexpConcat) {
+ // ChildArgsChanged() has already released child_args when it returns false.
+ if (!ChildArgsChanged(re, child_args))
+ return re->Incref();
+
+ // Something changed. Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ // Repeats and Captures have additional data that must be copied.
+ if (re->op() == kRegexpRepeat) {
+ nre->min_ = re->min();
+ nre->max_ = re->max();
+ } else if (re->op() == kRegexpCapture) {
+ nre->cap_ = re->cap();
+ }
+ return nre;
+ }
+
+ // re is a concatenation: first check whether any adjacent pair of children
+ // can be coalesced at all, so that the common case stays cheap.
+ bool can_coalesce = false;
+ for (int i = 0; i < re->nsub(); i++) {
+ if (i+1 < re->nsub() &&
+ CanCoalesce(child_args[i], child_args[i+1])) {
+ can_coalesce = true;
+ break;
+ }
+ }
+ if (!can_coalesce) {
+ if (!ChildArgsChanged(re, child_args))
+ return re->Incref();
+
+ // Something changed. Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ return nre;
+ }
+
+ // Coalesce left to right. DoCoalesce() rewrites both array slots in place,
+ // so the result of one merge can take part in the next one.
+ for (int i = 0; i < re->nsub(); i++) {
+ if (i+1 < re->nsub() &&
+ CanCoalesce(child_args[i], child_args[i+1]))
+ DoCoalesce(&child_args[i], &child_args[i+1]);
+ }
+ // Determine how many empty matches were left by DoCoalesce.
+ // (This counts every empty-match child, whatever its origin.)
+ int n = 0;
+ for (int i = n; i < re->nsub(); i++) {
+ if (child_args[i]->op() == kRegexpEmptyMatch)
+ n++;
+ }
+ // Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub() - n);
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0, j = 0; i < re->nsub(); i++) {
+ // Empty matches are dropped from the concatenation and released.
+ if (child_args[i]->op() == kRegexpEmptyMatch) {
+ child_args[i]->Decref();
+ continue;
+ }
+ nre_subs[j] = child_args[i];
+ j++;
+ }
+ return nre;
+}
+
+// Reports whether the adjacent concatenation pieces r1 and r2 can be merged
+// into a single repeat. r1 must be a star/plus/quest/repeat of a literal,
+// char class, any char or any byte; r2 must then be one of:
+//   - a star/plus/quest/repeat of the same thing with the same greediness,
+//   - a plain occurrence of that same thing, or
+//   - a literal string starting with that literal, with the same case folding.
+bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
+  auto is_repetition = [](Regexp* r) {
+    return r->op() == kRegexpStar ||
+           r->op() == kRegexpPlus ||
+           r->op() == kRegexpQuest ||
+           r->op() == kRegexpRepeat;
+  };
+
+  if (!is_repetition(r1))
+    return false;
+
+  // The thing being repeated by r1.
+  Regexp* unit = r1->sub()[0];
+  const bool unit_ok = unit->op() == kRegexpLiteral ||
+                       unit->op() == kRegexpCharClass ||
+                       unit->op() == kRegexpAnyChar ||
+                       unit->op() == kRegexpAnyByte;
+  if (!unit_ok)
+    return false;
+
+  // r2 repeats the same unit; the greediness flags must agree.
+  if (is_repetition(r2) &&
+      Regexp::Equal(unit, r2->sub()[0]) &&
+      ((r1->parse_flags() & Regexp::NonGreedy) ==
+       (r2->parse_flags() & Regexp::NonGreedy))) {
+    return true;
+  }
+
+  // r2 is a single occurrence of the unit.
+  if (Regexp::Equal(unit, r2))
+    return true;
+
+  // r2 is a literal string that begins with the unit; the case-folding
+  // flags must agree.
+  return unit->op() == kRegexpLiteral &&
+         r2->op() == kRegexpLiteralString &&
+         r2->runes()[0] == unit->rune() &&
+         ((unit->parse_flags() & Regexp::FoldCase) ==
+          (r2->parse_flags() & Regexp::FoldCase));
+}
+
+// Merges *r1ptr and *r2ptr into one kRegexpRepeat of r1's operand.
+// The repeat starts with r1's bounds (max == -1 meaning unbounded) and is
+// widened by what r2 contributes. On success the two slots are overwritten
+// (empty match + repeat, or repeat + rest of the literal string) and the
+// references to the old r1 and r2 are released. On an unexpected op, the
+// slots are left untouched and a DFATAL is logged.
+void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
+ Regexp* r1 = *r1ptr;
+ Regexp* r2 = *r2ptr;
+
+ // The bounds passed here are placeholders; they are set just below.
+ Regexp* nre = Regexp::Repeat(
+ r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
+
+ // Seed the bounds from r1.
+ switch (r1->op()) {
+ case kRegexpStar:
+ nre->min_ = 0;
+ nre->max_ = -1;
+ break;
+
+ case kRegexpPlus:
+ nre->min_ = 1;
+ nre->max_ = -1;
+ break;
+
+ case kRegexpQuest:
+ nre->min_ = 0;
+ nre->max_ = 1;
+ break;
+
+ case kRegexpRepeat:
+ nre->min_ = r1->min();
+ nre->max_ = r1->max();
+ break;
+
+ default:
+ nre->Decref();
+ LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
+ return;
+ }
+
+ // Add r2's contribution. An unbounded max (-1) stays unbounded.
+ switch (r2->op()) {
+ case kRegexpStar:
+ nre->max_ = -1;
+ goto LeaveEmpty;
+
+ case kRegexpPlus:
+ nre->min_++;
+ nre->max_ = -1;
+ goto LeaveEmpty;
+
+ case kRegexpQuest:
+ if (nre->max() != -1)
+ nre->max_++;
+ goto LeaveEmpty;
+
+ case kRegexpRepeat:
+ nre->min_ += r2->min();
+ if (r2->max() == -1)
+ nre->max_ = -1;
+ else if (nre->max() != -1)
+ nre->max_ += r2->max();
+ goto LeaveEmpty;
+
+ case kRegexpLiteral:
+ case kRegexpCharClass:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ // A single occurrence counts as exactly one more repetition.
+ nre->min_++;
+ if (nre->max() != -1)
+ nre->max_++;
+ goto LeaveEmpty;
+
+ // r2 was absorbed completely: the first slot becomes an empty match and
+ // the second slot holds the merged repeat.
+ LeaveEmpty:
+ *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
+ *r2ptr = nre;
+ break;
+
+ case kRegexpLiteralString: {
+ Rune r = r1->sub()[0]->rune();
+ // Determine how much of the literal string is removed.
+ // We know that we have at least one rune. :)
+ int n = 1;
+ while (n < r2->nrunes() && r2->runes()[n] == r)
+ n++;
+ nre->min_ += n;
+ if (nre->max() != -1)
+ nre->max_ += n;
+ // The whole string was absorbed: same outcome as the cases above.
+ if (n == r2->nrunes())
+ goto LeaveEmpty;
+ // Otherwise the repeat comes first, followed by the unabsorbed tail.
+ *r1ptr = nre;
+ *r2ptr = Regexp::LiteralString(
+ &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
+ break;
+ }
+
+ default:
+ nre->Decref();
+ LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
+ return;
+ }
+
+ // The slots no longer refer to the old nodes, so release them.
+ r1->Decref();
+ r2->Decref();
+}
+
+// Copies a walk result by taking another reference to the same Regexp.
+Regexp* SimplifyWalker::Copy(Regexp* re) {
+ return re->Incref();
+}
+
+// Fallback visit: logs a DFATAL (except in fuzzing builds) and returns a new
+// reference to re unchanged.
+Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+ // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
+#endif
+ return re->Incref();
+}
+
+// Short-circuits the walk for subtrees that are already simple: sets *stop
+// and returns a new reference to re. Otherwise returns NULL and leaves
+// *stop untouched.
+Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
+  if (!re->simple())
+    return NULL;
+
+  *stop = true;
+  return re->Incref();
+}
+
+// Builds the simplified version of re from its simplified children.
+// The references held in child_args are consumed: each one is either
+// released, returned directly, or stored in the newly built Regexp.
+// When re can be reused unchanged it is marked simple and returned with a
+// new reference.
+Regexp* SimplifyWalker::PostVisit(Regexp* re,
+ Regexp* parent_arg,
+ Regexp* pre_arg,
+ Regexp** child_args,
+ int nchild_args) {
+ switch (re->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpLiteral:
+ case kRegexpLiteralString:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpBeginText:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpEndText:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpHaveMatch:
+ // All these are always simple.
+ re->simple_ = true;
+ return re->Incref();
+
+ case kRegexpConcat:
+ case kRegexpAlternate: {
+ // These are simple as long as the subpieces are simple.
+ // ChildArgsChanged() has already released child_args when it returns false.
+ if (!ChildArgsChanged(re, child_args)) {
+ re->simple_ = true;
+ return re->Incref();
+ }
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpCapture: {
+ Regexp* newsub = child_args[0];
+ // Child unchanged: reuse re and drop the extra child reference.
+ if (newsub == re->sub()[0]) {
+ newsub->Decref();
+ re->simple_ = true;
+ return re->Incref();
+ }
+ Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
+ nre->AllocSub(1);
+ nre->sub()[0] = newsub;
+ nre->cap_ = re->cap();
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest: {
+ Regexp* newsub = child_args[0];
+ // Special case: repeat the empty string as much as
+ // you want, but it's still the empty string.
+ if (newsub->op() == kRegexpEmptyMatch)
+ return newsub;
+
+ // These are simple as long as the subpiece is simple.
+ if (newsub == re->sub()[0]) {
+ newsub->Decref();
+ re->simple_ = true;
+ return re->Incref();
+ }
+
+ // These are also idempotent if flags are constant.
+ // (For example, a star of a star with the same flags is just that star.)
+ if (re->op() == newsub->op() &&
+ re->parse_flags() == newsub->parse_flags())
+ return newsub;
+
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(1);
+ nre->sub()[0] = newsub;
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpRepeat: {
+ Regexp* newsub = child_args[0];
+ // Special case: repeat the empty string as much as
+ // you want, but it's still the empty string.
+ if (newsub->op() == kRegexpEmptyMatch)
+ return newsub;
+
+ // SimplifyRepeat() does not consume newsub, so release it afterwards.
+ Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
+ re->parse_flags());
+ newsub->Decref();
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpCharClass: {
+ Regexp* nre = SimplifyCharClass(re);
+ nre->simple_ = true;
+ return nre;
+ }
+ }
+
+ // Unknown op: report it and hand back re unchanged.
+ LOG(ERROR) << "Simplify case not handled: " << re->op();
+ return re->Incref();
+}
+
+// Builds the two-element concatenation re1 re2 with the given parse flags.
+// Takes over the caller's references to re1 and re2; the caller owns the
+// reference to the returned Regexp.
+Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
+                                Regexp::ParseFlags parse_flags) {
+  Regexp* pair = new Regexp(kRegexpConcat, parse_flags);
+  pair->AllocSub(2);
+  pair->sub()[0] = re1;
+  pair->sub()[1] = re2;
+  return pair;
+}
+
+// Reports whether re is an empty-width op: a line or text anchor, or a
+// (non-)word boundary.
+static bool IsEmptyOp(Regexp* re) {
+  switch (re->op()) {
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Simplifies the expression re{min,max} in terms of *, +, and ?.
+// A max of -1 means that there is no upper bound.
+// Returns a new regexp. Does not edit re. Does not consume reference to re.
+// Caller must Decref return value when done with it.
+// The result will *not* necessarily have the right capturing parens
+// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
+// but in the Regexp* representation, both (x) are marked as $1.
+Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
+ Regexp::ParseFlags f) {
+ // For an empty-width op OR a concatenation or alternation of empty-width
+ // ops, cap the repetition count at 1.
+ // (An unbounded max of -1 is left as is by std::min.)
+ if (IsEmptyOp(re) ||
+ ((re->op() == kRegexpConcat ||
+ re->op() == kRegexpAlternate) &&
+ std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
+ min = std::min(min, 1);
+ max = std::min(max, 1);
+ }
+
+ // x{n,} means at least n matches of x.
+ if (max == -1) {
+ // Special case: x{0,} is x*
+ if (min == 0)
+ return Regexp::Star(re->Incref(), f);
+
+ // Special case: x{1,} is x+
+ if (min == 1)
+ return Regexp::Plus(re->Incref(), f);
+
+ // General case: x{4,} is xxxx+
+ // That is min-1 plain copies followed by one x+.
+ PODArray<Regexp*> nre_subs(min);
+ for (int i = 0; i < min-1; i++)
+ nre_subs[i] = re->Incref();
+ nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
+ return Regexp::Concat(nre_subs.data(), min, f);
+ }
+
+ // Special case: (x){0} matches only empty string.
+ if (min == 0 && max == 0)
+ return new Regexp(kRegexpEmptyMatch, f);
+
+ // Special case: x{1} is just x.
+ if (min == 1 && max == 1)
+ return re->Incref();
+
+ // General case: x{n,m} means n copies of x and m-n copies of x?.
+ // The machine will do less work if we nest the final m-n copies,
+ // so that x{2,5} = xx(x(x(x)?)?)?
+
+ // Build leading prefix: xx. Capturing only on the last one.
+ Regexp* nre = NULL;
+ if (min > 0) {
+ PODArray<Regexp*> nre_subs(min);
+ for (int i = 0; i < min; i++)
+ nre_subs[i] = re->Incref();
+ nre = Regexp::Concat(nre_subs.data(), min, f);
+ }
+
+ // Build and attach suffix: (x(x(x)?)?)?
+ // The innermost x? is built first, then wrapped once per extra copy.
+ if (max > min) {
+ Regexp* suf = Regexp::Quest(re->Incref(), f);
+ for (int i = min+1; i < max; i++)
+ suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
+ if (nre == NULL)
+ nre = suf;
+ else
+ nre = Concat2(nre, suf, f);
+ }
+
+ if (nre == NULL) {
+ // Some degenerate case, like min > max, or min < max < 0.
+ // This shouldn't happen, because the parser rejects such regexps.
+ LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
+ return new Regexp(kRegexpNoMatch, f);
+ }
+
+ return nre;
+}
+
+// Simplifies the character class held by re: an empty class becomes a
+// no-match, a full class becomes any-char, and anything else is returned
+// as a new reference to re itself.
+// Caller must Decref the return value when done with it.
+Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
+  CharClass* klass = re->cc();
+
+  Regexp* result;
+  if (klass->empty()) {
+    result = new Regexp(kRegexpNoMatch, re->parse_flags());
+  } else if (klass->full()) {
+    result = new Regexp(kRegexpAnyChar, re->parse_flags());
+  } else {
+    result = re->Incref();
+  }
+  return result;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/sparse_array.h b/third_party/re2/src/re2/sparse_array.h
new file mode 100644
index 000000000..09ffe086b
--- /dev/null
+++ b/third_party/re2/src/re2/sparse_array.h
@@ -0,0 +1,392 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_SPARSE_ARRAY_H_
+#define RE2_SPARSE_ARRAY_H_
+
+// DESCRIPTION
+//
+// SparseArray<T>(m) is a map from integers in [0, m) to T values.
+// It requires about (sizeof(T)+2*sizeof(int))*m memory (one int per sparse_
+// slot plus one index/value pair per dense_ slot), but it provides
+// fast iteration through the elements in the array and fast clearing
+// of the array. The array has a concept of certain elements being
+// uninitialized (having no value).
+//
+// Insertion and deletion are constant time operations.
+//
+// Allocating the array is a constant time operation
+// when memory allocation is a constant time operation.
+//
+// Clearing the array is a constant time operation (unusual!).
+//
+// Iterating through the array is an O(n) operation, where n
+// is the number of items in the array (not O(m)).
+//
+// The array iterator visits entries in the order they were first
+// inserted into the array. It is safe to add items to the array while
+// using an iterator: the iterator will visit indices added to the array
+// during the iteration, but will not re-visit indices whose values
+// change after visiting. Thus SparseArray can be a convenient
+// implementation of a work queue.
+//
+// The SparseArray implementation is NOT thread-safe. It is up to the
+// caller to make sure only one thread is accessing the array. (Typically
+// these arrays are temporary values and used in situations where speed is
+// important.)
+//
+// The SparseArray interface does not present all the usual STL bells and
+// whistles.
+//
+// Implemented with reference to Briggs & Torczon, An Efficient
+// Representation for Sparse Sets, ACM Letters on Programming Languages
+// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
+//
+// Briggs & Torczon popularized this technique, but it had been known
+// long before their paper. They point out that Aho, Hopcroft, and
+// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
+// 1986 Programming Pearls both hint at the technique in exercises to the
+// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
+// exercise 8).
+//
+// Briggs & Torczon describe a sparse set implementation. I have
+// trivially generalized it to create a sparse array (actually the original
+// target of the AHU and Bentley exercises).
+
+// IMPLEMENTATION
+//
+// SparseArray is an array dense_ and an array sparse_ of identical size.
+// At any point, the number of elements in the sparse array is size_.
+//
+// The array dense_ contains the size_ elements in the sparse array (with
+// their indices),
+// in the order that the elements were first inserted. This array is dense:
+// the size_ pairs are dense_[0] through dense_[size_-1].
+//
+// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
+// For indices present in the array, dense_[sparse_[i]].index_ == i.
+// For indices not present in the array, sparse_ can contain any value at all,
+// perhaps outside the range [0, size_) but perhaps not.
+//
+// The lax requirement on sparse_ values makes clearing the array very easy:
+// set size_ to 0. Lookups are slightly more complicated.
+// An index i has a value in the array if and only if:
+// sparse_[i] is in [0, size_) AND
+// dense_[sparse_[i]].index_ == i.
+// If both these properties hold, only then it is safe to refer to
+// dense_[sparse_[i]].value_
+// as the value associated with index i.
+//
+// To insert a new entry, set sparse_[i] to size_,
+// initialize dense_[size_], and then increment size_.
+//
+// To make the sparse array as efficient as possible for non-primitive types,
+// elements may or may not be destroyed when they are deleted from the sparse
+// array through a call to resize(). They immediately become inaccessible, but
+// they are only guaranteed to be destroyed when the SparseArray destructor is
+// called.
+//
+// A moved-from SparseArray will be empty.
+
+// Doing this simplifies the logic below.
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#endif
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "re2/pod_array.h"
+
+namespace re2 {
+
+// A map from integers in [0, max_size()) to Value, stored as a dense_ array
+// of index/value pairs in insertion order plus a sparse_ array that maps an
+// index to its position in dense_. See the IMPLEMENTATION notes at the top
+// of this file for the invariants.
+template<typename Value>
+class SparseArray {
+ public:
+  SparseArray();
+  explicit SparseArray(int max_size);
+  ~SparseArray();
+
+  // IndexValue pairs: exposed in SparseArray::iterator.
+  class IndexValue;
+
+  typedef IndexValue* iterator;
+  typedef const IndexValue* const_iterator;
+
+  SparseArray(const SparseArray& src);
+  SparseArray(SparseArray&& src);
+
+  SparseArray& operator=(const SparseArray& src);
+  SparseArray& operator=(SparseArray&& src);
+
+  // Return the number of entries in the array.
+  int size() const {
+    return size_;
+  }
+
+  // Indicate whether the array is empty.
+  // Returns bool: this is a predicate, and callers that treated the old int
+  // result as a truth value or an int keep working via implicit conversion.
+  bool empty() const {
+    return size_ == 0;
+  }
+
+  // Iterate over the array.
+  // Entries are visited in dense_ order: [dense_[0], dense_[size_)).
+  iterator begin() {
+    return dense_.data();
+  }
+  iterator end() {
+    return dense_.data() + size_;
+  }
+
+  const_iterator begin() const {
+    return dense_.data();
+  }
+  const_iterator end() const {
+    return dense_.data() + size_;
+  }
+
+  // Change the maximum size of the array.
+  // Invalidates all iterators.
+  void resize(int new_max_size);
+
+  // Return the maximum size of the array.
+  // Indices can be in the range [0, max_size).
+  int max_size() const {
+    if (dense_.data() != NULL)
+      return dense_.size();
+    else
+      return 0;
+  }
+
+  // Clear the array.
+  // Only the element count is reset; the storage is left untouched.
+  void clear() {
+    size_ = 0;
+  }
+
+  // Check whether index i is in the array.
+  bool has_index(int i) const;
+
+  // Comparison function for sorting.
+  // Can sort the sparse array so that future iterations
+  // will visit indices in increasing order using
+  //   std::sort(arr.begin(), arr.end(), arr.less);
+  static bool less(const IndexValue& a, const IndexValue& b);
+
+ public:
+  // Set the value at index i to v.
+  iterator set(int i, const Value& v) {
+    return SetInternal(true, i, v);
+  }
+
+  // Set the value at new index i to v.
+  // Fast but unsafe: only use if has_index(i) is false.
+  iterator set_new(int i, const Value& v) {
+    return SetInternal(false, i, v);
+  }
+
+  // Set the value at index i to v.
+  // Fast but unsafe: only use if has_index(i) is true.
+  iterator set_existing(int i, const Value& v) {
+    return SetExistingInternal(i, v);
+  }
+
+  // Get the value at index i.
+  // Fast but unsafe: only use if has_index(i) is true.
+  Value& get_existing(int i) {
+    assert(has_index(i));
+    return dense_[sparse_[i]].value_;
+  }
+  const Value& get_existing(int i) const {
+    assert(has_index(i));
+    return dense_[sparse_[i]].value_;
+  }
+
+ private:
+  // Shared implementation of set() and set_new(): creates index i if needed
+  // (allow_existing says whether it may already be present), then stores v.
+  iterator SetInternal(bool allow_existing, int i, const Value& v) {
+    DebugCheckInvariants();
+    // The unsigned comparison also rejects negative i.
+    if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
+      assert(false && "illegal index");
+      // Semantically, end() would be better here, but we already know
+      // the user did something stupid, so begin() insulates them from
+      // dereferencing an invalid pointer.
+      return begin();
+    }
+    if (!allow_existing) {
+      assert(!has_index(i));
+      create_index(i);
+    } else {
+      if (!has_index(i))
+        create_index(i);
+    }
+    return SetExistingInternal(i, v);
+  }
+
+  // Stores v at the already-present index i and returns an iterator to it.
+  iterator SetExistingInternal(int i, const Value& v) {
+    DebugCheckInvariants();
+    assert(has_index(i));
+    dense_[sparse_[i]].value_ = v;
+    DebugCheckInvariants();
+    return dense_.data() + sparse_[i];
+  }
+
+  // Add the index i to the array.
+  // Only use if has_index(i) is known to be false.
+  // Since it doesn't set the value associated with i,
+  // this function is private, only intended as a helper
+  // for other methods.
+  void create_index(int i);
+
+  // In debug mode, verify that some invariant properties of the class
+  // are being maintained. This is called at the end of the constructor
+  // and at the beginning and end of all public non-const member functions.
+  void DebugCheckInvariants() const;
+
+  // Initializes memory for elements [min, max).
+  // This only does work under MemorySanitizer or when RE2_ON_VALGRIND is
+  // defined; otherwise sparse_ is deliberately left uninitialized.
+  void MaybeInitializeMemory(int min, int max) {
+#if __has_feature(memory_sanitizer)
+    __msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]);
+#elif defined(RE2_ON_VALGRIND)
+    for (int i = min; i < max; i++) {
+      sparse_[i] = 0xababababU;
+    }
+#endif
+  }
+
+  // Number of entries currently in the array.
+  int size_ = 0;
+  // Maps an index to its position in dense_ (meaningful only if has_index).
+  PODArray<int> sparse_;
+  // The entries themselves, in insertion order.
+  PODArray<IndexValue> dense_;
+};
+
+// Default constructor: relies on the in-class member initializers, so
+// size_ starts at 0 and both arrays are default-constructed.
+template<typename Value>
+SparseArray<Value>::SparseArray() = default;
+
+// Copy constructor: allocates arrays with the same maximum size as src and
+// copies every slot of both arrays, not just the first size_ entries.
+template<typename Value>
+SparseArray<Value>::SparseArray(const SparseArray& src)
+    : size_(src.size_),
+      sparse_(src.max_size()),
+      dense_(src.max_size()) {
+  const int capacity = src.max_size();
+  std::copy_n(src.sparse_.data(), capacity, sparse_.data());
+  std::copy_n(src.dense_.data(), capacity, dense_.data());
+}
+
+// Move constructor: takes over src's arrays and leaves src empty
+// (its size_ is reset to 0).
+template<typename Value>
+SparseArray<Value>::SparseArray(SparseArray&& src)
+ : size_(src.size_),
+ sparse_(std::move(src.sparse_)),
+ dense_(std::move(src.dense_)) {
+ src.size_ = 0;
+}
+
+// Copy assignment: makes *this a copy of src, with the same maximum size.
+// The copy is built completely in temporaries before any member of *this is
+// touched. That keeps self-assignment (a = a) safe: the previous order
+// replaced sparse_ and dense_ with fresh buffers first and then copied from
+// src, which for src == *this read the new, uninitialized buffers and lost
+// the contents. It also means a failure while allocating or copying leaves
+// *this unchanged. resize() in this file builds its arrays the same way.
+template<typename Value>
+SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
+  // Construct and fill these first for exception safety.
+  PODArray<int> a(src.max_size());
+  PODArray<IndexValue> b(src.max_size());
+  std::copy_n(src.sparse_.data(), src.max_size(), a.data());
+  std::copy_n(src.dense_.data(), src.max_size(), b.data());
+
+  // Commit: from here on only moves and an int assignment happen.
+  size_ = src.size_;
+  sparse_ = std::move(a);
+  dense_ = std::move(b);
+  return *this;
+}
+
+// Move assignment: takes over src's arrays and leaves src empty
+// (its size_ is reset to 0).
+template<typename Value>
+SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) {
+ size_ = src.size_;
+ sparse_ = std::move(src.sparse_);
+ dense_ = std::move(src.dense_);
+ src.size_ = 0;
+ return *this;
+}
+
+// IndexValue pairs: exposed in SparseArray::iterator.
+// Each entry of dense_ is one of these: an index together with the value
+// stored for it. The index is read-only for users; only SparseArray (a
+// friend) writes the members directly.
+template<typename Value>
+class SparseArray<Value>::IndexValue {
+ public:
+ int index() const { return index_; }
+ Value& value() { return value_; }
+ const Value& value() const { return value_; }
+
+ private:
+ friend class SparseArray;
+ int index_;
+ Value value_;
+};
+
+// Changes the maximum size of the array to new_max_size.
+// Growing reallocates both arrays and copies every existing slot across;
+// shrinking keeps the storage and only truncates size_ if necessary.
+// Invalidates all iterators.
+template<typename Value>
+void SparseArray<Value>::resize(int new_max_size) {
+  DebugCheckInvariants();
+  const int old_max_size = max_size();
+  if (new_max_size > old_max_size) {
+    // Build the bigger arrays completely before replacing the members,
+    // for exception safety.
+    PODArray<int> grown_sparse(new_max_size);
+    PODArray<IndexValue> grown_dense(new_max_size);
+
+    std::copy_n(sparse_.data(), old_max_size, grown_sparse.data());
+    std::copy_n(dense_.data(), old_max_size, grown_dense.data());
+
+    sparse_ = std::move(grown_sparse);
+    dense_ = std::move(grown_dense);
+
+    MaybeInitializeMemory(old_max_size, new_max_size);
+  }
+  // Entries beyond the new maximum size become inaccessible.
+  size_ = std::min(size_, new_max_size);
+  DebugCheckInvariants();
+}
+
+// Reports whether index i currently has a value in the array.
+// An out-of-range i asserts in debug builds and yields false otherwise.
+template<typename Value>
+bool SparseArray<Value>::has_index(int i) const {
+  assert(i >= 0);
+  assert(i < max_size());
+  // Comparing as unsigned rejects negative i as well as i >= max_size().
+  const uint32_t limit = static_cast<uint32_t>(max_size());
+  if (static_cast<uint32_t>(i) >= limit)
+    return false;
+
+  // sparse_[i] may hold garbage; it only counts if it points into the live
+  // part of dense_ (unsigned comparison avoids checking sparse_[i] < 0)...
+  const uint32_t slot = static_cast<uint32_t>(sparse_[i]);
+  if (slot >= static_cast<uint32_t>(size_))
+    return false;
+
+  // ...and that dense_ entry points back at i.
+  return dense_[sparse_[i]].index_ == i;
+}
+
+// Appends index i to the array without setting its value.
+// Only use if has_index(i) is known to be false and there is room left.
+template<typename Value>
+void SparseArray<Value>::create_index(int i) {
+  assert(!has_index(i));
+  assert(size_ < max_size());
+  // The new entry goes into the first unused dense_ slot.
+  const int slot = size_++;
+  sparse_[i] = slot;
+  dense_[slot].index_ = i;
+}
+
+// Constructs an empty array that accepts indices in [0, max_size).
+// size_ is still 0 here (from its in-class initializer), so the
+// MaybeInitializeMemory() call covers the whole of sparse_.
+template<typename Value> SparseArray<Value>::SparseArray(int max_size) :
+ sparse_(max_size), dense_(max_size) {
+ MaybeInitializeMemory(size_, max_size);
+ DebugCheckInvariants();
+}
+
+// Destructor: only checks the invariants; the arrays are released by their
+// own destructors.
+template<typename Value> SparseArray<Value>::~SparseArray() {
+ DebugCheckInvariants();
+}
+
+// Asserts that 0 <= size_ <= max_size(). Does nothing when assert() is
+// compiled out.
+template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
+ assert(0 <= size_);
+ assert(size_ <= max_size());
+}
+
+// Comparison function for sorting.
+// Orders entries by increasing index; the values are not compared.
+template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
+ const IndexValue& b) {
+ return a.index_ < b.index_;
+}
+
+} // namespace re2
+
+#endif // RE2_SPARSE_ARRAY_H_
diff --git a/third_party/re2/src/re2/sparse_set.h b/third_party/re2/src/re2/sparse_set.h
new file mode 100644
index 000000000..06ed88d81
--- /dev/null
+++ b/third_party/re2/src/re2/sparse_set.h
@@ -0,0 +1,264 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_SPARSE_SET_H_
+#define RE2_SPARSE_SET_H_
+
+// DESCRIPTION
+//
+// SparseSet(m) is a set of integers in [0, m).
+// It requires 2*sizeof(int)*m memory, but it provides
+// fast iteration through the elements in the set and fast clearing
+// of the set.
+//
+// Insertion and deletion are constant time operations.
+//
+// Allocating the set is a constant time operation
+// when memory allocation is a constant time operation.
+//
+// Clearing the set is a constant time operation (unusual!).
+//
+// Iterating through the set is an O(n) operation, where n
+// is the number of items in the set (not O(m)).
+//
+// The set iterator visits entries in the order they were first
+// inserted into the set. It is safe to add items to the set while
+// using an iterator: the iterator will visit indices added to the set
+// during the iteration, but will not re-visit indices that are inserted
+// again after visiting. Thus SparseSet can be a convenient
+// implementation of a work queue.
+//
+// The SparseSet implementation is NOT thread-safe. It is up to the
+// caller to make sure only one thread is accessing the set. (Typically
+// these sets are temporary values and used in situations where speed is
+// important.)
+//
+// The SparseSet interface does not present all the usual STL bells and
+// whistles.
+//
+// Implemented with reference to Briggs & Torczon, An Efficient
+// Representation for Sparse Sets, ACM Letters on Programming Languages
+// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
+//
+// This is a specialization of sparse array; see sparse_array.h.
+
+// IMPLEMENTATION
+//
+// See sparse_array.h for implementation details.
+
+// Doing this simplifies the logic below.
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#endif
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "re2/pod_array.h"
+
+namespace re2 {
+
+template<typename Value>
+class SparseSetT {
+ public:
+ SparseSetT();
+ explicit SparseSetT(int max_size);
+ ~SparseSetT();
+
+ typedef int* iterator;
+ typedef const int* const_iterator;
+
+ // Return the number of entries in the set.
+ int size() const {
+ return size_;
+ }
+
+  // Indicate whether the set is empty (true iff size() == 0).
+  bool empty() const {
+    return size_ == 0;
+  }
+
+ // Iterate over the set.
+ iterator begin() {
+ return dense_.data();
+ }
+ iterator end() {
+ return dense_.data() + size_;
+ }
+
+ const_iterator begin() const {
+ return dense_.data();
+ }
+ const_iterator end() const {
+ return dense_.data() + size_;
+ }
+
+ // Change the maximum size of the set.
+ // Invalidates all iterators.
+ void resize(int new_max_size);
+
+ // Return the maximum size of the set.
+ // Indices can be in the range [0, max_size).
+ int max_size() const {
+ if (dense_.data() != NULL)
+ return dense_.size();
+ else
+ return 0;
+ }
+
+ // Clear the set.
+ void clear() {
+ size_ = 0;
+ }
+
+ // Check whether index i is in the set.
+ bool contains(int i) const;
+
+ // Comparison function for sorting.
+ // Can sort the sparse set so that future iterations
+ // will visit indices in increasing order using
+ // std::sort(arr.begin(), arr.end(), arr.less);
+ static bool less(int a, int b);
+
+ public:
+ // Insert index i into the set.
+ iterator insert(int i) {
+ return InsertInternal(true, i);
+ }
+
+ // Insert index i into the set.
+ // Fast but unsafe: only use if contains(i) is false.
+ iterator insert_new(int i) {
+ return InsertInternal(false, i);
+ }
+
+ private:
+ iterator InsertInternal(bool allow_existing, int i) {
+ DebugCheckInvariants();
+ if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
+ assert(false && "illegal index");
+ // Semantically, end() would be better here, but we already know
+ // the user did something stupid, so begin() insulates them from
+ // dereferencing an invalid pointer.
+ return begin();
+ }
+ if (!allow_existing) {
+ assert(!contains(i));
+ create_index(i);
+ } else {
+ if (!contains(i))
+ create_index(i);
+ }
+ DebugCheckInvariants();
+ return dense_.data() + sparse_[i];
+ }
+
+ // Add the index i to the set.
+ // Only use if contains(i) is known to be false.
+ // This function is private, only intended as a helper
+ // for other methods.
+ void create_index(int i);
+
+ // In debug mode, verify that some invariant properties of the class
+ // are being maintained. This is called at the end of the constructor
+ // and at the beginning and end of all public non-const member functions.
+ void DebugCheckInvariants() const;
+
+ // Initializes memory for elements [min, max).
+ void MaybeInitializeMemory(int min, int max) {
+#if __has_feature(memory_sanitizer)
+ __msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]);
+#elif defined(RE2_ON_VALGRIND)
+ for (int i = min; i < max; i++) {
+ sparse_[i] = 0xababababU;
+ }
+#endif
+ }
+
+ int size_ = 0;
+ PODArray<int> sparse_;
+ PODArray<int> dense_;
+};
+
+template<typename Value>
+SparseSetT<Value>::SparseSetT() = default;
+
+// Change the maximum size of the set.
+// Invalidates all iterators.
+template<typename Value>
+void SparseSetT<Value>::resize(int new_max_size) {
+ DebugCheckInvariants();
+ if (new_max_size > max_size()) {
+ const int old_max_size = max_size();
+
+ // Construct these first for exception safety.
+ PODArray<int> a(new_max_size);
+ PODArray<int> b(new_max_size);
+
+ std::copy_n(sparse_.data(), old_max_size, a.data());
+ std::copy_n(dense_.data(), old_max_size, b.data());
+
+ sparse_ = std::move(a);
+ dense_ = std::move(b);
+
+ MaybeInitializeMemory(old_max_size, new_max_size);
+ }
+ if (size_ > new_max_size)
+ size_ = new_max_size;
+ DebugCheckInvariants();
+}
+
+// Report whether index i is currently a member of the set.
+template<typename Value>
+bool SparseSetT<Value>::contains(int i) const {
+  assert(i >= 0);
+  assert(i < max_size());
+  if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size()))
+    return false;
+  // Comparing as unsigned also rejects a negative slot value.
+  const int slot = sparse_[i];
+  return static_cast<uint32_t>(slot) < static_cast<uint32_t>(size_) &&
+         dense_[slot] == i;
+}
+
+template<typename Value>
+void SparseSetT<Value>::create_index(int i) {
+ assert(!contains(i));
+ assert(size_ < max_size());
+ sparse_[i] = size_;
+ dense_[size_] = i;
+ size_++;
+}
+
+template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) :
+ sparse_(max_size), dense_(max_size) {
+ MaybeInitializeMemory(size_, max_size);
+ DebugCheckInvariants();
+}
+
+template<typename Value> SparseSetT<Value>::~SparseSetT() {
+ DebugCheckInvariants();
+}
+
+template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const {
+ assert(0 <= size_);
+ assert(size_ <= max_size());
+}
+
+// Comparison function for sorting.
+template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
+ return a < b;
+}
+
+typedef SparseSetT<void> SparseSet;
+
+} // namespace re2
+
+#endif // RE2_SPARSE_SET_H_
diff --git a/third_party/re2/src/re2/stringpiece.h b/third_party/re2/src/re2/stringpiece.h
new file mode 100644
index 000000000..e9367bff3
--- /dev/null
+++ b/third_party/re2/src/re2/stringpiece.h
@@ -0,0 +1,18 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_STRINGPIECE_H_
+#define RE2_STRINGPIECE_H_
+
+#include "absl/strings/string_view.h"
+
+namespace re2 {
+
+// Until RE2 requires C++17 and uses std::string_view, allow users to
+// continue to #include "re2/stringpiece.h" and use re2::StringPiece.
+using StringPiece = absl::string_view;
+
+} // namespace re2
+
+#endif // RE2_STRINGPIECE_H_
diff --git a/third_party/re2/src/re2/testing/backtrack.cc b/third_party/re2/src/re2/testing/backtrack.cc
new file mode 100644
index 000000000..90071bb0f
--- /dev/null
+++ b/third_party/re2/src/re2/testing/backtrack.cc
@@ -0,0 +1,272 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+//
+// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
+// except that it remembers where it has been, trading a lot of
+// memory for a lot of time. It exists only for testing purposes.
+//
+// Let me repeat that.
+//
+// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
+// - It uses a ton of memory.
+// - It uses a ton of stack.
+// - It uses CHECK and LOG(FATAL).
+// - It implements unanchored search by repeated anchored search.
+//
+// On the other hand, it is very simple and a good reference
+// implementation for the more complicated regexp packages.
+//
+// In BUILD, this file is linked into the ":testing" library,
+// not the main library, in order to make it harder to pick up
+// accidentally.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "absl/base/macros.h"
+#include "util/logging.h"
+#include "re2/pod_array.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Backtracker holds the state for a backtracking search.
+//
+// Excluding the search parameters, the main search state
+// is just the "capture registers", which record, for the
+// current execution, the string position at which each
+// parenthesis was passed. cap_[0] and cap_[1] are the
+// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
+//
+// To avoid infinite loops during backtracking on expressions
+// like (a*)*, the visited_[] bitmap marks the (state, string-position)
+// pairs that have already been explored and are thus not worth
+// re-exploring if we get there via another path. Modern backtracking
+// libraries engineer their program representation differently, to make
+// such infinite loops possible to avoid without keeping a giant visited_
+// bitmap, but visited_ works fine for a reference implementation
+// and it has the nice benefit of making the search run in linear time.
+class Backtracker {
+ public:
+ explicit Backtracker(Prog* prog);
+
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool longest, absl::string_view* submatch, int nsubmatch);
+
+ private:
+ // Explores from instruction id at string position p looking for a match.
+ // Returns true if found (so that caller can stop trying other possibilities).
+ bool Visit(int id, const char* p);
+
+ // Tries instruction id at string position p.
+ // Returns true if a match is found.
+ bool Try(int id, const char* p);
+
+ // Search parameters
+ Prog* prog_; // program being run
+ absl::string_view text_; // text being searched
+ absl::string_view context_; // greater context of text being searched
+ bool anchored_; // whether search is anchored at text.begin()
+ bool longest_; // whether search wants leftmost-longest match
+ bool endmatch_; // whether search must end at text.end()
+ absl::string_view* submatch_; // submatches to fill in
+ int nsubmatch_; // # of submatches to fill in
+
+ // Search state
+ const char* cap_[64]; // capture registers
+ PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited
+
+ Backtracker(const Backtracker&) = delete;
+ Backtracker& operator=(const Backtracker&) = delete;
+};
+
+Backtracker::Backtracker(Prog* prog)
+ : prog_(prog),
+ anchored_(false),
+ longest_(false),
+ endmatch_(false),
+ submatch_(NULL),
+ nsubmatch_(0) {
+}
+
+// Runs a backtracking search.
+bool Backtracker::Search(absl::string_view text, absl::string_view context,
+ bool anchored, bool longest,
+ absl::string_view* submatch, int nsubmatch) {
+ text_ = text;
+ context_ = context;
+ if (context_.data() == NULL)
+ context_ = text;
+ if (prog_->anchor_start() && BeginPtr(text) > BeginPtr(context_))
+ return false;
+ if (prog_->anchor_end() && EndPtr(text) < EndPtr(context_))
+ return false;
+ anchored_ = anchored | prog_->anchor_start();
+ longest_ = longest | prog_->anchor_end();
+ endmatch_ = prog_->anchor_end();
+ submatch_ = submatch;
+ nsubmatch_ = nsubmatch;
+ CHECK_LT(2*nsubmatch_, static_cast<int>(ABSL_ARRAYSIZE(cap_)));
+ memset(cap_, 0, sizeof cap_);
+
+ // We use submatch_[0] for our own bookkeeping,
+ // so it had better exist.
+ absl::string_view sp0;
+ if (nsubmatch < 1) {
+ submatch_ = &sp0;
+ nsubmatch_ = 1;
+ }
+ submatch_[0] = absl::string_view();
+
+ // Allocate new visited_ bitmap -- size is proportional
+ // to text, so have to reallocate on each call to Search.
+ int nvisited = prog_->size() * static_cast<int>(text.size()+1);
+ nvisited = (nvisited + 31) / 32;
+ visited_ = PODArray<uint32_t>(nvisited);
+ memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
+
+ // Anchored search must start at text.begin().
+ if (anchored_) {
+ cap_[0] = text.data();
+ return Visit(prog_->start(), text.data());
+ }
+
+ // Unanchored search, starting from each possible text position.
+ // Notice that we have to try the empty string at the end of
+ // the text, so the loop condition is p <= text.end(), not p < text.end().
+ for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
+ cap_[0] = p;
+ if (Visit(prog_->start(), p)) // Match must be leftmost; done.
+ return true;
+ // Avoid invoking undefined behavior (arithmetic on a null pointer)
+ // by simply not continuing the loop.
+ if (p == NULL)
+ break;
+ }
+ return false;
+}
+
+// Explores from instruction id at string position p looking for a match.
+// Returns true if found (so that caller can stop trying other possibilities).
+bool Backtracker::Visit(int id, const char* p) {
+  // Check bitmap. If we've already explored from here, either it didn't
+  // match or it did but we're hoping for a better match; either way, don't
+  // go down that road again. The mask is unsigned so bit 31 is well-defined.
+  CHECK(p <= text_.data() + text_.size());
+  int n = id * static_cast<int>(text_.size()+1) +
+          static_cast<int>(p-text_.data());
+  CHECK_LT(n/32, visited_.size());
+  if (visited_[n/32] & (uint32_t{1} << (n&31)))
+    return false;
+  visited_[n/32] |= uint32_t{1} << (n&31);
+
+  Prog::Inst* ip = prog_->inst(id);
+  if (Try(id, p)) {
+    if (longest_ && !ip->last())
+      Visit(id+1, p);  // Keep exploring alternatives for a longer match.
+    return true;
+  }
+  if (!ip->last())
+    return Visit(id+1, p);  // Fall through to the next instruction in the list.
+  return false;
+}
+
+// Tries instruction id at string position p.
+// Returns true if a match is found.
+bool Backtracker::Try(int id, const char* p) {
+ // Pick out byte at current position. If at end of string,
+ // have to explore in hope of finishing a match. Use impossible byte -1.
+ int c = -1;
+ if (p < text_.data() + text_.size())
+ c = *p & 0xFF;
+
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
+ return false; // not reached
+
+ case kInstAltMatch:
+ // Ignored.
+ return false;
+
+ case kInstByteRange:
+ if (ip->Matches(c))
+ return Visit(ip->out(), p+1);
+ return false;
+
+ case kInstCapture:
+ if (0 <= ip->cap() &&
+ ip->cap() < static_cast<int>(ABSL_ARRAYSIZE(cap_))) {
+ // Capture p to register, but save old value.
+ const char* q = cap_[ip->cap()];
+ cap_[ip->cap()] = p;
+ bool ret = Visit(ip->out(), p);
+ // Restore old value as we backtrack.
+ cap_[ip->cap()] = q;
+ return ret;
+ }
+ return Visit(ip->out(), p);
+
+ case kInstEmptyWidth:
+ if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+ return false;
+ return Visit(ip->out(), p);
+
+ case kInstNop:
+ return Visit(ip->out(), p);
+
+ case kInstMatch:
+ // We found a match. If it's the best so far, record the
+ // parameters in the caller's submatch_ array.
+ if (endmatch_ && p != context_.data() + context_.size())
+ return false;
+ cap_[1] = p;
+ if (submatch_[0].data() == NULL ||
+ (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
+ // First match so far - or better match.
+ for (int i = 0; i < nsubmatch_; i++)
+ submatch_[i] = absl::string_view(
+ cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
+ }
+ return true;
+
+ case kInstFail:
+ return false;
+ }
+}
+
+// Runs a backtracking search.
+bool Prog::UnsafeSearchBacktrack(absl::string_view text,
+ absl::string_view context, Anchor anchor,
+ MatchKind kind, absl::string_view* match,
+ int nmatch) {
+ // If full match, we ask for an anchored longest match
+ // and then check that match[0] == text.
+ // So make sure match[0] exists.
+ absl::string_view sp0;
+ if (kind == kFullMatch) {
+ anchor = kAnchored;
+ if (nmatch < 1) {
+ match = &sp0;
+ nmatch = 1;
+ }
+ }
+
+ // Run the search.
+ Backtracker b(this);
+ bool anchored = anchor == kAnchored;
+ bool longest = kind != kFirstMatch;
+ if (!b.Search(text, context, anchored, longest, match, nmatch))
+ return false;
+ if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
+ return false;
+ return true;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/charclass_test.cc b/third_party/re2/src/re2/testing/charclass_test.cc
new file mode 100644
index 000000000..ad95d6c26
--- /dev/null
+++ b/third_party/re2/src/re2/testing/charclass_test.cc
@@ -0,0 +1,228 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test character class manipulations.
+
+#include <stdio.h>
+
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "util/utf.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct CCTest {
+ struct {
+ Rune lo;
+ Rune hi;
+ } add[10];
+ int remove;
+ struct {
+ Rune lo;
+ Rune hi;
+ } final[10];
+};
+
+static CCTest tests[] = {
+ { { { 10, 20 }, {-1} }, -1,
+ { { 10, 20 }, {-1} } },
+
+ { { { 10, 20 }, { 20, 30 }, {-1} }, -1,
+ { { 10, 30 }, {-1} } },
+
+ { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
+ { { 10, 40 }, {-1} } },
+
+ { { { 0, 50 }, { 20, 30 }, {-1} }, -1,
+ { { 0, 50 }, {-1} } },
+
+ { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
+ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
+ { { 5, 25 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
+ { { 10, 23 }, {-1} } },
+
+ // These check boundary cases during negation.
+ { { { 0, Runemax }, {-1} }, -1,
+ { { 0, Runemax }, {-1} } },
+
+ { { { 0, 50 }, {-1} }, -1,
+ { { 0, 50 }, {-1} } },
+
+ { { { 50, Runemax }, {-1} }, -1,
+ { { 50, Runemax }, {-1} } },
+
+ // Check RemoveAbove.
+ { { { 50, Runemax }, {-1} }, 255,
+ { { 50, 255 }, {-1} } },
+
+ { { { 50, Runemax }, {-1} }, 65535,
+ { { 50, 65535 }, {-1} } },
+
+ { { { 50, Runemax }, {-1} }, Runemax,
+ { { 50, Runemax }, {-1} } },
+
+ { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
+ { { 50, 60 }, { 250, 255 }, {-1} } },
+
+ { { { 50, 60 }, {-1} }, 255,
+ { { 50, 60 }, {-1} } },
+
+ { { { 350, 360 }, {-1} }, 255,
+ { {-1} } },
+
+ { { {-1} }, 255,
+ { {-1} } },
+};
+
+template <typename CharClass>
+static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
+ if (t == NULL) {
+ absl::PrintF("\t%s:", desc);
+ } else {
+ absl::PrintF("\n");
+ absl::PrintF("CharClass added: [%s]", desc);
+ for (int k = 0; t->add[k].lo >= 0; k++)
+ absl::PrintF(" %d-%d", t->add[k].lo, t->add[k].hi);
+ absl::PrintF("\n");
+ if (t->remove >= 0)
+ absl::PrintF("Removed > %d\n", t->remove);
+ absl::PrintF("\twant:");
+ for (int k = 0; t->final[k].lo >= 0; k++)
+ absl::PrintF(" %d-%d", t->final[k].lo, t->final[k].hi);
+ absl::PrintF("\n");
+ absl::PrintF("\thave:");
+ }
+
+ for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+ absl::PrintF(" %d-%d", it->lo, it->hi);
+ absl::PrintF("\n");
+}
+
+bool ShouldContain(CCTest *t, int x) {
+ for (int j = 0; t->final[j].lo >= 0; j++)
+ if (t->final[j].lo <= x && x <= t->final[j].hi)
+ return true;
+ return false;
+}
+
+// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
+
+CharClass* Negate(CharClass *cc) {
+ return cc->Negate();
+}
+
+void Delete(CharClass* cc) {
+ cc->Delete();
+}
+
+CharClassBuilder* Negate(CharClassBuilder* cc) {
+ CharClassBuilder* ncc = cc->Copy();
+ ncc->Negate();
+ return ncc;
+}
+
+void Delete(CharClassBuilder* cc) {
+ delete cc;
+}
+
+template <typename CharClass>
+bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
+ typename CharClass::iterator it = cc->begin();
+ int size = 0;
+ for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
+ if (it == cc->end() ||
+ it->lo != t->final[j].lo ||
+ it->hi != t->final[j].hi) {
+ Broke(desc, t, cc);
+ return false;
+ }
+ size += it->hi - it->lo + 1;
+ }
+ if (it != cc->end()) {
+ Broke(desc, t, cc);
+ return false;
+ }
+ if (cc->size() != size) {
+ Broke(desc, t, cc);
+ absl::PrintF("wrong size: want %d have %d\n", size, cc->size());
+ return false;
+ }
+
+ for (int j = 0; j < 101; j++) {
+ if (j == 100)
+ j = Runemax;
+ if (ShouldContain(t, j) != cc->Contains(j)) {
+ Broke(desc, t, cc);
+ absl::PrintF("want contains(%d)=%d, got %d\n",
+ j, ShouldContain(t, j), cc->Contains(j));
+ return false;
+ }
+ }
+
+ CharClass* ncc = Negate(cc);
+ for (int j = 0; j < 101; j++) {
+ if (j == 100)
+ j = Runemax;
+ if (ShouldContain(t, j) == ncc->Contains(j)) {
+ Broke(desc, t, cc);
+ Broke("ncc", NULL, ncc);
+ absl::PrintF("want ncc contains(%d)!=%d, got %d\n",
+ j, ShouldContain(t, j), ncc->Contains(j));
+ Delete(ncc);
+ return false;
+ }
+ if (ncc->size() != Runemax+1 - cc->size()) {
+ Broke(desc, t, cc);
+ Broke("ncc", NULL, ncc);
+ absl::PrintF("ncc size should be %d is %d\n",
+ Runemax+1 - cc->size(), ncc->size());
+ Delete(ncc);
+ return false;
+ }
+ }
+ Delete(ncc);
+ return true;
+}
+
+TEST(TestCharClassBuilder, Adds) {
+  int nfail = 0;  // Number of CorrectCC checks that failed across all cases.
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+    CharClassBuilder ccb;
+    CCTest* t = &tests[i];
+    for (int j = 0; t->add[j].lo >= 0; j++)  // add[] ends at a lo < 0 entry.
+      ccb.AddRange(t->add[j].lo, t->add[j].hi);
+    if (t->remove >= 0)
+      ccb.RemoveAbove(t->remove);
+    if (!CorrectCC(&ccb, t, "before copy (CharClassBuilder)"))
+      nfail++;
+    CharClass* cc = ccb.GetCharClass();
+    if (!CorrectCC(cc, t, "before copy (CharClass)"))
+      nfail++;
+    cc->Delete();
+
+    CharClassBuilder *ccb1 = ccb.Copy();
+    if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)"))
+      nfail++;
+    cc = ccb1->GetCharClass();  // Check the copy's CharClass, not the original's.
+    if (!CorrectCC(cc, t, "after copy (CharClass)"))
+      nfail++;
+    cc->Delete();
+    delete ccb1;
+  }
+  EXPECT_EQ(nfail, 0);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/compile_test.cc b/third_party/re2/src/re2/testing/compile_test.cc
new file mode 100644
index 000000000..f6899d3d2
--- /dev/null
+++ b/third_party/re2/src/re2/testing/compile_test.cc
@@ -0,0 +1,428 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test prog.cc, compile.cc
+
+#include <string>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+#include "re2/prog.h"
+
+namespace re2 {
+
+// Simple input/output tests checking that
+// the regexp compiles to the expected code.
+// These are just to sanity check the basic implementation.
+// The real confidence tests happen by testing the NFA/DFA
+// that run the compiled code.
+
+struct Test {
+ const char* regexp;
+ const char* code;
+};
+
+static Test tests[] = {
+ { "a",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "ab",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. byte [62-62] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "a|c",
+ "3+ byte [61-61] 0 -> 5\n"
+ "4. byte [63-63] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "a|b",
+ "3. byte [61-62] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "[ab]",
+ "3. byte [61-62] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "a+",
+ "3. byte [61-61] 0 -> 4\n"
+ "4+ nop -> 3\n"
+ "5. match! 0\n" },
+ { "a+?",
+ "3. byte [61-61] 0 -> 4\n"
+ "4+ match! 0\n"
+ "5. nop -> 3\n" },
+ { "a*",
+ "3+ byte [61-61] 1 -> 3\n"
+ "4. match! 0\n" },
+ { "a*?",
+ "3+ match! 0\n"
+ "4. byte [61-61] 0 -> 3\n" },
+ { "a?",
+ "3+ byte [61-61] 1 -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
+ { "a??",
+ "3+ nop -> 5\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "a{4}",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5. byte [61-61] 0 -> 6\n"
+ "6. byte [61-61] 0 -> 7\n"
+ "7. match! 0\n" },
+ { "(a)",
+ "3. capture 2 -> 4\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5. capture 3 -> 6\n"
+ "6. match! 0\n" },
+ { "(?:a)",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "",
+ "3. match! 0\n" },
+ { ".",
+ "3+ byte [00-09] 0 -> 5\n"
+ "4. byte [0b-ff] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "[^ab]",
+ "3+ byte [00-09] 0 -> 6\n"
+ "4+ byte [0b-60] 0 -> 6\n"
+ "5. byte [63-ff] 0 -> 6\n"
+ "6. match! 0\n" },
+ { "[Aa]",
+ "3. byte/i [61-61] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "\\C+",
+ "3. byte [00-ff] 0 -> 4\n"
+ "4+ altmatch -> 5 | 6\n"
+ "5+ nop -> 3\n"
+ "6. match! 0\n" },
+ { "\\C*",
+ "3+ altmatch -> 4 | 5\n"
+ "4+ byte [00-ff] 1 -> 3\n"
+ "5. match! 0\n" },
+ { "\\C?",
+ "3+ byte [00-ff] 1 -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
+ // Issue 20992936
+ { "[[-`]",
+ "3. byte [5b-60] 0 -> 4\n"
+ "4. match! 0\n" },
+ // Issue 310
+ { "(?:|a)*",
+ "3+ nop -> 7\n"
+ "4. nop -> 9\n"
+ "5+ nop -> 7\n"
+ "6. nop -> 9\n"
+ "7+ nop -> 5\n"
+ "8. byte [61-61] 0 -> 5\n"
+ "9. match! 0\n" },
+ { "(?:|a)+",
+ "3+ nop -> 5\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5+ nop -> 3\n"
+ "6. match! 0\n" },
+};
+
+TEST(TestRegexpCompileToProg, Simple) {
+ int failed = 0;
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+ const re2::Test& t = tests[i];
+ Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
+ if (re == NULL) {
+ LOG(ERROR) << "Cannot parse: " << t.regexp;
+ failed++;
+ continue;
+ }
+ Prog* prog = re->CompileToProg(0);
+ if (prog == NULL) {
+ LOG(ERROR) << "Cannot compile: " << t.regexp;
+ re->Decref();
+ failed++;
+ continue;
+ }
+ ASSERT_TRUE(re->CompileToProg(1) == NULL);
+ std::string s = prog->Dump();
+ if (s != t.code) {
+ LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
+ LOG(ERROR) << "Want:\n" << t.code;
+ LOG(ERROR) << "Got:\n" << s;
+ failed++;
+ }
+ delete prog;
+ re->Decref();
+ }
+ EXPECT_EQ(failed, 0);
+}
+
+static void DumpByteMap(absl::string_view pattern, Regexp::ParseFlags flags,
+                        std::string* bytemap) {
+  Regexp* re = Regexp::Parse(pattern, flags, NULL);
+  ASSERT_TRUE(re != NULL);  // Fatal: re is dereferenced below.
+
+  {  // Forward program: store its bytemap dump in *bytemap.
+    Prog* prog = re->CompileToProg(0);
+    EXPECT_TRUE(prog != NULL);
+    if (prog != NULL) *bytemap = prog->DumpByteMap();
+    delete prog;
+  }
+
+  {  // Reverse program: its bytemap dump must equal the forward one.
+    Prog* prog = re->CompileToReverseProg(0);
+    EXPECT_TRUE(prog != NULL);
+    if (prog != NULL) { EXPECT_EQ(*bytemap, prog->DumpByteMap()); }
+    delete prog;
+  }
+
+  re->Decref();
+}
+
+TEST(TestCompile, Latin1Ranges) {
+ // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
+
+ std::string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, OtherByteMapTests) {
+ std::string bytemap;
+
+ // Test that "absent" ranges are mapped to the same byte class.
+ DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-46] -> 1\n"
+ "[47-60] -> 0\n"
+ "[61-66] -> 1\n"
+ "[67-ff] -> 0\n",
+ bytemap);
+
+ // Test the byte classes for \b.
+ DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-5a] -> 1\n"
+ "[5b-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-60] -> 0\n"
+ "[61-7a] -> 1\n"
+ "[7b-ff] -> 0\n",
+ bytemap);
+
+ // Bug in the ASCII case-folding optimization created too many byte classes.
+ DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, UTF8Ranges) {
+ // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+ // Once, erroneously split between 0x3f and 0x40 because it is
+ // a 6-bit boundary.
+
+ std::string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-7f] -> 0\n"
+ "[80-bf] -> 2\n"
+ "[c0-c1] -> 1\n"
+ "[c2-df] -> 3\n"
+ "[e0-ef] -> 4\n"
+ "[f0-f4] -> 5\n"
+ "[f5-ff] -> 1\n",
+ bytemap);
+}
+
+TEST(TestCompile, InsufficientMemory) {
+  Regexp* re = Regexp::Parse(
+      "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
+      Regexp::LikePerl, NULL);
+  ASSERT_TRUE(re != NULL);  // Fatal: re is dereferenced on the next line.
+  Prog* prog = re->CompileToProg(850);
+  // If the memory budget has been exhausted, compilation should fail
+  // and return NULL instead of trying to do anything with NoMatch().
+  EXPECT_TRUE(prog == NULL);
+  re->Decref();
+}
+
+static void Dump(absl::string_view pattern, Regexp::ParseFlags flags,
+                 std::string* forward, std::string* reverse) {
+  Regexp* re = Regexp::Parse(pattern, flags, NULL);
+  ASSERT_TRUE(re != NULL);  // Fatal: re is dereferenced below.
+
+  if (forward != NULL) {  // Caller wants the forward program's dump.
+    Prog* prog = re->CompileToProg(0);
+    EXPECT_TRUE(prog != NULL);
+    if (prog != NULL) *forward = prog->Dump();
+    delete prog;
+  }
+
+  if (reverse != NULL) {  // Caller wants the reverse program's dump.
+    Prog* prog = re->CompileToReverseProg(0);
+    EXPECT_TRUE(prog != NULL);
+    if (prog != NULL) *reverse = prog->Dump();
+    delete prog;
+  }
+
+  re->Decref();
+}
+
+TEST(TestCompile, Bug26705922) {
+ // Bug in the compiler caused inefficient bytecode to be generated for Unicode
+ // groups: common suffixes were cached, but common prefixes were not factored.
+
+ std::string forward, reverse;
+
+ Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
+ "4. byte [90-90] 0 -> 5\n"
+ "5. byte [80-80] 0 -> 6\n"
+ "6+ byte [80-80] 0 -> 8\n"
+ "7. byte [90-90] 0 -> 8\n"
+ "8. match! 0\n",
+ forward);
+ EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
+ "4. byte [90-90] 0 -> 5\n"
+ "5. byte [80-80] 0 -> 6\n"
+ "6. byte [90-90] 0 -> 7\n"
+ "7. byte [f0-f0] 0 -> 8\n"
+ "8. match! 0\n",
+ reverse);
+
+ Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
+ "4. byte [f0-f0] 0 -> 8\n"
+ "5. byte [80-bf] 0 -> 6\n"
+ "6. byte [80-bf] 0 -> 7\n"
+ "7. match! 0\n"
+ "8. byte [90-90] 0 -> 5\n",
+ forward);
+ EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
+ "4. byte [80-bf] 0 -> 5\n"
+ "5+ byte [e8-ef] 0 -> 7\n"
+ "6. byte [90-90] 0 -> 8\n"
+ "7. match! 0\n"
+ "8. byte [f0-f0] 0 -> 7\n",
+ reverse);
+
+ Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
+ "4+ byte [e0-ef] 0 -> 8\n"
+ "5. byte [f0-f4] 0 -> 9\n"
+ "6. byte [80-bf] 0 -> 7\n"
+ "7. match! 0\n"
+ "8. byte [80-bf] 0 -> 6\n"
+ "9. byte [80-bf] 0 -> 8\n",
+ forward);
+ EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
+ "4+ byte [c2-df] 0 -> 6\n"
+ "5. byte [80-bf] 0 -> 7\n"
+ "6. match! 0\n"
+ "7+ byte [e0-ef] 0 -> 6\n"
+ "8. byte [80-bf] 0 -> 9\n"
+ "9. byte [f0-f4] 0 -> 6\n",
+ reverse);
+}
+
+TEST(TestCompile, Bug35237384) {
+ // Bug in the compiler caused inefficient bytecode to be generated for
+ // nested nullable subexpressions.
+
+ std::string forward;
+
+ Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
+ "4. nop -> 5\n"
+ "5+ byte [61-61] 1 -> 5\n"
+ "6. nop -> 7\n"
+ "7+ byte [61-61] 1 -> 7\n"
+ "8. match! 0\n",
+ forward);
+
+ Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ nop -> 28\n"
+ "4. nop -> 30\n"
+ "5+ byte [61-61] 1 -> 5\n"
+ "6. nop -> 32\n"
+ "7+ byte [61-61] 1 -> 7\n"
+ "8. nop -> 26\n"
+ "9+ byte [61-61] 1 -> 9\n"
+ "10. nop -> 20\n"
+ "11+ byte [62-62] 1 -> 11\n"
+ "12. nop -> 20\n"
+ "13+ byte [62-62] 1 -> 13\n"
+ "14. nop -> 26\n"
+ "15+ byte [62-62] 1 -> 15\n"
+ "16. nop -> 32\n"
+ "17+ nop -> 9\n"
+ "18. nop -> 11\n"
+ "19. match! 0\n"
+ "20+ nop -> 17\n"
+ "21. nop -> 19\n"
+ "22+ nop -> 7\n"
+ "23. nop -> 13\n"
+ "24+ nop -> 17\n"
+ "25. nop -> 19\n"
+ "26+ nop -> 22\n"
+ "27. nop -> 24\n"
+ "28+ nop -> 5\n"
+ "29. nop -> 15\n"
+ "30+ nop -> 22\n"
+ "31. nop -> 24\n"
+ "32+ nop -> 28\n"
+ "33. nop -> 30\n",
+ forward);
+
+ Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ nop -> 36\n"
+ "4+ nop -> 31\n"
+ "5. nop -> 33\n"
+ "6+ byte [00-09] 0 -> 8\n"
+ "7. byte [0b-ff] 0 -> 8\n"
+ "8+ nop -> 6\n"
+ "9+ nop -> 29\n"
+ "10. nop -> 28\n"
+ "11+ byte [00-09] 0 -> 13\n"
+ "12. byte [0b-ff] 0 -> 13\n"
+ "13+ nop -> 11\n"
+ "14+ nop -> 26\n"
+ "15. nop -> 28\n"
+ "16+ byte [00-09] 0 -> 18\n"
+ "17. byte [0b-ff] 0 -> 18\n"
+ "18+ nop -> 16\n"
+ "19+ nop -> 36\n"
+ "20. nop -> 33\n"
+ "21+ byte [00-09] 0 -> 23\n"
+ "22. byte [0b-ff] 0 -> 23\n"
+ "23+ nop -> 21\n"
+ "24+ nop -> 31\n"
+ "25. nop -> 33\n"
+ "26+ nop -> 28\n"
+ "27. byte [53-53] 0 -> 11\n"
+ "28. match! 0\n"
+ "29+ nop -> 28\n"
+ "30. byte [53-53] 0 -> 6\n"
+ "31+ nop -> 33\n"
+ "32. byte [53-53] 0 -> 21\n"
+ "33+ nop -> 29\n"
+ "34+ nop -> 26\n"
+ "35. nop -> 28\n"
+ "36+ nop -> 33\n"
+ "37. byte [53-53] 0 -> 16\n",
+ forward);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/dfa_test.cc b/third_party/re2/src/re2/testing/dfa_test.cc
new file mode 100644
index 000000000..b0759f7c7
--- /dev/null
+++ b/third_party/re2/src/re2/testing/dfa_test.cc
@@ -0,0 +1,373 @@
+// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "absl/base/macros.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "util/malloc_counter.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+static const bool UsingMallocCounter = false;
+
+ABSL_FLAG(int, size, 8, "log2(number of DFA nodes)");
+ABSL_FLAG(int, repeat, 2, "Repetition count.");
+ABSL_FLAG(int, threads, 4, "number of threads");
+
+namespace re2 {
+
+static int state_cache_resets = 0;
+static int search_failures = 0;
+
+// Registers the DFA instrumentation hooks when the global set_hooks
+// object is constructed. Each hook only bumps the matching file-level
+// counter; the event payload is ignored.
+struct SetHooks {
+  // Called on a DFA state cache reset.
+  static void OnStateCacheReset(const hooks::DFAStateCacheReset&) {
+    ++state_cache_resets;
+  }
+
+  // Called on a DFA search failure.
+  static void OnSearchFailure(const hooks::DFASearchFailure&) {
+    ++search_failures;
+  }
+
+  SetHooks() {
+    hooks::SetDFAStateCacheResetHook(&SetHooks::OnStateCacheReset);
+    hooks::SetDFASearchFailureHook(&SetHooks::OnSearchFailure);
+  }
+} set_hooks;
+
+// Check that multithreaded access to DFA class works.
+
+// Helper function: builds entire DFA for prog.
+// Calls BuildEntireDFA in first-match mode without a per-state callback
+// and asserts that the returned value is truthy. Used as the thread
+// entry point by TEST(Multithreaded, BuildEntireDFA).
+static void DoBuild(Prog* prog) {
+ ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
+}
+
+// Builds the DFA for one shared Prog from several threads at once.
+// Thread count, repetition count and pattern size come from the
+// --threads, --repeat and --size flags.
+TEST(Multithreaded, BuildEntireDFA) {
+  // Create regexp with 2^FLAGS_size states in DFA:
+  // "a", then FLAGS_size copies of "[ab]", then "b".
+  const int num_classes = absl::GetFlag(FLAGS_size);
+  std::string pattern("a");
+  for (int k = 0; k < num_classes; ++k)
+    pattern.append("[ab]");
+  pattern.append("b");
+  Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
+  ASSERT_TRUE(re != NULL);
+
+  // Baseline: a single worker thread building on its own.
+  {
+    Prog* prog = re->CompileToProg(0);
+    ASSERT_TRUE(prog != NULL);
+
+    std::thread worker(DoBuild, prog);
+    worker.join();
+
+    delete prog;
+  }
+
+  // Each round compiles a fresh Prog and lets all workers build its
+  // DFA simultaneously.
+  const int rounds = absl::GetFlag(FLAGS_repeat);
+  const int num_threads = absl::GetFlag(FLAGS_threads);
+  for (int round = 0; round < rounds; ++round) {
+    Prog* prog = re->CompileToProg(0);
+    ASSERT_TRUE(prog != NULL);
+
+    std::vector<std::thread> workers;
+    for (int j = 0; j < num_threads; ++j)
+      workers.emplace_back(DoBuild, prog);
+    for (std::thread& worker : workers)
+      worker.join();
+
+    // One more build, to make sure everything is okay.
+    prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
+    delete prog;
+  }
+
+  re->Decref();
+}
+
+// Check that DFA size requirements are followed.
+// BuildEntireDFA will, like SearchDFA, stop building out
+// the DFA once the memory limits are reached.
+TEST(SingleThreaded, BuildEntireDFA) {
+ // Create regexp with 2^30 states in DFA.
+ Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
+ ASSERT_TRUE(re != NULL);
+
+ // Try a range of limits, 2^17 through 2^23, each passed to
+ // CompileToProg as the memory budget.
+ for (int i = 17; i < 24; i++) {
+ int64_t limit = int64_t{1}<<i;
+ int64_t usage;
+ //int64_t progusage, dfamem;
+ {
+ // The counter lives only for this scope, so `usage` covers the
+ // compile plus both DFA builds and is read before prog is deleted.
+ testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
+ Prog* prog = re->CompileToProg(limit);
+ ASSERT_TRUE(prog != NULL);
+ //progusage = m.HeapGrowth();
+ //dfamem = prog->dfa_mem();
+ prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
+ prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
+ usage = m.HeapGrowth();
+ delete prog;
+ }
+ // UsingMallocCounter is a file-level constant set to false, so the
+ // bounds below are currently not enforced.
+ if (UsingMallocCounter) {
+ //LOG(INFO) << "limit " << limit << ", "
+ // << "prog usage " << progusage << ", "
+ // << "DFA budget " << dfamem << ", "
+ // << "total " << usage;
+ // Tolerate +/- 10%.
+ ASSERT_GT(usage, limit*9/10);
+ ASSERT_LT(usage, limit*11/10);
+ }
+ }
+ re->Decref();
+}
+
+// Test that the DFA gets the right result even if it runs
+// out of memory during a search. The regular expression
+// 0[01]{n}$ matches a binary string of 0s and 1s only if
+// the (n+1)th-to-last character is a 0. Matching this in
+// a single forward pass (as done by the DFA) requires
+// keeping one bit for each of the last n+1 characters
+// (whether each was a 0), or 2^(n+1) possible states.
+// If we run this regexp to search in a string that contains
+// every possible n-character binary string as a substring,
+// then it will have to run through at least 2^n states.
+// States are big data structures -- certainly more than 1 byte --
+// so if the DFA can search correctly while staying within a
+// 2^n byte limit, it must be handling out-of-memory conditions
+// gracefully.
+TEST(SingleThreaded, SearchDFA) {
+  // The De Bruijn string is the worst case input for this regexp.
+  // By default, the DFA will notice that it is flushing its cache
+  // too frequently and will bail out early, so that RE2 can use the
+  // NFA implementation instead. (The DFA loses its speed advantage
+  // if it can't get a good cache hit rate.)
+  // Tell the DFA to trudge along instead.
+  //
+  // The override is scoped: a failing ASSERT_* returns from the test
+  // body immediately, and the guard's destructor still restores the
+  // default so the setting cannot leak into tests that run afterwards.
+  struct ScopedNoBailWhenSlow {
+    ScopedNoBailWhenSlow() {
+      Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false);
+    }
+    ~ScopedNoBailWhenSlow() {
+      // Reset to original behaviour.
+      Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true);
+    }
+  } no_bail_when_slow;
+  state_cache_resets = 0;
+  search_failures = 0;
+
+  // Choice of n is mostly arbitrary, except that:
+  //   * making n too big makes the test run for too long.
+  //   * making n too small makes the DFA refuse to run,
+  //     because it has so little memory compared to the program size.
+  // Empirically, n = 18 is a good compromise between the two.
+  const int n = 18;
+
+  Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n),
+                             Regexp::LikePerl, NULL);
+  ASSERT_TRUE(re != NULL);
+
+  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
+  // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
+  std::string no_match = DeBruijnString(n);
+  std::string match = no_match + "0";
+
+  int64_t usage;
+  int64_t peak_usage;
+  {
+    // Heap growth is sampled before prog is deleted, inside the
+    // counter's scope.
+    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
+    Prog* prog = re->CompileToProg(1<<n);
+    ASSERT_TRUE(prog != NULL);
+    for (int i = 0; i < 10; i++) {
+      bool matched = false;
+      bool failed = false;
+      matched =
+          prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored,
+                          Prog::kFirstMatch, NULL, &failed, NULL);
+      ASSERT_FALSE(failed);
+      ASSERT_TRUE(matched);
+      matched =
+          prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored,
+                          Prog::kFirstMatch, NULL, &failed, NULL);
+      ASSERT_FALSE(failed);
+      ASSERT_FALSE(matched);
+    }
+    usage = m.HeapGrowth();
+    peak_usage = m.PeakHeapGrowth();
+    delete prog;
+  }
+  if (UsingMallocCounter) {
+    //LOG(INFO) << "usage " << usage << ", "
+    //          << "peak usage " << peak_usage;
+    ASSERT_LT(usage, 1<<n);
+    ASSERT_LT(peak_usage, 1<<n);
+  }
+  re->Decref();
+
+  // The bail-when-slow default is restored by no_bail_when_slow's
+  // destructor when the test body exits.
+  ASSERT_GT(state_cache_resets, 0);
+  ASSERT_EQ(search_failures, 0);
+}
+
+// Helper function: runs two rounds of unanchored, first-match DFA
+// searches on prog. In each round `match` must be found, `no_match`
+// must not be found, and neither search may report failure.
+static void DoSearch(Prog* prog, absl::string_view match,
+                     absl::string_view no_match) {
+  for (int round = 0; round < 2; ++round) {
+    bool failed = false;
+    bool matched =
+        prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored,
+                        Prog::kFirstMatch, NULL, &failed, NULL);
+    ASSERT_FALSE(failed);
+    ASSERT_TRUE(matched);
+
+    matched =
+        prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored,
+                        Prog::kFirstMatch, NULL, &failed, NULL);
+    ASSERT_FALSE(failed);
+    ASSERT_FALSE(matched);
+  }
+}
+
+// Runs the De Bruijn search from TEST(SingleThreaded, SearchDFA) on one
+// shared Prog from several threads at once.
+TEST(Multithreaded, SearchDFA) {
+  // Keep the DFA running even when its cache thrashes. The override is
+  // scoped: a failing ASSERT_* returns from the test body immediately,
+  // and the guard's destructor still restores the default so the
+  // setting cannot leak into tests that run afterwards.
+  struct ScopedNoBailWhenSlow {
+    ScopedNoBailWhenSlow() {
+      Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false);
+    }
+    ~ScopedNoBailWhenSlow() {
+      // Reset to original behaviour.
+      Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true);
+    }
+  } no_bail_when_slow;
+  state_cache_resets = 0;
+  search_failures = 0;
+
+  // Same as single-threaded test above.
+  const int n = 18;
+  Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n),
+                             Regexp::LikePerl, NULL);
+  ASSERT_TRUE(re != NULL);
+  std::string no_match = DeBruijnString(n);
+  std::string match = no_match + "0";
+
+  // Check that single-threaded code works.
+  {
+    Prog* prog = re->CompileToProg(1<<n);
+    ASSERT_TRUE(prog != NULL);
+
+    std::thread t(DoSearch, prog, match, no_match);
+    t.join();
+
+    delete prog;
+  }
+
+  // Run the search simultaneously in a bunch of threads.
+  // Reuse same flags as Multithreaded.BuildEntireDFA above.
+  for (int i = 0; i < absl::GetFlag(FLAGS_repeat); i++) {
+    Prog* prog = re->CompileToProg(1<<n);
+    ASSERT_TRUE(prog != NULL);
+
+    std::vector<std::thread> threads;
+    for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
+      threads.emplace_back(DoSearch, prog, match, no_match);
+    for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
+      threads[j].join();
+
+    delete prog;
+  }
+
+  re->Decref();
+
+  // The bail-when-slow default is restored by no_bail_when_slow's
+  // destructor when the test body exits.
+  ASSERT_GT(state_cache_resets, 0);
+  ASSERT_EQ(search_failures, 0);
+}
+
+// One row of the reverse_tests table used by TEST(DFA, ReverseMatch).
+struct ReverseTest {
+ // Pattern; parsed with Regexp::LikePerl.
+ const char* regexp;
+ // Input text handed to SearchDFA.
+ const char* text;
+ // Expected return value of SearchDFA.
+ bool match;
+};
+
+// Test that reverse DFA handles anchored/unanchored correctly.
+// It's in the DFA interface but not used by RE2.
+// Each row is { regexp, text, expected match }; TEST(DFA, ReverseMatch)
+// compiles the regexp with CompileToReverseProg and searches the text.
+ReverseTest reverse_tests[] = {
+ { "\\A(a|b)", "abc", true },
+ { "(a|b)\\z", "cba", true },
+ { "\\A(a|b)", "cba", false },
+ { "(a|b)\\z", "abc", false },
+};
+
+// Compiles each reverse_tests pattern to a reverse program, runs an
+// unanchored first-match DFA search over the text, and compares the
+// outcome with the expected value. All rows are tried; mismatches are
+// logged and counted, and the count must be zero at the end.
+TEST(DFA, ReverseMatch) {
+  int nfail = 0;
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(reverse_tests); i++) {
+    const ReverseTest& t = reverse_tests[i];
+    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+    ASSERT_TRUE(re != NULL);
+    Prog* prog = re->CompileToReverseProg(0);
+    ASSERT_TRUE(prog != NULL);
+    bool failed = false;
+    bool matched =
+        prog->SearchDFA(t.text, absl::string_view(), Prog::kUnanchored,
+                        Prog::kFirstMatch, NULL, &failed, NULL);
+    if (failed) {
+      // The other DFA tests in this file assert !failed after every
+      // SearchDFA call. Without this check a failed search would be
+      // judged by `matched` alone and could pass a row that expects
+      // no match.
+      LOG(ERROR) << t.regexp << " on " << t.text << ": DFA search failed";
+      nfail++;
+    } else if (matched != t.match) {
+      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
+      nfail++;
+    }
+    delete prog;
+    re->Decref();
+  }
+  EXPECT_EQ(nfail, 0);
+}
+
+// One row of the callback_tests table used by TEST(DFA, Callback).
+struct CallbackTest {
+ // Pattern; parsed with Regexp::LikePerl.
+ const char* regexp;
+ // Expected textual dump of the DFA states reported to the callback.
+ const char* dump;
+};
+
+// Test that DFA::BuildAllStates() builds the expected DFA states
+// and issues the expected callbacks. These test cases reflect the
+// very compact encoding of the callbacks, but that also makes them
+// very difficult to understand, so let's work through "\\Aa\\z".
+// There are three slots per DFA state because the bytemap has two
+// equivalence classes and there is a third slot for kByteEndText:
+// 0: all bytes that are not 'a'
+// 1: the byte 'a'
+// 2: kByteEndText
+// -1 means that there is no transition from that DFA state to any
+// other DFA state for that slot. The valid transitions are thus:
+// state 0 --slot 1--> state 1
+// state 1 --slot 2--> state 2
+// The double brackets indicate that state 2 is a matching state.
+// Putting it together, this means that the DFA must consume the
+// byte 'a' and then hit end of text. Q.E.D.
+//
+// Each row is { regexp, expected dump }. In the dump, states are
+// separated by single spaces and listed in the order in which
+// TEST(DFA, Callback) receives them from BuildEntireDFA.
+CallbackTest callback_tests[] = {
+ { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
+ { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
+ { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
+ { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
+ { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
+ { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
+ { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
+ { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
+ { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
+ { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
+};
+
+// For every callback_tests row, builds the whole longest-match DFA and
+// renders each reported state as "[n,n,...]" (double brackets for a
+// matching state), then compares the rendering with the expected dump.
+TEST(DFA, Callback) {
+  int nfail = 0;
+  for (const CallbackTest& t : callback_tests) {
+    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+    ASSERT_TRUE(re != NULL);
+    Prog* prog = re->CompileToProg(0);
+    ASSERT_TRUE(prog != NULL);
+    std::string dump;
+    prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
+      ASSERT_TRUE(next != NULL);
+      const char* opener = match ? "[[" : "[";
+      const char* closer = match ? "]]" : "]";
+      if (!dump.empty())
+        dump += " ";
+      dump += opener;
+      // One slot per bytemap class plus one extra slot.
+      const int nslots = prog->bytemap_range() + 1;
+      for (int b = 0; b < nslots; b++)
+        dump += absl::StrFormat("%d,", next[b]);
+      dump.pop_back();  // drop the trailing comma
+      dump += closer;
+    });
+    if (dump != t.dump) {
+      LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
+      LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
+      nfail++;
+    }
+    delete prog;
+    re->Decref();
+  }
+  EXPECT_EQ(nfail, 0);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/dump.cc b/third_party/re2/src/re2/testing/dump.cc
new file mode 100644
index 000000000..5cddd2334
--- /dev/null
+++ b/third_party/re2/src/re2/testing/dump.cc
@@ -0,0 +1,163 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Dump the regexp into a string showing structure.
+// Tested by parse_unittest.cc
+
+// This function traverses the regexp recursively,
+// meaning that on inputs like Regexp::Simplify of
+// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
+// it takes time and space exponential in the size of the
+// original regular expression. It can also use stack space
+// linear in the size of the regular expression for inputs
+// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
+// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
+// As a result, Dump is provided only in the testing
+// library (see BUILD).
+
+#include <string>
+
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Short display names for regexp operators, indexed by the value of
+// Regexp::op(). DumpRegexpAppending prints "op<N>" for any operator
+// value that falls outside this table.
+static const char* kOpcodeNames[] = {
+ "bad",
+ "no",
+ "emp",
+ "lit",
+ "str",
+ "cat",
+ "alt",
+ "star",
+ "plus",
+ "que",
+ "rep",
+ "cap",
+ "dot",
+ "byte",
+ "bol",
+ "eol",
+ "wb", // kRegexpWordBoundary
+ "nwb", // kRegexpNoWordBoundary
+ "bot",
+ "eot",
+ "cc",
+ "match",
+};
+
+// Create string representation of regexp with explicit structure.
+// Nothing pretty, just for testing.
+// Appends to *s the operator name (with optional "n" prefix and "fold"
+// suffix) followed by "{...}" holding the operator-specific payload;
+// subexpressions are dumped recursively inside the braces.
+static void DumpRegexpAppending(Regexp* re, std::string* s) {
+ // An operator with no entry in kOpcodeNames is printed as "op<N>".
+ if (re->op() < 0 || re->op() >= ABSL_ARRAYSIZE(kOpcodeNames)) {
+ *s += absl::StrFormat("op%d", re->op());
+ } else {
+ // Non-greedy repetition operators get an "n" prefix.
+ switch (re->op()) {
+ default:
+ break;
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ if (re->parse_flags() & Regexp::NonGreedy)
+ s->append("n");
+ break;
+ }
+ s->append(kOpcodeNames[re->op()]);
+ // Case-folded literals are tagged "fold", but only when the literal
+ // is (or contains) a lowercase ASCII letter a-z.
+ if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
+ Rune r = re->rune();
+ if ('a' <= r && r <= 'z')
+ s->append("fold");
+ }
+ if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
+ for (int i = 0; i < re->nrunes(); i++) {
+ Rune r = re->runes()[i];
+ if ('a' <= r && r <= 'z') {
+ s->append("fold");
+ break;
+ }
+ }
+ }
+ }
+ // Operator-specific payload goes between the braces.
+ s->append("{");
+ switch (re->op()) {
+ default:
+ break;
+ case kRegexpEndText:
+ // Print \z unless the WasDollar parse flag is set.
+ if (!(re->parse_flags() & Regexp::WasDollar)) {
+ s->append("\\z");
+ }
+ break;
+ case kRegexpLiteral: {
+ // runetochar's return value is used as the index of the
+ // terminating NUL, so buf ends up holding the encoded rune.
+ Rune r = re->rune();
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ break;
+ }
+ case kRegexpLiteralString:
+ // Same encoding as kRegexpLiteral, one rune at a time.
+ for (int i = 0; i < re->nrunes(); i++) {
+ Rune r = re->runes()[i];
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ }
+ break;
+ case kRegexpConcat:
+ case kRegexpAlternate:
+ // Dump every subexpression back to back, with no separator.
+ for (int i = 0; i < re->nsub(); i++)
+ DumpRegexpAppending(re->sub()[i], s);
+ break;
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ DumpRegexpAppending(re->sub()[0], s);
+ break;
+ case kRegexpCapture:
+ // A capture index of 0 is reported via LOG(DFATAL); a named
+ // group prints "name:" before its subexpression.
+ if (re->cap() == 0)
+ LOG(DFATAL) << "kRegexpCapture cap() == 0";
+ if (re->name()) {
+ s->append(*re->name());
+ s->append(":");
+ }
+ DumpRegexpAppending(re->sub()[0], s);
+ break;
+ case kRegexpRepeat:
+ // "min,max " precedes the repeated subexpression.
+ s->append(absl::StrFormat("%d,%d ", re->min(), re->max()));
+ DumpRegexpAppending(re->sub()[0], s);
+ break;
+ case kRegexpCharClass: {
+ // Space-separated ranges in hex: "lo-hi", or just "lo" when the
+ // range holds a single value.
+ std::string sep;
+ for (CharClass::iterator it = re->cc()->begin();
+ it != re->cc()->end(); ++it) {
+ RuneRange rr = *it;
+ s->append(sep);
+ if (rr.lo == rr.hi)
+ s->append(absl::StrFormat("%#x", rr.lo));
+ else
+ s->append(absl::StrFormat("%#x-%#x", rr.lo, rr.hi));
+ sep = " ";
+ }
+ break;
+ }
+ }
+ s->append("}");
+}
+
+// Returns the structural dump of this regexp as a new string.
+std::string Regexp::Dump() {
+  // Referencing gtest here ties Dump() to the testing library: using
+  // it outside of a unit test should cause a link error.
+  CHECK(!::testing::TempDir().empty());
+
+  std::string out;
+  DumpRegexpAppending(this, &out);
+  return out;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/exhaustive1_test.cc b/third_party/re2/src/re2/testing/exhaustive1_test.cc
new file mode 100644
index 000000000..933798995
--- /dev/null
+++ b/third_party/re2/src/re2/testing/exhaustive1_test.cc
@@ -0,0 +1,39 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// Test simple repetition operators: counted, greedy and non-greedy
+// forms applied to the atoms a, b, c and dot.
+TEST(Repetition, Simple) {
+  const std::vector<std::string> atoms = Explode("abc.");
+  const std::vector<std::string> ops = Split(" ",
+      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
+      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
+      "%s* %s+ %s? %s*? %s+? %s??");
+  // First with maxstrlen 6 over two letters, then with maxstrlen 40
+  // over a single letter.
+  ExhaustiveTest(3, 2, atoms, ops,
+                 6, Explode("ab"), "(?:%s)", "");
+  ExhaustiveTest(3, 2, atoms, ops,
+                 40, Explode("a"), "(?:%s)", "");
+}
+
+// Test capturing parens -- (a) -- inside repetition operators
+TEST(Repetition, Capturing) {
+  const std::vector<std::string> ops = Split(" ",
+      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
+      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
+      "%s* %s+ %s? %s*? %s+? %s??");
+
+  // Atoms a, (a) and b with maxstrlen 7 over two letters.
+  const std::vector<std::string> atoms_ab = Split(" ", "a (a) b");
+  ExhaustiveTest(3, 2, atoms_ab, ops,
+                 7, Explode("ab"), "(?:%s)", "");
+
+  // Atoms a and (a) with maxstrlen 50 over a single letter.
+  const std::vector<std::string> atoms_a = Split(" ", "a (a)");
+  ExhaustiveTest(3, 2, atoms_a, ops,
+                 50, Explode("a"), "(?:%s)", "");
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/exhaustive2_test.cc b/third_party/re2/src/re2/testing/exhaustive2_test.cc
new file mode 100644
index 000000000..14f629d4a
--- /dev/null
+++ b/third_party/re2/src/re2/testing/exhaustive2_test.cc
@@ -0,0 +1,72 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include <stddef.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// Test empty string matches (aka "(?:)"), alongside the literal a.
+TEST(EmptyString, Exhaustive) {
+  const std::vector<std::string> atoms = Split(" ", "(?:) a");
+  const std::vector<std::string> letters = Split("", "ab");
+  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
+                 5, letters, "", "");
+}
+
+// Test escaped versions of regexp syntax: every punctuation character
+// is used as a regexp atom with a backslash in front of it, and
+// unescaped as a letter of the test strings.
+TEST(Punctuation, Literals) {
+  const std::vector<std::string> alphabet = Explode("()*+?{}[]\\^$.");
+  std::vector<std::string> escaped;
+  escaped.reserve(alphabet.size());
+  for (const std::string& ch : alphabet)
+    escaped.push_back("\\" + ch);
+  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
+                 2, alphabet, "", "");
+}
+
+// Test ^ $ . \A \z in presence of line endings.
+// Have to wrap the empty-width ones in (?:) so that
+// they can be repeated -- PCRE rejects ^* but allows (?:^)*
+TEST(LineEnds, Exhaustive) {
+  const std::vector<std::string> atoms =
+      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
+  const std::vector<std::string> letters = Explode("ab\n");
+  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
+                 4, letters, "", "");
+}
+
+// Test what does and does not match \n.
+// This would be a good test, except that PCRE seems to have a bug:
+// in single-byte character set mode (the default),
+// [^a] matches \n, but in UTF-8 mode it does not.
+// So when we run the test, the tester complains that
+// we don't agree with PCRE, but it's PCRE that is at fault.
+// For what it's worth, Perl gets this right (matches
+// regardless of whether UTF-8 input is selected):
+//
+// #!/usr/bin/perl
+// use POSIX qw(locale_h);
+// print "matches in latin1\n" if "\n" =~ /[^a]/;
+// setlocale("en_US.utf8");
+// print "matches in utf8\n" if "\n" =~ /[^a]/;
+//
+// The rule chosen for RE2 is that by default, like Perl,
+// dot does not match \n but negated character classes [^a] do.
+// (?s) will allow dot to match \n; there is no way in RE2
+// to stop [^a] from matching \n, though the underlying library
+// provides a mechanism, and RE2 could add new syntax if needed.
+//
+// TEST(Newlines, Exhaustive) {
+// std::vector<std::string> empty_vector;
+// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
+// RegexpGenerator::EgrepOps(),
+// 4, Explode("a\n"), "");
+// }
+
+} // namespace re2
+
diff --git a/third_party/re2/src/re2/testing/exhaustive3_test.cc b/third_party/re2/src/re2/testing/exhaustive3_test.cc
new file mode 100644
index 000000000..de703c00e
--- /dev/null
+++ b/third_party/re2/src/re2/testing/exhaustive3_test.cc
@@ -0,0 +1,100 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include <stddef.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "util/utf.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// Test simple character classes by themselves.
+TEST(CharacterClasses, Exhaustive) {
+  ExhaustiveTest(
+      2, 1,
+      Split(" ",
+            "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."),
+      RegexpGenerator::EgrepOps(),
+      5, Explode("ab"), "", "");
+}
+
+// Test simple character classes inside a___b (for example, a[a]b).
+TEST(CharacterClasses, ExhaustiveAB) {
+  ExhaustiveTest(
+      2, 1,
+      Split(" ",
+            "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."),
+      RegexpGenerator::EgrepOps(),
+      5, Explode("ab"), "a%sb", "");
+}
+
+// Returns UTF8 for Rune r, as a NUL-terminated conversion: the result
+// stops at the first zero byte in the encoded form.
+static std::string UTF8(Rune r) {
+  char encoded[UTFmax+1];
+  const int len = runetochar(encoded, &r);
+  encoded[len] = '\0';
+  return std::string(encoded);
+}
+
+// Returns a vector of "interesting" UTF8 characters.
+// Unicode is now too big to just return all of them,
+// so InterestingUTF8 returns a set likely to be good test cases.
+//
+// The vector is built once, on first call, and the same object is
+// returned afterwards. It is a function-local static initialized from
+// a lambda, so C++11 guarantees the initialization runs exactly once
+// even with concurrent callers (a hand-rolled "init" flag does not).
+static const std::vector<std::string>& InterestingUTF8() {
+  static const std::vector<std::string> v = [] {
+    std::vector<std::string> chars;
+
+    // All the Latin1 equivalents are interesting.
+    for (int i = 1; i < 256; i++)
+      chars.push_back(UTF8(i));
+
+    // After that, the codes near bit boundaries are
+    // interesting, because they span byte sequence lengths.
+    for (int j = 0; j < 8; j++)
+      chars.push_back(UTF8(256 + j));
+    for (int i = 512; i < Runemax; i <<= 1)
+      for (int j = -8; j < 8; j++)
+        chars.push_back(UTF8(i + j));
+
+    // The codes near Runemax, including Runemax itself, are interesting.
+    for (int j = -8; j <= 0; j++)
+      chars.push_back(UTF8(Runemax + j));
+
+    return chars;
+  }();
+
+  return v;
+}
+
+// Test interesting UTF-8 characters against character classes.
+// Every atom is used on its own (no operators) against each
+// single-character string from InterestingUTF8().
+TEST(InterestingUTF8, SingleOps) {
+  const std::vector<std::string> atoms = Split(" ",
+      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
+      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
+      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
+      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
+  const std::vector<std::string> no_ops;
+  ExhaustiveTest(1, 0, atoms, no_ops,
+                 1, InterestingUTF8(), "", "");
+}
+
+// Test interesting UTF-8 characters against character classes,
+// but wrap everything inside AB: regexps use the wrapper "a%sb" and
+// every test string is an interesting character between "a" and "b".
+TEST(InterestingUTF8, AB) {
+  const std::vector<std::string> atoms = Split(" ",
+      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
+      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
+      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
+      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
+  const std::vector<std::string> no_ops;
+
+  std::vector<std::string> alpha;
+  for (const std::string& ch : InterestingUTF8())
+    alpha.push_back("a" + ch + "b");
+
+  ExhaustiveTest(1, 0, atoms, no_ops,
+                 1, alpha, "a%sb", "");
+}
+
+} // namespace re2
+
diff --git a/third_party/re2/src/re2/testing/exhaustive_test.cc b/third_party/re2/src/re2/testing/exhaustive_test.cc
new file mode 100644
index 000000000..5e586f1fe
--- /dev/null
+++ b/third_party/re2/src/re2/testing/exhaustive_test.cc
@@ -0,0 +1,36 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "gtest/gtest.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// Test very simple expressions: lowercase atoms against lowercase text.
+TEST(EgrepLiterals, Lowercase) {
+  static const char kAtoms[] = "abc.";
+  static const char kLetters[] = "abc";
+  EgrepTest(/*maxatoms=*/3, /*maxops=*/2, kAtoms,
+            /*maxstrlen=*/3, kLetters, /*wrapper=*/"");
+}
+
+// Test mixed-case expressions.
+TEST(EgrepLiterals, MixedCase) {
+  static const char kAtoms[] = "AaBb.";
+  static const char kLetters[] = "AaBb";
+  EgrepTest(/*maxatoms=*/3, /*maxops=*/2, kAtoms,
+            /*maxstrlen=*/2, kLetters, /*wrapper=*/"");
+}
+
+// Test mixed-case in case-insensitive mode.
+TEST(EgrepLiterals, FoldCase) {
+  static const char kAtoms[] = "abAB.";
+  // The punctuation characters surround A-Z and a-z
+  // in the ASCII table. This looks for bugs in the
+  // bytemap range code in the DFA.
+  static const char kLetters[] = "aBc@_~";
+  EgrepTest(/*maxatoms=*/3, /*maxops=*/2, kAtoms,
+            /*maxstrlen=*/2, kLetters, /*wrapper=*/"(?i:%s)");
+}
+
+// Test simple expressions against text containing a multi-byte
+// UTF-8 sequence (the bytes E2 98 BA) next to the letter a.
+TEST(EgrepLiterals, UTF8) {
+  static const char kAtoms[] = "ab.";
+  static const char kLetters[] = "a\xE2\x98\xBA";
+  EgrepTest(/*maxatoms=*/3, /*maxops=*/2, kAtoms,
+            /*maxstrlen=*/4, kLetters, /*wrapper=*/"");
+}
+
+} // namespace re2
+
diff --git a/third_party/re2/src/re2/testing/exhaustive_tester.cc b/third_party/re2/src/re2/testing/exhaustive_tester.cc
new file mode 100644
index 000000000..a57f700bc
--- /dev/null
+++ b/third_party/re2/src/re2/testing/exhaustive_tester.cc
@@ -0,0 +1,195 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+// Each test picks an alphabet (e.g., "abc"), a maximum string length,
+// a maximum regular expression length, and a maximum number of letters
+// that can appear in the regular expression. Given these parameters,
+// it tries every possible regular expression and string, verifying that
+// the NFA, DFA, and a trivial backtracking implementation agree about
+// the location of the match.
+
+#include <stdio.h>
+
+#include "absl/base/macros.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/testing/exhaustive_tester.h"
+#include "re2/testing/tester.h"
+
+// For target `log' in the Makefile.
+#ifndef LOGGING
+#define LOGGING 0
+#endif
+
+ABSL_FLAG(bool, show_regexps, false, "show regexps during testing");
+
+ABSL_FLAG(int, max_bad_regexp_inputs, 1,
+ "Stop testing a regular expression after finding this many "
+ "strings that break it.");
+
+namespace re2 {
+
+// Returns sp wrapped in double quotes, with backslash and double quote
+// backslash-escaped and newline written as \n. The result lives in a
+// static buffer that the next call overwrites; input too long for the
+// buffer is reported with LOG(FATAL).
+static char* escape(absl::string_view sp) {
+  static char buf[512];
+  char* const limit = buf + sizeof buf;
+  char* out = buf;
+  *out++ = '\"';
+  for (char c : sp) {
+    // Checked before every character is written.
+    if (out + 5 >= limit)
+      LOG(FATAL) << "ExhaustiveTester escape: too long";
+    switch (c) {
+      case '\\':
+      case '\"':
+        *out++ = '\\';
+        *out++ = c;
+        break;
+      case '\n':
+        *out++ = '\\';
+        *out++ = 'n';
+        break;
+      default:
+        *out++ = c;
+        break;
+    }
+  }
+  *out++ = '\"';
+  *out = '\0';
+  return buf;
+}
+
+// Runs re.Match over the whole input with the given anchor and prints
+// the outcome to stdout: "-" when there is no match, otherwise the n
+// submatches separated by spaces, each as "begin-end" offsets into
+// input, or "-" for a submatch whose data pointer is NULL.
+static void PrintResult(const RE2& re, absl::string_view input,
+                        RE2::Anchor anchor, absl::string_view* m, int n) {
+  const bool found = re.Match(input, 0, input.size(), anchor, m, n);
+  if (!found) {
+    absl::PrintF("-");
+    return;
+  }
+  for (int i = 0; i < n; i++) {
+    const absl::string_view& group = m[i];
+    if (i != 0)
+      absl::PrintF(" ");
+    if (group.data() == NULL) {
+      absl::PrintF("-");
+      continue;
+    }
+    absl::PrintF("%d-%d",
+                 BeginPtr(group) - BeginPtr(input),
+                 EndPtr(group) - BeginPtr(input));
+  }
+}
+
+// Processes a single generated regexp.
+// Compiles it using Regexp interface and PCRE, and then
+// checks that NFA, DFA, and PCRE all return the same results.
+//
+// The regexp is first wrapped in topwrapper_ when that is non-empty.
+// When LOGGING is nonzero, no checking is done: the test strings (once,
+// before the first regexp), the regexp, and RE2's match results for
+// every string are printed instead.
+void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
+  regexps_++;
+  std::string regexp = const_regexp;
+  if (!topwrapper_.empty()) {
+    auto fmt = absl::ParsedFormat<'s'>::New(topwrapper_);
+    CHECK(fmt != nullptr);
+    regexp = absl::StrFormat(*fmt, regexp);
+  }
+
+  if (absl::GetFlag(FLAGS_show_regexps)) {
+    absl::PrintF("\r%s", regexp);
+    fflush(stdout);
+  }
+
+  if (LOGGING) {
+    // Write out test cases and answers for use in testing
+    // other implementations, such as Go's regexp package.
+    if (randomstrings_)
+      LOG(ERROR) << "Cannot log with random strings.";
+    if (regexps_ == 1) {  // first
+      absl::PrintF("strings\n");
+      strgen_.Reset();
+      while (strgen_.HasNext())
+        absl::PrintF("%s\n", escape(strgen_.Next()));
+      absl::PrintF("regexps\n");
+    }
+    absl::PrintF("%s\n", escape(regexp));
+
+    RE2 re(regexp);
+    RE2::Options longest;
+    longest.set_longest_match(true);
+    RE2 relongest(regexp, longest);
+    int ngroup = re.NumberOfCapturingGroups()+1;
+    // Submatch storage is owned by a vector, so it is released on every
+    // path out of this branch without a matching delete[].
+    std::vector<absl::string_view> group(ngroup);
+
+    // One output line per string: four ';'-separated results, for
+    // (first match, longest match) x (anchored at both ends, unanchored).
+    strgen_.Reset();
+    while (strgen_.HasNext()) {
+      absl::string_view input = strgen_.Next();
+      PrintResult(re, input, RE2::ANCHOR_BOTH, group.data(), ngroup);
+      absl::PrintF(";");
+      PrintResult(re, input, RE2::UNANCHORED, group.data(), ngroup);
+      absl::PrintF(";");
+      PrintResult(relongest, input, RE2::ANCHOR_BOTH, group.data(), ngroup);
+      absl::PrintF(";");
+      PrintResult(relongest, input, RE2::UNANCHORED, group.data(), ngroup);
+      absl::PrintF("\n");
+    }
+    return;
+  }
+
+  Tester tester(regexp);
+  if (tester.error())
+    return;
+
+  strgen_.Reset();
+  strgen_.GenerateNULL();
+  if (randomstrings_)
+    strgen_.Random(stringseed_, stringcount_);
+  // Stop testing this regexp after --max_bad_regexp_inputs failing strings.
+  int bad_inputs = 0;
+  while (strgen_.HasNext()) {
+    tests_++;
+    if (!tester.TestInput(strgen_.Next())) {
+      failures_++;
+      if (++bad_inputs >= absl::GetFlag(FLAGS_max_bad_regexp_inputs))
+        break;
+    }
+  }
+}
+
+// Runs an exhaustive test on the given parameters.
+// In RE2_DEBUG_MODE, maxatoms, maxops and maxstrlen are each lowered by
+// one first; values of 1 or less are left alone. Unless LOGGING is
+// set, a one-line summary is printed. The test expects zero failures.
+void ExhaustiveTest(int maxatoms, int maxops,
+                    const std::vector<std::string>& alphabet,
+                    const std::vector<std::string>& ops,
+                    int maxstrlen,
+                    const std::vector<std::string>& stralphabet,
+                    const std::string& wrapper,
+                    const std::string& topwrapper) {
+  if (RE2_DEBUG_MODE) {
+    for (int* limit : {&maxatoms, &maxops, &maxstrlen}) {
+      if (*limit > 1)
+        --*limit;
+    }
+  }
+
+  ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
+                     maxstrlen, stralphabet, wrapper,
+                     topwrapper);
+  t.Generate();
+
+  if (!LOGGING) {
+    absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n",
+                 t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size());
+  }
+  EXPECT_EQ(0, t.failures());
+}
+
+// Runs an exhaustive test using the given parameters and
+// the basic egrep operators. The test is repeated for each top-level
+// wrapper: none, ^ prefix, $ suffix, and both.
+void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
+               int maxstrlen, const std::string& stralphabet,
+               const std::string& wrapper) {
+  const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
+
+  for (const char* top : tops) {
+    ExhaustiveTest(maxatoms, maxops,
+                   Split("", alphabet),
+                   RegexpGenerator::EgrepOps(),
+                   maxstrlen,
+                   Split("", stralphabet),
+                   wrapper,
+                   top);
+  }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/exhaustive_tester.h b/third_party/re2/src/re2/testing/exhaustive_tester.h
new file mode 100644
index 000000000..906be0c8c
--- /dev/null
+++ b/third_party/re2/src/re2/testing/exhaustive_tester.h
@@ -0,0 +1,104 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_
+#define RE2_TESTING_EXHAUSTIVE_TESTER_H_
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Compilers without __has_feature get a fallback that evaluates to 0, so
+// the sanitizer checks below can be written without a second #ifdef layer.
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+// RE2_DEBUG_MODE is true for debug builds (NDEBUG not defined) and for
+// builds under the address, memory or thread sanitizer; false otherwise.
+// ExhaustiveTest() uses it to shrink its search limits.
+#if !defined(NDEBUG)
+// We are in a debug build.
+const bool RE2_DEBUG_MODE = true;
+#elif __has_feature(address_sanitizer) || __has_feature(memory_sanitizer) || __has_feature(thread_sanitizer)
+// Not a debug build, but still under sanitizers.
+const bool RE2_DEBUG_MODE = true;
+#else
+const bool RE2_DEBUG_MODE = false;
+#endif
+
+// Exhaustive regular expression test: generate all regexps within parameters,
+// then generate all strings of a given length over a given alphabet,
+// then check that NFA, DFA, and PCRE agree about whether each regexp matches
+// each possible string, and if so, where the match is.
+//
+// Can also be used in a "random" mode that generates a given number
+// of random regexp and strings, allowing testing of larger expressions
+// and inputs.
+class ExhaustiveTester : public RegexpGenerator {
+ public:
+  // maxatoms, maxops, alphabet and ops are forwarded to the RegexpGenerator
+  // base; maxstrlen and stralphabet configure the input-string generator.
+  // wrapper and topwrapper are stored as given.  All counters start at zero
+  // and random-string mode starts disabled.
+  ExhaustiveTester(int maxatoms,
+                   int maxops,
+                   const std::vector<std::string>& alphabet,
+                   const std::vector<std::string>& ops,
+                   int maxstrlen,
+                   const std::vector<std::string>& stralphabet,
+                   const std::string& wrapper,
+                   const std::string& topwrapper)
+      : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+        strgen_(maxstrlen, stralphabet),
+        wrapper_(wrapper),
+        topwrapper_(topwrapper),
+        regexps_(0), tests_(0), failures_(0),
+        randomstrings_(false), stringseed_(0), stringcount_(0) { }
+
+  // Read-only counters; const-qualified so they can be queried through a
+  // const reference as well.
+  int regexps() const { return regexps_; }
+  int tests() const { return tests_; }
+  int failures() const { return failures_; }
+
+  // Needed for RegexpGenerator interface.
+  void HandleRegexp(const std::string& regexp);
+
+  // Causes testing to generate random input strings.
+  // seed and count are remembered and handed to the string generator
+  // each time a regexp is tested.
+  void RandomStrings(int32_t seed, int32_t count) {
+    randomstrings_ = true;
+    stringseed_ = seed;
+    stringcount_ = count;
+  }
+
+ private:
+  StringGenerator strgen_;
+  std::string wrapper_;      // Regexp wrapper - either empty or has one %s.
+  std::string topwrapper_;   // Regexp top-level wrapper.
+  int regexps_;              // Number of HandleRegexp calls
+  int tests_;                // Number of regexp tests.
+  int failures_;             // Number of tests failed.
+
+  bool randomstrings_;       // Whether to use random strings
+  int32_t stringseed_;       // If so, the seed.
+  int stringcount_;          // If so, how many to generate.
+
+  ExhaustiveTester(const ExhaustiveTester&) = delete;
+  ExhaustiveTester& operator=(const ExhaustiveTester&) = delete;
+};
+
+// Runs an exhaustive test on the given parameters: constructs an
+// ExhaustiveTester, calls Generate() on it and expects zero failures.
+// When RE2_DEBUG_MODE is true, maxatoms, maxops and maxstrlen are each
+// lowered by one (if greater than 1) before the run.
+void ExhaustiveTest(int maxatoms, int maxops,
+ const std::vector<std::string>& alphabet,
+ const std::vector<std::string>& ops,
+ int maxstrlen,
+ const std::vector<std::string>& stralphabet,
+ const std::string& wrapper,
+ const std::string& topwrapper);
+
+// Runs an exhaustive test using the given parameters and
+// the basic egrep operators (RegexpGenerator::EgrepOps()).
+// The run is repeated for four top-level wrappers: "", "^(?:%s)",
+// "(?:%s)$" and "^(?:%s)$".
+void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
+ int maxstrlen, const std::string& stralphabet,
+ const std::string& wrapper);
+
+} // namespace re2
+
+#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_
diff --git a/third_party/re2/src/re2/testing/filtered_re2_test.cc b/third_party/re2/src/re2/testing/filtered_re2_test.cc
new file mode 100644
index 000000000..a8d2dfc72
--- /dev/null
+++ b/third_party/re2/src/re2/testing/filtered_re2_test.cc
@@ -0,0 +1,342 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include <utility>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+// Bundle of state shared by the FilteredRE2 tests in this file.
+struct FilterTestVars {
+ FilterTestVars() {}
+ // Constructs the FilteredRE2 with an explicit minimum atom length.
+ explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {}
+
+ // Atoms written by f.Compile().
+ std::vector<std::string> atoms;
+ // Indices of matched atoms, passed to f.AllMatches().
+ std::vector<int> atom_indices;
+ // Regexp ids written by f.AllMatches().
+ std::vector<int> matches;
+ // Options passed to f.Add().
+ RE2::Options opts;
+ // The filter under test.
+ FilteredRE2 f;
+};
+
+// A FilteredRE2 with no regexps added is expected to yield no atoms
+// and no matches.
+TEST(FilteredRE2Test, EmptyTest) {
+ FilterTestVars v;
+
+ v.f.Compile(&v.atoms);
+ EXPECT_EQ(0, v.atoms.size());
+
+ // Compile has no effect at all when called before Add: it will not
+ // record that it has been called and it will not clear the vector.
+ // The second point does not matter here, but the first point means
+ // that an error will be logged during the call to AllMatches.
+ v.f.AllMatches("foo", v.atom_indices, &v.matches);
+ EXPECT_EQ(0, v.matches.size());
+}
+
+// With a minimum atom length of 4, "(foo|bar)" is expected to produce no
+// atoms, and the regexp is expected to be reported by AllMatches() even
+// though no atom indices are supplied.
+TEST(FilteredRE2Test, SmallOrTest) {
+ FilterTestVars v(4); // override the minimum atom length
+ int id;
+ v.f.Add("(foo|bar)", v.opts, &id);
+
+ v.f.Compile(&v.atoms);
+ EXPECT_EQ(0, v.atoms.size());
+
+ // v.atom_indices is still empty at this point.
+ v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
+ EXPECT_EQ(1, v.matches.size());
+ EXPECT_EQ(id, v.matches[0]);
+}
+
+// With Latin-1 encoding, a pattern containing bytes >= 0x80 is expected to
+// yield a single atom in which only the ASCII letter is lowercased.
+TEST(FilteredRE2Test, SmallLatinTest) {
+ FilterTestVars v;
+ int id;
+
+ v.opts.set_encoding(RE2::Options::EncodingLatin1);
+ v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);
+ v.f.Compile(&v.atoms);
+ EXPECT_EQ(1, v.atoms.size());
+ EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");
+
+ // Report atom 0 as matched and check that the regexp is found.
+ v.atom_indices.push_back(0);
+ v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
+ EXPECT_EQ(1, v.matches.size());
+ EXPECT_EQ(id, v.matches[0]);
+}
+
+// One table-driven case: a set of regexps and the atoms that compiling
+// them is expected to produce.  Unused trailing array slots are NULL;
+// the tests below count entries up to the first NULL.
+struct AtomTest {
+ // Name reported when the case fails.
+ const char* testname;
+ // If any test needs more than this many regexps or atoms, increase
+ // the size of the corresponding array.
+ const char* regexps[20];
+ const char* atoms[20];
+};
+
+// Cases for the AtomTests test.  Two entries are also reused by index:
+// atom_tests[0] by MatchEmptyPattern and atom_tests[2] by MatchTests
+// (both tests check the testname), so keep those positions stable.
+AtomTest atom_tests[] = {
+ {
+ // This test checks to make sure empty patterns are allowed.
+ "CheckEmptyPattern",
+ {""},
+ {}
+ }, {
+ // This test checks that all atoms of length greater than min length
+ // are found, and no atoms that are of smaller length are found.
+ "AllAtomsGtMinLengthFound", {
+ "(abc123|def456|ghi789).*mnop[x-z]+",
+ "abc..yyy..zz",
+ "mnmnpp[a-z]+PPP"
+ }, {
+ "abc123",
+ "def456",
+ "ghi789",
+ "mnop",
+ "abc",
+ "yyy",
+ "mnmnpp",
+ "ppp"
+ }
+ }, {
+ // Test to make sure that any atoms that have another atom as a
+ // substring in an OR are removed; that is, only the shortest
+ // substring is kept.
+ "SubstrAtomRemovesSuperStrInOr", {
+ "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+",
+ "abcd..yyy..yyyzzz",
+ "mnmnpp[a-z]+PPP"
+ }, {
+ "abc",
+ "ghi789",
+ "xyz",
+ "abcd",
+ "yyy",
+ "yyyzzz",
+ "mnmnpp",
+ "ppp"
+ }
+ }, {
+ // Test character class expansion.
+ "CharClassExpansion", {
+ "m[a-c][d-f]n.*[x-z]+",
+ "[x-y]bcde[ab]"
+ }, {
+ "madn", "maen", "mafn",
+ "mbdn", "mben", "mbfn",
+ "mcdn", "mcen", "mcfn",
+ "xbcdea", "xbcdeb",
+ "ybcdea", "ybcdeb"
+ }
+ }, {
+ // Test upper/lower of non-ASCII.
+ "UnicodeLower", {
+ "(?i)ΔδΠϖπΣςσ",
+ "ΛΜΝΟΠ",
+ "ψρστυ",
+ }, {
+ "δδπππσσσ",
+ "λμνοπ",
+ "ψρστυ",
+ },
+ },
+};
+
+// Adds the first n patterns in regexps to v->f using v->opts, then
+// compiles the filter so that v->atoms receives the resulting atoms.
+void AddRegexpsAndCompile(const char* regexps[],
+                          size_t n,
+                          struct FilterTestVars* v) {
+  const char* const* const end = regexps + n;
+  for (const char* const* pattern = regexps; pattern != end; ++pattern) {
+    int unused_id;  // Add() requires an output slot; the value is not used.
+    v->f.Add(*pattern, v->opts, &unused_id);
+  }
+  v->f.Compile(&v->atoms);
+}
+
+// Returns true when v->atoms holds exactly the n strings in atoms,
+// ignoring order.  v->atoms is sorted in place as a side effect.
+// On mismatch, the expected and actual atoms are logged under testname.
+bool CheckExpectedAtoms(const char* atoms[],
+                        size_t n,
+                        const char* testname,
+                        struct FilterTestVars* v) {
+  std::vector<std::string> expected(atoms, atoms + n);
+
+  // Sort both sides so that the comparison is order-independent.
+  std::sort(v->atoms.begin(), v->atoms.end());
+  std::sort(expected.begin(), expected.end());
+
+  const bool same_size = expected.size() == v->atoms.size();
+  const bool pass =
+      same_size &&
+      std::equal(expected.begin(), expected.end(), v->atoms.begin());
+  if (pass)
+    return true;
+
+  LOG(ERROR) << "Failed " << testname;
+  LOG(ERROR) << "Expected #atoms = " << expected.size();
+  for (const std::string& atom : expected)
+    LOG(ERROR) << atom;
+  LOG(ERROR) << "Found #atoms = " << v->atoms.size();
+  for (const std::string& atom : v->atoms)
+    LOG(ERROR) << atom;
+  return false;
+}
+
+// Runs every case in atom_tests: compiles its regexps in a fresh
+// FilterTestVars and compares the resulting atoms with the expected ones.
+TEST(FilteredRE2Test, AtomTests) {
+  int nfail = 0;
+  for (AtomTest& t : atom_tests) {
+    FilterTestVars v;
+
+    // Both arrays are terminated by the first NULL slot (or by their end).
+    size_t nregexp = 0;
+    while (nregexp < ABSL_ARRAYSIZE(t.regexps) && t.regexps[nregexp] != NULL)
+      nregexp++;
+    size_t natom = 0;
+    while (natom < ABSL_ARRAYSIZE(t.atoms) && t.atoms[natom] != NULL)
+      natom++;
+
+    AddRegexpsAndCompile(t.regexps, nregexp, &v);
+    const bool ok = CheckExpectedAtoms(t.atoms, natom, t.testname, &v);
+    if (!ok)
+      nfail++;
+  }
+  EXPECT_EQ(0, nfail);
+}
+
+// Replaces *atom_indices with, for each string in matched_atoms (in order),
+// the index of its first occurrence in atoms.  Strings that do not occur
+// in atoms contribute nothing.
+void FindAtomIndices(const std::vector<std::string>& atoms,
+                     const std::vector<std::string>& matched_atoms,
+                     std::vector<int>* atom_indices) {
+  atom_indices->clear();
+  for (const std::string& wanted : matched_atoms) {
+    const auto hit = std::find(atoms.begin(), atoms.end(), wanted);
+    if (hit != atoms.end())
+      atom_indices->push_back(static_cast<int>(hit - atoms.begin()));
+  }
+}
+
+// Compiles the regexps of the "CheckEmptyPattern" atom test and expects
+// FirstMatch() to return 0 for the text "0123" when no atoms are reported.
+TEST(FilteredRE2Test, MatchEmptyPattern) {
+  FilterTestVars v;
+  AtomTest* t = &atom_tests[0];
+  // We are using the regexps used in one of the atom tests
+  // for this test. Adding the EXPECT here to make sure
+  // the index we use for the test is for the correct test.
+  EXPECT_EQ("CheckEmptyPattern", std::string(t->testname));
+  size_t nregexp;
+  for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++)
+    if (t->regexps[nregexp] == NULL)
+      break;
+  AddRegexpsAndCompile(t->regexps, nregexp, &v);
+  std::string text = "0123";
+  std::vector<int> atom_ids;  // Left empty on purpose.
+  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
+}
+
+// Compiles the regexps of the "SubstrAtomRemovesSuperStrInOr" atom test
+// and calls AllMatches() three times, each time reporting a different set
+// of matched atoms, checking how many regexps are returned.
+TEST(FilteredRE2Test, MatchTests) {
+  FilterTestVars v;
+  AtomTest* t = &atom_tests[2];
+  // The regexps come from one of the atom tests; make sure the index
+  // above still refers to the intended one.
+  EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname));
+  size_t nregexp = 0;
+  while (nregexp < ABSL_ARRAYSIZE(t->regexps) && t->regexps[nregexp] != NULL)
+    nregexp++;
+  AddRegexpsAndCompile(t->regexps, nregexp, &v);
+
+  std::vector<int> atom_ids;
+  std::vector<int> matching_regexps;
+
+  // Report only the atom "abc".
+  std::string text = "abc121212xyz";
+  std::vector<std::string> atoms = {"abc"};
+  FindAtomIndices(v.atoms, atoms, &atom_ids);
+  v.f.AllMatches(text, atom_ids, &matching_regexps);
+  EXPECT_EQ(1, matching_regexps.size());
+
+  // Report "abc", "yyy" and "yyyzzz".
+  text = "abc12312yyyzzz";
+  atoms = {"abc", "yyy", "yyyzzz"};
+  FindAtomIndices(v.atoms, atoms, &atom_ids);
+  v.f.AllMatches(text, atom_ids, &matching_regexps);
+  EXPECT_EQ(1, matching_regexps.size());
+
+  // Report "abc", "abcd", "yyy" and "yyyzzz"; log the resolved indices.
+  text = "abcd12yyy32yyyzzz";
+  atoms = {"abc", "abcd", "yyy", "yyyzzz"};
+  FindAtomIndices(v.atoms, atoms, &atom_ids);
+  LOG(INFO) << "S: " << atom_ids.size();
+  for (size_t i = 0; i < atom_ids.size(); i++)
+    LOG(INFO) << "i: " << i << " : " << atom_ids[i];
+  v.f.AllMatches(text, atom_ids, &matching_regexps);
+  EXPECT_EQ(2, matching_regexps.size());
+}
+
+// Regression test: an empty string among the atoms must not cause the
+// other atoms to be dropped.
+TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
+ // Bug due to find() finding "" at the start of everything in a string
+ // set and thus SimplifyStringSet() would end up erasing everything.
+ // In order to test this, we have to keep PrefilterTree from discarding
+ // the OR entirely, so we have to make the minimum atom length zero.
+
+ FilterTestVars v(0); // override the minimum atom length
+ const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
+ // Expected atoms, including the empty string itself.
+ const char* atoms[] = {"", "-r", "add=;aa", "}"};
+ AddRegexpsAndCompile(regexps, ABSL_ARRAYSIZE(regexps), &v);
+ EXPECT_TRUE(CheckExpectedAtoms(atoms, ABSL_ARRAYSIZE(atoms),
+ "EmptyStringInStringSetBug", &v));
+}
+
+// Exercises move assignment of FilteredRE2 in three steps: move into a
+// fresh object, reuse the moved-from object, then move over an object that
+// already holds a compiled regexp.  The order of the statements matters.
+TEST(FilteredRE2Test, MoveSemantics) {
+ FilterTestVars v1;
+ int id;
+ v1.f.Add("foo\\d+", v1.opts, &id);
+ EXPECT_EQ(0, id);
+ v1.f.Compile(&v1.atoms);
+ EXPECT_EQ(1, v1.atoms.size());
+ EXPECT_EQ("foo", v1.atoms[0]);
+ // Baseline: v1 matches text containing "foo1" but not "bar2".
+ v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
+ EXPECT_EQ(1, v1.matches.size());
+ EXPECT_EQ(0, v1.matches[0]);
+ v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
+ EXPECT_EQ(0, v1.matches.size());
+
+ // The moved-to object should do what the moved-from object did.
+ FilterTestVars v2;
+ v2.f = std::move(v1.f);
+ v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches);
+ EXPECT_EQ(1, v2.matches.size());
+ EXPECT_EQ(0, v2.matches[0]);
+ v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches);
+ EXPECT_EQ(0, v2.matches.size());
+
+ // The moved-from object should have been reset and be reusable.
+ v1.f.Add("bar\\d+", v1.opts, &id);
+ EXPECT_EQ(0, id);
+ v1.f.Compile(&v1.atoms);
+ EXPECT_EQ(1, v1.atoms.size());
+ EXPECT_EQ("bar", v1.atoms[0]);
+ v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
+ EXPECT_EQ(0, v1.matches.size());
+ v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
+ EXPECT_EQ(1, v1.matches.size());
+ EXPECT_EQ(0, v1.matches[0]);
+
+ // Verify that "overwriting" works and also doesn't leak memory.
+ // (The latter will need a leak detector such as LeakSanitizer.)
+ v1.f = std::move(v2.f);
+ v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
+ EXPECT_EQ(1, v1.matches.size());
+ EXPECT_EQ(0, v1.matches[0]);
+ v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
+ EXPECT_EQ(0, v1.matches.size());
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/mimics_pcre_test.cc b/third_party/re2/src/re2/testing/mimics_pcre_test.cc
new file mode 100644
index 000000000..829659d67
--- /dev/null
+++ b/third_party/re2/src/re2/testing/mimics_pcre_test.cc
@@ -0,0 +1,78 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One case for the MimicsPCRE test.
+struct PCRETest {
+ // Pattern to parse.
+ const char* regexp;
+ // Expected result of Regexp::MimicsPCRE() for the parsed pattern.
+ bool should_match;
+};
+
+// Cases for the SimpleTests test; each one is checked under both Latin-1
+// and UTF-8 parsing.
+static PCRETest tests[] = {
+ // Most constructs are expected to behave exactly as they do in PCRE.
+ { "abc", true },
+ { "(a|b)c", true },
+ { "(a*|b)c", true },
+ { "(a|b*)c", true },
+ { "a(b|c)d", true },
+ { "a(()|())c", true },
+ { "ab*c", true },
+ { "ab+c", true },
+ { "a(b*|c*)d", true },
+ { "\\W", true },
+ { "\\W{1,2}", true },
+ { "\\d", true },
+
+ // Repeating a sub-expression that can match the empty string is
+ // expected not to mimic PCRE; repeating one that cannot is fine.
+ { "(a*)*", false },
+ { "x(a*)*y", false },
+ { "(a*)+", false },
+ { "(a+)*", true },
+ { "(a+)+", true },
+ { "(a+)+", true },
+
+ // Of the escapes listed here, only \v is expected not to mimic PCRE.
+ { "\\b", true },
+ { "\\v", false },
+ { "\\d", true },
+
+ // The handling of ^ in multi-line mode is different, as is
+ // the handling of $ in single-line mode. (Both involve
+ // boundary cases if the string ends with \n.)
+ { "\\A", true },
+ { "\\z", true },
+ { "(?m)^", false },
+ { "(?m)$", true },
+ { "(?-m)^", true },
+ { "(?-m)$", false }, // In PCRE, == \Z
+ { "(?m)\\A", true },
+ { "(?m)\\z", true },
+ { "(?-m)\\A", true },
+ { "(?-m)\\z", true },
+};
+
+// Parses every pattern in tests twice (j == 0: LikePerl | Latin1,
+// j == 1: LikePerl alone) and checks that Regexp::MimicsPCRE() returns the
+// expected value.
+TEST(MimicsPCRE, SimpleTests) {
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+    const PCRETest& t = tests[i];
+    for (size_t j = 0; j < 2; j++) {
+      Regexp::ParseFlags flags = Regexp::LikePerl;
+      if (j == 0)
+        flags = flags | Regexp::Latin1;
+      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+      ASSERT_TRUE(re != NULL) << " " << t.regexp;
+      // Take the result and release the regexp before asserting: a failing
+      // ASSERT_EQ returns from the test body immediately, which would
+      // otherwise skip Decref() and leak the regexp.
+      const bool mimics_pcre = re->MimicsPCRE();
+      re->Decref();
+      ASSERT_EQ(t.should_match, mimics_pcre)
+          << " " << t.regexp << " "
+          << (j == 0 ? "latin1" : "utf");
+    }
+  }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/null_walker.cc b/third_party/re2/src/re2/testing/null_walker.cc
new file mode 100644
index 000000000..745364b3c
--- /dev/null
+++ b/third_party/re2/src/re2/testing/null_walker.cc
@@ -0,0 +1,49 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Null walker. For benchmarking the walker itself.
+// Its visit callbacks do no analysis of the regexp.
+
+class NullWalker : public Regexp::Walker<bool> {
+ public:
+ NullWalker() {}
+
+ // Defined below; ignores its arguments and returns false.
+ virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args);
+
+ // Returns its argument unchanged, after logging at DFATAL severity
+ // (except in fuzzing builds).
+ virtual bool ShortVisit(Regexp* re, bool a) {
+ // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ LOG(DFATAL) << "NullWalker::ShortVisit called";
+#endif
+ return a;
+ }
+
+ private:
+ // Not copyable.
+ NullWalker(const NullWalker&) = delete;
+ NullWalker& operator=(const NullWalker&) = delete;
+};
+
+// Called after visiting re's children. child_args contains the return
+// value from each of the children's PostVisits. This walker performs no
+// analysis: every argument is ignored and the result is always false.
+bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args) {
+ return false;
+}
+
+// Walks this regexp with a NullWalker and discards the result.
+// Nothing is returned.
+void Regexp::NullWalk() {
+ NullWalker w;
+ w.Walk(this, false);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/parse_test.cc b/third_party/re2/src/re2/testing/parse_test.cc
new file mode 100644
index 000000000..0ee5561e9
--- /dev/null
+++ b/third_party/re2/src/re2/testing/parse_test.cc
@@ -0,0 +1,528 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test parse.cc, dump.cc, and tostring.cc.
+
+#include <string>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Marker for table entries that must be parsed with no flags at all.
+// A Test whose flags field is zero is parsed with the caller's default
+// flags, so such entries set this bit instead and TestParse() clears it
+// again before parsing.
+//
+// In the past, we used 1<<30 here and zeroed the bit later, but that
+// has undefined behaviour, so now we use an internal-only flag because
+// otherwise we would have to introduce a new flag value just for this.
+static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
+
+// One parse test case.
+struct Test {
+ // Pattern to parse.
+ const char* regexp;
+ // Expected output of Regexp::Dump() for the parsed pattern.
+ const char* parse;
+ // If nonzero, TestParse() uses these flags (minus TestZeroFlags)
+ // instead of the default flags passed to it.
+ Regexp::ParseFlags flags;
+};
+
+// Default flags for the main test table: used by the SimpleRegexps test
+// and by the ToString round-trip test.
+static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
+ Regexp::PerlX |
+ Regexp::PerlClasses |
+ Regexp::UnicodeGroups;
+
+static Test tests[] = {
+ // Base cases
+ { "a", "lit{a}" },
+ { "a.", "cat{lit{a}dot{}}" },
+ { "a.b", "cat{lit{a}dot{}lit{b}}" },
+ { "ab", "str{ab}" },
+ { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
+ { "abc", "str{abc}" },
+ { "a|^", "alt{lit{a}bol{}}" },
+ { "a|b", "cc{0x61-0x62}" },
+ { "(a)", "cap{lit{a}}" },
+ { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
+ { "a*", "star{lit{a}}" },
+ { "a+", "plus{lit{a}}" },
+ { "a?", "que{lit{a}}" },
+ { "a{2}", "rep{2,2 lit{a}}" },
+ { "a{2,3}", "rep{2,3 lit{a}}" },
+ { "a{2,}", "rep{2,-1 lit{a}}" },
+ { "a*?", "nstar{lit{a}}" },
+ { "a+?", "nplus{lit{a}}" },
+ { "a??", "nque{lit{a}}" },
+ { "a{2}?", "nrep{2,2 lit{a}}" },
+ { "a{2,3}?", "nrep{2,3 lit{a}}" },
+ { "a{2,}?", "nrep{2,-1 lit{a}}" },
+ { "", "emp{}" },
+ { "|", "alt{emp{}emp{}}" },
+ { "|x|", "alt{emp{}lit{x}emp{}}" },
+ { ".", "dot{}" },
+ { "^", "bol{}" },
+ { "$", "eol{}" },
+ { "\\|", "lit{|}" },
+ { "\\(", "lit{(}" },
+ { "\\)", "lit{)}" },
+ { "\\*", "lit{*}" },
+ { "\\+", "lit{+}" },
+ { "\\?", "lit{?}" },
+ { "{", "lit{{}" },
+ { "}", "lit{}}" },
+ { "\\.", "lit{.}" },
+ { "\\^", "lit{^}" },
+ { "\\$", "lit{$}" },
+ { "\\\\", "lit{\\}" },
+ { "[ace]", "cc{0x61 0x63 0x65}" },
+ { "[abc]", "cc{0x61-0x63}" },
+ { "[a-z]", "cc{0x61-0x7a}" },
+ { "[a]", "lit{a}" },
+ { "\\-", "lit{-}" },
+ { "-", "lit{-}" },
+ { "\\_", "lit{_}" },
+
+ // Posix and Perl extensions
+ { "[[:lower:]]", "cc{0x61-0x7a}" },
+ { "[a-z]", "cc{0x61-0x7a}" },
+ { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
+ { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
+ { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+ { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+ { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ { "\\d", "cc{0x30-0x39}" },
+ { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
+ { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
+ { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
+ { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
+ { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
+ { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
+ { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
+ { "\\C", "byte{}" },
+
+ // Unicode, negatives, and a double negative.
+ { "\\p{Braille}", "cc{0x2800-0x28ff}" },
+ { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
+ { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
+ { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
+
+ // More interesting regular expressions.
+ { "a{,2}", "str{a{,2}}" },
+ { "\\.\\^\\$\\\\", "str{.^$\\}" },
+ { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
+ { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
+ { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
+ { "a*{", "cat{star{lit{a}}lit{{}}" },
+
+ // Test precedences
+ { "(?:ab)*", "star{str{ab}}" },
+ { "(ab)*", "star{cap{str{ab}}}" },
+ { "ab|cd", "alt{str{ab}str{cd}}" },
+ { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
+
+ // Test squashing of **, ++, ?? et cetera.
+ { "(?:(?:a)*)*", "star{lit{a}}" },
+ { "(?:(?:a)+)+", "plus{lit{a}}" },
+ { "(?:(?:a)?)?", "que{lit{a}}" },
+ { "(?:(?:a)*)+", "star{lit{a}}" },
+ { "(?:(?:a)*)?", "star{lit{a}}" },
+ { "(?:(?:a)+)*", "star{lit{a}}" },
+ { "(?:(?:a)+)?", "star{lit{a}}" },
+ { "(?:(?:a)?)*", "star{lit{a}}" },
+ { "(?:(?:a)?)+", "star{lit{a}}" },
+
+ // Test flattening.
+ { "(?:a)", "lit{a}" },
+ { "(?:ab)(?:cd)", "str{abcd}" },
+ { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
+ { "a|c", "cc{0x61 0x63}" },
+ { "a|[cd]", "cc{0x61 0x63-0x64}" },
+ { "a|.", "dot{}" },
+ { "[ab]|c", "cc{0x61-0x63}" },
+ { "[ab]|[cd]", "cc{0x61-0x64}" },
+ { "[ab]|.", "dot{}" },
+ { ".|c", "dot{}" },
+ { ".|[cd]", "dot{}" },
+ { ".|.", "dot{}" },
+
+ // Test Perl quoted literals
+ { "\\Q+|*?{[\\E", "str{+|*?{[}" },
+ { "\\Q+\\E+", "plus{lit{+}}" },
+ { "\\Q\\\\E", "lit{\\}" },
+ { "\\Q\\\\\\E", "str{\\\\}" },
+ { "\\Qa\\E*", "star{lit{a}}" },
+ { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
+ { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
+
+ // Test ^ and $ under (?m) and (?-m), and Perl \A and \z
+ { "(?m)^", "bol{}" },
+ { "(?m)$", "eol{}" },
+ { "(?-m)^", "bot{}" },
+ { "(?-m)$", "eot{}" },
+ { "(?m)\\A", "bot{}" },
+ { "(?m)\\z", "eot{\\z}" },
+ { "(?-m)\\A", "bot{}" },
+ { "(?-m)\\z", "eot{\\z}" },
+
+ // Test named captures
+ { "(?P<name>a)", "cap{name:lit{a}}" },
+ { "(?P<中文>a)", "cap{中文:lit{a}}" },
+ { "(?<name>a)", "cap{name:lit{a}}" },
+ { "(?<中文>a)", "cap{中文:lit{a}}" },
+
+ // Case-folded literals
+ { "[Aa]", "litfold{a}" },
+
+ // Strings
+ { "abcde", "str{abcde}" },
+ { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
+
+ // Reported bug involving \n leaking in despite use of NeverNL.
+ { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
+ { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
+ { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
+ { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
+ { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
+ { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
+ { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
+ { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
+ { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
+ { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
+ { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
+ { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
+ { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses },
+ { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses | Regexp::FoldCase },
+ { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses | Regexp::NeverNL },
+ { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
+ { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses },
+ { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses | Regexp::FoldCase },
+ { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses | Regexp::NeverNL },
+ { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
+ Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
+
+ // Bug in Regexp::ToString() that emitted [^], which
+ // would (obviously) fail to parse when fed back in.
+ { "[\\s\\S]", "cc{0-0x10ffff}" },
+};
+
+// Testing-only helper: reports the result of Regexp::Equal(a, b).
+bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
+  const bool equal = Regexp::Equal(a, b);
+  return equal;
+}
+
+// Parses each of the ntests entries in tests and checks that
+//   (1) parsing succeeds,
+//   (2) Dump() of the result equals the entry's expected parse string, and
+//   (3) any two parsed regexps compare equal exactly when their expected
+//       parse strings are identical.
+// An entry with nonzero flags is parsed with those flags (with the
+// TestZeroFlags bit cleared) instead of the flags argument.
+// title is not used.
+void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
+               const std::string& title) {
+  // Owns the parsed regexps.  A failing ASSERT_TRUE below returns from this
+  // function immediately, so the cleanup lives in a destructor; with cleanup
+  // at the end of the function, the array and every regexp parsed so far
+  // would leak on that path.
+  struct ParsedRegexps {
+    explicit ParsedRegexps(int n) : re(new Regexp*[n]()), n(n) {}
+    ~ParsedRegexps() {
+      for (int i = 0; i < n; i++) {
+        if (re[i] != NULL)
+          re[i]->Decref();
+      }
+      delete[] re;
+    }
+    Regexp** const re;  // Value-initialized: slots not yet parsed are NULL.
+    const int n;
+  } parsed(ntests);
+  Regexp** re = parsed.re;
+
+  for (int i = 0; i < ntests; i++) {
+    RegexpStatus status;
+    Regexp::ParseFlags f = flags;
+    if (tests[i].flags != 0) {
+      f = tests[i].flags & ~TestZeroFlags;
+    }
+    re[i] = Regexp::Parse(tests[i].regexp, f, &status);
+    ASSERT_TRUE(re[i] != NULL)
+      << " " << tests[i].regexp << " " << status.Text();
+    std::string s = re[i]->Dump();
+    EXPECT_EQ(std::string(tests[i].parse), s)
+      << "Regexp: " << tests[i].regexp
+      << "\nparse: " << std::string(tests[i].parse)
+      << " s: " << s << " flag=" << f;
+  }
+
+  // Structural equality must agree with equality of the expected dumps.
+  for (int i = 0; i < ntests; i++) {
+    for (int j = 0; j < ntests; j++) {
+      EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
+                RegexpEqualTestingOnly(re[i], re[j]))
+        << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
+    }
+  }
+}
+
+// Checks that every pattern in the main table parses, under kTestFlags,
+// to its expected structure.
+TEST(TestParse, SimpleRegexps) {
+  const int ntests = ABSL_ARRAYSIZE(tests);
+  TestParse(tests, ntests, kTestFlags, "simple");
+}
+
+// Cases parsed with Regexp::FoldCase: literals are expected to dump in
+// their folded forms (litfold/strfold) and classes to include both cases.
+Test foldcase_tests[] = {
+ { "AbCdE", "strfold{abcde}" },
+ { "[Aa]", "litfold{a}" },
+ { "a", "litfold{a}" },
+
+ // 0x17F is an old English long s (looks like an f) and folds to s.
+ // 0x212A is the Kelvin symbol and folds to k.
+ { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
+ { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+ { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+};
+
+// Test that parsing with FoldCase works.
+TEST(TestParse, FoldCase) {
+ TestParse(foldcase_tests, ABSL_ARRAYSIZE(foldcase_tests), Regexp::FoldCase, "foldcase");
+}
+
+// Case parsed with Regexp::Literal: the whole pattern, metacharacters
+// included, is expected to dump as a single literal string.
+Test literal_tests[] = {
+ { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
+};
+
+// Test that parsing with Literal works.
+TEST(TestParse, Literal) {
+ TestParse(literal_tests, ABSL_ARRAYSIZE(literal_tests), Regexp::Literal, "literal");
+}
+
+// Cases parsed with Regexp::MatchNL: "." is expected to dump as dot{} and
+// the negated class as not excluding 0xa.
+Test matchnl_tests[] = {
+ { ".", "dot{}" },
+ { "\n", "lit{\n}" },
+ { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
+ { "[a\\n]", "cc{0xa 0x61}" },
+};
+
+// Test that parsing with MatchNL works.
+// (Also tested above during simple cases.)
+TEST(TestParse, MatchNL) {
+ TestParse(matchnl_tests, ABSL_ARRAYSIZE(matchnl_tests), Regexp::MatchNL, "with MatchNL");
+}
+
+// Same patterns as matchnl_tests, parsed with Regexp::NoParseFlags: "." and
+// the negated class are expected to exclude 0xa, while an explicit newline
+// is still expected to parse as itself.
+Test nomatchnl_tests[] = {
+ { ".", "cc{0-0x9 0xb-0x10ffff}" },
+ { "\n", "lit{\n}" },
+ { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
+ { "[a\\n]", "cc{0xa 0x61}" },
+};
+
+// Test that parsing without MatchNL works.
+TEST(TestParse, NoMatchNL) {
+ TestParse(nomatchnl_tests, ABSL_ARRAYSIZE(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
+}
+
+// Cases parsed with Regexp::PerlX: alternatives that share a common prefix
+// are expected to be factored into cat{prefix alt{...}} form, while
+// alternatives separated by an unrelated branch are not merged.
+Test prefix_tests[] = {
+ { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
+ { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
+ { "abc|abd|aef|bcx|bcy",
+ "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
+ "cat{str{bc}cc{0x78-0x79}}}" },
+ { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
+ { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
+ { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
+ { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
+ { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
+ { "x{2}|x{2}[0-9]",
+ "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
+ { "x{2}y|x{2}[0-9]y",
+ "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
+ { "n|r|rs",
+ "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
+ { "n|rs|r",
+ "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
+ { "r|rs|n",
+ "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
+ { "rs|r|n",
+ "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
+ { "a\\C*?c|a\\C*?b",
+ "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
+ { "^/a/bc|^/a/de",
+ "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
+ // In the past, factoring was limited to kFactorAlternationMaxDepth (8).
+ { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
+ "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
+ "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
+ "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
+ "lit{a}}}}}}}}}}}}}}}}}}}" },
+ { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
+ "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
+ "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
+ "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
+};
+
+// Test that prefix factoring works.
+TEST(TestParse, Prefix) {
+ TestParse(prefix_tests, ABSL_ARRAYSIZE(prefix_tests), Regexp::PerlX, "prefix");
+}
+
+// Nested repetitions, parsed with Regexp::PerlX, that are expected to be
+// accepted.  (badtests lists similar but larger nestings that are expected
+// to be rejected.)
+Test nested_tests[] = {
+ { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
+ "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
+ { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
+ "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
+ { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
+ "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
+ { "((((((x{2}){2}){2}){5}){5}){5})",
+ "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
+};
+
+// Test that nested repetition works.
+TEST(TestParse, Nested) {
+ TestParse(nested_tests, ABSL_ARRAYSIZE(nested_tests), Regexp::PerlX, "nested");
+}
+
+// Invalid regular expressions
+const char* badtests[] = {
+ "(",
+ ")",
+ "(a",
+ "(a|b|",
+ "(a|b",
+ "[a-z",
+ "([a-z)",
+ "x{1001}",
+ "\xff", // Invalid UTF-8
+ "[\xff]",
+ "[\\\xff]",
+ "\\\xff",
+ "(?P<name>a",
+ "(?P<name>",
+ "(?P<name",
+ "(?P<x y>a)",
+ "(?P<>a)",
+ "(?<name>a",
+ "(?<name>",
+ "(?<name",
+ "(?<x y>a)",
+ "(?<>a)",
+ "[a-Z]",
+ "(?i)[a-Z]",
+ "a{100000}",
+ "a{100000,}",
+ "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
+ "(((x{7}){11}){13})",
+ "\\Q\\E*",
+};
+
+// Valid in Perl, bad in POSIX
+const char* only_perl[] = {
+ "[a-b-c]",
+ "\\Qabc\\E",
+ "\\Q*+?{[\\E",
+ "\\Q\\\\E",
+ "\\Q\\\\\\E",
+ "\\Q\\\\\\\\E",
+ "\\Q\\\\\\\\\\E",
+ "(?:a)",
+ "(?P<name>a)",
+ "(?<name>a)",
+};
+
+// Valid in POSIX, bad in Perl.
+const char* only_posix[] = {
+ "a++",
+ "a**",
+ "a?*",
+ "a+*",
+ "a{1}*",
+};
+
+// Test that parser rejects bad regexps.
+TEST(TestParse, InvalidRegexps) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(badtests); i++) {
+ ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
+ << " " << badtests[i];
+ ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
+ << " " << badtests[i];
+ }
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(only_posix); i++) {
+ ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
+ << " " << only_posix[i];
+ Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
+ ASSERT_TRUE(re != NULL) << " " << only_posix[i];
+ re->Decref();
+ }
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(only_perl); i++) {
+ ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
+ << " " << only_perl[i];
+ Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
+ ASSERT_TRUE(re != NULL) << " " << only_perl[i];
+ re->Decref();
+ }
+}
+
+// Test that ToString produces original regexp or equivalent one.
+TEST(TestToString, EquivalentParse) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+ RegexpStatus status;
+ Regexp::ParseFlags f = kTestFlags;
+ if (tests[i].flags != 0) {
+ f = tests[i].flags & ~TestZeroFlags;
+ }
+ Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
+ ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
+ std::string s = re->Dump();
+ EXPECT_EQ(std::string(tests[i].parse), s)
+ << "Regexp: " << tests[i].regexp
+ << "\nparse: " << std::string(tests[i].parse)
+ << " s: " << s << " flag=" << f;
+ std::string t = re->ToString();
+ if (t != tests[i].regexp) {
+ // If ToString didn't return the original regexp,
+ // it must have found one with fewer parens.
+ // Unfortunately we can't check the length here, because
+ // ToString produces "\\{" for a literal brace,
+ // but "{" is a shorter equivalent.
+ // ASSERT_LT(t.size(), strlen(tests[i].regexp))
+ // << " t=" << t << " regexp=" << tests[i].regexp;
+
+ // Test that if we parse the new regexp we get the same structure.
+ Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+ ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
+ std::string ss = nre->Dump();
+ std::string tt = nre->ToString();
+ if (s != ss || t != tt)
+ LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
+ EXPECT_EQ(s, ss);
+ EXPECT_EQ(t, tt);
+ nre->Decref();
+ }
+ re->Decref();
+ }
+}
+
+// Test that capture error args are correct.
+TEST(NamedCaptures, ErrorArgs) {
+ RegexpStatus status;
+ Regexp* re;
+
+ re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?P<name");
+
+ re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?P<space bar>");
+
+ re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?<name");
+
+ re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?<space bar>");
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/possible_match_test.cc b/third_party/re2/src/re2/testing/possible_match_test.cc
new file mode 100644
index 000000000..fe199c662
--- /dev/null
+++ b/third_party/re2/src/re2/testing/possible_match_test.cc
@@ -0,0 +1,248 @@
+// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "absl/base/macros.h"
+#include "absl/strings/escaping.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/exhaustive_tester.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Test that C++ strings are compared as uint8s, not int8s.
+// PossibleMatchRange doesn't depend on this, but callers probably will.
+TEST(CplusplusStrings, EightBit) {
+ std::string s = "\x70";
+ std::string t = "\xA0";
+ EXPECT_LT(s, t);
+}
+
+struct PrefixTest {
+ const char* regexp;
+ int maxlen;
+ const char* min;
+ const char* max;
+};
+
+static PrefixTest tests[] = {
+ { "", 10, "", "", },
+ { "Abcdef", 10, "Abcdef", "Abcdef" },
+ { "abc(def|ghi)", 10, "abcdef", "abcghi" },
+ { "a+hello", 10, "aa", "ahello" },
+ { "a*hello", 10, "a", "hello" },
+ { "def|abc", 10, "abc", "def" },
+ { "a(b)(c)[d]", 10, "abcd", "abcd" },
+ { "ab(cab|cat)", 10, "abcab", "abcat" },
+ { "ab(cab|ca)x", 10, "abcabx", "abcax" },
+ { "(ab|x)(c|de)", 10, "abc", "xde" },
+ { "(ab|x)?(c|z)?", 10, "", "z" },
+ { "[^\\s\\S]", 10, "", "" },
+ { "(abc)+", 5, "abc", "abcac" },
+ { "(abc)+", 2, "ab", "ac" },
+ { "(abc)+", 1, "a", "b" },
+ { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
+ { "a*", 10, "", "ab" },
+
+ { "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
+ { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
+ { "(?i)a+hello", 10, "AA", "ahello" },
+ { "(?i)a*hello", 10, "A", "hello" },
+ { "(?i)def|abc", 10, "ABC", "def" },
+ { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
+ { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
+ { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
+ { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
+ { "(?i)(ab|x)?(c|z)?", 10, "", "z" },
+ { "(?i)[^\\s\\S]", 10, "", "" },
+ { "(?i)(abc)+", 5, "ABC", "abcac" },
+ { "(?i)(abc)+", 2, "AB", "ac" },
+ { "(?i)(abc)+", 1, "A", "b" },
+ { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
+ { "(?i)a*", 10, "", "ab" },
+ { "(?i)A*", 10, "", "ab" },
+
+ { "\\AAbcdef", 10, "Abcdef", "Abcdef" },
+ { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
+ { "\\Aa+hello", 10, "aa", "ahello" },
+ { "\\Aa*hello", 10, "a", "hello" },
+ { "\\Adef|abc", 10, "abc", "def" },
+ { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
+ { "\\Aab(cab|cat)", 10, "abcab", "abcat" },
+ { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
+ { "\\A(ab|x)(c|de)", 10, "abc", "xde" },
+ { "\\A(ab|x)?(c|z)?", 10, "", "z" },
+ { "\\A[^\\s\\S]", 10, "", "" },
+ { "\\A(abc)+", 5, "abc", "abcac" },
+ { "\\A(abc)+", 2, "ab", "ac" },
+ { "\\A(abc)+", 1, "a", "b" },
+ { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
+ { "\\Aa*", 10, "", "ab" },
+
+ { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
+ { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
+ { "(?i)\\Aa+hello", 10, "AA", "ahello" },
+ { "(?i)\\Aa*hello", 10, "A", "hello" },
+ { "(?i)\\Adef|abc", 10, "ABC", "def" },
+ { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
+ { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
+ { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
+ { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
+ { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
+ { "(?i)\\A[^\\s\\S]", 10, "", "" },
+ { "(?i)\\A(abc)+", 5, "ABC", "abcac" },
+ { "(?i)\\A(abc)+", 2, "AB", "ac" },
+ { "(?i)\\A(abc)+", 1, "A", "b" },
+ { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
+ { "(?i)\\Aa*", 10, "", "ab" },
+ { "(?i)\\AA*", 10, "", "ab" },
+};
+
+TEST(PossibleMatchRange, HandWritten) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+ for (size_t j = 0; j < 2; j++) {
+ const PrefixTest& t = tests[i];
+ std::string min, max;
+ if (j == 0) {
+ LOG(INFO) << "Checking regexp=" << absl::CEscape(t.regexp);
+ Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+ ASSERT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(0);
+ ASSERT_TRUE(prog != NULL);
+ ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
+ << " " << t.regexp;
+ delete prog;
+ re->Decref();
+ } else {
+ ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
+ }
+ EXPECT_EQ(t.min, min) << t.regexp;
+ EXPECT_EQ(t.max, max) << t.regexp;
+ }
+ }
+}
+
+// Test cases where PossibleMatchRange should return false.
+TEST(PossibleMatchRange, Failures) {
+ std::string min, max;
+
+ // Fails because no room to write max.
+ EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
+
+ // Fails because there is no max -- any non-empty string matches
+ // or begins a match. Have to use Latin-1 input, because there
+ // are no valid UTF-8 strings beginning with byte 0xFF.
+ EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+ EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+ EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+ EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+ EXPECT_FALSE(RE2(".*", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+ EXPECT_FALSE(RE2("\\C*").
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+
+ // Fails because it's a malformed regexp.
+ EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
+}
+
+// Exhaustive test: generate all regexps within parameters,
+// then generate all strings of a given length over a given alphabet,
+// then check that the prefix information agrees with whether
+// the regexp matches each of the strings.
+class PossibleMatchTester : public RegexpGenerator {
+ public:
+ PossibleMatchTester(int maxatoms,
+ int maxops,
+ const std::vector<std::string>& alphabet,
+ const std::vector<std::string>& ops,
+ int maxstrlen,
+ const std::vector<std::string>& stralphabet)
+ : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+ strgen_(maxstrlen, stralphabet),
+ regexps_(0), tests_(0) { }
+
+ int regexps() { return regexps_; }
+ int tests() { return tests_; }
+
+ // Needed for RegexpGenerator interface.
+ void HandleRegexp(const std::string& regexp);
+
+ private:
+ StringGenerator strgen_;
+
+ int regexps_; // Number of HandleRegexp calls
+ int tests_; // Number of regexp tests.
+
+ PossibleMatchTester(const PossibleMatchTester&) = delete;
+ PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
+};
+
+// Processes a single generated regexp.
+// Checks that all accepted strings agree with the prefix range.
+void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
+ regexps_++;
+
+ VLOG(3) << absl::CEscape(regexp);
+
+ RE2 re(regexp, RE2::Latin1);
+ ASSERT_EQ(re.error(), "");
+
+ std::string min, max;
+ if(!re.PossibleMatchRange(&min, &max, 10)) {
+ // There's no good max for "\\C*". Can't use strcmp
+ // because sometimes it gets embedded in more
+ // complicated expressions.
+ if(strstr(regexp.c_str(), "\\C*"))
+ return;
+ LOG(QFATAL) << "PossibleMatchRange failed on: " << absl::CEscape(regexp);
+ }
+
+ strgen_.Reset();
+ while (strgen_.HasNext()) {
+ absl::string_view s = strgen_.Next();
+ tests_++;
+ if (!RE2::FullMatch(s, re))
+ continue;
+ ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
+ ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
+ }
+}
+
+TEST(PossibleMatchRange, Exhaustive) {
+ int natom = 3;
+ int noperator = 3;
+ int stringlen = 5;
+ if (RE2_DEBUG_MODE) {
+ natom = 2;
+ noperator = 3;
+ stringlen = 3;
+ }
+ PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
+ RegexpGenerator::EgrepOps(),
+ stringlen, Explode("ab4"));
+ t.Generate();
+ LOG(INFO) << t.regexps() << " regexps, "
+ << t.tests() << " tests";
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/random_test.cc b/third_party/re2/src/re2/testing/random_test.cc
new file mode 100644
index 000000000..d076b39b1
--- /dev/null
+++ b/third_party/re2/src/re2/testing/random_test.cc
@@ -0,0 +1,102 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Random testing of regular expression matching.
+
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "re2/testing/exhaustive_tester.h"
+
+ABSL_FLAG(int, regexpseed, 404, "Random regexp seed.");
+ABSL_FLAG(int, regexpcount, 100, "How many random regexps to generate.");
+ABSL_FLAG(int, stringseed, 200, "Random string seed.");
+ABSL_FLAG(int, stringcount, 100, "How many random strings to generate.");
+
+namespace re2 {
+
+// Runs a random test on the given parameters.
+// (Always uses the same random seeds for reproducibility.
+// Can give different seeds on command line.)
+static void RandomTest(int maxatoms, int maxops,
+ const std::vector<std::string>& alphabet,
+ const std::vector<std::string>& ops,
+ int maxstrlen,
+ const std::vector<std::string>& stralphabet,
+ const std::string& wrapper) {
+ // Limit to smaller test cases in debug mode,
+ // because everything is so much slower.
+ if (RE2_DEBUG_MODE) {
+ maxatoms--;
+ maxops--;
+ maxstrlen /= 2;
+ }
+
+ ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
+ maxstrlen, stralphabet, wrapper, "");
+ t.RandomStrings(absl::GetFlag(FLAGS_stringseed),
+ absl::GetFlag(FLAGS_stringcount));
+ t.GenerateRandom(absl::GetFlag(FLAGS_regexpseed),
+ absl::GetFlag(FLAGS_regexpcount));
+ absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n",
+ t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size());
+ EXPECT_EQ(0, t.failures());
+}
+
+// Tests random small regexps involving literals and egrep operators.
+TEST(Random, SmallEgrepLiterals) {
+ RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(),
+ 15, Explode("abc"),
+ "");
+}
+
+// Tests random bigger regexps involving literals and egrep operators.
+TEST(Random, BigEgrepLiterals) {
+ RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(),
+ 15, Explode("abc"),
+ "");
+}
+
+// Tests random small regexps involving literals, capturing parens,
+// and egrep operators.
+TEST(Random, SmallEgrepCaptures) {
+ RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
+ 15, Explode("abc"),
+ "");
+}
+
+// Tests random bigger regexps involving literals, capturing parens,
+// and egrep operators.
+TEST(Random, BigEgrepCaptures) {
+ RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
+ 15, Explode("abc"),
+ "");
+}
+
+// Tests random large complicated expressions, using all the possible
+// operators, some literals, some parenthesized literals, and predefined
+// character classes like \d. (Adding larger character classes would
+// make for too many possibilities.)
+TEST(Random, Complicated) {
+ std::vector<std::string> ops = Split(" ",
+ "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
+ "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
+ "%s{2} %s{2,} %s{3,4} %s{4,5}");
+
+ // Use (?:\b) and (?:\B) instead of \b and \B,
+ // because PCRE rejects \b* but accepts (?:\b)*.
+ // Ditto ^ and $.
+ std::vector<std::string> atoms = Split(" ",
+ ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
+ "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
+ "a (a) b c - \\\\");
+ std::vector<std::string> alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a");
+ RandomTest(10, 10, atoms, ops, 20, alphabet, "");
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/re2_arg_test.cc b/third_party/re2/src/re2/testing/re2_arg_test.cc
new file mode 100644
index 000000000..4b00be358
--- /dev/null
+++ b/third_party/re2/src/re2/testing/re2_arg_test.cc
@@ -0,0 +1,183 @@
+// Copyright 2005 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This tests to make sure numbers are parsed from strings
+// correctly.
+// Todo: Expand the test to validate strings parsed to the other types
+// supported by RE2::Arg class
+
+#include <stdint.h>
+#include <string.h>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+struct SuccessTable {
+ const char * value_string;
+ int64_t value;
+ bool success[6];
+};
+
+// Test boundary cases for different integral sizes.
+// Specifically I want to make sure that values outside the boundaries
+// of an integral type will fail and that negative numbers will fail
+// for unsigned types. The following table contains the boundaries for
+// the various integral types and has entries for whether or not each
+// type can contain the given value.
+const SuccessTable kSuccessTable[] = {
+// string integer value i16 u16 i32 u32 i64 u64
+// 0 to 2^7-1
+{ "0", 0, { true, true, true, true, true, true }},
+{ "127", 127, { true, true, true, true, true, true }},
+
+// -1 to -2^7
+{ "-1", -1, { true, false, true, false, true, false }},
+{ "-128", -128, { true, false, true, false, true, false }},
+
+// 2^7 to 2^8-1
+{ "128", 128, { true, true, true, true, true, true }},
+{ "255", 255, { true, true, true, true, true, true }},
+
+// 2^8 to 2^15-1
+{ "256", 256, { true, true, true, true, true, true }},
+{ "32767", 32767, { true, true, true, true, true, true }},
+
+// -2^7-1 to -2^15
+{ "-129", -129, { true, false, true, false, true, false }},
+{ "-32768", -32768, { true, false, true, false, true, false }},
+
+// 2^15 to 2^16-1
+{ "32768", 32768, { false, true, true, true, true, true }},
+{ "65535", 65535, { false, true, true, true, true, true }},
+
+// 2^16 to 2^31-1
+{ "65536", 65536, { false, false, true, true, true, true }},
+{ "2147483647", 2147483647, { false, false, true, true, true, true }},
+
+// -2^15-1 to -2^31
+{ "-32769", -32769, { false, false, true, false, true, false }},
+{ "-2147483648", static_cast<int64_t>(0xFFFFFFFF80000000LL),
+ { false, false, true, false, true, false }},
+
+// 2^31 to 2^32-1
+{ "2147483648", 2147483648U, { false, false, false, true, true, true }},
+{ "4294967295", 4294967295U, { false, false, false, true, true, true }},
+
+// 2^32 to 2^63-1
+{ "4294967296", 4294967296LL, { false, false, false, false, true, true }},
+{ "9223372036854775807",
+ 9223372036854775807LL, { false, false, false, false, true, true }},
+
+// -2^31-1 to -2^63
+{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }},
+{ "-9223372036854775808", static_cast<int64_t>(0x8000000000000000LL),
+ { false, false, false, false, true, false }},
+
+// 2^63 to 2^64-1
+{ "9223372036854775808", static_cast<int64_t>(9223372036854775808ULL),
+ { false, false, false, false, false, true }},
+{ "18446744073709551615", static_cast<int64_t>(18446744073709551615ULL),
+ { false, false, false, false, false, true }},
+
+// >= 2^64
+{ "18446744073709551616", 0, { false, false, false, false, false, false }},
+};
+
+const int kNumStrings = ABSL_ARRAYSIZE(kSuccessTable);
+
+// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ
+// macro outside of a TEST block and this seems to be the only way to
+// avoid code duplication. I can also pull off a couple nice tricks
+// using concatenation for the type I'm checking against.
+#define PARSE_FOR_TYPE(type, column) { \
+ type r; \
+ for (int i = 0; i < kNumStrings; ++i) { \
+ RE2::Arg arg(&r); \
+ const char* const p = kSuccessTable[i].value_string; \
+ bool retval = arg.Parse(p, strlen(p)); \
+ bool success = kSuccessTable[i].success[column]; \
+ EXPECT_EQ(retval, success) \
+ << "Parsing '" << p << "' for type " #type " should return " \
+ << success; \
+ if (success) { \
+ EXPECT_EQ(r, (type)kSuccessTable[i].value); \
+ } \
+ } \
+}
+
+TEST(RE2ArgTest, Int16Test) {
+ PARSE_FOR_TYPE(int16_t, 0);
+}
+
+TEST(RE2ArgTest, Uint16Test) {
+ PARSE_FOR_TYPE(uint16_t, 1);
+}
+
+TEST(RE2ArgTest, Int32Test) {
+ PARSE_FOR_TYPE(int32_t, 2);
+}
+
+TEST(RE2ArgTest, Uint32Test) {
+ PARSE_FOR_TYPE(uint32_t, 3);
+}
+
+TEST(RE2ArgTest, Int64Test) {
+ PARSE_FOR_TYPE(int64_t, 4);
+}
+
+TEST(RE2ArgTest, Uint64Test) {
+ PARSE_FOR_TYPE(uint64_t, 5);
+}
+
+TEST(RE2ArgTest, ParseFromTest) {
+#if !defined(_MSC_VER)
+ struct {
+ bool ParseFrom(const char* str, size_t n) {
+ LOG(INFO) << "str = " << str << ", n = " << n;
+ return true;
+ }
+ } obj1;
+ RE2::Arg arg1(&obj1);
+ EXPECT_TRUE(arg1.Parse("one", 3));
+
+ struct {
+ bool ParseFrom(const char* str, size_t n) {
+ LOG(INFO) << "str = " << str << ", n = " << n;
+ return false;
+ }
+ // Ensure that RE2::Arg works even with overloaded ParseFrom().
+ void ParseFrom(const char* str) {}
+ } obj2;
+ RE2::Arg arg2(&obj2);
+ EXPECT_FALSE(arg2.Parse("two", 3));
+#endif
+}
+
+TEST(RE2ArgTest, OptionalDoubleTest) {
+ absl::optional<double> opt;
+ RE2::Arg arg(&opt);
+ EXPECT_TRUE(arg.Parse(NULL, 0));
+ EXPECT_FALSE(opt.has_value());
+ EXPECT_FALSE(arg.Parse("", 0));
+ EXPECT_TRUE(arg.Parse("28.30", 5));
+ EXPECT_TRUE(opt.has_value());
+ EXPECT_EQ(*opt, 28.30);
+}
+
+TEST(RE2ArgTest, OptionalIntWithCRadixTest) {
+ absl::optional<int> opt;
+ RE2::Arg arg = RE2::CRadix(&opt);
+ EXPECT_TRUE(arg.Parse(NULL, 0));
+ EXPECT_FALSE(opt.has_value());
+ EXPECT_FALSE(arg.Parse("", 0));
+ EXPECT_TRUE(arg.Parse("0xb0e", 5));
+ EXPECT_TRUE(opt.has_value());
+ EXPECT_EQ(*opt, 2830);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/re2_test.cc b/third_party/re2/src/re2/testing/re2_test.cc
new file mode 100644
index 000000000..151525f2d
--- /dev/null
+++ b/third_party/re2/src/re2/testing/re2_test.cc
@@ -0,0 +1,1661 @@
+// -*- coding: utf-8 -*-
+// Copyright 2002-2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO: Test extractions for PartialMatch/Consume
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+#include <sys/mman.h>
+#include <unistd.h> /* for sysconf */
+#endif
+
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+TEST(RE2, HexTests) {
+#define ASSERT_HEX(type, value) \
+ do { \
+ type v; \
+ ASSERT_TRUE( \
+ RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \
+ ASSERT_EQ(v, 0x##value); \
+ ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
+ RE2::CRadix(&v))); \
+ ASSERT_EQ(v, 0x##value); \
+ } while (0)
+
+ ASSERT_HEX(short, 2bad);
+ ASSERT_HEX(unsigned short, 2badU);
+ ASSERT_HEX(int, dead);
+ ASSERT_HEX(unsigned int, deadU);
+ ASSERT_HEX(long, 7eadbeefL);
+ ASSERT_HEX(unsigned long, deadbeefUL);
+ ASSERT_HEX(long long, 12345678deadbeefLL);
+ ASSERT_HEX(unsigned long long, cafebabedeadbeefULL);
+
+#undef ASSERT_HEX
+}
+
+TEST(RE2, OctalTests) {
+#define ASSERT_OCTAL(type, value) \
+ do { \
+ type v; \
+ ASSERT_TRUE(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \
+ ASSERT_EQ(v, 0##value); \
+ ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
+ RE2::CRadix(&v))); \
+ ASSERT_EQ(v, 0##value); \
+ } while (0)
+
+ ASSERT_OCTAL(short, 77777);
+ ASSERT_OCTAL(unsigned short, 177777U);
+ ASSERT_OCTAL(int, 17777777777);
+ ASSERT_OCTAL(unsigned int, 37777777777U);
+ ASSERT_OCTAL(long, 17777777777L);
+ ASSERT_OCTAL(unsigned long, 37777777777UL);
+ ASSERT_OCTAL(long long, 777777777777777777777LL);
+ ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL);
+
+#undef ASSERT_OCTAL
+}
+
+TEST(RE2, DecimalTests) {
+#define ASSERT_DECIMAL(type, value) \
+ do { \
+ type v; \
+ ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \
+ ASSERT_EQ(v, value); \
+ ASSERT_TRUE( \
+ RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+ ASSERT_EQ(v, value); \
+ } while (0)
+
+ ASSERT_DECIMAL(short, -1);
+ ASSERT_DECIMAL(unsigned short, 9999);
+ ASSERT_DECIMAL(int, -1000);
+ ASSERT_DECIMAL(unsigned int, 12345U);
+ ASSERT_DECIMAL(long, -10000000L);
+ ASSERT_DECIMAL(unsigned long, 3083324652U);
+ ASSERT_DECIMAL(long long, -100000000000000LL);
+ ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL);
+
+#undef ASSERT_DECIMAL
+}
+
+TEST(RE2, Replace) {
+ struct ReplaceTest {
+ const char *regexp;
+ const char *rewrite;
+ const char *original;
+ const char *single;
+ const char *global;
+ int greplace_count;
+ };
+ static const ReplaceTest tests[] = {
+ { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
+ "\\2\\1ay",
+ "the quick brown fox jumps over the lazy dogs.",
+ "ethay quick brown fox jumps over the lazy dogs.",
+ "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
+ 9 },
+ { "\\w+",
+ "\\0-NOSPAM",
+ "abcd.efghi@google.com",
+ "abcd-NOSPAM.efghi@google.com",
+ "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
+ 4 },
+ { "^",
+ "(START)",
+ "foo",
+ "(START)foo",
+ "(START)foo",
+ 1 },
+ { "^",
+ "(START)",
+ "",
+ "(START)",
+ "(START)",
+ 1 },
+ { "$",
+ "(END)",
+ "",
+ "(END)",
+ "(END)",
+ 1 },
+ { "b",
+ "bb",
+ "ababababab",
+ "abbabababab",
+ "abbabbabbabbabb",
+ 5 },
+ { "b",
+ "bb",
+ "bbbbbb",
+ "bbbbbbb",
+ "bbbbbbbbbbbb",
+ 6 },
+ { "b+",
+ "bb",
+ "bbbbbb",
+ "bb",
+ "bb",
+ 1 },
+ { "b*",
+ "bb",
+ "bbbbbb",
+ "bb",
+ "bb",
+ 1 },
+ { "b*",
+ "bb",
+ "aaaaa",
+ "bbaaaaa",
+ "bbabbabbabbabbabb",
+ 6 },
+ // Check newline handling
+ { "a.*a",
+ "(\\0)",
+ "aba\naba",
+ "(aba)\naba",
+ "(aba)\n(aba)",
+ 2 },
+ { "", NULL, NULL, NULL, NULL, 0 }
+ };
+
+ for (const ReplaceTest* t = tests; t->original != NULL; t++) {
+ std::string one(t->original);
+ ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite));
+ ASSERT_EQ(one, t->single);
+ std::string all(t->original);
+ ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
+ << "Got: " << all;
+ ASSERT_EQ(all, t->global);
+ }
+}
+
+static void TestCheckRewriteString(const char* regexp, const char* rewrite,
+ bool expect_ok) {
+ std::string error;
+ RE2 exp(regexp);
+ bool actual_ok = exp.CheckRewriteString(rewrite, &error);
+ EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
+}
+
+TEST(CheckRewriteString, all) {
+ TestCheckRewriteString("abc", "foo", true);
+ TestCheckRewriteString("abc", "foo\\", false);
+ TestCheckRewriteString("abc", "foo\\0bar", true);
+
+ TestCheckRewriteString("a(b)c", "foo", true);
+ TestCheckRewriteString("a(b)c", "foo\\0bar", true);
+ TestCheckRewriteString("a(b)c", "foo\\1bar", true);
+ TestCheckRewriteString("a(b)c", "foo\\2bar", false);
+ TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
+
+ TestCheckRewriteString("a(b)(c)", "foo\\12", true);
+ TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
+ TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
+}
+
+TEST(RE2, Extract) {
+ std::string s;
+
+ ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
+ ASSERT_EQ(s, "kremvax!boris");
+
+ ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s));
+ ASSERT_EQ(s, "'foo'");
+ // check that false match doesn't overwrite
+ ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s));
+ ASSERT_EQ(s, "'foo'");
+}
+
+TEST(RE2, MaxSubmatchTooLarge) {
+ std::string s;
+ ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s));
+ s = "foo";
+ ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2"));
+ s = "foo";
+ ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2"));
+}
+
+TEST(RE2, Consume) {
+ RE2 r("\\s*(\\w+)"); // matches a word, possibly preceded by whitespace
+ std::string word;
+
+ std::string s(" aaa b!@#$@#$cccc");
+ absl::string_view input(s);
+
+ ASSERT_TRUE(RE2::Consume(&input, r, &word));
+ ASSERT_EQ(word, "aaa") << " input: " << input;
+ ASSERT_TRUE(RE2::Consume(&input, r, &word));
+ ASSERT_EQ(word, "b") << " input: " << input;
+ ASSERT_FALSE(RE2::Consume(&input, r, &word)) << " input: " << input;
+}
+
+TEST(RE2, ConsumeN) {
+ const std::string s(" one two three 4");
+ absl::string_view input(s);
+
+ RE2::Arg argv[2];
+ const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+ // 0 arg
+ EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one".
+
+ // 1 arg
+ std::string word;
+ argv[0] = &word;
+ EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1));
+ EXPECT_EQ("two", word);
+
+ // Multi-args
+ int n;
+ argv[1] = &n;
+ EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2));
+ EXPECT_EQ("three", word);
+ EXPECT_EQ(4, n);
+}
+
+TEST(RE2, FindAndConsume) {
+ RE2 r("(\\w+)"); // matches a word
+ std::string word;
+
+ std::string s(" aaa b!@#$@#$cccc");
+ absl::string_view input(s);
+
+ ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
+ ASSERT_EQ(word, "aaa");
+ ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
+ ASSERT_EQ(word, "b");
+ ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
+ ASSERT_EQ(word, "cccc");
+ ASSERT_FALSE(RE2::FindAndConsume(&input, r, &word));
+
+ // Check that FindAndConsume works without any submatches.
+ // Earlier version used uninitialized data for
+ // length to consume.
+ input = "aaa";
+ ASSERT_TRUE(RE2::FindAndConsume(&input, "aaa"));
+ ASSERT_EQ(input, "");
+}
+
+TEST(RE2, FindAndConsumeN) {
+ const std::string s(" one two three 4");
+ absl::string_view input(s);
+
+ RE2::Arg argv[2];
+ const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+ // 0 arg
+ EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one".
+
+ // 1 arg
+ std::string word;
+ argv[0] = &word;
+ EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1));
+ EXPECT_EQ("two", word);
+
+ // Multi-args
+ int n;
+ argv[1] = &n;
+ EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2));
+ EXPECT_EQ("three", word);
+ EXPECT_EQ(4, n);
+}
+
+TEST(RE2, MatchNumberPeculiarity) {
+ RE2 r("(foo)|(bar)|(baz)");
+ std::string word1;
+ std::string word2;
+ std::string word3;
+
+ ASSERT_TRUE(RE2::PartialMatch("foo", r, &word1, &word2, &word3));
+ ASSERT_EQ(word1, "foo");
+ ASSERT_EQ(word2, "");
+ ASSERT_EQ(word3, "");
+ ASSERT_TRUE(RE2::PartialMatch("bar", r, &word1, &word2, &word3));
+ ASSERT_EQ(word1, "");
+ ASSERT_EQ(word2, "bar");
+ ASSERT_EQ(word3, "");
+ ASSERT_TRUE(RE2::PartialMatch("baz", r, &word1, &word2, &word3));
+ ASSERT_EQ(word1, "");
+ ASSERT_EQ(word2, "");
+ ASSERT_EQ(word3, "baz");
+ ASSERT_FALSE(RE2::PartialMatch("f", r, &word1, &word2, &word3));
+
+ std::string a;
+ ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &a));
+ ASSERT_EQ(a, "");
+}
+
+TEST(RE2, Match) {
+  // Pattern captures "host:port", then host, then port.
+  RE2 host_port("((\\w+):([0-9]+))");
+  absl::string_view captures[4];
+
+  // Text without a host:port pair must not match.
+  absl::string_view text = "zyzzyva";
+  ASSERT_FALSE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED, captures,
+                               ABSL_ARRAYSIZE(captures)));
+
+  // Text containing a pair matches; captures[0] is the overall match.
+  text = "a chrisr:9000 here";
+  ASSERT_TRUE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED, captures,
+                              ABSL_ARRAYSIZE(captures)));
+  ASSERT_EQ(captures[0], "chrisr:9000");
+  ASSERT_EQ(captures[1], "chrisr:9000");
+  ASSERT_EQ(captures[2], "chrisr");
+  ASSERT_EQ(captures[3], "9000");
+
+  // The same extraction through PartialMatch with typed outputs.
+  std::string whole;
+  std::string hostname;
+  int port_number;
+  ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", host_port, &whole,
+                                &hostname, &port_number));
+  ASSERT_EQ(whole, "chrisr:9000");
+  ASSERT_EQ(hostname, "chrisr");
+  ASSERT_EQ(port_number, 9000);
+}
+
+// Matches a fixed domain-name-like regexp against a `size`-character string
+// produced by cycling through the characters of `pattern`.
+static void TestRecursion(int size, const char* pattern) {
+  const size_t period = strlen(pattern);
+  std::string domain;
+  domain.resize(size);
+  for (int pos = 0; pos < size; pos++) {
+    domain[pos] = pattern[pos % period];
+  }
+
+  // The result is ignored; the only requirement is that the match does not
+  // crash due to too much recursion.
+  RE2 domain_re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
+  RE2::FullMatch(domain, domain_re);
+}
+
+// Expects that QuoteMeta(unquoted), compiled as a pattern with `options`,
+// fully matches the original `unquoted` text.
+static void TestQuoteMeta(const std::string& unquoted,
+                          const RE2::Options& options = RE2::DefaultOptions) {
+  const std::string escaped = RE2::QuoteMeta(unquoted);
+  RE2 compiled(escaped, options);
+  const bool matched = RE2::FullMatch(unquoted, compiled);
+  EXPECT_TRUE(matched)
+      << "Unquoted='" << unquoted << "', quoted='" << escaped << "'.";
+}
+
+// Expects that QuoteMeta(unquoted), compiled as a pattern with `options`,
+// does NOT fully match `should_not_match`.
+static void NegativeTestQuoteMeta(
+    const std::string& unquoted, const std::string& should_not_match,
+    const RE2::Options& options = RE2::DefaultOptions) {
+  const std::string escaped = RE2::QuoteMeta(unquoted);
+  RE2 compiled(escaped, options);
+  const bool matched = RE2::FullMatch(should_not_match, compiled);
+  EXPECT_FALSE(matched)
+      << "Unquoted='" << unquoted << "', quoted='" << escaped << "'.";
+}
+
+// Each sample, once meta-quoted and compiled as a pattern, must fully match
+// the original sample text.
+TEST(QuoteMeta, Simple) {
+  static const char* const kSamples[] = {
+    "foo",
+    "foo.bar",
+    "foo\\.bar",
+    "[1-9]",
+    "1.5-2.0?",
+    "\\d",
+    "Who doesn't like ice cream?",
+    "((a|b)c?d*e+[f-h]i)",
+    "((?!)xxx).*yyy",
+    "([",
+  };
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kSamples); i++)
+    TestQuoteMeta(kSamples[i]);
+}
+// A meta-quoted pattern must be literal: it must not match the strings that
+// the unquoted pattern would have matched as a regexp.
+TEST(QuoteMeta, SimpleNegative) {
+  struct Case {
+    const char* unquoted;
+    const char* should_not_match;
+  };
+  static const Case kCases[] = {
+    { "foo", "bar" },
+    { "...", "bar" },
+    { "\\.", "." },
+    { "\\.", ".." },
+    { "(a)", "a" },
+    { "(a|b)", "a" },
+    { "(a|b)", "(a)" },
+    { "(a|b)", "a|b" },
+    { "[0-9]", "0" },
+    { "[0-9]", "0-9" },
+    { "[0-9]", "[9]" },
+    { "((?!)xxx)", "xxx" },
+  };
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kCases); i++)
+    NegativeTestQuoteMeta(kCases[i].unquoted, kCases[i].should_not_match);
+}
+
+TEST(QuoteMeta, Latin1) {
+  // A non-ASCII byte (0xb2) must round-trip under the Latin-1 options.
+  const std::string with_high_byte = "3\xb2 = 9";
+  TestQuoteMeta(with_high_byte, RE2::Latin1);
+}
+
+TEST(QuoteMeta, UTF8) {
+  // Samples that must still match themselves after quoting.
+  static const char* const kSamples[] = {
+    "Plácido Domingo",
+    "xyz",                 // No multi-byte characters at all.
+    "\xc2\xb0",            // Degree symbol, 2 bytes.
+    "27\xc2\xb0 degrees",  // Degree symbol in the middle of the text.
+    "\xe2\x80\xb3",        // Double prime, 3 bytes.
+    "\xf0\x9d\x85\x9f",    // Music note, 4 bytes.
+    "27\xc2\xb0",          // Original note: should still work when
+                           // interpreted as Latin-1.
+  };
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kSamples); i++)
+    TestQuoteMeta(kSamples[i]);
+
+  // The quoted degree symbol must not match text in which each of its two
+  // bytes is preceded by a literal backslash.
+  NegativeTestQuoteMeta("27\xc2\xb0", "27\\\xc2\\\xb0");
+}
+
+TEST(QuoteMeta, HasNull) {
+  // Start with a string holding exactly one NUL character.
+  std::string with_nul(1, '\0');
+  TestQuoteMeta(with_nul);
+  NegativeTestQuoteMeta(with_nul, "");
+
+  // NUL followed by '1' must not end up being interpreted as '\01'.
+  with_nul.push_back('1');
+  TestQuoteMeta(with_nul);
+  NegativeTestQuoteMeta(with_nul, "\1");
+}
+
+TEST(ProgramSize, BigProgram) {
+  // Three patterns of increasing complexity: the forward and the reverse
+  // program sizes are each expected to be positive and strictly increasing.
+  RE2 plain("simple regexp");
+  RE2 wildcard("medium.*regexp");
+  RE2 counted("complex.{1,128}regexp");
+
+  // Forward programs.
+  ASSERT_GT(plain.ProgramSize(), 0);
+  ASSERT_GT(wildcard.ProgramSize(), plain.ProgramSize());
+  ASSERT_GT(counted.ProgramSize(), wildcard.ProgramSize());
+
+  // Reverse programs.
+  ASSERT_GT(plain.ReverseProgramSize(), 0);
+  ASSERT_GT(wildcard.ReverseProgramSize(), plain.ReverseProgramSize());
+  ASSERT_GT(counted.ReverseProgramSize(), wildcard.ReverseProgramSize());
+}
+
+TEST(ProgramFanout, BigProgram) {
+  // The same nested pattern with repeat counts of 1, 10, 100 and 1000.
+  RE2 rep1("(?:(?:(?:(?:(?:.)?){1})*)+)");
+  RE2 rep10("(?:(?:(?:(?:(?:.)?){10})*)+)");
+  RE2 rep100("(?:(?:(?:(?:(?:.)?){100})*)+)");
+  RE2 rep1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");
+
+  // Each call returns the index of the largest non-empty bucket; the
+  // assertion that follows checks how many elements that bucket holds.
+  std::vector<int> buckets;
+
+  // Forward programs.
+  // Largest bucket 3, holding 2 elements.
+  ASSERT_EQ(3, rep1.ProgramFanout(&buckets));
+  ASSERT_EQ(2, buckets[3]);
+
+  // Largest bucket 6, holding 11 elements.
+  ASSERT_EQ(6, rep10.ProgramFanout(&buckets));
+  ASSERT_EQ(11, buckets[6]);
+
+  // Largest bucket 9, holding 101 elements.
+  ASSERT_EQ(9, rep100.ProgramFanout(&buckets));
+  ASSERT_EQ(101, buckets[9]);
+
+  // Largest bucket 13, holding 1001 elements.
+  ASSERT_EQ(13, rep1000.ProgramFanout(&buckets));
+  ASSERT_EQ(1001, buckets[13]);
+
+  // Reverse programs.
+  // Largest bucket 2, holding 2 elements.
+  ASSERT_EQ(2, rep1.ReverseProgramFanout(&buckets));
+  ASSERT_EQ(2, buckets[2]);
+
+  // Largest bucket 5, holding 11 elements.
+  ASSERT_EQ(5, rep10.ReverseProgramFanout(&buckets));
+  ASSERT_EQ(11, buckets[5]);
+
+  // Largest bucket 9, holding 101 elements.
+  ASSERT_EQ(9, rep100.ReverseProgramFanout(&buckets));
+  ASSERT_EQ(101, buckets[9]);
+
+  // Largest bucket 12, holding 1001 elements.
+  ASSERT_EQ(12, rep1000.ReverseProgramFanout(&buckets));
+  ASSERT_EQ(1001, buckets[12]);
+}
+
+// Issue 956519: empty character sets used to cause a NULL dereference.
+// An empty set is obtained by negating a full one; none of these patterns
+// may match anything in "abc".
+TEST(EmptyCharset, Fuzz) {
+  static const char* kEmptySets[] = {
+    "[^\\S\\s]",
+    "[^\\S[:space:]]",
+    "[^\\D\\d]",
+    "[^\\D[:digit:]]"
+  };
+  for (size_t idx = 0; idx < ABSL_ARRAYSIZE(kEmptySets); idx++) {
+    RE2 empty_set(kEmptySets[idx]);
+    ASSERT_FALSE(empty_set.Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
+  }
+}
+
+// Bitstate assumes that kInstFail instructions in alternations or capture
+// groups have been "compiled away". Each pattern below makes an empty
+// character set optional, so it must still match the empty string.
+TEST(EmptyCharset, BitstateAssumptions) {
+  // The leading capture groups are there to trigger use of Bitstate.
+  static const char* kOptionalEmpties[] = {
+    "((((()))))" "[^\\S\\s]?",
+    "((((()))))" "([^\\S\\s])?",
+    "((((()))))" "([^\\S\\s]|[^\\S\\s])?",
+    "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
+  };
+  absl::string_view submatches[6];
+  for (size_t idx = 0; idx < ABSL_ARRAYSIZE(kOptionalEmpties); idx++) {
+    RE2 optional_empty(kOptionalEmpties[idx]);
+    ASSERT_TRUE(
+        optional_empty.Match("", 0, 0, RE2::UNANCHORED, submatches, 6));
+  }
+}
+
+// Checks the group count and the name -> index map reported for patterns
+// without and with named capturing groups.
+TEST(Capture, NamedGroups) {
+  // A single anonymous group: counted, but absent from the name map.
+  {
+    RE2 anonymous_only("(hello world)");
+    ASSERT_EQ(anonymous_only.NumberOfCapturingGroups(), 1);
+    const std::map<std::string, int>& names =
+        anonymous_only.NamedCapturingGroups();
+    ASSERT_EQ(names.size(), 0);
+  }
+
+  // Four named groups mixed with two anonymous ones.
+  {
+    RE2 mixed("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
+    ASSERT_EQ(mixed.NumberOfCapturingGroups(), 6);
+    const std::map<std::string, int>& names = mixed.NamedCapturingGroups();
+    ASSERT_EQ(names.size(), 4);
+    ASSERT_EQ(names.find("A")->second, 1);
+    ASSERT_EQ(names.find("B")->second, 2);
+    ASSERT_EQ(names.find("C")->second, 3);
+    // Groups 4 and 5 are the anonymous ones, so D is group 6.
+    ASSERT_EQ(names.find("D")->second, 6);
+  }
+}
+
+// Matches a pattern with two named groups and checks that the indices
+// reported by NamedCapturingGroups() line up with the captured arguments.
+TEST(RE2, CapturedGroupTest) {
+  RE2 re("directions from (?P<S>.*) to (?P<D>.*)");
+  int num_groups = re.NumberOfCapturingGroups();
+  // Fatal on mismatch: num_groups is passed to FullMatchN below as the
+  // number of entries to use from the four-element matches array.
+  ASSERT_EQ(2, num_groups);
+  std::string args[4];
+  RE2::Arg arg0(&args[0]);
+  RE2::Arg arg1(&args[1]);
+  RE2::Arg arg2(&args[2]);
+  RE2::Arg arg3(&args[3]);
+
+  const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3};
+  EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose",
+                              re, matches, num_groups));
+
+  // Fatal on a missing name: the iterators are dereferenced right after,
+  // and dereferencing end() would be undefined behaviour.
+  const std::map<std::string, int>& named_groups = re.NamedCapturingGroups();
+  const auto source = named_groups.find("S");
+  const auto destination = named_groups.find("D");
+  ASSERT_TRUE(source != named_groups.end());
+  ASSERT_TRUE(destination != named_groups.end());
+
+  // The named group index is 1-based.
+  int source_group_index = source->second;
+  int destination_group_index = destination->second;
+  ASSERT_EQ(1, source_group_index);
+  ASSERT_EQ(2, destination_group_index);
+
+  // The args array is zero-based, hence the -1.
+  EXPECT_EQ("mountain view", args[source_group_index - 1]);
+  EXPECT_EQ("san jose", args[destination_group_index - 1]);
+}
+
+TEST(RE2, FullMatchWithNoArgs) {
+  // FullMatch with no output arguments; the pattern must cover all the text.
+  const char* const kLoosePattern = "h.*o";
+
+  ASSERT_TRUE(RE2::FullMatch("h", "h"));
+  ASSERT_TRUE(RE2::FullMatch("hello", "hello"));
+  ASSERT_TRUE(RE2::FullMatch("hello", kLoosePattern));
+
+  // Extra text in front: the match must be anchored at the start.
+  ASSERT_FALSE(RE2::FullMatch("othello", kLoosePattern));
+  // Extra text at the back: the match must be anchored at the end.
+  ASSERT_FALSE(RE2::FullMatch("hello!", kLoosePattern));
+}
+
+TEST(RE2, PartialMatch) {
+  // Every (text, pattern) pair must match somewhere inside the text.
+  struct Case {
+    const char* text;
+    const char* pattern;
+  };
+  static const Case kCases[] = {
+    { "x", "x" },
+    { "hello", "h.*o" },
+    { "othello", "h.*o" },
+    { "hello!", "h.*o" },
+    { "x", "((((((((((((((((((((x))))))))))))))))))))" },
+  };
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kCases); i++) {
+    ASSERT_TRUE(RE2::PartialMatch(kCases[i].text, kCases[i].pattern));
+  }
+}
+
+TEST(RE2, PartialMatchN) {
+  // Exercises PartialMatchN with zero, one and two output arguments.
+  RE2::Arg slots[2];
+  const RE2::Arg* const slot_ptrs[2] = {&slots[0], &slots[1]};
+
+  // No outputs.
+  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", slot_ptrs, 0));
+  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", slot_ptrs, 0));
+
+  // One integer output.
+  int number;
+  slots[0] = &number;
+  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", slot_ptrs, 1));
+  EXPECT_EQ(1001, number);
+  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", slot_ptrs, 1));
+
+  // Integer plus string output.
+  std::string word;
+  slots[1] = &word;
+  EXPECT_TRUE(
+      RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
+  EXPECT_EQ(42, number);
+  EXPECT_EQ("life", word);
+  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
+}
+
+TEST(RE2, FullMatchZeroArg) {
+  // A digits-only pattern with no outputs must cover an all-digit string.
+  const bool matched = RE2::FullMatch("1001", "\\d+");
+  ASSERT_TRUE(matched);
+}
+
+TEST(RE2, FullMatchOneArg) {
+  // FullMatch with a single int output.
+  int value;
+
+  ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &value));
+  ASSERT_EQ(value, 1001);
+
+  ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &value));
+  ASSERT_EQ(value, -123);
+
+  // The capture group is empty, so there is nothing to parse as an int.
+  ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &value));
+
+  // 40 digits cannot be stored in an int.
+  const char* const kFortyDigits = "1234567890123456789012345678901234567890";
+  ASSERT_FALSE(RE2::FullMatch(kFortyDigits, "(\\d+)", &value));
+}
+
+TEST(RE2, FullMatchIntegerArg) {
+  // The int output must be parsed from the captured digits only, even when
+  // the group is immediately surrounded by other digits.
+  int parsed;
+
+  ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &parsed));
+  ASSERT_EQ(parsed, 23);
+
+  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &parsed));
+  ASSERT_EQ(parsed, 1);
+
+  ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &parsed));
+  ASSERT_EQ(parsed, -1);
+
+  ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &parsed));
+  ASSERT_EQ(parsed, 1);
+
+  ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &parsed));
+  ASSERT_EQ(parsed, -1);
+}
+
+TEST(RE2, FullMatchStringArg) {
+  // A std::string output receives the captured text.
+  std::string middle;
+  const bool matched = RE2::FullMatch("hello", "h(.*)o", &middle);
+  ASSERT_TRUE(matched);
+  ASSERT_EQ(middle, std::string("ell"));
+}
+
+TEST(RE2, FullMatchStringViewArg) {
+  // A string_view output paired with an int output.
+  absl::string_view name;
+  int number;
+  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
+  // Size is checked first so that the 4-byte comparison stays in bounds.
+  ASSERT_EQ(name.size(), 4);
+  ASSERT_TRUE(memcmp(name.data(), "ruby", 4) == 0);
+  ASSERT_EQ(number, 1234);
+}
+
+TEST(RE2, FullMatchMultiArg) {
+  // Two outputs of different types filled by one match.
+  std::string name;
+  int number;
+  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
+  ASSERT_EQ(name, std::string("ruby"));
+  ASSERT_EQ(number, 1234);
+}
+
+TEST(RE2, FullMatchN) {
+  // Exercises FullMatchN with zero, one and two output arguments.
+  RE2::Arg slots[2];
+  const RE2::Arg* const slot_ptrs[2] = {&slots[0], &slots[1]};
+
+  // No outputs.
+  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", slot_ptrs, 0));
+  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", slot_ptrs, 0));
+
+  // One integer output.
+  int number;
+  slots[0] = &number;
+  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", slot_ptrs, 1));
+  EXPECT_EQ(1001, number);
+  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", slot_ptrs, 1));
+
+  // Integer plus string output.
+  std::string word;
+  slots[1] = &word;
+  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
+  EXPECT_EQ(42, number);
+  EXPECT_EQ("life", word);
+  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
+}
+
+TEST(RE2, FullMatchIgnoredArg) {
+  // A null middle argument skips the ":" group; the outputs on either side
+  // are still filled in.
+  std::string name;
+  int number;
+  const char* const kThreeGroups = "(\\w+)(:)(\\d+)";
+
+  // Old-school NULL cast to void*.
+  ASSERT_TRUE(
+      RE2::FullMatch("ruby:1234", kThreeGroups, &name, (void*)NULL, &number));
+  ASSERT_EQ(name, std::string("ruby"));
+  ASSERT_EQ(number, 1234);
+
+  // C++11 nullptr.
+  ASSERT_TRUE(
+      RE2::FullMatch("rubz:1235", kThreeGroups, &name, nullptr, &number));
+  ASSERT_EQ(name, std::string("rubz"));
+  ASSERT_EQ(number, 1235);
+}
+
+TEST(RE2, FullMatchTypedNullArg) {
+  std::string leading;
+  const char* const kSixteenDigits = "1234567890123456";
+  const char* const kDecimal = "123.4567890123456";
+  const char* const kAnything = "(.*)";
+
+  // A typed (non-void*) NULL output is accepted when the captured text
+  // parses as that type.
+  ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (absl::string_view*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("1234", kAnything, (int*)NULL));
+  ASSERT_TRUE(RE2::FullMatch(kSixteenDigits, kAnything, (long long*)NULL));
+  ASSERT_TRUE(RE2::FullMatch(kDecimal, kAnything, (double*)NULL));
+  ASSERT_TRUE(RE2::FullMatch(kDecimal, kAnything, (float*)NULL));
+
+  // The match fails when the captured text does not parse as the type of
+  // the NULL output.
+  ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &leading, (char*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("hello", kAnything, (int*)NULL));
+  ASSERT_FALSE(RE2::FullMatch(kSixteenDigits, kAnything, (int*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("hello", kAnything, (double*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("hello", kAnything, (float*)NULL));
+}
+
+// Check that numeric parsing code does not read past the end of
+// the number being parsed.
+// This implementation requires mmap(2) et al. and thus cannot
+// be used unless they are available; without them the test body is empty.
+TEST(RE2, NULTerminated) {
+#if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0
+ char *v;
+ int x;
+ long pagesize = sysconf(_SC_PAGE_SIZE);
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+ // Map two pages, then unmap the second one so that the last byte of the
+ // first page is immediately followed by unmapped memory.
+ v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
+ ASSERT_TRUE(v != reinterpret_cast<char*>(-1));
+ LOG(INFO) << "Memory at " << (void*)v;
+ ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
+ // Place the single digit to parse in the last mapped byte.
+ v[pagesize - 1] = '1';
+
+ x = 0;
+ // Parse a one-character view over that last byte into an int.
+ ASSERT_TRUE(
+ RE2::FullMatch(absl::string_view(v + pagesize - 1, 1), "(.*)", &x));
+ ASSERT_EQ(x, 1);
+#endif
+}
+
+// Checks parsing of captured text into char and fixed-width integer
+// outputs, including the boundary values of each integer type.
+TEST(RE2, FullMatchTypeTests) {
+ // Type tests
+ // 1000 leading zeros, prepended below to check that they are tolerated.
+ std::string zeros(1000, '0');
+ {
+ // char: the single captured character is stored.
+ char c;
+ ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
+ ASSERT_EQ(c, 'H');
+ }
+ {
+ // unsigned char: same as above.
+ unsigned char c;
+ ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
+ ASSERT_EQ(c, static_cast<unsigned char>('H'));
+ }
+ {
+ // int16_t: -32768..32767 parse; one past either end fails the match.
+ int16_t v;
+ ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
+ ASSERT_TRUE(RE2::FullMatch("32767", "(-?\\d+)", &v)); ASSERT_EQ(v, 32767);
+ ASSERT_TRUE(RE2::FullMatch("-32768", "(-?\\d+)", &v)); ASSERT_EQ(v, -32768);
+ ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v));
+ ASSERT_FALSE(RE2::FullMatch("32768", "(-?\\d+)", &v));
+ }
+ {
+ // uint16_t: up to 65535 parses; 65536 fails the match.
+ uint16_t v;
+ ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("32767", "(\\d+)", &v)); ASSERT_EQ(v, 32767);
+ ASSERT_TRUE(RE2::FullMatch("65535", "(\\d+)", &v)); ASSERT_EQ(v, 65535);
+ ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v));
+ }
+ {
+ // int32_t: boundary values, leading zeros, and C-style radix prefixes.
+ int32_t v;
+ static const int32_t max = INT32_C(0x7fffffff);
+ static const int32_t min = -max - 1;
+ ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
+ ASSERT_TRUE(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); ASSERT_EQ(v, max);
+ ASSERT_TRUE(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); ASSERT_EQ(v, min);
+ ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
+ ASSERT_FALSE(RE2::FullMatch("2147483648", "(-?\\d+)", &v));
+
+ // Leading zeros must not change the parsed value or the range check.
+ ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
+ ASSERT_EQ(v, max);
+ ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
+ ASSERT_EQ(v, min);
+
+ ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
+ // CRadix accepts a 0x prefix, but not one preceded by extra zeros.
+ ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
+ ASSERT_EQ(v, max);
+ ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
+ }
+ {
+ // uint32_t: up to 4294967295 parses; one more, or a negative, fails.
+ uint32_t v;
+ static const uint32_t max = UINT32_C(0xffffffff);
+ ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max);
+ ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v));
+ ASSERT_FALSE(RE2::FullMatch("-1", "(\\d+)", &v));
+
+ ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max);
+ }
+ {
+ // int64_t: boundary strings are produced with std::to_string.
+ int64_t v;
+ static const int64_t max = INT64_C(0x7fffffffffffffff);
+ static const int64_t min = -max - 1;
+ std::string str;
+
+ ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
+
+ str = std::to_string(max);
+ ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max);
+
+ str = std::to_string(min);
+ ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, min);
+
+ // Bump the last digit to get a value one past max; the ASSERT_NE
+ // guards against the increment turning '9' into a non-digit.
+ str = std::to_string(max);
+ ASSERT_NE(str.back(), '9');
+ str.back()++;
+ ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v));
+
+ // Same trick on the text of min, giving a value one below min.
+ str = std::to_string(min);
+ ASSERT_NE(str.back(), '9');
+ str.back()++;
+ ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v));
+ }
+ {
+ // uint64_t: the negative case is parsed into the signed v2 instead.
+ uint64_t v;
+ int64_t v2;
+ static const uint64_t max = UINT64_C(0xffffffffffffffff);
+ std::string str;
+
+ ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2)); ASSERT_EQ(v2, -100);
+
+ str = std::to_string(max);
+ ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max);
+
+ // One past max must fail the match.
+ ASSERT_NE(str.back(), '9');
+ str.back()++;
+ ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v));
+ }
+}
+
+// Checks parsing of captured text into float and double outputs.
+TEST(RE2, FloatingPointFullMatchTypes) {
+ // 1000 leading zeros, prepended below to check that they are tolerated.
+ std::string zeros(1000, '0');
+ {
+ // float
+ float v;
+ ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100);
+ ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, float(1e23));
+ // A leading space is accepted.
+ ASSERT_TRUE(RE2::FullMatch(" 100", "(.*)", &v)); ASSERT_EQ(v, 100);
+
+ ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+ ASSERT_EQ(v, float(1e23));
+
+ // 6700000000081920.1 is an edge case.
+ // 6700000000081920 is exactly halfway between
+ // two float32s, so the .1 should make it round up.
+ // However, the .1 is outside the precision possible with
+ // a float64: the nearest float64 is 6700000000081920.
+ // So if the code uses strtod and then converts to float32,
+ // round-to-even will make it round down instead of up.
+ // To pass the test, the parser must call strtof directly.
+ // This test case is carefully chosen to use only a 17-digit
+ // number, since C does not guarantee to get the correctly
+ // rounded answer for strtod and strtof unless the input is
+ // short.
+ //
+ // This is known to fail on Cygwin and MinGW due to a broken
+ // implementation of strtof(3). And apparently MSVC too. Sigh.
+#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+ ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v));
+ ASSERT_EQ(v, 0.1f) << absl::StrFormat("%.8g != %.8g", v, 0.1f);
+ ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
+ ASSERT_EQ(v, 6700000000081920.1f)
+ << absl::StrFormat("%.8g != %.8g", v, 6700000000081920.1f);
+#endif
+ }
+ {
+ // double
+ double v;
+ ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100);
+ ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100);
+ ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, 1e23);
+ ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+ ASSERT_EQ(v, double(1e23));
+
+ ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v));
+ ASSERT_EQ(v, 0.1) << absl::StrFormat("%.17g != %.17g", v, 0.1);
+ // An 18-significant-digit input is expected to compare equal to the
+ // 17-digit literal below.
+ ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
+ ASSERT_EQ(v, 1.0000000596046448)
+ << absl::StrFormat("%.17g != %.17g", v, 1.0000000596046448);
+ }
+}
+
+TEST(RE2, FullMatchAnchored) {
+  // FullMatch is anchored at both ends: stray text before or after the
+  // digits fails unless the pattern accounts for it.
+  int number;
+
+  ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &number));
+  ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &number));
+
+  ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &number));
+  ASSERT_EQ(number, 1001);
+
+  ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &number));
+  ASSERT_EQ(number, 1001);
+}
+
+TEST(RE2, FullMatchBraces) {
+  // {5,} demands at least five characters from the class.
+  const char* const kAtLeastFive = "[0-9a-f+.-]{5,}";
+  ASSERT_TRUE(RE2::FullMatch("0abcd", kAtLeastFive));    // exactly five
+  ASSERT_TRUE(RE2::FullMatch("0abcde", kAtLeastFive));   // six
+  ASSERT_FALSE(RE2::FullMatch("0abc", kAtLeastFive));    // only four
+}
+
+TEST(RE2, Complicated) {
+  // A top-level alternation of literals and a character class.
+  const char* const kAlternation = "foo|bar|[A-Z]";
+  ASSERT_TRUE(RE2::FullMatch("foo", kAlternation));
+  ASSERT_TRUE(RE2::FullMatch("bar", kAlternation));
+  ASSERT_TRUE(RE2::FullMatch("X", kAlternation));
+  // Two capitals exceed what the single-character class can cover.
+  ASSERT_FALSE(RE2::FullMatch("XY", kAlternation));
+}
+
+TEST(RE2, FullMatchEnd) {
+  // Full-match handling needs a '$' tacked on internally; check that it
+  // applies to the whole alternation, with or without an explicit '$'.
+  const char* const kImplicitEnd = "fo|foo";
+  const char* const kExplicitEnd = "fo|foo$";
+
+  ASSERT_TRUE(RE2::FullMatch("fo", kImplicitEnd));
+  ASSERT_TRUE(RE2::FullMatch("foo", kImplicitEnd));
+  ASSERT_TRUE(RE2::FullMatch("fo", kExplicitEnd));
+  ASSERT_TRUE(RE2::FullMatch("foo", kExplicitEnd));
+  ASSERT_TRUE(RE2::FullMatch("foo", "foo$"));
+  ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$"));
+  ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar"));
+
+  // Disabled on purpose. Enable it if the handling of '$' is changed to
+  // stop it from matching before a trailing newline.
+  if (false) {
+    // Guards against pcre-style special handling, where '$' matches ahead
+    // of a '\n' at the end of the string.
+    ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$"));
+  }
+}
+
+// Checks FullMatch with 0 through 7 and with 16 output arguments. Each
+// capture group is a single digit, so output k of the 1-based digit
+// string "1234567890123456" must hold (k + 1) % 10.
+TEST(RE2, FullMatchArgCount) {
+  int a[16];
+  ASSERT_TRUE(RE2::FullMatch("", ""));
+
+  // Every case first clears the WHOLE array with sizeof(a). The previous
+  // sizeof(0) cleared only sizeof(int) bytes, i.e. a[0], so values left
+  // over from an earlier case could satisfy a later assertion.
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0]));
+  ASSERT_EQ(a[0], 1);
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1]));
+  for (int k = 0; k < 2; k++)
+    ASSERT_EQ(a[k], k + 1) << "k=" << k;
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2]));
+  for (int k = 0; k < 3; k++)
+    ASSERT_EQ(a[k], k + 1) << "k=" << k;
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
+                             &a[2], &a[3]));
+  for (int k = 0; k < 4; k++)
+    ASSERT_EQ(a[k], k + 1) << "k=" << k;
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
+                             &a[2], &a[3], &a[4]));
+  for (int k = 0; k < 5; k++)
+    ASSERT_EQ(a[k], k + 1) << "k=" << k;
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0],
+                             &a[1], &a[2], &a[3], &a[4], &a[5]));
+  for (int k = 0; k < 6; k++)
+    ASSERT_EQ(a[k], k + 1) << "k=" << k;
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+                             &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6]));
+  for (int k = 0; k < 7; k++)
+    ASSERT_EQ(a[k], k + 1) << "k=" << k;
+
+  memset(a, 0, sizeof(a));
+  ASSERT_TRUE(RE2::FullMatch("1234567890123456",
+                             "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
+                             "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+                             &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+                             &a[7], &a[8], &a[9], &a[10], &a[11], &a[12],
+                             &a[13], &a[14], &a[15]));
+  // The digits wrap after 9: a[9] is 0, a[10] is 1, ..., a[15] is 6.
+  for (int k = 0; k < 16; k++)
+    ASSERT_EQ(a[k], (k + 1) % 10) << "k=" << k;
+}
+
+TEST(RE2, Accessors) {
+  // pattern() must return the text the object was constructed from.
+  {
+    const std::string source_text = "http://([^/]+)/.*";
+    const RE2 compiled(source_text);
+    ASSERT_EQ(source_text, compiled.pattern());
+  }
+
+  // For a valid pattern: empty error text, ok(), and the NoError code.
+  {
+    RE2 valid("foo");
+    ASSERT_TRUE(valid.error().empty());
+    ASSERT_TRUE(valid.ok());
+    ASSERT_EQ(valid.error_code(), RE2::NoError);
+  }
+}
+
+TEST(RE2, UTF8) {
+  // Three Japanese characters (nihongo), three UTF-8 bytes each.
+  const char utf8_string[] =
+      "\xe6\x97\xa5"   // U+65E5
+      "\xe6\x9c\xac"   // U+672C
+      "\xe8\xaa\x9e";  // U+8A9E
+  // The middle character with one '.' on either side.
+  const char utf8_pattern[] =
+      "."
+      "\xe6\x9c\xac"   // U+672C
+      ".";
+
+  // Nine dots in byte (Latin-1) mode and three dots in UTF-8 mode both
+  // cover the whole string.
+  RE2 nine_bytes(".........", RE2::Latin1);
+  ASSERT_TRUE(RE2::FullMatch(utf8_string, nine_bytes));
+  RE2 three_chars("...");
+  ASSERT_TRUE(RE2::FullMatch(utf8_string, three_chars));
+
+  // '.' captures one byte in Latin-1 mode and one whole character in
+  // UTF-8 mode.
+  std::string captured;
+  RE2 one_byte("(.)", RE2::Latin1);
+  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_byte, &captured));
+  ASSERT_EQ(captured, std::string("\xe6"));
+  RE2 one_char("(.)");
+  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_char, &captured));
+  ASSERT_EQ(captured, std::string("\xe6\x97\xa5"));
+
+  // The string used as a pattern matches itself in either mode.
+  RE2 self_latin1(utf8_string, RE2::Latin1);
+  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_latin1));
+  RE2 self_utf8(utf8_string);
+  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_utf8));
+
+  // The dotted pattern covers the string only in UTF-8 mode.
+  RE2 dotted_latin1(utf8_pattern, RE2::Latin1);
+  ASSERT_FALSE(RE2::FullMatch(utf8_string, dotted_latin1));
+  RE2 dotted_utf8(utf8_pattern);
+  ASSERT_TRUE(RE2::FullMatch(utf8_string, dotted_utf8));
+}
+
+TEST(RE2, UngreedyUTF8) {
+  // Ungreedy UTF-8 regular expressions must not match when they oughtn't;
+  // see bug 82246. "a aX" contains a space, so \w+X cannot cover it.
+  const std::string text = "a aX";
+
+  // Greedy form: this always worked.
+  {
+    const char* greedy = "\\w+X";
+    RE2 latin1_re(greedy, RE2::Latin1);
+    RE2 utf8_re(greedy);
+
+    ASSERT_FALSE(RE2::FullMatch(text, latin1_re));
+    ASSERT_FALSE(RE2::FullMatch(text, utf8_re));
+  }
+
+  // Ungreedy form, enabled with the (?U) flag.
+  {
+    const char* ungreedy = "(?U)\\w+X";
+    RE2 latin1_re(ungreedy, RE2::Latin1);
+    ASSERT_EQ(latin1_re.error(), "");
+    RE2 utf8_re(ungreedy);
+
+    ASSERT_FALSE(RE2::FullMatch(text, latin1_re));
+    ASSERT_FALSE(RE2::FullMatch(text, utf8_re));
+  }
+}
+
+TEST(RE2, Rejects) {
+  // Each pattern is expected to fail compilation; RE2::Quiet is passed so
+  // that nothing is logged for the expected failures.
+  static const char* const kBadPatterns[] = {
+    "a\\1",
+    "a[x",
+    "a[z-a]",
+    "a[[:foobar:]]",
+    "a(b",
+    "a\\",
+  };
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kBadPatterns); i++) {
+    RE2 re(kBadPatterns[i], RE2::Quiet);
+    ASSERT_FALSE(re.ok());
+  }
+}
+
+TEST(RE2, NoCrash) {
+  // Matching with a regexp that failed to compile must return false
+  // rather than crash.
+  {
+    RE2 broken("a\\", RE2::Quiet);
+    ASSERT_FALSE(broken.ok());
+    ASSERT_FALSE(RE2::PartialMatch("a\\b", broken));
+  }
+
+  // An enormous regexp is rejected, and using it must not crash either.
+  {
+    RE2 enormous("(((.{100}){100}){100}){100}", RE2::Quiet);
+    ASSERT_FALSE(enormous.ok());
+    ASSERT_FALSE(RE2::PartialMatch("aaa", enormous));
+  }
+
+  // A large but acceptable counted repetition compiles and matches.
+  {
+    RE2 counted(".{512}x", RE2::Quiet);
+    ASSERT_TRUE(counted.ok());
+    std::string haystack(515, 'c');
+    haystack += "x";
+    ASSERT_TRUE(RE2::PartialMatch(haystack, counted));
+  }
+}
+
+TEST(RE2, Recursion) {
+  // PCRE-legacy test that recursion is stopped -- there's no recursion in
+  // RE2.
+  const int kBytes = 15 * 1024;  // enough to crash PCRE
+  static const char* const kFillers[] = { ".", "a", "a.", "ab.", "abc." };
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kFillers); i++)
+    TestRecursion(kBytes, kFillers[i]);
+}
+
+TEST(RE2, BigCountedRepetition) {
+  // Counted repetition must work when given tons of memory (256 MiB).
+  RE2::Options generous;
+  generous.set_max_mem(256<<20);
+
+  RE2 counted(".{512}x", generous);
+  ASSERT_TRUE(counted.ok());
+
+  // 515 filler characters followed by the terminating 'x'.
+  std::string haystack(515, 'c');
+  haystack += "x";
+  ASSERT_TRUE(RE2::PartialMatch(haystack, counted));
+}
+
+TEST(RE2, DeepRecursion) {
+  // PCRE-legacy test for deep stack recursion: before pcre was patched
+  // this would fail with a segmentation violation due to stack overflow.
+  // RE2 doesn't recurse.
+  //
+  // The input is "x*", 131072 'a' characters, then "*x".
+  const std::string comment = "x*" + std::string(131072, 'a') + "*x";
+  RE2 comment_re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
+  ASSERT_TRUE(RE2::FullMatch(comment, comment_re));
+}
+
+// Suggested by Josh Hyman. Used to fail when SearchOnePass did not
+// implement case-folding.
+TEST(CaseInsensitive, MatchAndConsume) {
+  const char* const kFiveLetters = "(?i)([wand]{5})";
+  std::string sentence = "A fish named *Wanda*";
+  absl::string_view unconsumed(sentence);
+  absl::string_view captured;
+  EXPECT_TRUE(RE2::PartialMatch(sentence, kFiveLetters, &captured));
+  EXPECT_TRUE(RE2::FindAndConsume(&unconsumed, kFiveLetters, &captured));
+}
+
+// The pattern argument must accept std::string, string_view, const char*
+// and C string literals through implicit conversion to RE2.
+TEST(RE2, ImplicitConversions) {
+  std::string as_string(".");
+  absl::string_view as_view(".");
+  const char* as_c_string = ".";
+  EXPECT_TRUE(RE2::PartialMatch("e", as_string));
+  EXPECT_TRUE(RE2::PartialMatch("e", as_view));
+  EXPECT_TRUE(RE2::PartialMatch("e", as_c_string));
+  EXPECT_TRUE(RE2::PartialMatch("e", "."));
+}
+
+// Regression tests for bugs introduced by 8622304.
+TEST(RE2, CL8622304) {
+  // Reported by ingow: the same match with and without an output. The
+  // variant with an output was the one reported as failing.
+  std::string captured_char;
+  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));
+  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &captured_char));
+
+  // Reported by jacobsa: extract the first key and its value list.
+  std::string name;
+  std::string value;
+  const char* const kSettings =
+      "bar:1,0x2F,030,4,5;baz:true;fooby:false,true";
+  EXPECT_TRUE(RE2::PartialMatch(kSettings,
+                                "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
+                                &name, &value));
+  EXPECT_EQ(name, "bar");
+  EXPECT_EQ(value, "1,0x2F,030,4,5");
+}
+
+// Check that RE2 returns correct regexp pieces on error.
+// In particular, make sure it returns whole runes
+// and that it always reports invalid UTF-8.
+// Also check that Perl error flag piece is big enough.
+static struct ErrorTest {
+ const char *regexp; // pattern that is expected to fail compilation
+ RE2::ErrorCode error_code; // expected value of error_code()
+ const char *error_arg; // expected value of error_arg()
+} error_tests[] = {
+ { "ab\\αcd", RE2::ErrorBadEscape, "\\α" },
+ { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" },
+ { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" },
+ { "ij\\x1", RE2::ErrorBadEscape, "\\x1" },
+ { "kl\\x", RE2::ErrorBadEscape, "\\x" },
+ { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" },
+ { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" },
+ // This used to return "(?s", but the offending flag is the X.
+ { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" },
+ { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" },
+ { "bb[abc", RE2::ErrorMissingBracket, "[abc" },
+ { "abc(def", RE2::ErrorMissingParen, "abc(def" },
+ { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" },
+
+ // No argument string is returned for invalid UTF-8 (byte \377).
+ { "mn\\x1\377", RE2::ErrorBadUTF8, "" },
+ { "op\377qr", RE2::ErrorBadUTF8, "" },
+ { "st\\x{00000\377", RE2::ErrorBadUTF8, "" },
+ { "zz\\p{\377}", RE2::ErrorBadUTF8, "" },
+ { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" },
+ { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" },
+};
+// Compiles every entry of error_tests quietly and compares the reported
+// error code and error argument with the expected ones.
+TEST(RE2, ErrorCodeAndArg) {
+  for (size_t idx = 0; idx < ABSL_ARRAYSIZE(error_tests); idx++) {
+    const ErrorTest& expected = error_tests[idx];
+    RE2 re(expected.regexp, RE2::Quiet);
+    EXPECT_FALSE(re.ok());
+    EXPECT_EQ(re.error_code(), expected.error_code) << re.error();
+    EXPECT_EQ(re.error_arg(), expected.error_arg) << re.error();
+  }
+}
+
+// Check that "never match \n" mode never matches \n.
+static struct NeverTest {
+ const char* regexp; // pattern, compiled with never_nl set
+ const char* text; // text to search
+ const char* match; // expected first capture, or NULL if no match is expected
+} never_tests[] = {
+ { "(.*)", "abc\ndef\nghi\n", "abc" },
+ { "(?s)(abc.*def)", "abc\ndef\n", NULL },
+ { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
+ { "(abc[^x]*def)", "abc\ndef\n", NULL },
+ { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
+};
+// Runs every entry of never_tests with the never_nl option switched on.
+TEST(RE2, NeverNewline) {
+  RE2::Options no_newline;
+  no_newline.set_never_nl(true);
+  for (size_t idx = 0; idx < ABSL_ARRAYSIZE(never_tests); idx++) {
+    const NeverTest& expected = never_tests[idx];
+    RE2 re(expected.regexp, no_newline);
+    if (expected.match != NULL) {
+      // A match is expected, and its first capture is compared.
+      absl::string_view captured;
+      EXPECT_TRUE(RE2::PartialMatch(expected.text, re, &captured));
+      EXPECT_EQ(captured, expected.match);
+    } else {
+      // No match is expected at all.
+      EXPECT_FALSE(RE2::PartialMatch(expected.text, re));
+    }
+  }
+}
+
+// Checks the dot_nl option and how it combines with (?-s) and never_nl.
+TEST(RE2, DotNL) {
+  RE2::Options opt;
+  opt.set_dot_nl(true);
+
+  // With dot_nl, '.' matches a newline ...
+  RE2 dot(".", opt);
+  EXPECT_TRUE(RE2::PartialMatch("\n", dot));
+  // ... unless the pattern turns that off again with (?-s).
+  RE2 dot_without_s("(?-s).", opt);
+  EXPECT_FALSE(RE2::PartialMatch("\n", dot_without_s));
+
+  // With never_nl added on top, '.' no longer matches a newline.
+  opt.set_never_nl(true);
+  RE2 dot_never_nl(".", opt);
+  EXPECT_FALSE(RE2::PartialMatch("\n", dot_never_nl));
+}
+
+// Check that there are no capturing groups in "never capture" mode.
+TEST(RE2, NeverCapture) {
+ RE2::Options opt;
+ opt.set_never_capture(true);
+ RE2 re("(r)(e)", opt);  // two parenthesized groups in the pattern text
+ EXPECT_EQ(0, re.NumberOfCapturingGroups());  // yet zero groups are expected to be reported
+}
+
+// Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
+// Triggered by a failed DFA search falling back to Bitstate when
+// using Match with a NULL submatch set. Bitstate tried to read
+// the submatch[0] entry even if nsubmatch was 0.
+TEST(RE2, BitstateCaptureBug) {
+ RE2::Options opt;
+ opt.set_max_mem(20000);  // presumably small enough to make the DFA search fail -- TODO confirm
+ RE2 re("(_________$)", opt);
+ absl::string_view s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";  // the final 'x' follows the underscores, so '$' cannot match there
+ EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));  // NULL submatch array with nsubmatch == 0
+}
+
+// C++ version of bug 609710.
+TEST(RE2, UnicodeClasses) {
+ const std::string str = "ABCDEFGHI譚永鋒";  // nine ASCII capitals followed by three CJK characters
+ std::string a, b, c;  // receive the three captures in the PartialMatch calls below
+
+ EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));  // "A": expected in L and Lu, not in Ll
+ EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));  // \P{...} expectations are the negation of \p{...}
+ EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));  // each CJK character: expected in L, but in neither Lu nor Ll
+ EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
+ EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
+ EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
+ EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
+ EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
+ EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));  // non-greedy gaps: the first three characters are expected
+ EXPECT_EQ("A", a);
+ EXPECT_EQ("B", b);
+ EXPECT_EQ("C", c);
+
+ EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));  // same, with \p{L} inside a bracket class
+ EXPECT_EQ("A", a);
+ EXPECT_EQ("B", b);
+ EXPECT_EQ("C", c);
+
+ EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));  // no non-letter is expected anywhere in str
+
+ EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
+ EXPECT_EQ("A", a);
+ EXPECT_EQ("B", b);
+ EXPECT_EQ("C", c);
+
+ EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));  // every character of str is expected to be Lu or Lo
+
+ EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));  // leading greedy .*: the last three characters are expected
+ EXPECT_EQ("譚", a);
+ EXPECT_EQ("永", b);
+ EXPECT_EQ("鋒", c);
+}
+
+TEST(RE2, LazyRE2) {
+ // Test with and without options.
+ static LazyRE2 a = {"a"};  // pattern only
+ static LazyRE2 b = {"b", RE2::Latin1};  // pattern plus an explicit Latin-1 option
+
+ EXPECT_EQ("a", a->pattern());
+ EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding());  // no option given: UTF-8 is expected
+
+ EXPECT_EQ("b", b->pattern());
+ EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding());
+}
+
+// Bug reported by saito. 2009/02/17
+TEST(RE2, NullVsEmptyString) {
+ RE2 re(".*");
+ EXPECT_TRUE(re.ok());
+
+ absl::string_view null;  // default-constructed view
+ EXPECT_TRUE(RE2::FullMatch(null, re));
+
+ absl::string_view empty("");  // view over an empty string literal
+ EXPECT_TRUE(RE2::FullMatch(empty, re));  // both views are expected to match ".*"
+}
+
+// Similar to the previous test, check that the null string and the empty
+// string both match, but also that the null string can only provide null
+// submatches whereas the empty string can also provide empty submatches.
+TEST(RE2, NullVsEmptyStringSubmatches) {
+ RE2 re("()|(foo)");
+ EXPECT_TRUE(re.ok());
+
+ // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
+ absl::string_view matches[4];
+
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
+ matches[i] = "bar";  // non-null, non-empty filler; every entry is checked after Match below
+
+ absl::string_view null;
+ EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
+ matches, ABSL_ARRAYSIZE(matches)));
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) {
+ EXPECT_TRUE(matches[i].data() == NULL); // always null
+ EXPECT_TRUE(matches[i].empty());
+ }
+
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
+ matches[i] = "bar";  // reset the filler before the second Match
+
+ absl::string_view empty("");
+ EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
+ matches, ABSL_ARRAYSIZE(matches)));
+ EXPECT_TRUE(matches[0].data() != NULL); // empty, not null
+ EXPECT_TRUE(matches[0].empty());
+ EXPECT_TRUE(matches[1].data() != NULL); // empty, not null
+ EXPECT_TRUE(matches[1].empty());
+ EXPECT_TRUE(matches[2].data() == NULL);  // (foo) did not participate: null expected
+ EXPECT_TRUE(matches[2].empty());
+ EXPECT_TRUE(matches[3].data() == NULL);  // no such group in the pattern: null expected
+ EXPECT_TRUE(matches[3].empty());
+}
+
+// Issue 1816809
+TEST(RE2, Bug1816809) {
+ RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");  // deeply nested groups around "llx-3"/"llx4" items separated by ';'
+ absl::string_view piece("llx-3;llx4");
+ std::string x;  // receives the first capture
+ EXPECT_TRUE(RE2::Consume(&piece, re, &x));
+}
+
+// Issue 3061120
+TEST(RE2, Bug3061120) {
+ RE2 re("(?i)\\W");  // case-insensitive "non-word character"
+ EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked
+ EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin
+ EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s
+}
+
+TEST(RE2, CapturingGroupNames) {
+ // Opening parentheses annotated with group IDs:
+ //      12    3        45   6         7
+ RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
+ EXPECT_TRUE(re.ok());
+ const std::map<int, std::string>& have = re.CapturingGroupNames();
+ std::map<int, std::string> want;  // only the named groups are expected in the map
+ want[3] = "G2";
+ want[6] = "G2";  // the name G2 is used by both group 3 and group 6
+ want[7] = "G1";
+ EXPECT_EQ(want, have);
+}
+
+TEST(RE2, RegexpToStringLossOfAnchor) {
+ EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at");  // POSIX option: "^" expected back unchanged
+ EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at");  // default options: anchor expected as (?-m:^)
+ EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$");  // same pair of checks for "$"
+ EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)");
+}
+
+// Issue 10131674
+TEST(RE2, Bug10131674) {
+ // Some of these escapes describe values that do not fit in a byte.
+ RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1);  // \440 and \656 are the octal escapes above 0377
+ EXPECT_FALSE(re.ok());
+ EXPECT_FALSE(RE2::FullMatch("hello world", re));  // matching against the failed RE2 is expected to return false
+}
+
+TEST(RE2, Bug18391750) {
+ // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer.
+ const char t[] = {  // raw bytes used below as both the pattern and the text
+ (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08,
+ (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5,
+ (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69,
+ (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31,
+ (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29,
+ (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00,  // final 0x00 terminates the C string
+ };
+ RE2::Options opt;
+ opt.set_encoding(RE2::Options::EncodingLatin1);
+ opt.set_longest_match(true);
+ opt.set_dot_nl(true);
+ opt.set_case_sensitive(false);
+ RE2 re(t, opt);
+ ASSERT_TRUE(re.ok());
+ RE2::PartialMatch(t, re);  // return value is not checked; the call itself is the test
+}
+
+TEST(RE2, Bug18458852) {
+ // Bug in parser accepting invalid (too large) rune,
+ // causing compiler to fail in DCHECK in UTF-8
+ // character class code.
+ const char b[] = {  // "(\x05\x05AA($[^\xf5\x87\x87\x90)]))" as raw bytes
+ (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28,
+ (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87,
+ (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00,
+ };
+ RE2 re(b);
+ ASSERT_FALSE(re.ok());  // the pattern is expected to be rejected
+}
+
+TEST(RE2, Bug18523943) {
+ // Bug in BitState: case kFailInst failed the match entirely.
+
+ RE2::Options opt;
+ const char a[] = {  // text: "))$"
+ (char)0x29, (char)0x29, (char)0x24, (char)0x00,
+ };
+ const char b[] = {  // pattern: "(\n**)"
+ (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00,
+ };
+ opt.set_log_errors(false);
+ opt.set_encoding(RE2::Options::EncodingLatin1);
+ opt.set_posix_syntax(true);
+ opt.set_longest_match(true);
+ opt.set_literal(false);
+ opt.set_never_nl(true);
+
+ RE2 re((const char*)b, opt);
+ ASSERT_TRUE(re.ok());
+ std::string s1;  // receives the first capture
+ ASSERT_TRUE(RE2::PartialMatch((const char*)a, re, &s1));
+}
+
+TEST(RE2, Bug21371806) {
+ // Bug in parser accepting Unicode groups in Latin-1 mode,
+ // causing compiler to fail in DCHECK in prog.cc.
+
+ RE2::Options opt;
+ opt.set_encoding(RE2::Options::EncodingLatin1);
+
+ RE2 re("g\\p{Zl}]", opt);  // \p{Zl} Unicode group used under the Latin-1 encoding
+ ASSERT_TRUE(re.ok());  // the pattern is expected to compile
+}
+
+TEST(RE2, Bug26356109) {
+ // Bug in parser caused by factoring of common prefixes in alternations.
+
+ // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would
+ // consume "ab" and then stop (when unanchored) whereas it should consume all
+ // of "abc" as per first-match semantics.
+ RE2 re("a\\C*?c|a\\C*?b");
+ ASSERT_TRUE(re.ok());
+
+ std::string s = "abc";
+ absl::string_view m;  // receives the overall match (nsubmatch == 1)
+
+ ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
+ ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'";
+
+ ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1));  // same expectation when anchored at both ends
+ ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'";
+}
+
+TEST(RE2, Issue104) {
+ // RE2::GlobalReplace always advanced by one byte when the empty string was
+ // matched, which would clobber any rune that is longer than one byte.
+
+ std::string s = "bc";  // one-byte characters
+ ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d"));  // expected result shows an insertion before 'b', before 'c' and at the end
+ ASSERT_EQ("dbdcd", s);
+
+ s = "ąć";  // two-byte UTF-8 characters
+ ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ"));
+ ASSERT_EQ("ĈąĈćĈ", s);
+
+ s = "人类";  // three-byte UTF-8 characters
+ ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小"));
+ ASSERT_EQ("小人小类小", s);
+}
+
+TEST(RE2, Issue310) {
+ // (?:|a)* matched more text than (?:|a)+ did.
+
+ std::string s = "aaa";
+ absl::string_view m;  // receives the overall match (nsubmatch == 1)
+
+ RE2 star("(?:|a)*");
+ ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
+ ASSERT_EQ(m, "") << " got m='" << m << "', want ''";  // the empty alternative comes first, so an empty match is expected
+
+ RE2 plus("(?:|a)+");
+ ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
+ ASSERT_EQ(m, "") << " got m='" << m << "', want ''";  // and the same for the + form
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/regexp_benchmark.cc b/third_party/re2/src/re2/testing/regexp_benchmark.cc
new file mode 100644
index 000000000..5352b3101
--- /dev/null
+++ b/third_party/re2/src/re2/testing/regexp_benchmark.cc
@@ -0,0 +1,1569 @@
+// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Benchmarks for regular expression implementations.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
+#include "benchmark/benchmark.h"
+#include "util/logging.h"
+#include "util/malloc_counter.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "util/pcre.h"
+
+namespace re2 {
+void Test();
+void MemoryUsage();
+} // namespace re2
+
+typedef testing::MallocCounter MallocCounter;
+
+namespace re2 {
+
+void Test() {  // sanity check: parse, compile and run a one-pass search on a phone number
+ Regexp* re = Regexp::Parse("(\\d+)-(\\d+)-(\\d+)", Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ CHECK(prog->CanBitState());
+ const char* text = "650-253-0001";
+ absl::string_view sp[4];  // sp[0] is the whole match, sp[1..3] the three digit groups
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));  // text is passed for both string_view arguments
+ CHECK_EQ(sp[0], "650-253-0001");
+ CHECK_EQ(sp[1], "650");
+ CHECK_EQ(sp[2], "253");
+ CHECK_EQ(sp[3], "0001");
+ delete prog;  // prog is released with delete, re with Decref()
+ re->Decref();
+ LOG(INFO) << "test passed\n";
+}
+
+void MemoryUsage() {  // prints MallocCounter heap-growth figures for several engines to stderr
+ const char* regexp = "(\\d+)-(\\d+)-(\\d+)";
+ const char* text = "650-253-0001";
+ {  // section 1: Regexp parse, Prog compile, one-pass search
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly,
+ // because LOG(INFO) might do a big allocation before they get evaluated.
+ absl::FPrintF(stderr, "Regexp: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ mc.Reset();  // reset between steps so each figure covers one step only
+
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ CHECK(prog->CanBitState());
+ absl::FPrintF(stderr, "Prog: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ mc.Reset();
+
+ absl::string_view sp[4];
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ absl::FPrintF(stderr, "Search: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ delete prog;
+ re->Decref();
+ }
+
+ {  // section 2: PCRE object held by value
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+
+ PCRE re(regexp, PCRE::UTF8);
+ absl::FPrintF(stderr, "RE: %7d bytes (peak=%d)\n",  // NOTE(review): label says "RE" although the object is a PCRE -- confirm whether intended
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ PCRE::FullMatch(text, re);
+ absl::FPrintF(stderr, "RE: %7d bytes (peak=%d)\n",  // no Reset() in between: second figure is cumulative
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ }
+
+ {  // section 3: PCRE object allocated with new
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+
+ PCRE* re = new PCRE(regexp, PCRE::UTF8);
+ absl::FPrintF(stderr, "PCRE*: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ PCRE::FullMatch(text, *re);
+ absl::FPrintF(stderr, "PCRE*: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ delete re;
+ }
+
+ {  // section 4: RE2 object held by value
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+
+ RE2 re(regexp);
+ absl::FPrintF(stderr, "RE2: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ RE2::FullMatch(text, re);
+ absl::FPrintF(stderr, "RE2: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
+ }
+
+ absl::FPrintF(stderr, "sizeof: PCRE=%d RE2=%d Prog=%d Inst=%d\n",  // finally, the static object sizes
+ sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst));
+}
+
+int NumCPUs() {  // Thread-count upper bound passed to ThreadRange(1, ...); never less than 1.
+  const unsigned int n = std::thread::hardware_concurrency();  // 0 means "not computable"
+  return n == 0 ? 1 : static_cast<int>(n);
+}
+// Regular expression implementation wrappers.
+// Defined at bottom of file, but they are repetitive
+// and not interesting.
+
+typedef void SearchImpl(benchmark::State& state, const char* regexp,  // function type shared by every Search* wrapper declared below
+ absl::string_view text, Prog::Anchor anchor,
+ bool expect_match);
+
+SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE,  // declarations only
+ SearchRE2, SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass,
+ SearchCachedBitState, SearchCachedPCRE, SearchCachedRE2;
+
+typedef void ParseImpl(benchmark::State& state, const char* regexp,  // function type shared by every Parse* wrapper declared below
+ absl::string_view text);
+
+ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, Parse1PCRE, Parse1RE2,
+ Parse1Backtrack, Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState,
+ Parse1CachedPCRE, Parse1CachedRE2, Parse1CachedBacktrack;
+
+ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState, Parse3PCRE, Parse3RE2,
+ Parse3Backtrack, Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState,
+ Parse3CachedPCRE, Parse3CachedRE2, Parse3CachedBacktrack;
+
+ParseImpl SearchParse2CachedPCRE, SearchParse2CachedRE2;
+
+ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2;
+
+// Benchmark: failed search for regexp in random text.
+
+// Generate random text that won't contain the search string,
+// to test worst-case search behavior.
+std::string RandomText(int64_t nbytes) {
+ static const std::string* const text = []() {  // function-local static: the buffer is built once and never deleted
+ std::string* text = new std::string;
+ srand(1);  // fixed seed
+ text->resize(16<<20);
+ for (int64_t i = 0; i < 16<<20; i++) {
+ // Generate a one-byte rune that isn't a control character (e.g. '\n').
+ // Clipping to 0x20 introduces some bias, but we don't need uniformity.
+ int byte = rand() & 0x7F;  // NOTE(review): 0x7F (DEL) is not clipped and can still be produced
+ if (byte < 0x20)
+ byte = 0x20;
+ (*text)[i] = byte;
+ }
+ return text;
+ }();
+ CHECK_LE(nbytes, 16<<20);  // at most the 16<<20 bytes generated above may be requested
+ return text->substr(0, nbytes);  // returns a copy of the first nbytes bytes
+}
+
+// Builds state.range(0) bytes of random text and hands it to `search`, which
+// looks for regexp in it unanchored, with no match expected.
+void Search(benchmark::State& state, const char* regexp, SearchImpl* search) {
+  const std::string haystack = RandomText(state.range(0));
+  (*search)(state, regexp, haystack, Prog::kUnanchored, false);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// These three are easy because they begin with literal prefixes,
+// giving the search loop something to use for prefix acceleration.
+#define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
+#define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
+#define EASY2 "(?i)" EASY0
+
+// This is a little harder, since it starts with a character class
+// and thus can't be memchr'ed. Could look for ABC and work backward,
+// but no one does that.
+#define MEDIUM "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
+
+// This is a fair amount harder, because of the leading [ -~]*.
+// A bad backtracking implementation will take O(text^2) time to
+// figure out there's no match.
+#define HARD "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
+
+// This has quite a high degree of fanout.
+// NFA execution will be particularly slow.
+#define FANOUT "(?:[\\x{80}-\\x{10FFFF}]?){100}[\\x{80}-\\x{10FFFF}]"
+
+// This stresses engines that are trying to track parentheses.
+#define PARENS "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \
+ "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
+
+void Search_Easy0_CachedDFA(benchmark::State& state) { Search(state, EASY0, SearchCachedDFA); }  // EASY0 pattern, one wrapper per engine
+void Search_Easy0_CachedNFA(benchmark::State& state) { Search(state, EASY0, SearchCachedNFA); }
+void Search_Easy0_CachedPCRE(benchmark::State& state) { Search(state, EASY0, SearchCachedPCRE); }
+void Search_Easy0_CachedRE2(benchmark::State& state) { Search(state, EASY0, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy0_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy0_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10 rather than 16<<20
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Easy0_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy0_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Easy1_CachedDFA(benchmark::State& state) { Search(state, EASY1, SearchCachedDFA); }  // EASY1 pattern, one wrapper per engine
+void Search_Easy1_CachedNFA(benchmark::State& state) { Search(state, EASY1, SearchCachedNFA); }
+void Search_Easy1_CachedPCRE(benchmark::State& state) { Search(state, EASY1, SearchCachedPCRE); }
+void Search_Easy1_CachedRE2(benchmark::State& state) { Search(state, EASY1, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy1_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10 rather than 16<<20
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Easy2_CachedDFA(benchmark::State& state) { Search(state, EASY2, SearchCachedDFA); }  // EASY2 pattern ("(?i)" + EASY0), one wrapper per engine
+void Search_Easy2_CachedNFA(benchmark::State& state) { Search(state, EASY2, SearchCachedNFA); }
+void Search_Easy2_CachedPCRE(benchmark::State& state) { Search(state, EASY2, SearchCachedPCRE); }
+void Search_Easy2_CachedRE2(benchmark::State& state) { Search(state, EASY2, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy2_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy2_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10 rather than 16<<20
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Easy2_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy2_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Medium_CachedDFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedDFA); }  // MEDIUM pattern, one wrapper per engine
+void Search_Medium_CachedNFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedNFA); }
+void Search_Medium_CachedPCRE(benchmark::State& state) { Search(state, MEDIUM, SearchCachedPCRE); }
+void Search_Medium_CachedRE2(benchmark::State& state) { Search(state, MEDIUM, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Medium_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Medium_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Medium_CachedPCRE, 8, 256<<10)->ThreadRange(1, NumCPUs());  // here PCRE is also limited to 256<<10
+#endif
+BENCHMARK_RANGE(Search_Medium_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Hard_CachedDFA(benchmark::State& state) { Search(state, HARD, SearchCachedDFA); }  // HARD pattern, one wrapper per engine
+void Search_Hard_CachedNFA(benchmark::State& state) { Search(state, HARD, SearchCachedNFA); }
+void Search_Hard_CachedPCRE(benchmark::State& state) { Search(state, HARD, SearchCachedPCRE); }
+void Search_Hard_CachedRE2(benchmark::State& state) { Search(state, HARD, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Hard_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Hard_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Hard_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs());  // PCRE range stops at 4<<10
+#endif
+BENCHMARK_RANGE(Search_Hard_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Fanout_CachedDFA(benchmark::State& state) { Search(state, FANOUT, SearchCachedDFA); }  // FANOUT pattern, one wrapper per engine
+void Search_Fanout_CachedNFA(benchmark::State& state) { Search(state, FANOUT, SearchCachedNFA); }
+void Search_Fanout_CachedPCRE(benchmark::State& state) { Search(state, FANOUT, SearchCachedPCRE); }
+void Search_Fanout_CachedRE2(benchmark::State& state) { Search(state, FANOUT, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Fanout_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Fanout_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Fanout_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs());  // PCRE range stops at 4<<10
+#endif
+BENCHMARK_RANGE(Search_Fanout_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Parens_CachedDFA(benchmark::State& state) { Search(state, PARENS, SearchCachedDFA); }  // PARENS pattern, one wrapper per engine
+void Search_Parens_CachedNFA(benchmark::State& state) { Search(state, PARENS, SearchCachedNFA); }
+void Search_Parens_CachedPCRE(benchmark::State& state) { Search(state, PARENS, SearchCachedPCRE); }
+void Search_Parens_CachedRE2(benchmark::State& state) { Search(state, PARENS, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Parens_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Parens_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 256<<10
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Parens_CachedPCRE, 8, 8)->ThreadRange(1, NumCPUs());  // PCRE is run at the single size 8 only
+#endif
+BENCHMARK_RANGE(Search_Parens_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void SearchBigFixed(benchmark::State& state, SearchImpl* search) {
+  // Text: range(0)/2 copies of 'x' then range(0)/2 random bytes; pattern: "^" + that 'x' run + ".*$".
+  const int64_t half = state.range(0)/2;
+  const std::string prefix(half, 'x');
+  const std::string regexp = "^" + prefix + ".*$";
+  const std::string text = prefix + RandomText(half);
+  (*search)(state, regexp.c_str(), text, Prog::kUnanchored, true);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+void Search_BigFixed_CachedDFA(benchmark::State& state) { SearchBigFixed(state, SearchCachedDFA); }  // SearchBigFixed driver, one wrapper per engine
+void Search_BigFixed_CachedNFA(benchmark::State& state) { SearchBigFixed(state, SearchCachedNFA); }
+void Search_BigFixed_CachedPCRE(benchmark::State& state) { SearchBigFixed(state, SearchCachedPCRE); }
+void Search_BigFixed_CachedRE2(benchmark::State& state) { SearchBigFixed(state, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_BigFixed_CachedDFA, 8, 1<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_BigFixed_CachedNFA, 8, 32<<10)->ThreadRange(1, NumCPUs());  // NFA range stops at 32<<10 rather than 1<<20
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPUs());  // PCRE range also stops at 32<<10
+#endif
+BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: FindAndConsume
+
+void FindAndConsume(benchmark::State& state) {  // random text with "Hello World" appended at the end
+  const std::string haystack = RandomText(state.range(0)) + "Hello World";
+  // The doubled parentheses make the first capture equal to the whole match.
+  const RE2 re("((Hello World))");
+  for (auto _ : state) {
+    absl::string_view t = haystack;  // fresh view per iteration; it is passed by pointer below
+    absl::string_view u;
+    CHECK(RE2::FindAndConsume(&t, re, &u));
+    CHECK_EQ(u, "Hello World");
+  }
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: successful anchored search.
+// Runs `search` anchored over state.range(0) bytes of random text, expecting a match.
+void SearchSuccess(benchmark::State& state, const char* regexp,
+                   SearchImpl* search) {
+  const std::string haystack = RandomText(state.range(0));
+  (*search)(state, regexp, haystack, Prog::kAnchored, true);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// Unambiguous search (RE2 can use OnePass).
+
+void Search_Success_DFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchDFA); }  // non-cached engines on ".*$"
+void Search_Success_NFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchNFA); }
+void Search_Success_PCRE(benchmark::State& state) { SearchSuccess(state, ".*$", SearchPCRE); }
+void Search_Success_RE2(benchmark::State& state) { SearchSuccess(state, ".*$", SearchRE2); }
+void Search_Success_OnePass(benchmark::State& state) { SearchSuccess(state, ".*$", SearchOnePass); }
+
+BENCHMARK_RANGE(Search_Success_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Success_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());  // OnePass range stops at 2<<20 rather than 16<<20
+
+void Search_Success_CachedDFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedDFA); }  // cached counterparts of the wrappers above
+void Search_Success_CachedNFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedNFA); }
+void Search_Success_CachedPCRE(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedPCRE); }
+void Search_Success_CachedRE2(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedRE2); }
+void Search_Success_CachedOnePass(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedOnePass); }
+
+BENCHMARK_RANGE(Search_Success_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Success_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());  // OnePass range stops at 2<<20 rather than 16<<20
+
+// Ambiguous search (RE2 cannot use OnePass).
+// Used to be ".*.$", but that is coalesced to ".+$" these days.
+
+void Search_Success1_DFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchDFA); }  // non-cached engines on ".*\C$"
+void Search_Success1_NFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchNFA); }
+void Search_Success1_PCRE(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchPCRE); }
+void Search_Success1_RE2(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchRE2); }
+void Search_Success1_BitState(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchBitState); }  // BitState takes the place OnePass has in the unambiguous group
+
+BENCHMARK_RANGE(Search_Success1_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Success1_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success1_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs());  // BitState range stops at 2<<20 rather than 16<<20
+
+void Search_Success1_CachedDFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedDFA); }  // cached counterparts of the wrappers above
+void Search_Success1_CachedNFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedNFA); }
+void Search_Success1_CachedPCRE(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedPCRE); }
+void Search_Success1_CachedRE2(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedRE2); }
+void Search_Success1_CachedBitState(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedBitState); }
+
+BENCHMARK_RANGE(Search_Success1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_Success1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedBitState, 8, 2<<20)->ThreadRange(1, NumCPUs());  // BitState range stops at 2<<20 rather than 16<<20
+
+// Benchmark: AltMatch optimisation (just to verify that it works)
+// Note that OnePass doesn't implement it!
+// Runs `search` anchored with the pattern "\\C*" over random text, expecting a match.
+void SearchAltMatch(benchmark::State& state, SearchImpl* search) {
+  const std::string haystack = RandomText(state.range(0));
+  (*search)(state, "\\C*", haystack, Prog::kAnchored, true);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+void Search_AltMatch_DFA(benchmark::State& state) { SearchAltMatch(state, SearchDFA); }  // non-cached engines
+void Search_AltMatch_NFA(benchmark::State& state) { SearchAltMatch(state, SearchNFA); }
+void Search_AltMatch_OnePass(benchmark::State& state) { SearchAltMatch(state, SearchOnePass); }
+void Search_AltMatch_BitState(benchmark::State& state) { SearchAltMatch(state, SearchBitState); }
+void Search_AltMatch_PCRE(benchmark::State& state) { SearchAltMatch(state, SearchPCRE); }
+void Search_AltMatch_RE2(benchmark::State& state) { SearchAltMatch(state, SearchRE2); }
+
+BENCHMARK_RANGE(Search_AltMatch_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());  // every engine in this group uses the full 8..16<<20 range
+BENCHMARK_RANGE(Search_AltMatch_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_OnePass, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_BitState, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_AltMatch_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_AltMatch_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_AltMatch_CachedDFA(benchmark::State& state) { SearchAltMatch(state, SearchCachedDFA); }  // cached counterparts of the wrappers above
+void Search_AltMatch_CachedNFA(benchmark::State& state) { SearchAltMatch(state, SearchCachedNFA); }
+void Search_AltMatch_CachedOnePass(benchmark::State& state) { SearchAltMatch(state, SearchCachedOnePass); }
+void Search_AltMatch_CachedBitState(benchmark::State& state) { SearchAltMatch(state, SearchCachedBitState); }
+void Search_AltMatch_CachedPCRE(benchmark::State& state) { SearchAltMatch(state, SearchCachedPCRE); }
+void Search_AltMatch_CachedRE2(benchmark::State& state) { SearchAltMatch(state, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_AltMatch_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_CachedOnePass, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_CachedBitState, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK_RANGE(Search_AltMatch_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: use regexp to find phone number.
+// Anchored search of a fixed phone number; one item is counted per iteration.
+void SearchDigits(benchmark::State& state, SearchImpl* search) {
+  const absl::string_view phone("650-253-0001");
+  (*search)(state, "([0-9]+)-([0-9]+)-([0-9]+)", phone, Prog::kAnchored, true);
+  state.SetItemsProcessed(state.iterations());
+}
+
+void Search_Digits_DFA(benchmark::State& state) { SearchDigits(state, SearchDFA); }  // SearchDigits driver, one wrapper per engine
+void Search_Digits_NFA(benchmark::State& state) { SearchDigits(state, SearchNFA); }
+void Search_Digits_OnePass(benchmark::State& state) { SearchDigits(state, SearchOnePass); }
+void Search_Digits_PCRE(benchmark::State& state) { SearchDigits(state, SearchPCRE); }
+void Search_Digits_RE2(benchmark::State& state) { SearchDigits(state, SearchRE2); }
+void Search_Digits_BitState(benchmark::State& state) { SearchDigits(state, SearchBitState); }
+
+BENCHMARK(Search_Digits_DFA)->ThreadRange(1, NumCPUs());  // plain BENCHMARK: the input is fixed, so there is no size range
+BENCHMARK(Search_Digits_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Search_Digits_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE  // the PCRE benchmark is registered only when USEPCRE is defined
+BENCHMARK(Search_Digits_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Search_Digits_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs());
+
+// Benchmark: use regexp to parse digit fields in phone number.
+// parse3 receives the three-group pattern and the text "650-253-0001";
+// one item is counted per benchmark iteration.
+
+void Parse3Digits(benchmark::State& state, ParseImpl* parse3) {
+  (*parse3)(state, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001");
+  state.SetItemsProcessed(state.iterations());
+}
+
+// One adapter per (uncached) engine for the Parse3Digits benchmark.
+void Parse_Digits_NFA(benchmark::State& state) {
+  Parse3Digits(state, Parse3NFA);
+}
+void Parse_Digits_OnePass(benchmark::State& state) {
+  Parse3Digits(state, Parse3OnePass);
+}
+void Parse_Digits_PCRE(benchmark::State& state) {
+  Parse3Digits(state, Parse3PCRE);
+}
+void Parse_Digits_RE2(benchmark::State& state) {
+  Parse3Digits(state, Parse3RE2);
+}
+void Parse_Digits_Backtrack(benchmark::State& state) {
+  Parse3Digits(state, Parse3Backtrack);
+}
+void Parse_Digits_BitState(benchmark::State& state) {
+  Parse3Digits(state, Parse3BitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK(Parse_Digits_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Digits_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_Digits_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_Digits_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Digits_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Digits_BitState)->ThreadRange(1, NumCPUs());
+
+// One adapter per cached engine for the Parse3Digits benchmark.
+void Parse_CachedDigits_NFA(benchmark::State& state) {
+  Parse3Digits(state, Parse3CachedNFA);
+}
+void Parse_CachedDigits_OnePass(benchmark::State& state) {
+  Parse3Digits(state, Parse3CachedOnePass);
+}
+void Parse_CachedDigits_PCRE(benchmark::State& state) {
+  Parse3Digits(state, Parse3CachedPCRE);
+}
+void Parse_CachedDigits_RE2(benchmark::State& state) {
+  Parse3Digits(state, Parse3CachedRE2);
+}
+void Parse_CachedDigits_Backtrack(benchmark::State& state) {
+  Parse3Digits(state, Parse3CachedBacktrack);
+}
+void Parse_CachedDigits_BitState(benchmark::State& state) {
+  Parse3Digits(state, Parse3CachedBitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK(Parse_CachedDigits_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigits_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedDigits_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedDigits_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigits_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs());
+
+// Same as Parse3Digits but spells the digit class as \d: hands the pattern
+// and a fixed phone number to `parse3`, then reports one item per iteration.
+void Parse3DigitDs(benchmark::State& state,
+                   void (*parse3)(benchmark::State&, const char*,
+                                  absl::string_view)) {
+  const char* const kRegexp = "(\\d+)-(\\d+)-(\\d+)";
+  const absl::string_view kPhone = "650-253-0001";
+  parse3(state, kRegexp, kPhone);
+  state.SetItemsProcessed(state.iterations());
+}
+
+// One adapter per (uncached) engine for the Parse3DigitDs benchmark.
+// Every adapter here uses the uncached Parse3XXX routine; the cached
+// routines belong to the Parse_CachedDigitDs_* family.
+void Parse_DigitDs_NFA(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3NFA);
+}
+void Parse_DigitDs_OnePass(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3OnePass);
+}
+void Parse_DigitDs_PCRE(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3PCRE);
+}
+void Parse_DigitDs_RE2(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3RE2);
+}
+// Previously called Parse3CachedBacktrack, which made this benchmark a
+// duplicate of Parse_CachedDigitDs_Backtrack.
+void Parse_DigitDs_Backtrack(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3Backtrack);
+}
+// Previously called Parse3CachedBitState, which made this benchmark a
+// duplicate of Parse_CachedDigitDs_BitState.
+void Parse_DigitDs_BitState(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3BitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK(Parse_DigitDs_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_DigitDs_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_DigitDs_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_DigitDs_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_DigitDs_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_DigitDs_BitState)->ThreadRange(1, NumCPUs());
+
+// One adapter per cached engine for the Parse3DigitDs benchmark.
+void Parse_CachedDigitDs_NFA(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3CachedNFA);
+}
+void Parse_CachedDigitDs_OnePass(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3CachedOnePass);
+}
+void Parse_CachedDigitDs_PCRE(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3CachedPCRE);
+}
+void Parse_CachedDigitDs_RE2(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3CachedRE2);
+}
+void Parse_CachedDigitDs_Backtrack(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3CachedBacktrack);
+}
+void Parse_CachedDigitDs_BitState(benchmark::State& state) {
+  Parse3DigitDs(state, Parse3CachedBitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK(Parse_CachedDigitDs_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigitDs_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedDigitDs_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedDigitDs_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigitDs_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs());
+
+// Benchmark: splitting off leading number field.
+
+// Hands a "digits, dash, captured rest" pattern and a fixed phone number to
+// `parse1`, then reports one item per iteration.
+void Parse1Split(benchmark::State& state,
+                 void (*parse1)(benchmark::State&, const char*,
+                                absl::string_view)) {
+  const char* const kRegexp = "[0-9]+-(.*)";
+  const absl::string_view kPhone = "650-253-0001";
+  parse1(state, kRegexp, kPhone);
+  state.SetItemsProcessed(state.iterations());
+}
+
+// One adapter per (uncached) engine for the Parse1Split benchmark.
+void Parse_Split_NFA(benchmark::State& state) {
+  Parse1Split(state, Parse1NFA);
+}
+void Parse_Split_OnePass(benchmark::State& state) {
+  Parse1Split(state, Parse1OnePass);
+}
+void Parse_Split_PCRE(benchmark::State& state) {
+  Parse1Split(state, Parse1PCRE);
+}
+void Parse_Split_RE2(benchmark::State& state) {
+  Parse1Split(state, Parse1RE2);
+}
+void Parse_Split_BitState(benchmark::State& state) {
+  Parse1Split(state, Parse1BitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK(Parse_Split_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Split_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_Split_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_Split_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Split_BitState)->ThreadRange(1, NumCPUs());
+
+// One adapter per cached engine for the Parse1Split benchmark.
+void Parse_CachedSplit_NFA(benchmark::State& state) {
+  Parse1Split(state, Parse1CachedNFA);
+}
+void Parse_CachedSplit_OnePass(benchmark::State& state) {
+  Parse1Split(state, Parse1CachedOnePass);
+}
+void Parse_CachedSplit_PCRE(benchmark::State& state) {
+  Parse1Split(state, Parse1CachedPCRE);
+}
+void Parse_CachedSplit_RE2(benchmark::State& state) {
+  Parse1Split(state, Parse1CachedRE2);
+}
+void Parse_CachedSplit_BitState(benchmark::State& state) {
+  Parse1Split(state, Parse1CachedBitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK(Parse_CachedSplit_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplit_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplit_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplit_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs());
+
+// Benchmark: splitting off leading number field but harder (ambiguous regexp).
+
+// Like Parse1Split, but the separator is '.' (any character) instead of a
+// literal '-'. Hands pattern and text to `run`, then reports one item per
+// iteration.
+void Parse1SplitHard(benchmark::State& state,
+                     void (*run)(benchmark::State&, const char*,
+                                 absl::string_view)) {
+  const char* const kRegexp = "[0-9]+.(.*)";
+  const absl::string_view kPhone = "650-253-0001";
+  run(state, kRegexp, kPhone);
+  state.SetItemsProcessed(state.iterations());
+}
+
+// One adapter per (uncached) engine for the Parse1SplitHard benchmark.
+void Parse_SplitHard_NFA(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1NFA);
+}
+void Parse_SplitHard_PCRE(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1PCRE);
+}
+void Parse_SplitHard_RE2(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1RE2);
+}
+void Parse_SplitHard_BitState(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1BitState);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK(Parse_SplitHard_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_SplitHard_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_SplitHard_BitState)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_SplitHard_NFA)->ThreadRange(1, NumCPUs());
+
+// One adapter per cached engine for the Parse1SplitHard benchmark.
+void Parse_CachedSplitHard_NFA(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1CachedNFA);
+}
+void Parse_CachedSplitHard_PCRE(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1CachedPCRE);
+}
+void Parse_CachedSplitHard_RE2(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1CachedRE2);
+}
+void Parse_CachedSplitHard_BitState(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1CachedBitState);
+}
+void Parse_CachedSplitHard_Backtrack(benchmark::State& state) {
+  Parse1SplitHard(state, Parse1CachedBacktrack);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitHard_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitHard_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplitHard_BitState)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplitHard_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs());
+
+// Benchmark: Parse1SplitHard, big text, small match.
+
+// Builds 100000 'x' characters followed by a phone number, so the match
+// sits at the very end of a large text, and hands it to `run`.
+// Reports one item per iteration.
+void Parse1SplitBig1(benchmark::State& state,
+                     void (*run)(benchmark::State&, const char*,
+                                 absl::string_view)) {
+  std::string s(100000, 'x');
+  s += "650-253-0001";
+  run(state, "[0-9]+.(.*)", s);
+  state.SetItemsProcessed(state.iterations());
+}
+
+// Adapters for Parse1SplitBig1 using the cached partial-match routines.
+void Parse_CachedSplitBig1_PCRE(benchmark::State& state) {
+  Parse1SplitBig1(state, SearchParse1CachedPCRE);
+}
+void Parse_CachedSplitBig1_RE2(benchmark::State& state) {
+  Parse1SplitBig1(state, SearchParse1CachedRE2);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitBig1_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark: Parse1SplitHard, big text, big match.
+
+// Builds "650-253-" followed by 100000 '0' characters, so the match covers
+// the whole large text, and hands it to `run`.
+// Reports one item per iteration.
+void Parse1SplitBig2(benchmark::State& state,
+                     void (*run)(benchmark::State&, const char*,
+                                 absl::string_view)) {
+  std::string s("650-253-");
+  s.append(100000, '0');
+  run(state, "[0-9]+.(.*)", s);
+  state.SetItemsProcessed(state.iterations());
+}
+
+// Adapters for Parse1SplitBig2 using the cached partial-match routines.
+void Parse_CachedSplitBig2_PCRE(benchmark::State& state) {
+  Parse1SplitBig2(state, SearchParse1CachedPCRE);
+}
+void Parse_CachedSplitBig2_RE2(benchmark::State& state) {
+  Parse1SplitBig2(state, SearchParse1CachedRE2);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitBig2_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitBig2_RE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark: measure time required to parse (but not execute)
+// a simple regular expression.
+
+// Each iteration parses `regexp` with Perl-like flags and releases the result.
+void ParseRegexp(benchmark::State& state, const std::string& regexp) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    re->Decref();
+  }
+}
+
+// Each iteration parses `regexp`, simplifies it, and releases both results.
+void SimplifyRegexp(benchmark::State& state, const std::string& regexp) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Regexp* const sre = re->Simplify();
+    CHECK(sre);
+    sre->Decref();
+    re->Decref();
+  }
+}
+
+// Parses `regexp` once up front; each iteration then only calls NullWalk()
+// on the parsed tree.
+void NullWalkRegexp(benchmark::State& state, const std::string& regexp) {
+  Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+  CHECK(re);
+  for (auto _ : state) {
+    re->NullWalk();
+  }
+  re->Decref();
+}
+
+// Each iteration parses `regexp`, simplifies it, compiles the simplified
+// form to a Prog, and then frees everything.
+void SimplifyCompileRegexp(benchmark::State& state, const std::string& regexp) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Regexp* const sre = re->Simplify();
+    CHECK(sre);
+    Prog* const prog = sre->CompileToProg(0);
+    CHECK(prog);
+    delete prog;
+    sre->Decref();
+    re->Decref();
+  }
+}
+
+// Each iteration parses `regexp`, compiles it to a Prog, and frees both.
+void CompileRegexp(benchmark::State& state, const std::string& regexp) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Parses `regexp` once up front; each iteration then only compiles the
+// parsed tree to a Prog and deletes it.
+void CompileToProg(benchmark::State& state, const std::string& regexp) {
+  Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+  CHECK(re);
+  for (auto _ : state) {
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    delete prog;
+  }
+  re->Decref();
+}
+
+// Parses and compiles `regexp` once up front; each iteration then only
+// calls ComputeByteMap() on the compiled Prog.
+void CompileByteMap(benchmark::State& state, const std::string& regexp) {
+  Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+  CHECK(re);
+  Prog* const prog = re->CompileToProg(0);
+  CHECK(prog);
+  for (auto _ : state) {
+    prog->ComputeByteMap();
+  }
+  delete prog;
+  re->Decref();
+}
+
+// Each iteration constructs a PCRE object (UTF8 option) from `regexp` and
+// checks that it reports no error.
+void CompilePCRE(benchmark::State& state, const std::string& regexp) {
+  for (auto _ : state) {
+    PCRE re(regexp, PCRE::UTF8);
+    CHECK_EQ(re.error(), "");
+  }
+}
+
+// Each iteration constructs an RE2 object from `regexp` and checks that it
+// reports no error.
+void CompileRE2(benchmark::State& state, const std::string& regexp) {
+  for (auto _ : state) {
+    RE2 re(regexp);
+    CHECK_EQ(re.error(), "");
+  }
+}
+
+// Invokes the build routine `run` on `regexp`, then reports one item per
+// iteration.
+void RunBuild(benchmark::State& state, const std::string& regexp,
+              void (*run)(benchmark::State&, const std::string&)) {
+  (*run)(state, regexp);
+  state.SetItemsProcessed(state.iterations());
+}
+
+} // namespace re2
+
+ABSL_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)",
+ "regexp for compile benchmarks");  // read by the BM_* wrappers via absl::GetFlag(FLAGS_compile_regexp)
+
+namespace re2 {
+
+// Adapters for the build benchmarks: each passes the current value of the
+// compile_regexp flag and one build routine to RunBuild.
+void BM_PCRE_Compile(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompilePCRE);
+}
+void BM_Regexp_Parse(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), ParseRegexp);
+}
+void BM_Regexp_Simplify(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), SimplifyRegexp);
+}
+void BM_CompileToProg(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileToProg);
+}
+void BM_CompileByteMap(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileByteMap);
+}
+void BM_Regexp_Compile(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileRegexp);
+}
+void BM_Regexp_SimplifyCompile(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp);
+}
+void BM_Regexp_NullWalk(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), NullWalkRegexp);
+}
+void BM_RE2_Compile(benchmark::State& state) {
+  RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileRE2);
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(BM_Regexp_Parse)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_Simplify)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_CompileToProg)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_CompileByteMap)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_Compile)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs());
+
+// Builds RandomText(state.range(0)) with a phone number appended, then
+// calls `search` to find the phone-number regexp in that text.
+// Reports state.range(0) bytes processed per iteration.
+void SearchPhone(benchmark::State& state, ParseImpl* search) {
+  std::string s = RandomText(state.range(0));
+  s += "(650) 253-0001";
+  search(state, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// Adapters for SearchPhone using the cached two-capture partial-match routines.
+void SearchPhone_CachedPCRE(benchmark::State& state) { SearchPhone(state, SearchParse2CachedPCRE); }
+void SearchPhone_CachedRE2(benchmark::State& state) { SearchPhone(state, SearchParse2CachedRE2); }
+
+// Registration with range arguments 8 .. 16<<20; the PCRE variant is
+// registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK_RANGE(SearchPhone_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+/*
+TODO(rsc): Make this work again.
+void CacheFill(int iters, int n, SearchImpl *srch) {
+ std::string s = DeBruijnString(n+1);
+ std::string t;
+ for (int i = n+1; i < 20; i++) {
+ t = s + s;
+ using std::swap;
+ swap(s, t);
+ }
+ srch(iters, StringPrintf("0[01]{%d}$", n).c_str(), s,
+ Prog::kUnanchored, true);
+ SetBenchmarkBytesProcessed(static_cast<int64_t>(iters)*s.size());
+}
+
+void CacheFillPCRE(int i, int n) { CacheFill(i, n, SearchCachedPCRE); }
+void CacheFillRE2(int i, int n) { CacheFill(i, n, SearchCachedRE2); }
+void CacheFillNFA(int i, int n) { CacheFill(i, n, SearchCachedNFA); }
+void CacheFillDFA(int i, int n) { CacheFill(i, n, SearchCachedDFA); }
+
+// BENCHMARK_WITH_ARG uses __LINE__ to generate distinct identifiers
+// for the static BenchmarkRegisterer, which makes it unusable inside
+// a macro like DO24 below. MY_BENCHMARK_WITH_ARG uses the argument a
+// to make the identifiers distinct (only possible when 'a' is a simple
+// expression like 2, not like 1+1).
+#define MY_BENCHMARK_WITH_ARG(n, a) \
+ bool __benchmark_ ## n ## a = \
+ (new ::testing::Benchmark(#n, NewPermanentCallback(&n)))->ThreadRange(1, NumCPUs());
+
+#define DO24(A, B) \
+ A(B, 1); A(B, 2); A(B, 3); A(B, 4); A(B, 5); A(B, 6); \
+ A(B, 7); A(B, 8); A(B, 9); A(B, 10); A(B, 11); A(B, 12); \
+ A(B, 13); A(B, 14); A(B, 15); A(B, 16); A(B, 17); A(B, 18); \
+ A(B, 19); A(B, 20); A(B, 21); A(B, 22); A(B, 23); A(B, 24);
+
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillPCRE)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillNFA)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillRE2)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA)
+
+#undef DO24
+#undef MY_BENCHMARK_WITH_ARG
+*/
+
+////////////////////////////////////////////////////////////////////////
+//
+// Implementation routines. Sad that there are so many,
+// but all the interfaces are slightly different.
+
+// Runs implementation to search for regexp in text, iters times.
+// Expect_match says whether the regexp should be found.
+// Anchored says whether to run an anchored search.
+
+// DFA search. The regexp is parsed and compiled inside the loop, so every
+// iteration includes that cost. Checks the outcome against expect_match and
+// that the DFA did not report failure.
+void SearchDFA(benchmark::State& state, const char* regexp,
+               absl::string_view text, Prog::Anchor anchor,
+               bool expect_match) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    bool failed = false;
+    CHECK_EQ(prog->SearchDFA(text, absl::string_view(), anchor,
+                             Prog::kFirstMatch, NULL, &failed, NULL),
+             expect_match);
+    CHECK(!failed);
+    delete prog;
+    re->Decref();
+  }
+}
+
+// NFA search. The regexp is parsed and compiled inside the loop, so every
+// iteration includes that cost. Checks the outcome against expect_match.
+void SearchNFA(benchmark::State& state, const char* regexp,
+               absl::string_view text, Prog::Anchor anchor,
+               bool expect_match) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK_EQ(prog->SearchNFA(text, absl::string_view(), anchor,
+                             Prog::kFirstMatch, NULL, 0),
+             expect_match);
+    delete prog;
+    re->Decref();
+  }
+}
+
+// One-pass search. The regexp is parsed and compiled inside the loop, and
+// the program is required to be one-pass. Checks the outcome against
+// expect_match.
+void SearchOnePass(benchmark::State& state, const char* regexp,
+                   absl::string_view text, Prog::Anchor anchor,
+                   bool expect_match) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK(prog->IsOnePass());
+    CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+             expect_match);
+    delete prog;
+    re->Decref();
+  }
+}
+
+// BitState search. The regexp is parsed and compiled inside the loop, and
+// the program is required to satisfy CanBitState(). Checks the outcome
+// against expect_match.
+void SearchBitState(benchmark::State& state, const char* regexp,
+                    absl::string_view text, Prog::Anchor anchor,
+                    bool expect_match) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK(prog->CanBitState());
+    CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+             expect_match);
+    delete prog;
+    re->Decref();
+  }
+}
+
+// PCRE search. The PCRE object is constructed inside the loop. An anchored
+// request uses FullMatch, anything else uses PartialMatch; the outcome is
+// checked against expect_match.
+void SearchPCRE(benchmark::State& state, const char* regexp,
+                absl::string_view text, Prog::Anchor anchor,
+                bool expect_match) {
+  for (auto _ : state) {
+    PCRE re(regexp, PCRE::UTF8);
+    CHECK_EQ(re.error(), "");
+    if (anchor == Prog::kAnchored) {
+      CHECK_EQ(PCRE::FullMatch(text, re), expect_match);
+    } else {
+      CHECK_EQ(PCRE::PartialMatch(text, re), expect_match);
+    }
+  }
+}
+
+// RE2 search. The RE2 object is constructed inside the loop. An anchored
+// request uses FullMatch, anything else uses PartialMatch; the outcome is
+// checked against expect_match.
+void SearchRE2(benchmark::State& state, const char* regexp,
+               absl::string_view text, Prog::Anchor anchor,
+               bool expect_match) {
+  for (auto _ : state) {
+    RE2 re(regexp);
+    CHECK_EQ(re.error(), "");
+    if (anchor == Prog::kAnchored) {
+      CHECK_EQ(RE2::FullMatch(text, re), expect_match);
+    } else {
+      CHECK_EQ(RE2::PartialMatch(text, re), expect_match);
+    }
+  }
+}
+
+// SearchCachedXXX is like SearchXXX but only does the
+// regexp parsing and compiling once. This lets us measure
+// search time without the per-regexp overhead.
+
+// Returns the Prog compiled from `regexp`, compiling and caching it on first
+// use. The cache is keyed by the regexp text and guarded by a mutex that is
+// held for the whole call. Cached programs are never freed.
+Prog* GetCachedProg(const char* regexp) {
+  static auto& mutex = *new absl::Mutex;
+  absl::MutexLock lock(&mutex);
+  static auto& cache = *new absl::flat_hash_map<std::string, Prog*>;
+  // Look the key up once and keep a reference to its slot; nothing below
+  // modifies the map, so the reference stays valid until we return.
+  Prog*& slot = cache[regexp];
+  if (slot == nullptr) {
+    Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* prog = re->CompileToProg(int64_t{1}<<31);  // mostly for the DFA
+    CHECK(prog);
+    slot = prog;
+    re->Decref();
+    // We must call this here - while we have exclusive access.
+    prog->IsOnePass();
+  }
+  return slot;
+}
+
+// Returns the PCRE object for `regexp`, constructing and caching it on first
+// use. The cache is keyed by the regexp text and guarded by a mutex that is
+// held for the whole call. Cached objects are never freed.
+PCRE* GetCachedPCRE(const char* regexp) {
+  static auto& mutex = *new absl::Mutex;
+  absl::MutexLock lock(&mutex);
+  static auto& cache = *new absl::flat_hash_map<std::string, PCRE*>;
+  // Look the key up once and keep a reference to its slot; nothing below
+  // modifies the map, so the reference stays valid until we return.
+  PCRE*& slot = cache[regexp];
+  if (slot == nullptr) {
+    PCRE* re = new PCRE(regexp, PCRE::UTF8);
+    CHECK_EQ(re->error(), "");
+    slot = re;
+  }
+  return slot;
+}
+
+// Returns the RE2 object for `regexp`, constructing and caching it on first
+// use. The cache is keyed by the regexp text and guarded by a mutex that is
+// held for the whole call. Cached objects are never freed.
+RE2* GetCachedRE2(const char* regexp) {
+  static auto& mutex = *new absl::Mutex;
+  absl::MutexLock lock(&mutex);
+  static auto& cache = *new absl::flat_hash_map<std::string, RE2*>;
+  // Look the key up once and keep a reference to its slot; nothing below
+  // modifies the map, so the reference stays valid until we return.
+  RE2*& slot = cache[regexp];
+  if (slot == nullptr) {
+    RE2* re = new RE2(regexp);
+    CHECK_EQ(re->error(), "");
+    slot = re;
+  }
+  return slot;
+}
+
+// DFA search on the cached Prog for `regexp`; only the search itself runs
+// inside the loop. Checks the outcome against expect_match and that the DFA
+// did not report failure.
+void SearchCachedDFA(benchmark::State& state, const char* regexp,
+                     absl::string_view text, Prog::Anchor anchor,
+                     bool expect_match) {
+  Prog* const prog = GetCachedProg(regexp);
+  for (auto _ : state) {
+    bool failed = false;
+    CHECK_EQ(prog->SearchDFA(text, absl::string_view(), anchor,
+                             Prog::kFirstMatch, NULL, &failed, NULL),
+             expect_match);
+    CHECK(!failed);
+  }
+}
+
+// NFA search on the cached Prog for `regexp`; only the search itself runs
+// inside the loop. Checks the outcome against expect_match.
+void SearchCachedNFA(benchmark::State& state, const char* regexp,
+                     absl::string_view text, Prog::Anchor anchor,
+                     bool expect_match) {
+  Prog* const prog = GetCachedProg(regexp);
+  for (auto _ : state) {
+    CHECK_EQ(prog->SearchNFA(text, absl::string_view(), anchor,
+                             Prog::kFirstMatch, NULL, 0),
+             expect_match);
+  }
+}
+
+// One-pass search on the cached Prog for `regexp`, which is required to be
+// one-pass. Only the search itself runs inside the loop.
+void SearchCachedOnePass(benchmark::State& state, const char* regexp,
+                         absl::string_view text, Prog::Anchor anchor,
+                         bool expect_match) {
+  Prog* const prog = GetCachedProg(regexp);
+  CHECK(prog->IsOnePass());
+  for (auto _ : state) {
+    CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+             expect_match);
+  }
+}
+
+// BitState search on the cached Prog for `regexp`, which is required to
+// satisfy CanBitState(). Only the search itself runs inside the loop.
+void SearchCachedBitState(benchmark::State& state, const char* regexp,
+                          absl::string_view text, Prog::Anchor anchor,
+                          bool expect_match) {
+  Prog* const prog = GetCachedProg(regexp);
+  CHECK(prog->CanBitState());
+  for (auto _ : state) {
+    CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+             expect_match);
+  }
+}
+
+// PCRE search on the cached PCRE object for `regexp`. An anchored request
+// uses FullMatch, anything else uses PartialMatch; the outcome is checked
+// against expect_match.
+void SearchCachedPCRE(benchmark::State& state, const char* regexp,
+                      absl::string_view text, Prog::Anchor anchor,
+                      bool expect_match) {
+  PCRE& re = *GetCachedPCRE(regexp);
+  for (auto _ : state) {
+    if (anchor == Prog::kAnchored) {
+      CHECK_EQ(PCRE::FullMatch(text, re), expect_match);
+    } else {
+      CHECK_EQ(PCRE::PartialMatch(text, re), expect_match);
+    }
+  }
+}
+
+// RE2 search on the cached RE2 object for `regexp`. An anchored request
+// uses FullMatch, anything else uses PartialMatch; the outcome is checked
+// against expect_match.
+void SearchCachedRE2(benchmark::State& state, const char* regexp,
+                     absl::string_view text, Prog::Anchor anchor,
+                     bool expect_match) {
+  RE2& re = *GetCachedRE2(regexp);
+  for (auto _ : state) {
+    if (anchor == Prog::kAnchored) {
+      CHECK_EQ(RE2::FullMatch(text, re), expect_match);
+    } else {
+      CHECK_EQ(RE2::PartialMatch(text, re), expect_match);
+    }
+  }
+}
+
+// Runs implementation to full match regexp against text,
+// extracting three submatches. Expects match always.
+
+// Anchored full match with the NFA, filling four match slots. The regexp is
+// parsed and compiled inside the loop.
+void Parse3NFA(benchmark::State& state, const char* regexp,
+               absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    absl::string_view sp[4];  // 4 because sp[0] is whole match.
+    CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
+                          Prog::kFullMatch, sp, 4));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Anchored full match with the one-pass engine, filling four match slots.
+// The regexp is parsed and compiled inside the loop and must be one-pass.
+void Parse3OnePass(benchmark::State& state, const char* regexp,
+                   absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK(prog->IsOnePass());
+    absl::string_view sp[4];  // 4 because sp[0] is whole match.
+    CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Anchored full match with BitState, filling four match slots. The regexp
+// is parsed and compiled inside the loop and must satisfy CanBitState().
+void Parse3BitState(benchmark::State& state, const char* regexp,
+                    absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK(prog->CanBitState());
+    absl::string_view sp[4];  // 4 because sp[0] is whole match.
+    CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Anchored full match with UnsafeSearchBacktrack, filling four match slots.
+// The regexp is parsed and compiled inside the loop.
+void Parse3Backtrack(benchmark::State& state, const char* regexp,
+                     absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    absl::string_view sp[4];  // 4 because sp[0] is whole match.
+    CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// PCRE full match extracting three submatches. The PCRE object is
+// constructed inside the loop.
+void Parse3PCRE(benchmark::State& state, const char* regexp,
+                absl::string_view text) {
+  for (auto _ : state) {
+    PCRE re(regexp, PCRE::UTF8);
+    CHECK_EQ(re.error(), "");
+    absl::string_view sp1;
+    absl::string_view sp2;
+    absl::string_view sp3;
+    CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3));
+  }
+}
+
+// RE2 full match extracting three submatches. The RE2 object is
+// constructed inside the loop.
+void Parse3RE2(benchmark::State& state, const char* regexp,
+               absl::string_view text) {
+  for (auto _ : state) {
+    RE2 re(regexp);
+    CHECK_EQ(re.error(), "");
+    absl::string_view sp1;
+    absl::string_view sp2;
+    absl::string_view sp3;
+    CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3));
+  }
+}
+
+// Anchored full match with the NFA on the cached Prog, filling four match
+// slots. Only the search runs inside the loop.
+void Parse3CachedNFA(benchmark::State& state, const char* regexp,
+                     absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  absl::string_view sp[4];  // 4 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
+                          Prog::kFullMatch, sp, 4));
+  }
+}
+
+// Anchored full match with the one-pass engine on the cached Prog (which
+// must be one-pass), filling four match slots.
+void Parse3CachedOnePass(benchmark::State& state, const char* regexp,
+                         absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  CHECK(prog->IsOnePass());
+  absl::string_view sp[4];  // 4 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+  }
+}
+
+// Anchored full match with BitState on the cached Prog (which must satisfy
+// CanBitState()), filling four match slots.
+void Parse3CachedBitState(benchmark::State& state, const char* regexp,
+                          absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  CHECK(prog->CanBitState());
+  absl::string_view sp[4];  // 4 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+  }
+}
+
+// Anchored full match with UnsafeSearchBacktrack on the cached Prog,
+// filling four match slots.
+void Parse3CachedBacktrack(benchmark::State& state, const char* regexp,
+                           absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  absl::string_view sp[4];  // 4 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+  }
+}
+
+// PCRE full match extracting three submatches, using the cached PCRE object.
+void Parse3CachedPCRE(benchmark::State& state, const char* regexp,
+                      absl::string_view text) {
+  PCRE& re = *GetCachedPCRE(regexp);
+  absl::string_view sp1;
+  absl::string_view sp2;
+  absl::string_view sp3;
+  for (auto _ : state) {
+    CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3));
+  }
+}
+
+// RE2 full match extracting three submatches, using the cached RE2 object.
+void Parse3CachedRE2(benchmark::State& state, const char* regexp,
+                     absl::string_view text) {
+  RE2& re = *GetCachedRE2(regexp);
+  absl::string_view sp1;
+  absl::string_view sp2;
+  absl::string_view sp3;
+  for (auto _ : state) {
+    CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3));
+  }
+}
+
+// Runs implementation to full match regexp against text,
+// extracting one submatch. Expects match always.
+
+// Anchored full match with the NFA, filling two match slots. The regexp is
+// parsed and compiled inside the loop.
+void Parse1NFA(benchmark::State& state, const char* regexp,
+               absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    absl::string_view sp[2];  // 2 because sp[0] is whole match.
+    CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
+                          Prog::kFullMatch, sp, 2));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Anchored full match with the one-pass engine, filling two match slots.
+// The regexp is parsed and compiled inside the loop and must be one-pass.
+void Parse1OnePass(benchmark::State& state, const char* regexp,
+                   absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK(prog->IsOnePass());
+    absl::string_view sp[2];  // 2 because sp[0] is whole match.
+    CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Anchored full match with BitState, filling two match slots. The regexp
+// is parsed and compiled inside the loop and must satisfy CanBitState().
+void Parse1BitState(benchmark::State& state, const char* regexp,
+                    absl::string_view text) {
+  for (auto _ : state) {
+    Regexp* const re = Regexp::Parse(regexp, Regexp::LikePerl, nullptr);
+    CHECK(re);
+    Prog* const prog = re->CompileToProg(0);
+    CHECK(prog);
+    CHECK(prog->CanBitState());
+    absl::string_view sp[2];  // 2 because sp[0] is whole match.
+    CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+    delete prog;
+    re->Decref();
+  }
+}
+
+// PCRE full match extracting one submatch. The PCRE object is constructed
+// inside the loop.
+void Parse1PCRE(benchmark::State& state, const char* regexp,
+                absl::string_view text) {
+  for (auto _ : state) {
+    PCRE re(regexp, PCRE::UTF8);
+    CHECK_EQ(re.error(), "");
+    absl::string_view sp1;
+    CHECK(PCRE::FullMatch(text, re, &sp1));
+  }
+}
+
+// RE2 full match extracting one submatch. The RE2 object is constructed
+// inside the loop.
+void Parse1RE2(benchmark::State& state, const char* regexp,
+               absl::string_view text) {
+  for (auto _ : state) {
+    RE2 re(regexp);
+    CHECK_EQ(re.error(), "");
+    absl::string_view sp1;
+    CHECK(RE2::FullMatch(text, re, &sp1));
+  }
+}
+
+// Anchored full match with the NFA on the cached Prog, filling two match
+// slots. Only the search runs inside the loop.
+void Parse1CachedNFA(benchmark::State& state, const char* regexp,
+                     absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  absl::string_view sp[2];  // 2 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
+                          Prog::kFullMatch, sp, 2));
+  }
+}
+
+// Anchored full match with the one-pass engine on the cached Prog (which
+// must be one-pass), filling two match slots.
+void Parse1CachedOnePass(benchmark::State& state, const char* regexp,
+                         absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  CHECK(prog->IsOnePass());
+  absl::string_view sp[2];  // 2 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+  }
+}
+
+// Anchored full match with BitState on the cached Prog (which must satisfy
+// CanBitState()), filling two match slots.
+void Parse1CachedBitState(benchmark::State& state, const char* regexp,
+                          absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  CHECK(prog->CanBitState());
+  absl::string_view sp[2];  // 2 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+  }
+}
+
+// Anchored full match with UnsafeSearchBacktrack on the cached Prog,
+// filling two match slots.
+void Parse1CachedBacktrack(benchmark::State& state, const char* regexp,
+                           absl::string_view text) {
+  Prog* const prog = GetCachedProg(regexp);
+  absl::string_view sp[2];  // 2 because sp[0] is whole match.
+  for (auto _ : state) {
+    CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+  }
+}
+
+// PCRE full match extracting one submatch, using the cached PCRE object.
+void Parse1CachedPCRE(benchmark::State& state, const char* regexp,
+                      absl::string_view text) {
+  PCRE& re = *GetCachedPCRE(regexp);
+  absl::string_view sp1;
+  for (auto _ : state) {
+    CHECK(PCRE::FullMatch(text, re, &sp1));
+  }
+}
+
+// RE2 full match extracting one submatch, using the cached RE2 object.
+void Parse1CachedRE2(benchmark::State& state, const char* regexp,
+                     absl::string_view text) {
+  RE2& re = *GetCachedRE2(regexp);
+  absl::string_view sp1;
+  for (auto _ : state) {
+    CHECK(RE2::FullMatch(text, re, &sp1));
+  }
+}
+
+// PCRE partial match extracting two submatches, using the cached PCRE
+// object. Expects a match on every iteration.
+void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp,
+                            absl::string_view text) {
+  PCRE& re = *GetCachedPCRE(regexp);
+  for (auto _ : state) {
+    absl::string_view sp1;
+    absl::string_view sp2;
+    CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2));
+  }
+}
+
+// RE2 partial match extracting two submatches, using the cached RE2
+// object. Expects a match on every iteration.
+void SearchParse2CachedRE2(benchmark::State& state, const char* regexp,
+                           absl::string_view text) {
+  RE2& re = *GetCachedRE2(regexp);
+  for (auto _ : state) {
+    absl::string_view sp1;
+    absl::string_view sp2;
+    CHECK(RE2::PartialMatch(text, re, &sp1, &sp2));
+  }
+}
+
+// PCRE partial match extracting one submatch, using the cached PCRE
+// object. Expects a match on every iteration.
+void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp,
+                            absl::string_view text) {
+  PCRE& re = *GetCachedPCRE(regexp);
+  for (auto _ : state) {
+    absl::string_view sp1;
+    CHECK(PCRE::PartialMatch(text, re, &sp1));
+  }
+}
+
+// RE2 partial match extracting one submatch, using the cached RE2
+// object. Expects a match on every iteration.
+void SearchParse1CachedRE2(benchmark::State& state, const char* regexp,
+                           absl::string_view text) {
+  RE2& re = *GetCachedRE2(regexp);
+  for (auto _ : state) {
+    absl::string_view sp1;
+    CHECK(RE2::PartialMatch(text, re, &sp1));
+  }
+}
+
+// Partial match of the empty pattern against the empty string (PCRE).
+// The pattern object is built once, outside the loop.
+void EmptyPartialMatchPCRE(benchmark::State& state) {
+  PCRE re("");
+  for (auto _ : state) {
+    PCRE::PartialMatch("", re);
+  }
+}
+
+// Partial match of the empty pattern against the empty string (RE2).
+// The pattern object is built once, outside the loop.
+void EmptyPartialMatchRE2(benchmark::State& state) {
+  RE2 re("");
+  for (auto _ : state) {
+    RE2::PartialMatch("", re);
+  }
+}
+
+// Registration; the PCRE variant is registered only when USEPCRE is defined.
+#ifdef USEPCRE
+BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark loop: partial-matches the literal text "abcdefg" against a PCRE
+// for the identical literal pattern. The match result is discarded.
+void SimplePartialMatchPCRE(benchmark::State& state) {
+ PCRE re("abcdefg");
+ for (auto _ : state) {
+ PCRE::PartialMatch("abcdefg", re);
+ }
+}
+
+// Benchmark loop: partial-matches the literal text "abcdefg" against an RE2
+// for the identical literal pattern. The match result is discarded.
+void SimplePartialMatchRE2(benchmark::State& state) {
+ RE2 re("abcdefg");
+ for (auto _ : state) {
+ RE2::PartialMatch("abcdefg", re);
+ }
+}
+#ifdef USEPCRE
+BENCHMARK(SimplePartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(SimplePartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+static std::string http_text =
+ "GET /asdfhjasdhfasdlfhasdflkjasdfkljasdhflaskdjhf"
+ "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1";
+
+// Benchmark loop: partial-matches http_text against an anchored
+// GET/POST request-line pattern with PCRE, capturing the run of non-space
+// characters between the method and " HTTP" into a.
+// The match result is not checked.
+void HTTPPartialMatchPCRE(benchmark::State& state) {
+ absl::string_view a;
+ PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (auto _ : state) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+// Benchmark loop: partial-matches http_text against an anchored
+// GET/POST request-line pattern with RE2, capturing the run of non-space
+// characters between the method and " HTTP" into a.
+// The match result is not checked.
+void HTTPPartialMatchRE2(benchmark::State& state) {
+ absl::string_view a;
+ RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (auto _ : state) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(HTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+static std::string smallhttp_text =
+ "GET /abc HTTP/1.1";
+
+// Same pattern as the HTTPPartialMatch benchmarks, run with PCRE against the
+// much shorter smallhttp_text. The match result is not checked.
+void SmallHTTPPartialMatchPCRE(benchmark::State& state) {
+ absl::string_view a;
+ PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (auto _ : state) {
+ PCRE::PartialMatch(smallhttp_text, re, &a);
+ }
+}
+
+// Same pattern as the HTTPPartialMatch benchmarks, run with RE2 against the
+// much shorter smallhttp_text. The match result is not checked.
+void SmallHTTPPartialMatchRE2(benchmark::State& state) {
+ absl::string_view a;
+ RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (auto _ : state) {
+ RE2::PartialMatch(smallhttp_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark loop: partial-matches http_text against "(?-s)^(.+)" with PCRE,
+// capturing group 1 into a. The match result is not checked.
+void DotMatchPCRE(benchmark::State& state) {
+ absl::string_view a;
+ PCRE re("(?-s)^(.+)");
+ for (auto _ : state) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+// Benchmark loop: partial-matches http_text against "(?-s)^(.+)" with RE2,
+// capturing group 1 into a. The match result is not checked.
+void DotMatchRE2(benchmark::State& state) {
+ absl::string_view a;
+ RE2 re("(?-s)^(.+)");
+ for (auto _ : state) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark loop: partial-matches http_text against "(?-s)^([ -~]+)"
+// (an anchored run of characters in the range ' ' through '~') with PCRE,
+// capturing group 1 into a. The match result is not checked.
+void ASCIIMatchPCRE(benchmark::State& state) {
+ absl::string_view a;
+ PCRE re("(?-s)^([ -~]+)");
+ for (auto _ : state) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+// Benchmark loop: partial-matches http_text against "(?-s)^([ -~]+)"
+// (an anchored run of characters in the range ' ' through '~') with RE2,
+// capturing group 1 into a. The match result is not checked.
+void ASCIIMatchRE2(benchmark::State& state) {
+ absl::string_view a;
+ RE2 re("(?-s)^([ -~]+)");
+ for (auto _ : state) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs());
+
+// Benchmarks PCRE::FullMatch of regexp over RandomText(state.range(0))
+// followed by the literal "ABCDEFGHIJ". CHECK-fails if an iteration does not
+// match. Bytes processed are reported as iterations * state.range(0), i.e.
+// the ten appended literal bytes are not counted.
+void FullMatchPCRE(benchmark::State& state, const char *regexp) {
+ std::string s = RandomText(state.range(0));
+ s += "ABCDEFGHIJ";
+ PCRE re(regexp);
+ for (auto _ : state) {
+ CHECK(PCRE::FullMatch(s, re));
+ }
+ state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// Benchmarks RE2::FullMatch of regexp over RandomText(state.range(0))
+// followed by the literal "ABCDEFGHIJ". CHECK-fails if an iteration does not
+// match. Bytes processed are reported as iterations * state.range(0), i.e.
+// the ten appended literal bytes are not counted.
+void FullMatchRE2(benchmark::State& state, const char *regexp) {
+ std::string s = RandomText(state.range(0));
+ s += "ABCDEFGHIJ";
+ // Unlike the PCRE variant, the pattern is compiled with the Latin1 option.
+ RE2 re(regexp, RE2::Latin1);
+ for (auto _ : state) {
+ CHECK(RE2::FullMatch(s, re));
+ }
+ state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// Fixed-pattern entry points for the FullMatch benchmarks. Each forwards to
+// FullMatchPCRE or FullMatchRE2 with one of three dot-star patterns:
+// plain "(?s).*", the same with a trailing "$", and a variant wrapped in
+// several (partly empty) capture groups.
+void FullMatch_DotStar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*"); }
+void FullMatch_DotStar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*"); }
+
+void FullMatch_DotStarDollar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*$"); }
+void FullMatch_DotStarDollar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*$"); }
+
+void FullMatch_DotStarCapture_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s)((.*)()()($))"); }
+void FullMatch_DotStarCapture_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s)((.*)()()($))"); }
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20);
+#endif
+BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 8, 2<<20);
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedPCRE, 8, 2<<20);
+#endif
+BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 8, 2<<20);
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20);
+#endif
+BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20);
+
+// Benchmarks RE2::PossibleMatchRange for regexp with a maximum length of 16.
+// The RE2 is constructed once before the loop; min and max are reused as
+// output strings across iterations. CHECK-fails if the call returns false.
+void PossibleMatchRangeCommon(benchmark::State& state, const char* regexp) {
+ RE2 re(regexp);
+ std::string min;
+ std::string max;
+ const int kMaxLen = 16;
+ for (auto _ : state) {
+ CHECK(re.PossibleMatchRange(&min, &max, kMaxLen));
+ }
+}
+
+// Fixed-pattern entry points for PossibleMatchRangeCommon: an unanchored
+// ".*", an anchored pattern mixing literals, classes and repetition, an
+// anchored literal followed by ".*", and a fully anchored literal.
+void PossibleMatchRange_Trivial(benchmark::State& state) {
+ PossibleMatchRangeCommon(state, ".*");
+}
+void PossibleMatchRange_Complex(benchmark::State& state) {
+ PossibleMatchRangeCommon(state, "^abc[def]?[gh]{1,2}.*");
+}
+void PossibleMatchRange_Prefix(benchmark::State& state) {
+ PossibleMatchRangeCommon(state, "^some_random_prefix.*");
+}
+void PossibleMatchRange_NoProg(benchmark::State& state) {
+ PossibleMatchRangeCommon(state, "^some_random_string$");
+}
+
+BENCHMARK(PossibleMatchRange_Trivial);
+BENCHMARK(PossibleMatchRange_Complex);
+BENCHMARK(PossibleMatchRange_Prefix);
+BENCHMARK(PossibleMatchRange_NoProg);
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/regexp_generator.cc b/third_party/re2/src/re2/testing/regexp_generator.cc
new file mode 100644
index 000000000..b1761ed93
--- /dev/null
+++ b/third_party/re2/src/re2/testing/regexp_generator.cc
@@ -0,0 +1,280 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression generator: generates all possible
+// regular expressions within parameters (see regexp_generator.h for details).
+
+// The regexp generator first generates a sequence of commands in a simple
+// postfix language. Each command in the language is a string,
+// like "a" or "%s*" or "%s|%s".
+//
+// To evaluate a command, enough arguments are popped from the value stack to
+// plug into the %s slots. Then the result is pushed onto the stack.
+// For example, the command sequence
+// a b %s%s c
+// results in the stack
+// ab c
+//
+// GeneratePostfix generates all possible command sequences.
+// Then RunPostfix turns each sequence into a regular expression
+// and passes the regexp to HandleRegexp.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <memory>
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "absl/base/macros.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/testing/regexp_generator.h"
+
+namespace re2 {
+
+// Returns the operator formats used for egrep-style generation:
+// concatenation, alternation, the *, + and ? suffixes, and a trailing \C*.
+const std::vector<std::string>& RegexpGenerator::EgrepOps() {
+  // Built on first use; callers only ever receive a const reference.
+  static const std::vector<std::string> kEgrepOps = {
+      "%s%s",
+      "%s|%s",
+      "%s*",
+      "%s+",
+      "%s?",
+      "%s\\C*",
+  };
+  return kEgrepOps;
+}
+
+// Stores the generation limits and building blocks. In the degenerate case
+// where a building-block list is empty, the matching limit is forced to 0.
+RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
+                                 const std::vector<std::string>& atoms,
+                                 const std::vector<std::string>& ops)
+    : maxatoms_(atoms.empty() ? 0 : maxatoms),
+      maxops_(ops.empty() ? 0 : maxops),
+      atoms_(atoms),
+      ops_(ops) {}
+
+// Enumerates every regular expression allowed by the configured limits.
+// Each one reaches HandleRegexp by way of GeneratePostfix and RunPostfix.
+void RegexpGenerator::Generate() {
+  std::vector<std::string> commands;  // empty program, empty stack
+  GeneratePostfix(&commands, /*nstk=*/0, /*ops=*/0, /*atoms=*/0);
+}
+
+// Seeds the generator's RNG with seed, then makes n attempts at building a
+// random postfix program; completed programs are passed on to HandleRegexp.
+void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
+  rng_.seed(seed);
+
+  for (int remaining = n; remaining > 0; --remaining) {
+    // Every attempt starts from a fresh, empty command sequence.
+    std::vector<std::string> commands;
+    GenerateRandomPostfix(&commands, /*nstk=*/0, /*ops=*/0, /*atoms=*/0);
+  }
+}
+
+// Returns how many non-overlapping "%s" placeholders appear in s,
+// scanning left to right.
+static int CountArgs(const std::string& s) {
+  int count = 0;
+  for (const char* hit = strstr(s.c_str(), "%s"); hit != NULL;
+       hit = strstr(hit + 2, "%s")) {
+    ++count;
+  }
+  return count;
+}
+
+// Recursively enumerates postfix command sequences, handing each complete
+// one to RunPostfix.
+//   post:  the sequence built so far (extended and restored in place)
+//   nstk:  stack depth that executing post would leave behind
+//   ops:   operators already used in post
+//   atoms: atoms already used in post
+// Example: post = ["a", "b", "%s%s", "c"] has nstk = 2, ops = 1, atoms = 3.
+//
+// Start the recursion with GeneratePostfix([empty vector], 0, 0, 0).
+//
+void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
+                                      int nstk, int ops, int atoms) {
+  // A single value on the stack means post is a complete expression.
+  if (nstk == 1)
+    RunPostfix(*post);
+
+  // Prune: either the operator budget is spent, or binary operators can no
+  // longer bring the stack back down to a single expression.
+  const bool can_finish = ops + nstk - 1 <= maxops_;
+  if (!can_finish)
+    return;
+
+  // Try every atom as the next command, if the atom budget allows.
+  if (atoms < maxatoms_) {
+    for (const std::string& atom : atoms_) {
+      post->push_back(atom);
+      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
+      post->pop_back();
+    }
+  }
+
+  // Try every operator whose arguments are already on the stack.
+  if (ops >= maxops_)
+    return;
+  for (const std::string& fmt : ops_) {
+    const int nargs = CountArgs(fmt);
+    if (nargs > nstk)
+      continue;
+    post->push_back(fmt);
+    GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
+    post->pop_back();
+  }
+}
+
+// Generates a random postfix command sequence.
+// Stops and returns true once a single sequence has been generated and
+// passed to RunPostfix; returns false if this branch had to be abandoned.
+//   post:  the sequence built so far (extended and restored in place)
+//   nstk:  stack depth that executing post would leave behind
+//   ops:   operators already used in post
+//   atoms: atoms already used in post
+bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
+                                            int nstk, int ops, int atoms) {
+  // std::uniform_int_distribution requires a <= b. When ops_ or atoms_ is
+  // empty the constructor has zeroed the matching limit, so the index
+  // distribution below is never sampled -- but it must still be constructed
+  // with a valid range, so clamp the upper bound to 0 instead of -1.
+  const int last_op = ops_.empty() ? 0 : static_cast<int>(ops_.size()) - 1;
+  const int last_atom =
+      atoms_.empty() ? 0 : static_cast<int>(atoms_.size()) - 1;
+
+  std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
+  std::uniform_int_distribution<int> random_bit(0, 1);
+  std::uniform_int_distribution<int> random_ops_index(0, last_op);
+  std::uniform_int_distribution<int> random_atoms_index(0, last_atom);
+
+  for (;;) {
+    // Stop if we get to a single element, but only sometimes.
+    if (nstk == 1 && random_stop(rng_) == 0) {
+      RunPostfix(*post);
+      return true;
+    }
+
+    // Early out: if used too many operators or can't
+    // get back down to a single expression on the stack
+    // using binary operators, give up.
+    if (ops + nstk - 1 > maxops_)
+      return false;
+
+    // Add operators if there are enough arguments.
+    if (ops < maxops_ && random_bit(rng_) == 0) {
+      const std::string& fmt = ops_[random_ops_index(rng_)];
+      int nargs = CountArgs(fmt);
+      if (nargs <= nstk) {
+        post->push_back(fmt);
+        bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
+                                         ops + 1, atoms);
+        post->pop_back();
+        if (ret)
+          return true;
+      }
+    }
+
+    // Add atoms if there is room.
+    if (atoms < maxatoms_ && random_bit(rng_) == 0) {
+      post->push_back(atoms_[random_atoms_index(rng_)]);
+      bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
+      post->pop_back();
+      if (ret)
+        return true;
+    }
+  }
+}
+
+// Interprets the postfix command sequence to create a regular expression
+// passed to HandleRegexp. The results of operators like %s|%s are wrapped
+// in (?: ) to avoid needing to maintain a precedence table.
+//
+// Commands with no %s are pushed as atoms; commands with one or two %s pop
+// that many operands and push the formatted result. Any other argument count
+// is a fatal error. The finished regexp is handed to HandleRegexp four
+// times: as is, anchored at both ends, anchored at the start only, and
+// anchored at the end only.
+void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
+  std::stack<std::string> regexps;
+  for (size_t i = 0; i < post.size(); i++) {
+    switch (CountArgs(post[i])) {
+      case 0:
+        regexps.push(post[i]);
+        break;
+      case 1: {
+        auto fmt = absl::ParsedFormat<'s'>::New(post[i]);
+        CHECK(fmt != nullptr);
+        // Popping an empty stack is undefined; fail loudly instead.
+        CHECK(!regexps.empty());
+        std::string a = regexps.top();
+        regexps.pop();
+        regexps.push("(?:" + absl::StrFormat(*fmt, a) + ")");
+        break;
+      }
+      case 2: {
+        auto fmt = absl::ParsedFormat<'s', 's'>::New(post[i]);
+        CHECK(fmt != nullptr);
+        // Both operands must already be on the stack.
+        CHECK(regexps.size() >= 2);
+        // The right-hand operand was pushed last, so it comes off first.
+        std::string b = regexps.top();
+        regexps.pop();
+        std::string a = regexps.top();
+        regexps.pop();
+        regexps.push("(?:" + absl::StrFormat(*fmt, a, b) + ")");
+        break;
+      }
+      default:
+        // Kept last so that no case label is reached by falling through.
+        LOG(FATAL) << "Bad operator: " << post[i];
+        break;
+    }
+  }
+
+  if (regexps.size() != 1) {
+    // Internal error - should never happen.
+    absl::PrintF("Bad regexp program:\n");
+    for (size_t i = 0; i < post.size(); i++) {
+      absl::PrintF("  %s\n", absl::CEscape(post[i]));
+    }
+    absl::PrintF("Stack after running program:\n");
+    while (!regexps.empty()) {
+      absl::PrintF("  %s\n", absl::CEscape(regexps.top()));
+      regexps.pop();
+    }
+    LOG(FATAL) << "Bad regexp program.";
+  }
+
+  HandleRegexp(regexps.top());
+  HandleRegexp("^(?:" + regexps.top() + ")$");
+  HandleRegexp("^(?:" + regexps.top() + ")");
+  HandleRegexp("(?:" + regexps.top() + ")$");
+}
+
+// Splits s into a vector of strings, one per UTF-8 character as decoded by
+// chartorune.
+std::vector<std::string> Explode(absl::string_view s) {
+  std::vector<std::string> pieces;
+
+  const char* const end = s.data() + s.size();
+  const char* cur = s.data();
+  while (cur < end) {
+    Rune r;  // decoded value is not needed, only the encoded width
+    const int width = chartorune(&r, cur);
+    pieces.push_back(std::string(cur, width));
+    cur += width;
+  }
+
+  return pieces;
+}
+
+// Splits s at every occurrence of sep and returns the pieces in order.
+// An empty sep splits s into individual characters via Explode.
+// A piece that would be empty at the very end of s is not emitted.
+std::vector<std::string> Split(absl::string_view sep, absl::string_view s) {
+  if (sep.empty())
+    return Explode(s);
+
+  std::vector<std::string> pieces;
+  const char* const end = s.data() + s.size();
+  const char* piece = s.data();  // start of the piece being accumulated
+  const char* scan = s.data();   // current candidate position for sep
+  while (scan + sep.size() <= end) {
+    if (absl::string_view(scan, sep.size()) == sep) {
+      pieces.push_back(std::string(piece, scan - piece));
+      scan += sep.size();
+      piece = scan;
+    } else {
+      ++scan;
+    }
+  }
+  if (piece < end)
+    pieces.push_back(std::string(piece, end - piece));
+  return pieces;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/regexp_generator.h b/third_party/re2/src/re2/testing/regexp_generator.h
new file mode 100644
index 000000000..e1be1a93d
--- /dev/null
+++ b/third_party/re2/src/re2/testing/regexp_generator.h
@@ -0,0 +1,76 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_TESTING_REGEXP_GENERATOR_H_
+#define RE2_TESTING_REGEXP_GENERATOR_H_
+
+// Regular expression generator: generates all possible
+// regular expressions within given parameters (see below for details).
+
+#include <stdint.h>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace re2 {
+
+// Regular expression generator.
+//
+// Given a set of atom expressions like "a", "b", or "."
+// and operators like "%s*", generates all possible regular expressions
+// using at most maxatoms atoms and maxops operators.
+// For each such expression re, calls HandleRegexp(re).
+//
+// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
+//
+class RegexpGenerator {
+ public:
+  // maxatoms and maxops bound how many atoms and operators one expression
+  // may use; atoms and ops are the building blocks to choose from.
+  RegexpGenerator(int maxatoms, int maxops,
+                  const std::vector<std::string>& atoms,
+                  const std::vector<std::string>& ops);
+  virtual ~RegexpGenerator() {}
+
+  // Generates all the regular expressions, calling HandleRegexp(re) for each.
+  void Generate();
+
+  // Generates n random regular expressions, calling HandleRegexp(re) for each.
+  // The random number generator is seeded with seed first.
+  void GenerateRandom(int32_t seed, int n);
+
+  // Handles a regular expression.  Must be provided by subclass.
+  virtual void HandleRegexp(const std::string& regexp) = 0;
+
+  // The egrep regexp operators: * + ? | and concatenation,
+  // plus a trailing \C* operator.
+  static const std::vector<std::string>& EgrepOps();
+
+ private:
+  // Evaluates one postfix command sequence and reports the result.
+  void RunPostfix(const std::vector<std::string>& post);
+  // nstk is the stack depth after running post; ops and atoms count the
+  // operators and atoms already used in post.
+  void GeneratePostfix(std::vector<std::string>* post,
+                       int nstk, int ops, int atoms);
+  bool GenerateRandomPostfix(std::vector<std::string>* post,
+                             int nstk, int ops, int atoms);
+
+  int maxatoms_;                    // Maximum number of atoms allowed in expr.
+  int maxops_;                      // Maximum number of ops allowed in expr.
+  std::vector<std::string> atoms_;  // Possible atoms.
+  std::vector<std::string> ops_;    // Possible ops.
+  std::minstd_rand0 rng_;           // Random number generator.
+
+  RegexpGenerator(const RegexpGenerator&) = delete;
+  RegexpGenerator& operator=(const RegexpGenerator&) = delete;
+};
+
+// Helpers for preparing arguments to RegexpGenerator constructor.
+
+// Returns one string for each UTF-8 character in s.
+std::vector<std::string> Explode(absl::string_view s);
+
+// Splits string everywhere sep is found, returning
+// vector of pieces.
+std::vector<std::string> Split(absl::string_view sep, absl::string_view s);
+
+} // namespace re2
+
+#endif // RE2_TESTING_REGEXP_GENERATOR_H_
diff --git a/third_party/re2/src/re2/testing/regexp_test.cc b/third_party/re2/src/re2/testing/regexp_test.cc
new file mode 100644
index 000000000..ef8f59d36
--- /dev/null
+++ b/third_party/re2/src/re2/testing/regexp_test.cc
@@ -0,0 +1,86 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test parse.cc, dump.cc, and tostring.cc.
+
+#include <stddef.h>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Checks that a reference count pushed far past its initial value comes back
+// down to exactly 1 after the same number of Decref calls.
+TEST(Regexp, BigRef) {
+  const int kExtraRefs = 100000;
+  Regexp* re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
+  for (int left = kExtraRefs; left > 0; --left)
+    re->Incref();
+  for (int left = kExtraRefs; left > 0; --left)
+    re->Decref();
+  ASSERT_EQ(re->Ref(), 1);
+  re->Decref();  // releases the reference returned by Parse
+}
+
+// Checks that concatenating a very large number of sub-expressions works.
+// Relies on large reference counts behaving correctly, since the same
+// Regexp is referenced once per element.
+TEST(Regexp, BigConcat) {
+  Regexp* x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
+  std::vector<Regexp*> v(90000, x);  // ToString bails out at 100000
+  // Take one extra reference for every slot in v.
+  for (size_t taken = 0; taken != v.size(); ++taken)
+    x->Incref();
+  ASSERT_EQ(x->Ref(), 1 + static_cast<int>(v.size())) << x->Ref();
+  Regexp* re = Regexp::Concat(v.data(), static_cast<int>(v.size()),
+                              Regexp::NoParseFlags);
+  ASSERT_EQ(re->ToString(), std::string(v.size(), 'x'));
+  re->Decref();
+  // Dropping the concatenation must leave only the original reference.
+  ASSERT_EQ(x->Ref(), 1) << x->Ref();
+  x->Decref();
+}
+
+// Checks the name -> index map returned by NamedCaptures() for a pattern
+// with four capture groups, one unnamed, where the name 'g1' appears twice.
+TEST(Regexp, NamedCaptures) {
+  Regexp* x;
+  RegexpStatus status;
+  x = Regexp::Parse(
+      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+  // Fatal checks: everything below dereferences x.
+  ASSERT_TRUE(status.ok());
+  ASSERT_TRUE(x != NULL);
+  EXPECT_EQ(4, x->NumCaptures());
+  const std::map<std::string, int>* have = x->NamedCaptures();
+  // Fatal check: have is dereferenced immediately afterwards.
+  ASSERT_TRUE(have != NULL);
+  EXPECT_EQ(2u, have->size());  // there are only two named groups in
+                                // the regexp: 'g1' and 'g2'.
+  std::map<std::string, int> want;
+  want["g1"] = 1;
+  want["g2"] = 3;
+  EXPECT_EQ(want, *have);
+  x->Decref();
+  delete have;
+}
+
+// Checks the index -> name map returned by CaptureNames() for a pattern
+// with four capture groups, three of them named ('g1' is used twice).
+TEST(Regexp, CaptureNames) {
+  Regexp* x;
+  RegexpStatus status;
+  x = Regexp::Parse(
+      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+  // Fatal checks: everything below dereferences x.
+  ASSERT_TRUE(status.ok());
+  ASSERT_TRUE(x != NULL);
+  EXPECT_EQ(4, x->NumCaptures());
+  const std::map<int, std::string>* have = x->CaptureNames();
+  // Fatal check: have is dereferenced immediately afterwards.
+  ASSERT_TRUE(have != NULL);
+  EXPECT_EQ(3u, have->size());
+  std::map<int, std::string> want;
+  want[1] = "g1";
+  want[3] = "g2";
+  want[4] = "g1";
+
+  EXPECT_EQ(want, *have);
+  x->Decref();
+  delete have;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/required_prefix_test.cc b/third_party/re2/src/re2/testing/required_prefix_test.cc
new file mode 100644
index 000000000..231fd3485
--- /dev/null
+++ b/third_party/re2/src/re2/testing/required_prefix_test.cc
@@ -0,0 +1,200 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One table entry for the RequiredPrefix / RequiredPrefixForAccel tests.
+// Entries are written as aggregate initializers, so field order matters;
+// trailing fields that an entry omits are value-initialized.
+struct PrefixTest {
+ const char* regexp; // pattern handed to Regexp::Parse
+ bool return_value; // expected result of the prefix query
+ const char* prefix; // expected prefix; compared only when return_value is true
+ bool foldcase; // expected fold-case flag; compared only when return_value is true
+ const char* suffix; // expected ToString() of the suffix (RequiredPrefix test only)
+};
+
+static PrefixTest tests[] = {
+ // Empty cases.
+ { "", false },
+ { "(?m)^", false },
+ { "(?-m)^", false },
+
+ // If the regexp has no ^, there's no required prefix.
+ { "abc", false },
+
+ // If the regexp immediately goes into
+ // something not a literal match, there's no required prefix.
+ { "^a*", false },
+ { "^(abc)", false },
+
+ // Otherwise, it should work.
+ { "^abc$", true, "abc", false, "(?-m:$)" },
+ { "^abc", true, "abc", false, "" },
+ { "^(?i)abc", true, "abc", true, "" },
+ { "^abcd*", true, "abc", false, "d*" },
+ { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
+ { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
+ { "^☺abc", true, "☺abc", false, "" },
+};
+
+// Runs every entry of tests through Regexp::RequiredPrefix twice: first
+// parsed with the Latin1 flag, then with the default (UTF-8) encoding.
+TEST(RequiredPrefix, SimpleTests) {
+  for (const PrefixTest& t : tests) {
+    for (size_t j = 0; j < 2; j++) {
+      // Pass 0 adds Latin1 to the parse flags; pass 1 leaves them alone.
+      const bool latin1 = (j == 0);
+      const char* encoding = latin1 ? "latin1" : "utf8";
+      Regexp::ParseFlags flags = Regexp::LikePerl;
+      if (latin1)
+        flags = flags | Regexp::Latin1;
+
+      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+      ASSERT_TRUE(re != NULL) << " " << t.regexp;
+
+      std::string p;
+      bool f;
+      Regexp* s;
+      ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
+          << " " << t.regexp << " " << encoding
+          << " " << re->Dump();
+      // The outputs are only inspected when a prefix was expected.
+      if (t.return_value) {
+        ASSERT_EQ(p, std::string(t.prefix))
+            << " " << t.regexp << " " << encoding;
+        ASSERT_EQ(f, t.foldcase)
+            << " " << t.regexp << " " << encoding;
+        ASSERT_EQ(s->ToString(), std::string(t.suffix))
+            << " " << t.regexp << " " << encoding;
+        s->Decref();
+      }
+      re->Decref();
+    }
+  }
+}
+
+static PrefixTest for_accel_tests[] = {
+ // Empty cases.
+ { "", false },
+ { "(?m)^", false },
+ { "(?-m)^", false },
+
+ // If the regexp has a ^, there's no required prefix.
+ { "^abc", false },
+
+ // If the regexp immediately goes into
+ // something not a literal match, there's no required prefix.
+ { "a*", false },
+
+ // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
+ // capturing groups, but doesn't try to glue prefix fragments together.
+ { "(a?)def", false },
+ { "(ab?)def", true, "a", false },
+ { "(abc?)def", true, "ab", false },
+ { "(()a)def", false },
+ { "((a)b)def", true, "a", false },
+ { "((ab)c)def", true, "ab", false },
+
+ // Otherwise, it should work.
+ { "abc$", true, "abc", false },
+ { "abc", true, "abc", false },
+ { "(?i)abc", true, "abc", true },
+ { "abcd*", true, "abc", false },
+ { "[Aa][Bb]cd*", true, "ab", true },
+ { "ab[Cc]d*", true, "ab", false },
+ { "☺abc", true, "☺abc", false },
+};
+
+// Runs every entry of for_accel_tests through
+// Regexp::RequiredPrefixForAccel twice: first parsed with the Latin1 flag,
+// then with the default (UTF-8) encoding.
+TEST(RequiredPrefixForAccel, SimpleTests) {
+  for (const PrefixTest& t : for_accel_tests) {
+    for (size_t j = 0; j < 2; j++) {
+      // Pass 0 adds Latin1 to the parse flags; pass 1 leaves them alone.
+      const bool latin1 = (j == 0);
+      const char* encoding = latin1 ? "latin1" : "utf8";
+      Regexp::ParseFlags flags = Regexp::LikePerl;
+      if (latin1)
+        flags = flags | Regexp::Latin1;
+
+      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+      ASSERT_TRUE(re != NULL) << " " << t.regexp;
+
+      std::string p;
+      bool f;
+      ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
+          << " " << t.regexp << " " << encoding
+          << " " << re->Dump();
+      // The outputs are only inspected when a prefix was expected.
+      if (t.return_value) {
+        ASSERT_EQ(p, std::string(t.prefix))
+            << " " << t.regexp << " " << encoding;
+        ASSERT_EQ(f, t.foldcase)
+            << " " << t.regexp << " " << encoding;
+      }
+      re->Decref();
+    }
+  }
+}
+
+// Checks how case-insensitive prefixes starting with 'K' or 'S' are treated
+// by RequiredPrefixForAccel under the two encodings: a lower-cased prefix is
+// reported with Latin-1, and no prefix at all is reported with UTF-8.
+TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
+ Regexp* re;
+ std::string p;
+ bool f;
+
+ // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
+ re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
+ ASSERT_TRUE(re != NULL);
+ ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
+ ASSERT_EQ(p, "klm");
+ ASSERT_EQ(f, true);
+ re->Decref();
+
+ re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
+ ASSERT_TRUE(re != NULL);
+ ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
+ ASSERT_EQ(p, "stu");
+ ASSERT_EQ(f, true);
+ re->Decref();
+
+ // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
+ // This is because they match U+212A and U+017F, respectively, and
+ // so the parser ends up emitting character classes, not literals.
+ re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
+ ASSERT_TRUE(re != NULL);
+ ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
+ re->Decref();
+
+ re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
+ ASSERT_TRUE(re != NULL);
+ ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
+ re->Decref();
+}
+
+static const char* prefix_accel_tests[] = {
+ "aababc\\d+",
+ "(?i)AABABC\\d+",
+};
+
+// For each pattern in prefix_accel_tests, compiles a Prog and checks
+// Prog::PrefixAccel: it must return NULL on a text of j 'a' characters, and
+// must return offset j once "aababc" (plus growing runs of 'a') is appended.
+TEST(PrefixAccel, SimpleTests) {
+  for (const char* pattern : prefix_accel_tests) {
+    Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
+    ASSERT_TRUE(re != NULL);
+    Prog* prog = re->CompileToProg(0);
+    ASSERT_TRUE(prog != NULL);
+    ASSERT_TRUE(prog->can_prefix_accel());
+
+    for (int j = 0; j < 100; j++) {
+      // Only 'a's so far: nothing for the accelerator to find.
+      std::string text(j, 'a');
+      const char* p = reinterpret_cast<const char*>(
+          prog->PrefixAccel(text.data(), text.size()));
+      EXPECT_TRUE(p == NULL);
+
+      text.append("aababc");
+      // The tail keeps growing: each round appends `tail` more 'a's on top
+      // of what earlier rounds already added.
+      for (int tail = 0; tail < 100; tail++) {
+        text.append(tail, 'a');
+        p = reinterpret_cast<const char*>(
+            prog->PrefixAccel(text.data(), text.size()));
+        EXPECT_EQ(j, p - text.data());
+      }
+    }
+
+    delete prog;
+    re->Decref();
+  }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/search_test.cc b/third_party/re2/src/re2/testing/search_test.cc
new file mode 100644
index 000000000..166652a2d
--- /dev/null
+++ b/third_party/re2/src/re2/testing/search_test.cc
@@ -0,0 +1,335 @@
+// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/testing/tester.h"
+#include "re2/testing/exhaustive_tester.h"
+
+// For target `log' in the Makefile.
+#ifndef LOGGING
+#define LOGGING 0
+#endif
+
+namespace re2 {
+
+// One search test case: a pattern and the text to run it against.
+// Both fields are passed as-is to TestRegexpOnText.
+struct RegexpTest {
+ const char* regexp; // pattern under test
+ const char* text; // input text the pattern is run against
+};
+
+RegexpTest simple_tests[] = {
+ { "a", "a" },
+ { "a", "zyzzyva" },
+ { "a+", "aa" },
+ { "(a+|b)+", "ab" },
+ { "ab|cd", "xabcdx" },
+ { "h.*od?", "hello\ngoodbye\n" },
+ { "h.*o", "hello\ngoodbye\n" },
+ { "h.*o", "goodbye\nhello\n" },
+ { "h.*o", "hello world" },
+ { "h.*o", "othello, world" },
+ { "[^\\s\\S]", "aaaaaaa" },
+ { "a", "aaaaaaa" },
+ { "a*", "aaaaaaa" },
+ { "a*", "" },
+ { "ab|cd", "xabcdx" },
+ { "a", "cab" },
+ { "a*b", "cab" },
+ { "((((((((((((((((((((x))))))))))))))))))))", "x" },
+ { "[abcd]", "xxxabcdxxx" },
+ { "[^x]", "xxxabcdxxx" },
+ { "[abcd]+", "xxxabcdxxx" },
+ { "[^x]+", "xxxabcdxxx" },
+ { "(fo|foo)", "fo" },
+ { "(foo|fo)", "foo" },
+
+ { "aa", "aA" },
+ { "a", "Aa" },
+ { "a", "A" },
+ { "ABC", "abc" },
+ { "abc", "XABCY" },
+ { "ABC", "xabcy" },
+
+ // Make sure ^ and $ work.
+ // The pathological cases didn't work
+ // in the original grep code.
+ { "foo|bar|[A-Z]", "foo" },
+ { "^(foo|bar|[A-Z])", "foo" },
+ { "(foo|bar|[A-Z])$", "foo\n" },
+ { "(foo|bar|[A-Z])$", "foo" },
+ { "^(foo|bar|[A-Z])$", "foo\n" },
+ { "^(foo|bar|[A-Z])$", "foo" },
+ { "^(foo|bar|[A-Z])$", "bar" },
+ { "^(foo|bar|[A-Z])$", "X" },
+ { "^(foo|bar|[A-Z])$", "XY" },
+ { "^(fo|foo)$", "fo" },
+ { "^(fo|foo)$", "foo" },
+ { "^^(fo|foo)$", "fo" },
+ { "^^(fo|foo)$", "foo" },
+ { "^$", "" },
+ { "^$", "x" },
+ { "^^$", "" },
+ { "^$$", "" },
+ { "^^$", "x" },
+ { "^$$", "x" },
+ { "^^$$", "" },
+ { "^^$$", "x" },
+ { "^^^^^^^^$$$$$$$$", "" },
+ { "^", "x" },
+ { "$", "x" },
+
+ // Word boundaries.
+ { "\\bfoo\\b", "nofoo foo that" },
+ { "a\\b", "faoa x" },
+ { "\\bbar", "bar x" },
+ { "\\bbar", "foo\nbar x" },
+ { "bar\\b", "foobar" },
+ { "bar\\b", "foobar\nxxx" },
+ { "(foo|bar|[A-Z])\\b", "foo" },
+ { "(foo|bar|[A-Z])\\b", "foo\n" },
+ { "\\b", "" },
+ { "\\b", "x" },
+ { "\\b(foo|bar|[A-Z])", "foo" },
+ { "\\b(foo|bar|[A-Z])\\b", "X" },
+ { "\\b(foo|bar|[A-Z])\\b", "XY" },
+ { "\\b(foo|bar|[A-Z])\\b", "bar" },
+ { "\\b(foo|bar|[A-Z])\\b", "foo" },
+ { "\\b(foo|bar|[A-Z])\\b", "foo\n" },
+ { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
+ { "\\b(fo|foo)\\b", "fo" },
+ { "\\b(fo|foo)\\b", "foo" },
+ { "\\b\\b", "" },
+ { "\\b\\b", "x" },
+ { "\\b$", "" },
+ { "\\b$", "x" },
+ { "\\b$", "y x" },
+ { "\\b.$", "x" },
+ { "^\\b(fo|foo)\\b", "fo" },
+ { "^\\b(fo|foo)\\b", "foo" },
+ { "^\\b", "" },
+ { "^\\b", "x" },
+ { "^\\b\\b", "" },
+ { "^\\b\\b", "x" },
+ { "^\\b$", "" },
+ { "^\\b$", "x" },
+ { "^\\b.$", "x" },
+ { "^\\b.\\b$", "x" },
+ { "^^^^^^^^\\b$$$$$$$", "" },
+ { "^^^^^^^^\\b.$$$$$$", "x" },
+ { "^^^^^^^^\\b$$$$$$$", "x" },
+
+ // Non-word boundaries.
+ { "\\Bfoo\\B", "n foo xfoox that" },
+ { "a\\B", "faoa x" },
+ { "\\Bbar", "bar x" },
+ { "\\Bbar", "foo\nbar x" },
+ { "bar\\B", "foobar" },
+ { "bar\\B", "foobar\nxxx" },
+ { "(foo|bar|[A-Z])\\B", "foox" },
+ { "(foo|bar|[A-Z])\\B", "foo\n" },
+ { "\\B", "" },
+ { "\\B", "x" },
+ { "\\B(foo|bar|[A-Z])", "foo" },
+ { "\\B(foo|bar|[A-Z])\\B", "xXy" },
+ { "\\B(foo|bar|[A-Z])\\B", "XY" },
+ { "\\B(foo|bar|[A-Z])\\B", "XYZ" },
+ { "\\B(foo|bar|[A-Z])\\B", "abara" },
+ { "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
+ { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
+ { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
+ { "\\B(fo|foo)\\B", "xfoo" },
+ { "\\B(foo|fo)\\B", "xfooo" },
+ { "\\B\\B", "" },
+ { "\\B\\B", "x" },
+ { "\\B$", "" },
+ { "\\B$", "x" },
+ { "\\B$", "y x" },
+ { "\\B.$", "x" },
+ { "^\\B(fo|foo)\\B", "fo" },
+ { "^\\B(fo|foo)\\B", "foo" },
+ { "^\\B", "" },
+ { "^\\B", "x" },
+ { "^\\B\\B", "" },
+ { "^\\B\\B", "x" },
+ { "^\\B$", "" },
+ { "^\\B$", "x" },
+ { "^\\B.$", "x" },
+ { "^\\B.\\B$", "x" },
+ { "^^^^^^^^\\B$$$$$$$", "" },
+ { "^^^^^^^^\\B.$$$$$$", "x" },
+ { "^^^^^^^^\\B$$$$$$$", "x" },
+
+ // PCRE uses only ASCII for \b computation.
+ // All non-ASCII are *not* word characters.
+ { "\\bx\\b", "x" },
+ { "\\bx\\b", "x>" },
+ { "\\bx\\b", "<x" },
+ { "\\bx\\b", "<x>" },
+ { "\\bx\\b", "ax" },
+ { "\\bx\\b", "xb" },
+ { "\\bx\\b", "axb" },
+ { "\\bx\\b", "«x" },
+ { "\\bx\\b", "x»" },
+ { "\\bx\\b", "«x»" },
+ { "\\bx\\b", "axb" },
+ { "\\bx\\b", "áxβ" },
+ { "\\Bx\\B", "axb" },
+ { "\\Bx\\B", "áxβ" },
+
+ // Weird boundary cases.
+ { "^$^$", "" },
+ { "^$^", "" },
+ { "$^$", "" },
+
+ { "^$^$", "x" },
+ { "^$^", "x" },
+ { "$^$", "x" },
+
+ { "^$^$", "x\ny" },
+ { "^$^", "x\ny" },
+ { "$^$", "x\ny" },
+
+ { "^$^$", "x\n\ny" },
+ { "^$^", "x\n\ny" },
+ { "$^$", "x\n\ny" },
+
+ { "^(foo\\$)$", "foo$bar" },
+ { "(foo\\$)", "foo$bar" },
+ { "^...$", "abc" },
+
+ // UTF-8
+ { "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
+ { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+ { "^...$", ".\xe6\x9c\xac." },
+
+ { "^\\C\\C\\C$", "\xe6\x9c\xac" },
+ { "^\\C$", "\xe6\x9c\xac" },
+ { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+
+ // Latin1
+ { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+ { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+ { "^...$", ".\xe6\x9c\xac." },
+ { "^.....$", ".\xe6\x9c\xac." },
+
+ // Perl v Posix
+ { "\\B(fo|foo)\\B", "xfooo" },
+ { "(fo|foo)", "foo" },
+
+ // Octal escapes.
+ { "\\141", "a" },
+ { "\\060", "0" },
+ { "\\0600", "00" },
+ { "\\608", "08" },
+ { "\\01", "\01" },
+ { "\\018", "\01" "8" },
+
+ // Hexadecimal escapes
+ { "\\x{61}", "a" },
+ { "\\x61", "a" },
+ { "\\x{00000061}", "a" },
+
+ // Unicode scripts.
+ { "\\p{Greek}+", "aαβb" },
+ { "\\P{Greek}+", "aαβb" },
+ { "\\p{^Greek}+", "aαβb" },
+ { "\\P{^Greek}+", "aαβb" },
+
+ // Unicode properties. Nd is decimal number. N is any number.
+ { "[^0-9]+", "abc123" },
+ { "\\p{Nd}+", "abc123²³¼½¾₀₉" },
+ { "\\p{^Nd}+", "abc123²³¼½¾₀₉" },
+ { "\\P{Nd}+", "abc123²³¼½¾₀₉" },
+ { "\\P{^Nd}+", "abc123²³¼½¾₀₉" },
+ { "\\pN+", "abc123²³¼½¾₀₉" },
+ { "\\p{N}+", "abc123²³¼½¾₀₉" },
+ { "\\p{^N}+", "abc123²³¼½¾₀₉" },
+
+ { "\\p{Any}+", "abc123" },
+
+ // Character classes & case folding.
+ { "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B
+ { "(?i)[A-Z]+", "aAzZ" },
+ { "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z -
+ // splits the ranges in an interesting way.
+
+ // would like to use, but PCRE mishandles in full-match, non-greedy mode
+ // { "(?i)[\\\\]+", "Aa" },
+
+ { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+
+ // The same character classes, without the (?i) flag.
+ { "[@-A]+", "@AaB" },
+ { "[A-Z]+", "aAzZ" },
+ { "[^\\\\]+", "Aa\\" },
+ { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+
+ // Anchoring. (^abc in aabcdef was a former bug)
+ // The tester checks for a match in the text and
+ // subpieces of the text with a byte removed on either side.
+ { "^abc", "abcdef" },
+ { "^abc", "aabcdef" },
+ { "^[ay]*[bx]+c", "abcdef" },
+ { "^[ay]*[bx]+c", "aabcdef" },
+ { "def$", "abcdef" },
+ { "def$", "abcdeff" },
+ { "d[ex][fy]$", "abcdef" },
+ { "d[ex][fy]$", "abcdeff" },
+ { "[dz][ex][fy]$", "abcdef" },
+ { "[dz][ex][fy]$", "abcdeff" },
+ { "(?m)^abc", "abcdef" },
+ { "(?m)^abc", "aabcdef" },
+ { "(?m)^[ay]*[bx]+c", "abcdef" },
+ { "(?m)^[ay]*[bx]+c", "aabcdef" },
+ { "(?m)def$", "abcdef" },
+ { "(?m)def$", "abcdeff" },
+ { "(?m)d[ex][fy]$", "abcdef" },
+ { "(?m)d[ex][fy]$", "abcdeff" },
+ { "(?m)[dz][ex][fy]$", "abcdef" },
+ { "(?m)[dz][ex][fy]$", "abcdeff" },
+ { "^", "a" },
+ { "^^", "a" },
+
+ // Context.
+ // The tester checks for a match in the text and
+ // subpieces of the text with a byte removed on either side.
+ { "a", "a" },
+ { "ab*", "a" },
+ { "a\\C*", "a" },
+ { "a\\C+", "a" },
+ { "a\\C?", "a" },
+ { "a\\C*?", "a" },
+ { "a\\C+?", "a" },
+ { "a\\C??", "a" },
+
+ // Former bugs.
+ { "a\\C*|ba\\C", "baba" },
+ { "\\w*I\\w*", "Inc." },
+ { "(?:|a)*", "aaa" },
+ { "(?:|a)+", "aaa" },
+};
+
+// Runs every entry of simple_tests through TestRegexpOnText and expects
+// that none of them fails.
+TEST(Regexp, SearchTests) {
+  int failures = 0;
+  for (const RegexpTest& t : simple_tests) {
+    if (!TestRegexpOnText(t.regexp, t.text))
+      failures++;
+
+    if (LOGGING) {
+      // Issue a single-case ExhaustiveTest call (one atom, no operators,
+      // one alphabet entry) so that this test case gets logged.
+      std::vector<std::string> atom(1, t.regexp);
+      std::vector<std::string> alpha(1, t.text);
+      std::vector<std::string> ops;
+      ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", "");
+    }
+  }
+  EXPECT_EQ(failures, 0);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/set_test.cc b/third_party/re2/src/re2/testing/set_test.cc
new file mode 100644
index 000000000..fdbc0b2c7
--- /dev/null
+++ b/third_party/re2/src/re2/testing/set_test.cc
@@ -0,0 +1,230 @@
+// Copyright 2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <string>
+#include <vector>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/re2.h"
+#include "re2/set.h"
+
+namespace re2 {
+
+TEST(Set, Unanchored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ ASSERT_EQ(s.Add("foo", NULL), 0);
+ ASSERT_EQ(s.Add("(", NULL), -1);
+ ASSERT_EQ(s.Add("bar", NULL), 1);
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("foobar", NULL), true);
+ ASSERT_EQ(s.Match("fooba", NULL), true);
+ ASSERT_EQ(s.Match("oobar", NULL), true);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("foobar", &v), true);
+ ASSERT_EQ(v.size(), 2);
+ ASSERT_EQ(v[0], 0);
+ ASSERT_EQ(v[1], 1);
+
+ ASSERT_EQ(s.Match("fooba", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+
+ ASSERT_EQ(s.Match("oobar", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 1);
+}
+
+TEST(Set, UnanchoredFactored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ ASSERT_EQ(s.Add("foo", NULL), 0);
+ ASSERT_EQ(s.Add("(", NULL), -1);
+ ASSERT_EQ(s.Add("foobar", NULL), 1);
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("foobar", NULL), true);
+ ASSERT_EQ(s.Match("obarfoobaroo", NULL), true);
+ ASSERT_EQ(s.Match("fooba", NULL), true);
+ ASSERT_EQ(s.Match("oobar", NULL), false);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("foobar", &v), true);
+ ASSERT_EQ(v.size(), 2);
+ ASSERT_EQ(v[0], 0);
+ ASSERT_EQ(v[1], 1);
+
+ ASSERT_EQ(s.Match("obarfoobaroo", &v), true);
+ ASSERT_EQ(v.size(), 2);
+ ASSERT_EQ(v[0], 0);
+ ASSERT_EQ(v[1], 1);
+
+ ASSERT_EQ(s.Match("fooba", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+
+ ASSERT_EQ(s.Match("oobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+}
+
+TEST(Set, UnanchoredDollar) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ ASSERT_EQ(s.Add("foo$", NULL), 0);
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("foo", NULL), true);
+ ASSERT_EQ(s.Match("foobar", NULL), false);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("foo", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+
+ ASSERT_EQ(s.Match("foobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+}
+
+TEST(Set, UnanchoredWordBoundary) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ ASSERT_EQ(s.Add("foo\\b", NULL), 0);
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("foo", NULL), true);
+ ASSERT_EQ(s.Match("foobar", NULL), false);
+ ASSERT_EQ(s.Match("foo bar", NULL), true);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("foo", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+
+ ASSERT_EQ(s.Match("foobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("foo bar", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+}
+
+TEST(Set, Anchored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
+
+ ASSERT_EQ(s.Add("foo", NULL), 0);
+ ASSERT_EQ(s.Add("(", NULL), -1);
+ ASSERT_EQ(s.Add("bar", NULL), 1);
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("foobar", NULL), false);
+ ASSERT_EQ(s.Match("fooba", NULL), false);
+ ASSERT_EQ(s.Match("oobar", NULL), false);
+ ASSERT_EQ(s.Match("foo", NULL), true);
+ ASSERT_EQ(s.Match("bar", NULL), true);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("foobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("fooba", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("oobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("foo", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+
+ ASSERT_EQ(s.Match("bar", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 1);
+}
+
+TEST(Set, EmptyUnanchored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("", NULL), false);
+ ASSERT_EQ(s.Match("foobar", NULL), false);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("foobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+}
+
+TEST(Set, EmptyAnchored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
+
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("", NULL), false);
+ ASSERT_EQ(s.Match("foobar", NULL), false);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("foobar", &v), false);
+ ASSERT_EQ(v.size(), 0);
+}
+
+TEST(Set, Prefix) {
+ RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
+
+ ASSERT_EQ(s.Add("/prefix/\\d*", NULL), 0);
+ ASSERT_EQ(s.Compile(), true);
+
+ ASSERT_EQ(s.Match("/prefix", NULL), false);
+ ASSERT_EQ(s.Match("/prefix/", NULL), true);
+ ASSERT_EQ(s.Match("/prefix/42", NULL), true);
+
+ std::vector<int> v;
+ ASSERT_EQ(s.Match("/prefix", &v), false);
+ ASSERT_EQ(v.size(), 0);
+
+ ASSERT_EQ(s.Match("/prefix/", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+
+ ASSERT_EQ(s.Match("/prefix/42", &v), true);
+ ASSERT_EQ(v.size(), 1);
+ ASSERT_EQ(v[0], 0);
+}
+
+TEST(Set, MoveSemantics) {
+ RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED);
+ ASSERT_EQ(s1.Add("foo\\d+", NULL), 0);
+ ASSERT_EQ(s1.Compile(), true);
+ ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true);
+ ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false);
+
+ // The moved-to object should do what the moved-from object did.
+ RE2::Set s2 = std::move(s1);
+ ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true);
+ ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false);
+
+ // The moved-from object should have been reset and be reusable.
+ ASSERT_EQ(s1.Add("bar\\d+", NULL), 0);
+ ASSERT_EQ(s1.Compile(), true);
+ ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false);
+ ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true);
+
+ // Verify that "overwriting" works and also doesn't leak memory.
+ // (The latter will need a leak detector such as LeakSanitizer.)
+ s1 = std::move(s2);
+ ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true);
+ ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/simplify_test.cc b/third_party/re2/src/re2/testing/simplify_test.cc
new file mode 100644
index 000000000..5b683f580
--- /dev/null
+++ b/third_party/re2/src/re2/testing/simplify_test.cc
@@ -0,0 +1,290 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test simplify.cc.
+
+#include <string.h>
+#include <string>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct Test {
+ const char* regexp;
+ const char* simplified;
+};
+
+static Test tests[] = {
+ // Already-simple constructs
+ { "a", "a" },
+ { "ab", "ab" },
+ { "a|b", "[a-b]" },
+ { "ab|cd", "ab|cd" },
+ { "(ab)*", "(ab)*" },
+ { "(ab)+", "(ab)+" },
+ { "(ab)?", "(ab)?" },
+ { ".", "." },
+ { "^", "^" },
+ { "$", "$" },
+ { "[ac]", "[ac]" },
+ { "[^ac]", "[^ac]" },
+
+ // Posix character classes
+ { "[[:alnum:]]", "[0-9A-Za-z]" },
+ { "[[:alpha:]]", "[A-Za-z]" },
+ { "[[:blank:]]", "[\\t ]" },
+ { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
+ { "[[:digit:]]", "[0-9]" },
+ { "[[:graph:]]", "[!-~]" },
+ { "[[:lower:]]", "[a-z]" },
+ { "[[:print:]]", "[ -~]" },
+ { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
+ { "[[:space:]]" , "[\\t-\\r ]" },
+ { "[[:upper:]]", "[A-Z]" },
+ { "[[:xdigit:]]", "[0-9A-Fa-f]" },
+
+ // Perl character classes
+ { "\\d", "[0-9]" },
+ { "\\s", "[\\t-\\n\\f-\\r ]" },
+ { "\\w", "[0-9A-Z_a-z]" },
+ { "\\D", "[^0-9]" },
+ { "\\S", "[^\\t-\\n\\f-\\r ]" },
+ { "\\W", "[^0-9A-Z_a-z]" },
+ { "[\\d]", "[0-9]" },
+ { "[\\s]", "[\\t-\\n\\f-\\r ]" },
+ { "[\\w]", "[0-9A-Z_a-z]" },
+ { "[\\D]", "[^0-9]" },
+ { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
+ { "[\\W]", "[^0-9A-Z_a-z]" },
+
+ // Posix repetitions
+ { "a{1}", "a" },
+ { "a{2}", "aa" },
+ { "a{5}", "aaaaa" },
+ { "a{0,1}", "a?" },
+ // The next three are illegible because Simplify inserts (?:)
+ // parens instead of () parens to avoid creating extra
+ // captured subexpressions. The comments show a version with fewer parens.
+ { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
+ { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
+ { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
+ { "a{0,2}", "(?:aa?)?" }, // (aa?)?
+ { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
+ { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
+ { "a{0,}", "a*" },
+ { "a{1,}", "a+" },
+ { "a{2,}", "aa+" },
+ { "a{5,}", "aaaaa+" },
+
+ // Test that operators simplify their arguments.
+ // (Simplify used to not simplify arguments to a {} repeat.)
+ { "(?:a{1,}){1,}", "a+" },
+ { "(a{1,}b{1,})", "(a+b+)" },
+ { "a{1,}|b{1,}", "a+|b+" },
+ { "(?:a{1,})*", "(?:a+)*" },
+ { "(?:a{1,})+", "a+" },
+ { "(?:a{1,})?", "(?:a+)?" },
+ { "a{0}", "" },
+
+ // Character class simplification
+ { "[ab]", "[a-b]" },
+ { "[a-za-za-z]", "[a-z]" },
+ { "[A-Za-zA-Za-z]", "[A-Za-z]" },
+ { "[ABCDEFGH]", "[A-H]" },
+ { "[AB-CD-EF-GH]", "[A-H]" },
+ { "[W-ZP-XE-R]", "[E-Z]" },
+ { "[a-ee-gg-m]", "[a-m]" },
+ { "[a-ea-ha-m]", "[a-m]" },
+ { "[a-ma-ha-e]", "[a-m]" },
+ { "[a-zA-Z0-9 -~]", "[ -~]" },
+
+ // Empty character classes
+ { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
+
+ // Full character classes
+ { "[[:cntrl:][:^cntrl:]]", "." },
+
+ // Unicode case folding.
+ { "(?i)A", "[Aa]" },
+ { "(?i)a", "[Aa]" },
+ { "(?i)K", "[Kk\\x{212a}]" },
+ { "(?i)k", "[Kk\\x{212a}]" },
+ { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
+ { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
+ { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
+ { "(?i)[\\x00-\\x{10ffff}]", "." },
+
+ // Empty string as a regular expression.
+ // Empty string must be preserved inside parens in order
+ // to make submatches work right, so these are less
+ // interesting than they used to be. ToString inserts
+ // explicit (?:) in place of non-parenthesized empty strings,
+ // to make them easier to spot for other parsers.
+ { "(a|b|)", "([a-b]|(?:))" },
+ { "(|)", "((?:)|(?:))" },
+ { "a()", "a()" },
+ { "(()|())", "(()|())" },
+ { "(a|)", "(a|(?:))" },
+ { "ab()cd()", "ab()cd()" },
+ { "()", "()" },
+ { "()*", "()*" },
+ { "()+", "()+" },
+ { "()?" , "()?" },
+ { "(){0}", "" },
+ { "(){1}", "()" },
+ { "(){1,}", "()+" },
+ { "(){0,2}", "(?:()()?)?" },
+
+ // For an empty-width op OR a concatenation or alternation of empty-width
+ // ops, test that the repetition count is capped at 1.
+ { "(?:^){0,}", "^*" }, // x{0,} -> x*
+ { "(?:$){28,}", "$+" }, // x{N,} -> x{1,} -> x+
+ { "(?-m:^){0,30}", "(?-m:^)?" }, // x{0,N} -> x{0,1} -> x?
+ { "(?-m:$){28,30}", "(?-m:$)" }, // x{N,M} -> x{1,1} -> x
+ { "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" },
+ { "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" },
+ // NonGreedy should also be handled.
+ { "(?:^){0,}?", "^*?" },
+ { "(?:$){28,}?", "$+?" },
+ { "(?-m:^){0,30}?", "(?-m:^)??" },
+ { "(?-m:$){28,30}?", "(?-m:$)" },
+ { "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" },
+ { "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" },
+
+ // Test that coalescing occurs and that the resulting repeats are simplified.
+ // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
+ { "a*a*", "a*" },
+ { "a*a+", "a+" },
+ { "a*a?", "a*" },
+ { "a*a{2}", "aa+" },
+ { "a*a{2,}", "aa+" },
+ { "a*a{2,3}", "aa+" },
+ { "a+a*", "a+" },
+ { "a+a+", "aa+" },
+ { "a+a?", "a+" },
+ { "a+a{2}", "aaa+" },
+ { "a+a{2,}", "aaa+" },
+ { "a+a{2,3}", "aaa+" },
+ { "a?a*", "a*" },
+ { "a?a+", "a+" },
+ { "a?a?", "(?:aa?)?" },
+ { "a?a{2}", "aaa?" },
+ { "a?a{2,}", "aa+" },
+ { "a?a{2,3}", "aa(?:aa?)?" },
+ { "a{2}a*", "aa+" },
+ { "a{2}a+", "aaa+" },
+ { "a{2}a?", "aaa?" },
+ { "a{2}a{2}", "aaaa" },
+ { "a{2}a{2,}", "aaaa+" },
+ { "a{2}a{2,3}", "aaaaa?" },
+ { "a{2,}a*", "aa+" },
+ { "a{2,}a+", "aaa+" },
+ { "a{2,}a?", "aa+" },
+ { "a{2,}a{2}", "aaaa+" },
+ { "a{2,}a{2,}", "aaaa+" },
+ { "a{2,}a{2,3}", "aaaa+" },
+ { "a{2,3}a*", "aa+" },
+ { "a{2,3}a+", "aaa+" },
+ { "a{2,3}a?", "aa(?:aa?)?" },
+ { "a{2,3}a{2}", "aaaaa?" },
+ { "a{2,3}a{2,}", "aaaa+" },
+ { "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
+ // With a char class, any char and any byte:
+ { "\\d*\\d*", "[0-9]*" },
+ { ".*.*", ".*" },
+ { "\\C*\\C*", "\\C*" },
+ // FoldCase works, but must be consistent:
+ { "(?i)A*a*", "[Aa]*" },
+ { "(?i)a+A+", "[Aa][Aa]+" },
+ { "(?i)A*(?-i)a*", "[Aa]*a*" },
+ { "(?i)a+(?-i)A+", "[Aa]+A+" },
+ // NonGreedy works, but must be consistent:
+ { "a*?a*?", "a*?" },
+ { "a+?a+?", "aa+?" },
+ { "a*?a*", "a*?a*" },
+ { "a+a+?", "a+a+?" },
+ // The second element is the literal, char class, any char or any byte:
+ { "a*a", "a+" },
+ { "\\d*\\d", "[0-9]+" },
+ { ".*.", ".+" },
+ { "\\C*\\C", "\\C+" },
+ // FoldCase works, but must be consistent:
+ { "(?i)A*a", "[Aa]+" },
+ { "(?i)a+A", "[Aa][Aa]+" },
+ { "(?i)A*(?-i)a", "[Aa]*a" },
+ { "(?i)a+(?-i)A", "[Aa]+A" },
+ // The second element is a literal string that begins with the literal:
+ { "a*aa", "aa+" },
+ { "a*aab", "aa+b" },
+ // FoldCase works, but must be consistent:
+ { "(?i)a*aa", "[Aa][Aa]+" },
+ { "(?i)a*aab", "[Aa][Aa]+[Bb]" },
+ { "(?i)a*(?-i)aa", "[Aa]*aa" },
+ { "(?i)a*(?-i)aab", "[Aa]*aab" },
+ // Negative tests with mismatching ops:
+ { "a*b*", "a*b*" },
+ { "\\d*\\D*", "[0-9]*[^0-9]*" },
+ { "a+b", "a+b" },
+ { "\\d+\\D", "[0-9]+[^0-9]" },
+ { "a?bb", "a?bb" },
+ // Negative tests with capturing groups:
+ { "(a*)a*", "(a*)a*" },
+ { "a+(a)", "a+(a)" },
+ { "(a?)(aa)", "(a?)(aa)" },
+ // Just for fun:
+ { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
+
+ // During coalescing, the child of the repeat changes, so we build a new
+ // repeat. The new repeat must have the min and max of the old repeat.
+ // Failure to copy them results in min=0 and max=0 -> empty match.
+ { "(?:a*aab){2}", "aa+baa+b" },
+
+ // During coalescing, the child of the capture changes, so we build a new
+ // capture. The new capture must have the cap of the old capture.
+ // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
+ { "(a*aab)", "(aa+b)" },
+
+ // Test squashing of **, ++, ?? et cetera.
+ { "(?:(?:a){0,}){0,}", "a*" },
+ { "(?:(?:a){1,}){1,}", "a+" },
+ { "(?:(?:a){0,1}){0,1}", "a?" },
+ { "(?:(?:a){0,}){1,}", "a*" },
+ { "(?:(?:a){0,}){0,1}", "a*" },
+ { "(?:(?:a){1,}){0,}", "a*" },
+ { "(?:(?:a){1,}){0,1}", "a*" },
+ { "(?:(?:a){0,1}){0,}", "a*" },
+ { "(?:(?:a){0,1}){1,}", "a*" },
+};
+
+TEST(TestSimplify, SimpleRegexps) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+ RegexpStatus status;
+ VLOG(1) << "Testing " << tests[i].regexp;
+ Regexp* re = Regexp::Parse(tests[i].regexp,
+ Regexp::MatchNL | (Regexp::LikePerl &
+ ~Regexp::OneLine),
+ &status);
+ ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
+ Regexp* sre = re->Simplify();
+ ASSERT_TRUE(sre != NULL);
+
+ // Check that already-simple regexps don't allocate new ones.
+ if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
+ ASSERT_TRUE(re == sre) << " " << tests[i].regexp
+ << " " << re->ToString() << " " << sre->ToString();
+ }
+
+ EXPECT_EQ(tests[i].simplified, sre->ToString())
+ << " " << tests[i].regexp << " " << sre->Dump();
+
+ re->Decref();
+ sre->Decref();
+ }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/string_generator.cc b/third_party/re2/src/re2/testing/string_generator.cc
new file mode 100644
index 000000000..1891b14a7
--- /dev/null
+++ b/third_party/re2/src/re2/testing/string_generator.cc
@@ -0,0 +1,141 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// String generator: generates all possible strings of up to
+ // maxlen letters using the set of letters in alphabet.
+// Fetch strings using a Java-like Next()/HasNext() interface.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "util/logging.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+StringGenerator::StringGenerator(int maxlen,
+ const std::vector<std::string>& alphabet)
+ : maxlen_(maxlen), alphabet_(alphabet),
+ generate_null_(false),
+ random_(false), nrandom_(0) {
+
+ // Degenerate case: no letters, no non-empty strings.
+ if (alphabet_.empty())
+ maxlen_ = 0;
+
+ // Next() will return empty string (digits_ is empty).
+ hasnext_ = true;
+}
+
+// Resets the string generator state to the beginning.
+void StringGenerator::Reset() {
+ digits_.clear();
+ hasnext_ = true;
+ random_ = false;
+ nrandom_ = 0;
+ generate_null_ = false;
+}
+
+// Increments the big number in digits_, returning true if successful.
+// Returns false if all the numbers have been used.
+bool StringGenerator::IncrementDigits() {
+ // First try to increment the current number.
+ for (int i = static_cast<int>(digits_.size()) - 1; i >= 0; i--) {
+ if (++digits_[i] < static_cast<int>(alphabet_.size()))
+ return true;
+ digits_[i] = 0;
+ }
+
+ // If that failed, make a longer number.
+ if (static_cast<int>(digits_.size()) < maxlen_) {
+ digits_.push_back(0);
+ return true;
+ }
+
+ return false;
+}
+
+ // Generates random digits_, returning true if successful.
+// Returns false if the random sequence is over.
+bool StringGenerator::RandomDigits() {
+ if (--nrandom_ <= 0)
+ return false;
+
+ std::uniform_int_distribution<int> random_len(0, maxlen_);
+ std::uniform_int_distribution<int> random_alphabet_index(
+ 0, static_cast<int>(alphabet_.size()) - 1);
+
+ // Pick length.
+ int len = random_len(rng_);
+ digits_.resize(len);
+ for (int i = 0; i < len; i++)
+ digits_[i] = random_alphabet_index(rng_);
+ return true;
+}
+
+// Returns the next string in the iteration, which is the one
+// currently described by digits_. Calls IncrementDigits
+// after computing the string, so that it knows the answer
+// for subsequent HasNext() calls.
+absl::string_view StringGenerator::Next() {
+ CHECK(hasnext_);
+ if (generate_null_) {
+ generate_null_ = false;
+ sp_ = absl::string_view();
+ return sp_;
+ }
+ s_.clear();
+ for (size_t i = 0; i < digits_.size(); i++) {
+ s_ += alphabet_[digits_[i]];
+ }
+ hasnext_ = random_ ? RandomDigits() : IncrementDigits();
+ sp_ = s_;
+ return sp_;
+}
+
+// Sets generator up to return n random strings.
+void StringGenerator::Random(int32_t seed, int n) {
+ rng_.seed(seed);
+
+ random_ = true;
+ nrandom_ = n;
+ hasnext_ = nrandom_ > 0;
+}
+
+void StringGenerator::GenerateNULL() {
+ generate_null_ = true;
+ hasnext_ = true;
+}
+
+std::string DeBruijnString(int n) {
+ CHECK_GE(n, 1);
+ CHECK_LE(n, 29);
+ const size_t size = size_t{1} << static_cast<size_t>(n);
+ const size_t mask = size - 1;
+ std::vector<bool> did(size, false);
+ std::string s;
+ s.reserve(static_cast<size_t>(n) + size);
+ for (size_t i = 0; i < static_cast<size_t>(n - 1); i++)
+ s += '0';
+ size_t bits = 0;
+ for (size_t i = 0; i < size; i++) {
+ bits <<= 1;
+ bits &= mask;
+ if (!did[bits | 1]) {
+ bits |= 1;
+ s += '1';
+ } else {
+ s += '0';
+ }
+ CHECK(!did[bits]);
+ did[bits] = true;
+ }
+ CHECK_EQ(s.size(), static_cast<size_t>(n - 1) + size);
+ return s;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/string_generator.h b/third_party/re2/src/re2/testing/string_generator.h
new file mode 100644
index 000000000..0d6f5fcba
--- /dev/null
+++ b/third_party/re2/src/re2/testing/string_generator.h
@@ -0,0 +1,75 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_TESTING_STRING_GENERATOR_H_
+#define RE2_TESTING_STRING_GENERATOR_H_
+
+// String generator: generates all possible strings of up to
+ // maxlen letters using the set of letters in alphabet.
+// Fetch strings using a Java-like Next()/HasNext() interface.
+
+#include <stdint.h>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace re2 {
+
+class StringGenerator {
+ public:
+ StringGenerator(int maxlen, const std::vector<std::string>& alphabet);
+ ~StringGenerator() {}
+
+ absl::string_view Next();
+ bool HasNext() { return hasnext_; }
+
+ // Resets generator to start sequence over.
+ void Reset();
+
+ // Causes generator to emit random strings for next n calls to Next().
+ void Random(int32_t seed, int n);
+
+ // Causes generator to emit a NULL string_view on the next call to Next().
+ void GenerateNULL();
+
+ private:
+ bool IncrementDigits();
+ bool RandomDigits();
+
+ // Global state.
+ int maxlen_; // Maximum length string to generate.
+ std::vector<std::string> alphabet_; // Alphabet, one string per letter.
+
+ // Iteration state.
+ absl::string_view sp_; // Last string_view returned by Next().
+ std::string s_; // String data in last string_view returned by Next().
+ bool hasnext_; // Whether Next() can be called again.
+ std::vector<int> digits_; // Alphabet indices for next string.
+ bool generate_null_; // Whether to generate a NULL string_view next.
+ bool random_; // Whether generated strings are random.
+ int nrandom_; // Number of random strings left to generate.
+ std::minstd_rand0 rng_; // Random number generator.
+
+ StringGenerator(const StringGenerator&) = delete;
+ StringGenerator& operator=(const StringGenerator&) = delete;
+};
+
+// Generates and returns a string over binary alphabet {0,1} that contains
+ // all possible binary sequences of length n as substrings. The obvious
+// brute force method would generate a string of length n * 2^n, but this
+// generates a string of length n-1 + 2^n called a De Bruijn cycle.
+// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
+//
+// Such a string is useful for testing a DFA. If you have a DFA
+// where distinct last n bytes implies distinct states, then running on a
+// DeBruijn string causes the DFA to need to create a new state at every
+// position in the input, never reusing any states until it gets to the
+// end of the string. This is the worst possible case for DFA execution.
+std::string DeBruijnString(int n);
+
+} // namespace re2
+
+#endif // RE2_TESTING_STRING_GENERATOR_H_
diff --git a/third_party/re2/src/re2/testing/string_generator_test.cc b/third_party/re2/src/re2/testing/string_generator_test.cc
new file mode 100644
index 000000000..b1273d9f6
--- /dev/null
+++ b/third_party/re2/src/re2/testing/string_generator_test.cc
@@ -0,0 +1,110 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test StringGenerator.
+
+#include <stdint.h>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "util/utf.h"
+#include "re2/testing/string_generator.h"
+#include "re2/testing/regexp_generator.h"
+
+namespace re2 {
+
+ // Returns i raised to the power e.
+static int64_t IntegerPower(int i, int e) {
+ int64_t p = 1;
+ while (e-- > 0)
+ p *= i;
+ return p;
+}
+
+// Checks that for given settings of the string generator:
+// * it generates strings that are non-decreasing in length.
+// * strings of the same length are sorted in alphabet order.
+// * it doesn't generate the same string twice.
+// * it generates the right number of strings.
+//
+// If all of these hold, the StringGenerator is behaving.
+// Assumes that the alphabet is sorted, so that the generated
+// strings can just be compared lexicographically.
+static void RunTest(int len, const std::string& alphabet, bool donull) {
+ StringGenerator g(len, Explode(alphabet));
+
+ int n = 0;
+ int last_l = -1;
+ std::string last_s;
+
+ if (donull) {
+ g.GenerateNULL();
+ EXPECT_TRUE(g.HasNext());
+ absl::string_view sp = g.Next();
+ EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
+ EXPECT_EQ(sp.size(), 0);
+ }
+
+ while (g.HasNext()) {
+ std::string s = std::string(g.Next());
+ n++;
+
+ // Check that all characters in s appear in alphabet.
+ for (const char *p = s.c_str(); *p != '\0'; ) {
+ Rune r;
+ p += chartorune(&r, p);
+ EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
+ }
+
+ // Check that string is properly ordered w.r.t. previous string.
+ int l = utflen(s.c_str());
+ EXPECT_LE(l, len);
+ if (last_l < l) {
+ last_l = l;
+ } else {
+ EXPECT_EQ(last_l, l);
+ EXPECT_LT(last_s, s);
+ }
+ last_s = s;
+ }
+
+ // Check total string count.
+ int64_t m = 0;
+ int alpha = utflen(alphabet.c_str());
+ if (alpha == 0) // Degenerate case.
+ len = 0;
+ for (int i = 0; i <= len; i++)
+ m += IntegerPower(alpha, i);
+ EXPECT_EQ(n, m);
+}
+
+TEST(StringGenerator, NoLength) {
+ RunTest(0, "abc", false);
+}
+
+TEST(StringGenerator, NoLengthNoAlphabet) {
+ RunTest(0, "", false);
+}
+
+TEST(StringGenerator, NoAlphabet) {
+ RunTest(5, "", false);
+}
+
+TEST(StringGenerator, Simple) {
+ RunTest(3, "abc", false);
+}
+
+TEST(StringGenerator, UTF8) {
+ RunTest(4, "abc\xE2\x98\xBA", false);
+}
+
+TEST(StringGenerator, GenNULL) {
+ RunTest(0, "abc", true);
+ RunTest(0, "", true);
+ RunTest(5, "", true);
+ RunTest(3, "abc", true);
+ RunTest(4, "abc\xE2\x98\xBA", true);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/tester.cc b/third_party/re2/src/re2/testing/tester.cc
new file mode 100644
index 000000000..a094cb4ff
--- /dev/null
+++ b/third_party/re2/src/re2/testing/tester.cc
@@ -0,0 +1,684 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression engine tester -- test all the implementations against each other.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+
+#include "absl/base/macros.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "re2/testing/tester.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+ABSL_FLAG(bool, dump_prog, false, "dump regexp program");
+ABSL_FLAG(bool, log_okay, false, "log successful runs");
+ABSL_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
+
+ABSL_FLAG(int, max_regexp_failures, 100,
+ "maximum number of regexp test failures (-1 = unlimited)");
+
+ABSL_FLAG(std::string, regexp_engines, "",
+ "pattern to select regexp engines to test");
+
+namespace re2 {
+
+enum {
+ kMaxSubmatch = 1+16, // $0...$16
+};
+
+const char* engine_names[kEngineMax] = {
+ "Backtrack",
+ "NFA",
+ "DFA",
+ "DFA1",
+ "OnePass",
+ "BitState",
+ "RE2",
+ "RE2a",
+ "RE2b",
+ "PCRE",
+};
+
+// Returns the name of the engine.
+static const char* EngineName(Engine e) {
+ CHECK_GE(e, 0);
+ CHECK_LT(e, ABSL_ARRAYSIZE(engine_names));
+ CHECK(engine_names[e] != NULL);
+ return engine_names[e];
+}
+
+// Returns bit mask of engines to use.
+static uint32_t Engines() {
+ static bool did_parse = false;
+ static uint32_t cached_engines = 0;
+
+ if (did_parse)
+ return cached_engines;
+
+ if (absl::GetFlag(FLAGS_regexp_engines).empty()) {
+ cached_engines = ~0;
+ } else {
+ for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
+ if (absl::GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
+ cached_engines |= 1<<i;
+ }
+
+ if (cached_engines == 0)
+ LOG(INFO) << "Warning: no engines enabled.";
+ if (!UsingPCRE)
+ cached_engines &= ~(1<<kEnginePCRE);
+ for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
+ if (cached_engines & (1<<i))
+ LOG(INFO) << EngineName(i) << " enabled";
+ }
+
+ did_parse = true;
+ return cached_engines;
+}
+
+// The result of running a match.
+struct TestInstance::Result {
+ Result()
+ : skipped(false),
+ matched(false),
+ untrusted(false),
+ have_submatch(false),
+ have_submatch0(false) {
+ ClearSubmatch();
+ }
+
+ void ClearSubmatch() {
+ for (int i = 0; i < kMaxSubmatch; i++)
+ submatch[i] = absl::string_view();
+ }
+
+ bool skipped; // test skipped: wasn't applicable
+ bool matched; // found a match
+ bool untrusted; // don't really trust the answer
+ bool have_submatch; // computed all submatch info
+ bool have_submatch0; // computed just submatch[0]
+ absl::string_view submatch[kMaxSubmatch];
+};
+
+typedef TestInstance::Result Result;
+
+// Formats a single capture range s in text in the form (a,b)
+// where a and b are the starting and ending offsets of s in text.
+static std::string FormatCapture(absl::string_view text,
+ absl::string_view s) {
+ if (s.data() == NULL)
+ return "(?,?)";
+ return absl::StrFormat("(%d,%d)",
+ BeginPtr(s) - BeginPtr(text),
+ EndPtr(s) - BeginPtr(text));
+}
+
+// Returns whether text contains non-ASCII (>= 0x80) bytes.
+static bool NonASCII(absl::string_view text) {
+ for (size_t i = 0; i < text.size(); i++)
+ if ((uint8_t)text[i] >= 0x80)
+ return true;
+ return false;
+}
+
+// Returns string representation of match kind.
+static std::string FormatKind(Prog::MatchKind kind) {
+ switch (kind) {
+ case Prog::kFullMatch:
+ return "full match";
+ case Prog::kLongestMatch:
+ return "longest match";
+ case Prog::kFirstMatch:
+ return "first match";
+ case Prog::kManyMatch:
+ return "many match";
+ }
+ return "???";
+}
+
+// Returns string representation of anchor kind.
+static std::string FormatAnchor(Prog::Anchor anchor) {
+ switch (anchor) {
+ case Prog::kAnchored:
+ return "anchored";
+ case Prog::kUnanchored:
+ return "unanchored";
+ }
+ return "???";
+}
+
+struct ParseMode {
+ Regexp::ParseFlags parse_flags;
+ std::string desc;
+};
+
+static const Regexp::ParseFlags single_line =
+ Regexp::LikePerl;
+static const Regexp::ParseFlags multi_line =
+ static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
+
+static ParseMode parse_modes[] = {
+ { single_line, "single-line" },
+ { single_line|Regexp::Latin1, "single-line, latin1" },
+ { multi_line, "multiline" },
+ { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
+ { multi_line|Regexp::Latin1, "multiline, latin1" },
+};
+
+static std::string FormatMode(Regexp::ParseFlags flags) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(parse_modes); i++)
+ if (parse_modes[i].parse_flags == flags)
+ return parse_modes[i].desc;
+ return absl::StrFormat("%#x", static_cast<uint32_t>(flags));
+}
+
+// Constructs and saves all the matching engines that
+// will be required for the given tests.
+TestInstance::TestInstance(absl::string_view regexp_str, Prog::MatchKind kind,
+ Regexp::ParseFlags flags)
+ : regexp_str_(regexp_str),
+ kind_(kind),
+ flags_(flags),
+ error_(false),
+ regexp_(NULL),
+ num_captures_(0),
+ prog_(NULL),
+ rprog_(NULL),
+ re_(NULL),
+ re2_(NULL) {
+
+ VLOG(1) << absl::CEscape(regexp_str);
+
+ // Compile regexp to prog.
+ // Always required - needed for backtracking (reference implementation).
+ RegexpStatus status;
+ regexp_ = Regexp::Parse(regexp_str, flags, &status);
+ if (regexp_ == NULL) {
+ LOG(INFO) << "Cannot parse: " << absl::CEscape(regexp_str_)
+ << " mode: " << FormatMode(flags);
+ error_ = true;
+ return;
+ }
+ num_captures_ = regexp_->NumCaptures();
+ prog_ = regexp_->CompileToProg(0);
+ if (prog_ == NULL) {
+ LOG(INFO) << "Cannot compile: " << absl::CEscape(regexp_str_);
+ error_ = true;
+ return;
+ }
+ if (absl::GetFlag(FLAGS_dump_prog)) {
+ LOG(INFO) << "Prog for "
+ << " regexp "
+ << absl::CEscape(regexp_str_)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatMode(flags_)
+ << ")\n"
+ << prog_->Dump();
+ }
+
+ // Compile regexp to reversed prog. Only needed for DFA engines.
+ if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
+ rprog_ = regexp_->CompileToReverseProg(0);
+ if (rprog_ == NULL) {
+ LOG(INFO) << "Cannot reverse compile: " << absl::CEscape(regexp_str_);
+ error_ = true;
+ return;
+ }
+ if (absl::GetFlag(FLAGS_dump_rprog))
+ LOG(INFO) << rprog_->Dump();
+ }
+
+ // Create re string that will be used for RE and RE2.
+ std::string re = std::string(regexp_str);
+ // Accommodate flags.
+ // Regexp::Latin1 will be accommodated below.
+ if (!(flags & Regexp::OneLine))
+ re = "(?m)" + re;
+ if (flags & Regexp::NonGreedy)
+ re = "(?U)" + re;
+ if (flags & Regexp::DotNL)
+ re = "(?s)" + re;
+
+ // Compile regexp to RE2.
+ if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
+ RE2::Options options;
+ if (flags & Regexp::Latin1)
+ options.set_encoding(RE2::Options::EncodingLatin1);
+ if (kind_ == Prog::kLongestMatch)
+ options.set_longest_match(true);
+ re2_ = new RE2(re, options);
+ if (!re2_->error().empty()) {
+ LOG(INFO) << "Cannot RE2: " << absl::CEscape(re);
+ error_ = true;
+ return;
+ }
+ }
+
+ // Compile regexp to RE.
+ // PCRE as exposed by the RE interface isn't always usable.
+ // 1. It disagrees about handling of empty-string reptitions
+ // like matching (a*)* against "b". PCRE treats the (a*) as
+ // occurring once, while we treat it as occurring not at all.
+ // 2. It treats $ as this weird thing meaning end of string
+ // or before the \n at the end of the string.
+ // 3. It doesn't implement POSIX leftmost-longest matching.
+ // 4. It lets \s match vertical tab.
+ // MimicsPCRE() detects 1 and 2.
+ if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
+ kind_ != Prog::kLongestMatch) {
+ PCRE_Options o;
+ o.set_option(PCRE::UTF8);
+ if (flags & Regexp::Latin1)
+ o.set_option(PCRE::None);
+ // PCRE has interface bug keeping us from finding $0, so
+ // add one more layer of parens.
+ re_ = new PCRE("("+re+")", o);
+ if (!re_->error().empty()) {
+ LOG(INFO) << "Cannot PCRE: " << absl::CEscape(re);
+ error_ = true;
+ return;
+ }
+ }
+}
+
+// Releases everything the constructor managed to build.  Members that
+// were never built are still NULL, which every step below tolerates.
+TestInstance::~TestInstance() {
+  // The parsed regexp is released through Decref() rather than delete.
+  if (regexp_ != NULL) {
+    regexp_->Decref();
+  }
+
+  // Compiled programs, then the PCRE and RE2 wrappers.
+  delete prog_;
+  delete rprog_;
+  delete re_;
+  delete re2_;
+}
+
+// Runs a single search using the named engine type.
+// This interface hides all the irregularities of the various
+// engine interfaces from the rest of this file.
+//
+// The outcome is written into *result:
+//   skipped        - the engine could not or should not run this case.
+//   matched        - whether the engine reported a match.
+//   have_submatch  - all of result->submatch is meaningful.
+//   have_submatch0 - only result->submatch[0] is meaningful.
+//   untrusted      - PCRE hit its limit, so the answer is unreliable.
+// When there is no match, the submatches are cleared before returning.
+void TestInstance::RunSearch(Engine type, absl::string_view orig_text,
+ absl::string_view orig_context,
+ Prog::Anchor anchor, Result* result) {
+ // Nothing can run if the regexp failed to parse.
+ if (regexp_ == NULL) {
+ result->skipped = true;
+ return;
+ }
+ int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
+ // Never ask an engine for more than kMaxSubmatch submatches.
+ if (nsubmatch > kMaxSubmatch)
+ nsubmatch = kMaxSubmatch;
+
+ absl::string_view text = orig_text;
+ absl::string_view context = orig_context;
+
+ switch (type) {
+ default:
+ // NOTE(review): there is no break here; this relies on LOG(FATAL)
+ // not returning -- confirm.
+ LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+
+ case kEngineBacktrack:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineNFA:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->SearchNFA(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineDFA:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ // Match/no-match only: no submatch pointer is passed.  SearchDFA
+ // receives &result->skipped and may set it itself.
+ result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
+ &result->skipped, NULL);
+ break;
+
+ case kEngineDFA1:
+ if (prog_ == NULL || rprog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
+ &result->skipped, NULL);
+ // If anchored, no need for second run,
+ // but do it anyway to find more bugs.
+ if (result->matched) {
+ // Re-run with the reverse program over the span just found; a failure
+ // here means the forward and reverse DFAs disagree.
+ if (!rprog_->SearchDFA(result->submatch[0], context,
+ Prog::kAnchored, Prog::kLongestMatch,
+ result->submatch,
+ &result->skipped, NULL)) {
+ LOG(ERROR) << "Reverse DFA inconsistency: "
+ << absl::CEscape(regexp_str_)
+ << " on " << absl::CEscape(text);
+ result->matched = false;
+ }
+ }
+ result->have_submatch0 = true;
+ break;
+
+ case kEngineOnePass:
+ // One-pass search is skipped for unanchored searches, for programs
+ // that are not one-pass, and when too many captures are requested.
+ if (prog_ == NULL ||
+ !prog_->IsOnePass() ||
+ anchor == Prog::kUnanchored ||
+ nsubmatch > Prog::kMaxOnePassCapture) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineBitState:
+ if (prog_ == NULL ||
+ !prog_->CanBitState()) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchBitState(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ // NOTE(review): the Engine enum describes RE2a as asking only for
+ // match[0] and RE2b as asking only whether it matched, but all three
+ // variants request the same nsubmatch here -- confirm this is intended.
+ case kEngineRE2:
+ case kEngineRE2a:
+ case kEngineRE2b: {
+ // Skipped unless text ends exactly where context ends.
+ if (!re2_ || EndPtr(text) != EndPtr(context)) {
+ result->skipped = true;
+ break;
+ }
+
+ RE2::Anchor re_anchor;
+ if (anchor == Prog::kAnchored)
+ re_anchor = RE2::ANCHOR_START;
+ else
+ re_anchor = RE2::UNANCHORED;
+ // A full-match kind overrides the requested anchoring.
+ if (kind_ == Prog::kFullMatch)
+ re_anchor = RE2::ANCHOR_BOTH;
+
+ // RE2::Match is given the whole context plus the offsets of text in it.
+ result->matched = re2_->Match(
+ context,
+ static_cast<size_t>(BeginPtr(text) - BeginPtr(context)),
+ static_cast<size_t>(EndPtr(text) - BeginPtr(context)),
+ re_anchor,
+ result->submatch,
+ nsubmatch);
+ result->have_submatch = nsubmatch > 0;
+ break;
+ }
+
+ case kEnginePCRE: {
+ // PCRE is only run when text and context cover exactly the same bytes.
+ if (!re_ || BeginPtr(text) != BeginPtr(context) ||
+ EndPtr(text) != EndPtr(context)) {
+ result->skipped = true;
+ break;
+ }
+
+ // In Perl/PCRE, \v matches any character considered vertical
+ // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
+ // unable to handle all cases of this, unfortunately, so just
+ // catch them here. :(
+ if (regexp_str_.find("\\v") != absl::string_view::npos &&
+ (text.find('\n') != absl::string_view::npos ||
+ text.find('\f') != absl::string_view::npos ||
+ text.find('\r') != absl::string_view::npos)) {
+ result->skipped = true;
+ break;
+ }
+
+ // PCRE 8.34 or so started allowing vertical tab to match \s,
+ // following a change made in Perl 5.18. RE2 does not.
+ if ((regexp_str_.find("\\s") != absl::string_view::npos ||
+ regexp_str_.find("\\S") != absl::string_view::npos) &&
+ text.find('\v') != absl::string_view::npos) {
+ result->skipped = true;
+ break;
+ }
+
+ // Build one PCRE::Arg per submatch, each pointing at the matching slot
+ // of result->submatch.  Both arrays are freed on every exit path below.
+ const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
+ PCRE::Arg *a = new PCRE::Arg[nsubmatch];
+ for (int i = 0; i < nsubmatch; i++) {
+ a[i] = PCRE::Arg(&result->submatch[i]);
+ argptr[i] = &a[i];
+ }
+ size_t consumed;
+ PCRE::Anchor pcre_anchor;
+ if (anchor == Prog::kAnchored)
+ pcre_anchor = PCRE::ANCHOR_START;
+ else
+ pcre_anchor = PCRE::UNANCHORED;
+ if (kind_ == Prog::kFullMatch)
+ pcre_anchor = PCRE::ANCHOR_BOTH;
+ re_->ClearHitLimit();
+ result->matched =
+ re_->DoMatch(text,
+ pcre_anchor,
+ &consumed,
+ argptr, nsubmatch);
+ // If PCRE hit its limit, flag the result as untrusted and do not
+ // claim to have submatches.
+ if (re_->HitLimit()) {
+ result->untrusted = true;
+ delete[] argptr;
+ delete[] a;
+ break;
+ }
+ result->have_submatch = true;
+ delete[] argptr;
+ delete[] a;
+ break;
+ }
+ }
+
+ // Submatches are meaningless without a match; clear them.
+ if (!result->matched)
+ result->ClearSubmatch();
+}
+
+// Decides whether result r is acceptable given the reference result
+// `correct`.  A skipped result is always acceptable.  Otherwise the match
+// verdicts must agree, and every submatch r claims to have must be
+// identical (same data pointer and same size) to the reference: all
+// kMaxSubmatch of them when r.have_submatch is set, only submatch[0] when
+// just r.have_submatch0 is set, and none when neither is set.
+static bool ResultOkay(const Result& r, const Result& correct) {
+  if (r.skipped)
+    return true;
+  if (r.matched != correct.matched)
+    return false;
+  if (!r.have_submatch && !r.have_submatch0)
+    return true;
+
+  for (int k = 0; k < kMaxSubmatch; ++k) {
+    const bool same_span =
+        correct.submatch[k].data() == r.submatch[k].data() &&
+        correct.submatch[k].size() == r.submatch[k].size();
+    if (!same_span)
+      return false;
+    // With only have_submatch0 set, checking submatch[0] is enough.
+    if (!r.have_submatch)
+      return true;
+  }
+  return true;
+}
+
+// Runs a single test: searches for the regexp in text (which lies within
+// context) using the given anchoring, once with the backtracking engine as
+// the reference and then with every other engine enabled in Engines().
+// Each engine's result is compared with the reference via ResultOkay and
+// mismatches are logged in detail.
+//
+// Returns true if every engine agreed with the reference or was skipped;
+// mismatches from untrusted results and PCRE mismatches on non-ASCII text
+// do not count as failures.  Also returns true when the regexp did not
+// parse, since there is nothing to test.  Aborts via LOG(QFATAL) once the
+// global --max_regexp_failures budget is used up.
+bool TestInstance::RunCase(absl::string_view text, absl::string_view context,
+                           Prog::Anchor anchor) {
+  // Backtracking is the gold standard.
+  Result correct;
+  RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+  if (correct.skipped) {
+    if (regexp_ == NULL)
+      return true;
+    LOG(ERROR) << "Skipped backtracking! " << absl::CEscape(regexp_str_)
+               << " " << FormatMode(flags_);
+    return false;
+  }
+  VLOG(1) << "Try: regexp " << absl::CEscape(regexp_str_)
+          << " text " << absl::CEscape(text)
+          << " (" << FormatKind(kind_)
+          << ", " << FormatAnchor(anchor)
+          << ", " << FormatMode(flags_)
+          << ")";
+
+  // Number of submatches to report on a mismatch.  RunSearch never asks an
+  // engine for more than kMaxSubmatch submatches and ResultOkay compares
+  // exactly kMaxSubmatch of them, so clamp here as well rather than
+  // indexing submatch[] past that bound for regexps with many captures.
+  int nreport = 1 + num_captures_;
+  if (nreport > kMaxSubmatch)
+    nreport = kMaxSubmatch;
+
+  // Compare the others.
+  bool all_okay = true;
+  for (Engine e = kEngineBacktrack+1; e < kEngineMax; e++) {
+    if (!(Engines() & (1<<e)))
+      continue;
+
+    Result r;
+    RunSearch(e, text, context, anchor, &r);
+    if (ResultOkay(r, correct)) {
+      if (absl::GetFlag(FLAGS_log_okay))
+        LogMatch(r.skipped ? "Skipped: " : "Okay: ", e, text, context, anchor);
+      continue;
+    }
+
+    // We disagree with PCRE on the meaning of some Unicode matches.
+    // In particular, we treat non-ASCII UTF-8 as non-word characters.
+    // We also treat "empty" character sets like [^\w\W] as being
+    // impossible to match, while PCRE apparently excludes some code
+    // points (e.g., 0x0080) from both \w and \W.
+    if (e == kEnginePCRE && NonASCII(text))
+      continue;
+
+    // An untrusted result is still logged, but does not fail the case.
+    if (!r.untrusted)
+      all_okay = false;
+
+    LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", e, text,
+             context, anchor);
+    if (r.matched != correct.matched) {
+      if (r.matched) {
+        LOG(INFO) << " Should not match (but does).";
+      } else {
+        // No match was found, so there are no submatches to report.
+        LOG(INFO) << " Should match (but does not).";
+        continue;
+      }
+    }
+    for (int j = 0; j < nreport; j++) {
+      if (r.submatch[j].data() != correct.submatch[j].data() ||
+          r.submatch[j].size() != correct.submatch[j].size()) {
+        LOG(INFO) <<
+          absl::StrFormat(" $%d: should be %s is %s",
+                          j,
+                          FormatCapture(text, correct.submatch[j]),
+                          FormatCapture(text, r.submatch[j]));
+      } else {
+        LOG(INFO) <<
+          absl::StrFormat(" $%d: %s ok", j,
+                          FormatCapture(text, r.submatch[j]));
+      }
+    }
+  }
+
+  if (!all_okay) {
+    // This will be initialised once (after flags have been initialised)
+    // and that is desirable because we want to enforce a global limit.
+    static int max_regexp_failures = absl::GetFlag(FLAGS_max_regexp_failures);
+    if (max_regexp_failures > 0 && --max_regexp_failures == 0)
+      LOG(QFATAL) << "Too many regexp failures.";
+  }
+
+  return all_okay;
+}
+
+// Logs one INFO line describing a search: the prefix, the engine name,
+// the regexp both as given and as re-rendered by ToString(), the text with
+// its begin/end offsets inside context, the context, and the match kind,
+// anchoring and parse mode in effect.
+void TestInstance::LogMatch(const char* prefix, Engine e,
+                            absl::string_view text, absl::string_view context,
+                            Prog::Anchor anchor) {
+  // Where text sits inside context, measured from the start of context.
+  const auto text_begin = BeginPtr(text) - BeginPtr(context);
+  const auto text_end = EndPtr(text) - BeginPtr(context);
+
+  LOG(INFO) << prefix << EngineName(e)
+            << " regexp " << absl::CEscape(regexp_str_)
+            << " " << absl::CEscape(regexp_->ToString())
+            << " text " << absl::CEscape(text)
+            << " (" << text_begin << "," << text_end << ") of context "
+            << absl::CEscape(context)
+            << " (" << FormatKind(kind_)
+            << ", " << FormatAnchor(anchor)
+            << ", " << FormatMode(flags_)
+            << ")";
+}
+
+// Match kinds exercised for every regexp.  Tester's constructor builds one
+// TestInstance for each (kind, parse mode) combination.
+static Prog::MatchKind kinds[] = {
+ Prog::kFirstMatch,
+ Prog::kLongestMatch,
+ Prog::kFullMatch,
+};
+
+// Builds one TestInstance for every combination of match kind and parse
+// mode.  error_ ends up true if any of the instances failed to construct;
+// the instances are kept in v_ either way.
+Tester::Tester(absl::string_view regexp) {
+  error_ = false;
+  for (Prog::MatchKind kind : kinds) {
+    for (const auto& mode : parse_modes) {
+      TestInstance* instance =
+          new TestInstance(regexp, kind, mode.parse_flags);
+      if (instance->error())
+        error_ = true;
+      v_.push_back(instance);
+    }
+  }
+}
+
+// Deletes every TestInstance created by the constructor.
+Tester::~Tester() {
+  for (TestInstance* instance : v_)
+    delete instance;
+}
+
+// Runs one (text, context, anchor) case against every TestInstance.
+// An instance that failed to construct counts as a failure and is not run;
+// all remaining instances are still run even after a failure has been seen.
+// Returns true only if every instance was error-free and passed.
+bool Tester::TestCase(absl::string_view text, absl::string_view context,
+                      Prog::Anchor anchor) {
+  bool okay = true;
+  for (TestInstance* instance : v_) {
+    if (instance->error() || !instance->RunCase(text, context, anchor))
+      okay = false;
+  }
+  return okay;
+}
+
+// Anchoring modes that Tester::TestInputInContext tries for each input.
+static Prog::Anchor anchors[] = {
+ Prog::kAnchored,
+ Prog::kUnanchored
+};
+
+// Tests text as its own context and, when text is non-empty, also tests
+// text minus its first byte and text minus its last byte, each within the
+// full text as context.  All variants are always run; the result is true
+// only if every one of them passed.
+bool Tester::TestInput(absl::string_view text) {
+  bool okay = TestInputInContext(text, text);
+  if (text.empty())
+    return okay;
+
+  // Drop the first byte: the search starts one byte into the context.
+  const absl::string_view without_first = text.substr(1);
+  if (!TestInputInContext(without_first, text))
+    okay = false;
+
+  // Drop the last byte: the search ends one byte before the context does.
+  const absl::string_view without_last = text.substr(0, text.size() - 1);
+  if (!TestInputInContext(without_last, text))
+    okay = false;
+
+  return okay;
+}
+
+// Runs TestCase(text, context, anchor) once per entry of anchors[].
+// Every anchoring mode is tried even if an earlier one failed; returns
+// true only if all of them passed.
+bool Tester::TestInputInContext(absl::string_view text,
+                                absl::string_view context) {
+  bool okay = true;
+  for (Prog::Anchor anchor : anchors) {
+    if (!TestCase(text, context, anchor))
+      okay = false;
+  }
+  return okay;
+}
+
+// Convenience entry point: builds a Tester for regexp, runs it over text
+// via TestInput, and returns whether everything passed.
+bool TestRegexpOnText(absl::string_view regexp,
+                      absl::string_view text) {
+  Tester tester(regexp);
+  const bool passed = tester.TestInput(text);
+  return passed;
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/testing/tester.h b/third_party/re2/src/re2/testing/tester.h
new file mode 100644
index 000000000..59be5ea0a
--- /dev/null
+++ b/third_party/re2/src/re2/testing/tester.h
@@ -0,0 +1,121 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_TESTING_TESTER_H_
+#define RE2_TESTING_TESTER_H_
+
+// Comparative tester for regular expression matching.
+// Checks all implementations against each other.
+
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/re2.h"
+#include "util/pcre.h"
+
+namespace re2 {
+
+// All the supported regexp engines.
+// The values are consecutive starting at 0, so they double as bit
+// positions in the engine mask (1<<engine) and as loop indices running
+// from kEngineBacktrack up to, but not including, kEngineMax.
+enum Engine {
+ kEngineBacktrack = 0, // Prog::UnsafeSearchBacktrack
+ kEngineNFA, // Prog::SearchNFA
+ kEngineDFA, // Prog::SearchDFA, only ask whether it matched
+ kEngineDFA1, // Prog::SearchDFA, ask for match[0]
+ kEngineOnePass, // Prog::SearchOnePass, if applicable
+ kEngineBitState, // Prog::SearchBitState
+ kEngineRE2, // RE2, all submatches
+ kEngineRE2a, // RE2, only ask for match[0]
+ kEngineRE2b, // RE2, only ask whether it matched
+ kEnginePCRE, // PCRE (util/pcre.h)
+
+ kEngineMax, // number of engines; not an engine itself
+};
+
+// Make normal math on the enum preserve the type.
+// C++ defines no ++ for enums and e+1 has type int, so this post-increment
+// steps e to the next enumerator by casting the sum back to Engine.
+// Unlike the built-in operator it returns nothing.
+static inline void operator++(Engine& e, int /*unused*/) {
+  const int next = static_cast<int>(e) + 1;
+  e = static_cast<Engine>(next);
+}
+
+// Engine + int, yielding an Engine rather than an int.
+static inline Engine operator+(Engine e, int i) {
+  const int sum = static_cast<int>(e) + i;
+  return static_cast<Engine>(sum);
+}
+
+// A TestInstance caches per-regexp state for a given
+// regular expression in a given configuration
+// (UTF-8 vs Latin1, longest vs first match, etc.).
+// It is not copyable.
+class TestInstance {
+ public:
+ // Outcome of one engine run; declared here, defined elsewhere.
+ struct Result;
+
+ // Parses and compiles regexp for the given match kind and parse flags.
+ // Check error() afterwards to see whether construction succeeded.
+ TestInstance(absl::string_view regexp, Prog::MatchKind kind,
+ Regexp::ParseFlags flags);
+ ~TestInstance();
+ // Parse flags this instance was constructed with.
+ Regexp::ParseFlags flags() { return flags_; }
+ // True if the constructor hit an error.
+ bool error() { return error_; }
+
+ // Runs a single test case: search in text, which is in context,
+ // using the given anchoring.
+ bool RunCase(absl::string_view text, absl::string_view context,
+ Prog::Anchor anchor);
+
+ private:
+ // Runs a single search using the named engine type.
+ void RunSearch(Engine type, absl::string_view text, absl::string_view context,
+ Prog::Anchor anchor, Result* result);
+
+ // Logs one line describing a search by engine e, starting with prefix.
+ void LogMatch(const char* prefix, Engine e, absl::string_view text,
+ absl::string_view context, Prog::Anchor anchor);
+
+ // NOTE: regexp_str_ is a view, not a copy, of the constructor argument.
+ absl::string_view regexp_str_; // regexp being tested
+ Prog::MatchKind kind_; // kind of match
+ Regexp::ParseFlags flags_; // flags for parsing regexp_str_
+ bool error_; // error during constructor?
+
+ Regexp* regexp_; // parsed regexp
+ int num_captures_; // regexp_->NumCaptures() cached
+ Prog* prog_; // compiled program
+ Prog* rprog_; // compiled reverse program
+ PCRE* re_; // PCRE implementation
+ RE2* re2_; // RE2 implementation
+
+ TestInstance(const TestInstance&) = delete;
+ TestInstance& operator=(const TestInstance&) = delete;
+};
+
+// A group of TestInstances for all possible configurations.
+// Not copyable; the TestInstances held in v_ are deleted by the destructor.
+class Tester {
+ public:
+ // Builds a TestInstance for each match kind and parse mode of regexp.
+ explicit Tester(absl::string_view regexp);
+ ~Tester();
+
+ // True if any TestInstance reported an error during construction.
+ bool error() { return error_; }
+
+ // Runs a single test case: search in text, which is in context,
+ // using the given anchoring.
+ bool TestCase(absl::string_view text, absl::string_view context,
+ Prog::Anchor anchor);
+
+ // Run TestCase(text, text, anchor) for all anchoring modes.
+ // For non-empty text, also tries text without its first byte and text
+ // without its last byte, each inside the full text as context.
+ bool TestInput(absl::string_view text);
+
+ // Run TestCase(text, context, anchor) for all anchoring modes.
+ bool TestInputInContext(absl::string_view text, absl::string_view context);
+
+ private:
+ bool error_; // did any instance fail to construct?
+ std::vector<TestInstance*> v_; // one instance per configuration
+
+ Tester(const Tester&) = delete;
+ Tester& operator=(const Tester&) = delete;
+};
+
+// Run all possible tests using regexp and text.
+// Returns true if every test passed.
+bool TestRegexpOnText(absl::string_view regexp, absl::string_view text);
+
+} // namespace re2
+
+#endif // RE2_TESTING_TESTER_H_
diff --git a/third_party/re2/src/re2/tostring.cc b/third_party/re2/src/re2/tostring.cc
new file mode 100644
index 000000000..33179fdeb
--- /dev/null
+++ b/third_party/re2/src/re2/tostring.cc
@@ -0,0 +1,350 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Format a regular expression structure as a string.
+// Tested by parse_test.cc
+
+#include <string.h>
+#include <string>
+
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "util/utf.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Precedence contexts used while printing.  PreVisit compares the context
+// inherited from the parent against these values: an operator is wrapped
+// in (?:...) when the parent's context is numerically smaller than the
+// operator's own level.
+enum {
+ PrecAtom,
+ PrecUnary,
+ PrecConcat,
+ PrecAlternate,
+ PrecEmpty,
+ PrecParen,
+ PrecToplevel,
+};
+
+// Helper function. See description below.
+// Declared early because AppendLiteral uses it before it is defined.
+static void AppendCCRange(std::string* t, Rune lo, Rune hi);
+
+// Walker to generate string in s_.
+// The arg pointers are actually integers giving the
+// context precedence.
+// The child_args are always NULL.
+// The output is appended to the std::string passed to the constructor;
+// the walker keeps only a pointer to it.  Not copyable.
+class ToStringWalker : public Regexp::Walker<int> {
+ public:
+ explicit ToStringWalker(std::string* t) : t_(t) {}
+
+ // Called before re's children: opens any needed bracket and returns the
+ // precedence context for the children.
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
+ // Called after re's children: appends re's own text, suffix or closing
+ // bracket.
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args);
+ // Appends nothing and returns 0.
+ virtual int ShortVisit(Regexp* re, int parent_arg) {
+ return 0;
+ }
+
+ private:
+ std::string* t_; // The string the walker appends to.
+
+ ToStringWalker(const ToStringWalker&) = delete;
+ ToStringWalker& operator=(const ToStringWalker&) = delete;
+};
+
+// Renders this regexp as a string by walking it with a ToStringWalker,
+// starting in the top-level precedence context.  The walk is bounded by
+// the 100000 passed to WalkExponential; if it stops early, the partial
+// output is marked with " [truncated]".
+std::string Regexp::ToString() {
+  std::string out;
+  ToStringWalker walker(&out);
+  walker.WalkExponential(this, PrecToplevel, 100000);
+  if (walker.stopped_early())
+    out.append(" [truncated]");
+  return out;
+}
+
+// From here on, any use of the name ToString is rewritten to an
+// identifier this file does not define.
+#define ToString DontCallToString // Avoid accidental recursion.
+
+// Visits re before children are processed.
+// Emits whatever opening bracket re needs -- "(?:" when re's own level is
+// above the context inherited from the parent, or "(" (plus "?P<name>"
+// for a named group) for a capture -- and returns the precedence context
+// to hand to re's children.  The stop flag is never touched.
+int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
+  const int outer = parent_arg;  // context supplied by the parent
+
+  switch (re->op()) {
+    case kRegexpConcat:
+    case kRegexpLiteralString:
+      if (outer < PrecConcat)
+        t_->append("(?:");
+      return PrecConcat;
+
+    case kRegexpAlternate:
+      if (outer < PrecAlternate)
+        t_->append("(?:");
+      return PrecAlternate;
+
+    case kRegexpCapture:
+      t_->append("(");
+      if (re->cap() == 0)
+        LOG(DFATAL) << "kRegexpCapture cap() == 0";
+      if (re->name()) {
+        t_->append("?P<");
+        t_->append(*re->name());
+        t_->append(">");
+      }
+      return PrecParen;
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+    case kRegexpRepeat:
+      if (outer < PrecUnary)
+        t_->append("(?:");
+      // The subprecedence here is PrecAtom instead of PrecUnary
+      // because PCRE treats two unary ops in a row as a parse error.
+      return PrecAtom;
+
+    // Leaves: nothing to open here.
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpCharClass:
+    case kRegexpHaveMatch:
+      break;
+  }
+
+  return PrecAtom;
+}
+
+// Appends the literal rune r to *t.
+//  - ASCII regexp metacharacters are backslash-escaped.
+//  - With foldcase set, a lowercase ASCII letter is written as a
+//    two-character class, upper case first (e.g. [Aa]).
+//  - Everything else is delegated to AppendCCRange(t, r, r).
+static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
+  const bool is_meta = r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r);
+  if (is_meta) {
+    t->append(1, '\\');
+    t->append(1, static_cast<char>(r));
+    return;
+  }
+
+  if (foldcase && 'a' <= r && r <= 'z') {
+    const char upper = static_cast<char>(r - ('a' - 'A'));
+    const char lower = static_cast<char>(r);
+    t->append(1, '[');
+    t->append(1, upper);
+    t->append(1, lower);
+    t->append(1, ']');
+    return;
+  }
+
+  AppendCCRange(t, r, r);
+}
+
+// Visits re after children are processed.
+// For childless regexps, all the work is done here.
+// For regexps with children, append any unary suffixes or ).
+//
+// parent_arg is the precedence context supplied by the parent; it decides
+// whether a "(?:" opened in PreVisit must be closed here and whether a
+// trailing "|" is appended for an enclosing alternation.  The remaining
+// arguments are unused.  Always returns 0.
+int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
+                              int* child_args, int nchild_args) {
+  int prec = parent_arg;
+  switch (re->op()) {
+    case kRegexpNoMatch:
+      // There's no simple symbol for "no match", but
+      // [^0-Runemax] excludes everything.
+      t_->append("[^\\x00-\\x{10ffff}]");
+      break;
+
+    case kRegexpEmptyMatch:
+      // Append (?:) to make empty string visible,
+      // unless this is already being parenthesized.
+      if (prec < PrecEmpty)
+        t_->append("(?:)");
+      break;
+
+    case kRegexpLiteral:
+      AppendLiteral(t_, re->rune(),
+                    (re->parse_flags() & Regexp::FoldCase) != 0);
+      break;
+
+    case kRegexpLiteralString:
+      for (int i = 0; i < re->nrunes(); i++)
+        AppendLiteral(t_, re->runes()[i],
+                      (re->parse_flags() & Regexp::FoldCase) != 0);
+      if (prec < PrecConcat)
+        t_->append(")");
+      break;
+
+    case kRegexpConcat:
+      if (prec < PrecConcat)
+        t_->append(")");
+      break;
+
+    case kRegexpAlternate:
+      // Clumsy but workable: the children all appended |
+      // at the end of their strings, so just remove the last one.
+      // Guard against an empty string before looking at its last char,
+      // and log the string itself (not the pointer) if the invariant
+      // is broken.
+      if (!t_->empty() && t_->back() == '|')
+        t_->pop_back();
+      else
+        LOG(DFATAL) << "Bad final char: " << *t_;
+      if (prec < PrecAlternate)
+        t_->append(")");
+      break;
+
+    case kRegexpStar:
+      t_->append("*");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpPlus:
+      t_->append("+");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpQuest:
+      t_->append("?");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpRepeat:
+      // max() == -1 is printed as an open-ended repeat, {min,}.
+      if (re->max() == -1)
+        t_->append(absl::StrFormat("{%d,}", re->min()));
+      else if (re->min() == re->max())
+        t_->append(absl::StrFormat("{%d}", re->min()));
+      else
+        t_->append(absl::StrFormat("{%d,%d}", re->min(), re->max()));
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpAnyChar:
+      t_->append(".");
+      break;
+
+    case kRegexpAnyByte:
+      t_->append("\\C");
+      break;
+
+    case kRegexpBeginLine:
+      t_->append("^");
+      break;
+
+    case kRegexpEndLine:
+      t_->append("$");
+      break;
+
+    case kRegexpBeginText:
+      t_->append("(?-m:^)");
+      break;
+
+    case kRegexpEndText:
+      if (re->parse_flags() & Regexp::WasDollar)
+        t_->append("(?-m:$)");
+      else
+        t_->append("\\z");
+      break;
+
+    case kRegexpWordBoundary:
+      t_->append("\\b");
+      break;
+
+    case kRegexpNoWordBoundary:
+      t_->append("\\B");
+      break;
+
+    case kRegexpCharClass: {
+      if (re->cc()->size() == 0) {
+        t_->append("[^\\x00-\\x{10ffff}]");
+        break;
+      }
+      t_->append("[");
+      // Heuristic: show class as negated if it contains the
+      // non-character 0xFFFE and yet somehow isn't full.
+      CharClass* cc = re->cc();
+      if (cc->Contains(0xFFFE) && !cc->full()) {
+        cc = cc->Negate();
+        t_->append("^");
+      }
+      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
+        AppendCCRange(t_, i->lo, i->hi);
+      // Only the negated copy made above is ours to release.
+      if (cc != re->cc())
+        cc->Delete();
+      t_->append("]");
+      break;
+    }
+
+    case kRegexpCapture:
+      t_->append(")");
+      break;
+
+    case kRegexpHaveMatch:
+      // There's no syntax accepted by the parser to generate
+      // this node (it is generated by RE2::Set) so make something
+      // up that is readable but won't compile.
+      t_->append(absl::StrFormat("(?HaveMatch:%d)", re->match_id()));
+      break;
+  }
+
+  // If the parent is an alternation, append the | for it.
+  if (prec == PrecAlternate)
+    t_->append("|");
+
+  return 0;
+}
+
+// Appends a rune for use in a character class to the string t.
+// Printable ASCII is written as is, with a backslash in front of the
+// characters [ ] ^ - and \.  \r, \t, \n and \f get their usual escapes.
+// Any other rune below 0x100 is written as \xHH and the rest as \x{H...}.
+static void AppendCCChar(std::string* t, Rune r) {
+  const bool printable = r >= 0x20 && r <= 0x7E;
+  if (printable) {
+    if (strchr("[]^-\\", r))
+      t->append("\\");
+    t->append(1, static_cast<char>(r));
+    return;
+  }
+
+  // Control characters with a conventional escape.
+  const char* escape = NULL;
+  if (r == '\r')
+    escape = "\\r";
+  else if (r == '\t')
+    escape = "\\t";
+  else if (r == '\n')
+    escape = "\\n";
+  else if (r == '\f')
+    escape = "\\f";
+  if (escape != NULL) {
+    t->append(escape);
+    return;
+  }
+
+  // Everything else is written in hex.
+  if (r < 0x100)
+    *t += absl::StrFormat("\\x%02x", static_cast<int>(r));
+  else
+    *t += absl::StrFormat("\\x{%x}", static_cast<int>(r));
+}
+
+// Appends the character-class range lo-hi to *t.  A single rune is
+// written when lo == hi, "lo-hi" when lo < hi, and nothing at all when
+// lo > hi.
+static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
+  if (lo > hi)
+    return;
+
+  AppendCCChar(t, lo);
+  if (lo == hi)
+    return;
+
+  t->append("-");
+  AppendCCChar(t, hi);
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/re2/unicode.py b/third_party/re2/src/re2/unicode.py
new file mode 100644
index 000000000..91734074a
--- /dev/null
+++ b/third_party/re2/src/re2/unicode.py
@@ -0,0 +1,303 @@
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Parser for Unicode data files (as distributed by unicode.org)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import urllib.request
+
+# Directory or URL where Unicode tables reside.
+_UNICODE_DIR = "https://www.unicode.org/Public/15.1.0/ucd"
+
+# Largest valid Unicode code value.
+_RUNE_MAX = 0x10FFFF
+
+
+class Error(Exception):
+ """Unicode error base class.
+
+ InputError in this module derives from it.
+ """
+
+
+class InputError(Error):
+ """Unicode input error class. Raised on invalid input.
+
+ Used in this module for malformed code points, ranges, field counts
+ and First/Last range pairs.
+ """
+
+
+def _UInt(s):
+ """Converts string to Unicode code point ('263A' => 0x263a).
+
+ Args:
+ s: string to convert
+
+ Returns:
+ Unicode code point
+
+ Raises:
+ InputError: the string is not a valid Unicode value.
+ """
+
+ try:
+ v = int(s, 16)
+ except ValueError:
+ v = -1
+ if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
+ raise InputError("invalid Unicode value %s" % (s,))
+ return v
+
+
+def _URange(s):
+ """Converts string to Unicode range.
+
+ '0001..0003' => [1, 2, 3].
+ '0001' => [1].
+
+ Args:
+ s: string to convert
+
+ Returns:
+ Unicode range
+
+ Raises:
+ InputError: the string is not a valid Unicode range.
+ """
+ a = s.split("..")
+ if len(a) == 1:
+ return [_UInt(a[0])]
+ if len(a) == 2:
+ lo = _UInt(a[0])
+ hi = _UInt(a[1])
+ if lo < hi:
+ return range(lo, hi + 1)
+ raise InputError("invalid Unicode range %s" % (s,))
+
+
+def _UStr(v):
+ """Converts Unicode code point to hex string.
+
+ 0x263a => '0x263A'.
+
+ Args:
+ v: code point to convert
+
+ Returns:
+ Unicode string
+
+ Raises:
+ InputError: the argument is not a valid Unicode value.
+ """
+ if v < 0 or v > _RUNE_MAX:
+ raise InputError("invalid Unicode value %s" % (v,))
+ return "0x%04X" % (v,)
+
+
+def _ParseContinue(s):
+ """Parses a Unicode continuation field.
+
+ These are of the form '<Name, First>' or '<Name, Last>'.
+ Instead of giving an explicit range in a single table entry,
+ some Unicode tables use two entries, one for the first
+ code value in the range and one for the last.
+ The first entry's description is '<Name, First>' instead of 'Name'
+ and the second is '<Name, Last>'.
+
+ '<Name, First>' => ('Name', 'First')
+ '<Name, Last>' => ('Name', 'Last')
+ 'Anything else' => ('Anything else', None)
+
+ Args:
+ s: continuation field string
+
+ Returns:
+ pair: name and ('First', 'Last', or None)
+ """
+
+ match = re.match("<(.*), (First|Last)>", s)
+ if match is not None:
+ return match.groups()
+ return (s, None)
+
+
+def ReadUnicodeTable(filename, nfields, doline):
+ """Generic Unicode table text file reader.
+
+ The reader takes care of stripping out comments and also
+ parsing the two different ways that the Unicode tables specify
+ code ranges (using the .. notation and splitting the range across
+ multiple lines).
+
+ Each non-comment line in the table is expected to have the given
+ number of fields. The first field is known to be the Unicode value
+ and the second field its description.
+
+ The reader calls doline(codes, fields) for each entry in the table.
+ If fn raises an exception, the reader prints that exception,
+ prefixed with the file name and line number, and continues
+ processing the file. When done with the file, the reader re-raises
+ the first exception encountered during the file.
+
+ Arguments:
+ filename: the Unicode data file to read, or a file-like object.
+ nfields: the number of expected fields per line in that file.
+ doline: the function to call for each table entry.
+
+ Raises:
+ InputError: nfields is invalid (must be >= 2).
+ """
+
+ if nfields < 2:
+ raise InputError("invalid number of fields %d" % (nfields,))
+
+ if type(filename) == str:
+ if filename.startswith("https://"):
+ fil = urllib.request.urlopen(filename)
+ else:
+ fil = open(filename, "rb")
+ else:
+ fil = filename
+
+ first = None # first code in multiline range
+ expect_last = None # tag expected for "Last" line in multiline range
+ lineno = 0 # current line number
+ for line in fil:
+ lineno += 1
+ try:
+ line = line.decode('latin1')
+
+ # Chop # comments and white space; ignore empty lines.
+ sharp = line.find("#")
+ if sharp >= 0:
+ line = line[:sharp]
+ line = line.strip()
+ if not line:
+ continue
+
+ # Split fields on ";", chop more white space.
+ # Must have the expected number of fields.
+ fields = [s.strip() for s in line.split(";")]
+ if len(fields) != nfields:
+ raise InputError("wrong number of fields %d %d - %s" %
+ (len(fields), nfields, line))
+
+ # The Unicode text files have two different ways
+ # to list a Unicode range. Either the first field is
+ # itself a range (0000..FFFF), or the range is split
+ # across two lines, with the second field noting
+ # the continuation.
+ codes = _URange(fields[0])
+ (name, cont) = _ParseContinue(fields[1])
+
+ if expect_last is not None:
+ # If the last line gave the First code in a range,
+ # this one had better give the Last one.
+ if (len(codes) != 1 or codes[0] <= first or
+ cont != "Last" or name != expect_last):
+ raise InputError("expected Last line for %s" %
+ (expect_last,))
+ codes = range(first, codes[0] + 1)
+ first = None
+ expect_last = None
+ fields[0] = "%04X..%04X" % (codes[0], codes[-1])
+ fields[1] = name
+ elif cont == "First":
+ # Otherwise, if this is the First code in a range,
+ # remember it and go to the next line.
+ if len(codes) != 1:
+ raise InputError("bad First line: range given")
+ expect_last = name
+ first = codes[0]
+ continue
+
+ doline(codes, fields)
+
+ except Exception as e:
+ print("%s:%d: %s" % (filename, lineno, e))
+ raise
+
+ if expect_last is not None:
+ raise InputError("expected Last line for %s; got EOF" %
+ (expect_last,))
+
+
+def CaseGroups(unicode_dir=_UNICODE_DIR):
+ """Returns list of Unicode code groups equivalent under case folding.
+
+ Each group is a sorted list of code points,
+ and the list of groups is sorted by first code point
+ in the group.
+
+ Args:
+ unicode_dir: Unicode data directory
+
+ Returns:
+ list of Unicode code groups
+ """
+
+ # Dict mapping lowercase code point to fold-equivalent group.
+ togroup = {}
+
+ def DoLine(codes, fields):
+ """Process single CaseFolding.txt line, updating togroup."""
+ (_, foldtype, lower, _) = fields
+ if foldtype not in ("C", "S"):
+ return
+ lower = _UInt(lower)
+ togroup.setdefault(lower, [lower]).extend(codes)
+
+ ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)
+
+ groups = list(togroup.values())
+ for g in groups:
+ g.sort()
+ groups.sort()
+ return togroup, groups
+
+
+def Scripts(unicode_dir=_UNICODE_DIR):
+ """Returns dict mapping script names to code lists.
+
+ Args:
+ unicode_dir: Unicode data directory
+
+ Returns:
+ dict mapping script names to code lists
+ """
+
+ scripts = {}
+
+ def DoLine(codes, fields):
+ """Process single Scripts.txt line, updating scripts."""
+ (_, name) = fields
+ scripts.setdefault(name, []).extend(codes)
+
+ ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
+ return scripts
+
+
+def Categories(unicode_dir=_UNICODE_DIR):
+ """Returns dict mapping category names to code lists.
+
+ Args:
+ unicode_dir: Unicode data directory
+
+ Returns:
+ dict mapping category names to code lists
+ """
+
+ categories = {}
+
+ def DoLine(codes, fields):
+ """Process single UnicodeData.txt line, updating categories."""
+ category = fields[2]
+ categories.setdefault(category, []).extend(codes)
+ # Add codes from Lu into L, etc.
+ if len(category) > 1:
+ short = category[0]
+ categories.setdefault(short, []).extend(codes)
+
+ ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
+ return categories
+
diff --git a/third_party/re2/src/re2/unicode_casefold.cc b/third_party/re2/src/re2/unicode_casefold.cc
new file mode 100644
index 000000000..297d0c8a4
--- /dev/null
+++ b/third_party/re2/src/re2/unicode_casefold.cc
@@ -0,0 +1,604 @@
+
+// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
+// make_unicode_casefold.py >unicode_casefold.cc
+
+#include "re2/unicode_casefold.h"
+
+namespace re2 {
+
+
+// 1427 groups, 2884 pairs, 372 ranges
+const CaseFold unicode_casefold[] = {
+ { 65, 90, 32 },
+ { 97, 106, -32 },
+ { 107, 107, 8383 },
+ { 108, 114, -32 },
+ { 115, 115, 268 },
+ { 116, 122, -32 },
+ { 181, 181, 743 },
+ { 192, 214, 32 },
+ { 216, 222, 32 },
+ { 223, 223, 7615 },
+ { 224, 228, -32 },
+ { 229, 229, 8262 },
+ { 230, 246, -32 },
+ { 248, 254, -32 },
+ { 255, 255, 121 },
+ { 256, 303, EvenOdd },
+ { 306, 311, EvenOdd },
+ { 313, 328, OddEven },
+ { 330, 375, EvenOdd },
+ { 376, 376, -121 },
+ { 377, 382, OddEven },
+ { 383, 383, -300 },
+ { 384, 384, 195 },
+ { 385, 385, 210 },
+ { 386, 389, EvenOdd },
+ { 390, 390, 206 },
+ { 391, 392, OddEven },
+ { 393, 394, 205 },
+ { 395, 396, OddEven },
+ { 398, 398, 79 },
+ { 399, 399, 202 },
+ { 400, 400, 203 },
+ { 401, 402, OddEven },
+ { 403, 403, 205 },
+ { 404, 404, 207 },
+ { 405, 405, 97 },
+ { 406, 406, 211 },
+ { 407, 407, 209 },
+ { 408, 409, EvenOdd },
+ { 410, 410, 163 },
+ { 412, 412, 211 },
+ { 413, 413, 213 },
+ { 414, 414, 130 },
+ { 415, 415, 214 },
+ { 416, 421, EvenOdd },
+ { 422, 422, 218 },
+ { 423, 424, OddEven },
+ { 425, 425, 218 },
+ { 428, 429, EvenOdd },
+ { 430, 430, 218 },
+ { 431, 432, OddEven },
+ { 433, 434, 217 },
+ { 435, 438, OddEven },
+ { 439, 439, 219 },
+ { 440, 441, EvenOdd },
+ { 444, 445, EvenOdd },
+ { 447, 447, 56 },
+ { 452, 452, EvenOdd },
+ { 453, 453, OddEven },
+ { 454, 454, -2 },
+ { 455, 455, OddEven },
+ { 456, 456, EvenOdd },
+ { 457, 457, -2 },
+ { 458, 458, EvenOdd },
+ { 459, 459, OddEven },
+ { 460, 460, -2 },
+ { 461, 476, OddEven },
+ { 477, 477, -79 },
+ { 478, 495, EvenOdd },
+ { 497, 497, OddEven },
+ { 498, 498, EvenOdd },
+ { 499, 499, -2 },
+ { 500, 501, EvenOdd },
+ { 502, 502, -97 },
+ { 503, 503, -56 },
+ { 504, 543, EvenOdd },
+ { 544, 544, -130 },
+ { 546, 563, EvenOdd },
+ { 570, 570, 10795 },
+ { 571, 572, OddEven },
+ { 573, 573, -163 },
+ { 574, 574, 10792 },
+ { 575, 576, 10815 },
+ { 577, 578, OddEven },
+ { 579, 579, -195 },
+ { 580, 580, 69 },
+ { 581, 581, 71 },
+ { 582, 591, EvenOdd },
+ { 592, 592, 10783 },
+ { 593, 593, 10780 },
+ { 594, 594, 10782 },
+ { 595, 595, -210 },
+ { 596, 596, -206 },
+ { 598, 599, -205 },
+ { 601, 601, -202 },
+ { 603, 603, -203 },
+ { 604, 604, 42319 },
+ { 608, 608, -205 },
+ { 609, 609, 42315 },
+ { 611, 611, -207 },
+ { 613, 613, 42280 },
+ { 614, 614, 42308 },
+ { 616, 616, -209 },
+ { 617, 617, -211 },
+ { 618, 618, 42308 },
+ { 619, 619, 10743 },
+ { 620, 620, 42305 },
+ { 623, 623, -211 },
+ { 625, 625, 10749 },
+ { 626, 626, -213 },
+ { 629, 629, -214 },
+ { 637, 637, 10727 },
+ { 640, 640, -218 },
+ { 642, 642, 42307 },
+ { 643, 643, -218 },
+ { 647, 647, 42282 },
+ { 648, 648, -218 },
+ { 649, 649, -69 },
+ { 650, 651, -217 },
+ { 652, 652, -71 },
+ { 658, 658, -219 },
+ { 669, 669, 42261 },
+ { 670, 670, 42258 },
+ { 837, 837, 84 },
+ { 880, 883, EvenOdd },
+ { 886, 887, EvenOdd },
+ { 891, 893, 130 },
+ { 895, 895, 116 },
+ { 902, 902, 38 },
+ { 904, 906, 37 },
+ { 908, 908, 64 },
+ { 910, 911, 63 },
+ { 912, 912, 7235 },
+ { 913, 929, 32 },
+ { 931, 931, 31 },
+ { 932, 939, 32 },
+ { 940, 940, -38 },
+ { 941, 943, -37 },
+ { 944, 944, 7219 },
+ { 945, 945, -32 },
+ { 946, 946, 30 },
+ { 947, 948, -32 },
+ { 949, 949, 64 },
+ { 950, 951, -32 },
+ { 952, 952, 25 },
+ { 953, 953, 7173 },
+ { 954, 954, 54 },
+ { 955, 955, -32 },
+ { 956, 956, -775 },
+ { 957, 959, -32 },
+ { 960, 960, 22 },
+ { 961, 961, 48 },
+ { 962, 962, EvenOdd },
+ { 963, 965, -32 },
+ { 966, 966, 15 },
+ { 967, 968, -32 },
+ { 969, 969, 7517 },
+ { 970, 971, -32 },
+ { 972, 972, -64 },
+ { 973, 974, -63 },
+ { 975, 975, 8 },
+ { 976, 976, -62 },
+ { 977, 977, 35 },
+ { 981, 981, -47 },
+ { 982, 982, -54 },
+ { 983, 983, -8 },
+ { 984, 1007, EvenOdd },
+ { 1008, 1008, -86 },
+ { 1009, 1009, -80 },
+ { 1010, 1010, 7 },
+ { 1011, 1011, -116 },
+ { 1012, 1012, -92 },
+ { 1013, 1013, -96 },
+ { 1015, 1016, OddEven },
+ { 1017, 1017, -7 },
+ { 1018, 1019, EvenOdd },
+ { 1021, 1023, -130 },
+ { 1024, 1039, 80 },
+ { 1040, 1071, 32 },
+ { 1072, 1073, -32 },
+ { 1074, 1074, 6222 },
+ { 1075, 1075, -32 },
+ { 1076, 1076, 6221 },
+ { 1077, 1085, -32 },
+ { 1086, 1086, 6212 },
+ { 1087, 1088, -32 },
+ { 1089, 1090, 6210 },
+ { 1091, 1097, -32 },
+ { 1098, 1098, 6204 },
+ { 1099, 1103, -32 },
+ { 1104, 1119, -80 },
+ { 1120, 1122, EvenOdd },
+ { 1123, 1123, 6180 },
+ { 1124, 1153, EvenOdd },
+ { 1162, 1215, EvenOdd },
+ { 1216, 1216, 15 },
+ { 1217, 1230, OddEven },
+ { 1231, 1231, -15 },
+ { 1232, 1327, EvenOdd },
+ { 1329, 1366, 48 },
+ { 1377, 1414, -48 },
+ { 4256, 4293, 7264 },
+ { 4295, 4295, 7264 },
+ { 4301, 4301, 7264 },
+ { 4304, 4346, 3008 },
+ { 4349, 4351, 3008 },
+ { 5024, 5103, 38864 },
+ { 5104, 5109, 8 },
+ { 5112, 5117, -8 },
+ { 7296, 7296, -6254 },
+ { 7297, 7297, -6253 },
+ { 7298, 7298, -6244 },
+ { 7299, 7299, -6242 },
+ { 7300, 7300, EvenOdd },
+ { 7301, 7301, -6243 },
+ { 7302, 7302, -6236 },
+ { 7303, 7303, -6181 },
+ { 7304, 7304, 35266 },
+ { 7312, 7354, -3008 },
+ { 7357, 7359, -3008 },
+ { 7545, 7545, 35332 },
+ { 7549, 7549, 3814 },
+ { 7566, 7566, 35384 },
+ { 7680, 7776, EvenOdd },
+ { 7777, 7777, 58 },
+ { 7778, 7829, EvenOdd },
+ { 7835, 7835, -59 },
+ { 7838, 7838, -7615 },
+ { 7840, 7935, EvenOdd },
+ { 7936, 7943, 8 },
+ { 7944, 7951, -8 },
+ { 7952, 7957, 8 },
+ { 7960, 7965, -8 },
+ { 7968, 7975, 8 },
+ { 7976, 7983, -8 },
+ { 7984, 7991, 8 },
+ { 7992, 7999, -8 },
+ { 8000, 8005, 8 },
+ { 8008, 8013, -8 },
+ { 8017, 8017, 8 },
+ { 8019, 8019, 8 },
+ { 8021, 8021, 8 },
+ { 8023, 8023, 8 },
+ { 8025, 8025, -8 },
+ { 8027, 8027, -8 },
+ { 8029, 8029, -8 },
+ { 8031, 8031, -8 },
+ { 8032, 8039, 8 },
+ { 8040, 8047, -8 },
+ { 8048, 8049, 74 },
+ { 8050, 8053, 86 },
+ { 8054, 8055, 100 },
+ { 8056, 8057, 128 },
+ { 8058, 8059, 112 },
+ { 8060, 8061, 126 },
+ { 8064, 8071, 8 },
+ { 8072, 8079, -8 },
+ { 8080, 8087, 8 },
+ { 8088, 8095, -8 },
+ { 8096, 8103, 8 },
+ { 8104, 8111, -8 },
+ { 8112, 8113, 8 },
+ { 8115, 8115, 9 },
+ { 8120, 8121, -8 },
+ { 8122, 8123, -74 },
+ { 8124, 8124, -9 },
+ { 8126, 8126, -7289 },
+ { 8131, 8131, 9 },
+ { 8136, 8139, -86 },
+ { 8140, 8140, -9 },
+ { 8144, 8145, 8 },
+ { 8147, 8147, -7235 },
+ { 8152, 8153, -8 },
+ { 8154, 8155, -100 },
+ { 8160, 8161, 8 },
+ { 8163, 8163, -7219 },
+ { 8165, 8165, 7 },
+ { 8168, 8169, -8 },
+ { 8170, 8171, -112 },
+ { 8172, 8172, -7 },
+ { 8179, 8179, 9 },
+ { 8184, 8185, -128 },
+ { 8186, 8187, -126 },
+ { 8188, 8188, -9 },
+ { 8486, 8486, -7549 },
+ { 8490, 8490, -8415 },
+ { 8491, 8491, -8294 },
+ { 8498, 8498, 28 },
+ { 8526, 8526, -28 },
+ { 8544, 8559, 16 },
+ { 8560, 8575, -16 },
+ { 8579, 8580, OddEven },
+ { 9398, 9423, 26 },
+ { 9424, 9449, -26 },
+ { 11264, 11311, 48 },
+ { 11312, 11359, -48 },
+ { 11360, 11361, EvenOdd },
+ { 11362, 11362, -10743 },
+ { 11363, 11363, -3814 },
+ { 11364, 11364, -10727 },
+ { 11365, 11365, -10795 },
+ { 11366, 11366, -10792 },
+ { 11367, 11372, OddEven },
+ { 11373, 11373, -10780 },
+ { 11374, 11374, -10749 },
+ { 11375, 11375, -10783 },
+ { 11376, 11376, -10782 },
+ { 11378, 11379, EvenOdd },
+ { 11381, 11382, OddEven },
+ { 11390, 11391, -10815 },
+ { 11392, 11491, EvenOdd },
+ { 11499, 11502, OddEven },
+ { 11506, 11507, EvenOdd },
+ { 11520, 11557, -7264 },
+ { 11559, 11559, -7264 },
+ { 11565, 11565, -7264 },
+ { 42560, 42570, EvenOdd },
+ { 42571, 42571, -35267 },
+ { 42572, 42605, EvenOdd },
+ { 42624, 42651, EvenOdd },
+ { 42786, 42799, EvenOdd },
+ { 42802, 42863, EvenOdd },
+ { 42873, 42876, OddEven },
+ { 42877, 42877, -35332 },
+ { 42878, 42887, EvenOdd },
+ { 42891, 42892, OddEven },
+ { 42893, 42893, -42280 },
+ { 42896, 42899, EvenOdd },
+ { 42900, 42900, 48 },
+ { 42902, 42921, EvenOdd },
+ { 42922, 42922, -42308 },
+ { 42923, 42923, -42319 },
+ { 42924, 42924, -42315 },
+ { 42925, 42925, -42305 },
+ { 42926, 42926, -42308 },
+ { 42928, 42928, -42258 },
+ { 42929, 42929, -42282 },
+ { 42930, 42930, -42261 },
+ { 42931, 42931, 928 },
+ { 42932, 42947, EvenOdd },
+ { 42948, 42948, -48 },
+ { 42949, 42949, -42307 },
+ { 42950, 42950, -35384 },
+ { 42951, 42954, OddEven },
+ { 42960, 42961, EvenOdd },
+ { 42966, 42969, EvenOdd },
+ { 42997, 42998, OddEven },
+ { 43859, 43859, -928 },
+ { 43888, 43967, -38864 },
+ { 64261, 64262, OddEven },
+ { 65313, 65338, 32 },
+ { 65345, 65370, -32 },
+ { 66560, 66599, 40 },
+ { 66600, 66639, -40 },
+ { 66736, 66771, 40 },
+ { 66776, 66811, -40 },
+ { 66928, 66938, 39 },
+ { 66940, 66954, 39 },
+ { 66956, 66962, 39 },
+ { 66964, 66965, 39 },
+ { 66967, 66977, -39 },
+ { 66979, 66993, -39 },
+ { 66995, 67001, -39 },
+ { 67003, 67004, -39 },
+ { 68736, 68786, 64 },
+ { 68800, 68850, -64 },
+ { 71840, 71871, 32 },
+ { 71872, 71903, -32 },
+ { 93760, 93791, 32 },
+ { 93792, 93823, -32 },
+ { 125184, 125217, 34 },
+ { 125218, 125251, -34 },
+};
+const int num_unicode_casefold = 372;
+
+// 1427 groups, 1457 pairs, 208 ranges
+const CaseFold unicode_tolower[] = {
+ { 65, 90, 32 },
+ { 181, 181, 775 },
+ { 192, 214, 32 },
+ { 216, 222, 32 },
+ { 256, 302, EvenOddSkip },
+ { 306, 310, EvenOddSkip },
+ { 313, 327, OddEvenSkip },
+ { 330, 374, EvenOddSkip },
+ { 376, 376, -121 },
+ { 377, 381, OddEvenSkip },
+ { 383, 383, -268 },
+ { 385, 385, 210 },
+ { 386, 388, EvenOddSkip },
+ { 390, 390, 206 },
+ { 391, 391, OddEven },
+ { 393, 394, 205 },
+ { 395, 395, OddEven },
+ { 398, 398, 79 },
+ { 399, 399, 202 },
+ { 400, 400, 203 },
+ { 401, 401, OddEven },
+ { 403, 403, 205 },
+ { 404, 404, 207 },
+ { 406, 406, 211 },
+ { 407, 407, 209 },
+ { 408, 408, EvenOdd },
+ { 412, 412, 211 },
+ { 413, 413, 213 },
+ { 415, 415, 214 },
+ { 416, 420, EvenOddSkip },
+ { 422, 422, 218 },
+ { 423, 423, OddEven },
+ { 425, 425, 218 },
+ { 428, 428, EvenOdd },
+ { 430, 430, 218 },
+ { 431, 431, OddEven },
+ { 433, 434, 217 },
+ { 435, 437, OddEvenSkip },
+ { 439, 439, 219 },
+ { 440, 440, EvenOdd },
+ { 444, 444, EvenOdd },
+ { 452, 452, 2 },
+ { 453, 453, OddEven },
+ { 455, 455, 2 },
+ { 456, 456, EvenOdd },
+ { 458, 458, 2 },
+ { 459, 475, OddEvenSkip },
+ { 478, 494, EvenOddSkip },
+ { 497, 497, 2 },
+ { 498, 500, EvenOddSkip },
+ { 502, 502, -97 },
+ { 503, 503, -56 },
+ { 504, 542, EvenOddSkip },
+ { 544, 544, -130 },
+ { 546, 562, EvenOddSkip },
+ { 570, 570, 10795 },
+ { 571, 571, OddEven },
+ { 573, 573, -163 },
+ { 574, 574, 10792 },
+ { 577, 577, OddEven },
+ { 579, 579, -195 },
+ { 580, 580, 69 },
+ { 581, 581, 71 },
+ { 582, 590, EvenOddSkip },
+ { 837, 837, 116 },
+ { 880, 882, EvenOddSkip },
+ { 886, 886, EvenOdd },
+ { 895, 895, 116 },
+ { 902, 902, 38 },
+ { 904, 906, 37 },
+ { 908, 908, 64 },
+ { 910, 911, 63 },
+ { 913, 929, 32 },
+ { 931, 939, 32 },
+ { 962, 962, EvenOdd },
+ { 975, 975, 8 },
+ { 976, 976, -30 },
+ { 977, 977, -25 },
+ { 981, 981, -15 },
+ { 982, 982, -22 },
+ { 984, 1006, EvenOddSkip },
+ { 1008, 1008, -54 },
+ { 1009, 1009, -48 },
+ { 1012, 1012, -60 },
+ { 1013, 1013, -64 },
+ { 1015, 1015, OddEven },
+ { 1017, 1017, -7 },
+ { 1018, 1018, EvenOdd },
+ { 1021, 1023, -130 },
+ { 1024, 1039, 80 },
+ { 1040, 1071, 32 },
+ { 1120, 1152, EvenOddSkip },
+ { 1162, 1214, EvenOddSkip },
+ { 1216, 1216, 15 },
+ { 1217, 1229, OddEvenSkip },
+ { 1232, 1326, EvenOddSkip },
+ { 1329, 1366, 48 },
+ { 4256, 4293, 7264 },
+ { 4295, 4295, 7264 },
+ { 4301, 4301, 7264 },
+ { 5112, 5117, -8 },
+ { 7296, 7296, -6222 },
+ { 7297, 7297, -6221 },
+ { 7298, 7298, -6212 },
+ { 7299, 7300, -6210 },
+ { 7301, 7301, -6211 },
+ { 7302, 7302, -6204 },
+ { 7303, 7303, -6180 },
+ { 7304, 7304, 35267 },
+ { 7312, 7354, -3008 },
+ { 7357, 7359, -3008 },
+ { 7680, 7828, EvenOddSkip },
+ { 7835, 7835, -58 },
+ { 7838, 7838, -7615 },
+ { 7840, 7934, EvenOddSkip },
+ { 7944, 7951, -8 },
+ { 7960, 7965, -8 },
+ { 7976, 7983, -8 },
+ { 7992, 7999, -8 },
+ { 8008, 8013, -8 },
+ { 8025, 8025, -8 },
+ { 8027, 8027, -8 },
+ { 8029, 8029, -8 },
+ { 8031, 8031, -8 },
+ { 8040, 8047, -8 },
+ { 8072, 8079, -8 },
+ { 8088, 8095, -8 },
+ { 8104, 8111, -8 },
+ { 8120, 8121, -8 },
+ { 8122, 8123, -74 },
+ { 8124, 8124, -9 },
+ { 8126, 8126, -7173 },
+ { 8136, 8139, -86 },
+ { 8140, 8140, -9 },
+ { 8147, 8147, -7235 },
+ { 8152, 8153, -8 },
+ { 8154, 8155, -100 },
+ { 8163, 8163, -7219 },
+ { 8168, 8169, -8 },
+ { 8170, 8171, -112 },
+ { 8172, 8172, -7 },
+ { 8184, 8185, -128 },
+ { 8186, 8187, -126 },
+ { 8188, 8188, -9 },
+ { 8486, 8486, -7517 },
+ { 8490, 8490, -8383 },
+ { 8491, 8491, -8262 },
+ { 8498, 8498, 28 },
+ { 8544, 8559, 16 },
+ { 8579, 8579, OddEven },
+ { 9398, 9423, 26 },
+ { 11264, 11311, 48 },
+ { 11360, 11360, EvenOdd },
+ { 11362, 11362, -10743 },
+ { 11363, 11363, -3814 },
+ { 11364, 11364, -10727 },
+ { 11367, 11371, OddEvenSkip },
+ { 11373, 11373, -10780 },
+ { 11374, 11374, -10749 },
+ { 11375, 11375, -10783 },
+ { 11376, 11376, -10782 },
+ { 11378, 11378, EvenOdd },
+ { 11381, 11381, OddEven },
+ { 11390, 11391, -10815 },
+ { 11392, 11490, EvenOddSkip },
+ { 11499, 11501, OddEvenSkip },
+ { 11506, 11506, EvenOdd },
+ { 42560, 42604, EvenOddSkip },
+ { 42624, 42650, EvenOddSkip },
+ { 42786, 42798, EvenOddSkip },
+ { 42802, 42862, EvenOddSkip },
+ { 42873, 42875, OddEvenSkip },
+ { 42877, 42877, -35332 },
+ { 42878, 42886, EvenOddSkip },
+ { 42891, 42891, OddEven },
+ { 42893, 42893, -42280 },
+ { 42896, 42898, EvenOddSkip },
+ { 42902, 42920, EvenOddSkip },
+ { 42922, 42922, -42308 },
+ { 42923, 42923, -42319 },
+ { 42924, 42924, -42315 },
+ { 42925, 42925, -42305 },
+ { 42926, 42926, -42308 },
+ { 42928, 42928, -42258 },
+ { 42929, 42929, -42282 },
+ { 42930, 42930, -42261 },
+ { 42931, 42931, 928 },
+ { 42932, 42946, EvenOddSkip },
+ { 42948, 42948, -48 },
+ { 42949, 42949, -42307 },
+ { 42950, 42950, -35384 },
+ { 42951, 42953, OddEvenSkip },
+ { 42960, 42960, EvenOdd },
+ { 42966, 42968, EvenOddSkip },
+ { 42997, 42997, OddEven },
+ { 43888, 43967, -38864 },
+ { 64261, 64261, OddEven },
+ { 65313, 65338, 32 },
+ { 66560, 66599, 40 },
+ { 66736, 66771, 40 },
+ { 66928, 66938, 39 },
+ { 66940, 66954, 39 },
+ { 66956, 66962, 39 },
+ { 66964, 66965, 39 },
+ { 68736, 68786, 64 },
+ { 71840, 71871, 32 },
+ { 93760, 93791, 32 },
+ { 125184, 125217, 34 },
+};
+const int num_unicode_tolower = 208;
+
+
+
+} // namespace re2
+
+
diff --git a/third_party/re2/src/re2/unicode_casefold.h b/third_party/re2/src/re2/unicode_casefold.h
new file mode 100644
index 000000000..4acad6800
--- /dev/null
+++ b/third_party/re2/src/re2/unicode_casefold.h
@@ -0,0 +1,77 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UNICODE_CASEFOLD_H_
+#define RE2_UNICODE_CASEFOLD_H_
+
+// Unicode case folding tables.
+
+// The Unicode case folding tables encode the mapping from one Unicode point
+// to the next largest Unicode point with equivalent folding. The largest
+// point wraps back to the first. For example, the tables map:
+//
+// 'A' -> 'a'
+// 'a' -> 'A'
+//
+// 'K' -> 'k'
+// 'k' -> 'K' (Kelvin symbol)
+// 'K' -> 'K'
+//
+// Like everything Unicode, these tables are big. If we represent the table
+// as a sorted list of uint32_t pairs, it has 2884 entries and is 23 kB.
+// Most table entries look like the ones around them:
+// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
+// Instead of listing all the pairs explicitly, we make a list of ranges
+// and deltas, so that the table entries for 'A' through 'Z' can be represented
+// as a single entry { 'A', 'Z', +32 }.
+//
+// In addition to blocks that map to each other (A-Z mapping to a-z)
+// there are blocks of pairs that individually map to each other
+// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
+// For those, the special delta value EvenOdd marks even/odd pairs
+// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
+//
+// In this form, the table has 372 entries, about 4.4kB. If we were to split
+// the table into one for 16-bit codes and an overflow table for larger ones,
+// we could roughly halve that, but that's not worth the complexity.
+//
+// The grouped form also allows for efficient fold range calculations
+// rather than looping one character at a time.
+
+#include <stdint.h>
+
+#include "util/utf.h"
+
+namespace re2 {
+
+enum {  // Special values for CaseFold::delta, used by the generated tables instead of a plain offset.
+ EvenOdd = 1,  // even/odd pairs: if the rune is even, add 1; if odd, subtract 1
+ OddEven = -1,  // odd/even pairs: the mirror image of EvenOdd
+ EvenOddSkip = 1<<30,  // the Skip forms appear in unicode_tolower; meaning presumably set by ApplyFold (TODO confirm)
+ OddEvenSkip,  // no initializer, so its value is (1<<30) + 1
+};
+
+struct CaseFold {  // One table entry: a range of runes sharing one fold rule.
+ Rune lo;  // first rune of the range
+ Rune hi;  // last rune of the range (inclusive, e.g. { 'A', 'Z', +32 })
+ int32_t delta;  // offset to the folded rune, or one of the EvenOdd-style markers
+};
+
+extern const CaseFold unicode_casefold[];
+extern const int num_unicode_casefold;
+
+extern const CaseFold unicode_tolower[];
+extern const int num_unicode_tolower;
+
+// Returns the CaseFold* in the tables that contains rune.
+// If rune is not in the tables, returns the first CaseFold* after rune.
+// If rune is larger than any value in the tables, returns NULL.
+extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
+
+// Returns the result of applying the fold f to the rune r.
+extern Rune ApplyFold(const CaseFold *f, Rune r);
+
+} // namespace re2
+
+#endif // RE2_UNICODE_CASEFOLD_H_
diff --git a/third_party/re2/src/re2/unicode_groups.cc b/third_party/re2/src/re2/unicode_groups.cc
new file mode 100644
index 000000000..b2a7ba666
--- /dev/null
+++ b/third_party/re2/src/re2/unicode_groups.cc
@@ -0,0 +1,6517 @@
+
+// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
+// make_unicode_groups.py >unicode_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+
+static const URange16 C_range16[] = {
+ { 0, 31 },
+ { 127, 159 },
+ { 173, 173 },
+ { 1536, 1541 },
+ { 1564, 1564 },
+ { 1757, 1757 },
+ { 1807, 1807 },
+ { 2192, 2193 },
+ { 2274, 2274 },
+ { 6158, 6158 },
+ { 8203, 8207 },
+ { 8234, 8238 },
+ { 8288, 8292 },
+ { 8294, 8303 },
+ { 55296, 63743 },
+ { 65279, 65279 },
+ { 65529, 65531 },
+};
+static const URange32 C_range32[] = {
+ { 69821, 69821 },
+ { 69837, 69837 },
+ { 78896, 78911 },
+ { 113824, 113827 },
+ { 119155, 119162 },
+ { 917505, 917505 },
+ { 917536, 917631 },
+ { 983040, 1048573 },
+ { 1048576, 1114109 },
+};
+static const URange16 Cc_range16[] = {
+ { 0, 31 },
+ { 127, 159 },
+};
+static const URange16 Cf_range16[] = {
+ { 173, 173 },
+ { 1536, 1541 },
+ { 1564, 1564 },
+ { 1757, 1757 },
+ { 1807, 1807 },
+ { 2192, 2193 },
+ { 2274, 2274 },
+ { 6158, 6158 },
+ { 8203, 8207 },
+ { 8234, 8238 },
+ { 8288, 8292 },
+ { 8294, 8303 },
+ { 65279, 65279 },
+ { 65529, 65531 },
+};
+static const URange32 Cf_range32[] = {
+ { 69821, 69821 },
+ { 69837, 69837 },
+ { 78896, 78911 },
+ { 113824, 113827 },
+ { 119155, 119162 },
+ { 917505, 917505 },
+ { 917536, 917631 },
+};
+static const URange16 Co_range16[] = {
+ { 57344, 63743 },
+};
+static const URange32 Co_range32[] = {
+ { 983040, 1048573 },
+ { 1048576, 1114109 },
+};
+static const URange16 Cs_range16[] = {
+ { 55296, 57343 },
+};
+static const URange16 L_range16[] = {
+ { 65, 90 },
+ { 97, 122 },
+ { 170, 170 },
+ { 181, 181 },
+ { 186, 186 },
+ { 192, 214 },
+ { 216, 246 },
+ { 248, 705 },
+ { 710, 721 },
+ { 736, 740 },
+ { 748, 748 },
+ { 750, 750 },
+ { 880, 884 },
+ { 886, 887 },
+ { 890, 893 },
+ { 895, 895 },
+ { 902, 902 },
+ { 904, 906 },
+ { 908, 908 },
+ { 910, 929 },
+ { 931, 1013 },
+ { 1015, 1153 },
+ { 1162, 1327 },
+ { 1329, 1366 },
+ { 1369, 1369 },
+ { 1376, 1416 },
+ { 1488, 1514 },
+ { 1519, 1522 },
+ { 1568, 1610 },
+ { 1646, 1647 },
+ { 1649, 1747 },
+ { 1749, 1749 },
+ { 1765, 1766 },
+ { 1774, 1775 },
+ { 1786, 1788 },
+ { 1791, 1791 },
+ { 1808, 1808 },
+ { 1810, 1839 },
+ { 1869, 1957 },
+ { 1969, 1969 },
+ { 1994, 2026 },
+ { 2036, 2037 },
+ { 2042, 2042 },
+ { 2048, 2069 },
+ { 2074, 2074 },
+ { 2084, 2084 },
+ { 2088, 2088 },
+ { 2112, 2136 },
+ { 2144, 2154 },
+ { 2160, 2183 },
+ { 2185, 2190 },
+ { 2208, 2249 },
+ { 2308, 2361 },
+ { 2365, 2365 },
+ { 2384, 2384 },
+ { 2392, 2401 },
+ { 2417, 2432 },
+ { 2437, 2444 },
+ { 2447, 2448 },
+ { 2451, 2472 },
+ { 2474, 2480 },
+ { 2482, 2482 },
+ { 2486, 2489 },
+ { 2493, 2493 },
+ { 2510, 2510 },
+ { 2524, 2525 },
+ { 2527, 2529 },
+ { 2544, 2545 },
+ { 2556, 2556 },
+ { 2565, 2570 },
+ { 2575, 2576 },
+ { 2579, 2600 },
+ { 2602, 2608 },
+ { 2610, 2611 },
+ { 2613, 2614 },
+ { 2616, 2617 },
+ { 2649, 2652 },
+ { 2654, 2654 },
+ { 2674, 2676 },
+ { 2693, 2701 },
+ { 2703, 2705 },
+ { 2707, 2728 },
+ { 2730, 2736 },
+ { 2738, 2739 },
+ { 2741, 2745 },
+ { 2749, 2749 },
+ { 2768, 2768 },
+ { 2784, 2785 },
+ { 2809, 2809 },
+ { 2821, 2828 },
+ { 2831, 2832 },
+ { 2835, 2856 },
+ { 2858, 2864 },
+ { 2866, 2867 },
+ { 2869, 2873 },
+ { 2877, 2877 },
+ { 2908, 2909 },
+ { 2911, 2913 },
+ { 2929, 2929 },
+ { 2947, 2947 },
+ { 2949, 2954 },
+ { 2958, 2960 },
+ { 2962, 2965 },
+ { 2969, 2970 },
+ { 2972, 2972 },
+ { 2974, 2975 },
+ { 2979, 2980 },
+ { 2984, 2986 },
+ { 2990, 3001 },
+ { 3024, 3024 },
+ { 3077, 3084 },
+ { 3086, 3088 },
+ { 3090, 3112 },
+ { 3114, 3129 },
+ { 3133, 3133 },
+ { 3160, 3162 },
+ { 3165, 3165 },
+ { 3168, 3169 },
+ { 3200, 3200 },
+ { 3205, 3212 },
+ { 3214, 3216 },
+ { 3218, 3240 },
+ { 3242, 3251 },
+ { 3253, 3257 },
+ { 3261, 3261 },
+ { 3293, 3294 },
+ { 3296, 3297 },
+ { 3313, 3314 },
+ { 3332, 3340 },
+ { 3342, 3344 },
+ { 3346, 3386 },
+ { 3389, 3389 },
+ { 3406, 3406 },
+ { 3412, 3414 },
+ { 3423, 3425 },
+ { 3450, 3455 },
+ { 3461, 3478 },
+ { 3482, 3505 },
+ { 3507, 3515 },
+ { 3517, 3517 },
+ { 3520, 3526 },
+ { 3585, 3632 },
+ { 3634, 3635 },
+ { 3648, 3654 },
+ { 3713, 3714 },
+ { 3716, 3716 },
+ { 3718, 3722 },
+ { 3724, 3747 },
+ { 3749, 3749 },
+ { 3751, 3760 },
+ { 3762, 3763 },
+ { 3773, 3773 },
+ { 3776, 3780 },
+ { 3782, 3782 },
+ { 3804, 3807 },
+ { 3840, 3840 },
+ { 3904, 3911 },
+ { 3913, 3948 },
+ { 3976, 3980 },
+ { 4096, 4138 },
+ { 4159, 4159 },
+ { 4176, 4181 },
+ { 4186, 4189 },
+ { 4193, 4193 },
+ { 4197, 4198 },
+ { 4206, 4208 },
+ { 4213, 4225 },
+ { 4238, 4238 },
+ { 4256, 4293 },
+ { 4295, 4295 },
+ { 4301, 4301 },
+ { 4304, 4346 },
+ { 4348, 4680 },
+ { 4682, 4685 },
+ { 4688, 4694 },
+ { 4696, 4696 },
+ { 4698, 4701 },
+ { 4704, 4744 },
+ { 4746, 4749 },
+ { 4752, 4784 },
+ { 4786, 4789 },
+ { 4792, 4798 },
+ { 4800, 4800 },
+ { 4802, 4805 },
+ { 4808, 4822 },
+ { 4824, 4880 },
+ { 4882, 4885 },
+ { 4888, 4954 },
+ { 4992, 5007 },
+ { 5024, 5109 },
+ { 5112, 5117 },
+ { 5121, 5740 },
+ { 5743, 5759 },
+ { 5761, 5786 },
+ { 5792, 5866 },
+ { 5873, 5880 },
+ { 5888, 5905 },
+ { 5919, 5937 },
+ { 5952, 5969 },
+ { 5984, 5996 },
+ { 5998, 6000 },
+ { 6016, 6067 },
+ { 6103, 6103 },
+ { 6108, 6108 },
+ { 6176, 6264 },
+ { 6272, 6276 },
+ { 6279, 6312 },
+ { 6314, 6314 },
+ { 6320, 6389 },
+ { 6400, 6430 },
+ { 6480, 6509 },
+ { 6512, 6516 },
+ { 6528, 6571 },
+ { 6576, 6601 },
+ { 6656, 6678 },
+ { 6688, 6740 },
+ { 6823, 6823 },
+ { 6917, 6963 },
+ { 6981, 6988 },
+ { 7043, 7072 },
+ { 7086, 7087 },
+ { 7098, 7141 },
+ { 7168, 7203 },
+ { 7245, 7247 },
+ { 7258, 7293 },
+ { 7296, 7304 },
+ { 7312, 7354 },
+ { 7357, 7359 },
+ { 7401, 7404 },
+ { 7406, 7411 },
+ { 7413, 7414 },
+ { 7418, 7418 },
+ { 7424, 7615 },
+ { 7680, 7957 },
+ { 7960, 7965 },
+ { 7968, 8005 },
+ { 8008, 8013 },
+ { 8016, 8023 },
+ { 8025, 8025 },
+ { 8027, 8027 },
+ { 8029, 8029 },
+ { 8031, 8061 },
+ { 8064, 8116 },
+ { 8118, 8124 },
+ { 8126, 8126 },
+ { 8130, 8132 },
+ { 8134, 8140 },
+ { 8144, 8147 },
+ { 8150, 8155 },
+ { 8160, 8172 },
+ { 8178, 8180 },
+ { 8182, 8188 },
+ { 8305, 8305 },
+ { 8319, 8319 },
+ { 8336, 8348 },
+ { 8450, 8450 },
+ { 8455, 8455 },
+ { 8458, 8467 },
+ { 8469, 8469 },
+ { 8473, 8477 },
+ { 8484, 8484 },
+ { 8486, 8486 },
+ { 8488, 8488 },
+ { 8490, 8493 },
+ { 8495, 8505 },
+ { 8508, 8511 },
+ { 8517, 8521 },
+ { 8526, 8526 },
+ { 8579, 8580 },
+ { 11264, 11492 },
+ { 11499, 11502 },
+ { 11506, 11507 },
+ { 11520, 11557 },
+ { 11559, 11559 },
+ { 11565, 11565 },
+ { 11568, 11623 },
+ { 11631, 11631 },
+ { 11648, 11670 },
+ { 11680, 11686 },
+ { 11688, 11694 },
+ { 11696, 11702 },
+ { 11704, 11710 },
+ { 11712, 11718 },
+ { 11720, 11726 },
+ { 11728, 11734 },
+ { 11736, 11742 },
+ { 11823, 11823 },
+ { 12293, 12294 },
+ { 12337, 12341 },
+ { 12347, 12348 },
+ { 12353, 12438 },
+ { 12445, 12447 },
+ { 12449, 12538 },
+ { 12540, 12543 },
+ { 12549, 12591 },
+ { 12593, 12686 },
+ { 12704, 12735 },
+ { 12784, 12799 },
+ { 13312, 19903 },
+ { 19968, 42124 },
+ { 42192, 42237 },
+ { 42240, 42508 },
+ { 42512, 42527 },
+ { 42538, 42539 },
+ { 42560, 42606 },
+ { 42623, 42653 },
+ { 42656, 42725 },
+ { 42775, 42783 },
+ { 42786, 42888 },
+ { 42891, 42954 },
+ { 42960, 42961 },
+ { 42963, 42963 },
+ { 42965, 42969 },
+ { 42994, 43009 },
+ { 43011, 43013 },
+ { 43015, 43018 },
+ { 43020, 43042 },
+ { 43072, 43123 },
+ { 43138, 43187 },
+ { 43250, 43255 },
+ { 43259, 43259 },
+ { 43261, 43262 },
+ { 43274, 43301 },
+ { 43312, 43334 },
+ { 43360, 43388 },
+ { 43396, 43442 },
+ { 43471, 43471 },
+ { 43488, 43492 },
+ { 43494, 43503 },
+ { 43514, 43518 },
+ { 43520, 43560 },
+ { 43584, 43586 },
+ { 43588, 43595 },
+ { 43616, 43638 },
+ { 43642, 43642 },
+ { 43646, 43695 },
+ { 43697, 43697 },
+ { 43701, 43702 },
+ { 43705, 43709 },
+ { 43712, 43712 },
+ { 43714, 43714 },
+ { 43739, 43741 },
+ { 43744, 43754 },
+ { 43762, 43764 },
+ { 43777, 43782 },
+ { 43785, 43790 },
+ { 43793, 43798 },
+ { 43808, 43814 },
+ { 43816, 43822 },
+ { 43824, 43866 },
+ { 43868, 43881 },
+ { 43888, 44002 },
+ { 44032, 55203 },
+ { 55216, 55238 },
+ { 55243, 55291 },
+ { 63744, 64109 },
+ { 64112, 64217 },
+ { 64256, 64262 },
+ { 64275, 64279 },
+ { 64285, 64285 },
+ { 64287, 64296 },
+ { 64298, 64310 },
+ { 64312, 64316 },
+ { 64318, 64318 },
+ { 64320, 64321 },
+ { 64323, 64324 },
+ { 64326, 64433 },
+ { 64467, 64829 },
+ { 64848, 64911 },
+ { 64914, 64967 },
+ { 65008, 65019 },
+ { 65136, 65140 },
+ { 65142, 65276 },
+ { 65313, 65338 },
+ { 65345, 65370 },
+ { 65382, 65470 },
+ { 65474, 65479 },
+ { 65482, 65487 },
+ { 65490, 65495 },
+ { 65498, 65500 },
+};
+static const URange32 L_range32[] = {
+ { 65536, 65547 },
+ { 65549, 65574 },
+ { 65576, 65594 },
+ { 65596, 65597 },
+ { 65599, 65613 },
+ { 65616, 65629 },
+ { 65664, 65786 },
+ { 66176, 66204 },
+ { 66208, 66256 },
+ { 66304, 66335 },
+ { 66349, 66368 },
+ { 66370, 66377 },
+ { 66384, 66421 },
+ { 66432, 66461 },
+ { 66464, 66499 },
+ { 66504, 66511 },
+ { 66560, 66717 },
+ { 66736, 66771 },
+ { 66776, 66811 },
+ { 66816, 66855 },
+ { 66864, 66915 },
+ { 66928, 66938 },
+ { 66940, 66954 },
+ { 66956, 66962 },
+ { 66964, 66965 },
+ { 66967, 66977 },
+ { 66979, 66993 },
+ { 66995, 67001 },
+ { 67003, 67004 },
+ { 67072, 67382 },
+ { 67392, 67413 },
+ { 67424, 67431 },
+ { 67456, 67461 },
+ { 67463, 67504 },
+ { 67506, 67514 },
+ { 67584, 67589 },
+ { 67592, 67592 },
+ { 67594, 67637 },
+ { 67639, 67640 },
+ { 67644, 67644 },
+ { 67647, 67669 },
+ { 67680, 67702 },
+ { 67712, 67742 },
+ { 67808, 67826 },
+ { 67828, 67829 },
+ { 67840, 67861 },
+ { 67872, 67897 },
+ { 67968, 68023 },
+ { 68030, 68031 },
+ { 68096, 68096 },
+ { 68112, 68115 },
+ { 68117, 68119 },
+ { 68121, 68149 },
+ { 68192, 68220 },
+ { 68224, 68252 },
+ { 68288, 68295 },
+ { 68297, 68324 },
+ { 68352, 68405 },
+ { 68416, 68437 },
+ { 68448, 68466 },
+ { 68480, 68497 },
+ { 68608, 68680 },
+ { 68736, 68786 },
+ { 68800, 68850 },
+ { 68864, 68899 },
+ { 69248, 69289 },
+ { 69296, 69297 },
+ { 69376, 69404 },
+ { 69415, 69415 },
+ { 69424, 69445 },
+ { 69488, 69505 },
+ { 69552, 69572 },
+ { 69600, 69622 },
+ { 69635, 69687 },
+ { 69745, 69746 },
+ { 69749, 69749 },
+ { 69763, 69807 },
+ { 69840, 69864 },
+ { 69891, 69926 },
+ { 69956, 69956 },
+ { 69959, 69959 },
+ { 69968, 70002 },
+ { 70006, 70006 },
+ { 70019, 70066 },
+ { 70081, 70084 },
+ { 70106, 70106 },
+ { 70108, 70108 },
+ { 70144, 70161 },
+ { 70163, 70187 },
+ { 70207, 70208 },
+ { 70272, 70278 },
+ { 70280, 70280 },
+ { 70282, 70285 },
+ { 70287, 70301 },
+ { 70303, 70312 },
+ { 70320, 70366 },
+ { 70405, 70412 },
+ { 70415, 70416 },
+ { 70419, 70440 },
+ { 70442, 70448 },
+ { 70450, 70451 },
+ { 70453, 70457 },
+ { 70461, 70461 },
+ { 70480, 70480 },
+ { 70493, 70497 },
+ { 70656, 70708 },
+ { 70727, 70730 },
+ { 70751, 70753 },
+ { 70784, 70831 },
+ { 70852, 70853 },
+ { 70855, 70855 },
+ { 71040, 71086 },
+ { 71128, 71131 },
+ { 71168, 71215 },
+ { 71236, 71236 },
+ { 71296, 71338 },
+ { 71352, 71352 },
+ { 71424, 71450 },
+ { 71488, 71494 },
+ { 71680, 71723 },
+ { 71840, 71903 },
+ { 71935, 71942 },
+ { 71945, 71945 },
+ { 71948, 71955 },
+ { 71957, 71958 },
+ { 71960, 71983 },
+ { 71999, 71999 },
+ { 72001, 72001 },
+ { 72096, 72103 },
+ { 72106, 72144 },
+ { 72161, 72161 },
+ { 72163, 72163 },
+ { 72192, 72192 },
+ { 72203, 72242 },
+ { 72250, 72250 },
+ { 72272, 72272 },
+ { 72284, 72329 },
+ { 72349, 72349 },
+ { 72368, 72440 },
+ { 72704, 72712 },
+ { 72714, 72750 },
+ { 72768, 72768 },
+ { 72818, 72847 },
+ { 72960, 72966 },
+ { 72968, 72969 },
+ { 72971, 73008 },
+ { 73030, 73030 },
+ { 73056, 73061 },
+ { 73063, 73064 },
+ { 73066, 73097 },
+ { 73112, 73112 },
+ { 73440, 73458 },
+ { 73474, 73474 },
+ { 73476, 73488 },
+ { 73490, 73523 },
+ { 73648, 73648 },
+ { 73728, 74649 },
+ { 74880, 75075 },
+ { 77712, 77808 },
+ { 77824, 78895 },
+ { 78913, 78918 },
+ { 82944, 83526 },
+ { 92160, 92728 },
+ { 92736, 92766 },
+ { 92784, 92862 },
+ { 92880, 92909 },
+ { 92928, 92975 },
+ { 92992, 92995 },
+ { 93027, 93047 },
+ { 93053, 93071 },
+ { 93760, 93823 },
+ { 93952, 94026 },
+ { 94032, 94032 },
+ { 94099, 94111 },
+ { 94176, 94177 },
+ { 94179, 94179 },
+ { 94208, 100343 },
+ { 100352, 101589 },
+ { 101632, 101640 },
+ { 110576, 110579 },
+ { 110581, 110587 },
+ { 110589, 110590 },
+ { 110592, 110882 },
+ { 110898, 110898 },
+ { 110928, 110930 },
+ { 110933, 110933 },
+ { 110948, 110951 },
+ { 110960, 111355 },
+ { 113664, 113770 },
+ { 113776, 113788 },
+ { 113792, 113800 },
+ { 113808, 113817 },
+ { 119808, 119892 },
+ { 119894, 119964 },
+ { 119966, 119967 },
+ { 119970, 119970 },
+ { 119973, 119974 },
+ { 119977, 119980 },
+ { 119982, 119993 },
+ { 119995, 119995 },
+ { 119997, 120003 },
+ { 120005, 120069 },
+ { 120071, 120074 },
+ { 120077, 120084 },
+ { 120086, 120092 },
+ { 120094, 120121 },
+ { 120123, 120126 },
+ { 120128, 120132 },
+ { 120134, 120134 },
+ { 120138, 120144 },
+ { 120146, 120485 },
+ { 120488, 120512 },
+ { 120514, 120538 },
+ { 120540, 120570 },
+ { 120572, 120596 },
+ { 120598, 120628 },
+ { 120630, 120654 },
+ { 120656, 120686 },
+ { 120688, 120712 },
+ { 120714, 120744 },
+ { 120746, 120770 },
+ { 120772, 120779 },
+ { 122624, 122654 },
+ { 122661, 122666 },
+ { 122928, 122989 },
+ { 123136, 123180 },
+ { 123191, 123197 },
+ { 123214, 123214 },
+ { 123536, 123565 },
+ { 123584, 123627 },
+ { 124112, 124139 },
+ { 124896, 124902 },
+ { 124904, 124907 },
+ { 124909, 124910 },
+ { 124912, 124926 },
+ { 124928, 125124 },
+ { 125184, 125251 },
+ { 125259, 125259 },
+ { 126464, 126467 },
+ { 126469, 126495 },
+ { 126497, 126498 },
+ { 126500, 126500 },
+ { 126503, 126503 },
+ { 126505, 126514 },
+ { 126516, 126519 },
+ { 126521, 126521 },
+ { 126523, 126523 },
+ { 126530, 126530 },
+ { 126535, 126535 },
+ { 126537, 126537 },
+ { 126539, 126539 },
+ { 126541, 126543 },
+ { 126545, 126546 },
+ { 126548, 126548 },
+ { 126551, 126551 },
+ { 126553, 126553 },
+ { 126555, 126555 },
+ { 126557, 126557 },
+ { 126559, 126559 },
+ { 126561, 126562 },
+ { 126564, 126564 },
+ { 126567, 126570 },
+ { 126572, 126578 },
+ { 126580, 126583 },
+ { 126585, 126588 },
+ { 126590, 126590 },
+ { 126592, 126601 },
+ { 126603, 126619 },
+ { 126625, 126627 },
+ { 126629, 126633 },
+ { 126635, 126651 },
+ { 131072, 173791 },
+ { 173824, 177977 },
+ { 177984, 178205 },
+ { 178208, 183969 },
+ { 183984, 191456 },
+ { 191472, 192093 },
+ { 194560, 195101 },
+ { 196608, 201546 },
+ { 201552, 205743 },
+};
+static const URange16 Ll_range16[] = {
+ { 97, 122 },
+ { 181, 181 },
+ { 223, 246 },
+ { 248, 255 },
+ { 257, 257 },
+ { 259, 259 },
+ { 261, 261 },
+ { 263, 263 },
+ { 265, 265 },
+ { 267, 267 },
+ { 269, 269 },
+ { 271, 271 },
+ { 273, 273 },
+ { 275, 275 },
+ { 277, 277 },
+ { 279, 279 },
+ { 281, 281 },
+ { 283, 283 },
+ { 285, 285 },
+ { 287, 287 },
+ { 289, 289 },
+ { 291, 291 },
+ { 293, 293 },
+ { 295, 295 },
+ { 297, 297 },
+ { 299, 299 },
+ { 301, 301 },
+ { 303, 303 },
+ { 305, 305 },
+ { 307, 307 },
+ { 309, 309 },
+ { 311, 312 },
+ { 314, 314 },
+ { 316, 316 },
+ { 318, 318 },
+ { 320, 320 },
+ { 322, 322 },
+ { 324, 324 },
+ { 326, 326 },
+ { 328, 329 },
+ { 331, 331 },
+ { 333, 333 },
+ { 335, 335 },
+ { 337, 337 },
+ { 339, 339 },
+ { 341, 341 },
+ { 343, 343 },
+ { 345, 345 },
+ { 347, 347 },
+ { 349, 349 },
+ { 351, 351 },
+ { 353, 353 },
+ { 355, 355 },
+ { 357, 357 },
+ { 359, 359 },
+ { 361, 361 },
+ { 363, 363 },
+ { 365, 365 },
+ { 367, 367 },
+ { 369, 369 },
+ { 371, 371 },
+ { 373, 373 },
+ { 375, 375 },
+ { 378, 378 },
+ { 380, 380 },
+ { 382, 384 },
+ { 387, 387 },
+ { 389, 389 },
+ { 392, 392 },
+ { 396, 397 },
+ { 402, 402 },
+ { 405, 405 },
+ { 409, 411 },
+ { 414, 414 },
+ { 417, 417 },
+ { 419, 419 },
+ { 421, 421 },
+ { 424, 424 },
+ { 426, 427 },
+ { 429, 429 },
+ { 432, 432 },
+ { 436, 436 },
+ { 438, 438 },
+ { 441, 442 },
+ { 445, 447 },
+ { 454, 454 },
+ { 457, 457 },
+ { 460, 460 },
+ { 462, 462 },
+ { 464, 464 },
+ { 466, 466 },
+ { 468, 468 },
+ { 470, 470 },
+ { 472, 472 },
+ { 474, 474 },
+ { 476, 477 },
+ { 479, 479 },
+ { 481, 481 },
+ { 483, 483 },
+ { 485, 485 },
+ { 487, 487 },
+ { 489, 489 },
+ { 491, 491 },
+ { 493, 493 },
+ { 495, 496 },
+ { 499, 499 },
+ { 501, 501 },
+ { 505, 505 },
+ { 507, 507 },
+ { 509, 509 },
+ { 511, 511 },
+ { 513, 513 },
+ { 515, 515 },
+ { 517, 517 },
+ { 519, 519 },
+ { 521, 521 },
+ { 523, 523 },
+ { 525, 525 },
+ { 527, 527 },
+ { 529, 529 },
+ { 531, 531 },
+ { 533, 533 },
+ { 535, 535 },
+ { 537, 537 },
+ { 539, 539 },
+ { 541, 541 },
+ { 543, 543 },
+ { 545, 545 },
+ { 547, 547 },
+ { 549, 549 },
+ { 551, 551 },
+ { 553, 553 },
+ { 555, 555 },
+ { 557, 557 },
+ { 559, 559 },
+ { 561, 561 },
+ { 563, 569 },
+ { 572, 572 },
+ { 575, 576 },
+ { 578, 578 },
+ { 583, 583 },
+ { 585, 585 },
+ { 587, 587 },
+ { 589, 589 },
+ { 591, 659 },
+ { 661, 687 },
+ { 881, 881 },
+ { 883, 883 },
+ { 887, 887 },
+ { 891, 893 },
+ { 912, 912 },
+ { 940, 974 },
+ { 976, 977 },
+ { 981, 983 },
+ { 985, 985 },
+ { 987, 987 },
+ { 989, 989 },
+ { 991, 991 },
+ { 993, 993 },
+ { 995, 995 },
+ { 997, 997 },
+ { 999, 999 },
+ { 1001, 1001 },
+ { 1003, 1003 },
+ { 1005, 1005 },
+ { 1007, 1011 },
+ { 1013, 1013 },
+ { 1016, 1016 },
+ { 1019, 1020 },
+ { 1072, 1119 },
+ { 1121, 1121 },
+ { 1123, 1123 },
+ { 1125, 1125 },
+ { 1127, 1127 },
+ { 1129, 1129 },
+ { 1131, 1131 },
+ { 1133, 1133 },
+ { 1135, 1135 },
+ { 1137, 1137 },
+ { 1139, 1139 },
+ { 1141, 1141 },
+ { 1143, 1143 },
+ { 1145, 1145 },
+ { 1147, 1147 },
+ { 1149, 1149 },
+ { 1151, 1151 },
+ { 1153, 1153 },
+ { 1163, 1163 },
+ { 1165, 1165 },
+ { 1167, 1167 },
+ { 1169, 1169 },
+ { 1171, 1171 },
+ { 1173, 1173 },
+ { 1175, 1175 },
+ { 1177, 1177 },
+ { 1179, 1179 },
+ { 1181, 1181 },
+ { 1183, 1183 },
+ { 1185, 1185 },
+ { 1187, 1187 },
+ { 1189, 1189 },
+ { 1191, 1191 },
+ { 1193, 1193 },
+ { 1195, 1195 },
+ { 1197, 1197 },
+ { 1199, 1199 },
+ { 1201, 1201 },
+ { 1203, 1203 },
+ { 1205, 1205 },
+ { 1207, 1207 },
+ { 1209, 1209 },
+ { 1211, 1211 },
+ { 1213, 1213 },
+ { 1215, 1215 },
+ { 1218, 1218 },
+ { 1220, 1220 },
+ { 1222, 1222 },
+ { 1224, 1224 },
+ { 1226, 1226 },
+ { 1228, 1228 },
+ { 1230, 1231 },
+ { 1233, 1233 },
+ { 1235, 1235 },
+ { 1237, 1237 },
+ { 1239, 1239 },
+ { 1241, 1241 },
+ { 1243, 1243 },
+ { 1245, 1245 },
+ { 1247, 1247 },
+ { 1249, 1249 },
+ { 1251, 1251 },
+ { 1253, 1253 },
+ { 1255, 1255 },
+ { 1257, 1257 },
+ { 1259, 1259 },
+ { 1261, 1261 },
+ { 1263, 1263 },
+ { 1265, 1265 },
+ { 1267, 1267 },
+ { 1269, 1269 },
+ { 1271, 1271 },
+ { 1273, 1273 },
+ { 1275, 1275 },
+ { 1277, 1277 },
+ { 1279, 1279 },
+ { 1281, 1281 },
+ { 1283, 1283 },
+ { 1285, 1285 },
+ { 1287, 1287 },
+ { 1289, 1289 },
+ { 1291, 1291 },
+ { 1293, 1293 },
+ { 1295, 1295 },
+ { 1297, 1297 },
+ { 1299, 1299 },
+ { 1301, 1301 },
+ { 1303, 1303 },
+ { 1305, 1305 },
+ { 1307, 1307 },
+ { 1309, 1309 },
+ { 1311, 1311 },
+ { 1313, 1313 },
+ { 1315, 1315 },
+ { 1317, 1317 },
+ { 1319, 1319 },
+ { 1321, 1321 },
+ { 1323, 1323 },
+ { 1325, 1325 },
+ { 1327, 1327 },
+ { 1376, 1416 },
+ { 4304, 4346 },
+ { 4349, 4351 },
+ { 5112, 5117 },
+ { 7296, 7304 },
+ { 7424, 7467 },
+ { 7531, 7543 },
+ { 7545, 7578 },
+ { 7681, 7681 },
+ { 7683, 7683 },
+ { 7685, 7685 },
+ { 7687, 7687 },
+ { 7689, 7689 },
+ { 7691, 7691 },
+ { 7693, 7693 },
+ { 7695, 7695 },
+ { 7697, 7697 },
+ { 7699, 7699 },
+ { 7701, 7701 },
+ { 7703, 7703 },
+ { 7705, 7705 },
+ { 7707, 7707 },
+ { 7709, 7709 },
+ { 7711, 7711 },
+ { 7713, 7713 },
+ { 7715, 7715 },
+ { 7717, 7717 },
+ { 7719, 7719 },
+ { 7721, 7721 },
+ { 7723, 7723 },
+ { 7725, 7725 },
+ { 7727, 7727 },
+ { 7729, 7729 },
+ { 7731, 7731 },
+ { 7733, 7733 },
+ { 7735, 7735 },
+ { 7737, 7737 },
+ { 7739, 7739 },
+ { 7741, 7741 },
+ { 7743, 7743 },
+ { 7745, 7745 },
+ { 7747, 7747 },
+ { 7749, 7749 },
+ { 7751, 7751 },
+ { 7753, 7753 },
+ { 7755, 7755 },
+ { 7757, 7757 },
+ { 7759, 7759 },
+ { 7761, 7761 },
+ { 7763, 7763 },
+ { 7765, 7765 },
+ { 7767, 7767 },
+ { 7769, 7769 },
+ { 7771, 7771 },
+ { 7773, 7773 },
+ { 7775, 7775 },
+ { 7777, 7777 },
+ { 7779, 7779 },
+ { 7781, 7781 },
+ { 7783, 7783 },
+ { 7785, 7785 },
+ { 7787, 7787 },
+ { 7789, 7789 },
+ { 7791, 7791 },
+ { 7793, 7793 },
+ { 7795, 7795 },
+ { 7797, 7797 },
+ { 7799, 7799 },
+ { 7801, 7801 },
+ { 7803, 7803 },
+ { 7805, 7805 },
+ { 7807, 7807 },
+ { 7809, 7809 },
+ { 7811, 7811 },
+ { 7813, 7813 },
+ { 7815, 7815 },
+ { 7817, 7817 },
+ { 7819, 7819 },
+ { 7821, 7821 },
+ { 7823, 7823 },
+ { 7825, 7825 },
+ { 7827, 7827 },
+ { 7829, 7837 },
+ { 7839, 7839 },
+ { 7841, 7841 },
+ { 7843, 7843 },
+ { 7845, 7845 },
+ { 7847, 7847 },
+ { 7849, 7849 },
+ { 7851, 7851 },
+ { 7853, 7853 },
+ { 7855, 7855 },
+ { 7857, 7857 },
+ { 7859, 7859 },
+ { 7861, 7861 },
+ { 7863, 7863 },
+ { 7865, 7865 },
+ { 7867, 7867 },
+ { 7869, 7869 },
+ { 7871, 7871 },
+ { 7873, 7873 },
+ { 7875, 7875 },
+ { 7877, 7877 },
+ { 7879, 7879 },
+ { 7881, 7881 },
+ { 7883, 7883 },
+ { 7885, 7885 },
+ { 7887, 7887 },
+ { 7889, 7889 },
+ { 7891, 7891 },
+ { 7893, 7893 },
+ { 7895, 7895 },
+ { 7897, 7897 },
+ { 7899, 7899 },
+ { 7901, 7901 },
+ { 7903, 7903 },
+ { 7905, 7905 },
+ { 7907, 7907 },
+ { 7909, 7909 },
+ { 7911, 7911 },
+ { 7913, 7913 },
+ { 7915, 7915 },
+ { 7917, 7917 },
+ { 7919, 7919 },
+ { 7921, 7921 },
+ { 7923, 7923 },
+ { 7925, 7925 },
+ { 7927, 7927 },
+ { 7929, 7929 },
+ { 7931, 7931 },
+ { 7933, 7933 },
+ { 7935, 7943 },
+ { 7952, 7957 },
+ { 7968, 7975 },
+ { 7984, 7991 },
+ { 8000, 8005 },
+ { 8016, 8023 },
+ { 8032, 8039 },
+ { 8048, 8061 },
+ { 8064, 8071 },
+ { 8080, 8087 },
+ { 8096, 8103 },
+ { 8112, 8116 },
+ { 8118, 8119 },
+ { 8126, 8126 },
+ { 8130, 8132 },
+ { 8134, 8135 },
+ { 8144, 8147 },
+ { 8150, 8151 },
+ { 8160, 8167 },
+ { 8178, 8180 },
+ { 8182, 8183 },
+ { 8458, 8458 },
+ { 8462, 8463 },
+ { 8467, 8467 },
+ { 8495, 8495 },
+ { 8500, 8500 },
+ { 8505, 8505 },
+ { 8508, 8509 },
+ { 8518, 8521 },
+ { 8526, 8526 },
+ { 8580, 8580 },
+ { 11312, 11359 },
+ { 11361, 11361 },
+ { 11365, 11366 },
+ { 11368, 11368 },
+ { 11370, 11370 },
+ { 11372, 11372 },
+ { 11377, 11377 },
+ { 11379, 11380 },
+ { 11382, 11387 },
+ { 11393, 11393 },
+ { 11395, 11395 },
+ { 11397, 11397 },
+ { 11399, 11399 },
+ { 11401, 11401 },
+ { 11403, 11403 },
+ { 11405, 11405 },
+ { 11407, 11407 },
+ { 11409, 11409 },
+ { 11411, 11411 },
+ { 11413, 11413 },
+ { 11415, 11415 },
+ { 11417, 11417 },
+ { 11419, 11419 },
+ { 11421, 11421 },
+ { 11423, 11423 },
+ { 11425, 11425 },
+ { 11427, 11427 },
+ { 11429, 11429 },
+ { 11431, 11431 },
+ { 11433, 11433 },
+ { 11435, 11435 },
+ { 11437, 11437 },
+ { 11439, 11439 },
+ { 11441, 11441 },
+ { 11443, 11443 },
+ { 11445, 11445 },
+ { 11447, 11447 },
+ { 11449, 11449 },
+ { 11451, 11451 },
+ { 11453, 11453 },
+ { 11455, 11455 },
+ { 11457, 11457 },
+ { 11459, 11459 },
+ { 11461, 11461 },
+ { 11463, 11463 },
+ { 11465, 11465 },
+ { 11467, 11467 },
+ { 11469, 11469 },
+ { 11471, 11471 },
+ { 11473, 11473 },
+ { 11475, 11475 },
+ { 11477, 11477 },
+ { 11479, 11479 },
+ { 11481, 11481 },
+ { 11483, 11483 },
+ { 11485, 11485 },
+ { 11487, 11487 },
+ { 11489, 11489 },
+ { 11491, 11492 },
+ { 11500, 11500 },
+ { 11502, 11502 },
+ { 11507, 11507 },
+ { 11520, 11557 },
+ { 11559, 11559 },
+ { 11565, 11565 },
+ { 42561, 42561 },
+ { 42563, 42563 },
+ { 42565, 42565 },
+ { 42567, 42567 },
+ { 42569, 42569 },
+ { 42571, 42571 },
+ { 42573, 42573 },
+ { 42575, 42575 },
+ { 42577, 42577 },
+ { 42579, 42579 },
+ { 42581, 42581 },
+ { 42583, 42583 },
+ { 42585, 42585 },
+ { 42587, 42587 },
+ { 42589, 42589 },
+ { 42591, 42591 },
+ { 42593, 42593 },
+ { 42595, 42595 },
+ { 42597, 42597 },
+ { 42599, 42599 },
+ { 42601, 42601 },
+ { 42603, 42603 },
+ { 42605, 42605 },
+ { 42625, 42625 },
+ { 42627, 42627 },
+ { 42629, 42629 },
+ { 42631, 42631 },
+ { 42633, 42633 },
+ { 42635, 42635 },
+ { 42637, 42637 },
+ { 42639, 42639 },
+ { 42641, 42641 },
+ { 42643, 42643 },
+ { 42645, 42645 },
+ { 42647, 42647 },
+ { 42649, 42649 },
+ { 42651, 42651 },
+ { 42787, 42787 },
+ { 42789, 42789 },
+ { 42791, 42791 },
+ { 42793, 42793 },
+ { 42795, 42795 },
+ { 42797, 42797 },
+ { 42799, 42801 },
+ { 42803, 42803 },
+ { 42805, 42805 },
+ { 42807, 42807 },
+ { 42809, 42809 },
+ { 42811, 42811 },
+ { 42813, 42813 },
+ { 42815, 42815 },
+ { 42817, 42817 },
+ { 42819, 42819 },
+ { 42821, 42821 },
+ { 42823, 42823 },
+ { 42825, 42825 },
+ { 42827, 42827 },
+ { 42829, 42829 },
+ { 42831, 42831 },
+ { 42833, 42833 },
+ { 42835, 42835 },
+ { 42837, 42837 },
+ { 42839, 42839 },
+ { 42841, 42841 },
+ { 42843, 42843 },
+ { 42845, 42845 },
+ { 42847, 42847 },
+ { 42849, 42849 },
+ { 42851, 42851 },
+ { 42853, 42853 },
+ { 42855, 42855 },
+ { 42857, 42857 },
+ { 42859, 42859 },
+ { 42861, 42861 },
+ { 42863, 42863 },
+ { 42865, 42872 },
+ { 42874, 42874 },
+ { 42876, 42876 },
+ { 42879, 42879 },
+ { 42881, 42881 },
+ { 42883, 42883 },
+ { 42885, 42885 },
+ { 42887, 42887 },
+ { 42892, 42892 },
+ { 42894, 42894 },
+ { 42897, 42897 },
+ { 42899, 42901 },
+ { 42903, 42903 },
+ { 42905, 42905 },
+ { 42907, 42907 },
+ { 42909, 42909 },
+ { 42911, 42911 },
+ { 42913, 42913 },
+ { 42915, 42915 },
+ { 42917, 42917 },
+ { 42919, 42919 },
+ { 42921, 42921 },
+ { 42927, 42927 },
+ { 42933, 42933 },
+ { 42935, 42935 },
+ { 42937, 42937 },
+ { 42939, 42939 },
+ { 42941, 42941 },
+ { 42943, 42943 },
+ { 42945, 42945 },
+ { 42947, 42947 },
+ { 42952, 42952 },
+ { 42954, 42954 },
+ { 42961, 42961 },
+ { 42963, 42963 },
+ { 42965, 42965 },
+ { 42967, 42967 },
+ { 42969, 42969 },
+ { 42998, 42998 },
+ { 43002, 43002 },
+ { 43824, 43866 },
+ { 43872, 43880 },
+ { 43888, 43967 },
+ { 64256, 64262 },
+ { 64275, 64279 },
+ { 65345, 65370 },
+};
+static const URange32 Ll_range32[] = {
+ { 66600, 66639 },
+ { 66776, 66811 },
+ { 66967, 66977 },
+ { 66979, 66993 },
+ { 66995, 67001 },
+ { 67003, 67004 },
+ { 68800, 68850 },
+ { 71872, 71903 },
+ { 93792, 93823 },
+ { 119834, 119859 },
+ { 119886, 119892 },
+ { 119894, 119911 },
+ { 119938, 119963 },
+ { 119990, 119993 },
+ { 119995, 119995 },
+ { 119997, 120003 },
+ { 120005, 120015 },
+ { 120042, 120067 },
+ { 120094, 120119 },
+ { 120146, 120171 },
+ { 120198, 120223 },
+ { 120250, 120275 },
+ { 120302, 120327 },
+ { 120354, 120379 },
+ { 120406, 120431 },
+ { 120458, 120485 },
+ { 120514, 120538 },
+ { 120540, 120545 },
+ { 120572, 120596 },
+ { 120598, 120603 },
+ { 120630, 120654 },
+ { 120656, 120661 },
+ { 120688, 120712 },
+ { 120714, 120719 },
+ { 120746, 120770 },
+ { 120772, 120777 },
+ { 120779, 120779 },
+ { 122624, 122633 },
+ { 122635, 122654 },
+ { 122661, 122666 },
+ { 125218, 125251 },
+};
+static const URange16 Lm_range16[] = {
+ { 688, 705 },
+ { 710, 721 },
+ { 736, 740 },
+ { 748, 748 },
+ { 750, 750 },
+ { 884, 884 },
+ { 890, 890 },
+ { 1369, 1369 },
+ { 1600, 1600 },
+ { 1765, 1766 },
+ { 2036, 2037 },
+ { 2042, 2042 },
+ { 2074, 2074 },
+ { 2084, 2084 },
+ { 2088, 2088 },
+ { 2249, 2249 },
+ { 2417, 2417 },
+ { 3654, 3654 },
+ { 3782, 3782 },
+ { 4348, 4348 },
+ { 6103, 6103 },
+ { 6211, 6211 },
+ { 6823, 6823 },
+ { 7288, 7293 },
+ { 7468, 7530 },
+ { 7544, 7544 },
+ { 7579, 7615 },
+ { 8305, 8305 },
+ { 8319, 8319 },
+ { 8336, 8348 },
+ { 11388, 11389 },
+ { 11631, 11631 },
+ { 11823, 11823 },
+ { 12293, 12293 },
+ { 12337, 12341 },
+ { 12347, 12347 },
+ { 12445, 12446 },
+ { 12540, 12542 },
+ { 40981, 40981 },
+ { 42232, 42237 },
+ { 42508, 42508 },
+ { 42623, 42623 },
+ { 42652, 42653 },
+ { 42775, 42783 },
+ { 42864, 42864 },
+ { 42888, 42888 },
+ { 42994, 42996 },
+ { 43000, 43001 },
+ { 43471, 43471 },
+ { 43494, 43494 },
+ { 43632, 43632 },
+ { 43741, 43741 },
+ { 43763, 43764 },
+ { 43868, 43871 },
+ { 43881, 43881 },
+ { 65392, 65392 },
+ { 65438, 65439 },
+};
+static const URange32 Lm_range32[] = {
+ { 67456, 67461 },
+ { 67463, 67504 },
+ { 67506, 67514 },
+ { 92992, 92995 },
+ { 94099, 94111 },
+ { 94176, 94177 },
+ { 94179, 94179 },
+ { 110576, 110579 },
+ { 110581, 110587 },
+ { 110589, 110590 },
+ { 122928, 122989 },
+ { 123191, 123197 },
+ { 124139, 124139 },
+ { 125259, 125259 },
+};
+static const URange16 Lo_range16[] = {
+ { 170, 170 },
+ { 186, 186 },
+ { 443, 443 },
+ { 448, 451 },
+ { 660, 660 },
+ { 1488, 1514 },
+ { 1519, 1522 },
+ { 1568, 1599 },
+ { 1601, 1610 },
+ { 1646, 1647 },
+ { 1649, 1747 },
+ { 1749, 1749 },
+ { 1774, 1775 },
+ { 1786, 1788 },
+ { 1791, 1791 },
+ { 1808, 1808 },
+ { 1810, 1839 },
+ { 1869, 1957 },
+ { 1969, 1969 },
+ { 1994, 2026 },
+ { 2048, 2069 },
+ { 2112, 2136 },
+ { 2144, 2154 },
+ { 2160, 2183 },
+ { 2185, 2190 },
+ { 2208, 2248 },
+ { 2308, 2361 },
+ { 2365, 2365 },
+ { 2384, 2384 },
+ { 2392, 2401 },
+ { 2418, 2432 },
+ { 2437, 2444 },
+ { 2447, 2448 },
+ { 2451, 2472 },
+ { 2474, 2480 },
+ { 2482, 2482 },
+ { 2486, 2489 },
+ { 2493, 2493 },
+ { 2510, 2510 },
+ { 2524, 2525 },
+ { 2527, 2529 },
+ { 2544, 2545 },
+ { 2556, 2556 },
+ { 2565, 2570 },
+ { 2575, 2576 },
+ { 2579, 2600 },
+ { 2602, 2608 },
+ { 2610, 2611 },
+ { 2613, 2614 },
+ { 2616, 2617 },
+ { 2649, 2652 },
+ { 2654, 2654 },
+ { 2674, 2676 },
+ { 2693, 2701 },
+ { 2703, 2705 },
+ { 2707, 2728 },
+ { 2730, 2736 },
+ { 2738, 2739 },
+ { 2741, 2745 },
+ { 2749, 2749 },
+ { 2768, 2768 },
+ { 2784, 2785 },
+ { 2809, 2809 },
+ { 2821, 2828 },
+ { 2831, 2832 },
+ { 2835, 2856 },
+ { 2858, 2864 },
+ { 2866, 2867 },
+ { 2869, 2873 },
+ { 2877, 2877 },
+ { 2908, 2909 },
+ { 2911, 2913 },
+ { 2929, 2929 },
+ { 2947, 2947 },
+ { 2949, 2954 },
+ { 2958, 2960 },
+ { 2962, 2965 },
+ { 2969, 2970 },
+ { 2972, 2972 },
+ { 2974, 2975 },
+ { 2979, 2980 },
+ { 2984, 2986 },
+ { 2990, 3001 },
+ { 3024, 3024 },
+ { 3077, 3084 },
+ { 3086, 3088 },
+ { 3090, 3112 },
+ { 3114, 3129 },
+ { 3133, 3133 },
+ { 3160, 3162 },
+ { 3165, 3165 },
+ { 3168, 3169 },
+ { 3200, 3200 },
+ { 3205, 3212 },
+ { 3214, 3216 },
+ { 3218, 3240 },
+ { 3242, 3251 },
+ { 3253, 3257 },
+ { 3261, 3261 },
+ { 3293, 3294 },
+ { 3296, 3297 },
+ { 3313, 3314 },
+ { 3332, 3340 },
+ { 3342, 3344 },
+ { 3346, 3386 },
+ { 3389, 3389 },
+ { 3406, 3406 },
+ { 3412, 3414 },
+ { 3423, 3425 },
+ { 3450, 3455 },
+ { 3461, 3478 },
+ { 3482, 3505 },
+ { 3507, 3515 },
+ { 3517, 3517 },
+ { 3520, 3526 },
+ { 3585, 3632 },
+ { 3634, 3635 },
+ { 3648, 3653 },
+ { 3713, 3714 },
+ { 3716, 3716 },
+ { 3718, 3722 },
+ { 3724, 3747 },
+ { 3749, 3749 },
+ { 3751, 3760 },
+ { 3762, 3763 },
+ { 3773, 3773 },
+ { 3776, 3780 },
+ { 3804, 3807 },
+ { 3840, 3840 },
+ { 3904, 3911 },
+ { 3913, 3948 },
+ { 3976, 3980 },
+ { 4096, 4138 },
+ { 4159, 4159 },
+ { 4176, 4181 },
+ { 4186, 4189 },
+ { 4193, 4193 },
+ { 4197, 4198 },
+ { 4206, 4208 },
+ { 4213, 4225 },
+ { 4238, 4238 },
+ { 4352, 4680 },
+ { 4682, 4685 },
+ { 4688, 4694 },
+ { 4696, 4696 },
+ { 4698, 4701 },
+ { 4704, 4744 },
+ { 4746, 4749 },
+ { 4752, 4784 },
+ { 4786, 4789 },
+ { 4792, 4798 },
+ { 4800, 4800 },
+ { 4802, 4805 },
+ { 4808, 4822 },
+ { 4824, 4880 },
+ { 4882, 4885 },
+ { 4888, 4954 },
+ { 4992, 5007 },
+ { 5121, 5740 },
+ { 5743, 5759 },
+ { 5761, 5786 },
+ { 5792, 5866 },
+ { 5873, 5880 },
+ { 5888, 5905 },
+ { 5919, 5937 },
+ { 5952, 5969 },
+ { 5984, 5996 },
+ { 5998, 6000 },
+ { 6016, 6067 },
+ { 6108, 6108 },
+ { 6176, 6210 },
+ { 6212, 6264 },
+ { 6272, 6276 },
+ { 6279, 6312 },
+ { 6314, 6314 },
+ { 6320, 6389 },
+ { 6400, 6430 },
+ { 6480, 6509 },
+ { 6512, 6516 },
+ { 6528, 6571 },
+ { 6576, 6601 },
+ { 6656, 6678 },
+ { 6688, 6740 },
+ { 6917, 6963 },
+ { 6981, 6988 },
+ { 7043, 7072 },
+ { 7086, 7087 },
+ { 7098, 7141 },
+ { 7168, 7203 },
+ { 7245, 7247 },
+ { 7258, 7287 },
+ { 7401, 7404 },
+ { 7406, 7411 },
+ { 7413, 7414 },
+ { 7418, 7418 },
+ { 8501, 8504 },
+ { 11568, 11623 },
+ { 11648, 11670 },
+ { 11680, 11686 },
+ { 11688, 11694 },
+ { 11696, 11702 },
+ { 11704, 11710 },
+ { 11712, 11718 },
+ { 11720, 11726 },
+ { 11728, 11734 },
+ { 11736, 11742 },
+ { 12294, 12294 },
+ { 12348, 12348 },
+ { 12353, 12438 },
+ { 12447, 12447 },
+ { 12449, 12538 },
+ { 12543, 12543 },
+ { 12549, 12591 },
+ { 12593, 12686 },
+ { 12704, 12735 },
+ { 12784, 12799 },
+ { 13312, 19903 },
+ { 19968, 40980 },
+ { 40982, 42124 },
+ { 42192, 42231 },
+ { 42240, 42507 },
+ { 42512, 42527 },
+ { 42538, 42539 },
+ { 42606, 42606 },
+ { 42656, 42725 },
+ { 42895, 42895 },
+ { 42999, 42999 },
+ { 43003, 43009 },
+ { 43011, 43013 },
+ { 43015, 43018 },
+ { 43020, 43042 },
+ { 43072, 43123 },
+ { 43138, 43187 },
+ { 43250, 43255 },
+ { 43259, 43259 },
+ { 43261, 43262 },
+ { 43274, 43301 },
+ { 43312, 43334 },
+ { 43360, 43388 },
+ { 43396, 43442 },
+ { 43488, 43492 },
+ { 43495, 43503 },
+ { 43514, 43518 },
+ { 43520, 43560 },
+ { 43584, 43586 },
+ { 43588, 43595 },
+ { 43616, 43631 },
+ { 43633, 43638 },
+ { 43642, 43642 },
+ { 43646, 43695 },
+ { 43697, 43697 },
+ { 43701, 43702 },
+ { 43705, 43709 },
+ { 43712, 43712 },
+ { 43714, 43714 },
+ { 43739, 43740 },
+ { 43744, 43754 },
+ { 43762, 43762 },
+ { 43777, 43782 },
+ { 43785, 43790 },
+ { 43793, 43798 },
+ { 43808, 43814 },
+ { 43816, 43822 },
+ { 43968, 44002 },
+ { 44032, 55203 },
+ { 55216, 55238 },
+ { 55243, 55291 },
+ { 63744, 64109 },
+ { 64112, 64217 },
+ { 64285, 64285 },
+ { 64287, 64296 },
+ { 64298, 64310 },
+ { 64312, 64316 },
+ { 64318, 64318 },
+ { 64320, 64321 },
+ { 64323, 64324 },
+ { 64326, 64433 },
+ { 64467, 64829 },
+ { 64848, 64911 },
+ { 64914, 64967 },
+ { 65008, 65019 },
+ { 65136, 65140 },
+ { 65142, 65276 },
+ { 65382, 65391 },
+ { 65393, 65437 },
+ { 65440, 65470 },
+ { 65474, 65479 },
+ { 65482, 65487 },
+ { 65490, 65495 },
+ { 65498, 65500 },
+};
+static const URange32 Lo_range32[] = {
+ { 65536, 65547 },
+ { 65549, 65574 },
+ { 65576, 65594 },
+ { 65596, 65597 },
+ { 65599, 65613 },
+ { 65616, 65629 },
+ { 65664, 65786 },
+ { 66176, 66204 },
+ { 66208, 66256 },
+ { 66304, 66335 },
+ { 66349, 66368 },
+ { 66370, 66377 },
+ { 66384, 66421 },
+ { 66432, 66461 },
+ { 66464, 66499 },
+ { 66504, 66511 },
+ { 66640, 66717 },
+ { 66816, 66855 },
+ { 66864, 66915 },
+ { 67072, 67382 },
+ { 67392, 67413 },
+ { 67424, 67431 },
+ { 67584, 67589 },
+ { 67592, 67592 },
+ { 67594, 67637 },
+ { 67639, 67640 },
+ { 67644, 67644 },
+ { 67647, 67669 },
+ { 67680, 67702 },
+ { 67712, 67742 },
+ { 67808, 67826 },
+ { 67828, 67829 },
+ { 67840, 67861 },
+ { 67872, 67897 },
+ { 67968, 68023 },
+ { 68030, 68031 },
+ { 68096, 68096 },
+ { 68112, 68115 },
+ { 68117, 68119 },
+ { 68121, 68149 },
+ { 68192, 68220 },
+ { 68224, 68252 },
+ { 68288, 68295 },
+ { 68297, 68324 },
+ { 68352, 68405 },
+ { 68416, 68437 },
+ { 68448, 68466 },
+ { 68480, 68497 },
+ { 68608, 68680 },
+ { 68864, 68899 },
+ { 69248, 69289 },
+ { 69296, 69297 },
+ { 69376, 69404 },
+ { 69415, 69415 },
+ { 69424, 69445 },
+ { 69488, 69505 },
+ { 69552, 69572 },
+ { 69600, 69622 },
+ { 69635, 69687 },
+ { 69745, 69746 },
+ { 69749, 69749 },
+ { 69763, 69807 },
+ { 69840, 69864 },
+ { 69891, 69926 },
+ { 69956, 69956 },
+ { 69959, 69959 },
+ { 69968, 70002 },
+ { 70006, 70006 },
+ { 70019, 70066 },
+ { 70081, 70084 },
+ { 70106, 70106 },
+ { 70108, 70108 },
+ { 70144, 70161 },
+ { 70163, 70187 },
+ { 70207, 70208 },
+ { 70272, 70278 },
+ { 70280, 70280 },
+ { 70282, 70285 },
+ { 70287, 70301 },
+ { 70303, 70312 },
+ { 70320, 70366 },
+ { 70405, 70412 },
+ { 70415, 70416 },
+ { 70419, 70440 },
+ { 70442, 70448 },
+ { 70450, 70451 },
+ { 70453, 70457 },
+ { 70461, 70461 },
+ { 70480, 70480 },
+ { 70493, 70497 },
+ { 70656, 70708 },
+ { 70727, 70730 },
+ { 70751, 70753 },
+ { 70784, 70831 },
+ { 70852, 70853 },
+ { 70855, 70855 },
+ { 71040, 71086 },
+ { 71128, 71131 },
+ { 71168, 71215 },
+ { 71236, 71236 },
+ { 71296, 71338 },
+ { 71352, 71352 },
+ { 71424, 71450 },
+ { 71488, 71494 },
+ { 71680, 71723 },
+ { 71935, 71942 },
+ { 71945, 71945 },
+ { 71948, 71955 },
+ { 71957, 71958 },
+ { 71960, 71983 },
+ { 71999, 71999 },
+ { 72001, 72001 },
+ { 72096, 72103 },
+ { 72106, 72144 },
+ { 72161, 72161 },
+ { 72163, 72163 },
+ { 72192, 72192 },
+ { 72203, 72242 },
+ { 72250, 72250 },
+ { 72272, 72272 },
+ { 72284, 72329 },
+ { 72349, 72349 },
+ { 72368, 72440 },
+ { 72704, 72712 },
+ { 72714, 72750 },
+ { 72768, 72768 },
+ { 72818, 72847 },
+ { 72960, 72966 },
+ { 72968, 72969 },
+ { 72971, 73008 },
+ { 73030, 73030 },
+ { 73056, 73061 },
+ { 73063, 73064 },
+ { 73066, 73097 },
+ { 73112, 73112 },
+ { 73440, 73458 },
+ { 73474, 73474 },
+ { 73476, 73488 },
+ { 73490, 73523 },
+ { 73648, 73648 },
+ { 73728, 74649 },
+ { 74880, 75075 },
+ { 77712, 77808 },
+ { 77824, 78895 },
+ { 78913, 78918 },
+ { 82944, 83526 },
+ { 92160, 92728 },
+ { 92736, 92766 },
+ { 92784, 92862 },
+ { 92880, 92909 },
+ { 92928, 92975 },
+ { 93027, 93047 },
+ { 93053, 93071 },
+ { 93952, 94026 },
+ { 94032, 94032 },
+ { 94208, 100343 },
+ { 100352, 101589 },
+ { 101632, 101640 },
+ { 110592, 110882 },
+ { 110898, 110898 },
+ { 110928, 110930 },
+ { 110933, 110933 },
+ { 110948, 110951 },
+ { 110960, 111355 },
+ { 113664, 113770 },
+ { 113776, 113788 },
+ { 113792, 113800 },
+ { 113808, 113817 },
+ { 122634, 122634 },
+ { 123136, 123180 },
+ { 123214, 123214 },
+ { 123536, 123565 },
+ { 123584, 123627 },
+ { 124112, 124138 },
+ { 124896, 124902 },
+ { 124904, 124907 },
+ { 124909, 124910 },
+ { 124912, 124926 },
+ { 124928, 125124 },
+ { 126464, 126467 },
+ { 126469, 126495 },
+ { 126497, 126498 },
+ { 126500, 126500 },
+ { 126503, 126503 },
+ { 126505, 126514 },
+ { 126516, 126519 },
+ { 126521, 126521 },
+ { 126523, 126523 },
+ { 126530, 126530 },
+ { 126535, 126535 },
+ { 126537, 126537 },
+ { 126539, 126539 },
+ { 126541, 126543 },
+ { 126545, 126546 },
+ { 126548, 126548 },
+ { 126551, 126551 },
+ { 126553, 126553 },
+ { 126555, 126555 },
+ { 126557, 126557 },
+ { 126559, 126559 },
+ { 126561, 126562 },
+ { 126564, 126564 },
+ { 126567, 126570 },
+ { 126572, 126578 },
+ { 126580, 126583 },
+ { 126585, 126588 },
+ { 126590, 126590 },
+ { 126592, 126601 },
+ { 126603, 126619 },
+ { 126625, 126627 },
+ { 126629, 126633 },
+ { 126635, 126651 },
+ { 131072, 173791 },
+ { 173824, 177977 },
+ { 177984, 178205 },
+ { 178208, 183969 },
+ { 183984, 191456 },
+ { 191472, 192093 },
+ { 194560, 195101 },
+ { 196608, 201546 },
+ { 201552, 205743 },
+};
+static const URange16 Lt_range16[] = {
+ { 453, 453 },
+ { 456, 456 },
+ { 459, 459 },
+ { 498, 498 },
+ { 8072, 8079 },
+ { 8088, 8095 },
+ { 8104, 8111 },
+ { 8124, 8124 },
+ { 8140, 8140 },
+ { 8188, 8188 },
+};
+static const URange16 Lu_range16[] = {
+ { 65, 90 },
+ { 192, 214 },
+ { 216, 222 },
+ { 256, 256 },
+ { 258, 258 },
+ { 260, 260 },
+ { 262, 262 },
+ { 264, 264 },
+ { 266, 266 },
+ { 268, 268 },
+ { 270, 270 },
+ { 272, 272 },
+ { 274, 274 },
+ { 276, 276 },
+ { 278, 278 },
+ { 280, 280 },
+ { 282, 282 },
+ { 284, 284 },
+ { 286, 286 },
+ { 288, 288 },
+ { 290, 290 },
+ { 292, 292 },
+ { 294, 294 },
+ { 296, 296 },
+ { 298, 298 },
+ { 300, 300 },
+ { 302, 302 },
+ { 304, 304 },
+ { 306, 306 },
+ { 308, 308 },
+ { 310, 310 },
+ { 313, 313 },
+ { 315, 315 },
+ { 317, 317 },
+ { 319, 319 },
+ { 321, 321 },
+ { 323, 323 },
+ { 325, 325 },
+ { 327, 327 },
+ { 330, 330 },
+ { 332, 332 },
+ { 334, 334 },
+ { 336, 336 },
+ { 338, 338 },
+ { 340, 340 },
+ { 342, 342 },
+ { 344, 344 },
+ { 346, 346 },
+ { 348, 348 },
+ { 350, 350 },
+ { 352, 352 },
+ { 354, 354 },
+ { 356, 356 },
+ { 358, 358 },
+ { 360, 360 },
+ { 362, 362 },
+ { 364, 364 },
+ { 366, 366 },
+ { 368, 368 },
+ { 370, 370 },
+ { 372, 372 },
+ { 374, 374 },
+ { 376, 377 },
+ { 379, 379 },
+ { 381, 381 },
+ { 385, 386 },
+ { 388, 388 },
+ { 390, 391 },
+ { 393, 395 },
+ { 398, 401 },
+ { 403, 404 },
+ { 406, 408 },
+ { 412, 413 },
+ { 415, 416 },
+ { 418, 418 },
+ { 420, 420 },
+ { 422, 423 },
+ { 425, 425 },
+ { 428, 428 },
+ { 430, 431 },
+ { 433, 435 },
+ { 437, 437 },
+ { 439, 440 },
+ { 444, 444 },
+ { 452, 452 },
+ { 455, 455 },
+ { 458, 458 },
+ { 461, 461 },
+ { 463, 463 },
+ { 465, 465 },
+ { 467, 467 },
+ { 469, 469 },
+ { 471, 471 },
+ { 473, 473 },
+ { 475, 475 },
+ { 478, 478 },
+ { 480, 480 },
+ { 482, 482 },
+ { 484, 484 },
+ { 486, 486 },
+ { 488, 488 },
+ { 490, 490 },
+ { 492, 492 },
+ { 494, 494 },
+ { 497, 497 },
+ { 500, 500 },
+ { 502, 504 },
+ { 506, 506 },
+ { 508, 508 },
+ { 510, 510 },
+ { 512, 512 },
+ { 514, 514 },
+ { 516, 516 },
+ { 518, 518 },
+ { 520, 520 },
+ { 522, 522 },
+ { 524, 524 },
+ { 526, 526 },
+ { 528, 528 },
+ { 530, 530 },
+ { 532, 532 },
+ { 534, 534 },
+ { 536, 536 },
+ { 538, 538 },
+ { 540, 540 },
+ { 542, 542 },
+ { 544, 544 },
+ { 546, 546 },
+ { 548, 548 },
+ { 550, 550 },
+ { 552, 552 },
+ { 554, 554 },
+ { 556, 556 },
+ { 558, 558 },
+ { 560, 560 },
+ { 562, 562 },
+ { 570, 571 },
+ { 573, 574 },
+ { 577, 577 },
+ { 579, 582 },
+ { 584, 584 },
+ { 586, 586 },
+ { 588, 588 },
+ { 590, 590 },
+ { 880, 880 },
+ { 882, 882 },
+ { 886, 886 },
+ { 895, 895 },
+ { 902, 902 },
+ { 904, 906 },
+ { 908, 908 },
+ { 910, 911 },
+ { 913, 929 },
+ { 931, 939 },
+ { 975, 975 },
+ { 978, 980 },
+ { 984, 984 },
+ { 986, 986 },
+ { 988, 988 },
+ { 990, 990 },
+ { 992, 992 },
+ { 994, 994 },
+ { 996, 996 },
+ { 998, 998 },
+ { 1000, 1000 },
+ { 1002, 1002 },
+ { 1004, 1004 },
+ { 1006, 1006 },
+ { 1012, 1012 },
+ { 1015, 1015 },
+ { 1017, 1018 },
+ { 1021, 1071 },
+ { 1120, 1120 },
+ { 1122, 1122 },
+ { 1124, 1124 },
+ { 1126, 1126 },
+ { 1128, 1128 },
+ { 1130, 1130 },
+ { 1132, 1132 },
+ { 1134, 1134 },
+ { 1136, 1136 },
+ { 1138, 1138 },
+ { 1140, 1140 },
+ { 1142, 1142 },
+ { 1144, 1144 },
+ { 1146, 1146 },
+ { 1148, 1148 },
+ { 1150, 1150 },
+ { 1152, 1152 },
+ { 1162, 1162 },
+ { 1164, 1164 },
+ { 1166, 1166 },
+ { 1168, 1168 },
+ { 1170, 1170 },
+ { 1172, 1172 },
+ { 1174, 1174 },
+ { 1176, 1176 },
+ { 1178, 1178 },
+ { 1180, 1180 },
+ { 1182, 1182 },
+ { 1184, 1184 },
+ { 1186, 1186 },
+ { 1188, 1188 },
+ { 1190, 1190 },
+ { 1192, 1192 },
+ { 1194, 1194 },
+ { 1196, 1196 },
+ { 1198, 1198 },
+ { 1200, 1200 },
+ { 1202, 1202 },
+ { 1204, 1204 },
+ { 1206, 1206 },
+ { 1208, 1208 },
+ { 1210, 1210 },
+ { 1212, 1212 },
+ { 1214, 1214 },
+ { 1216, 1217 },
+ { 1219, 1219 },
+ { 1221, 1221 },
+ { 1223, 1223 },
+ { 1225, 1225 },
+ { 1227, 1227 },
+ { 1229, 1229 },
+ { 1232, 1232 },
+ { 1234, 1234 },
+ { 1236, 1236 },
+ { 1238, 1238 },
+ { 1240, 1240 },
+ { 1242, 1242 },
+ { 1244, 1244 },
+ { 1246, 1246 },
+ { 1248, 1248 },
+ { 1250, 1250 },
+ { 1252, 1252 },
+ { 1254, 1254 },
+ { 1256, 1256 },
+ { 1258, 1258 },
+ { 1260, 1260 },
+ { 1262, 1262 },
+ { 1264, 1264 },
+ { 1266, 1266 },
+ { 1268, 1268 },
+ { 1270, 1270 },
+ { 1272, 1272 },
+ { 1274, 1274 },
+ { 1276, 1276 },
+ { 1278, 1278 },
+ { 1280, 1280 },
+ { 1282, 1282 },
+ { 1284, 1284 },
+ { 1286, 1286 },
+ { 1288, 1288 },
+ { 1290, 1290 },
+ { 1292, 1292 },
+ { 1294, 1294 },
+ { 1296, 1296 },
+ { 1298, 1298 },
+ { 1300, 1300 },
+ { 1302, 1302 },
+ { 1304, 1304 },
+ { 1306, 1306 },
+ { 1308, 1308 },
+ { 1310, 1310 },
+ { 1312, 1312 },
+ { 1314, 1314 },
+ { 1316, 1316 },
+ { 1318, 1318 },
+ { 1320, 1320 },
+ { 1322, 1322 },
+ { 1324, 1324 },
+ { 1326, 1326 },
+ { 1329, 1366 },
+ { 4256, 4293 },
+ { 4295, 4295 },
+ { 4301, 4301 },
+ { 5024, 5109 },
+ { 7312, 7354 },
+ { 7357, 7359 },
+ { 7680, 7680 },
+ { 7682, 7682 },
+ { 7684, 7684 },
+ { 7686, 7686 },
+ { 7688, 7688 },
+ { 7690, 7690 },
+ { 7692, 7692 },
+ { 7694, 7694 },
+ { 7696, 7696 },
+ { 7698, 7698 },
+ { 7700, 7700 },
+ { 7702, 7702 },
+ { 7704, 7704 },
+ { 7706, 7706 },
+ { 7708, 7708 },
+ { 7710, 7710 },
+ { 7712, 7712 },
+ { 7714, 7714 },
+ { 7716, 7716 },
+ { 7718, 7718 },
+ { 7720, 7720 },
+ { 7722, 7722 },
+ { 7724, 7724 },
+ { 7726, 7726 },
+ { 7728, 7728 },
+ { 7730, 7730 },
+ { 7732, 7732 },
+ { 7734, 7734 },
+ { 7736, 7736 },
+ { 7738, 7738 },
+ { 7740, 7740 },
+ { 7742, 7742 },
+ { 7744, 7744 },
+ { 7746, 7746 },
+ { 7748, 7748 },
+ { 7750, 7750 },
+ { 7752, 7752 },
+ { 7754, 7754 },
+ { 7756, 7756 },
+ { 7758, 7758 },
+ { 7760, 7760 },
+ { 7762, 7762 },
+ { 7764, 7764 },
+ { 7766, 7766 },
+ { 7768, 7768 },
+ { 7770, 7770 },
+ { 7772, 7772 },
+ { 7774, 7774 },
+ { 7776, 7776 },
+ { 7778, 7778 },
+ { 7780, 7780 },
+ { 7782, 7782 },
+ { 7784, 7784 },
+ { 7786, 7786 },
+ { 7788, 7788 },
+ { 7790, 7790 },
+ { 7792, 7792 },
+ { 7794, 7794 },
+ { 7796, 7796 },
+ { 7798, 7798 },
+ { 7800, 7800 },
+ { 7802, 7802 },
+ { 7804, 7804 },
+ { 7806, 7806 },
+ { 7808, 7808 },
+ { 7810, 7810 },
+ { 7812, 7812 },
+ { 7814, 7814 },
+ { 7816, 7816 },
+ { 7818, 7818 },
+ { 7820, 7820 },
+ { 7822, 7822 },
+ { 7824, 7824 },
+ { 7826, 7826 },
+ { 7828, 7828 },
+ { 7838, 7838 },
+ { 7840, 7840 },
+ { 7842, 7842 },
+ { 7844, 7844 },
+ { 7846, 7846 },
+ { 7848, 7848 },
+ { 7850, 7850 },
+ { 7852, 7852 },
+ { 7854, 7854 },
+ { 7856, 7856 },
+ { 7858, 7858 },
+ { 7860, 7860 },
+ { 7862, 7862 },
+ { 7864, 7864 },
+ { 7866, 7866 },
+ { 7868, 7868 },
+ { 7870, 7870 },
+ { 7872, 7872 },
+ { 7874, 7874 },
+ { 7876, 7876 },
+ { 7878, 7878 },
+ { 7880, 7880 },
+ { 7882, 7882 },
+ { 7884, 7884 },
+ { 7886, 7886 },
+ { 7888, 7888 },
+ { 7890, 7890 },
+ { 7892, 7892 },
+ { 7894, 7894 },
+ { 7896, 7896 },
+ { 7898, 7898 },
+ { 7900, 7900 },
+ { 7902, 7902 },
+ { 7904, 7904 },
+ { 7906, 7906 },
+ { 7908, 7908 },
+ { 7910, 7910 },
+ { 7912, 7912 },
+ { 7914, 7914 },
+ { 7916, 7916 },
+ { 7918, 7918 },
+ { 7920, 7920 },
+ { 7922, 7922 },
+ { 7924, 7924 },
+ { 7926, 7926 },
+ { 7928, 7928 },
+ { 7930, 7930 },
+ { 7932, 7932 },
+ { 7934, 7934 },
+ { 7944, 7951 },
+ { 7960, 7965 },
+ { 7976, 7983 },
+ { 7992, 7999 },
+ { 8008, 8013 },
+ { 8025, 8025 },
+ { 8027, 8027 },
+ { 8029, 8029 },
+ { 8031, 8031 },
+ { 8040, 8047 },
+ { 8120, 8123 },
+ { 8136, 8139 },
+ { 8152, 8155 },
+ { 8168, 8172 },
+ { 8184, 8187 },
+ { 8450, 8450 },
+ { 8455, 8455 },
+ { 8459, 8461 },
+ { 8464, 8466 },
+ { 8469, 8469 },
+ { 8473, 8477 },
+ { 8484, 8484 },
+ { 8486, 8486 },
+ { 8488, 8488 },
+ { 8490, 8493 },
+ { 8496, 8499 },
+ { 8510, 8511 },
+ { 8517, 8517 },
+ { 8579, 8579 },
+ { 11264, 11311 },
+ { 11360, 11360 },
+ { 11362, 11364 },
+ { 11367, 11367 },
+ { 11369, 11369 },
+ { 11371, 11371 },
+ { 11373, 11376 },
+ { 11378, 11378 },
+ { 11381, 11381 },
+ { 11390, 11392 },
+ { 11394, 11394 },
+ { 11396, 11396 },
+ { 11398, 11398 },
+ { 11400, 11400 },
+ { 11402, 11402 },
+ { 11404, 11404 },
+ { 11406, 11406 },
+ { 11408, 11408 },
+ { 11410, 11410 },
+ { 11412, 11412 },
+ { 11414, 11414 },
+ { 11416, 11416 },
+ { 11418, 11418 },
+ { 11420, 11420 },
+ { 11422, 11422 },
+ { 11424, 11424 },
+ { 11426, 11426 },
+ { 11428, 11428 },
+ { 11430, 11430 },
+ { 11432, 11432 },
+ { 11434, 11434 },
+ { 11436, 11436 },
+ { 11438, 11438 },
+ { 11440, 11440 },
+ { 11442, 11442 },
+ { 11444, 11444 },
+ { 11446, 11446 },
+ { 11448, 11448 },
+ { 11450, 11450 },
+ { 11452, 11452 },
+ { 11454, 11454 },
+ { 11456, 11456 },
+ { 11458, 11458 },
+ { 11460, 11460 },
+ { 11462, 11462 },
+ { 11464, 11464 },
+ { 11466, 11466 },
+ { 11468, 11468 },
+ { 11470, 11470 },
+ { 11472, 11472 },
+ { 11474, 11474 },
+ { 11476, 11476 },
+ { 11478, 11478 },
+ { 11480, 11480 },
+ { 11482, 11482 },
+ { 11484, 11484 },
+ { 11486, 11486 },
+ { 11488, 11488 },
+ { 11490, 11490 },
+ { 11499, 11499 },
+ { 11501, 11501 },
+ { 11506, 11506 },
+ { 42560, 42560 },
+ { 42562, 42562 },
+ { 42564, 42564 },
+ { 42566, 42566 },
+ { 42568, 42568 },
+ { 42570, 42570 },
+ { 42572, 42572 },
+ { 42574, 42574 },
+ { 42576, 42576 },
+ { 42578, 42578 },
+ { 42580, 42580 },
+ { 42582, 42582 },
+ { 42584, 42584 },
+ { 42586, 42586 },
+ { 42588, 42588 },
+ { 42590, 42590 },
+ { 42592, 42592 },
+ { 42594, 42594 },
+ { 42596, 42596 },
+ { 42598, 42598 },
+ { 42600, 42600 },
+ { 42602, 42602 },
+ { 42604, 42604 },
+ { 42624, 42624 },
+ { 42626, 42626 },
+ { 42628, 42628 },
+ { 42630, 42630 },
+ { 42632, 42632 },
+ { 42634, 42634 },
+ { 42636, 42636 },
+ { 42638, 42638 },
+ { 42640, 42640 },
+ { 42642, 42642 },
+ { 42644, 42644 },
+ { 42646, 42646 },
+ { 42648, 42648 },
+ { 42650, 42650 },
+ { 42786, 42786 },
+ { 42788, 42788 },
+ { 42790, 42790 },
+ { 42792, 42792 },
+ { 42794, 42794 },
+ { 42796, 42796 },
+ { 42798, 42798 },
+ { 42802, 42802 },
+ { 42804, 42804 },
+ { 42806, 42806 },
+ { 42808, 42808 },
+ { 42810, 42810 },
+ { 42812, 42812 },
+ { 42814, 42814 },
+ { 42816, 42816 },
+ { 42818, 42818 },
+ { 42820, 42820 },
+ { 42822, 42822 },
+ { 42824, 42824 },
+ { 42826, 42826 },
+ { 42828, 42828 },
+ { 42830, 42830 },
+ { 42832, 42832 },
+ { 42834, 42834 },
+ { 42836, 42836 },
+ { 42838, 42838 },
+ { 42840, 42840 },
+ { 42842, 42842 },
+ { 42844, 42844 },
+ { 42846, 42846 },
+ { 42848, 42848 },
+ { 42850, 42850 },
+ { 42852, 42852 },
+ { 42854, 42854 },
+ { 42856, 42856 },
+ { 42858, 42858 },
+ { 42860, 42860 },
+ { 42862, 42862 },
+ { 42873, 42873 },
+ { 42875, 42875 },
+ { 42877, 42878 },
+ { 42880, 42880 },
+ { 42882, 42882 },
+ { 42884, 42884 },
+ { 42886, 42886 },
+ { 42891, 42891 },
+ { 42893, 42893 },
+ { 42896, 42896 },
+ { 42898, 42898 },
+ { 42902, 42902 },
+ { 42904, 42904 },
+ { 42906, 42906 },
+ { 42908, 42908 },
+ { 42910, 42910 },
+ { 42912, 42912 },
+ { 42914, 42914 },
+ { 42916, 42916 },
+ { 42918, 42918 },
+ { 42920, 42920 },
+ { 42922, 42926 },
+ { 42928, 42932 },
+ { 42934, 42934 },
+ { 42936, 42936 },
+ { 42938, 42938 },
+ { 42940, 42940 },
+ { 42942, 42942 },
+ { 42944, 42944 },
+ { 42946, 42946 },
+ { 42948, 42951 },
+ { 42953, 42953 },
+ { 42960, 42960 },
+ { 42966, 42966 },
+ { 42968, 42968 },
+ { 42997, 42997 },
+ { 65313, 65338 },
+};
+static const URange32 Lu_range32[] = {
+ { 66560, 66599 },
+ { 66736, 66771 },
+ { 66928, 66938 },
+ { 66940, 66954 },
+ { 66956, 66962 },
+ { 66964, 66965 },
+ { 68736, 68786 },
+ { 71840, 71871 },
+ { 93760, 93791 },
+ { 119808, 119833 },
+ { 119860, 119885 },
+ { 119912, 119937 },
+ { 119964, 119964 },
+ { 119966, 119967 },
+ { 119970, 119970 },
+ { 119973, 119974 },
+ { 119977, 119980 },
+ { 119982, 119989 },
+ { 120016, 120041 },
+ { 120068, 120069 },
+ { 120071, 120074 },
+ { 120077, 120084 },
+ { 120086, 120092 },
+ { 120120, 120121 },
+ { 120123, 120126 },
+ { 120128, 120132 },
+ { 120134, 120134 },
+ { 120138, 120144 },
+ { 120172, 120197 },
+ { 120224, 120249 },
+ { 120276, 120301 },
+ { 120328, 120353 },
+ { 120380, 120405 },
+ { 120432, 120457 },
+ { 120488, 120512 },
+ { 120546, 120570 },
+ { 120604, 120628 },
+ { 120662, 120686 },
+ { 120720, 120744 },
+ { 120778, 120778 },
+ { 125184, 125217 },
+};
+static const URange16 M_range16[] = {
+ { 768, 879 },
+ { 1155, 1161 },
+ { 1425, 1469 },
+ { 1471, 1471 },
+ { 1473, 1474 },
+ { 1476, 1477 },
+ { 1479, 1479 },
+ { 1552, 1562 },
+ { 1611, 1631 },
+ { 1648, 1648 },
+ { 1750, 1756 },
+ { 1759, 1764 },
+ { 1767, 1768 },
+ { 1770, 1773 },
+ { 1809, 1809 },
+ { 1840, 1866 },
+ { 1958, 1968 },
+ { 2027, 2035 },
+ { 2045, 2045 },
+ { 2070, 2073 },
+ { 2075, 2083 },
+ { 2085, 2087 },
+ { 2089, 2093 },
+ { 2137, 2139 },
+ { 2200, 2207 },
+ { 2250, 2273 },
+ { 2275, 2307 },
+ { 2362, 2364 },
+ { 2366, 2383 },
+ { 2385, 2391 },
+ { 2402, 2403 },
+ { 2433, 2435 },
+ { 2492, 2492 },
+ { 2494, 2500 },
+ { 2503, 2504 },
+ { 2507, 2509 },
+ { 2519, 2519 },
+ { 2530, 2531 },
+ { 2558, 2558 },
+ { 2561, 2563 },
+ { 2620, 2620 },
+ { 2622, 2626 },
+ { 2631, 2632 },
+ { 2635, 2637 },
+ { 2641, 2641 },
+ { 2672, 2673 },
+ { 2677, 2677 },
+ { 2689, 2691 },
+ { 2748, 2748 },
+ { 2750, 2757 },
+ { 2759, 2761 },
+ { 2763, 2765 },
+ { 2786, 2787 },
+ { 2810, 2815 },
+ { 2817, 2819 },
+ { 2876, 2876 },
+ { 2878, 2884 },
+ { 2887, 2888 },
+ { 2891, 2893 },
+ { 2901, 2903 },
+ { 2914, 2915 },
+ { 2946, 2946 },
+ { 3006, 3010 },
+ { 3014, 3016 },
+ { 3018, 3021 },
+ { 3031, 3031 },
+ { 3072, 3076 },
+ { 3132, 3132 },
+ { 3134, 3140 },
+ { 3142, 3144 },
+ { 3146, 3149 },
+ { 3157, 3158 },
+ { 3170, 3171 },
+ { 3201, 3203 },
+ { 3260, 3260 },
+ { 3262, 3268 },
+ { 3270, 3272 },
+ { 3274, 3277 },
+ { 3285, 3286 },
+ { 3298, 3299 },
+ { 3315, 3315 },
+ { 3328, 3331 },
+ { 3387, 3388 },
+ { 3390, 3396 },
+ { 3398, 3400 },
+ { 3402, 3405 },
+ { 3415, 3415 },
+ { 3426, 3427 },
+ { 3457, 3459 },
+ { 3530, 3530 },
+ { 3535, 3540 },
+ { 3542, 3542 },
+ { 3544, 3551 },
+ { 3570, 3571 },
+ { 3633, 3633 },
+ { 3636, 3642 },
+ { 3655, 3662 },
+ { 3761, 3761 },
+ { 3764, 3772 },
+ { 3784, 3790 },
+ { 3864, 3865 },
+ { 3893, 3893 },
+ { 3895, 3895 },
+ { 3897, 3897 },
+ { 3902, 3903 },
+ { 3953, 3972 },
+ { 3974, 3975 },
+ { 3981, 3991 },
+ { 3993, 4028 },
+ { 4038, 4038 },
+ { 4139, 4158 },
+ { 4182, 4185 },
+ { 4190, 4192 },
+ { 4194, 4196 },
+ { 4199, 4205 },
+ { 4209, 4212 },
+ { 4226, 4237 },
+ { 4239, 4239 },
+ { 4250, 4253 },
+ { 4957, 4959 },
+ { 5906, 5909 },
+ { 5938, 5940 },
+ { 5970, 5971 },
+ { 6002, 6003 },
+ { 6068, 6099 },
+ { 6109, 6109 },
+ { 6155, 6157 },
+ { 6159, 6159 },
+ { 6277, 6278 },
+ { 6313, 6313 },
+ { 6432, 6443 },
+ { 6448, 6459 },
+ { 6679, 6683 },
+ { 6741, 6750 },
+ { 6752, 6780 },
+ { 6783, 6783 },
+ { 6832, 6862 },
+ { 6912, 6916 },
+ { 6964, 6980 },
+ { 7019, 7027 },
+ { 7040, 7042 },
+ { 7073, 7085 },
+ { 7142, 7155 },
+ { 7204, 7223 },
+ { 7376, 7378 },
+ { 7380, 7400 },
+ { 7405, 7405 },
+ { 7412, 7412 },
+ { 7415, 7417 },
+ { 7616, 7679 },
+ { 8400, 8432 },
+ { 11503, 11505 },
+ { 11647, 11647 },
+ { 11744, 11775 },
+ { 12330, 12335 },
+ { 12441, 12442 },
+ { 42607, 42610 },
+ { 42612, 42621 },
+ { 42654, 42655 },
+ { 42736, 42737 },
+ { 43010, 43010 },
+ { 43014, 43014 },
+ { 43019, 43019 },
+ { 43043, 43047 },
+ { 43052, 43052 },
+ { 43136, 43137 },
+ { 43188, 43205 },
+ { 43232, 43249 },
+ { 43263, 43263 },
+ { 43302, 43309 },
+ { 43335, 43347 },
+ { 43392, 43395 },
+ { 43443, 43456 },
+ { 43493, 43493 },
+ { 43561, 43574 },
+ { 43587, 43587 },
+ { 43596, 43597 },
+ { 43643, 43645 },
+ { 43696, 43696 },
+ { 43698, 43700 },
+ { 43703, 43704 },
+ { 43710, 43711 },
+ { 43713, 43713 },
+ { 43755, 43759 },
+ { 43765, 43766 },
+ { 44003, 44010 },
+ { 44012, 44013 },
+ { 64286, 64286 },
+ { 65024, 65039 },
+ { 65056, 65071 },
+};
+static const URange32 M_range32[] = {
+ { 66045, 66045 },
+ { 66272, 66272 },
+ { 66422, 66426 },
+ { 68097, 68099 },
+ { 68101, 68102 },
+ { 68108, 68111 },
+ { 68152, 68154 },
+ { 68159, 68159 },
+ { 68325, 68326 },
+ { 68900, 68903 },
+ { 69291, 69292 },
+ { 69373, 69375 },
+ { 69446, 69456 },
+ { 69506, 69509 },
+ { 69632, 69634 },
+ { 69688, 69702 },
+ { 69744, 69744 },
+ { 69747, 69748 },
+ { 69759, 69762 },
+ { 69808, 69818 },
+ { 69826, 69826 },
+ { 69888, 69890 },
+ { 69927, 69940 },
+ { 69957, 69958 },
+ { 70003, 70003 },
+ { 70016, 70018 },
+ { 70067, 70080 },
+ { 70089, 70092 },
+ { 70094, 70095 },
+ { 70188, 70199 },
+ { 70206, 70206 },
+ { 70209, 70209 },
+ { 70367, 70378 },
+ { 70400, 70403 },
+ { 70459, 70460 },
+ { 70462, 70468 },
+ { 70471, 70472 },
+ { 70475, 70477 },
+ { 70487, 70487 },
+ { 70498, 70499 },
+ { 70502, 70508 },
+ { 70512, 70516 },
+ { 70709, 70726 },
+ { 70750, 70750 },
+ { 70832, 70851 },
+ { 71087, 71093 },
+ { 71096, 71104 },
+ { 71132, 71133 },
+ { 71216, 71232 },
+ { 71339, 71351 },
+ { 71453, 71467 },
+ { 71724, 71738 },
+ { 71984, 71989 },
+ { 71991, 71992 },
+ { 71995, 71998 },
+ { 72000, 72000 },
+ { 72002, 72003 },
+ { 72145, 72151 },
+ { 72154, 72160 },
+ { 72164, 72164 },
+ { 72193, 72202 },
+ { 72243, 72249 },
+ { 72251, 72254 },
+ { 72263, 72263 },
+ { 72273, 72283 },
+ { 72330, 72345 },
+ { 72751, 72758 },
+ { 72760, 72767 },
+ { 72850, 72871 },
+ { 72873, 72886 },
+ { 73009, 73014 },
+ { 73018, 73018 },
+ { 73020, 73021 },
+ { 73023, 73029 },
+ { 73031, 73031 },
+ { 73098, 73102 },
+ { 73104, 73105 },
+ { 73107, 73111 },
+ { 73459, 73462 },
+ { 73472, 73473 },
+ { 73475, 73475 },
+ { 73524, 73530 },
+ { 73534, 73538 },
+ { 78912, 78912 },
+ { 78919, 78933 },
+ { 92912, 92916 },
+ { 92976, 92982 },
+ { 94031, 94031 },
+ { 94033, 94087 },
+ { 94095, 94098 },
+ { 94180, 94180 },
+ { 94192, 94193 },
+ { 113821, 113822 },
+ { 118528, 118573 },
+ { 118576, 118598 },
+ { 119141, 119145 },
+ { 119149, 119154 },
+ { 119163, 119170 },
+ { 119173, 119179 },
+ { 119210, 119213 },
+ { 119362, 119364 },
+ { 121344, 121398 },
+ { 121403, 121452 },
+ { 121461, 121461 },
+ { 121476, 121476 },
+ { 121499, 121503 },
+ { 121505, 121519 },
+ { 122880, 122886 },
+ { 122888, 122904 },
+ { 122907, 122913 },
+ { 122915, 122916 },
+ { 122918, 122922 },
+ { 123023, 123023 },
+ { 123184, 123190 },
+ { 123566, 123566 },
+ { 123628, 123631 },
+ { 124140, 124143 },
+ { 125136, 125142 },
+ { 125252, 125258 },
+ { 917760, 917999 },
+};
+static const URange16 Mc_range16[] = {
+ { 2307, 2307 },
+ { 2363, 2363 },
+ { 2366, 2368 },
+ { 2377, 2380 },
+ { 2382, 2383 },
+ { 2434, 2435 },
+ { 2494, 2496 },
+ { 2503, 2504 },
+ { 2507, 2508 },
+ { 2519, 2519 },
+ { 2563, 2563 },
+ { 2622, 2624 },
+ { 2691, 2691 },
+ { 2750, 2752 },
+ { 2761, 2761 },
+ { 2763, 2764 },
+ { 2818, 2819 },
+ { 2878, 2878 },
+ { 2880, 2880 },
+ { 2887, 2888 },
+ { 2891, 2892 },
+ { 2903, 2903 },
+ { 3006, 3007 },
+ { 3009, 3010 },
+ { 3014, 3016 },
+ { 3018, 3020 },
+ { 3031, 3031 },
+ { 3073, 3075 },
+ { 3137, 3140 },
+ { 3202, 3203 },
+ { 3262, 3262 },
+ { 3264, 3268 },
+ { 3271, 3272 },
+ { 3274, 3275 },
+ { 3285, 3286 },
+ { 3315, 3315 },
+ { 3330, 3331 },
+ { 3390, 3392 },
+ { 3398, 3400 },
+ { 3402, 3404 },
+ { 3415, 3415 },
+ { 3458, 3459 },
+ { 3535, 3537 },
+ { 3544, 3551 },
+ { 3570, 3571 },
+ { 3902, 3903 },
+ { 3967, 3967 },
+ { 4139, 4140 },
+ { 4145, 4145 },
+ { 4152, 4152 },
+ { 4155, 4156 },
+ { 4182, 4183 },
+ { 4194, 4196 },
+ { 4199, 4205 },
+ { 4227, 4228 },
+ { 4231, 4236 },
+ { 4239, 4239 },
+ { 4250, 4252 },
+ { 5909, 5909 },
+ { 5940, 5940 },
+ { 6070, 6070 },
+ { 6078, 6085 },
+ { 6087, 6088 },
+ { 6435, 6438 },
+ { 6441, 6443 },
+ { 6448, 6449 },
+ { 6451, 6456 },
+ { 6681, 6682 },
+ { 6741, 6741 },
+ { 6743, 6743 },
+ { 6753, 6753 },
+ { 6755, 6756 },
+ { 6765, 6770 },
+ { 6916, 6916 },
+ { 6965, 6965 },
+ { 6971, 6971 },
+ { 6973, 6977 },
+ { 6979, 6980 },
+ { 7042, 7042 },
+ { 7073, 7073 },
+ { 7078, 7079 },
+ { 7082, 7082 },
+ { 7143, 7143 },
+ { 7146, 7148 },
+ { 7150, 7150 },
+ { 7154, 7155 },
+ { 7204, 7211 },
+ { 7220, 7221 },
+ { 7393, 7393 },
+ { 7415, 7415 },
+ { 12334, 12335 },
+ { 43043, 43044 },
+ { 43047, 43047 },
+ { 43136, 43137 },
+ { 43188, 43203 },
+ { 43346, 43347 },
+ { 43395, 43395 },
+ { 43444, 43445 },
+ { 43450, 43451 },
+ { 43454, 43456 },
+ { 43567, 43568 },
+ { 43571, 43572 },
+ { 43597, 43597 },
+ { 43643, 43643 },
+ { 43645, 43645 },
+ { 43755, 43755 },
+ { 43758, 43759 },
+ { 43765, 43765 },
+ { 44003, 44004 },
+ { 44006, 44007 },
+ { 44009, 44010 },
+ { 44012, 44012 },
+};
+static const URange32 Mc_range32[] = {
+ { 69632, 69632 },
+ { 69634, 69634 },
+ { 69762, 69762 },
+ { 69808, 69810 },
+ { 69815, 69816 },
+ { 69932, 69932 },
+ { 69957, 69958 },
+ { 70018, 70018 },
+ { 70067, 70069 },
+ { 70079, 70080 },
+ { 70094, 70094 },
+ { 70188, 70190 },
+ { 70194, 70195 },
+ { 70197, 70197 },
+ { 70368, 70370 },
+ { 70402, 70403 },
+ { 70462, 70463 },
+ { 70465, 70468 },
+ { 70471, 70472 },
+ { 70475, 70477 },
+ { 70487, 70487 },
+ { 70498, 70499 },
+ { 70709, 70711 },
+ { 70720, 70721 },
+ { 70725, 70725 },
+ { 70832, 70834 },
+ { 70841, 70841 },
+ { 70843, 70846 },
+ { 70849, 70849 },
+ { 71087, 71089 },
+ { 71096, 71099 },
+ { 71102, 71102 },
+ { 71216, 71218 },
+ { 71227, 71228 },
+ { 71230, 71230 },
+ { 71340, 71340 },
+ { 71342, 71343 },
+ { 71350, 71350 },
+ { 71456, 71457 },
+ { 71462, 71462 },
+ { 71724, 71726 },
+ { 71736, 71736 },
+ { 71984, 71989 },
+ { 71991, 71992 },
+ { 71997, 71997 },
+ { 72000, 72000 },
+ { 72002, 72002 },
+ { 72145, 72147 },
+ { 72156, 72159 },
+ { 72164, 72164 },
+ { 72249, 72249 },
+ { 72279, 72280 },
+ { 72343, 72343 },
+ { 72751, 72751 },
+ { 72766, 72766 },
+ { 72873, 72873 },
+ { 72881, 72881 },
+ { 72884, 72884 },
+ { 73098, 73102 },
+ { 73107, 73108 },
+ { 73110, 73110 },
+ { 73461, 73462 },
+ { 73475, 73475 },
+ { 73524, 73525 },
+ { 73534, 73535 },
+ { 73537, 73537 },
+ { 94033, 94087 },
+ { 94192, 94193 },
+ { 119141, 119142 },
+ { 119149, 119154 },
+};
+static const URange16 Me_range16[] = {
+ { 1160, 1161 },
+ { 6846, 6846 },
+ { 8413, 8416 },
+ { 8418, 8420 },
+ { 42608, 42610 },
+};
+static const URange16 Mn_range16[] = {
+ { 768, 879 },
+ { 1155, 1159 },
+ { 1425, 1469 },
+ { 1471, 1471 },
+ { 1473, 1474 },
+ { 1476, 1477 },
+ { 1479, 1479 },
+ { 1552, 1562 },
+ { 1611, 1631 },
+ { 1648, 1648 },
+ { 1750, 1756 },
+ { 1759, 1764 },
+ { 1767, 1768 },
+ { 1770, 1773 },
+ { 1809, 1809 },
+ { 1840, 1866 },
+ { 1958, 1968 },
+ { 2027, 2035 },
+ { 2045, 2045 },
+ { 2070, 2073 },
+ { 2075, 2083 },
+ { 2085, 2087 },
+ { 2089, 2093 },
+ { 2137, 2139 },
+ { 2200, 2207 },
+ { 2250, 2273 },
+ { 2275, 2306 },
+ { 2362, 2362 },
+ { 2364, 2364 },
+ { 2369, 2376 },
+ { 2381, 2381 },
+ { 2385, 2391 },
+ { 2402, 2403 },
+ { 2433, 2433 },
+ { 2492, 2492 },
+ { 2497, 2500 },
+ { 2509, 2509 },
+ { 2530, 2531 },
+ { 2558, 2558 },
+ { 2561, 2562 },
+ { 2620, 2620 },
+ { 2625, 2626 },
+ { 2631, 2632 },
+ { 2635, 2637 },
+ { 2641, 2641 },
+ { 2672, 2673 },
+ { 2677, 2677 },
+ { 2689, 2690 },
+ { 2748, 2748 },
+ { 2753, 2757 },
+ { 2759, 2760 },
+ { 2765, 2765 },
+ { 2786, 2787 },
+ { 2810, 2815 },
+ { 2817, 2817 },
+ { 2876, 2876 },
+ { 2879, 2879 },
+ { 2881, 2884 },
+ { 2893, 2893 },
+ { 2901, 2902 },
+ { 2914, 2915 },
+ { 2946, 2946 },
+ { 3008, 3008 },
+ { 3021, 3021 },
+ { 3072, 3072 },
+ { 3076, 3076 },
+ { 3132, 3132 },
+ { 3134, 3136 },
+ { 3142, 3144 },
+ { 3146, 3149 },
+ { 3157, 3158 },
+ { 3170, 3171 },
+ { 3201, 3201 },
+ { 3260, 3260 },
+ { 3263, 3263 },
+ { 3270, 3270 },
+ { 3276, 3277 },
+ { 3298, 3299 },
+ { 3328, 3329 },
+ { 3387, 3388 },
+ { 3393, 3396 },
+ { 3405, 3405 },
+ { 3426, 3427 },
+ { 3457, 3457 },
+ { 3530, 3530 },
+ { 3538, 3540 },
+ { 3542, 3542 },
+ { 3633, 3633 },
+ { 3636, 3642 },
+ { 3655, 3662 },
+ { 3761, 3761 },
+ { 3764, 3772 },
+ { 3784, 3790 },
+ { 3864, 3865 },
+ { 3893, 3893 },
+ { 3895, 3895 },
+ { 3897, 3897 },
+ { 3953, 3966 },
+ { 3968, 3972 },
+ { 3974, 3975 },
+ { 3981, 3991 },
+ { 3993, 4028 },
+ { 4038, 4038 },
+ { 4141, 4144 },
+ { 4146, 4151 },
+ { 4153, 4154 },
+ { 4157, 4158 },
+ { 4184, 4185 },
+ { 4190, 4192 },
+ { 4209, 4212 },
+ { 4226, 4226 },
+ { 4229, 4230 },
+ { 4237, 4237 },
+ { 4253, 4253 },
+ { 4957, 4959 },
+ { 5906, 5908 },
+ { 5938, 5939 },
+ { 5970, 5971 },
+ { 6002, 6003 },
+ { 6068, 6069 },
+ { 6071, 6077 },
+ { 6086, 6086 },
+ { 6089, 6099 },
+ { 6109, 6109 },
+ { 6155, 6157 },
+ { 6159, 6159 },
+ { 6277, 6278 },
+ { 6313, 6313 },
+ { 6432, 6434 },
+ { 6439, 6440 },
+ { 6450, 6450 },
+ { 6457, 6459 },
+ { 6679, 6680 },
+ { 6683, 6683 },
+ { 6742, 6742 },
+ { 6744, 6750 },
+ { 6752, 6752 },
+ { 6754, 6754 },
+ { 6757, 6764 },
+ { 6771, 6780 },
+ { 6783, 6783 },
+ { 6832, 6845 },
+ { 6847, 6862 },
+ { 6912, 6915 },
+ { 6964, 6964 },
+ { 6966, 6970 },
+ { 6972, 6972 },
+ { 6978, 6978 },
+ { 7019, 7027 },
+ { 7040, 7041 },
+ { 7074, 7077 },
+ { 7080, 7081 },
+ { 7083, 7085 },
+ { 7142, 7142 },
+ { 7144, 7145 },
+ { 7149, 7149 },
+ { 7151, 7153 },
+ { 7212, 7219 },
+ { 7222, 7223 },
+ { 7376, 7378 },
+ { 7380, 7392 },
+ { 7394, 7400 },
+ { 7405, 7405 },
+ { 7412, 7412 },
+ { 7416, 7417 },
+ { 7616, 7679 },
+ { 8400, 8412 },
+ { 8417, 8417 },
+ { 8421, 8432 },
+ { 11503, 11505 },
+ { 11647, 11647 },
+ { 11744, 11775 },
+ { 12330, 12333 },
+ { 12441, 12442 },
+ { 42607, 42607 },
+ { 42612, 42621 },
+ { 42654, 42655 },
+ { 42736, 42737 },
+ { 43010, 43010 },
+ { 43014, 43014 },
+ { 43019, 43019 },
+ { 43045, 43046 },
+ { 43052, 43052 },
+ { 43204, 43205 },
+ { 43232, 43249 },
+ { 43263, 43263 },
+ { 43302, 43309 },
+ { 43335, 43345 },
+ { 43392, 43394 },
+ { 43443, 43443 },
+ { 43446, 43449 },
+ { 43452, 43453 },
+ { 43493, 43493 },
+ { 43561, 43566 },
+ { 43569, 43570 },
+ { 43573, 43574 },
+ { 43587, 43587 },
+ { 43596, 43596 },
+ { 43644, 43644 },
+ { 43696, 43696 },
+ { 43698, 43700 },
+ { 43703, 43704 },
+ { 43710, 43711 },
+ { 43713, 43713 },
+ { 43756, 43757 },
+ { 43766, 43766 },
+ { 44005, 44005 },
+ { 44008, 44008 },
+ { 44013, 44013 },
+ { 64286, 64286 },
+ { 65024, 65039 },
+ { 65056, 65071 },
+};
+static const URange32 Mn_range32[] = {
+ { 66045, 66045 },
+ { 66272, 66272 },
+ { 66422, 66426 },
+ { 68097, 68099 },
+ { 68101, 68102 },
+ { 68108, 68111 },
+ { 68152, 68154 },
+ { 68159, 68159 },
+ { 68325, 68326 },
+ { 68900, 68903 },
+ { 69291, 69292 },
+ { 69373, 69375 },
+ { 69446, 69456 },
+ { 69506, 69509 },
+ { 69633, 69633 },
+ { 69688, 69702 },
+ { 69744, 69744 },
+ { 69747, 69748 },
+ { 69759, 69761 },
+ { 69811, 69814 },
+ { 69817, 69818 },
+ { 69826, 69826 },
+ { 69888, 69890 },
+ { 69927, 69931 },
+ { 69933, 69940 },
+ { 70003, 70003 },
+ { 70016, 70017 },
+ { 70070, 70078 },
+ { 70089, 70092 },
+ { 70095, 70095 },
+ { 70191, 70193 },
+ { 70196, 70196 },
+ { 70198, 70199 },
+ { 70206, 70206 },
+ { 70209, 70209 },
+ { 70367, 70367 },
+ { 70371, 70378 },
+ { 70400, 70401 },
+ { 70459, 70460 },
+ { 70464, 70464 },
+ { 70502, 70508 },
+ { 70512, 70516 },
+ { 70712, 70719 },
+ { 70722, 70724 },
+ { 70726, 70726 },
+ { 70750, 70750 },
+ { 70835, 70840 },
+ { 70842, 70842 },
+ { 70847, 70848 },
+ { 70850, 70851 },
+ { 71090, 71093 },
+ { 71100, 71101 },
+ { 71103, 71104 },
+ { 71132, 71133 },
+ { 71219, 71226 },
+ { 71229, 71229 },
+ { 71231, 71232 },
+ { 71339, 71339 },
+ { 71341, 71341 },
+ { 71344, 71349 },
+ { 71351, 71351 },
+ { 71453, 71455 },
+ { 71458, 71461 },
+ { 71463, 71467 },
+ { 71727, 71735 },
+ { 71737, 71738 },
+ { 71995, 71996 },
+ { 71998, 71998 },
+ { 72003, 72003 },
+ { 72148, 72151 },
+ { 72154, 72155 },
+ { 72160, 72160 },
+ { 72193, 72202 },
+ { 72243, 72248 },
+ { 72251, 72254 },
+ { 72263, 72263 },
+ { 72273, 72278 },
+ { 72281, 72283 },
+ { 72330, 72342 },
+ { 72344, 72345 },
+ { 72752, 72758 },
+ { 72760, 72765 },
+ { 72767, 72767 },
+ { 72850, 72871 },
+ { 72874, 72880 },
+ { 72882, 72883 },
+ { 72885, 72886 },
+ { 73009, 73014 },
+ { 73018, 73018 },
+ { 73020, 73021 },
+ { 73023, 73029 },
+ { 73031, 73031 },
+ { 73104, 73105 },
+ { 73109, 73109 },
+ { 73111, 73111 },
+ { 73459, 73460 },
+ { 73472, 73473 },
+ { 73526, 73530 },
+ { 73536, 73536 },
+ { 73538, 73538 },
+ { 78912, 78912 },
+ { 78919, 78933 },
+ { 92912, 92916 },
+ { 92976, 92982 },
+ { 94031, 94031 },
+ { 94095, 94098 },
+ { 94180, 94180 },
+ { 113821, 113822 },
+ { 118528, 118573 },
+ { 118576, 118598 },
+ { 119143, 119145 },
+ { 119163, 119170 },
+ { 119173, 119179 },
+ { 119210, 119213 },
+ { 119362, 119364 },
+ { 121344, 121398 },
+ { 121403, 121452 },
+ { 121461, 121461 },
+ { 121476, 121476 },
+ { 121499, 121503 },
+ { 121505, 121519 },
+ { 122880, 122886 },
+ { 122888, 122904 },
+ { 122907, 122913 },
+ { 122915, 122916 },
+ { 122918, 122922 },
+ { 123023, 123023 },
+ { 123184, 123190 },
+ { 123566, 123566 },
+ { 123628, 123631 },
+ { 124140, 124143 },
+ { 125136, 125142 },
+ { 125252, 125258 },
+ { 917760, 917999 },
+};
+static const URange16 N_range16[] = {
+ { 48, 57 },
+ { 178, 179 },
+ { 185, 185 },
+ { 188, 190 },
+ { 1632, 1641 },
+ { 1776, 1785 },
+ { 1984, 1993 },
+ { 2406, 2415 },
+ { 2534, 2543 },
+ { 2548, 2553 },
+ { 2662, 2671 },
+ { 2790, 2799 },
+ { 2918, 2927 },
+ { 2930, 2935 },
+ { 3046, 3058 },
+ { 3174, 3183 },
+ { 3192, 3198 },
+ { 3302, 3311 },
+ { 3416, 3422 },
+ { 3430, 3448 },
+ { 3558, 3567 },
+ { 3664, 3673 },
+ { 3792, 3801 },
+ { 3872, 3891 },
+ { 4160, 4169 },
+ { 4240, 4249 },
+ { 4969, 4988 },
+ { 5870, 5872 },
+ { 6112, 6121 },
+ { 6128, 6137 },
+ { 6160, 6169 },
+ { 6470, 6479 },
+ { 6608, 6618 },
+ { 6784, 6793 },
+ { 6800, 6809 },
+ { 6992, 7001 },
+ { 7088, 7097 },
+ { 7232, 7241 },
+ { 7248, 7257 },
+ { 8304, 8304 },
+ { 8308, 8313 },
+ { 8320, 8329 },
+ { 8528, 8578 },
+ { 8581, 8585 },
+ { 9312, 9371 },
+ { 9450, 9471 },
+ { 10102, 10131 },
+ { 11517, 11517 },
+ { 12295, 12295 },
+ { 12321, 12329 },
+ { 12344, 12346 },
+ { 12690, 12693 },
+ { 12832, 12841 },
+ { 12872, 12879 },
+ { 12881, 12895 },
+ { 12928, 12937 },
+ { 12977, 12991 },
+ { 42528, 42537 },
+ { 42726, 42735 },
+ { 43056, 43061 },
+ { 43216, 43225 },
+ { 43264, 43273 },
+ { 43472, 43481 },
+ { 43504, 43513 },
+ { 43600, 43609 },
+ { 44016, 44025 },
+ { 65296, 65305 },
+};
+static const URange32 N_range32[] = {
+ { 65799, 65843 },
+ { 65856, 65912 },
+ { 65930, 65931 },
+ { 66273, 66299 },
+ { 66336, 66339 },
+ { 66369, 66369 },
+ { 66378, 66378 },
+ { 66513, 66517 },
+ { 66720, 66729 },
+ { 67672, 67679 },
+ { 67705, 67711 },
+ { 67751, 67759 },
+ { 67835, 67839 },
+ { 67862, 67867 },
+ { 68028, 68029 },
+ { 68032, 68047 },
+ { 68050, 68095 },
+ { 68160, 68168 },
+ { 68221, 68222 },
+ { 68253, 68255 },
+ { 68331, 68335 },
+ { 68440, 68447 },
+ { 68472, 68479 },
+ { 68521, 68527 },
+ { 68858, 68863 },
+ { 68912, 68921 },
+ { 69216, 69246 },
+ { 69405, 69414 },
+ { 69457, 69460 },
+ { 69573, 69579 },
+ { 69714, 69743 },
+ { 69872, 69881 },
+ { 69942, 69951 },
+ { 70096, 70105 },
+ { 70113, 70132 },
+ { 70384, 70393 },
+ { 70736, 70745 },
+ { 70864, 70873 },
+ { 71248, 71257 },
+ { 71360, 71369 },
+ { 71472, 71483 },
+ { 71904, 71922 },
+ { 72016, 72025 },
+ { 72784, 72812 },
+ { 73040, 73049 },
+ { 73120, 73129 },
+ { 73552, 73561 },
+ { 73664, 73684 },
+ { 74752, 74862 },
+ { 92768, 92777 },
+ { 92864, 92873 },
+ { 93008, 93017 },
+ { 93019, 93025 },
+ { 93824, 93846 },
+ { 119488, 119507 },
+ { 119520, 119539 },
+ { 119648, 119672 },
+ { 120782, 120831 },
+ { 123200, 123209 },
+ { 123632, 123641 },
+ { 124144, 124153 },
+ { 125127, 125135 },
+ { 125264, 125273 },
+ { 126065, 126123 },
+ { 126125, 126127 },
+ { 126129, 126132 },
+ { 126209, 126253 },
+ { 126255, 126269 },
+ { 127232, 127244 },
+ { 130032, 130041 },
+};
+static const URange16 Nd_range16[] = {
+ { 48, 57 },
+ { 1632, 1641 },
+ { 1776, 1785 },
+ { 1984, 1993 },
+ { 2406, 2415 },
+ { 2534, 2543 },
+ { 2662, 2671 },
+ { 2790, 2799 },
+ { 2918, 2927 },
+ { 3046, 3055 },
+ { 3174, 3183 },
+ { 3302, 3311 },
+ { 3430, 3439 },
+ { 3558, 3567 },
+ { 3664, 3673 },
+ { 3792, 3801 },
+ { 3872, 3881 },
+ { 4160, 4169 },
+ { 4240, 4249 },
+ { 6112, 6121 },
+ { 6160, 6169 },
+ { 6470, 6479 },
+ { 6608, 6617 },
+ { 6784, 6793 },
+ { 6800, 6809 },
+ { 6992, 7001 },
+ { 7088, 7097 },
+ { 7232, 7241 },
+ { 7248, 7257 },
+ { 42528, 42537 },
+ { 43216, 43225 },
+ { 43264, 43273 },
+ { 43472, 43481 },
+ { 43504, 43513 },
+ { 43600, 43609 },
+ { 44016, 44025 },
+ { 65296, 65305 },
+};
+static const URange32 Nd_range32[] = {
+ { 66720, 66729 },
+ { 68912, 68921 },
+ { 69734, 69743 },
+ { 69872, 69881 },
+ { 69942, 69951 },
+ { 70096, 70105 },
+ { 70384, 70393 },
+ { 70736, 70745 },
+ { 70864, 70873 },
+ { 71248, 71257 },
+ { 71360, 71369 },
+ { 71472, 71481 },
+ { 71904, 71913 },
+ { 72016, 72025 },
+ { 72784, 72793 },
+ { 73040, 73049 },
+ { 73120, 73129 },
+ { 73552, 73561 },
+ { 92768, 92777 },
+ { 92864, 92873 },
+ { 93008, 93017 },
+ { 120782, 120831 },
+ { 123200, 123209 },
+ { 123632, 123641 },
+ { 124144, 124153 },
+ { 125264, 125273 },
+ { 130032, 130041 },
+};
+static const URange16 Nl_range16[] = {
+ { 5870, 5872 },
+ { 8544, 8578 },
+ { 8581, 8584 },
+ { 12295, 12295 },
+ { 12321, 12329 },
+ { 12344, 12346 },
+ { 42726, 42735 },
+};
+static const URange32 Nl_range32[] = {
+ { 65856, 65908 },
+ { 66369, 66369 },
+ { 66378, 66378 },
+ { 66513, 66517 },
+ { 74752, 74862 },
+};
+static const URange16 No_range16[] = {
+ { 178, 179 },
+ { 185, 185 },
+ { 188, 190 },
+ { 2548, 2553 },
+ { 2930, 2935 },
+ { 3056, 3058 },
+ { 3192, 3198 },
+ { 3416, 3422 },
+ { 3440, 3448 },
+ { 3882, 3891 },
+ { 4969, 4988 },
+ { 6128, 6137 },
+ { 6618, 6618 },
+ { 8304, 8304 },
+ { 8308, 8313 },
+ { 8320, 8329 },
+ { 8528, 8543 },
+ { 8585, 8585 },
+ { 9312, 9371 },
+ { 9450, 9471 },
+ { 10102, 10131 },
+ { 11517, 11517 },
+ { 12690, 12693 },
+ { 12832, 12841 },
+ { 12872, 12879 },
+ { 12881, 12895 },
+ { 12928, 12937 },
+ { 12977, 12991 },
+ { 43056, 43061 },
+};
+static const URange32 No_range32[] = {
+ { 65799, 65843 },
+ { 65909, 65912 },
+ { 65930, 65931 },
+ { 66273, 66299 },
+ { 66336, 66339 },
+ { 67672, 67679 },
+ { 67705, 67711 },
+ { 67751, 67759 },
+ { 67835, 67839 },
+ { 67862, 67867 },
+ { 68028, 68029 },
+ { 68032, 68047 },
+ { 68050, 68095 },
+ { 68160, 68168 },
+ { 68221, 68222 },
+ { 68253, 68255 },
+ { 68331, 68335 },
+ { 68440, 68447 },
+ { 68472, 68479 },
+ { 68521, 68527 },
+ { 68858, 68863 },
+ { 69216, 69246 },
+ { 69405, 69414 },
+ { 69457, 69460 },
+ { 69573, 69579 },
+ { 69714, 69733 },
+ { 70113, 70132 },
+ { 71482, 71483 },
+ { 71914, 71922 },
+ { 72794, 72812 },
+ { 73664, 73684 },
+ { 93019, 93025 },
+ { 93824, 93846 },
+ { 119488, 119507 },
+ { 119520, 119539 },
+ { 119648, 119672 },
+ { 125127, 125135 },
+ { 126065, 126123 },
+ { 126125, 126127 },
+ { 126129, 126132 },
+ { 126209, 126253 },
+ { 126255, 126269 },
+ { 127232, 127244 },
+};
+static const URange16 P_range16[] = {
+ { 33, 35 },
+ { 37, 42 },
+ { 44, 47 },
+ { 58, 59 },
+ { 63, 64 },
+ { 91, 93 },
+ { 95, 95 },
+ { 123, 123 },
+ { 125, 125 },
+ { 161, 161 },
+ { 167, 167 },
+ { 171, 171 },
+ { 182, 183 },
+ { 187, 187 },
+ { 191, 191 },
+ { 894, 894 },
+ { 903, 903 },
+ { 1370, 1375 },
+ { 1417, 1418 },
+ { 1470, 1470 },
+ { 1472, 1472 },
+ { 1475, 1475 },
+ { 1478, 1478 },
+ { 1523, 1524 },
+ { 1545, 1546 },
+ { 1548, 1549 },
+ { 1563, 1563 },
+ { 1565, 1567 },
+ { 1642, 1645 },
+ { 1748, 1748 },
+ { 1792, 1805 },
+ { 2039, 2041 },
+ { 2096, 2110 },
+ { 2142, 2142 },
+ { 2404, 2405 },
+ { 2416, 2416 },
+ { 2557, 2557 },
+ { 2678, 2678 },
+ { 2800, 2800 },
+ { 3191, 3191 },
+ { 3204, 3204 },
+ { 3572, 3572 },
+ { 3663, 3663 },
+ { 3674, 3675 },
+ { 3844, 3858 },
+ { 3860, 3860 },
+ { 3898, 3901 },
+ { 3973, 3973 },
+ { 4048, 4052 },
+ { 4057, 4058 },
+ { 4170, 4175 },
+ { 4347, 4347 },
+ { 4960, 4968 },
+ { 5120, 5120 },
+ { 5742, 5742 },
+ { 5787, 5788 },
+ { 5867, 5869 },
+ { 5941, 5942 },
+ { 6100, 6102 },
+ { 6104, 6106 },
+ { 6144, 6154 },
+ { 6468, 6469 },
+ { 6686, 6687 },
+ { 6816, 6822 },
+ { 6824, 6829 },
+ { 7002, 7008 },
+ { 7037, 7038 },
+ { 7164, 7167 },
+ { 7227, 7231 },
+ { 7294, 7295 },
+ { 7360, 7367 },
+ { 7379, 7379 },
+ { 8208, 8231 },
+ { 8240, 8259 },
+ { 8261, 8273 },
+ { 8275, 8286 },
+ { 8317, 8318 },
+ { 8333, 8334 },
+ { 8968, 8971 },
+ { 9001, 9002 },
+ { 10088, 10101 },
+ { 10181, 10182 },
+ { 10214, 10223 },
+ { 10627, 10648 },
+ { 10712, 10715 },
+ { 10748, 10749 },
+ { 11513, 11516 },
+ { 11518, 11519 },
+ { 11632, 11632 },
+ { 11776, 11822 },
+ { 11824, 11855 },
+ { 11858, 11869 },
+ { 12289, 12291 },
+ { 12296, 12305 },
+ { 12308, 12319 },
+ { 12336, 12336 },
+ { 12349, 12349 },
+ { 12448, 12448 },
+ { 12539, 12539 },
+ { 42238, 42239 },
+ { 42509, 42511 },
+ { 42611, 42611 },
+ { 42622, 42622 },
+ { 42738, 42743 },
+ { 43124, 43127 },
+ { 43214, 43215 },
+ { 43256, 43258 },
+ { 43260, 43260 },
+ { 43310, 43311 },
+ { 43359, 43359 },
+ { 43457, 43469 },
+ { 43486, 43487 },
+ { 43612, 43615 },
+ { 43742, 43743 },
+ { 43760, 43761 },
+ { 44011, 44011 },
+ { 64830, 64831 },
+ { 65040, 65049 },
+ { 65072, 65106 },
+ { 65108, 65121 },
+ { 65123, 65123 },
+ { 65128, 65128 },
+ { 65130, 65131 },
+ { 65281, 65283 },
+ { 65285, 65290 },
+ { 65292, 65295 },
+ { 65306, 65307 },
+ { 65311, 65312 },
+ { 65339, 65341 },
+ { 65343, 65343 },
+ { 65371, 65371 },
+ { 65373, 65373 },
+ { 65375, 65381 },
+};
+static const URange32 P_range32[] = {
+ { 65792, 65794 },
+ { 66463, 66463 },
+ { 66512, 66512 },
+ { 66927, 66927 },
+ { 67671, 67671 },
+ { 67871, 67871 },
+ { 67903, 67903 },
+ { 68176, 68184 },
+ { 68223, 68223 },
+ { 68336, 68342 },
+ { 68409, 68415 },
+ { 68505, 68508 },
+ { 69293, 69293 },
+ { 69461, 69465 },
+ { 69510, 69513 },
+ { 69703, 69709 },
+ { 69819, 69820 },
+ { 69822, 69825 },
+ { 69952, 69955 },
+ { 70004, 70005 },
+ { 70085, 70088 },
+ { 70093, 70093 },
+ { 70107, 70107 },
+ { 70109, 70111 },
+ { 70200, 70205 },
+ { 70313, 70313 },
+ { 70731, 70735 },
+ { 70746, 70747 },
+ { 70749, 70749 },
+ { 70854, 70854 },
+ { 71105, 71127 },
+ { 71233, 71235 },
+ { 71264, 71276 },
+ { 71353, 71353 },
+ { 71484, 71486 },
+ { 71739, 71739 },
+ { 72004, 72006 },
+ { 72162, 72162 },
+ { 72255, 72262 },
+ { 72346, 72348 },
+ { 72350, 72354 },
+ { 72448, 72457 },
+ { 72769, 72773 },
+ { 72816, 72817 },
+ { 73463, 73464 },
+ { 73539, 73551 },
+ { 73727, 73727 },
+ { 74864, 74868 },
+ { 77809, 77810 },
+ { 92782, 92783 },
+ { 92917, 92917 },
+ { 92983, 92987 },
+ { 92996, 92996 },
+ { 93847, 93850 },
+ { 94178, 94178 },
+ { 113823, 113823 },
+ { 121479, 121483 },
+ { 125278, 125279 },
+};
+static const URange16 Pc_range16[] = {
+ { 95, 95 },
+ { 8255, 8256 },
+ { 8276, 8276 },
+ { 65075, 65076 },
+ { 65101, 65103 },
+ { 65343, 65343 },
+};
+static const URange16 Pd_range16[] = {
+ { 45, 45 },
+ { 1418, 1418 },
+ { 1470, 1470 },
+ { 5120, 5120 },
+ { 6150, 6150 },
+ { 8208, 8213 },
+ { 11799, 11799 },
+ { 11802, 11802 },
+ { 11834, 11835 },
+ { 11840, 11840 },
+ { 11869, 11869 },
+ { 12316, 12316 },
+ { 12336, 12336 },
+ { 12448, 12448 },
+ { 65073, 65074 },
+ { 65112, 65112 },
+ { 65123, 65123 },
+ { 65293, 65293 },
+};
+static const URange32 Pd_range32[] = {
+ { 69293, 69293 },
+};
+static const URange16 Pe_range16[] = {
+ { 41, 41 },
+ { 93, 93 },
+ { 125, 125 },
+ { 3899, 3899 },
+ { 3901, 3901 },
+ { 5788, 5788 },
+ { 8262, 8262 },
+ { 8318, 8318 },
+ { 8334, 8334 },
+ { 8969, 8969 },
+ { 8971, 8971 },
+ { 9002, 9002 },
+ { 10089, 10089 },
+ { 10091, 10091 },
+ { 10093, 10093 },
+ { 10095, 10095 },
+ { 10097, 10097 },
+ { 10099, 10099 },
+ { 10101, 10101 },
+ { 10182, 10182 },
+ { 10215, 10215 },
+ { 10217, 10217 },
+ { 10219, 10219 },
+ { 10221, 10221 },
+ { 10223, 10223 },
+ { 10628, 10628 },
+ { 10630, 10630 },
+ { 10632, 10632 },
+ { 10634, 10634 },
+ { 10636, 10636 },
+ { 10638, 10638 },
+ { 10640, 10640 },
+ { 10642, 10642 },
+ { 10644, 10644 },
+ { 10646, 10646 },
+ { 10648, 10648 },
+ { 10713, 10713 },
+ { 10715, 10715 },
+ { 10749, 10749 },
+ { 11811, 11811 },
+ { 11813, 11813 },
+ { 11815, 11815 },
+ { 11817, 11817 },
+ { 11862, 11862 },
+ { 11864, 11864 },
+ { 11866, 11866 },
+ { 11868, 11868 },
+ { 12297, 12297 },
+ { 12299, 12299 },
+ { 12301, 12301 },
+ { 12303, 12303 },
+ { 12305, 12305 },
+ { 12309, 12309 },
+ { 12311, 12311 },
+ { 12313, 12313 },
+ { 12315, 12315 },
+ { 12318, 12319 },
+ { 64830, 64830 },
+ { 65048, 65048 },
+ { 65078, 65078 },
+ { 65080, 65080 },
+ { 65082, 65082 },
+ { 65084, 65084 },
+ { 65086, 65086 },
+ { 65088, 65088 },
+ { 65090, 65090 },
+ { 65092, 65092 },
+ { 65096, 65096 },
+ { 65114, 65114 },
+ { 65116, 65116 },
+ { 65118, 65118 },
+ { 65289, 65289 },
+ { 65341, 65341 },
+ { 65373, 65373 },
+ { 65376, 65376 },
+ { 65379, 65379 },
+};
+static const URange16 Pf_range16[] = {
+ { 187, 187 },
+ { 8217, 8217 },
+ { 8221, 8221 },
+ { 8250, 8250 },
+ { 11779, 11779 },
+ { 11781, 11781 },
+ { 11786, 11786 },
+ { 11789, 11789 },
+ { 11805, 11805 },
+ { 11809, 11809 },
+};
+static const URange16 Pi_range16[] = {
+ { 171, 171 },
+ { 8216, 8216 },
+ { 8219, 8220 },
+ { 8223, 8223 },
+ { 8249, 8249 },
+ { 11778, 11778 },
+ { 11780, 11780 },
+ { 11785, 11785 },
+ { 11788, 11788 },
+ { 11804, 11804 },
+ { 11808, 11808 },
+};
+static const URange16 Po_range16[] = {
+ { 33, 35 },
+ { 37, 39 },
+ { 42, 42 },
+ { 44, 44 },
+ { 46, 47 },
+ { 58, 59 },
+ { 63, 64 },
+ { 92, 92 },
+ { 161, 161 },
+ { 167, 167 },
+ { 182, 183 },
+ { 191, 191 },
+ { 894, 894 },
+ { 903, 903 },
+ { 1370, 1375 },
+ { 1417, 1417 },
+ { 1472, 1472 },
+ { 1475, 1475 },
+ { 1478, 1478 },
+ { 1523, 1524 },
+ { 1545, 1546 },
+ { 1548, 1549 },
+ { 1563, 1563 },
+ { 1565, 1567 },
+ { 1642, 1645 },
+ { 1748, 1748 },
+ { 1792, 1805 },
+ { 2039, 2041 },
+ { 2096, 2110 },
+ { 2142, 2142 },
+ { 2404, 2405 },
+ { 2416, 2416 },
+ { 2557, 2557 },
+ { 2678, 2678 },
+ { 2800, 2800 },
+ { 3191, 3191 },
+ { 3204, 3204 },
+ { 3572, 3572 },
+ { 3663, 3663 },
+ { 3674, 3675 },
+ { 3844, 3858 },
+ { 3860, 3860 },
+ { 3973, 3973 },
+ { 4048, 4052 },
+ { 4057, 4058 },
+ { 4170, 4175 },
+ { 4347, 4347 },
+ { 4960, 4968 },
+ { 5742, 5742 },
+ { 5867, 5869 },
+ { 5941, 5942 },
+ { 6100, 6102 },
+ { 6104, 6106 },
+ { 6144, 6149 },
+ { 6151, 6154 },
+ { 6468, 6469 },
+ { 6686, 6687 },
+ { 6816, 6822 },
+ { 6824, 6829 },
+ { 7002, 7008 },
+ { 7037, 7038 },
+ { 7164, 7167 },
+ { 7227, 7231 },
+ { 7294, 7295 },
+ { 7360, 7367 },
+ { 7379, 7379 },
+ { 8214, 8215 },
+ { 8224, 8231 },
+ { 8240, 8248 },
+ { 8251, 8254 },
+ { 8257, 8259 },
+ { 8263, 8273 },
+ { 8275, 8275 },
+ { 8277, 8286 },
+ { 11513, 11516 },
+ { 11518, 11519 },
+ { 11632, 11632 },
+ { 11776, 11777 },
+ { 11782, 11784 },
+ { 11787, 11787 },
+ { 11790, 11798 },
+ { 11800, 11801 },
+ { 11803, 11803 },
+ { 11806, 11807 },
+ { 11818, 11822 },
+ { 11824, 11833 },
+ { 11836, 11839 },
+ { 11841, 11841 },
+ { 11843, 11855 },
+ { 11858, 11860 },
+ { 12289, 12291 },
+ { 12349, 12349 },
+ { 12539, 12539 },
+ { 42238, 42239 },
+ { 42509, 42511 },
+ { 42611, 42611 },
+ { 42622, 42622 },
+ { 42738, 42743 },
+ { 43124, 43127 },
+ { 43214, 43215 },
+ { 43256, 43258 },
+ { 43260, 43260 },
+ { 43310, 43311 },
+ { 43359, 43359 },
+ { 43457, 43469 },
+ { 43486, 43487 },
+ { 43612, 43615 },
+ { 43742, 43743 },
+ { 43760, 43761 },
+ { 44011, 44011 },
+ { 65040, 65046 },
+ { 65049, 65049 },
+ { 65072, 65072 },
+ { 65093, 65094 },
+ { 65097, 65100 },
+ { 65104, 65106 },
+ { 65108, 65111 },
+ { 65119, 65121 },
+ { 65128, 65128 },
+ { 65130, 65131 },
+ { 65281, 65283 },
+ { 65285, 65287 },
+ { 65290, 65290 },
+ { 65292, 65292 },
+ { 65294, 65295 },
+ { 65306, 65307 },
+ { 65311, 65312 },
+ { 65340, 65340 },
+ { 65377, 65377 },
+ { 65380, 65381 },
+};
+static const URange32 Po_range32[] = {
+ { 65792, 65794 },
+ { 66463, 66463 },
+ { 66512, 66512 },
+ { 66927, 66927 },
+ { 67671, 67671 },
+ { 67871, 67871 },
+ { 67903, 67903 },
+ { 68176, 68184 },
+ { 68223, 68223 },
+ { 68336, 68342 },
+ { 68409, 68415 },
+ { 68505, 68508 },
+ { 69461, 69465 },
+ { 69510, 69513 },
+ { 69703, 69709 },
+ { 69819, 69820 },
+ { 69822, 69825 },
+ { 69952, 69955 },
+ { 70004, 70005 },
+ { 70085, 70088 },
+ { 70093, 70093 },
+ { 70107, 70107 },
+ { 70109, 70111 },
+ { 70200, 70205 },
+ { 70313, 70313 },
+ { 70731, 70735 },
+ { 70746, 70747 },
+ { 70749, 70749 },
+ { 70854, 70854 },
+ { 71105, 71127 },
+ { 71233, 71235 },
+ { 71264, 71276 },
+ { 71353, 71353 },
+ { 71484, 71486 },
+ { 71739, 71739 },
+ { 72004, 72006 },
+ { 72162, 72162 },
+ { 72255, 72262 },
+ { 72346, 72348 },
+ { 72350, 72354 },
+ { 72448, 72457 },
+ { 72769, 72773 },
+ { 72816, 72817 },
+ { 73463, 73464 },
+ { 73539, 73551 },
+ { 73727, 73727 },
+ { 74864, 74868 },
+ { 77809, 77810 },
+ { 92782, 92783 },
+ { 92917, 92917 },
+ { 92983, 92987 },
+ { 92996, 92996 },
+ { 93847, 93850 },
+ { 94178, 94178 },
+ { 113823, 113823 },
+ { 121479, 121483 },
+ { 125278, 125279 },
+};
+static const URange16 Ps_range16[] = {
+ { 40, 40 },
+ { 91, 91 },
+ { 123, 123 },
+ { 3898, 3898 },
+ { 3900, 3900 },
+ { 5787, 5787 },
+ { 8218, 8218 },
+ { 8222, 8222 },
+ { 8261, 8261 },
+ { 8317, 8317 },
+ { 8333, 8333 },
+ { 8968, 8968 },
+ { 8970, 8970 },
+ { 9001, 9001 },
+ { 10088, 10088 },
+ { 10090, 10090 },
+ { 10092, 10092 },
+ { 10094, 10094 },
+ { 10096, 10096 },
+ { 10098, 10098 },
+ { 10100, 10100 },
+ { 10181, 10181 },
+ { 10214, 10214 },
+ { 10216, 10216 },
+ { 10218, 10218 },
+ { 10220, 10220 },
+ { 10222, 10222 },
+ { 10627, 10627 },
+ { 10629, 10629 },
+ { 10631, 10631 },
+ { 10633, 10633 },
+ { 10635, 10635 },
+ { 10637, 10637 },
+ { 10639, 10639 },
+ { 10641, 10641 },
+ { 10643, 10643 },
+ { 10645, 10645 },
+ { 10647, 10647 },
+ { 10712, 10712 },
+ { 10714, 10714 },
+ { 10748, 10748 },
+ { 11810, 11810 },
+ { 11812, 11812 },
+ { 11814, 11814 },
+ { 11816, 11816 },
+ { 11842, 11842 },
+ { 11861, 11861 },
+ { 11863, 11863 },
+ { 11865, 11865 },
+ { 11867, 11867 },
+ { 12296, 12296 },
+ { 12298, 12298 },
+ { 12300, 12300 },
+ { 12302, 12302 },
+ { 12304, 12304 },
+ { 12308, 12308 },
+ { 12310, 12310 },
+ { 12312, 12312 },
+ { 12314, 12314 },
+ { 12317, 12317 },
+ { 64831, 64831 },
+ { 65047, 65047 },
+ { 65077, 65077 },
+ { 65079, 65079 },
+ { 65081, 65081 },
+ { 65083, 65083 },
+ { 65085, 65085 },
+ { 65087, 65087 },
+ { 65089, 65089 },
+ { 65091, 65091 },
+ { 65095, 65095 },
+ { 65113, 65113 },
+ { 65115, 65115 },
+ { 65117, 65117 },
+ { 65288, 65288 },
+ { 65339, 65339 },
+ { 65371, 65371 },
+ { 65375, 65375 },
+ { 65378, 65378 },
+};
+static const URange16 S_range16[] = {
+ { 36, 36 },
+ { 43, 43 },
+ { 60, 62 },
+ { 94, 94 },
+ { 96, 96 },
+ { 124, 124 },
+ { 126, 126 },
+ { 162, 166 },
+ { 168, 169 },
+ { 172, 172 },
+ { 174, 177 },
+ { 180, 180 },
+ { 184, 184 },
+ { 215, 215 },
+ { 247, 247 },
+ { 706, 709 },
+ { 722, 735 },
+ { 741, 747 },
+ { 749, 749 },
+ { 751, 767 },
+ { 885, 885 },
+ { 900, 901 },
+ { 1014, 1014 },
+ { 1154, 1154 },
+ { 1421, 1423 },
+ { 1542, 1544 },
+ { 1547, 1547 },
+ { 1550, 1551 },
+ { 1758, 1758 },
+ { 1769, 1769 },
+ { 1789, 1790 },
+ { 2038, 2038 },
+ { 2046, 2047 },
+ { 2184, 2184 },
+ { 2546, 2547 },
+ { 2554, 2555 },
+ { 2801, 2801 },
+ { 2928, 2928 },
+ { 3059, 3066 },
+ { 3199, 3199 },
+ { 3407, 3407 },
+ { 3449, 3449 },
+ { 3647, 3647 },
+ { 3841, 3843 },
+ { 3859, 3859 },
+ { 3861, 3863 },
+ { 3866, 3871 },
+ { 3892, 3892 },
+ { 3894, 3894 },
+ { 3896, 3896 },
+ { 4030, 4037 },
+ { 4039, 4044 },
+ { 4046, 4047 },
+ { 4053, 4056 },
+ { 4254, 4255 },
+ { 5008, 5017 },
+ { 5741, 5741 },
+ { 6107, 6107 },
+ { 6464, 6464 },
+ { 6622, 6655 },
+ { 7009, 7018 },
+ { 7028, 7036 },
+ { 8125, 8125 },
+ { 8127, 8129 },
+ { 8141, 8143 },
+ { 8157, 8159 },
+ { 8173, 8175 },
+ { 8189, 8190 },
+ { 8260, 8260 },
+ { 8274, 8274 },
+ { 8314, 8316 },
+ { 8330, 8332 },
+ { 8352, 8384 },
+ { 8448, 8449 },
+ { 8451, 8454 },
+ { 8456, 8457 },
+ { 8468, 8468 },
+ { 8470, 8472 },
+ { 8478, 8483 },
+ { 8485, 8485 },
+ { 8487, 8487 },
+ { 8489, 8489 },
+ { 8494, 8494 },
+ { 8506, 8507 },
+ { 8512, 8516 },
+ { 8522, 8525 },
+ { 8527, 8527 },
+ { 8586, 8587 },
+ { 8592, 8967 },
+ { 8972, 9000 },
+ { 9003, 9254 },
+ { 9280, 9290 },
+ { 9372, 9449 },
+ { 9472, 10087 },
+ { 10132, 10180 },
+ { 10183, 10213 },
+ { 10224, 10626 },
+ { 10649, 10711 },
+ { 10716, 10747 },
+ { 10750, 11123 },
+ { 11126, 11157 },
+ { 11159, 11263 },
+ { 11493, 11498 },
+ { 11856, 11857 },
+ { 11904, 11929 },
+ { 11931, 12019 },
+ { 12032, 12245 },
+ { 12272, 12287 },
+ { 12292, 12292 },
+ { 12306, 12307 },
+ { 12320, 12320 },
+ { 12342, 12343 },
+ { 12350, 12351 },
+ { 12443, 12444 },
+ { 12688, 12689 },
+ { 12694, 12703 },
+ { 12736, 12771 },
+ { 12783, 12783 },
+ { 12800, 12830 },
+ { 12842, 12871 },
+ { 12880, 12880 },
+ { 12896, 12927 },
+ { 12938, 12976 },
+ { 12992, 13311 },
+ { 19904, 19967 },
+ { 42128, 42182 },
+ { 42752, 42774 },
+ { 42784, 42785 },
+ { 42889, 42890 },
+ { 43048, 43051 },
+ { 43062, 43065 },
+ { 43639, 43641 },
+ { 43867, 43867 },
+ { 43882, 43883 },
+ { 64297, 64297 },
+ { 64434, 64450 },
+ { 64832, 64847 },
+ { 64975, 64975 },
+ { 65020, 65023 },
+ { 65122, 65122 },
+ { 65124, 65126 },
+ { 65129, 65129 },
+ { 65284, 65284 },
+ { 65291, 65291 },
+ { 65308, 65310 },
+ { 65342, 65342 },
+ { 65344, 65344 },
+ { 65372, 65372 },
+ { 65374, 65374 },
+ { 65504, 65510 },
+ { 65512, 65518 },
+ { 65532, 65533 },
+};
+static const URange32 S_range32[] = {
+ { 65847, 65855 },
+ { 65913, 65929 },
+ { 65932, 65934 },
+ { 65936, 65948 },
+ { 65952, 65952 },
+ { 66000, 66044 },
+ { 67703, 67704 },
+ { 68296, 68296 },
+ { 71487, 71487 },
+ { 73685, 73713 },
+ { 92988, 92991 },
+ { 92997, 92997 },
+ { 113820, 113820 },
+ { 118608, 118723 },
+ { 118784, 119029 },
+ { 119040, 119078 },
+ { 119081, 119140 },
+ { 119146, 119148 },
+ { 119171, 119172 },
+ { 119180, 119209 },
+ { 119214, 119274 },
+ { 119296, 119361 },
+ { 119365, 119365 },
+ { 119552, 119638 },
+ { 120513, 120513 },
+ { 120539, 120539 },
+ { 120571, 120571 },
+ { 120597, 120597 },
+ { 120629, 120629 },
+ { 120655, 120655 },
+ { 120687, 120687 },
+ { 120713, 120713 },
+ { 120745, 120745 },
+ { 120771, 120771 },
+ { 120832, 121343 },
+ { 121399, 121402 },
+ { 121453, 121460 },
+ { 121462, 121475 },
+ { 121477, 121478 },
+ { 123215, 123215 },
+ { 123647, 123647 },
+ { 126124, 126124 },
+ { 126128, 126128 },
+ { 126254, 126254 },
+ { 126704, 126705 },
+ { 126976, 127019 },
+ { 127024, 127123 },
+ { 127136, 127150 },
+ { 127153, 127167 },
+ { 127169, 127183 },
+ { 127185, 127221 },
+ { 127245, 127405 },
+ { 127462, 127490 },
+ { 127504, 127547 },
+ { 127552, 127560 },
+ { 127568, 127569 },
+ { 127584, 127589 },
+ { 127744, 128727 },
+ { 128732, 128748 },
+ { 128752, 128764 },
+ { 128768, 128886 },
+ { 128891, 128985 },
+ { 128992, 129003 },
+ { 129008, 129008 },
+ { 129024, 129035 },
+ { 129040, 129095 },
+ { 129104, 129113 },
+ { 129120, 129159 },
+ { 129168, 129197 },
+ { 129200, 129201 },
+ { 129280, 129619 },
+ { 129632, 129645 },
+ { 129648, 129660 },
+ { 129664, 129672 },
+ { 129680, 129725 },
+ { 129727, 129733 },
+ { 129742, 129755 },
+ { 129760, 129768 },
+ { 129776, 129784 },
+ { 129792, 129938 },
+ { 129940, 129994 },
+};
+static const URange16 Sc_range16[] = {
+ { 36, 36 },
+ { 162, 165 },
+ { 1423, 1423 },
+ { 1547, 1547 },
+ { 2046, 2047 },
+ { 2546, 2547 },
+ { 2555, 2555 },
+ { 2801, 2801 },
+ { 3065, 3065 },
+ { 3647, 3647 },
+ { 6107, 6107 },
+ { 8352, 8384 },
+ { 43064, 43064 },
+ { 65020, 65020 },
+ { 65129, 65129 },
+ { 65284, 65284 },
+ { 65504, 65505 },
+ { 65509, 65510 },
+};
+static const URange32 Sc_range32[] = {
+ { 73693, 73696 },
+ { 123647, 123647 },
+ { 126128, 126128 },
+};
+static const URange16 Sk_range16[] = {
+ { 94, 94 },
+ { 96, 96 },
+ { 168, 168 },
+ { 175, 175 },
+ { 180, 180 },
+ { 184, 184 },
+ { 706, 709 },
+ { 722, 735 },
+ { 741, 747 },
+ { 749, 749 },
+ { 751, 767 },
+ { 885, 885 },
+ { 900, 901 },
+ { 2184, 2184 },
+ { 8125, 8125 },
+ { 8127, 8129 },
+ { 8141, 8143 },
+ { 8157, 8159 },
+ { 8173, 8175 },
+ { 8189, 8190 },
+ { 12443, 12444 },
+ { 42752, 42774 },
+ { 42784, 42785 },
+ { 42889, 42890 },
+ { 43867, 43867 },
+ { 43882, 43883 },
+ { 64434, 64450 },
+ { 65342, 65342 },
+ { 65344, 65344 },
+ { 65507, 65507 },
+};
+static const URange32 Sk_range32[] = {
+ { 127995, 127999 },
+};
+static const URange16 Sm_range16[] = {
+ { 43, 43 },
+ { 60, 62 },
+ { 124, 124 },
+ { 126, 126 },
+ { 172, 172 },
+ { 177, 177 },
+ { 215, 215 },
+ { 247, 247 },
+ { 1014, 1014 },
+ { 1542, 1544 },
+ { 8260, 8260 },
+ { 8274, 8274 },
+ { 8314, 8316 },
+ { 8330, 8332 },
+ { 8472, 8472 },
+ { 8512, 8516 },
+ { 8523, 8523 },
+ { 8592, 8596 },
+ { 8602, 8603 },
+ { 8608, 8608 },
+ { 8611, 8611 },
+ { 8614, 8614 },
+ { 8622, 8622 },
+ { 8654, 8655 },
+ { 8658, 8658 },
+ { 8660, 8660 },
+ { 8692, 8959 },
+ { 8992, 8993 },
+ { 9084, 9084 },
+ { 9115, 9139 },
+ { 9180, 9185 },
+ { 9655, 9655 },
+ { 9665, 9665 },
+ { 9720, 9727 },
+ { 9839, 9839 },
+ { 10176, 10180 },
+ { 10183, 10213 },
+ { 10224, 10239 },
+ { 10496, 10626 },
+ { 10649, 10711 },
+ { 10716, 10747 },
+ { 10750, 11007 },
+ { 11056, 11076 },
+ { 11079, 11084 },
+ { 64297, 64297 },
+ { 65122, 65122 },
+ { 65124, 65126 },
+ { 65291, 65291 },
+ { 65308, 65310 },
+ { 65372, 65372 },
+ { 65374, 65374 },
+ { 65506, 65506 },
+ { 65513, 65516 },
+};
+static const URange32 Sm_range32[] = {
+ { 120513, 120513 },
+ { 120539, 120539 },
+ { 120571, 120571 },
+ { 120597, 120597 },
+ { 120629, 120629 },
+ { 120655, 120655 },
+ { 120687, 120687 },
+ { 120713, 120713 },
+ { 120745, 120745 },
+ { 120771, 120771 },
+ { 126704, 126705 },
+};
+static const URange16 So_range16[] = {
+ { 166, 166 },
+ { 169, 169 },
+ { 174, 174 },
+ { 176, 176 },
+ { 1154, 1154 },
+ { 1421, 1422 },
+ { 1550, 1551 },
+ { 1758, 1758 },
+ { 1769, 1769 },
+ { 1789, 1790 },
+ { 2038, 2038 },
+ { 2554, 2554 },
+ { 2928, 2928 },
+ { 3059, 3064 },
+ { 3066, 3066 },
+ { 3199, 3199 },
+ { 3407, 3407 },
+ { 3449, 3449 },
+ { 3841, 3843 },
+ { 3859, 3859 },
+ { 3861, 3863 },
+ { 3866, 3871 },
+ { 3892, 3892 },
+ { 3894, 3894 },
+ { 3896, 3896 },
+ { 4030, 4037 },
+ { 4039, 4044 },
+ { 4046, 4047 },
+ { 4053, 4056 },
+ { 4254, 4255 },
+ { 5008, 5017 },
+ { 5741, 5741 },
+ { 6464, 6464 },
+ { 6622, 6655 },
+ { 7009, 7018 },
+ { 7028, 7036 },
+ { 8448, 8449 },
+ { 8451, 8454 },
+ { 8456, 8457 },
+ { 8468, 8468 },
+ { 8470, 8471 },
+ { 8478, 8483 },
+ { 8485, 8485 },
+ { 8487, 8487 },
+ { 8489, 8489 },
+ { 8494, 8494 },
+ { 8506, 8507 },
+ { 8522, 8522 },
+ { 8524, 8525 },
+ { 8527, 8527 },
+ { 8586, 8587 },
+ { 8597, 8601 },
+ { 8604, 8607 },
+ { 8609, 8610 },
+ { 8612, 8613 },
+ { 8615, 8621 },
+ { 8623, 8653 },
+ { 8656, 8657 },
+ { 8659, 8659 },
+ { 8661, 8691 },
+ { 8960, 8967 },
+ { 8972, 8991 },
+ { 8994, 9000 },
+ { 9003, 9083 },
+ { 9085, 9114 },
+ { 9140, 9179 },
+ { 9186, 9254 },
+ { 9280, 9290 },
+ { 9372, 9449 },
+ { 9472, 9654 },
+ { 9656, 9664 },
+ { 9666, 9719 },
+ { 9728, 9838 },
+ { 9840, 10087 },
+ { 10132, 10175 },
+ { 10240, 10495 },
+ { 11008, 11055 },
+ { 11077, 11078 },
+ { 11085, 11123 },
+ { 11126, 11157 },
+ { 11159, 11263 },
+ { 11493, 11498 },
+ { 11856, 11857 },
+ { 11904, 11929 },
+ { 11931, 12019 },
+ { 12032, 12245 },
+ { 12272, 12287 },
+ { 12292, 12292 },
+ { 12306, 12307 },
+ { 12320, 12320 },
+ { 12342, 12343 },
+ { 12350, 12351 },
+ { 12688, 12689 },
+ { 12694, 12703 },
+ { 12736, 12771 },
+ { 12783, 12783 },
+ { 12800, 12830 },
+ { 12842, 12871 },
+ { 12880, 12880 },
+ { 12896, 12927 },
+ { 12938, 12976 },
+ { 12992, 13311 },
+ { 19904, 19967 },
+ { 42128, 42182 },
+ { 43048, 43051 },
+ { 43062, 43063 },
+ { 43065, 43065 },
+ { 43639, 43641 },
+ { 64832, 64847 },
+ { 64975, 64975 },
+ { 65021, 65023 },
+ { 65508, 65508 },
+ { 65512, 65512 },
+ { 65517, 65518 },
+ { 65532, 65533 },
+};
+static const URange32 So_range32[] = {
+ { 65847, 65855 },
+ { 65913, 65929 },
+ { 65932, 65934 },
+ { 65936, 65948 },
+ { 65952, 65952 },
+ { 66000, 66044 },
+ { 67703, 67704 },
+ { 68296, 68296 },
+ { 71487, 71487 },
+ { 73685, 73692 },
+ { 73697, 73713 },
+ { 92988, 92991 },
+ { 92997, 92997 },
+ { 113820, 113820 },
+ { 118608, 118723 },
+ { 118784, 119029 },
+ { 119040, 119078 },
+ { 119081, 119140 },
+ { 119146, 119148 },
+ { 119171, 119172 },
+ { 119180, 119209 },
+ { 119214, 119274 },
+ { 119296, 119361 },
+ { 119365, 119365 },
+ { 119552, 119638 },
+ { 120832, 121343 },
+ { 121399, 121402 },
+ { 121453, 121460 },
+ { 121462, 121475 },
+ { 121477, 121478 },
+ { 123215, 123215 },
+ { 126124, 126124 },
+ { 126254, 126254 },
+ { 126976, 127019 },
+ { 127024, 127123 },
+ { 127136, 127150 },
+ { 127153, 127167 },
+ { 127169, 127183 },
+ { 127185, 127221 },
+ { 127245, 127405 },
+ { 127462, 127490 },
+ { 127504, 127547 },
+ { 127552, 127560 },
+ { 127568, 127569 },
+ { 127584, 127589 },
+ { 127744, 127994 },
+ { 128000, 128727 },
+ { 128732, 128748 },
+ { 128752, 128764 },
+ { 128768, 128886 },
+ { 128891, 128985 },
+ { 128992, 129003 },
+ { 129008, 129008 },
+ { 129024, 129035 },
+ { 129040, 129095 },
+ { 129104, 129113 },
+ { 129120, 129159 },
+ { 129168, 129197 },
+ { 129200, 129201 },
+ { 129280, 129619 },
+ { 129632, 129645 },
+ { 129648, 129660 },
+ { 129664, 129672 },
+ { 129680, 129725 },
+ { 129727, 129733 },
+ { 129742, 129755 },
+ { 129760, 129768 },
+ { 129776, 129784 },
+ { 129792, 129938 },
+ { 129940, 129994 },
+};
+static const URange16 Z_range16[] = {
+ { 32, 32 },
+ { 160, 160 },
+ { 5760, 5760 },
+ { 8192, 8202 },
+ { 8232, 8233 },
+ { 8239, 8239 },
+ { 8287, 8287 },
+ { 12288, 12288 },
+};
+static const URange16 Zl_range16[] = {
+ { 8232, 8232 },
+};
+static const URange16 Zp_range16[] = {
+ { 8233, 8233 },
+};
+static const URange16 Zs_range16[] = {
+ { 32, 32 },
+ { 160, 160 },
+ { 5760, 5760 },
+ { 8192, 8202 },
+ { 8239, 8239 },
+ { 8287, 8287 },
+ { 12288, 12288 },
+};
+static const URange32 Adlam_range32[] = {
+ { 125184, 125259 },
+ { 125264, 125273 },
+ { 125278, 125279 },
+};
+static const URange32 Ahom_range32[] = {
+ { 71424, 71450 },
+ { 71453, 71467 },
+ { 71472, 71494 },
+};
+static const URange32 Anatolian_Hieroglyphs_range32[] = {
+ { 82944, 83526 },
+};
+static const URange16 Arabic_range16[] = {
+ { 1536, 1540 },
+ { 1542, 1547 },
+ { 1549, 1562 },
+ { 1564, 1566 },
+ { 1568, 1599 },
+ { 1601, 1610 },
+ { 1622, 1647 },
+ { 1649, 1756 },
+ { 1758, 1791 },
+ { 1872, 1919 },
+ { 2160, 2190 },
+ { 2192, 2193 },
+ { 2200, 2273 },
+ { 2275, 2303 },
+ { 64336, 64450 },
+ { 64467, 64829 },
+ { 64832, 64911 },
+ { 64914, 64967 },
+ { 64975, 64975 },
+ { 65008, 65023 },
+ { 65136, 65140 },
+ { 65142, 65276 },
+};
+static const URange32 Arabic_range32[] = {
+ { 69216, 69246 },
+ { 69373, 69375 },
+ { 126464, 126467 },
+ { 126469, 126495 },
+ { 126497, 126498 },
+ { 126500, 126500 },
+ { 126503, 126503 },
+ { 126505, 126514 },
+ { 126516, 126519 },
+ { 126521, 126521 },
+ { 126523, 126523 },
+ { 126530, 126530 },
+ { 126535, 126535 },
+ { 126537, 126537 },
+ { 126539, 126539 },
+ { 126541, 126543 },
+ { 126545, 126546 },
+ { 126548, 126548 },
+ { 126551, 126551 },
+ { 126553, 126553 },
+ { 126555, 126555 },
+ { 126557, 126557 },
+ { 126559, 126559 },
+ { 126561, 126562 },
+ { 126564, 126564 },
+ { 126567, 126570 },
+ { 126572, 126578 },
+ { 126580, 126583 },
+ { 126585, 126588 },
+ { 126590, 126590 },
+ { 126592, 126601 },
+ { 126603, 126619 },
+ { 126625, 126627 },
+ { 126629, 126633 },
+ { 126635, 126651 },
+ { 126704, 126705 },
+};
+static const URange16 Armenian_range16[] = {
+ { 1329, 1366 },
+ { 1369, 1418 },
+ { 1421, 1423 },
+ { 64275, 64279 },
+};
+static const URange32 Avestan_range32[] = {
+ { 68352, 68405 },
+ { 68409, 68415 },
+};
+static const URange16 Balinese_range16[] = {
+ { 6912, 6988 },
+ { 6992, 7038 },
+};
+static const URange16 Bamum_range16[] = {
+ { 42656, 42743 },
+};
+static const URange32 Bamum_range32[] = {
+ { 92160, 92728 },
+};
+static const URange32 Bassa_Vah_range32[] = {
+ { 92880, 92909 },
+ { 92912, 92917 },
+};
+static const URange16 Batak_range16[] = {
+ { 7104, 7155 },
+ { 7164, 7167 },
+};
+static const URange16 Bengali_range16[] = {
+ { 2432, 2435 },
+ { 2437, 2444 },
+ { 2447, 2448 },
+ { 2451, 2472 },
+ { 2474, 2480 },
+ { 2482, 2482 },
+ { 2486, 2489 },
+ { 2492, 2500 },
+ { 2503, 2504 },
+ { 2507, 2510 },
+ { 2519, 2519 },
+ { 2524, 2525 },
+ { 2527, 2531 },
+ { 2534, 2558 },
+};
+static const URange32 Bhaiksuki_range32[] = {
+ { 72704, 72712 },
+ { 72714, 72758 },
+ { 72760, 72773 },
+ { 72784, 72812 },
+};
+static const URange16 Bopomofo_range16[] = {
+ { 746, 747 },
+ { 12549, 12591 },
+ { 12704, 12735 },
+};
+static const URange32 Brahmi_range32[] = {
+ { 69632, 69709 },
+ { 69714, 69749 },
+ { 69759, 69759 },
+};
+static const URange16 Braille_range16[] = {
+ { 10240, 10495 },
+};
+static const URange16 Buginese_range16[] = {
+ { 6656, 6683 },
+ { 6686, 6687 },
+};
+static const URange16 Buhid_range16[] = {
+ { 5952, 5971 },
+};
+static const URange16 Canadian_Aboriginal_range16[] = {
+ { 5120, 5759 },
+ { 6320, 6389 },
+};
+static const URange32 Canadian_Aboriginal_range32[] = {
+ { 72368, 72383 },
+};
+static const URange32 Carian_range32[] = {
+ { 66208, 66256 },
+};
+static const URange32 Caucasian_Albanian_range32[] = {
+ { 66864, 66915 },
+ { 66927, 66927 },
+};
+static const URange32 Chakma_range32[] = {
+ { 69888, 69940 },
+ { 69942, 69959 },
+};
+static const URange16 Cham_range16[] = {
+ { 43520, 43574 },
+ { 43584, 43597 },
+ { 43600, 43609 },
+ { 43612, 43615 },
+};
+static const URange16 Cherokee_range16[] = {
+ { 5024, 5109 },
+ { 5112, 5117 },
+ { 43888, 43967 },
+};
+static const URange32 Chorasmian_range32[] = {
+ { 69552, 69579 },
+};
+static const URange16 Common_range16[] = {
+ { 0, 64 },
+ { 91, 96 },
+ { 123, 169 },
+ { 171, 185 },
+ { 187, 191 },
+ { 215, 215 },
+ { 247, 247 },
+ { 697, 735 },
+ { 741, 745 },
+ { 748, 767 },
+ { 884, 884 },
+ { 894, 894 },
+ { 901, 901 },
+ { 903, 903 },
+ { 1541, 1541 },
+ { 1548, 1548 },
+ { 1563, 1563 },
+ { 1567, 1567 },
+ { 1600, 1600 },
+ { 1757, 1757 },
+ { 2274, 2274 },
+ { 2404, 2405 },
+ { 3647, 3647 },
+ { 4053, 4056 },
+ { 4347, 4347 },
+ { 5867, 5869 },
+ { 5941, 5942 },
+ { 6146, 6147 },
+ { 6149, 6149 },
+ { 7379, 7379 },
+ { 7393, 7393 },
+ { 7401, 7404 },
+ { 7406, 7411 },
+ { 7413, 7415 },
+ { 7418, 7418 },
+ { 8192, 8203 },
+ { 8206, 8292 },
+ { 8294, 8304 },
+ { 8308, 8318 },
+ { 8320, 8334 },
+ { 8352, 8384 },
+ { 8448, 8485 },
+ { 8487, 8489 },
+ { 8492, 8497 },
+ { 8499, 8525 },
+ { 8527, 8543 },
+ { 8585, 8587 },
+ { 8592, 9254 },
+ { 9280, 9290 },
+ { 9312, 10239 },
+ { 10496, 11123 },
+ { 11126, 11157 },
+ { 11159, 11263 },
+ { 11776, 11869 },
+ { 12272, 12292 },
+ { 12294, 12294 },
+ { 12296, 12320 },
+ { 12336, 12343 },
+ { 12348, 12351 },
+ { 12443, 12444 },
+ { 12448, 12448 },
+ { 12539, 12540 },
+ { 12688, 12703 },
+ { 12736, 12771 },
+ { 12783, 12783 },
+ { 12832, 12895 },
+ { 12927, 13007 },
+ { 13055, 13055 },
+ { 13144, 13311 },
+ { 19904, 19967 },
+ { 42752, 42785 },
+ { 42888, 42890 },
+ { 43056, 43065 },
+ { 43310, 43310 },
+ { 43471, 43471 },
+ { 43867, 43867 },
+ { 43882, 43883 },
+ { 64830, 64831 },
+ { 65040, 65049 },
+ { 65072, 65106 },
+ { 65108, 65126 },
+ { 65128, 65131 },
+ { 65279, 65279 },
+ { 65281, 65312 },
+ { 65339, 65344 },
+ { 65371, 65381 },
+ { 65392, 65392 },
+ { 65438, 65439 },
+ { 65504, 65510 },
+ { 65512, 65518 },
+ { 65529, 65533 },
+};
+static const URange32 Common_range32[] = {
+ { 65792, 65794 },
+ { 65799, 65843 },
+ { 65847, 65855 },
+ { 65936, 65948 },
+ { 66000, 66044 },
+ { 66273, 66299 },
+ { 113824, 113827 },
+ { 118608, 118723 },
+ { 118784, 119029 },
+ { 119040, 119078 },
+ { 119081, 119142 },
+ { 119146, 119162 },
+ { 119171, 119172 },
+ { 119180, 119209 },
+ { 119214, 119274 },
+ { 119488, 119507 },
+ { 119520, 119539 },
+ { 119552, 119638 },
+ { 119648, 119672 },
+ { 119808, 119892 },
+ { 119894, 119964 },
+ { 119966, 119967 },
+ { 119970, 119970 },
+ { 119973, 119974 },
+ { 119977, 119980 },
+ { 119982, 119993 },
+ { 119995, 119995 },
+ { 119997, 120003 },
+ { 120005, 120069 },
+ { 120071, 120074 },
+ { 120077, 120084 },
+ { 120086, 120092 },
+ { 120094, 120121 },
+ { 120123, 120126 },
+ { 120128, 120132 },
+ { 120134, 120134 },
+ { 120138, 120144 },
+ { 120146, 120485 },
+ { 120488, 120779 },
+ { 120782, 120831 },
+ { 126065, 126132 },
+ { 126209, 126269 },
+ { 126976, 127019 },
+ { 127024, 127123 },
+ { 127136, 127150 },
+ { 127153, 127167 },
+ { 127169, 127183 },
+ { 127185, 127221 },
+ { 127232, 127405 },
+ { 127462, 127487 },
+ { 127489, 127490 },
+ { 127504, 127547 },
+ { 127552, 127560 },
+ { 127568, 127569 },
+ { 127584, 127589 },
+ { 127744, 128727 },
+ { 128732, 128748 },
+ { 128752, 128764 },
+ { 128768, 128886 },
+ { 128891, 128985 },
+ { 128992, 129003 },
+ { 129008, 129008 },
+ { 129024, 129035 },
+ { 129040, 129095 },
+ { 129104, 129113 },
+ { 129120, 129159 },
+ { 129168, 129197 },
+ { 129200, 129201 },
+ { 129280, 129619 },
+ { 129632, 129645 },
+ { 129648, 129660 },
+ { 129664, 129672 },
+ { 129680, 129725 },
+ { 129727, 129733 },
+ { 129742, 129755 },
+ { 129760, 129768 },
+ { 129776, 129784 },
+ { 129792, 129938 },
+ { 129940, 129994 },
+ { 130032, 130041 },
+ { 917505, 917505 },
+ { 917536, 917631 },
+};
+static const URange16 Coptic_range16[] = {
+ { 994, 1007 },
+ { 11392, 11507 },
+ { 11513, 11519 },
+};
+static const URange32 Cuneiform_range32[] = {
+ { 73728, 74649 },
+ { 74752, 74862 },
+ { 74864, 74868 },
+ { 74880, 75075 },
+};
+static const URange32 Cypriot_range32[] = {
+ { 67584, 67589 },
+ { 67592, 67592 },
+ { 67594, 67637 },
+ { 67639, 67640 },
+ { 67644, 67644 },
+ { 67647, 67647 },
+};
+static const URange32 Cypro_Minoan_range32[] = {
+ { 77712, 77810 },
+};
+static const URange16 Cyrillic_range16[] = {
+ { 1024, 1156 },
+ { 1159, 1327 },
+ { 7296, 7304 },
+ { 7467, 7467 },
+ { 7544, 7544 },
+ { 11744, 11775 },
+ { 42560, 42655 },
+ { 65070, 65071 },
+};
+static const URange32 Cyrillic_range32[] = {
+ { 122928, 122989 },
+ { 123023, 123023 },
+};
+static const URange32 Deseret_range32[] = {
+ { 66560, 66639 },
+};
+static const URange16 Devanagari_range16[] = {
+ { 2304, 2384 },
+ { 2389, 2403 },
+ { 2406, 2431 },
+ { 43232, 43263 },
+};
+static const URange32 Devanagari_range32[] = {
+ { 72448, 72457 },
+};
+static const URange32 Dives_Akuru_range32[] = {
+ { 71936, 71942 },
+ { 71945, 71945 },
+ { 71948, 71955 },
+ { 71957, 71958 },
+ { 71960, 71989 },
+ { 71991, 71992 },
+ { 71995, 72006 },
+ { 72016, 72025 },
+};
+static const URange32 Dogra_range32[] = {
+ { 71680, 71739 },
+};
+static const URange32 Duployan_range32[] = {
+ { 113664, 113770 },
+ { 113776, 113788 },
+ { 113792, 113800 },
+ { 113808, 113817 },
+ { 113820, 113823 },
+};
+static const URange32 Egyptian_Hieroglyphs_range32[] = {
+ { 77824, 78933 },
+};
+static const URange32 Elbasan_range32[] = {
+ { 66816, 66855 },
+};
+static const URange32 Elymaic_range32[] = {
+ { 69600, 69622 },
+};
+static const URange16 Ethiopic_range16[] = {
+ { 4608, 4680 },
+ { 4682, 4685 },
+ { 4688, 4694 },
+ { 4696, 4696 },
+ { 4698, 4701 },
+ { 4704, 4744 },
+ { 4746, 4749 },
+ { 4752, 4784 },
+ { 4786, 4789 },
+ { 4792, 4798 },
+ { 4800, 4800 },
+ { 4802, 4805 },
+ { 4808, 4822 },
+ { 4824, 4880 },
+ { 4882, 4885 },
+ { 4888, 4954 },
+ { 4957, 4988 },
+ { 4992, 5017 },
+ { 11648, 11670 },
+ { 11680, 11686 },
+ { 11688, 11694 },
+ { 11696, 11702 },
+ { 11704, 11710 },
+ { 11712, 11718 },
+ { 11720, 11726 },
+ { 11728, 11734 },
+ { 11736, 11742 },
+ { 43777, 43782 },
+ { 43785, 43790 },
+ { 43793, 43798 },
+ { 43808, 43814 },
+ { 43816, 43822 },
+};
+static const URange32 Ethiopic_range32[] = {
+ { 124896, 124902 },
+ { 124904, 124907 },
+ { 124909, 124910 },
+ { 124912, 124926 },
+};
+static const URange16 Georgian_range16[] = {
+ { 4256, 4293 },
+ { 4295, 4295 },
+ { 4301, 4301 },
+ { 4304, 4346 },
+ { 4348, 4351 },
+ { 7312, 7354 },
+ { 7357, 7359 },
+ { 11520, 11557 },
+ { 11559, 11559 },
+ { 11565, 11565 },
+};
+static const URange16 Glagolitic_range16[] = {
+ { 11264, 11359 },
+};
+static const URange32 Glagolitic_range32[] = {
+ { 122880, 122886 },
+ { 122888, 122904 },
+ { 122907, 122913 },
+ { 122915, 122916 },
+ { 122918, 122922 },
+};
+static const URange32 Gothic_range32[] = {
+ { 66352, 66378 },
+};
+static const URange32 Grantha_range32[] = {
+ { 70400, 70403 },
+ { 70405, 70412 },
+ { 70415, 70416 },
+ { 70419, 70440 },
+ { 70442, 70448 },
+ { 70450, 70451 },
+ { 70453, 70457 },
+ { 70460, 70468 },
+ { 70471, 70472 },
+ { 70475, 70477 },
+ { 70480, 70480 },
+ { 70487, 70487 },
+ { 70493, 70499 },
+ { 70502, 70508 },
+ { 70512, 70516 },
+};
+static const URange16 Greek_range16[] = {
+ { 880, 883 },
+ { 885, 887 },
+ { 890, 893 },
+ { 895, 895 },
+ { 900, 900 },
+ { 902, 902 },
+ { 904, 906 },
+ { 908, 908 },
+ { 910, 929 },
+ { 931, 993 },
+ { 1008, 1023 },
+ { 7462, 7466 },
+ { 7517, 7521 },
+ { 7526, 7530 },
+ { 7615, 7615 },
+ { 7936, 7957 },
+ { 7960, 7965 },
+ { 7968, 8005 },
+ { 8008, 8013 },
+ { 8016, 8023 },
+ { 8025, 8025 },
+ { 8027, 8027 },
+ { 8029, 8029 },
+ { 8031, 8061 },
+ { 8064, 8116 },
+ { 8118, 8132 },
+ { 8134, 8147 },
+ { 8150, 8155 },
+ { 8157, 8175 },
+ { 8178, 8180 },
+ { 8182, 8190 },
+ { 8486, 8486 },
+ { 43877, 43877 },
+};
+static const URange32 Greek_range32[] = {
+ { 65856, 65934 },
+ { 65952, 65952 },
+ { 119296, 119365 },
+};
+static const URange16 Gujarati_range16[] = {
+ { 2689, 2691 },
+ { 2693, 2701 },
+ { 2703, 2705 },
+ { 2707, 2728 },
+ { 2730, 2736 },
+ { 2738, 2739 },
+ { 2741, 2745 },
+ { 2748, 2757 },
+ { 2759, 2761 },
+ { 2763, 2765 },
+ { 2768, 2768 },
+ { 2784, 2787 },
+ { 2790, 2801 },
+ { 2809, 2815 },
+};
+static const URange32 Gunjala_Gondi_range32[] = {
+ { 73056, 73061 },
+ { 73063, 73064 },
+ { 73066, 73102 },
+ { 73104, 73105 },
+ { 73107, 73112 },
+ { 73120, 73129 },
+};
+static const URange16 Gurmukhi_range16[] = {
+ { 2561, 2563 },
+ { 2565, 2570 },
+ { 2575, 2576 },
+ { 2579, 2600 },
+ { 2602, 2608 },
+ { 2610, 2611 },
+ { 2613, 2614 },
+ { 2616, 2617 },
+ { 2620, 2620 },
+ { 2622, 2626 },
+ { 2631, 2632 },
+ { 2635, 2637 },
+ { 2641, 2641 },
+ { 2649, 2652 },
+ { 2654, 2654 },
+ { 2662, 2678 },
+};
+static const URange16 Han_range16[] = {
+ { 11904, 11929 },
+ { 11931, 12019 },
+ { 12032, 12245 },
+ { 12293, 12293 },
+ { 12295, 12295 },
+ { 12321, 12329 },
+ { 12344, 12347 },
+ { 13312, 19903 },
+ { 19968, 40959 },
+ { 63744, 64109 },
+ { 64112, 64217 },
+};
+static const URange32 Han_range32[] = {
+ { 94178, 94179 },
+ { 94192, 94193 },
+ { 131072, 173791 },
+ { 173824, 177977 },
+ { 177984, 178205 },
+ { 178208, 183969 },
+ { 183984, 191456 },
+ { 191472, 192093 },
+ { 194560, 195101 },
+ { 196608, 201546 },
+ { 201552, 205743 },
+};
+static const URange16 Hangul_range16[] = {
+ { 4352, 4607 },
+ { 12334, 12335 },
+ { 12593, 12686 },
+ { 12800, 12830 },
+ { 12896, 12926 },
+ { 43360, 43388 },
+ { 44032, 55203 },
+ { 55216, 55238 },
+ { 55243, 55291 },
+ { 65440, 65470 },
+ { 65474, 65479 },
+ { 65482, 65487 },
+ { 65490, 65495 },
+ { 65498, 65500 },
+};
+static const URange32 Hanifi_Rohingya_range32[] = {
+ { 68864, 68903 },
+ { 68912, 68921 },
+};
+static const URange16 Hanunoo_range16[] = {
+ { 5920, 5940 },
+};
+static const URange32 Hatran_range32[] = {
+ { 67808, 67826 },
+ { 67828, 67829 },
+ { 67835, 67839 },
+};
+static const URange16 Hebrew_range16[] = {
+ { 1425, 1479 },
+ { 1488, 1514 },
+ { 1519, 1524 },
+ { 64285, 64310 },
+ { 64312, 64316 },
+ { 64318, 64318 },
+ { 64320, 64321 },
+ { 64323, 64324 },
+ { 64326, 64335 },
+};
+static const URange16 Hiragana_range16[] = {
+ { 12353, 12438 },
+ { 12445, 12447 },
+};
+static const URange32 Hiragana_range32[] = {
+ { 110593, 110879 },
+ { 110898, 110898 },
+ { 110928, 110930 },
+ { 127488, 127488 },
+};
+static const URange32 Imperial_Aramaic_range32[] = {
+ { 67648, 67669 },
+ { 67671, 67679 },
+};
+static const URange16 Inherited_range16[] = {
+ { 768, 879 },
+ { 1157, 1158 },
+ { 1611, 1621 },
+ { 1648, 1648 },
+ { 2385, 2388 },
+ { 6832, 6862 },
+ { 7376, 7378 },
+ { 7380, 7392 },
+ { 7394, 7400 },
+ { 7405, 7405 },
+ { 7412, 7412 },
+ { 7416, 7417 },
+ { 7616, 7679 },
+ { 8204, 8205 },
+ { 8400, 8432 },
+ { 12330, 12333 },
+ { 12441, 12442 },
+ { 65024, 65039 },
+ { 65056, 65069 },
+};
+static const URange32 Inherited_range32[] = {
+ { 66045, 66045 },
+ { 66272, 66272 },
+ { 70459, 70459 },
+ { 118528, 118573 },
+ { 118576, 118598 },
+ { 119143, 119145 },
+ { 119163, 119170 },
+ { 119173, 119179 },
+ { 119210, 119213 },
+ { 917760, 917999 },
+};
+static const URange32 Inscriptional_Pahlavi_range32[] = {
+ { 68448, 68466 },
+ { 68472, 68479 },
+};
+static const URange32 Inscriptional_Parthian_range32[] = {
+ { 68416, 68437 },
+ { 68440, 68447 },
+};
+static const URange16 Javanese_range16[] = {
+ { 43392, 43469 },
+ { 43472, 43481 },
+ { 43486, 43487 },
+};
+static const URange32 Kaithi_range32[] = {
+ { 69760, 69826 },
+ { 69837, 69837 },
+};
+static const URange16 Kannada_range16[] = {
+ { 3200, 3212 },
+ { 3214, 3216 },
+ { 3218, 3240 },
+ { 3242, 3251 },
+ { 3253, 3257 },
+ { 3260, 3268 },
+ { 3270, 3272 },
+ { 3274, 3277 },
+ { 3285, 3286 },
+ { 3293, 3294 },
+ { 3296, 3299 },
+ { 3302, 3311 },
+ { 3313, 3315 },
+};
+static const URange16 Katakana_range16[] = {
+ { 12449, 12538 },
+ { 12541, 12543 },
+ { 12784, 12799 },
+ { 13008, 13054 },
+ { 13056, 13143 },
+ { 65382, 65391 },
+ { 65393, 65437 },
+};
+static const URange32 Katakana_range32[] = {
+ { 110576, 110579 },
+ { 110581, 110587 },
+ { 110589, 110590 },
+ { 110592, 110592 },
+ { 110880, 110882 },
+ { 110933, 110933 },
+ { 110948, 110951 },
+};
+static const URange32 Kawi_range32[] = {
+ { 73472, 73488 },
+ { 73490, 73530 },
+ { 73534, 73561 },
+};
+static const URange16 Kayah_Li_range16[] = {
+ { 43264, 43309 },
+ { 43311, 43311 },
+};
+static const URange32 Kharoshthi_range32[] = {
+ { 68096, 68099 },
+ { 68101, 68102 },
+ { 68108, 68115 },
+ { 68117, 68119 },
+ { 68121, 68149 },
+ { 68152, 68154 },
+ { 68159, 68168 },
+ { 68176, 68184 },
+};
+static const URange32 Khitan_Small_Script_range32[] = {
+ { 94180, 94180 },
+ { 101120, 101589 },
+};
+static const URange16 Khmer_range16[] = {
+ { 6016, 6109 },
+ { 6112, 6121 },
+ { 6128, 6137 },
+ { 6624, 6655 },
+};
+static const URange32 Khojki_range32[] = {
+ { 70144, 70161 },
+ { 70163, 70209 },
+};
+static const URange32 Khudawadi_range32[] = {
+ { 70320, 70378 },
+ { 70384, 70393 },
+};
+static const URange16 Lao_range16[] = {
+ { 3713, 3714 },
+ { 3716, 3716 },
+ { 3718, 3722 },
+ { 3724, 3747 },
+ { 3749, 3749 },
+ { 3751, 3773 },
+ { 3776, 3780 },
+ { 3782, 3782 },
+ { 3784, 3790 },
+ { 3792, 3801 },
+ { 3804, 3807 },
+};
+static const URange16 Latin_range16[] = {
+ { 65, 90 },
+ { 97, 122 },
+ { 170, 170 },
+ { 186, 186 },
+ { 192, 214 },
+ { 216, 246 },
+ { 248, 696 },
+ { 736, 740 },
+ { 7424, 7461 },
+ { 7468, 7516 },
+ { 7522, 7525 },
+ { 7531, 7543 },
+ { 7545, 7614 },
+ { 7680, 7935 },
+ { 8305, 8305 },
+ { 8319, 8319 },
+ { 8336, 8348 },
+ { 8490, 8491 },
+ { 8498, 8498 },
+ { 8526, 8526 },
+ { 8544, 8584 },
+ { 11360, 11391 },
+ { 42786, 42887 },
+ { 42891, 42954 },
+ { 42960, 42961 },
+ { 42963, 42963 },
+ { 42965, 42969 },
+ { 42994, 43007 },
+ { 43824, 43866 },
+ { 43868, 43876 },
+ { 43878, 43881 },
+ { 64256, 64262 },
+ { 65313, 65338 },
+ { 65345, 65370 },
+};
+static const URange32 Latin_range32[] = {
+ { 67456, 67461 },
+ { 67463, 67504 },
+ { 67506, 67514 },
+ { 122624, 122654 },
+ { 122661, 122666 },
+};
+static const URange16 Lepcha_range16[] = {
+ { 7168, 7223 },
+ { 7227, 7241 },
+ { 7245, 7247 },
+};
+static const URange16 Limbu_range16[] = {
+ { 6400, 6430 },
+ { 6432, 6443 },
+ { 6448, 6459 },
+ { 6464, 6464 },
+ { 6468, 6479 },
+};
+static const URange32 Linear_A_range32[] = {
+ { 67072, 67382 },
+ { 67392, 67413 },
+ { 67424, 67431 },
+};
+static const URange32 Linear_B_range32[] = {
+ { 65536, 65547 },
+ { 65549, 65574 },
+ { 65576, 65594 },
+ { 65596, 65597 },
+ { 65599, 65613 },
+ { 65616, 65629 },
+ { 65664, 65786 },
+};
+static const URange16 Lisu_range16[] = {
+ { 42192, 42239 },
+};
+static const URange32 Lisu_range32[] = {
+ { 73648, 73648 },
+};
+static const URange32 Lycian_range32[] = {
+ { 66176, 66204 },
+};
+static const URange32 Lydian_range32[] = {
+ { 67872, 67897 },
+ { 67903, 67903 },
+};
+static const URange32 Mahajani_range32[] = {
+ { 69968, 70006 },
+};
+static const URange32 Makasar_range32[] = {
+ { 73440, 73464 },
+};
+static const URange16 Malayalam_range16[] = {
+ { 3328, 3340 },
+ { 3342, 3344 },
+ { 3346, 3396 },
+ { 3398, 3400 },
+ { 3402, 3407 },
+ { 3412, 3427 },
+ { 3430, 3455 },
+};
+static const URange16 Mandaic_range16[] = {
+ { 2112, 2139 },
+ { 2142, 2142 },
+};
+static const URange32 Manichaean_range32[] = {
+ { 68288, 68326 },
+ { 68331, 68342 },
+};
+static const URange32 Marchen_range32[] = {
+ { 72816, 72847 },
+ { 72850, 72871 },
+ { 72873, 72886 },
+};
+static const URange32 Masaram_Gondi_range32[] = {
+ { 72960, 72966 },
+ { 72968, 72969 },
+ { 72971, 73014 },
+ { 73018, 73018 },
+ { 73020, 73021 },
+ { 73023, 73031 },
+ { 73040, 73049 },
+};
+static const URange32 Medefaidrin_range32[] = {
+ { 93760, 93850 },
+};
+static const URange16 Meetei_Mayek_range16[] = {
+ { 43744, 43766 },
+ { 43968, 44013 },
+ { 44016, 44025 },
+};
+static const URange32 Mende_Kikakui_range32[] = {
+ { 124928, 125124 },
+ { 125127, 125142 },
+};
+static const URange32 Meroitic_Cursive_range32[] = {
+ { 68000, 68023 },
+ { 68028, 68047 },
+ { 68050, 68095 },
+};
+static const URange32 Meroitic_Hieroglyphs_range32[] = {
+ { 67968, 67999 },
+};
+static const URange32 Miao_range32[] = {
+ { 93952, 94026 },
+ { 94031, 94087 },
+ { 94095, 94111 },
+};
+static const URange32 Modi_range32[] = {
+ { 71168, 71236 },
+ { 71248, 71257 },
+};
+static const URange16 Mongolian_range16[] = {
+ { 6144, 6145 },
+ { 6148, 6148 },
+ { 6150, 6169 },
+ { 6176, 6264 },
+ { 6272, 6314 },
+};
+static const URange32 Mongolian_range32[] = {
+ { 71264, 71276 },
+};
+static const URange32 Mro_range32[] = {
+ { 92736, 92766 },
+ { 92768, 92777 },
+ { 92782, 92783 },
+};
+static const URange32 Multani_range32[] = {
+ { 70272, 70278 },
+ { 70280, 70280 },
+ { 70282, 70285 },
+ { 70287, 70301 },
+ { 70303, 70313 },
+};
+static const URange16 Myanmar_range16[] = {
+ { 4096, 4255 },
+ { 43488, 43518 },
+ { 43616, 43647 },
+};
+static const URange32 Nabataean_range32[] = {
+ { 67712, 67742 },
+ { 67751, 67759 },
+};
+static const URange32 Nag_Mundari_range32[] = {
+ { 124112, 124153 },
+};
+static const URange32 Nandinagari_range32[] = {
+ { 72096, 72103 },
+ { 72106, 72151 },
+ { 72154, 72164 },
+};
+static const URange16 New_Tai_Lue_range16[] = {
+ { 6528, 6571 },
+ { 6576, 6601 },
+ { 6608, 6618 },
+ { 6622, 6623 },
+};
+static const URange32 Newa_range32[] = {
+ { 70656, 70747 },
+ { 70749, 70753 },
+};
+static const URange16 Nko_range16[] = {
+ { 1984, 2042 },
+ { 2045, 2047 },
+};
+static const URange32 Nushu_range32[] = {
+ { 94177, 94177 },
+ { 110960, 111355 },
+};
+static const URange32 Nyiakeng_Puachue_Hmong_range32[] = {
+ { 123136, 123180 },
+ { 123184, 123197 },
+ { 123200, 123209 },
+ { 123214, 123215 },
+};
+static const URange16 Ogham_range16[] = {
+ { 5760, 5788 },
+};
+static const URange16 Ol_Chiki_range16[] = {
+ { 7248, 7295 },
+};
+static const URange32 Old_Hungarian_range32[] = {
+ { 68736, 68786 },
+ { 68800, 68850 },
+ { 68858, 68863 },
+};
+static const URange32 Old_Italic_range32[] = {
+ { 66304, 66339 },
+ { 66349, 66351 },
+};
+static const URange32 Old_North_Arabian_range32[] = {
+ { 68224, 68255 },
+};
+static const URange32 Old_Permic_range32[] = {
+ { 66384, 66426 },
+};
+static const URange32 Old_Persian_range32[] = {
+ { 66464, 66499 },
+ { 66504, 66517 },
+};
+static const URange32 Old_Sogdian_range32[] = {
+ { 69376, 69415 },
+};
+static const URange32 Old_South_Arabian_range32[] = {
+ { 68192, 68223 },
+};
+static const URange32 Old_Turkic_range32[] = {
+ { 68608, 68680 },
+};
+static const URange32 Old_Uyghur_range32[] = {
+ { 69488, 69513 },
+};
+static const URange16 Oriya_range16[] = {
+ { 2817, 2819 },
+ { 2821, 2828 },
+ { 2831, 2832 },
+ { 2835, 2856 },
+ { 2858, 2864 },
+ { 2866, 2867 },
+ { 2869, 2873 },
+ { 2876, 2884 },
+ { 2887, 2888 },
+ { 2891, 2893 },
+ { 2901, 2903 },
+ { 2908, 2909 },
+ { 2911, 2915 },
+ { 2918, 2935 },
+};
+static const URange32 Osage_range32[] = {
+ { 66736, 66771 },
+ { 66776, 66811 },
+};
+static const URange32 Osmanya_range32[] = {
+ { 66688, 66717 },
+ { 66720, 66729 },
+};
+static const URange32 Pahawh_Hmong_range32[] = {
+ { 92928, 92997 },
+ { 93008, 93017 },
+ { 93019, 93025 },
+ { 93027, 93047 },
+ { 93053, 93071 },
+};
+static const URange32 Palmyrene_range32[] = {
+ { 67680, 67711 },
+};
+static const URange32 Pau_Cin_Hau_range32[] = {
+ { 72384, 72440 },
+};
+static const URange16 Phags_Pa_range16[] = {
+ { 43072, 43127 },
+};
+static const URange32 Phoenician_range32[] = {
+ { 67840, 67867 },
+ { 67871, 67871 },
+};
+static const URange32 Psalter_Pahlavi_range32[] = {
+ { 68480, 68497 },
+ { 68505, 68508 },
+ { 68521, 68527 },
+};
+static const URange16 Rejang_range16[] = {
+ { 43312, 43347 },
+ { 43359, 43359 },
+};
+static const URange16 Runic_range16[] = {
+ { 5792, 5866 },
+ { 5870, 5880 },
+};
+static const URange16 Samaritan_range16[] = {
+ { 2048, 2093 },
+ { 2096, 2110 },
+};
+static const URange16 Saurashtra_range16[] = {
+ { 43136, 43205 },
+ { 43214, 43225 },
+};
+static const URange32 Sharada_range32[] = {
+ { 70016, 70111 },
+};
+static const URange32 Shavian_range32[] = {
+ { 66640, 66687 },
+};
+static const URange32 Siddham_range32[] = {
+ { 71040, 71093 },
+ { 71096, 71133 },
+};
+static const URange32 SignWriting_range32[] = {
+ { 120832, 121483 },
+ { 121499, 121503 },
+ { 121505, 121519 },
+};
+static const URange16 Sinhala_range16[] = {
+ { 3457, 3459 },
+ { 3461, 3478 },
+ { 3482, 3505 },
+ { 3507, 3515 },
+ { 3517, 3517 },
+ { 3520, 3526 },
+ { 3530, 3530 },
+ { 3535, 3540 },
+ { 3542, 3542 },
+ { 3544, 3551 },
+ { 3558, 3567 },
+ { 3570, 3572 },
+};
+static const URange32 Sinhala_range32[] = {
+ { 70113, 70132 },
+};
+static const URange32 Sogdian_range32[] = {
+ { 69424, 69465 },
+};
+static const URange32 Sora_Sompeng_range32[] = {
+ { 69840, 69864 },
+ { 69872, 69881 },
+};
+static const URange32 Soyombo_range32[] = {
+ { 72272, 72354 },
+};
+static const URange16 Sundanese_range16[] = {
+ { 7040, 7103 },
+ { 7360, 7367 },
+};
+static const URange16 Syloti_Nagri_range16[] = {
+ { 43008, 43052 },
+};
+static const URange16 Syriac_range16[] = {
+ { 1792, 1805 },
+ { 1807, 1866 },
+ { 1869, 1871 },
+ { 2144, 2154 },
+};
+static const URange16 Tagalog_range16[] = {
+ { 5888, 5909 },
+ { 5919, 5919 },
+};
+static const URange16 Tagbanwa_range16[] = {
+ { 5984, 5996 },
+ { 5998, 6000 },
+ { 6002, 6003 },
+};
+static const URange16 Tai_Le_range16[] = {
+ { 6480, 6509 },
+ { 6512, 6516 },
+};
+static const URange16 Tai_Tham_range16[] = {
+ { 6688, 6750 },
+ { 6752, 6780 },
+ { 6783, 6793 },
+ { 6800, 6809 },
+ { 6816, 6829 },
+};
+static const URange16 Tai_Viet_range16[] = {
+ { 43648, 43714 },
+ { 43739, 43743 },
+};
+static const URange32 Takri_range32[] = {
+ { 71296, 71353 },
+ { 71360, 71369 },
+};
+static const URange16 Tamil_range16[] = {
+ { 2946, 2947 },
+ { 2949, 2954 },
+ { 2958, 2960 },
+ { 2962, 2965 },
+ { 2969, 2970 },
+ { 2972, 2972 },
+ { 2974, 2975 },
+ { 2979, 2980 },
+ { 2984, 2986 },
+ { 2990, 3001 },
+ { 3006, 3010 },
+ { 3014, 3016 },
+ { 3018, 3021 },
+ { 3024, 3024 },
+ { 3031, 3031 },
+ { 3046, 3066 },
+};
+static const URange32 Tamil_range32[] = {
+ { 73664, 73713 },
+ { 73727, 73727 },
+};
+static const URange32 Tangsa_range32[] = {
+ { 92784, 92862 },
+ { 92864, 92873 },
+};
+static const URange32 Tangut_range32[] = {
+ { 94176, 94176 },
+ { 94208, 100343 },
+ { 100352, 101119 },
+ { 101632, 101640 },
+};
+static const URange16 Telugu_range16[] = {
+ { 3072, 3084 },
+ { 3086, 3088 },
+ { 3090, 3112 },
+ { 3114, 3129 },
+ { 3132, 3140 },
+ { 3142, 3144 },
+ { 3146, 3149 },
+ { 3157, 3158 },
+ { 3160, 3162 },
+ { 3165, 3165 },
+ { 3168, 3171 },
+ { 3174, 3183 },
+ { 3191, 3199 },
+};
+static const URange16 Thaana_range16[] = {
+ { 1920, 1969 },
+};
+static const URange16 Thai_range16[] = {
+ { 3585, 3642 },
+ { 3648, 3675 },
+};
+static const URange16 Tibetan_range16[] = {
+ { 3840, 3911 },
+ { 3913, 3948 },
+ { 3953, 3991 },
+ { 3993, 4028 },
+ { 4030, 4044 },
+ { 4046, 4052 },
+ { 4057, 4058 },
+};
+static const URange16 Tifinagh_range16[] = {
+ { 11568, 11623 },
+ { 11631, 11632 },
+ { 11647, 11647 },
+};
+static const URange32 Tirhuta_range32[] = {
+ { 70784, 70855 },
+ { 70864, 70873 },
+};
+static const URange32 Toto_range32[] = {
+ { 123536, 123566 },
+};
+static const URange32 Ugaritic_range32[] = {
+ { 66432, 66461 },
+ { 66463, 66463 },
+};
+static const URange16 Vai_range16[] = {
+ { 42240, 42539 },
+};
+static const URange32 Vithkuqi_range32[] = {
+ { 66928, 66938 },
+ { 66940, 66954 },
+ { 66956, 66962 },
+ { 66964, 66965 },
+ { 66967, 66977 },
+ { 66979, 66993 },
+ { 66995, 67001 },
+ { 67003, 67004 },
+};
+static const URange32 Wancho_range32[] = {
+ { 123584, 123641 },
+ { 123647, 123647 },
+};
+static const URange32 Warang_Citi_range32[] = {
+ { 71840, 71922 },
+ { 71935, 71935 },
+};
+static const URange32 Yezidi_range32[] = {
+ { 69248, 69289 },
+ { 69291, 69293 },
+ { 69296, 69297 },
+};
+static const URange16 Yi_range16[] = {
+ { 40960, 42124 },
+ { 42128, 42182 },
+};
+static const URange32 Zanabazar_Square_range32[] = {
+ { 72192, 72263 },
+};
+// 4042 16-bit ranges, 1778 32-bit ranges
+const UGroup unicode_groups[] = {
+ { "Adlam", +1, 0, 0, Adlam_range32, 3 },
+ { "Ahom", +1, 0, 0, Ahom_range32, 3 },
+ { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 },
+ { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 36 },
+ { "Armenian", +1, Armenian_range16, 4, 0, 0 },
+ { "Avestan", +1, 0, 0, Avestan_range32, 2 },
+ { "Balinese", +1, Balinese_range16, 2, 0, 0 },
+ { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 },
+ { "Bassa_Vah", +1, 0, 0, Bassa_Vah_range32, 2 },
+ { "Batak", +1, Batak_range16, 2, 0, 0 },
+ { "Bengali", +1, Bengali_range16, 14, 0, 0 },
+ { "Bhaiksuki", +1, 0, 0, Bhaiksuki_range32, 4 },
+ { "Bopomofo", +1, Bopomofo_range16, 3, 0, 0 },
+ { "Brahmi", +1, 0, 0, Brahmi_range32, 3 },
+ { "Braille", +1, Braille_range16, 1, 0, 0 },
+ { "Buginese", +1, Buginese_range16, 2, 0, 0 },
+ { "Buhid", +1, Buhid_range16, 1, 0, 0 },
+ { "C", +1, C_range16, 17, C_range32, 9 },
+ { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, Canadian_Aboriginal_range32, 1 },
+ { "Carian", +1, 0, 0, Carian_range32, 1 },
+ { "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 },
+ { "Cc", +1, Cc_range16, 2, 0, 0 },
+ { "Cf", +1, Cf_range16, 14, Cf_range32, 7 },
+ { "Chakma", +1, 0, 0, Chakma_range32, 2 },
+ { "Cham", +1, Cham_range16, 4, 0, 0 },
+ { "Cherokee", +1, Cherokee_range16, 3, 0, 0 },
+ { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 },
+ { "Co", +1, Co_range16, 1, Co_range32, 2 },
+ { "Common", +1, Common_range16, 91, Common_range32, 82 },
+ { "Coptic", +1, Coptic_range16, 3, 0, 0 },
+ { "Cs", +1, Cs_range16, 1, 0, 0 },
+ { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 },
+ { "Cypriot", +1, 0, 0, Cypriot_range32, 6 },
+ { "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 },
+ { "Cyrillic", +1, Cyrillic_range16, 8, Cyrillic_range32, 2 },
+ { "Deseret", +1, 0, 0, Deseret_range32, 1 },
+ { "Devanagari", +1, Devanagari_range16, 4, Devanagari_range32, 1 },
+ { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 },
+ { "Dogra", +1, 0, 0, Dogra_range32, 1 },
+ { "Duployan", +1, 0, 0, Duployan_range32, 5 },
+ { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 },
+ { "Elbasan", +1, 0, 0, Elbasan_range32, 1 },
+ { "Elymaic", +1, 0, 0, Elymaic_range32, 1 },
+ { "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 },
+ { "Georgian", +1, Georgian_range16, 10, 0, 0 },
+ { "Glagolitic", +1, Glagolitic_range16, 1, Glagolitic_range32, 5 },
+ { "Gothic", +1, 0, 0, Gothic_range32, 1 },
+ { "Grantha", +1, 0, 0, Grantha_range32, 15 },
+ { "Greek", +1, Greek_range16, 33, Greek_range32, 3 },
+ { "Gujarati", +1, Gujarati_range16, 14, 0, 0 },
+ { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 },
+ { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 },
+ { "Han", +1, Han_range16, 11, Han_range32, 11 },
+ { "Hangul", +1, Hangul_range16, 14, 0, 0 },
+ { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 },
+ { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 },
+ { "Hatran", +1, 0, 0, Hatran_range32, 3 },
+ { "Hebrew", +1, Hebrew_range16, 9, 0, 0 },
+ { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 4 },
+ { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 },
+ { "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 },
+ { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 },
+ { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 },
+ { "Javanese", +1, Javanese_range16, 3, 0, 0 },
+ { "Kaithi", +1, 0, 0, Kaithi_range32, 2 },
+ { "Kannada", +1, Kannada_range16, 13, 0, 0 },
+ { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 7 },
+ { "Kawi", +1, 0, 0, Kawi_range32, 3 },
+ { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 },
+ { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 },
+ { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 },
+ { "Khmer", +1, Khmer_range16, 4, 0, 0 },
+ { "Khojki", +1, 0, 0, Khojki_range32, 2 },
+ { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 },
+ { "L", +1, L_range16, 380, L_range32, 280 },
+ { "Lao", +1, Lao_range16, 11, 0, 0 },
+ { "Latin", +1, Latin_range16, 34, Latin_range32, 5 },
+ { "Lepcha", +1, Lepcha_range16, 3, 0, 0 },
+ { "Limbu", +1, Limbu_range16, 5, 0, 0 },
+ { "Linear_A", +1, 0, 0, Linear_A_range32, 3 },
+ { "Linear_B", +1, 0, 0, Linear_B_range32, 7 },
+ { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 },
+ { "Ll", +1, Ll_range16, 617, Ll_range32, 41 },
+ { "Lm", +1, Lm_range16, 57, Lm_range32, 14 },
+ { "Lo", +1, Lo_range16, 290, Lo_range32, 221 },
+ { "Lt", +1, Lt_range16, 10, 0, 0 },
+ { "Lu", +1, Lu_range16, 605, Lu_range32, 41 },
+ { "Lycian", +1, 0, 0, Lycian_range32, 1 },
+ { "Lydian", +1, 0, 0, Lydian_range32, 2 },
+ { "M", +1, M_range16, 190, M_range32, 120 },
+ { "Mahajani", +1, 0, 0, Mahajani_range32, 1 },
+ { "Makasar", +1, 0, 0, Makasar_range32, 1 },
+ { "Malayalam", +1, Malayalam_range16, 7, 0, 0 },
+ { "Mandaic", +1, Mandaic_range16, 2, 0, 0 },
+ { "Manichaean", +1, 0, 0, Manichaean_range32, 2 },
+ { "Marchen", +1, 0, 0, Marchen_range32, 3 },
+ { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 },
+ { "Mc", +1, Mc_range16, 112, Mc_range32, 70 },
+ { "Me", +1, Me_range16, 5, 0, 0 },
+ { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 },
+ { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 },
+ { "Mende_Kikakui", +1, 0, 0, Mende_Kikakui_range32, 2 },
+ { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 },
+ { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 },
+ { "Miao", +1, 0, 0, Miao_range32, 3 },
+ { "Mn", +1, Mn_range16, 212, Mn_range32, 134 },
+ { "Modi", +1, 0, 0, Modi_range32, 2 },
+ { "Mongolian", +1, Mongolian_range16, 5, Mongolian_range32, 1 },
+ { "Mro", +1, 0, 0, Mro_range32, 3 },
+ { "Multani", +1, 0, 0, Multani_range32, 5 },
+ { "Myanmar", +1, Myanmar_range16, 3, 0, 0 },
+ { "N", +1, N_range16, 67, N_range32, 70 },
+ { "Nabataean", +1, 0, 0, Nabataean_range32, 2 },
+ { "Nag_Mundari", +1, 0, 0, Nag_Mundari_range32, 1 },
+ { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 },
+ { "Nd", +1, Nd_range16, 37, Nd_range32, 27 },
+ { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 },
+ { "Newa", +1, 0, 0, Newa_range32, 2 },
+ { "Nko", +1, Nko_range16, 2, 0, 0 },
+ { "Nl", +1, Nl_range16, 7, Nl_range32, 5 },
+ { "No", +1, No_range16, 29, No_range32, 43 },
+ { "Nushu", +1, 0, 0, Nushu_range32, 2 },
+ { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 },
+ { "Ogham", +1, Ogham_range16, 1, 0, 0 },
+ { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 },
+ { "Old_Hungarian", +1, 0, 0, Old_Hungarian_range32, 3 },
+ { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 },
+ { "Old_North_Arabian", +1, 0, 0, Old_North_Arabian_range32, 1 },
+ { "Old_Permic", +1, 0, 0, Old_Permic_range32, 1 },
+ { "Old_Persian", +1, 0, 0, Old_Persian_range32, 2 },
+ { "Old_Sogdian", +1, 0, 0, Old_Sogdian_range32, 1 },
+ { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 },
+ { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 },
+ { "Old_Uyghur", +1, 0, 0, Old_Uyghur_range32, 1 },
+ { "Oriya", +1, Oriya_range16, 14, 0, 0 },
+ { "Osage", +1, 0, 0, Osage_range32, 2 },
+ { "Osmanya", +1, 0, 0, Osmanya_range32, 2 },
+ { "P", +1, P_range16, 133, P_range32, 58 },
+ { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 },
+ { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 },
+ { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 },
+ { "Pc", +1, Pc_range16, 6, 0, 0 },
+ { "Pd", +1, Pd_range16, 18, Pd_range32, 1 },
+ { "Pe", +1, Pe_range16, 76, 0, 0 },
+ { "Pf", +1, Pf_range16, 10, 0, 0 },
+ { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 },
+ { "Phoenician", +1, 0, 0, Phoenician_range32, 2 },
+ { "Pi", +1, Pi_range16, 11, 0, 0 },
+ { "Po", +1, Po_range16, 130, Po_range32, 57 },
+ { "Ps", +1, Ps_range16, 79, 0, 0 },
+ { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 },
+ { "Rejang", +1, Rejang_range16, 2, 0, 0 },
+ { "Runic", +1, Runic_range16, 2, 0, 0 },
+ { "S", +1, S_range16, 152, S_range32, 81 },
+ { "Samaritan", +1, Samaritan_range16, 2, 0, 0 },
+ { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 },
+ { "Sc", +1, Sc_range16, 18, Sc_range32, 3 },
+ { "Sharada", +1, 0, 0, Sharada_range32, 1 },
+ { "Shavian", +1, 0, 0, Shavian_range32, 1 },
+ { "Siddham", +1, 0, 0, Siddham_range32, 2 },
+ { "SignWriting", +1, 0, 0, SignWriting_range32, 3 },
+ { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 },
+ { "Sk", +1, Sk_range16, 30, Sk_range32, 1 },
+ { "Sm", +1, Sm_range16, 53, Sm_range32, 11 },
+ { "So", +1, So_range16, 115, So_range32, 70 },
+ { "Sogdian", +1, 0, 0, Sogdian_range32, 1 },
+ { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 },
+ { "Soyombo", +1, 0, 0, Soyombo_range32, 1 },
+ { "Sundanese", +1, Sundanese_range16, 2, 0, 0 },
+ { "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 },
+ { "Syriac", +1, Syriac_range16, 4, 0, 0 },
+ { "Tagalog", +1, Tagalog_range16, 2, 0, 0 },
+ { "Tagbanwa", +1, Tagbanwa_range16, 3, 0, 0 },
+ { "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 },
+ { "Tai_Tham", +1, Tai_Tham_range16, 5, 0, 0 },
+ { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 },
+ { "Takri", +1, 0, 0, Takri_range32, 2 },
+ { "Tamil", +1, Tamil_range16, 16, Tamil_range32, 2 },
+ { "Tangsa", +1, 0, 0, Tangsa_range32, 2 },
+ { "Tangut", +1, 0, 0, Tangut_range32, 4 },
+ { "Telugu", +1, Telugu_range16, 13, 0, 0 },
+ { "Thaana", +1, Thaana_range16, 1, 0, 0 },
+ { "Thai", +1, Thai_range16, 2, 0, 0 },
+ { "Tibetan", +1, Tibetan_range16, 7, 0, 0 },
+ { "Tifinagh", +1, Tifinagh_range16, 3, 0, 0 },
+ { "Tirhuta", +1, 0, 0, Tirhuta_range32, 2 },
+ { "Toto", +1, 0, 0, Toto_range32, 1 },
+ { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 },
+ { "Vai", +1, Vai_range16, 1, 0, 0 },
+ { "Vithkuqi", +1, 0, 0, Vithkuqi_range32, 8 },
+ { "Wancho", +1, 0, 0, Wancho_range32, 2 },
+ { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 },
+ { "Yezidi", +1, 0, 0, Yezidi_range32, 3 },
+ { "Yi", +1, Yi_range16, 2, 0, 0 },
+ { "Z", +1, Z_range16, 8, 0, 0 },
+ { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 },
+ { "Zl", +1, Zl_range16, 1, 0, 0 },
+ { "Zp", +1, Zp_range16, 1, 0, 0 },
+ { "Zs", +1, Zs_range16, 7, 0, 0 },
+};
+const int num_unicode_groups = 199;
+
+
+} // namespace re2
+
+
diff --git a/third_party/re2/src/re2/unicode_groups.h b/third_party/re2/src/re2/unicode_groups.h
new file mode 100644
index 000000000..6dc653218
--- /dev/null
+++ b/third_party/re2/src/re2/unicode_groups.h
@@ -0,0 +1,66 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UNICODE_GROUPS_H_
+#define RE2_UNICODE_GROUPS_H_
+
+// Unicode character groups.
+
+// The codes get split into ranges of 16-bit codes
+// and ranges of 32-bit codes. It would be simpler
+// to use only 32-bit ranges, but these tables are large
+// enough to warrant extra care.
+//
+// Using just 32-bit ranges gives 27 kB of data.
+// Adding 16-bit ranges gives 18 kB of data.
+// Adding an extra table of 16-bit singletons would reduce
+// to 16.5 kB of data but make the data harder to use;
+// we don't bother.
+
+#include <stdint.h>
+
+#include "util/utf.h"
+
+namespace re2 {
+
+// One contiguous range of code points that fit in 16 bits.
+// Both bounds are inclusive: the generated tables encode a single
+// code point as a range with lo == hi.
+struct URange16
+{
+ uint16_t lo; // first code point in the range
+ uint16_t hi; // last code point in the range (inclusive)
+};
+
+// One contiguous range of code points stored as full Runes; this is the
+// element type of the 32-bit tables.
+// Both bounds are inclusive: the generated tables encode a single
+// code point as a range with lo == hi.
+struct URange32
+{
+ Rune lo; // first code point in the range
+ Rune hi; // last code point in the range (inclusive)
+};
+
+// A named set of code points, described by a table of 16-bit ranges
+// and a table of 32-bit ranges. Either table may be absent, in which
+// case its pointer is 0 and its count is 0.
+struct UGroup
+{
+ const char *name; // group name used for lookup (e.g., "Nd", "Han")
+ int sign; // +1 for [abc], -1 for [^abc]
+ const URange16 *r16; // 16-bit ranges, or 0 if the group has none
+ int nr16; // number of entries in r16
+ const URange32 *r32; // 32-bit ranges, or 0 if the group has none
+ int nr32; // number of entries in r32
+};
+
+// Named by property or script name (e.g., "Nd", "N", "Han").
+// Negated groups are not included.
+extern const UGroup unicode_groups[];
+extern const int num_unicode_groups;
+
+// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
+// Negated groups are included.
+extern const UGroup posix_groups[];
+extern const int num_posix_groups;
+
+// Named by Perl name (e.g., "\\d", "\\D").
+// Negated groups are included.
+extern const UGroup perl_groups[];
+extern const int num_perl_groups;
+
+} // namespace re2
+
+#endif // RE2_UNICODE_GROUPS_H_
diff --git a/third_party/re2/src/re2/walker-inl.h b/third_party/re2/src/re2/walker-inl.h
new file mode 100644
index 000000000..45763a7b2
--- /dev/null
+++ b/third_party/re2/src/re2/walker-inl.h
@@ -0,0 +1,248 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_WALKER_INL_H_
+#define RE2_WALKER_INL_H_
+
+// Helper class for traversing Regexps without recursion.
+// Clients should declare their own subclasses that override
+// the PreVisit and PostVisit methods, which are called before
+// and after visiting the subexpressions.
+
+// Not quite the Visitor pattern, because (among other things)
+// the Visitor pattern is recursive.
+
+#include <stack>
+
+#include "absl/base/macros.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+template<typename T> struct WalkState;
+
+// Base class for non-recursive traversals of a Regexp.
+// T is the value type that is threaded through the walk: it is handed
+// down from parent to child (parent_arg / pre_arg) and handed back up
+// from child to parent (child_args / return values).
+template<typename T> class Regexp::Walker {
+ public:
+  Walker();
+  virtual ~Walker();
+
+  // Virtual method called before visiting re's children.
+  // PreVisit passes ownership of its return value to its caller.
+  // The T that PreVisit returns will be passed to PostVisit as pre_arg
+  // and passed to the child PreVisits and PostVisits as parent_arg.
+  // At the top-most Regexp, parent_arg is the top_arg passed to Walk.
+  // If PreVisit sets *stop to true, the walk does not recurse
+  // into the children.  Instead it behaves as though the return
+  // value from PreVisit is the return value from PostVisit.
+  // The default PreVisit returns parent_arg.
+  virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
+
+  // Virtual method called after visiting re's children.
+  // The pre_arg is the T that PreVisit returned.
+  // The child_args is an array of the Ts that the child PostVisits
+  // returned; nchild_args is the number of elements in it.
+  // PostVisit takes ownership of pre_arg.
+  // PostVisit takes ownership of the Ts
+  // in *child_args, but not the array itself.
+  // PostVisit passes ownership of its return value
+  // to its caller.
+  // The default PostVisit simply returns pre_arg.
+  virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
+                      T* child_args, int nchild_args);
+
+  // Virtual method called to copy a T,
+  // when Walk notices that more than one child is the same re.
+  // The default Copy simply returns arg.
+  virtual T Copy(T arg);
+
+  // Virtual method called to do a "quick visit" of the re,
+  // but not its children.  Only called once the visit budget
+  // has been used up and we're trying to abort the walk
+  // as quickly as possible.  Should return a value that
+  // makes sense for the parent PostVisits still to be run.
+  // This function is (hopefully) only called by
+  // WalkExponential, but must be implemented by all clients,
+  // just in case.
+  virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
+
+  // Walks over a regular expression.
+  // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
+  // Returns the T returned by PostVisit on re.
+  T Walk(Regexp* re, T top_arg);
+
+  // Like Walk, but doesn't use Copy.  This can lead to
+  // exponential runtimes on cross-linked Regexps like the
+  // ones generated by Simplify.  To help limit this,
+  // at most max_visits nodes will be visited and then
+  // the walk will be cut off early.
+  // If the walk *is* cut off early, ShortVisit(re)
+  // will be called on regexps that cannot be fully
+  // visited rather than calling PreVisit/PostVisit.
+  T WalkExponential(Regexp* re, T top_arg, int max_visits);
+
+  // Clears the stack.  Should never be necessary, since
+  // Walk always enters and exits with an empty stack.
+  // Logs DFATAL if stack is not already clear.
+  void Reset();
+
+  // Returns whether walk was cut off.
+  // The flag is set when the visit budget runs out and is not cleared
+  // by a later walk on the same Walker.
+  // Declared const so it can be queried through a const Walker.
+  bool stopped_early() const { return stopped_early_; }
+
+ private:
+  // Walk state for the entire traversal.
+  std::stack<WalkState<T>> stack_;
+  // True once a walk has run out of visit budget.
+  bool stopped_early_ = false;
+  // Remaining visit budget; Walk and WalkExponential set it before
+  // every traversal, the initializer only avoids an indeterminate value.
+  int max_visits_ = 0;
+
+  // Shared implementation of Walk (use_copy == true) and
+  // WalkExponential (use_copy == false).
+  T WalkInternal(Regexp* re, T top_arg, bool use_copy);
+
+  Walker(const Walker&) = delete;
+  Walker& operator=(const Walker&) = delete;
+};
+
+// Default PreVisit: never touches *stop, so the walk descends into the
+// children, and passes parent_arg through unchanged as the pre_arg.
+template<typename T>
+T Regexp::Walker<T>::PreVisit(Regexp* /*re*/, T parent_arg, bool* /*stop*/) {
+  return parent_arg;
+}
+
+// Default PostVisit: ignores the children's results and yields the
+// value that PreVisit produced for this node.
+template<typename T>
+T Regexp::Walker<T>::PostVisit(Regexp* /*re*/, T /*parent_arg*/, T pre_arg,
+                               T* /*child_args*/, int /*nchild_args*/) {
+  return pre_arg;
+}
+
+// Default Copy: hands back the value it was given.
+template<typename T>
+T Regexp::Walker<T>::Copy(T arg) {
+  return arg;
+}
+
+// Bookkeeping for one Regexp node that is currently on the walk stack.
+template<typename T> struct WalkState {
+  WalkState(Regexp* regexp, T parent)
+    : re(regexp),
+      parent_arg(parent) { }
+
+  Regexp* re;  // The node this entry describes.
+  int n = -1;  // Index of the next child to process; -1 means PreVisit
+               // has not been run for this node yet.
+  T parent_arg;  // Value handed down from the parent.
+  T pre_arg;  // Result of PreVisit; assigned by WalkInternal.
+  T child_arg;  // One-element buffer that child_args may point at.
+  T* child_args = nullptr;  // Results collected from the children.
+};
+
+// A fresh Walker has an empty stack and has not been cut off.
+template<typename T>
+Regexp::Walker<T>::Walker()
+  : stopped_early_(false) {
+}
+
+// Releases anything still sitting on the stack by delegating to Reset().
+template<typename T>
+Regexp::Walker<T>::~Walker() {
+  this->Reset();
+}
+
+// Empties the walk stack, freeing any heap-allocated child_args arrays.
+// A non-empty stack is unexpected (Walk is supposed to leave it empty),
+// so that case is reported with LOG(DFATAL) before cleaning up.
+template<typename T>
+void Regexp::Walker<T>::Reset() {
+  if (stack_.empty())
+    return;
+
+  LOG(DFATAL) << "Stack not empty.";
+  do {
+    WalkState<T>& top = stack_.top();
+    // Only nodes with more than one child own a new[]-allocated array.
+    if (top.re->nsub_ > 1)
+      delete[] top.child_args;
+    stack_.pop();
+  } while (!stack_.empty());
+}
+
+// Iterative traversal shared by Walk and WalkExponential.
+// Uses stack_ as an explicit stack of WalkState entries instead of recursing.
+//   re        - root of the expression to walk; NULL is logged (DFATAL)
+//               and top_arg is returned unchanged.
+//   top_arg   - passed as parent_arg for the root.
+//   use_copy  - when true, a child that is the same Regexp* as the
+//               immediately preceding child is not walked again; its
+//               result is produced by Copy() from the previous result.
+// Returns the value computed for the root: the PostVisit result, or the
+// PreVisit result if PreVisit set *stop, or the ShortVisit result if the
+// visit budget (max_visits_) was already used up.
+template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
+ bool use_copy) {
+ // Discard any state left on the stack before starting.
+ Reset();
+
+ if (re == NULL) {
+ LOG(DFATAL) << "Walk NULL";
+ return top_arg;
+ }
+
+ stack_.push(WalkState<T>(re, top_arg));
+
+ WalkState<T>* s;
+ for (;;) {
+ // t receives the finished value for the entry on top of the stack.
+ T t;
+ // Re-read the top on every iteration: it changes whenever a child
+ // is pushed or a finished entry is popped.
+ s = &stack_.top();
+ re = s->re;
+ switch (s->n) {
+ // n == -1: first time this node is seen, PreVisit has not run yet.
+ case -1: {
+ // Each first visit consumes one unit of budget; once it is gone the
+ // node gets a ShortVisit only and its children are never pushed.
+ if (--max_visits_ < 0) {
+ stopped_early_ = true;
+ t = ShortVisit(re, s->parent_arg);
+ break;
+ }
+ bool stop = false;
+ s->pre_arg = PreVisit(re, s->parent_arg, &stop);
+ if (stop) {
+ // PreVisit asked to skip the children; its result stands in for
+ // PostVisit's.
+ t = s->pre_arg;
+ break;
+ }
+ s->n = 0;
+ s->child_args = NULL;
+ // One child: use the inline one-element buffer.
+ // Several children: allocate an array (freed after PostVisit below).
+ if (re->nsub_ == 1)
+ s->child_args = &s->child_arg;
+ else if (re->nsub_ > 1)
+ s->child_args = new T[re->nsub_];
+ ABSL_FALLTHROUGH_INTENDED;
+ }
+ // n >= 0: children 0..n-1 are done; process child n or finish the node.
+ default: {
+ if (re->nsub_ > 0) {
+ Regexp** sub = re->sub();
+ if (s->n < re->nsub_) {
+ if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
+ // Same subexpression as the previous child: reuse its result.
+ s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
+ s->n++;
+ } else {
+ // Descend: the child sees this node's pre_arg as its parent_arg.
+ stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
+ }
+ // Go back to the top of the loop without popping anything.
+ continue;
+ }
+ }
+
+ // All children are done (or there were none); s->n is the number of
+ // child results stored in child_args.
+ t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
+ if (re->nsub_ > 1)
+ delete[] s->child_args;
+ break;
+ }
+ }
+
+ // We've finished stack_.top().
+ // Update next guy down.
+ stack_.pop();
+ // An empty stack means the root itself just finished.
+ if (stack_.empty())
+ return t;
+ s = &stack_.top();
+ // Store the child's result in the parent's next free slot.
+ // NOTE(review): a parent that pushed a child has nsub_ > 0 and thus a
+ // non-NULL child_args, so the else branch looks purely defensive.
+ if (s->child_args != NULL)
+ s->child_args[s->n] = t;
+ else
+ s->child_arg = t;
+ s->n++;
+ }
+}
+
+// Regular walk: runs WalkInternal with Copy-based sharing of repeated
+// children enabled and a fixed, generous visit budget.
+template<typename T>
+T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
+  // Because repeated children are shared rather than re-walked, the
+  // traversal is not exponential; one million visits should therefore
+  // be more than enough for any regexp while still bounding CPU time.
+  const int kVisitBudget = 1000000;
+  const bool kUseCopy = true;
+
+  max_visits_ = kVisitBudget;
+  return WalkInternal(re, top_arg, kUseCopy);
+}
+
+// Walk variant that never calls Copy, so repeated children are walked
+// again each time.  The caller supplies the visit budget that bounds
+// the traversal.
+template<typename T>
+T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, int max_visits) {
+  const bool kUseCopy = false;
+
+  max_visits_ = max_visits;
+  return WalkInternal(re, top_arg, kUseCopy);
+}
+
+} // namespace re2
+
+#endif // RE2_WALKER_INL_H_
diff --git a/third_party/re2/src/re2Config.cmake.in b/third_party/re2/src/re2Config.cmake.in
new file mode 100644
index 000000000..6a177c615
--- /dev/null
+++ b/third_party/re2/src/re2Config.cmake.in
@@ -0,0 +1,28 @@
+# Copyright 2022 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+set_and_check(re2_INCLUDE_DIR ${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@)
+
+# find_dependency() already forwards QUIET and REQUIRED from the consumer's
+# find_package(re2) call, so neither keyword is hard-coded below.
+if(UNIX)
+  set(THREADS_PREFER_PTHREAD_FLAG ON)
+  find_dependency(Threads)
+endif()
+find_dependency(absl)
+if(@RE2_USE_ICU@)
+  find_dependency(ICU COMPONENTS uc)
+endif()
+
+check_required_components(re2)
+
+# Import the targets only once, even if this file is processed repeatedly.
+if(TARGET re2::re2)
+  return()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/re2Targets.cmake)
diff --git a/third_party/re2/src/runtests b/third_party/re2/src/runtests
new file mode 100755
index 000000000..94584a660
--- /dev/null
+++ b/third_party/re2/src/runtests
@@ -0,0 +1,33 @@
+#!/usr/bin/env sh
+# Usage: runtests [-shared-library-path DIR] TEST...
+# System Integrity Protection on Darwin complicated these matters somewhat.
+# See https://github.com/google/re2/issues/175 for details.
+if [ "x$1" = "x-shared-library-path" ]; then
+  # No trailing ':' when the variable was empty: ld.so treats "" as the cwd.
+  if [ "x$(uname)" = "xDarwin" ]; then
+    DYLD_LIBRARY_PATH="$2${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
+    export DYLD_LIBRARY_PATH
+  else
+    LD_LIBRARY_PATH="$2${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+    export LD_LIBRARY_PATH
+  fi
+  shift 2
+fi
+
+success=true
+for i; do
+  printf "%-40s" "$i"  # Quoted: test paths may contain spaces or glob characters.
+  if $("$i" >"$i.log" 2>&1) 2>/dev/null; then
+    echo PASS
+  else
+    echo "FAIL; output in $i.log"
+    success=false
+  fi
+done
+if $success; then
+  echo 'ALL TESTS PASSED.'
+  exit 0
+else
+  echo 'TESTS FAILED.'
+  exit 1
+fi
diff --git a/third_party/re2/src/testinstall.cc b/third_party/re2/src/testinstall.cc
new file mode 100644
index 000000000..19cc9003b
--- /dev/null
+++ b/third_party/re2/src/testinstall.cc
@@ -0,0 +1,27 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+#include <re2/filtered_re2.h>
+#include <re2/re2.h>
+
+int main() {
+  // Exercise FilteredRE2 (results ignored), then verify two RE2 matches.
+  re2::FilteredRE2 filter;
+  int pattern_id;
+  filter.Add("a.*b.*c", RE2::DefaultOptions, &pattern_id);
+  std::vector<std::string> strings_to_match;
+  filter.Compile(&strings_to_match);
+  std::vector<int> atoms;
+  filter.FirstMatch("abbccc", atoms);
+  int num;
+  const bool ok = RE2::FullMatch("axbyc", "a.*b.*c") &&
+                  RE2::PartialMatch("foo123bar", "(\\d+)", &num) && num == 123;
+  if (!ok) {
+    printf("FAIL\n");
+    return 2;
+  }
+  printf("PASS\n");
+  return 0;
+}
diff --git a/third_party/re2/src/ucs2.diff b/third_party/re2/src/ucs2.diff
new file mode 100644
index 000000000..57aec04a1
--- /dev/null
+++ b/third_party/re2/src/ucs2.diff
@@ -0,0 +1,567 @@
+This is a dump from Google's source control system of the change
+that removed UCS-2 support from RE2. As the explanation below
+says, UCS-2 mode is fundamentally at odds with things like ^ and $,
+so it never really worked very well. But if you are interested in using
+it without those operators, it did work for that. It assumed that the
+UCS-2 data was in the native host byte order.
+
+If you are interested in adding UCS-2 mode back, this patch might
+be a good starting point.
+
+
+Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
+
+ Retire UCS-2 mode.
+
+ I added it as an experiment for V8, but it
+ requires 2-byte lookahead to do completely,
+ and RE2 has 1-byte lookahead (enough for UTF-8)
+ as a fairly deep fundamental assumption,
+ so it did not support ^ or $.
+
+==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
+re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
+ cap_[0] = p;
+ if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
+ return true;
+- if (prog_->flags() & Regexp::UCS2)
+- p++;
+ }
+ return false;
+ }
+==== re2/compile.cc#17 - re2/compile.cc#18 ====
+re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
+ // Input encodings.
+ enum Encoding {
+ kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
+- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order
+ kEncodingLatin1, // Latin1 (0-FF)
+ };
+
+re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
+ void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
+ void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
+ void Add_80_10ffff();
+- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
+- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
+- uint8 lo2, uint8 hi2, bool fold2);
+
+ // New suffix that matches the byte range lo-hi, then goes to next.
+ Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
+re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
+
+ // Converts rune range lo-hi into a fragment that recognizes
+ // the bytes that would make up those runes in the current
+- // encoding (Latin 1, UTF-8, or UCS-2).
++ // encoding (Latin 1 or UTF-8).
+ // This lets the machine work byte-by-byte even when
+ // using multibyte encodings.
+
+re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
+ case kEncodingLatin1:
+ AddRuneRangeLatin1(lo, hi, foldcase);
+ break;
+- case kEncodingUCS2:
+- AddRuneRangeUCS2(lo, hi, foldcase);
+- break;
+ }
+ }
+
+re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
+ AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
+ }
+
+- // Test whether 16-bit values are big or little endian.
+- static bool BigEndian() {
+- union {
+- char byte[2];
+- int16 endian;
+- } u;
+-
+- u.byte[0] = 1;
+- u.byte[1] = 2;
+- return u.endian == 0x0102;
+- }
+-
+- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
+- uint8 lo2, uint8 hi2, bool fold2) {
+- Inst* ip;
+- if (reversed_) {
+- ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
+- ip = RuneByteSuffix(lo2, hi2, fold2, ip);
+- } else {
+- ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
+- ip = RuneByteSuffix(lo1, hi1, fold1, ip);
+- }
+- AddSuffix(ip);
+- }
+-
+- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
+- if (lo > hi || lo > 0xFFFF)
+- return;
+- if (hi > 0xFFFF)
+- hi = 0xFFFF;
+-
+- // We'll assemble a pattern assuming big endian.
+- // If the machine isn't, tell Cat to reverse its arguments.
+- bool oldreversed = reversed_;
+- if (!BigEndian()) {
+- reversed_ = !oldreversed;
+- }
+-
+- // Split into bytes.
+- int lo1 = lo >> 8;
+- int lo2 = lo & 0xFF;
+- int hi1 = hi >> 8;
+- int hi2 = hi & 0xFF;
+-
+- if (lo1 == hi1) {
+- // Easy case: high bits are same in both.
+- // Only do ASCII case folding on the second byte if the top byte is 00.
+- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
+- } else {
+- // Harder case: different second byte ranges depending on first byte.
+-
+- // Initial fragment.
+- if (lo2 > 0) {
+- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
+- lo1++;
+- }
+-
+- // Trailing fragment.
+- if (hi2 < 0xFF) {
+- AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
+- hi1--;
+- }
+-
+- // Inner ranges.
+- if (lo1 <= hi1) {
+- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
+- }
+- }
+-
+- // Restore reverse setting.
+- reversed_ = oldreversed;
+- }
+-
+ // Table describing how to make a UTF-8 matching machine
+ // for the rune range 80-10FFFF (Runeself-Runemax).
+ // This range happens frequently enough (for example /./ and /[^a-z]/)
+re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
+
+ Frag Compiler::Literal(Rune r, bool foldcase) {
+ switch (encoding_) {
+- default: // UCS-2 or something new
+- BeginRange();
+- AddRuneRange(r, r, foldcase);
+- return EndRange();
++ default:
++ return kNullFrag;
+
+ case kEncodingLatin1:
+ return ByteRange(r, r, foldcase);
+re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
+
+ if (re->parse_flags() & Regexp::Latin1)
+ c.encoding_ = kEncodingLatin1;
+- else if (re->parse_flags() & Regexp::UCS2)
+- c.encoding_ = kEncodingUCS2;
+ c.reversed_ = reversed;
+ if (max_mem <= 0) {
+ c.max_inst_ = 100000; // more than enough
+re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
+ c.prog_->set_start_unanchored(c.prog_->start());
+ } else {
+ Frag dot;
+- if (c.encoding_ == kEncodingUCS2) {
+- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
+- } else {
+- dot = c.ByteRange(0x00, 0xFF, false);
+- }
++ dot = c.ByteRange(0x00, 0xFF, false);
+ Frag dotloop = c.Star(dot, true);
+ Frag unanchored = c.Cat(dotloop, all);
+ c.prog_->set_start_unanchored(unanchored.begin);
+==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
+re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
+ const char* bp = context.begin();
+ int c = -1;
+ int wasword = 0;
+- bool ucs2 = prog_->flags() & Regexp::UCS2;
+
+ if (text.begin() > context.begin()) {
+ c = text.begin()[-1] & 0xFF;
+re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
+ // If there's a required first byte for an unanchored search
+ // and we're not in the middle of any possible matches,
+ // use memchr to search for the byte quickly.
+- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
++ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
+ p < text.end() && (p[0] & 0xFF) != first_byte_) {
+ p = reinterpret_cast<const char*>(memchr(p, first_byte_,
+ text.end() - p));
+re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
+ flag = Prog::EmptyFlags(context, p);
+ }
+
+- // In UCS-2 mode, if we need to start a new thread,
+- // make sure to do it on an even boundary.
+- if(ucs2 && runq->size() == 0 &&
+- (p - context.begin()) % 2 && p < text.end()) {
+- p++;
+- flag = Prog::EmptyFlags(context, p);
+- }
+-
+ // Steal match storage (cleared but unused as of yet)
+ // temporarily to hold match boundaries for new thread.
+- // In UCS-2 mode, only start the thread on a 2-byte boundary.
+- if(!ucs2 || (p - context.begin()) % 2 == 0) {
+- match_[0] = p;
+- AddToThreadq(runq, start_, flag, p, match_);
+- match_[0] = NULL;
+- }
++ match_[0] = p;
++ AddToThreadq(runq, start_, flag, p, match_);
++ match_[0] = NULL;
+ }
+
+ // If all the threads have died, stop early.
+==== re2/parse.cc#22 - re2/parse.cc#23 ====
+re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
+ status_(status), stacktop_(NULL), ncap_(0) {
+ if (flags_ & Latin1)
+ rune_max_ = 0xFF;
+- else if (flags & UCS2)
+- rune_max_ = 0xFFFF;
+ else
+ rune_max_ = Runemax;
+ }
+re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
+ bool Regexp::ParseState::PushCarat() {
+ if (flags_ & OneLine) {
+ return PushSimpleOp(kRegexpBeginText);
+- } else {
+- if (flags_ & UCS2) {
+- status_->set_code(kRegexpUnsupported);
+- status_->set_error_arg("multiline ^ in UCS-2 mode");
+- return false;
+- }
+- return PushSimpleOp(kRegexpBeginLine);
+ }
++ return PushSimpleOp(kRegexpBeginLine);
+ }
+
+ // Pushes a \b or \B onto the stack.
+ bool Regexp::ParseState::PushWordBoundary(bool word) {
+- if (flags_ & UCS2) {
+- status_->set_code(kRegexpUnsupported);
+- status_->set_error_arg("\\b or \\B in UCS-2 mode");
+- return false;
+- }
+ if (word)
+ return PushSimpleOp(kRegexpWordBoundary);
+ return PushSimpleOp(kRegexpNoWordBoundary);
+re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
+ bool ret = PushSimpleOp(kRegexpEndText);
+ flags_ = oflags;
+ return ret;
+- }
+- if (flags_ & UCS2) {
+- status_->set_code(kRegexpUnsupported);
+- status_->set_error_arg("multiline $ in UCS-2 mode");
+- return false;
+ }
+ return PushSimpleOp(kRegexpEndLine);
+ }
+==== re2/re2.cc#34 - re2/re2.cc#35 ====
+re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
+ return RE2::ErrorBadUTF8;
+ case re2::kRegexpBadNamedCapture:
+ return RE2::ErrorBadNamedCapture;
+- case re2::kRegexpUnsupported:
+- return RE2::ErrorUnsupported;
+ }
+ return RE2::ErrorInternal;
+ }
+re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
+ break;
+ case RE2::Options::EncodingLatin1:
+ flags |= Regexp::Latin1;
+- break;
+- case RE2::Options::EncodingUCS2:
+- flags |= Regexp::UCS2;
+ break;
+ }
+
+==== re2/re2.h#36 - re2/re2.h#37 ====
+re2/re2.h#36:246,252 - re2/re2.h#37:246,251
+ ErrorBadUTF8, // invalid UTF-8 in regexp
+ ErrorBadNamedCapture, // bad named capture group
+ ErrorPatternTooLarge, // pattern too large (compile failed)
+- ErrorUnsupported, // unsupported feature (in UCS-2 mode)
+ };
+
+ // Predefined common options.
+re2/re2.h#36:570,576 - re2/re2.h#37:569,574
+
+ enum Encoding {
+ EncodingUTF8 = 1,
+- EncodingUCS2, // 16-bit Unicode 0-FFFF only
+ EncodingLatin1
+ };
+
+==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
+re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
+ // the regexp that remains after the prefix. The prefix might
+ // be ASCII case-insensitive.
+ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+- // Don't even bother for UCS-2; it's time to throw that code away.
+- if (parse_flags_ & UCS2)
+- return false;
+-
+ // No need for a walker: the regexp must be of the form
+ // 1. some number of ^ anchors
+ // 2. a literal char or string
+==== re2/regexp.h#20 - re2/regexp.h#21 ====
+re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
+ kRegexpBadPerlOp, // bad perl operator
+ kRegexpBadUTF8, // invalid UTF-8 in regexp
+ kRegexpBadNamedCapture, // bad named capture
+- kRegexpUnsupported, // unsupported operator
+ };
+
+ // Error status for certain operations.
+re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
+ // \Q and \E to disable/enable metacharacters
+ // (?P<name>expr) for named captures
+ // \C to match any single byte
+- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.
+- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
++ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
+ // and \P{Han} for its negation.
+- NeverNL = 1<<12, // Never match NL, even if the regexp mentions
++ NeverNL = 1<<11, // Never match NL, even if the regexp mentions
+ // it explicitly.
+
+ // As close to Perl as we can get.
+==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
+re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
+ cap_[0] = p;
+ if (Visit(prog_->start(), p)) // Match must be leftmost; done.
+ return true;
+- if (prog_->flags() & Regexp::UCS2)
+- p++;
+ }
+ return false;
+ }
+==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
+re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
+ static ParseMode parse_modes[] = {
+ { single_line, "single-line" },
+ { single_line|Regexp::Latin1, "single-line, latin1" },
+- { single_line|Regexp::UCS2, "single-line, ucs2" },
+ { multi_line, "multiline" },
+ { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
+ { multi_line|Regexp::Latin1, "multiline, latin1" },
+- { multi_line|Regexp::UCS2, "multiline, ucs2" },
+ };
+
+ static string FormatMode(Regexp::ParseFlags flags) {
+re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
+ RegexpStatus status;
+ regexp_ = Regexp::Parse(regexp_str, flags, &status);
+ if (regexp_ == NULL) {
+- if (status.code() != kRegexpUnsupported) {
+- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+- << " mode: " << FormatMode(flags);
+- error_ = true;
+- }
++ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
++ << " mode: " << FormatMode(flags);
++ error_ = true;
+ return;
+ }
+ prog_ = regexp_->CompileToProg(0);
+re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
+ RE2::Options options;
+ if (flags & Regexp::Latin1)
+ options.set_encoding(RE2::Options::EncodingLatin1);
+- else if (flags & Regexp::UCS2)
+- options.set_encoding(RE2::Options::EncodingUCS2);
+ if (kind_ == Prog::kLongestMatch)
+ options.set_longest_match(true);
+ re2_ = new RE2(re, options);
+re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
+ delete re2_;
+ }
+
+- // Converts UTF-8 string in text into UCS-2 string in new_text.
+- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
+- const char* p = text.begin();
+- const char* ep = text.end();
+- uint16* q = new uint16[ep - p];
+- uint16* q0 = q;
+-
+- int n;
+- Rune r;
+- for (; p < ep; p += n) {
+- if (!fullrune(p, ep - p)) {
+- delete[] q0;
+- return false;
+- }
+- n = chartorune(&r, p);
+- if (r > 0xFFFF) {
+- delete[] q0;
+- return false;
+- }
+- *q++ = r;
+- }
+- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
+- return true;
+- }
+-
+- // Rewrites *sp from being a pointer into text8 (UTF-8)
+- // to being a pointer into text16 (equivalent text but in UCS-2).
+- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
+- StringPiece *sp) {
+- if (sp->begin() == NULL && text8.begin() != NULL)
+- return;
+-
+- int nrune = 0;
+- int n;
+- Rune r;
+- const char* p = text8.begin();
+- const char* ep = text8.end();
+- const char* spbegin = NULL;
+- const char* spend = NULL;
+- for (;;) {
+- if (p == sp->begin())
+- spbegin = text16.begin() + sizeof(uint16)*nrune;
+- if (p == sp->end())
+- spend = text16.begin() + sizeof(uint16)*nrune;
+- if (p >= ep)
+- break;
+- n = chartorune(&r, p);
+- p += n;
+- nrune++;
+- }
+- if (spbegin == NULL || spend == NULL) {
+- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
+- << CEscape(text8) << " "
+- << (int)(sp->begin() - text8.begin()) << " "
+- << (int)(sp->end() - text8.begin());
+- }
+- *sp = StringPiece(spbegin, spend - spbegin);
+- }
+-
+- // Rewrites *sp from begin a pointer into text16 (UCS-2)
+- // to being a pointer into text8 (equivalent text but in UTF-8).
+- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
+- StringPiece* sp) {
+- if (sp->begin() == NULL)
+- return;
+-
+- int nrune = 0;
+- int n;
+- Rune r;
+- const char* p = text8.begin();
+- const char* ep = text8.end();
+- const char* spbegin = NULL;
+- const char* spend = NULL;
+- for (;;) {
+- if (nrune == (sp->begin() - text16.begin())/2)
+- spbegin = p;
+- if (nrune == (sp->end() - text16.begin())/2)
+- spend = p;
+- if (p >= ep)
+- break;
+- n = chartorune(&r, p);
+- p += n;
+- nrune++;
+- }
+- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
+- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
+- << CEscape(text16) << " "
+- << (int)(sp->begin() - text16.begin()) << " "
+- << (int)(sp->end() - text16.begin());
+- }
+- *sp = StringPiece(spbegin, spend - spbegin);
+- }
+-
+ // Runs a single search using the named engine type.
+ // This interface hides all the irregularities of the various
+ // engine interfaces from the rest of this file.
+re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
+
+ StringPiece text = orig_text;
+ StringPiece context = orig_context;
+- bool ucs2 = false;
+
+- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
+- if (!ConvertUTF8ToUCS2(orig_context, &context)) {
+- result->skipped = true;
+- return;
+- }
+-
+- // Rewrite context to refer to new text.
+- AdjustUTF8ToUCS2(orig_context, context, &text);
+- ucs2 = true;
+- }
+-
+ switch (type) {
+ default:
+ LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
+ }
+ }
+
+- // If we did UCS-2 matching, rewrite the matches to refer
+- // to the original UTF-8 text.
+- if (ucs2) {
+- if (result->matched) {
+- if (result->have_submatch0) {
+- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
+- } else if (result->have_submatch) {
+- for (int i = 0; i < nsubmatch; i++) {
+- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
+- }
+- }
+- }
+- delete[] context.begin();
+- }
+-
+ if (!result->matched)
+ memset(result->submatch, 0, sizeof result->submatch);
+ }
+re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
+ return true;
+ }
+
+- // Check whether text uses only Unicode points <= 0xFFFF
+- // (in the BMP).
+- static bool IsBMP(const StringPiece& text) {
+- const char* p = text.begin();
+- const char* ep = text.end();
+- while (p < ep) {
+- if (!fullrune(p, ep - p))
+- return false;
+- Rune r;
+- p += chartorune(&r, p);
+- if (r > 0xFFFF)
+- return false;
+- }
+- return true;
+- }
+-
+ // Runs a single test.
+ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor) {
+re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
+ Result correct;
+ RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+ if (correct.skipped) {
+- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode
++ if (regexp_ == NULL)
+ return true;
+ LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+ << " " << FormatMode(flags_);
diff --git a/third_party/re2/src/util/logging.h b/third_party/re2/src/util/logging.h
new file mode 100644
index 000000000..946962b39
--- /dev/null
+++ b/third_party/re2/src/util/logging.h
@@ -0,0 +1,109 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef UTIL_LOGGING_H_
+#define UTIL_LOGGING_H_
+
+// Simplified version of Google's logging.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ostream>
+#include <sstream>
+
+#include "absl/base/attributes.h"
+
+// Debug-only checking: these are plain assert()s, so defining NDEBUG removes them.
+#define DCHECK(condition) assert(condition)
+#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
+#define DCHECK_NE(val1, val2) assert((val1) != (val2))
+#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
+#define DCHECK_LT(val1, val2) assert((val1) < (val2))
+#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
+#define DCHECK_GT(val1, val2) assert((val1) > (val2))
+
+// Always-on checking: a false condition streams "Check failed: <x>" into a LogMessageFatal.
+#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
+#define CHECK_LT(x, y) CHECK((x) < (y))
+#define CHECK_GT(x, y) CHECK((x) > (y))
+#define CHECK_LE(x, y) CHECK((x) <= (y))
+#define CHECK_GE(x, y) CHECK((x) >= (y))
+#define CHECK_EQ(x, y) CHECK((x) == (y))
+#define CHECK_NE(x, y) CHECK((x) != (y))
+
+#define LOG_INFO LogMessage(__FILE__, __LINE__)
+#define LOG_WARNING LogMessage(__FILE__, __LINE__)
+#define LOG_ERROR LogMessage(__FILE__, __LINE__)
+#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
+#define LOG_QFATAL LOG_FATAL
+
+// It seems that one of the Windows header files defines ERROR as 0, so LOG(ERROR) pastes to LOG_0.
+#ifdef _WIN32
+#define LOG_0 LOG_INFO
+#endif
+
+#ifdef NDEBUG  // LOG(DFATAL) is fatal only when NDEBUG is not defined.
+#define LOG_DFATAL LOG_ERROR
+#else
+#define LOG_DFATAL LOG_FATAL
+#endif
+
+#define LOG(severity) LOG_ ## severity.stream()  // e.g. LOG(ERROR) -> LOG_ERROR.stream()
+
+#define VLOG(x) if((x)>0){}else LOG_INFO.stream()  // Logs only when x <= 0.
+
+class LogMessage {
+ public:
+  // Begins the message with a "<file>:<line>: " prefix.
+  LogMessage(const char* file, int line) : flushed_(false) {
+    str_ << file << ":" << line << ": ";
+  }
+  // Terminates the message with a newline and writes it to stderr.
+  void Flush() {
+    str_ << "\n";
+    const std::string text = str_.str();
+    // The comparison exists only so gcc sees fwrite's result being used.
+    if (fwrite(text.data(), 1, text.size(), stderr) < text.size()) {}
+    flushed_ = true;
+  }
+  // Writes the message on destruction unless Flush() has already run.
+  ~LogMessage() {
+    if (!flushed_) Flush();
+  }
+  std::ostream& stream() { return str_; }  // Callers append the text here.
+
+ private:
+  bool flushed_;            // Set once Flush() has written the message.
+  std::ostringstream str_;  // Accumulates the message text.
+
+  LogMessage(const LogMessage&) = delete;
+  LogMessage& operator=(const LogMessage&) = delete;
+};
+
+// Silence "destructor never returns" warning for ~LogMessageFatal().
+// Since this is a header file, push and then pop to limit the scope.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4722)
+#endif
+
+class LogMessageFatal : public LogMessage {  // Logs its message, then aborts.
+ public:
+ LogMessageFatal(const char* file, int line)
+ : LogMessage(file, line) {}
+ ABSL_ATTRIBUTE_NORETURN ~LogMessageFatal() {
+ Flush();  // Write the message now, because abort() ends the process here.
+ abort();
+ }
+ private:
+ LogMessageFatal(const LogMessageFatal&) = delete;
+ LogMessageFatal& operator=(const LogMessageFatal&) = delete;
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // UTIL_LOGGING_H_
diff --git a/third_party/re2/src/util/malloc_counter.h b/third_party/re2/src/util/malloc_counter.h
new file mode 100644
index 000000000..81b564ff9
--- /dev/null
+++ b/third_party/re2/src/util/malloc_counter.h
@@ -0,0 +1,19 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef UTIL_MALLOC_COUNTER_H_
+#define UTIL_MALLOC_COUNTER_H_
+
+namespace testing {
+class MallocCounter {  // Stub: counts nothing, so every query below reports 0.
+ public:
+ MallocCounter(int x) {}  // The argument is ignored.
+ static const int THIS_THREAD_ONLY = 0;
+ long long HeapGrowth() { return 0; }
+ long long PeakHeapGrowth() { return 0; }
+ void Reset() {}
+};
+} // namespace testing
+
+#endif // UTIL_MALLOC_COUNTER_H_
diff --git a/third_party/re2/src/util/pcre.cc b/third_party/re2/src/util/pcre.cc
new file mode 100644
index 000000000..f54cb28f8
--- /dev/null
+++ b/third_party/re2/src/util/pcre.cc
@@ -0,0 +1,956 @@
+// Copyright 2003-2009 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is a variant of PCRE's pcrecpp.cc, originally written at Google.
+// The main changes are the addition of the HitLimit method and
+// compilation as PCRE in namespace re2.
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "util/logging.h"
+#include "util/pcre.h"
+
+// Silence warnings about the wacky formatting in the operator() functions.
+#if !defined(__clang__) && defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wmisleading-indentation"
+#endif
+
+#define PCREPORT(level) LOG(level)
+
+// Default PCRE limits.
+// Defaults chosen to allow a plausible amount of CPU and
+// not exceed main thread stacks. Note that other threads
+// often have smaller stacks, and therefore tightening
+// regexp_stack_limit may frequently be necessary.
+ABSL_FLAG(int, regexp_stack_limit, 256 << 10,
+ "default PCRE stack limit (bytes)");
+ABSL_FLAG(int, regexp_match_limit, 1000000,
+ "default PCRE match limit (function calls)");
+
+#ifndef USEPCRE
+
+// Without USEPCRE, fake just enough of the PCRE API to allow this file to build. :)
+
+struct pcre_extra {
+ int flags;
+ int match_limit;
+ int match_limit_recursion;
+};
+
+#define PCRE_EXTRA_MATCH_LIMIT 0
+#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
+#define PCRE_ANCHORED 0
+#define PCRE_NOTEMPTY 0
+#define PCRE_ERROR_NOMATCH 1
+#define PCRE_ERROR_MATCHLIMIT 2
+#define PCRE_ERROR_RECURSIONLIMIT 3
+#define PCRE_INFO_CAPTURECOUNT 0
+
+void pcre_free(void*) {  // No-op: the fake pcre_compile() below never allocates.
+}
+
+pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
+ return NULL;  // Always NULL, which PCRE::Compile() treats as a compile error.
+}
+
+int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
+ return 0;  // Stub: ignores every argument.
+}
+
+int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
+ return 0;  // Stub: ignores every argument and writes nothing.
+}
+
+#endif
+
+namespace re2 {
+
+// Maximum number of args we can set
+static const int kMaxArgs = 16;
+static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
+
+// Approximate size of a recursive invocation of PCRE's
+// internal "match()" frame. This varies depending on the
+// compiler and architecture, of course, so the constant is
+// just a conservative estimate. To find the exact number,
+// run regexp_unittest with --regexp_stack_limit=0 under
+// a debugger and look at the frames when it crashes.
+// The exact frame size was 656 in production on 2008/02/03.
+static const int kPCREFrameSize = 700;
+
+// Special name for missing C++ arguments.
+PCRE::Arg PCRE::no_more_args((void*)NULL);
+
+const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
+const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
+const PCRE::ConsumeFunctor PCRE::Consume = { };
+const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
+
+// If a regular expression has no error, its error_ field points here
+static const std::string empty_string;
+
+void PCRE::Init(const char* pattern, Option options, int match_limit,
+                int stack_limit, bool report_errors) {
+  // Record the caller's settings.
+  pattern_ = pattern;
+  options_ = options;
+  match_limit_ = match_limit;
+  stack_limit_ = stack_limit;
+  report_errors_ = report_errors;
+  // Reset state; error_ points at empty_string until an error is recorded.
+  hit_limit_ = false;
+  error_ = &empty_string;
+  re_full_ = NULL;
+  re_partial_ = NULL;
+  if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
+    error_ = new std::string("illegal regexp option");
+    PCREPORT(ERROR)
+        << "Error compiling '" << pattern << "': illegal regexp option";
+    return;
+  }
+  // The anchored program is built only if the unanchored one compiled.
+  re_partial_ = Compile(UNANCHORED);
+  if (re_partial_ != NULL) re_full_ = Compile(ANCHOR_BOTH);
+}
+
+PCRE::PCRE(const char* pattern) {
+ Init(pattern, None, /*match_limit=*/0, /*stack_limit=*/0, /*report_errors=*/true);
+}
+PCRE::PCRE(const char* pattern, Option option) {
+ Init(pattern, option, /*match_limit=*/0, /*stack_limit=*/0, /*report_errors=*/true);
+}
+PCRE::PCRE(const std::string& pattern) {
+ Init(pattern.c_str(), None, /*match_limit=*/0, /*stack_limit=*/0, /*report_errors=*/true);
+}
+PCRE::PCRE(const std::string& pattern, Option option) {
+ Init(pattern.c_str(), option, /*match_limit=*/0, /*stack_limit=*/0, /*report_errors=*/true);
+}
+PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) {
+ Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
+ re_option.stack_limit(), re_option.report_errors());  // All settings come from re_option.
+}
+
+PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
+ Init(pattern, re_option.option(), re_option.match_limit(),
+ re_option.stack_limit(), re_option.report_errors());  // All settings come from re_option.
+}
+
+PCRE::~PCRE() {
+ if (re_full_ != NULL) pcre_free(re_full_);
+ if (re_partial_ != NULL) pcre_free(re_partial_);
+ if (error_ != &empty_string) delete error_;  // Owned only when it is not the shared empty_string.
+}
+
+pcre* PCRE::Compile(Anchor anchor) {
+  // Compiles pattern_ for the requested anchoring and returns the program,
+  // or NULL after recording and logging the error.
+  //
+  // At runtime pcre only offers an option for anchoring at the beginning
+  // of a string, so anchoring at the end has to be compiled in:
+  //   UNANCHORED    Compile the original pattern, and use
+  //                 a pcre unanchored match.
+  //   ANCHOR_START  Compile the original pattern, and use
+  //                 a pcre anchored match.
+  //   ANCHOR_BOTH   Tack a "\z" to the end of the original pattern
+  //                 and use a pcre anchored match.
+
+  const char* source = pattern_.c_str();
+  std::string wrapped;
+  if (anchor == ANCHOR_BOTH) {
+    // Parenthesize the pattern with a non-counting group first so that
+    // the '\z' applies to all top-level alternatives in the regexp.
+    wrapped = "(?:";
+    wrapped += pattern_;
+    wrapped += ")\\z";
+    source = wrapped.c_str();
+  }
+
+  const char* error = "";
+  int eoffset;
+  pcre* re = pcre_compile(source, (options_ & EnabledCompileOptions),
+                          &error, &eoffset, NULL);
+  if (re != NULL) return re;
+
+  // Compilation failed: record the message unless an earlier error is
+  // already stored in error_, and log it either way.
+  if (error_ == &empty_string) error_ = new std::string(error);
+  PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
+  return re;
+}
+
+/***** Convenience interfaces *****/
+
+bool PCRE::FullMatchFunctor::operator()(
+ absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
+ const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+ const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+ const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
+ const Arg* args[kMaxArgs];  // Supplied arguments, in order.
+ int n = 0;  // Count of arguments seen before the first no_more_args.
+ if (&a0 == &no_more_args) goto done; args[n++] = &a0;
+ if (&a1 == &no_more_args) goto done; args[n++] = &a1;
+ if (&a2 == &no_more_args) goto done; args[n++] = &a2;
+ if (&a3 == &no_more_args) goto done; args[n++] = &a3;
+ if (&a4 == &no_more_args) goto done; args[n++] = &a4;
+ if (&a5 == &no_more_args) goto done; args[n++] = &a5;
+ if (&a6 == &no_more_args) goto done; args[n++] = &a6;
+ if (&a7 == &no_more_args) goto done; args[n++] = &a7;
+ if (&a8 == &no_more_args) goto done; args[n++] = &a8;
+ if (&a9 == &no_more_args) goto done; args[n++] = &a9;
+ if (&a10 == &no_more_args) goto done; args[n++] = &a10;
+ if (&a11 == &no_more_args) goto done; args[n++] = &a11;
+ if (&a12 == &no_more_args) goto done; args[n++] = &a12;
+ if (&a13 == &no_more_args) goto done; args[n++] = &a13;
+ if (&a14 == &no_more_args) goto done; args[n++] = &a14;
+ if (&a15 == &no_more_args) goto done; args[n++] = &a15;
+done:  // Reached at the first argument whose address is &no_more_args, or after a15.
+
+ size_t consumed;  // Passed as an out-pointer; not read afterwards here.
+ int vec[kVecSize] = {};  // Zero-initialized array handed to DoMatchImpl.
+ return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
+}
+
+bool PCRE::PartialMatchFunctor::operator()(
+ absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
+ const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+ const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+ const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
+ const Arg* args[kMaxArgs];  // Supplied arguments, in order.
+ int n = 0;  // Count of arguments seen before the first no_more_args.
+ if (&a0 == &no_more_args) goto done; args[n++] = &a0;
+ if (&a1 == &no_more_args) goto done; args[n++] = &a1;
+ if (&a2 == &no_more_args) goto done; args[n++] = &a2;
+ if (&a3 == &no_more_args) goto done; args[n++] = &a3;
+ if (&a4 == &no_more_args) goto done; args[n++] = &a4;
+ if (&a5 == &no_more_args) goto done; args[n++] = &a5;
+ if (&a6 == &no_more_args) goto done; args[n++] = &a6;
+ if (&a7 == &no_more_args) goto done; args[n++] = &a7;
+ if (&a8 == &no_more_args) goto done; args[n++] = &a8;
+ if (&a9 == &no_more_args) goto done; args[n++] = &a9;
+ if (&a10 == &no_more_args) goto done; args[n++] = &a10;
+ if (&a11 == &no_more_args) goto done; args[n++] = &a11;
+ if (&a12 == &no_more_args) goto done; args[n++] = &a12;
+ if (&a13 == &no_more_args) goto done; args[n++] = &a13;
+ if (&a14 == &no_more_args) goto done; args[n++] = &a14;
+ if (&a15 == &no_more_args) goto done; args[n++] = &a15;
+done:  // Reached at the first argument whose address is &no_more_args, or after a15.
+
+ size_t consumed;  // Passed as an out-pointer; not read afterwards here.
+ int vec[kVecSize] = {};  // Zero-initialized array handed to DoMatchImpl.
+ return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
+}
+
+// Matches "pattern" at the start of "*input".  On success the matched
+// prefix is removed from "*input"; on failure "*input" is left alone.
+bool PCRE::ConsumeFunctor::operator()(
+    absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
+    const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+    const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+    const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
+  // Collect the arguments in order, stopping at the first slot that is
+  // the no_more_args sentinel (identified by address).
+  const Arg* const supplied[] = {&a0,  &a1,  &a2,  &a3, &a4,  &a5,
+                                 &a6,  &a7,  &a8,  &a9, &a10, &a11,
+                                 &a12, &a13, &a14, &a15};
+  const Arg* args[kMaxArgs];
+  int n = 0;
+  for (const Arg* candidate : supplied) {
+    if (candidate == &no_more_args) break;
+    args[n++] = candidate;
+  }
+
+  int vec[kVecSize] = {};
+  size_t consumed;
+  const bool matched = pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
+                                           args, n, vec, kVecSize);
+  if (matched)
+    input->remove_prefix(consumed);
+  return matched;
+}
+
+// Searches for "pattern" anywhere in "*input".  On success everything up
+// to the end of the match is removed from "*input"; on failure "*input"
+// is left alone.
+bool PCRE::FindAndConsumeFunctor::operator()(
+    absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
+    const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+    const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+    const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
+  // Walk the parameters in declaration order; the first one that is the
+  // no_more_args sentinel (by address) ends the list.
+  const Arg* const supplied[] = {&a0,  &a1,  &a2,  &a3, &a4,  &a5,
+                                 &a6,  &a7,  &a8,  &a9, &a10, &a11,
+                                 &a12, &a13, &a14, &a15};
+  const Arg* args[kMaxArgs];
+  int n = 0;
+  for (const Arg* candidate : supplied) {
+    if (candidate == &no_more_args) break;
+    args[n++] = candidate;
+  }
+
+  int vec[kVecSize] = {};
+  size_t consumed;
+  const bool matched = pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
+                                           args, n, vec, kVecSize);
+  if (matched)
+    input->remove_prefix(consumed);
+  return matched;
+}
+
+// Replaces the first match of "pattern" in "*str" with the expansion of
+// "rewrite".  Returns false, leaving "*str" untouched, when there is no
+// match or when the rewrite string cannot be expanded.
+bool PCRE::Replace(std::string* str, const PCRE& pattern,
+                   absl::string_view rewrite) {
+  int vec[kVecSize] = {};
+  const int matches =
+      pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
+  if (matches == 0) return false;
+
+  // Build the replacement separately so that a failed rewrite cannot
+  // leave "*str" half-modified.
+  std::string replacement;
+  const bool expanded =
+      pattern.Rewrite(&replacement, rewrite, *str, vec, matches);
+  if (!expanded) return false;
+
+  assert(vec[0] >= 0);
+  assert(vec[1] >= 0);
+  const int match_begin = vec[0];
+  const int match_length = vec[1] - vec[0];
+  str->replace(match_begin, match_length, replacement);
+  return true;
+}
+
+// Replaces successive matches of "pattern" in "*str" with the expansion
+// of "rewrite", scanning left to right and resuming each search at the
+// end of the previous match.  Returns the number of replacements made;
+// when that is zero, "*str" is left untouched.
+int PCRE::GlobalReplace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite) {
+ int count = 0;
+ int vec[kVecSize] = {};
+ // The result is assembled here and swapped into *str at the very end.
+ std::string out;
+ // Offset in *str at which the next match attempt begins.
+ size_t start = 0;
+ bool last_match_was_empty_string = false;
+
+ while (start <= str->size()) {
+ // If the previous match was for the empty string, we shouldn't
+ // just match again: we'll match in the same way and get an
+ // infinite loop. Instead, we do the match in a special way:
+ // anchored -- to force another try at the same position --
+ // and with a flag saying that this time, ignore empty matches.
+ // If this special match returns, that means there's a non-empty
+ // match at this position as well, and we can continue. If not,
+ // we do what perl does, and just advance by one.
+ // Notice that perl prints '@@@' for this;
+ // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
+ int matches;
+ if (last_match_was_empty_string) {
+ matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
+ vec, kVecSize);
+ if (matches <= 0) {
+ // No non-empty match here: copy one input byte through unchanged
+ // (if any remain) and retry from the next position.
+ if (start < str->size())
+ out.push_back((*str)[start]);
+ start++;
+ last_match_was_empty_string = false;
+ continue;
+ }
+ } else {
+ matches = pattern.TryMatch(*str, start, UNANCHORED, true,
+ vec, kVecSize);
+ if (matches <= 0)
+ break;
+ }
+ size_t matchstart = vec[0], matchend = vec[1];
+ assert(matchstart >= start);
+ assert(matchend >= matchstart);
+
+ // Copy the unmatched text before the match, then the expansion.
+ out.append(*str, start, matchstart - start);
+ // NOTE(review): the result of Rewrite() is not checked here, unlike
+ // in Replace(); a failed rewrite is still counted as a replacement.
+ pattern.Rewrite(&out, rewrite, *str, vec, matches);
+ start = matchend;
+ count++;
+ last_match_was_empty_string = (matchstart == matchend);
+ }
+
+ if (count == 0)
+ return 0;
+
+ // Carry over whatever follows the final match.
+ if (start < str->size())
+ out.append(*str, start, str->size() - start);
+ using std::swap;
+ swap(out, *str);
+ return count;
+}
+
+// Finds the first match of "pattern" in "text" and stores the expansion
+// of "rewrite" in "*out".  "*out" is cleared only once a match has been
+// found; without a match it is left untouched and false is returned.
+bool PCRE::Extract(absl::string_view text, const PCRE& pattern,
+                   absl::string_view rewrite, std::string* out) {
+  int vec[kVecSize] = {};
+  const int matches =
+      pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
+  const bool found = (matches != 0);
+  if (found) {
+    out->clear();
+    return pattern.Rewrite(out, rewrite, text, vec, matches);
+  }
+  return false;
+}
+
+// Returns a copy of "unquoted" with a backslash placed in front of every
+// ASCII byte outside [A-Za-z_0-9].  Bytes with the high bit set are
+// copied as-is, and a NUL byte is emitted as "\x00".
+std::string PCRE::QuoteMeta(absl::string_view unquoted) {
+  std::string result;
+  result.reserve(unquoted.size() * 2);
+
+  // Escaping a character that has no special meaning in a regular
+  // expression is legal, so everything that is not a word character gets
+  // escaped.  (This makes the function identical to the perl function of
+  // the same name except for the null-character special case; see
+  // `perldoc -f quotemeta`.)
+  for (const char c : unquoted) {
+    // Explicit range checks rather than 'isalnum': per the original
+    // benchmark note, isalnum raised the time from 32ns to 58ns.
+    const bool is_word_char = (c >= 'a' && c <= 'z') ||
+                              (c >= 'A' && c <= 'Z') ||
+                              (c >= '0' && c <= '9') ||
+                              c == '_';
+    // Part of a UTF8 or Latin1 character: must be copied without
+    // escaping.  Experimentally this is what works correctly with the
+    // regexp library.
+    const bool has_high_bit = (c & 128) != 0;
+    if (is_word_char || has_high_bit) {
+      result += c;
+      continue;
+    }
+    if (c == '\0') {  // Special handling for null chars.
+      // Can't use "\\0" since the next character might be a digit.
+      result += "\\x00";
+      continue;
+    }
+    result += '\\';
+    result += c;
+  }
+
+  return result;
+}
+
+/***** Actual matching and rewriting code *****/
+
+// Reports whether hit_limit_ has been set since it was last cleared.
+bool PCRE::HitLimit() {
+  const bool limit_was_hit = (hit_limit_ != 0);
+  return limit_was_hit;
+}
+
+// Resets hit_limit_ so that a later HitLimit() call reports only limits
+// hit after this point.
+void PCRE::ClearHitLimit() {
+  hit_limit_ = 0;
+}
+
+// Runs pcre_exec() over "text", starting the search at offset "startpos".
+//
+// anchor:   ANCHOR_BOTH selects re_full_, anything else re_partial_;
+//           any value other than UNANCHORED also sets PCRE_ANCHORED.
+// empty_ok: when false, PCRE_NOTEMPTY is set so empty matches are rejected.
+// vec:      output vector of "vecsize" ints handed straight to pcre_exec().
+//
+// Returns 0 when there is no match or on any error (invalid regexp, a
+// limit being exceeded, or another negative pcre_exec() code).  Otherwise
+// returns the positive value from pcre_exec(), or vecsize / 2 when
+// pcre_exec() returned 0.  Never returns a negative value.
+int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
+ bool empty_ok, int* vec, int vecsize) const {
+ pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
+ if (re == NULL) {
+ PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
+ return 0;
+ }
+
+ // A positive per-object limit takes precedence; otherwise fall back to
+ // the corresponding flag value.
+ int match_limit = match_limit_;
+ if (match_limit <= 0) {
+ match_limit = absl::GetFlag(FLAGS_regexp_match_limit);
+ }
+
+ int stack_limit = stack_limit_;
+ if (stack_limit <= 0) {
+ stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit);
+ }
+
+ // Only limits that ended up positive are passed on to pcre_exec().
+ pcre_extra extra = { 0 };
+ if (match_limit > 0) {
+ extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
+ extra.match_limit = match_limit;
+ }
+ if (stack_limit > 0) {
+ extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
+ // stack_limit is divided by kPCREFrameSize to obtain a recursion depth.
+ extra.match_limit_recursion = stack_limit / kPCREFrameSize;
+ }
+
+ int options = 0;
+ if (anchor != UNANCHORED)
+ options |= PCRE_ANCHORED;
+ if (!empty_ok)
+ options |= PCRE_NOTEMPTY;
+
+ // A null text.data() is replaced by "" so pcre_exec() never receives a
+ // null subject pointer.
+ int rc = pcre_exec(re, // The regular expression object
+ &extra,
+ (text.data() == NULL) ? "" : text.data(),
+ static_cast<int>(text.size()),
+ static_cast<int>(startpos),
+ options,
+ vec,
+ vecsize);
+
+ // Handle errors
+ if (rc == 0) {
+ // pcre_exec() returns 0 as a special case when the number of
+ // capturing subpatterns exceeds the size of the vector.
+ // When this happens, there is a match and the output vector
+ // is filled, but we miss out on the positions of the extra subpatterns.
+ rc = vecsize / 2;
+ } else if (rc < 0) {
+ switch (rc) {
+ case PCRE_ERROR_NOMATCH:
+ return 0;
+ case PCRE_ERROR_MATCHLIMIT:
+ // Writing to hit_limit is not safe if multiple threads
+ // are using the PCRE, but the flag is only intended
+ // for use by unit tests anyway, so we let it go.
+ hit_limit_ = true;
+ PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
+ << " when matching '" << pattern_ << "'"
+ << " against text that is " << text.size() << " bytes.";
+ return 0;
+ case PCRE_ERROR_RECURSIONLIMIT:
+ // See comment about hit_limit above.
+ hit_limit_ = true;
+ PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
+ << " when matching '" << pattern_ << "'"
+ << " against text that is " << text.size() << " bytes.";
+ return 0;
+ default:
+ // There are other return codes from pcre.h :
+ // PCRE_ERROR_NULL (-2)
+ // PCRE_ERROR_BADOPTION (-3)
+ // PCRE_ERROR_BADMAGIC (-4)
+ // PCRE_ERROR_UNKNOWN_NODE (-5)
+ // PCRE_ERROR_NOMEMORY (-6)
+ // PCRE_ERROR_NOSUBSTRING (-7)
+ // ...
+ PCREPORT(ERROR) << "Unexpected return code: " << rc
+ << " when matching '" << pattern_ << "'"
+ << ", re=" << re
+ << ", text=" << text
+ << ", vec=" << vec
+ << ", vecsize=" << vecsize;
+ return 0;
+ }
+ }
+
+ return rc;
+}
+
+// Matches "text" against this regexp and, on success, stores the end
+// offset of the overall match in "*consumed" and feeds the first "n"
+// captured groups to args[0..n-1] for parsing.  "vec" is the caller's
+// scratch vector of "vecsize" ints.  Returns false when the regexp has
+// fewer than "n" capturing groups, when nothing matches, or when any
+// Arg rejects its captured text.
+bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
+                       const Arg* const* args, int n, int* vec,
+                       int vecsize) const {
+  assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
+
+  // RE has fewer capturing groups than number of Arg pointers passed in.
+  if (NumberOfCapturingGroups() < n) return false;
+
+  const int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
+  assert(matches >= 0);  // TryMatch never returns negatives
+  if (matches == 0) return false;
+
+  *consumed = vec[1];
+
+  // Without any Arg to fill we are not interested in the captures.
+  const bool wants_captures = (n != 0 && args != NULL);
+  if (!wants_captures) return true;
+
+  // If we got here, we must have matched the whole pattern.
+  // We do not need (can not do) any more checks on the value of 'matches'
+  // here -- see the comment for TryMatch.
+  for (int group = 1; group <= n; ++group) {
+    const int start = vec[2 * group];
+    const int limit = vec[2 * group + 1];
+
+    // An unmatched subexpression has start == -1.  Hand Parse() a null
+    // pointer in that case instead of computing text.data() - 1, which
+    // would be undefined behavior (and was a longstanding bug).
+    const char* addr = (start == -1) ? nullptr : text.data() + start;
+
+    if (!args[group - 1]->Parse(addr, limit - start)) {
+      // TODO: Should we indicate what the error was?
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Variant of the matching entry points that takes the Arg pointers as an
+// array of "n" elements.  The scratch vector is sized to the argument
+// count and lives on the heap for the duration of the call.
+bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
+                   const Arg* const args[], int n) const {
+  assert(n >= 0);
+  const int vecsize = (1 + n) * 3;  // results + PCRE workspace
+                                    // (as for kVecSize)
+  // Value-initialize the vector so that every slot starts at zero, just
+  // like the "int vec[kVecSize] = {}" buffers used by the other entry
+  // points in this file.  DoMatchImpl reads vec[] after TryMatch returns,
+  // so no slot should ever hold an indeterminate value.
+  int* vec = new int[vecsize]();
+  bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
+  delete[] vec;
+  return b;
+}
+
+// Appends to "*out" the expansion of "rewrite", in which "\N" (N a single
+// digit) stands for the text of capture group N and "\\" stands for one
+// literal backslash.  "vec" holds the match offsets for "text" and
+// "veclen" is the number of groups (including group 0) it describes.
+//
+// Returns false after logging an error if "rewrite" is malformed (a
+// backslash followed by anything other than a digit or a backslash,
+// including a backslash at the very end) or if it names a group the
+// regexp does not have.  "*out" may already have been partly written
+// when false is returned.
+bool PCRE::Rewrite(std::string* out, absl::string_view rewrite,
+                   absl::string_view text, int* vec, int veclen) const {
+  const int number_of_capturing_groups = NumberOfCapturingGroups();
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    if (*s != '\\') {
+      out->push_back(*s);
+      continue;
+    }
+
+    // A backslash must be followed by another character.  Check before
+    // dereferencing: "rewrite" is a string_view, so there need not be a
+    // readable byte (let alone a NUL) at "end".
+    if (++s == end) {
+      PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite;
+      return false;
+    }
+
+    const char c = *s;
+    if (c == '\\') {
+      out->push_back('\\');
+    } else if (c >= '0' && c <= '9') {
+      // Range check instead of isdigit(): "c" may be negative where char
+      // is signed, and passing that to isdigit() is undefined behavior.
+      const int n = c - '0';
+      if (n >= veclen) {
+        if (n > number_of_capturing_groups) {
+          PCREPORT(ERROR) << "requested group " << n
+                          << " in regexp " << rewrite;
+          return false;
+        }
+        // Unmatched optional capturing group: treat its value as the
+        // empty string, i.e. nothing to append.  vec[] holds no entry
+        // for it, so do not read one.
+        continue;
+      }
+      const int start = vec[2 * n];
+      if (start >= 0)
+        out->append(text.data() + start, vec[2 * n + 1] - start);
+    } else {
+      PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Checks that "rewrite" is a well-formed rewrite string for this regexp:
+// every backslash is followed by a digit or another backslash, and no
+// "\N" refers to a group number above NumberOfCapturingGroups().
+// Returns true if so; otherwise stores a description of the problem in
+// "*error" and returns false.
+bool PCRE::CheckRewriteString(absl::string_view rewrite,
+                              std::string* error) const {
+  int max_token = -1;
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    if (*s != '\\') {
+      continue;
+    }
+    if (++s == end) {
+      *error = "Rewrite schema error: '\\' not allowed at end.";
+      return false;
+    }
+    const char c = *s;
+    if (c == '\\') {
+      continue;
+    }
+    // Range check instead of isdigit(): "c" may be negative where char is
+    // signed (e.g. a UTF-8 byte), and passing that to isdigit() is
+    // undefined behavior.
+    if (c < '0' || c > '9') {
+      *error = "Rewrite schema error: "
+               "'\\' must be followed by a digit or '\\'.";
+      return false;
+    }
+    const int n = (c - '0');
+    if (max_token < n) {
+      max_token = n;
+    }
+  }
+
+  if (max_token > NumberOfCapturingGroups()) {
+    *error = absl::StrFormat(
+        "Rewrite schema requests %d matches, but the regexp only has %d "
+        "parenthesized subexpressions.",
+        max_token, NumberOfCapturingGroups());
+    return false;
+  }
+  return true;
+}
+
+// Reports how many capturing subpatterns the regexp has.  Returns -1
+// when re_partial_ is NULL (the regexp wasn't valid on construction) or
+// when pcre_fullinfo() reports an error.
+int PCRE::NumberOfCapturingGroups() const {
+  if (re_partial_ == NULL) return -1;
+
+  int capture_count;
+  const int rc = pcre_fullinfo(re_partial_,  // The regular expression object
+                               NULL,         // We did not study the pattern
+                               PCRE_INFO_CAPTURECOUNT,
+                               &capture_count);
+  if (rc == 0) return capture_count;
+
+  PCREPORT(ERROR) << "Unexpected return code: " << rc;
+  return -1;
+}
+
+
+/***** Parsers for various types *****/
+
+// Parser used when there is nothing to store into: succeeds only if the
+// destination really is NULL.  The captured text is not examined.
+bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
+  // We fail if somebody asked us to store into a non-NULL void* pointer
+  const bool destination_is_null = (dest == NULL);
+  return destination_is_null;
+}
+
+// Copies the "n" captured bytes at "str" into the std::string that
+// "dest" points to.  A NULL "dest" is accepted and nothing is stored.
+bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
+  if (dest != NULL) {
+    std::string* target = static_cast<std::string*>(dest);
+    target->assign(str, n);
+  }
+  return true;
+}
+
+// Points the absl::string_view behind "dest" at the "n" captured bytes
+// starting at "str" (no copy is made).  A NULL "dest" is accepted and
+// nothing is stored.
+bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) {
+  if (dest != NULL) {
+    absl::string_view* target = static_cast<absl::string_view*>(dest);
+    *target = absl::string_view(str, n);
+  }
+  return true;
+}
+
+// Stores a capture of exactly one byte into the char behind "dest".
+// Any other capture length fails; a NULL "dest" only validates.
+bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
+  const bool single_byte = (n == 1);
+  if (!single_byte) return false;
+  if (dest != NULL) {
+    char* target = static_cast<char*>(dest);
+    *target = str[0];
+  }
+  return true;
+}
+
+// Stores a capture of exactly one byte into the signed char behind
+// "dest".  Any other capture length fails; a NULL "dest" only validates.
+bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
+  const bool single_byte = (n == 1);
+  if (!single_byte) return false;
+  if (dest != NULL) {
+    signed char* target = static_cast<signed char*>(dest);
+    *target = str[0];
+  }
+  return true;
+}
+
+// Stores a capture of exactly one byte into the unsigned char behind
+// "dest".  Any other capture length fails; a NULL "dest" only validates.
+bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
+  const bool single_byte = (n == 1);
+  if (!single_byte) return false;
+  if (dest != NULL) {
+    unsigned char* target = static_cast<unsigned char*>(dest);
+    *target = str[0];
+  }
+  return true;
+}
+
+// Largest number spec that we are willing to parse
+static const int kMaxNumberLength = 32;
+
+// REQUIRES "buf" must have length at least kMaxNumberLength+1
+// REQUIRES "n > 0"
+// Copies "str" into "buf" and null-terminates if necessary.
+// Returns one of:
+//      a. "str" if no termination is needed
+//      b. "buf" if the string was copied and null-terminated
+//      c. "" if the input was invalid and has no hope of being parsed
+//
+// Note that this looks at str[n], the byte right after the "n" bytes
+// being parsed, so that byte must be readable.
+static const char* TerminateNumber(char* buf, const char* str, size_t n) {
+  // The <ctype.h> classifiers require a value representable as unsigned
+  // char (or EOF); a plain char with the high bit set may be negative,
+  // so convert before calling them.
+  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
+    // We are less forgiving than the strtoxxx() routines and do not
+    // allow leading spaces.
+    return "";
+  }
+
+  // See if the character right after the input text may potentially
+  // look like a digit.  If it does, the strtoxxx() routines would keep
+  // reading past the intended end, so the text has to be copied out and
+  // terminated.
+  const unsigned char next = static_cast<unsigned char>(str[n]);
+  const bool next_looks_like_digit = isdigit(next) ||
+                                     ((next >= 'a') && (next <= 'f')) ||
+                                     ((next >= 'A') && (next <= 'F'));
+  if (!next_looks_like_digit) {
+    // We can parse right out of the supplied string, so return it.
+    return str;
+  }
+
+  if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
+  memcpy(buf, str, n);
+  buf[n] = '\0';
+  return buf;
+}
+
+// Parses the "n" bytes at "str" as a long in base "radix" using strtol().
+// Fails on empty input, on text strtol() does not consume completely,
+// and when strtol() sets errno.  With a NULL "dest" the text is only
+// validated.
+bool PCRE::Arg::parse_long_radix(const char* str,
+                                 size_t n,
+                                 void* dest,
+                                 int radix) {
+  if (n == 0) return false;
+
+  char buf[kMaxNumberLength+1];
+  const char* number = TerminateNumber(buf, str, n);
+
+  char* end;
+  errno = 0;
+  const long r = strtol(number, &end, radix);
+  const bool consumed_all = (end == number + n);  // else leftover junk
+  if (!consumed_all || errno != 0) return false;
+
+  if (dest != NULL) *static_cast<long*>(dest) = r;
+  return true;
+}
+
+// Parses the "n" bytes at "str" as an unsigned long in base "radix"
+// using strtoul().  Fails on empty input, on a leading '-', on text
+// strtoul() does not consume completely, and when strtoul() sets errno.
+// With a NULL "dest" the text is only validated.
+bool PCRE::Arg::parse_ulong_radix(const char* str,
+                                  size_t n,
+                                  void* dest,
+                                  int radix) {
+  if (n == 0) return false;
+
+  char buf[kMaxNumberLength+1];
+  const char* number = TerminateNumber(buf, str, n);
+
+  // strtoul() will silently accept negative numbers and parse
+  // them.  This module is more strict and treats them as errors.
+  if (number[0] == '-') return false;
+
+  char* end;
+  errno = 0;
+  const unsigned long r = strtoul(number, &end, radix);
+  const bool consumed_all = (end == number + n);  // else leftover junk
+  if (!consumed_all || errno != 0) return false;
+
+  if (dest != NULL) *static_cast<unsigned long*>(dest) = r;
+  return true;
+}
+
+// Parses the text as a long via parse_long_radix() and accepts it only
+// if the value survives a round trip through short.  With a NULL "dest"
+// the text is only validated.
+bool PCRE::Arg::parse_short_radix(const char* str,
+                                  size_t n,
+                                  void* dest,
+                                  int radix) {
+  long wide;
+  if (!parse_long_radix(str, n, &wide, radix)) return false;  // Could not parse
+
+  const short narrowed = static_cast<short>(wide);
+  if (narrowed != wide) return false;                        // Out of range
+
+  if (dest != NULL) *static_cast<short*>(dest) = narrowed;
+  return true;
+}
+
+// Parses the text as an unsigned long via parse_ulong_radix() and
+// accepts it only if the value survives a round trip through unsigned
+// short.  With a NULL "dest" the text is only validated.
+bool PCRE::Arg::parse_ushort_radix(const char* str,
+                                   size_t n,
+                                   void* dest,
+                                   int radix) {
+  unsigned long wide;
+  if (!parse_ulong_radix(str, n, &wide, radix)) return false;  // Could not parse
+
+  const unsigned short narrowed = static_cast<unsigned short>(wide);
+  if (narrowed != wide) return false;                         // Out of range
+
+  if (dest != NULL) *static_cast<unsigned short*>(dest) = narrowed;
+  return true;
+}
+
+// Parses the text as a long via parse_long_radix() and accepts it only
+// if the value survives a round trip through int.  With a NULL "dest"
+// the text is only validated.
+bool PCRE::Arg::parse_int_radix(const char* str,
+                                size_t n,
+                                void* dest,
+                                int radix) {
+  long wide;
+  if (!parse_long_radix(str, n, &wide, radix)) return false;  // Could not parse
+
+  const int narrowed = static_cast<int>(wide);
+  if (narrowed != wide) return false;                        // Out of range
+
+  if (dest != NULL) *static_cast<int*>(dest) = narrowed;
+  return true;
+}
+
+// Parses the text as an unsigned long via parse_ulong_radix() and
+// accepts it only if the value survives a round trip through unsigned
+// int.  With a NULL "dest" the text is only validated.
+bool PCRE::Arg::parse_uint_radix(const char* str,
+                                 size_t n,
+                                 void* dest,
+                                 int radix) {
+  unsigned long wide;
+  if (!parse_ulong_radix(str, n, &wide, radix)) return false;  // Could not parse
+
+  const unsigned int narrowed = static_cast<unsigned int>(wide);
+  if (narrowed != wide) return false;                         // Out of range
+
+  if (dest != NULL) *static_cast<unsigned int*>(dest) = narrowed;
+  return true;
+}
+
+// Parses the "n" bytes at "str" as a long long in base "radix" using
+// strtoll().  Fails on empty input, on text strtoll() does not consume
+// completely, and when strtoll() sets errno.  With a NULL "dest" the
+// text is only validated.
+bool PCRE::Arg::parse_longlong_radix(const char* str,
+                                     size_t n,
+                                     void* dest,
+                                     int radix) {
+  if (n == 0) return false;
+
+  char buf[kMaxNumberLength+1];
+  const char* number = TerminateNumber(buf, str, n);
+
+  char* end;
+  errno = 0;
+  const long long r = strtoll(number, &end, radix);
+  const bool consumed_all = (end == number + n);  // else leftover junk
+  if (!consumed_all || errno != 0) return false;
+
+  if (dest != NULL) *static_cast<long long*>(dest) = r;
+  return true;
+}
+
+// Parses the "n" bytes at "str" as an unsigned long long in base "radix"
+// using strtoull().  Fails on empty input, on a leading '-', on text
+// strtoull() does not consume completely, and when strtoull() sets
+// errno.  With a NULL "dest" the text is only validated.
+bool PCRE::Arg::parse_ulonglong_radix(const char* str,
+                                      size_t n,
+                                      void* dest,
+                                      int radix) {
+  if (n == 0) return false;
+
+  char buf[kMaxNumberLength+1];
+  const char* number = TerminateNumber(buf, str, n);
+
+  // strtoull() will silently accept negative numbers and parse
+  // them.  This module is more strict and treats them as errors.
+  if (number[0] == '-') return false;
+
+  char* end;
+  errno = 0;
+  const unsigned long long r = strtoull(number, &end, radix);
+  const bool consumed_all = (end == number + n);  // else leftover junk
+  if (!consumed_all || errno != 0) return false;
+
+  if (dest != NULL) *static_cast<unsigned long long*>(dest) = r;
+  return true;
+}
+
+// Shared implementation of the float and double parsers.  Copies the "n"
+// bytes at "str" into a NUL-terminated local buffer, converts them with
+// strtof() (isfloat) or strtod(), and stores the result through "dest"
+// as a float or a double respectively.  Fails on empty input, on input
+// of kMaxLength bytes or more, on text that is not consumed completely,
+// and when the conversion sets errno.  With a NULL "dest" the text is
+// only validated.
+static bool parse_double_float(const char* str, size_t n, bool isfloat,
+                               void* dest) {
+  static const int kMaxLength = 200;
+  if (n == 0 || n >= kMaxLength) return false;
+
+  // Always parse from a private copy so the conversion cannot run past
+  // the captured text.
+  char buf[kMaxLength];
+  memcpy(buf, str, n);
+  buf[n] = '\0';
+
+  char* end;
+  errno = 0;
+  const double r = isfloat ? strtof(buf, &end) : strtod(buf, &end);
+  const bool consumed_all = (end == buf + n);  // else leftover junk
+  if (!consumed_all || errno != 0) return false;
+
+  if (dest == NULL) return true;
+  if (isfloat) {
+    *static_cast<float*>(dest) = static_cast<float>(r);
+  } else {
+    *static_cast<double*>(dest) = r;
+  }
+  return true;
+}
+
+// Parses the captured text as a double; see parse_double_float().
+bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
+  const bool as_float = false;
+  return parse_double_float(str, n, as_float, dest);
+}
+
+// Parses the captured text as a float; see parse_double_float().
+bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
+  const bool as_float = true;
+  return parse_double_float(str, n, as_float, dest);
+}
+
+// For an integer type tag "name", defines four PCRE::Arg parsers that all
+// forward to parse_<name>_radix() with a fixed radix argument:
+//   parse_<name>         -> 10
+//   parse_<name>_hex     -> 16
+//   parse_<name>_octal   -> 8
+//   parse_<name>_cradix  -> 0
+#define DEFINE_INTEGER_PARSER(name) \
+ bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
+ return parse_##name##_radix(str, n, dest, 10); \
+ } \
+ bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
+ return parse_##name##_radix(str, n, dest, 16); \
+ } \
+ bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
+ void* dest) { \
+ return parse_##name##_radix(str, n, dest, 8); \
+ } \
+ bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
+ void* dest) { \
+ return parse_##name##_radix(str, n, dest, 0); \
+ }
+
+// Instantiate the four parsers for every integer type that has a
+// parse_<name>_radix() implementation above.
+DEFINE_INTEGER_PARSER(short);
+DEFINE_INTEGER_PARSER(ushort);
+DEFINE_INTEGER_PARSER(int);
+DEFINE_INTEGER_PARSER(uint);
+DEFINE_INTEGER_PARSER(long);
+DEFINE_INTEGER_PARSER(ulong);
+DEFINE_INTEGER_PARSER(longlong);
+DEFINE_INTEGER_PARSER(ulonglong);
+
+// The macro is only needed for the definitions above.
+#undef DEFINE_INTEGER_PARSER
+
+} // namespace re2
diff --git a/third_party/re2/src/util/pcre.h b/third_party/re2/src/util/pcre.h
new file mode 100644
index 000000000..846f30019
--- /dev/null
+++ b/third_party/re2/src/util/pcre.h
@@ -0,0 +1,671 @@
+// Copyright 2003-2010 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef UTIL_PCRE_H_
+#define UTIL_PCRE_H_
+
+// This is a variant of PCRE's pcrecpp.h, originally written at Google.
+// The main changes are the addition of the HitLimit method and
+// compilation as PCRE in namespace re2.
+
+// C++ interface to the pcre regular-expression library. PCRE supports
+// Perl-style regular expressions (with extensions like \d, \w, \s,
+// ...).
+//
+// -----------------------------------------------------------------------
+// REGEXP SYNTAX:
+//
+// This module uses the pcre library and hence supports its syntax
+// for regular expressions:
+//
+// http://www.google.com/search?q=pcre
+//
+// The syntax is pretty similar to Perl's. For those not familiar
+// with Perl's regular expressions, here are some examples of the most
+// commonly used extensions:
+//
+// "hello (\\w+) world" -- \w matches a "word" character
+// "version (\\d+)" -- \d matches a digit
+// "hello\\s+world" -- \s matches any whitespace character
+// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary
+// "(?i)hello" -- (?i) turns on case-insensitive matching
+// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
+//
+// -----------------------------------------------------------------------
+// MATCHING INTERFACE:
+//
+// The "FullMatch" operation checks that supplied text matches a
+// supplied pattern exactly.
+//
+// Example: successful match
+// CHECK(PCRE::FullMatch("hello", "h.*o"));
+//
+// Example: unsuccessful match (requires full match):
+// CHECK(!PCRE::FullMatch("hello", "e"));
+//
+// -----------------------------------------------------------------------
+// UTF-8 AND THE MATCHING INTERFACE:
+//
+// By default, pattern and text are plain text, one byte per character.
+// The UTF8 flag, passed to the constructor, causes both pattern
+// and string to be treated as UTF-8 text, still a byte stream but
+// potentially multiple bytes per character. In practice, the text
+// is likelier to be UTF-8 than the pattern, but the match returned
+// may depend on the UTF8 flag, so always use it when matching
+// UTF8 text. E.g., "." will match one byte normally but with UTF8
+// set may match up to three bytes of a multi-byte character.
+//
+// Example:
+// PCRE re(utf8_pattern, PCRE::UTF8);
+// CHECK(PCRE::FullMatch(utf8_string, re));
+//
+// -----------------------------------------------------------------------
+// MATCHING WITH SUBSTRING EXTRACTION:
+//
+// You can supply extra pointer arguments to extract matched substrings.
+//
+// Example: extracts "ruby" into "s" and 1234 into "i"
+// int i;
+// std::string s;
+// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
+//
+// Example: fails because string cannot be stored in integer
+// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i));
+//
+// Example: fails because there aren't enough sub-patterns:
+// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s));
+//
+// Example: does not try to extract any extra sub-patterns
+// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
+//
+// Example: does not try to extract into NULL
+// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
+//
+// Example: integer overflow causes failure
+// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
+//
+// -----------------------------------------------------------------------
+// PARTIAL MATCHES
+//
+// You can use the "PartialMatch" operation when you want the pattern
+// to match any substring of the text.
+//
+// Example: simple search for a string:
+// CHECK(PCRE::PartialMatch("hello", "ell"));
+//
+// Example: find first number in a string
+// int number;
+// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number));
+// CHECK_EQ(number, 100);
+//
+// -----------------------------------------------------------------------
+// PRE-COMPILED REGULAR EXPRESSIONS
+//
+// PCRE makes it easy to use any string as a regular expression, without
+// requiring a separate compilation step.
+//
+// If speed is of the essence, you can create a pre-compiled "PCRE"
+// object from the pattern and use it multiple times. If you do so,
+// you can typically parse text faster than with sscanf.
+//
+// Example: precompile pattern for faster matching:
+// PCRE pattern("h.*o");
+// while (ReadLine(&str)) {
+// if (PCRE::FullMatch(str, pattern)) ...;
+// }
+//
+// -----------------------------------------------------------------------
+// SCANNING TEXT INCREMENTALLY
+//
+// The "Consume" operation may be useful if you want to repeatedly
+// match regular expressions at the front of a string and skip over
+// them as they match. This requires use of the string_view type,
+// which represents a sub-range of a real string.
+//
+// Example: read lines of the form "var = value" from a string.
+// std::string contents = ...; // Fill string somehow
+// absl::string_view input(contents); // Wrap a string_view around it
+//
+// std::string var;
+// int value;
+// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
+// ...;
+// }
+//
+// Each successful call to "Consume" will set "var/value", and also
+// advance "input" so it points past the matched text. Note that if the
+// regular expression matches an empty string, input will advance
+// by 0 bytes. If the regular expression being used might match
+// an empty string, the loop body must check for this case and either
+// advance the string or break out of the loop.
+//
+// The "FindAndConsume" operation is similar to "Consume" but does not
+// anchor your match at the beginning of the string. For example, you
+// could extract all words from a string by repeatedly calling
+// PCRE::FindAndConsume(&input, "(\\w+)", &word)
+//
+// -----------------------------------------------------------------------
+// PARSING HEX/OCTAL/C-RADIX NUMBERS
+//
+// By default, if you pass a pointer to a numeric value, the
+// corresponding text is interpreted as a base-10 number. You can
+// instead wrap the pointer with a call to one of the operators Hex(),
+// Octal(), or CRadix() to interpret the text in another base. The
+// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
+// prefixes, but defaults to base-10.
+//
+// Example:
+// int a, b, c, d;
+// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
+// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)));
+// will leave 64 in a, b, c, and d.
+
+#include "absl/strings/string_view.h"
+
+#ifdef USEPCRE
+#include <pcre.h>
+namespace re2 {
+const bool UsingPCRE = true;
+} // namespace re2
+#else
+struct pcre; // opaque
+namespace re2 {
+const bool UsingPCRE = false;
+} // namespace re2
+#endif
+
+// To produce a DLL, CMake can automatically export code symbols,
+// but not data symbols, so we have to annotate those manually...
+#if defined(RE2_BUILD_TESTING_DLL)
+#define RE2_TESTING_DLL __declspec(dllexport)
+#elif defined(RE2_CONSUME_TESTING_DLL)
+#define RE2_TESTING_DLL __declspec(dllimport)
+#else
+#define RE2_TESTING_DLL
+#endif
+
+namespace re2 {
+
+class PCRE_Options;
+
+// Interface for regular expression matching. Also corresponds to a
+// pre-compiled regular expression. A "PCRE" object is safe for
+// concurrent use by multiple threads.
+class PCRE {
+ public:
+ // We convert user-passed pointers into special Arg objects
+ class Arg;
+
+ // Marks end of arg list.
+ // ONLY USE IN OPTIONAL ARG DEFAULTS.
+ // DO NOT PASS EXPLICITLY.
+ RE2_TESTING_DLL static Arg no_more_args;
+
+ // Options are same value as those in pcre. We provide them here
+ // to avoid users needing to include pcre.h and also to isolate
+ // users from pcre should we change the underlying library.
+ // Only those needed by Google programs are exposed here to
+ // avoid collision with options employed internally by regexp.cc
+ // Note that some options have equivalents that can be specified in
+ // the regexp itself. For example, prefixing your regexp with
+ // "(?s)" has the same effect as the PCRE_DOTALL option.
+ enum Option {
+ None = 0x0000,
+ UTF8 = 0x0800, // == PCRE_UTF8
+ EnabledCompileOptions = UTF8,
+ EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag
+ };
+
+ // We provide implicit conversions from strings so that users can
+ // pass in a string or a "const char*" wherever a "PCRE" is expected.
+ PCRE(const char* pattern);
+ PCRE(const char* pattern, Option option);
+ PCRE(const std::string& pattern);
+ PCRE(const std::string& pattern, Option option);
+ PCRE(const char *pattern, const PCRE_Options& re_option);
+ PCRE(const std::string& pattern, const PCRE_Options& re_option);
+
+ ~PCRE();
+
+ // The string specification for this PCRE. E.g.
+ // PCRE re("ab*c?d+");
+ // re.pattern(); // "ab*c?d+"
+ const std::string& pattern() const { return pattern_; }
+
+ // If PCRE could not be created properly, returns an error string.
+ // Else returns the empty string.
+ const std::string& error() const { return *error_; }
+
+ // Whether the PCRE has hit a match limit during execution.
+ // Not thread safe. Intended only for testing.
+ // If hitting match limits is a problem,
+ // you should be using RE2 (re2/re2.h)
+ // instead of checking this flag.
+ bool HitLimit();
+ void ClearHitLimit();
+
+ /***** The useful part: the matching interface *****/
+
+ // Matches "text" against "pattern". If pointer arguments are
+ // supplied, copies matched sub-patterns into them.
+ //
+ // You can pass in a "const char*" or a "std::string" for "text".
+ // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern".
+ //
+ // The provided pointer arguments can be pointers to any scalar numeric
+ // type, or one of:
+ // std::string (matched piece is copied to string)
+ // absl::string_view (string_view is mutated to point to matched piece)
+ // T ("bool T::ParseFrom(const char*, size_t)" must exist)
+ // (void*)NULL (the corresponding matched sub-pattern is not copied)
+ //
+ // Returns true iff all of the following conditions are satisfied:
+ // a. "text" matches "pattern" exactly
+ // b. The number of matched sub-patterns is >= number of supplied pointers
+ // c. The "i"th argument has a suitable type for holding the
+ // string captured as the "i"th sub-pattern. If you pass in
+ // NULL for the "i"th argument, or pass fewer arguments than
+ // number of sub-patterns, "i"th captured sub-pattern is
+ // ignored.
+ //
+ // CAVEAT: An optional sub-pattern that does not exist in the
+ // matched string is assigned the empty string. Therefore, the
+ // following will return false (because the empty string is not a
+ // valid number):
+ // int number;
+ // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
+ struct FullMatchFunctor { // function-object type of the static member PCRE::FullMatch
+ bool operator ()(absl::string_view text, const PCRE& re, // 2 required args + up to 16 optional captures
+ const Arg& ptr1 = no_more_args,
+ const Arg& ptr2 = no_more_args,
+ const Arg& ptr3 = no_more_args,
+ const Arg& ptr4 = no_more_args,
+ const Arg& ptr5 = no_more_args,
+ const Arg& ptr6 = no_more_args,
+ const Arg& ptr7 = no_more_args,
+ const Arg& ptr8 = no_more_args,
+ const Arg& ptr9 = no_more_args,
+ const Arg& ptr10 = no_more_args,
+ const Arg& ptr11 = no_more_args,
+ const Arg& ptr12 = no_more_args,
+ const Arg& ptr13 = no_more_args,
+ const Arg& ptr14 = no_more_args,
+ const Arg& ptr15 = no_more_args,
+ const Arg& ptr16 = no_more_args) const;
+ };
+
+ RE2_TESTING_DLL static const FullMatchFunctor FullMatch; // invoked as PCRE::FullMatch(text, re, ...)
+
+ // Exactly like FullMatch(), except that "pattern" is allowed to match
+ // a substring of "text".
+ struct PartialMatchFunctor { // function-object type of the static member PCRE::PartialMatch
+ bool operator ()(absl::string_view text, const PCRE& re, // 2 required args + up to 16 optional captures
+ const Arg& ptr1 = no_more_args,
+ const Arg& ptr2 = no_more_args,
+ const Arg& ptr3 = no_more_args,
+ const Arg& ptr4 = no_more_args,
+ const Arg& ptr5 = no_more_args,
+ const Arg& ptr6 = no_more_args,
+ const Arg& ptr7 = no_more_args,
+ const Arg& ptr8 = no_more_args,
+ const Arg& ptr9 = no_more_args,
+ const Arg& ptr10 = no_more_args,
+ const Arg& ptr11 = no_more_args,
+ const Arg& ptr12 = no_more_args,
+ const Arg& ptr13 = no_more_args,
+ const Arg& ptr14 = no_more_args,
+ const Arg& ptr15 = no_more_args,
+ const Arg& ptr16 = no_more_args) const;
+ };
+
+ RE2_TESTING_DLL static const PartialMatchFunctor PartialMatch; // invoked as PCRE::PartialMatch(text, re, ...)
+
+ // Like FullMatch() and PartialMatch(), except that pattern has to
+ // match a prefix of "text", and "input" is advanced past the matched
+ // text. Note: "input" is modified iff this routine returns true.
+ struct ConsumeFunctor { // function-object type of the static member PCRE::Consume
+ bool operator ()(absl::string_view* input, const PCRE& pattern, // 2 required args + up to 16 optional captures
+ const Arg& ptr1 = no_more_args,
+ const Arg& ptr2 = no_more_args,
+ const Arg& ptr3 = no_more_args,
+ const Arg& ptr4 = no_more_args,
+ const Arg& ptr5 = no_more_args,
+ const Arg& ptr6 = no_more_args,
+ const Arg& ptr7 = no_more_args,
+ const Arg& ptr8 = no_more_args,
+ const Arg& ptr9 = no_more_args,
+ const Arg& ptr10 = no_more_args,
+ const Arg& ptr11 = no_more_args,
+ const Arg& ptr12 = no_more_args,
+ const Arg& ptr13 = no_more_args,
+ const Arg& ptr14 = no_more_args,
+ const Arg& ptr15 = no_more_args,
+ const Arg& ptr16 = no_more_args) const;
+ };
+
+ RE2_TESTING_DLL static const ConsumeFunctor Consume; // invoked as PCRE::Consume(&input, pattern, ...)
+
+ // Like Consume(..), but does not anchor the match at the beginning of the
+ // string. That is, "pattern" need not start its match at the beginning of
+ // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
+ // word in "s" and stores it in "word".
+ struct FindAndConsumeFunctor { // function-object type of the static member PCRE::FindAndConsume
+ bool operator ()(absl::string_view* input, const PCRE& pattern, // 2 required args + up to 16 optional captures
+ const Arg& ptr1 = no_more_args,
+ const Arg& ptr2 = no_more_args,
+ const Arg& ptr3 = no_more_args,
+ const Arg& ptr4 = no_more_args,
+ const Arg& ptr5 = no_more_args,
+ const Arg& ptr6 = no_more_args,
+ const Arg& ptr7 = no_more_args,
+ const Arg& ptr8 = no_more_args,
+ const Arg& ptr9 = no_more_args,
+ const Arg& ptr10 = no_more_args,
+ const Arg& ptr11 = no_more_args,
+ const Arg& ptr12 = no_more_args,
+ const Arg& ptr13 = no_more_args,
+ const Arg& ptr14 = no_more_args,
+ const Arg& ptr15 = no_more_args,
+ const Arg& ptr16 = no_more_args) const;
+ };
+
+ RE2_TESTING_DLL static const FindAndConsumeFunctor FindAndConsume; // invoked as PCRE::FindAndConsume(&input, pattern, ...)
+
+ // Replace the first match of "pattern" in "str" with "rewrite".
+ // Within "rewrite", backslash-escaped digits (\1 to \9) can be
+ // used to insert text matching corresponding parenthesized group
+ // from the pattern. \0 in "rewrite" refers to the entire matching
+ // text. E.g.,
+ //
+ // std::string s = "yabba dabba doo";
+ // CHECK(PCRE::Replace(&s, "b+", "d"));
+ //
+ // will leave "s" containing "yada dabba doo"
+ //
+ // Returns true if the pattern matches and a replacement occurs,
+ // false otherwise.
+ static bool Replace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite);
+
+ // Like Replace(), except replaces all occurrences of the pattern in
+ // the string with the rewrite. Replacements are not subject to
+ // re-matching. E.g.,
+ //
+ // std::string s = "yabba dabba doo";
+ // CHECK(PCRE::GlobalReplace(&s, "b+", "d"));
+ //
+ // will leave "s" containing "yada dada doo"
+ //
+ // Returns the number of replacements made.
+ static int GlobalReplace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite);
+
+ // Like Replace, except that if the pattern matches, "rewrite"
+ // is copied into "out" with substitutions. The non-matching
+ // portions of "text" are ignored.
+ //
+ // Returns true iff a match occurred and the extraction happened
+ // successfully; if no match occurs, the string is left unaffected.
+ static bool Extract(absl::string_view text, const PCRE& pattern,
+ absl::string_view rewrite, std::string* out);
+
+ // Check that the given @p rewrite string is suitable for use with
+ // this PCRE. It checks that:
+ // * The PCRE has enough parenthesized subexpressions to satisfy all
+ // of the \N tokens in @p rewrite, and
+ // * The @p rewrite string doesn't have any syntax errors
+ // ('\' followed by anything besides [0-9] and '\').
+ // Making this test will guarantee that "replace" and "extract"
+ // operations won't LOG(ERROR) or fail because of a bad rewrite
+ // string.
+ // @param rewrite The proposed rewrite string.
+ // @param error An error message is recorded here, iff we return false.
+ // Otherwise, it is unchanged.
+ // @return true, iff @p rewrite is suitable for use with the PCRE.
+ bool CheckRewriteString(absl::string_view rewrite, std::string* error) const;
+
+ // Returns a copy of 'unquoted' with all potentially meaningful
+ // regexp characters backslash-escaped. The returned string, used
+ // as a regular expression, will exactly match the original string.
+ // For example,
+ // 1.5-2.0?
+ // becomes:
+ // 1\.5\-2\.0\?
+ static std::string QuoteMeta(absl::string_view unquoted);
+
+ /***** Generic matching interface (not so nice to use) *****/
+
+ // Type of match (TODO: Should be restructured as an Option)
+ enum Anchor { // anchoring mode taken by DoMatch(), TryMatch() and Compile()
+ UNANCHORED, // No anchoring: the match may start and end anywhere
+ ANCHOR_START, // Anchor at start only
+ ANCHOR_BOTH, // Anchor at start and end
+ };
+
+ // General matching routine. Stores the length of the match in
+ // "*consumed" if successful.
+ bool DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
+ const Arg* const* args, int n) const;
+
+ // Return the number of capturing subpatterns, or -1 if the
+ // regexp wasn't valid on construction.
+ int NumberOfCapturingGroups() const;
+
+ private:
+ void Init(const char* pattern, Option option, int match_limit,
+ int stack_limit, bool report_errors);
+
+ // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
+ // pairs of integers for the beginning and end positions of matched
+ // text. The first pair corresponds to the entire matched text;
+ // subsequent pairs correspond, in order, to parentheses-captured
+ // matches. Returns the number of pairs (one more than the number of
+ // the last subpattern with a match) if matching was successful
+ // and zero if the match failed.
+ // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching
+ // against "foo", "bar", and "baz" respectively.
+ // When matching PCRE("(foo)|hello") against "hello", it will return 1.
+ // But the values for all subpattern are filled in into "vec".
+ int TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
+ bool empty_ok, int* vec, int vecsize) const;
+
+ // Append the "rewrite" string, with backslash substitutions from "text"
+ // and "vec", to string "out".
+ bool Rewrite(std::string* out, absl::string_view rewrite,
+ absl::string_view text, int* vec, int veclen) const;
+
+ // internal implementation for DoMatch
+ bool DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
+ const Arg* const args[], int n, int* vec, int vecsize) const;
+
+ // Compile the regexp for the specified anchoring mode
+ pcre* Compile(Anchor anchor);
+
+ std::string pattern_; // pattern text; exposed through pattern()
+ Option options_; // Option bits (values from enum Option)
+ pcre* re_full_; // For full matches
+ pcre* re_partial_; // For partial matches
+ const std::string* error_; // Error indicator (or empty string); dereferenced by error()
+ bool report_errors_; // Silences error logging if false
+ int match_limit_; // Limit on execution resources
+ int stack_limit_; // Limit on stack resources (bytes)
+ mutable int hit_limit_; // Hit limit during execution (bool); mutable, presumably so const match routines can set it - TODO confirm
+
+ PCRE(const PCRE&) = delete; // copy construction is disabled
+ PCRE& operator=(const PCRE&) = delete; // copy assignment is disabled
+};
+
+// PCRE_Options lets you set the PCRE::Option flags, plus any pcre
+// "extra" options. The only extras are match_limit, which limits
+// the CPU time of a match, and stack_limit, which limits the
+// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default
+// that should not cause too many problems in production code.
+// If PCRE hits a limit during a match, it may return a false negative,
+// but (hopefully) it won't crash.
+//
+// NOTE: If you are handling regular expressions specified by
+// (external or internal) users, rather than hard-coded ones,
+// you should be using RE2 (re2/re2.h), which uses an alternate implementation
+// that avoids these issues. See http://go/re2quick.
+class PCRE_Options {
+ public:
+ // Default state: PCRE::None, both limits 0, error reporting enabled.
+ PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {}
+ // Accessors: plain getters and setters; the setters store the value without validating it.
+ PCRE::Option option() const { return option_; }
+ void set_option(PCRE::Option option) {
+ option_ = option;
+ }
+ int match_limit() const { return match_limit_; }
+ void set_match_limit(int match_limit) {
+ match_limit_ = match_limit;
+ }
+ int stack_limit() const { return stack_limit_; }
+ void set_stack_limit(int stack_limit) {
+ stack_limit_ = stack_limit;
+ }
+
+ // If the regular expression is malformed, an error message will be printed
+ // iff report_errors() is true. Default: true.
+ bool report_errors() const { return report_errors_; }
+ void set_report_errors(bool report_errors) {
+ report_errors_ = report_errors;
+ }
+ private:
+ PCRE::Option option_; // PCRE::None until set_option() is called
+ int match_limit_; // 0 until set_match_limit() is called
+ int stack_limit_; // 0 until set_stack_limit() is called
+ bool report_errors_; // true until set_report_errors() is called
+};
+
+
+/***** Implementation details *****/
+
+// Hex/Octal/Binary?
+
+// Special class for parsing into objects that define a ParseFrom() method
+template <typename T>
+class _PCRE_MatchObject { // NOTE(review): identifiers starting with underscore + capital letter are reserved to the implementation
+ public:
+ static inline bool Parse(const char* str, size_t n, void* dest) { // same signature as PCRE::Arg::Parser
+ if (dest == NULL) return true; // null destination: nothing to store, report success
+ T* object = reinterpret_cast<T*>(dest); // dest is the T* that the templated PCRE::Arg constructor stored
+ return object->ParseFrom(str, n); // T::ParseFrom decides the result
+ }
+};
+
+class PCRE::Arg {
+ public:
+ // Empty constructor so we can declare arrays of PCRE::Arg; binds parse_null to a null destination
+ Arg();
+
+ // Constructor specially designed for NULL arguments: keeps the pointer but binds parse_null
+ Arg(void*);
+
+ typedef bool (*Parser)(const char* str, size_t n, void* dest); // converter: (text, length, destination) -> bool
+
+// Type-specific parsers: per listed type, one constructor binding the named default parser and one taking an explicit Parser
+#define MAKE_PARSER(type, name) \
+ Arg(type* p) : arg_(p), parser_(name) {} \
+ Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
+
+ MAKE_PARSER(char, parse_char);
+ MAKE_PARSER(signed char, parse_schar);
+ MAKE_PARSER(unsigned char, parse_uchar);
+ MAKE_PARSER(float, parse_float);
+ MAKE_PARSER(double, parse_double);
+ MAKE_PARSER(std::string, parse_string);
+ MAKE_PARSER(absl::string_view, parse_string_view);
+
+ MAKE_PARSER(short, parse_short);
+ MAKE_PARSER(unsigned short, parse_ushort);
+ MAKE_PARSER(int, parse_int);
+ MAKE_PARSER(unsigned int, parse_uint);
+ MAKE_PARSER(long, parse_long);
+ MAKE_PARSER(unsigned long, parse_ulong);
+ MAKE_PARSER(long long, parse_longlong);
+ MAKE_PARSER(unsigned long long, parse_ulonglong);
+
+#undef MAKE_PARSER
+
+ // Generic constructor with a caller-supplied parser (only declared here)
+ template <typename T> Arg(T*, Parser parser);
+ // Generic constructor template: any other T* is parsed through T::ParseFrom() via _PCRE_MatchObject<T>
+ template <typename T> Arg(T* p)
+ : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) {
+ }
+
+ // Parse the data: runs the bound parser on (str, n) with the bound destination
+ bool Parse(const char* str, size_t n) const;
+
+ private:
+ void* arg_; // destination pointer with its type erased
+ Parser parser_; // converter invoked by Parse()
+
+ static bool parse_null (const char* str, size_t n, void* dest);
+ static bool parse_char (const char* str, size_t n, void* dest);
+ static bool parse_schar (const char* str, size_t n, void* dest);
+ static bool parse_uchar (const char* str, size_t n, void* dest);
+ static bool parse_float (const char* str, size_t n, void* dest);
+ static bool parse_double (const char* str, size_t n, void* dest);
+ static bool parse_string (const char* str, size_t n, void* dest);
+ static bool parse_string_view (const char* str, size_t n, void* dest);
+
+#define DECLARE_INTEGER_PARSER(name) \
+ private: \
+ static bool parse_##name(const char* str, size_t n, void* dest); \
+ static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
+ int radix); \
+ \
+ public: \
+ static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
+ static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
+ static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
+
+ DECLARE_INTEGER_PARSER(short); // each use declares private parse_<name>/parse_<name>_radix and public _hex/_octal/_cradix
+ DECLARE_INTEGER_PARSER(ushort);
+ DECLARE_INTEGER_PARSER(int);
+ DECLARE_INTEGER_PARSER(uint);
+ DECLARE_INTEGER_PARSER(long);
+ DECLARE_INTEGER_PARSER(ulong);
+ DECLARE_INTEGER_PARSER(longlong);
+ DECLARE_INTEGER_PARSER(ulonglong);
+
+#undef DECLARE_INTEGER_PARSER
+
+};
+
+inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } // no destination, parse_null as the parser
+inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } // keeps p but still uses parse_null
+
+inline bool PCRE::Arg::Parse(const char* str, size_t n) const { // forwards (str, n) to the parser chosen at construction
+ return (*parser_)(str, n, arg_); // arg_ is passed through as the parser's void* destination
+}
+
+// Base-aware wrappers, for integer destinations only: Hex(), Octal() and CRadix() return a PCRE::Arg bound to parse_<name>_hex/_octal/_cradix
+#define MAKE_INTEGER_PARSER(type, name) \
+ inline PCRE::Arg Hex(type* ptr) { \
+ return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
+ } \
+ inline PCRE::Arg Octal(type* ptr) { \
+ return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
+ } \
+ inline PCRE::Arg CRadix(type* ptr) { \
+ return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
+ }
+
+MAKE_INTEGER_PARSER(short, short); // (destination C++ type, suffix used in the parse_<name>_* function names)
+MAKE_INTEGER_PARSER(unsigned short, ushort);
+MAKE_INTEGER_PARSER(int, int);
+MAKE_INTEGER_PARSER(unsigned int, uint);
+MAKE_INTEGER_PARSER(long, long);
+MAKE_INTEGER_PARSER(unsigned long, ulong);
+MAKE_INTEGER_PARSER(long long, longlong);
+MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
+
+#undef MAKE_INTEGER_PARSER
+
+} // namespace re2
+
+#endif // UTIL_PCRE_H_
diff --git a/third_party/re2/src/util/rune.cc b/third_party/re2/src/util/rune.cc
new file mode 100644
index 000000000..a40e756c4
--- /dev/null
+++ b/third_party/re2/src/util/rune.cc
@@ -0,0 +1,260 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ * Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+
+#include <stdarg.h>
+#include <string.h>
+
+#include "util/utf.h"
+
+namespace re2 {
+
+enum
+{
+ Bit1 = 7, /* payload bits in a 1-byte sequence */
+ Bitx = 6, /* payload bits in each continuation byte */
+ Bit2 = 5, /* payload bits in the lead byte of a 2-byte sequence */
+ Bit3 = 4, /* payload bits in the lead byte of a 3-byte sequence */
+ Bit4 = 3, /* payload bits in the lead byte of a 4-byte sequence */
+ Bit5 = 2, /* only used to build T5, the first lead value that is rejected */
+
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000: 1-byte lead prefix */
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000: continuation byte prefix */
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000: 2-byte lead prefix */
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000: 3-byte lead prefix */
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000: 4-byte lead prefix */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000: lead bytes from here up are never decoded */
+
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111: largest 1-byte rune */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111: largest 2-byte rune */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111: largest 3-byte rune */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* largest value a 4-byte sequence can carry: */
+ /* 0001 1111 1111 1111 1111 1111 */
+
+ Maskx = (1<<Bitx)-1, /* 0011 1111: payload mask of a continuation byte */
+ Testx = Maskx ^ 0xFF, /* 1100 0000: the two prefix bits of a continuation byte */
+
+ Bad = Runeerror, /* rune stored by chartorune for undecodable input */
+};
+
+int /* returns the number of bytes consumed from str: 1 to 4, and 1 on a decoding error */
+chartorune(Rune *rune, const char *str) /* decodes the UTF-8 sequence starting at str into *rune */
+{
+ int c, c1, c2, c3; /* lead byte, then continuation bytes with their 10 prefix already XORed away */
+ Rune l; /* assembled code point */
+
+ /*
+ * one-byte sequence: a lead byte below Tx (0x80) is the rune itself
+ * 00000-0007F => T1
+ */
+ c = *(unsigned char*)str;
+ if(c < Tx) {
+ *rune = c;
+ return 1;
+ }
+
+ /*
+ * two-byte sequence; str+1 is read with no length check, a NUL there fails the test below
+ * 0080-07FF => T2 Tx
+ */
+ c1 = *(unsigned char*)(str+1) ^ Tx; /* a valid continuation byte 10xxxxxx becomes 00xxxxxx */
+ if(c1 & Testx) /* either top bit still set: not a continuation byte */
+ goto bad;
+ if(c < T3) {
+ if(c < T2) /* lead byte 0x80-0xBF is itself a continuation byte */
+ goto bad;
+ l = ((c << Bitx) | c1) & Rune2;
+ if(l <= Rune1) /* overlong: the value would have fit in one byte */
+ goto bad;
+ *rune = l;
+ return 2;
+ }
+
+ /*
+ * three-byte sequence (surrogate values are not rejected here)
+ * 0800-FFFF => T3 Tx Tx
+ */
+ c2 = *(unsigned char*)(str+2) ^ Tx;
+ if(c2 & Testx)
+ goto bad;
+ if(c < T4) {
+ l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+ if(l <= Rune2) /* overlong: the value would have fit in two bytes */
+ goto bad;
+ *rune = l;
+ return 3;
+ }
+
+ /*
+ * four-byte sequence (21-bit value); NOTE(review): results above Runemax (0x10FFFF) are not rejected
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ c3 = *(unsigned char*)(str+3) ^ Tx;
+ if (c3 & Testx)
+ goto bad;
+ if (c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if (l <= Rune3) /* overlong: the value would have fit in three bytes */
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+
+ /*
+ * Support for 5-byte or longer UTF-8 would go here, but
+ * since we don't have that, we'll just fall through to bad.
+ */
+
+ /*
+ * bad decoding: store Bad (Runeerror) and consume exactly one byte
+ */
+bad:
+ *rune = Bad;
+ return 1;
+}
+
+int /* returns the number of bytes written to str: 1 to 4 */
+runetochar(char *str, const Rune *rune) /* encodes *rune as UTF-8 into str; no NUL terminator is written */
+{
+ /* Runes are signed, so convert to unsigned for range check. */
+ unsigned int c;
+
+ /*
+ * one-byte sequence: the rune is stored as is
+ * 00000-0007F => 00-7F
+ */
+ c = *rune; /* a negative Rune becomes a large unsigned value, caught by the Runemax test below */
+ if(c <= Rune1) {
+ str[0] = static_cast<char>(c);
+ return 1;
+ }
+
+ /*
+ * two-byte sequence: 5 payload bits in the lead byte, 6 in the continuation byte
+ * 0080-07FF => T2 Tx
+ */
+ if(c <= Rune2) {
+ str[0] = T2 | static_cast<char>(c >> 1*Bitx);
+ str[1] = Tx | (c & Maskx);
+ return 2;
+ }
+
+ /*
+ * If the Rune is out of range, convert it to the error rune.
+ * Do this test here because the error rune encodes to three bytes.
+ * Doing it earlier would duplicate work, since an out of range
+ * Rune wouldn't have fit in one or two bytes.
+ */
+ if (c > Runemax)
+ c = Runeerror;
+
+ /*
+ * three-byte sequence (surrogate values are encoded like any other rune)
+ * 0800-FFFF => T3 Tx Tx
+ */
+ if (c <= Rune3) {
+ str[0] = T3 | static_cast<char>(c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four-byte sequence (21-bit value); c is at most Runemax here because of the clamp above
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | static_cast<char>(c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
+}
+
+int /* number of bytes runetochar produces for rune */
+runelen(Rune rune)
+{
+ char str[10]; /* scratch output, discarded; runetochar writes at most 4 bytes */
+
+ return runetochar(str, &rune);
+}
+
+int /* 1 if the first n bytes of str are enough for the sequence announced by the lead byte, else 0 */
+fullrune(const char *str, int n)
+{
+ if (n > 0) {
+ int c = *(unsigned char*)str; /* only the lead byte is examined, never the continuation bytes */
+ if (c < Tx) /* below 0x80: complete in one byte */
+ return 1;
+ if (n > 1) {
+ if (c < T3) /* lead below 0xE0: two bytes suffice (0x80-0xBF lands here too) */
+ return 1;
+ if (n > 2) {
+ if (c < T4 || n > 3) /* lead below 0xF0 needs three bytes; any other lead is satisfied by four */
+ return 1;
+ }
+ }
+ }
+ return 0; /* n <= 0, or fewer bytes than the lead byte calls for */
+}
+
+
+int /* number of runes in s before its NUL terminator */
+utflen(const char *s)
+{
+ int c; /* current byte */
+ int n; /* runes counted so far */
+ Rune rune; /* decode target; the decoded value is not used */
+
+ n = 0;
+ for(;;) {
+ c = *(unsigned char*)s;
+ if(c < Runeself) { /* single-byte rune: no need to call chartorune */
+ if(c == 0)
+ return n; /* the terminator is not counted */
+ s++;
+ } else
+ s += chartorune(&rune, s); /* an undecodable byte advances by 1 and still counts as one rune */
+ n++;
+ }
+ return 0; /* not reached: the loop exits only through return n */
+}
+
+char* /* pointer to the first occurrence of c in s, or 0 if the terminator is reached first */
+utfrune(const char *s, Rune c)
+{
+ int c1; /* current byte of s */
+ Rune r; /* rune decoded at s */
+ int n; /* byte length of that rune */
+
+ if(c < Runesync) /* not part of utf sequence: a plain byte search is enough */
+ return strchr((char*)s, c);
+
+ for(;;) {
+ c1 = *(unsigned char*)s;
+ if(c1 < Runeself) { /* one byte rune */
+ if(c1 == 0)
+ return 0;
+ if(c1 == c) /* NOTE(review): cannot be true here, c >= Runesync and c1 < Runeself are both 0x80 bounds */
+ return (char*)s;
+ s++;
+ continue;
+ }
+ n = chartorune(&r, s);
+ if(r == c) /* undecodable bytes yield Runeerror, so c == Runeerror matches them as well */
+ return (char*)s;
+ s += n;
+ }
+ return 0; /* not reached: the loop exits only through the returns above */
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/util/strutil.cc b/third_party/re2/src/util/strutil.cc
new file mode 100644
index 000000000..da06f85a3
--- /dev/null
+++ b/third_party/re2/src/util/strutil.cc
@@ -0,0 +1,26 @@
+// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/strutil.h"
+
+namespace re2 {
+
+void PrefixSuccessor(std::string* prefix) { // edits *prefix in place; nothing is returned
+ // We can increment the last character in the string and be done
+ // unless that character is 255, in which case we have to erase the
+ // last character and increment the previous character, unless that
+ // is 255, etc. If the string is empty or consists entirely of
+ // 255's, *prefix is left as the empty string.
+ while (!prefix->empty()) {
+ char& c = prefix->back(); // reference, so ++c below modifies the string itself
+ if (c == '\xff') { // char literal avoids signed/unsigned.
+ prefix->pop_back(); // drop the 0xff byte and retry with the new last byte
+ } else {
+ ++c;
+ break; // one increment is enough
+ }
+ }
+}
+
+} // namespace re2
diff --git a/third_party/re2/src/util/strutil.h b/third_party/re2/src/util/strutil.h
new file mode 100644
index 000000000..f5d87a533
--- /dev/null
+++ b/third_party/re2/src/util/strutil.h
@@ -0,0 +1,16 @@
+// Copyright 2016 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef UTIL_STRUTIL_H_
+#define UTIL_STRUTIL_H_
+
+#include <string>
+
+namespace re2 {
+
+void PrefixSuccessor(std::string* prefix);
+
+} // namespace re2
+
+#endif // UTIL_STRUTIL_H_
diff --git a/third_party/re2/src/util/utf.h b/third_party/re2/src/util/utf.h
new file mode 100644
index 000000000..85b429723
--- /dev/null
+++ b/third_party/re2/src/util/utf.h
@@ -0,0 +1,44 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ * Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ *
+ * This file and rune.cc have been converted to compile as C++ code
+ * in name space re2.
+ */
+
+#ifndef UTIL_UTF_H_
+#define UTIL_UTF_H_
+
+#include <stdint.h>
+
+namespace re2 {
+
+typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+ UTFmax = 4, /* maximum bytes per rune */
+ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
+ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
+ Runeerror = 0xFFFD, /* decoding error in UTF: the rune substituted for undecodable input */
+ Runemax = 0x10FFFF, /* maximum rune value; runetochar encodes anything larger as Runeerror */
+};
+
+int runetochar(char* s, const Rune* r);
+int chartorune(Rune* r, const char* s);
+int fullrune(const char* s, int n);
+int utflen(const char* s);
+char* utfrune(const char*, Rune);
+
+} // namespace re2
+
+#endif // UTIL_UTF_H_