Add some additional tests that I found lying around.

author: Philip Hazel <Philip.Hazel@gmail.com> 2023-11-25 17:10:35 +0000
committer: Philip Hazel <Philip.Hazel@gmail.com> 2023-11-25 17:10:35 +0000
commit: 198379ca8faaeb60a6677daebfc52480189704c8 (patch)
tree: 345c3f7c4edb9e946fceb685a3732a0815366753
parent: 630b1cd68f51339a6ef4ff60142d9d66373d5f4d (diff)
download: pcre-198379ca8faaeb60a6677daebfc52480189704c8.tar.gz
4 files changed, 231 insertions, 0 deletions
diff --git a/testdata/testinput1 b/testdata/testinput1
index 533389dc..c0da415e 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -6557,4 +6557,95 @@ ef) x/x,mark
 /A{ 3, }/
     BBAAAAAACC 
 
+# This pattern validates regular expression patterns. The original that I was
+# sent was this:
+# /^((?:(?:[^?+*{}()[\]\\|]+|\\.|\[(?:\^?\\.|\^[^\\]|[^\\^])(?:[^\]\\]+|\\.)*\]|\((?:\?[:=!]|\?<[=!]|\?>)?(?1)??\)|\(\?(?:R|[+-]?\d+)\))(?:(?:[?+*]|\{\d+(?:,\d*)?\})[?+]?)?|\|)*)$/
+# This is not very readable, and also does not handle all features. I have done
+# some work on it.
+
+/^
+(?<re>
+# A regular expression is zero or more of these items.
+  (?:
+  # An item is one of these:
+    (?:
+      [^?+*{}()\[\]\\|]++|  # Non-meta characters or unquoted .
+      \\.|                  # Quoted .
+
+      \[                    # Class, which is [
+      (?:                   # Followed by
+        \^?\\.|             # Optional ^ and any escaped character
+        \^[^\\]|            # OR ^ and not escaped character
+        [^\\^]              # OR neither ^ nor \
+      )                     # Followed by
+      (?:[^\]\\]+|\\.)*+    # Zero or more (not ] or \) OR escaped dot
+      \]|                   # Class ends with ]
+
+      \(                    # Parenthesized group
+        (?:                 # Start with optional
+          \?[:=!]|          # ? followed by : = !
+          \?<[=!]|          # OR ?< followed by = or !
+          \?>               # OR ?>
+        )?
+        (?&re)??            # Then a nested <re>
+      \)|                   # End parenthesized group
+
+      \(\?                  # Other parenthesized items
+        (?:                 # (? followed by
+          R|                # R
+          [+-]?\d++         # Or optional +- and digits
+        )
+      \)|                   # End parens
+
+      \(\*                  # Verbs
+        (?:
+          COMMIT|
+          FAIL|
+          MARK:[^)]*|
+          (?:PRUNE|SKIP|THEN)(?::[^\)]*+)?
+        )
+      \)
+    )                       # End list of items
+
+    # Followed by an optional quantifier
+
+    (?:
+      (?:
+        [?+*]     # ?+*
+        |         # OR
+        \{\d+     # { digits
+        (?:,\d*)? # optionally followed by ,digits
+        \}        # then closing }
+        |         # OR
+        \{,\d+}   # {,digits}
+      )
+      [?+]?       # optional ungreedy or possessive
+    )?
+
+    | # OR an "item" is a branch ending
+
+    \|
+
+  )*  # Zero or more top-level items.
+)     # End regex group.
+$/x
+    [abcdef]
+    [abc\\]def]
+    a.b|abcd
+    ab()d
+    ab{1,3}d
+    ab{,3}d
+    ab(*FAIL)d(*COMMIT)(*SKIP)(*THEN:abc)
+    ab(*MARK:xyz)
+    (?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\\s])
+    abcd\\t\\n\\r\\f\\a\\e\\071\\x3b\\^\\\\\\?caxyz
+    a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz
+    \\G(?:(?=(\\1.|)(.))){1,13}?(?!.*\\2.*\\2)\\1\\K\\2
+\= Expect no match
+    ab)d
+    ab(d
+    {4,5}
+    a[]b
+    (a)(?(1)a|b|c)
+
 # End of testinput1 
diff --git a/testdata/testinput2 b/testdata/testinput2
index b874f20c..f4597397 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -6059,4 +6059,11 @@ a)"xI
     a
     a\=noteol 
 
+# This matches a character that only exists once in the subject, sort of like a
+# hypothetical "(.)(?<!\1.+)(?!.*\1)". That has unlimited variable length
+# lookbehind, so is invalid. This pattern doesn't work in Perl 5.38.0.
+
+/\G(?:(?=(\1.|)(.))){1,13}?(?!.*\2.*\2)\1\K\2/g
+    aaabcccdeee
+
 # End of testinput2
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index bedd9241..84fe0c6f 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -10375,4 +10375,124 @@ No match
     BBAAAAAACC 
  0: AAAAAA
 
+# This pattern validates regular expression patterns. The original that I was
+# sent was this:
+# /^((?:(?:[^?+*{}()[\]\\|]+|\\.|\[(?:\^?\\.|\^[^\\]|[^\\^])(?:[^\]\\]+|\\.)*\]|\((?:\?[:=!]|\?<[=!]|\?>)?(?1)??\)|\(\?(?:R|[+-]?\d+)\))(?:(?:[?+*]|\{\d+(?:,\d*)?\})[?+]?)?|\|)*)$/
+# This is not very readable, and also does not handle all features. I have done
+# some work on it.
+
+/^
+(?<re>
+# A regular expression is zero or more of these items.
+  (?:
+  # An item is one of these:
+    (?:
+      [^?+*{}()\[\]\\|]++|  # Non-meta characters or unquoted .
+      \\.|                  # Quoted .
+
+      \[                    # Class, which is [
+      (?:                   # Followed by
+        \^?\\.|             # Optional ^ and any escaped character
+        \^[^\\]|            # OR ^ and not escaped character
+        [^\\^]              # OR neither ^ nor \
+      )                     # Followed by
+      (?:[^\]\\]+|\\.)*+    # Zero or more (not ] or \) OR escaped dot
+      \]|                   # Class ends with ]
+
+      \(                    # Parenthesized group
+        (?:                 # Start with optional
+          \?[:=!]|          # ? followed by : = !
+          \?<[=!]|          # OR ?< followed by = or !
+          \?>               # OR ?>
+        )?
+        (?&re)??            # Then a nested <re>
+      \)|                   # End parenthesized group
+
+      \(\?                  # Other parenthesized items
+        (?:                 # (? followed by
+          R|                # R
+          [+-]?\d++         # Or optional +- and digits
+        )
+      \)|                   # End parens
+
+      \(\*                  # Verbs
+        (?:
+          COMMIT|
+          FAIL|
+          MARK:[^)]*|
+          (?:PRUNE|SKIP|THEN)(?::[^\)]*+)?
+        )
+      \)
+    )                       # End list of items
+
+    # Followed by an optional quantifier
+
+    (?:
+      (?:
+        [?+*]     # ?+*
+        |         # OR
+        \{\d+     # { digits
+        (?:,\d*)? # optionally followed by ,digits
+        \}        # then closing }
+        |         # OR
+        \{,\d+}   # {,digits}
+      )
+      [?+]?       # optional ungreedy or possessive
+    )?
+
+    | # OR an "item" is a branch ending
+
+    \|
+
+  )*  # Zero or more top-level items.
+)     # End regex group.
+$/x
+    [abcdef]
+ 0: [abcdef]
+ 1: [abcdef]
+    [abc\\]def]
+ 0: [abc\]def]
+ 1: [abc\]def]
+    a.b|abcd
+ 0: a.b|abcd
+ 1: a.b|abcd
+    ab()d
+ 0: ab()d
+ 1: ab()d
+    ab{1,3}d
+ 0: ab{1,3}d
+ 1: ab{1,3}d
+    ab{,3}d
+ 0: ab{,3}d
+ 1: ab{,3}d
+    ab(*FAIL)d(*COMMIT)(*SKIP)(*THEN:abc)
+ 0: ab(*FAIL)d(*COMMIT)(*SKIP)(*THEN:abc)
+ 1: ab(*FAIL)d(*COMMIT)(*SKIP)(*THEN:abc)
+    ab(*MARK:xyz)
+ 0: ab(*MARK:xyz)
+ 1: ab(*MARK:xyz)
+    (?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\\s])
+ 0: (?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])
+ 1: (?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])
+    abcd\\t\\n\\r\\f\\a\\e\\071\\x3b\\^\\\\\\?caxyz
+ 0: abcd\t\n\r\f\a\e\071\x3b\^\\\?caxyz
+ 1: abcd\t\n\r\f\a\e\071\x3b\^\\\?caxyz
+    a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz
+ 0: a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz
+ 1: a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz
+    \\G(?:(?=(\\1.|)(.))){1,13}?(?!.*\\2.*\\2)\\1\\K\\2
+ 0: \G(?:(?=(\1.|)(.))){1,13}?(?!.*\2.*\2)\1\K\2
+ 1: \G(?:(?=(\1.|)(.))){1,13}?(?!.*\2.*\2)\1\K\2
+\= Expect no match
+    ab)d
+No match
+    ab(d
+No match
+    {4,5}
+No match
+    a[]b
+No match
+    (a)(?(1)a|b|c)
+No match
+
 # End of testinput1 
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index c1bc0e64..f3861ed3 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -17952,6 +17952,19 @@ No match
     a\=noteol 
  0: a
 
+# This matches a character that only exists once in the subject, sort of like a
+# hypothetical "(.)(?<!\1.+)(?!.*\1)". That has unlimited variable length
+# lookbehind, so is invalid. This pattern doesn't work in Perl 5.38.0.
+
+/\G(?:(?=(\1.|)(.))){1,13}?(?!.*\2.*\2)\1\K\2/g
+    aaabcccdeee
+ 0: b
+ 1: aaa
+ 2: b
+ 0: d
+ 1: ccc
+ 2: d
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
author	Philip Hazel <Philip.Hazel@gmail.com>	2023-11-25 17:10:35 +0000
committer	Philip Hazel <Philip.Hazel@gmail.com>	2023-11-25 17:10:35 +0000
commit	198379ca8faaeb60a6677daebfc52480189704c8 (patch)
tree	345c3f7c4edb9e946fceb685a3732a0815366753
parent	630b1cd68f51339a6ef4ff60142d9d66373d5f4d (diff)
download	pcre-198379ca8faaeb60a6677daebfc52480189704c8.tar.gz