maint/GenerateTest26.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

#! /usr/bin/python

#                   PCRE2 UNICODE PROPERTY SUPPORT
#                   ------------------------------
#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The names of the generated files are `testinput26` and
# `testoutput26`.

import re
import sys

from GenerateCommon import \
  script_names, \
  script_abbrevs

def write_both(text):
  """Write text to the test input file and to the expected output file.

  Lines that appear unchanged in both generated files (comments, patterns,
  subject lines, blank separators) go through this helper.
  """
  for stream in (input_file, output_file):
    stream.write(text)

def to_string_char(ch_idx):
  """Return the text used for code point ch_idx in the generated files.

  Code points 32..127 are returned as the character itself. Everything else
  is written as a \\x{...} hexadecimal escape; values below 16 get a leading
  zero so that the escape always has at least two hex digits.
  """
  if 32 <= ch_idx < 128:
    return chr(ch_idx)
  if ch_idx < 16:
    return "\\x{0%x}" % ch_idx
  return "\\x{%x}" % ch_idx

# An optional single command-line argument names the directory that receives
# the generated files; without it the file names are used as relative paths.
argument_count = len(sys.argv)
if argument_count > 2:
  print('** Too many arguments: just give a directory name')
  sys.exit(1)

output_directory = ""
if argument_count == 2:
  output_directory = sys.argv[1]
  # Make sure the directory can be prepended directly to a file name.
  if not output_directory.endswith("/"):
    output_directory += "/"

# Open the test input and the expected output side by side; most of the
# generated text is written to both of them.
try:
  input_file = open(output_directory + "testinput26", "w")
  output_file = open(output_directory + "testoutput26", "w")
except IOError:
  print("** Couldn't open output files")
  sys.exit(1)

# Both files start with the same "generated, do not edit" banner.
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")

def gen_script_tests():
  """Generate the script and script-extension tests for every known script.

  Reads "Unicode.tables/Scripts.txt" and "Unicode.tables/ScriptExtensions.txt"
  (paths are relative to the current working directory), picks a few
  representative code points per script, and writes one group of tests per
  script to the input and output files. The "Unknown" script is skipped.

  For each script the generated tests are:
    * the first and last base-script code points, matched with sc= / Script=;
    * if the script occurs in ScriptExtensions.txt, the lowest and highest
      listed code points, matched with the bare script name and with
      scx= / Script_Extensions= (the two spellings alternate between scripts);
    * a listed code point whose base script differs, expected to match the
      bare script name and expected not to match sc=;
    * the code point just above the highest selected one, expected not to
      match the bare script name.

  A diagnostic is printed when a script has extension data but no code point
  with a different base script was found.
  """
  # One record per entry of script_names (None until seen in Scripts.txt):
  #   [0] first code point of the first Scripts.txt range for the script
  #   [1] last code point of the most recently read Scripts.txt range; this
  #       is the overall maximum only if the ranges are listed in ascending
  #       order -- TODO confirm against the data files
  #   [2] lowest code point listed for the script in ScriptExtensions.txt
  #   [3] highest code point listed for the script in ScriptExtensions.txt
  #   [4] first code point found in the extension data whose base script is
  #       a different one
  script_data = [None] * len(script_names)

  # Base script name of every code point, None where Scripts.txt has no entry.
  char_data = [None] * 0x110000

  # Matches "XXXX ; Value #" and "XXXX..YYYY ; Value #" data lines.
  property_re = re.compile(
    r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

  def parse_range(match_obj):
    # Return (low, high) for a data line; a single code point gives low == high.
    low = int(match_obj.group(1), 16)
    high = low
    if match_obj.group(2) is not None:
      high = int(match_obj.group(2), 16)
    return low, high

  def emit_test(property_spec, code_point, expect_match=True):
    # Write one pattern/subject pair plus its expected result. The result
    # line belongs only in the output file; everything else goes to both.
    subject = to_string_char(code_point)
    write_both("/^\\p{%s}/utf\n" % property_spec)
    write_both("  %s\n" % subject)
    if expect_match:
      output_file.write(" 0: %s\n" % subject)
    else:
      output_file.write("No match\n")
    write_both("\n")

  prev_name = ""
  script_idx = -1

  with open("Unicode.tables/Scripts.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      # Consecutive lines usually belong to the same script, so the index
      # lookup is repeated only when the name changes.
      name = match_obj.group(3)
      if name != prev_name:
        script_idx = script_names.index(name)
        prev_name = name

      low, high = parse_range(match_obj)
      char_data[low] = name
      for code_point in range(low + 1, high + 1):
        char_data[code_point] = name

      if script_data[script_idx] is None:
        script_data[script_idx] = [low, None, None, None, None]
      script_data[script_idx][1] = high

  # Abbreviation -> index cache; membership also tells whether the script has
  # already been seen in the extension data.
  extended_script_indices = {}

  with open("Unicode.tables/ScriptExtensions.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      low, high = parse_range(match_obj)

      # The value field is a space-separated list of script abbreviations.
      for abbrev in match_obj.group(3).split(" "):
        if abbrev not in extended_script_indices:
          ext_idx = script_abbrevs.index(abbrev)
          extended_script_indices[abbrev] = ext_idx
          rec = script_data[ext_idx]
          rec[2] = low
          rec[3] = high
        else:
          ext_idx = extended_script_indices[abbrev]
          rec = script_data[ext_idx]
          if rec[2] > low:
            rec[2] = low
          if rec[3] < high:
            rec[3] = high

        if rec[4] is None:
          # Look for a code point that is listed for this script here but
          # whose base script is a different one.
          name = script_names[ext_idx]
          for code_point in range(low, high + 1):
            if char_data[code_point] != name:
              rec[4] = code_point
              break

  # Alternates between the short and long spelling of the extension property.
  long_property_name = False

  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]

    if script_name == "Unknown":
      continue

    script_abbrev = script_abbrevs[idx]

    write_both("# Base script check\n")
    emit_test("sc=%s" % script_name, rec[0])
    emit_test("Script=%s" % script_abbrev, rec[1])

    if rec[2] is not None:
      property_name = "scx"
      if long_property_name:
        property_name = "Script_Extensions"

      write_both("# Script extension check\n")
      emit_test(script_name, rec[2])
      emit_test("%s=%s" % (property_name, script_abbrev), rec[3])

      long_property_name = not long_property_name

      if rec[4] is not None:
        write_both("# Script extension only character\n")
        emit_test(script_name, rec[4])
        emit_test("sc=%s" % script_name, rec[4], expect_match=False)
      else:
        print("External character has not found for %s" % script_name)

    # One past the highest selected code point of the script.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    emit_test(script_name, high + 1, expect_match=False)


# Generate all script tests into the two files opened above.
gen_script_tests()

# The same trailer line is written to both the input and the output file.
write_both("# End of testinput26\n")