1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The names of the generated files are `testinput26` and
# `testoutput26`.
import re
import sys
from GenerateCommon import \
script_names, \
script_abbrevs
def write_both(text):
    """Write `text` to the test input file and then to the expected output file."""
    for stream in (input_file, output_file):
        stream.write(text)
def to_string_char(ch_idx):
    r"""Return the text used in the generated files for code point `ch_idx`.

    Printable ASCII characters (0x20 through 0x7e) are returned as
    themselves. Every other value is returned as a hexadecimal escape of
    the form ``\x{...}``; values below 0x10 are zero-padded to two digits
    (for example ``\x{0a}``).

    ch_idx: integer code point.
    Returns: a string holding either the literal character or its escape.
    """
    # DEL (0x7f) is a control character, so it is escaped like the other
    # non-printable values rather than being written raw into the files.
    if 0x20 <= ch_idx < 0x7f:
        return chr(ch_idx)
    if ch_idx < 0x10:
        return "\\x{0%x}" % ch_idx
    return "\\x{%x}" % ch_idx
# A single optional argument names the directory that receives the generated
# files; without it they are created relative to the current directory.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

output_directory = ""
if len(sys.argv) == 2:
    # Ensure the directory can be prefixed directly onto a file name.
    output_directory = sys.argv[1]
    output_directory += "" if output_directory.endswith("/") else "/"

# Both files stay open for the rest of the script: write_both() and
# gen_script_tests() write to them through these module-level names.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
# UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
def gen_script_tests():
    """Write the script and script-extension property tests.

    Reads ``Unicode.tables/Scripts.txt`` and
    ``Unicode.tables/ScriptExtensions.txt``, collects a few boundary code
    points for every entry of ``script_names`` and then, for each script
    except "Unknown", writes pattern/subject pairs to the input file and
    the same pairs plus the expected result to the output file.
    """
    # Slots of the per-script record:
    #   FIRST    - low end of the first range seen in Scripts.txt
    #   LAST     - high end of the most recent range seen in Scripts.txt
    #   EXT_LOW  - lowest code point seen for the script in ScriptExtensions.txt
    #   EXT_HIGH - highest code point seen for the script in ScriptExtensions.txt
    #   EXT_ONLY - first extension code point whose base script is another one
    FIRST, LAST, EXT_LOW, EXT_HIGH, EXT_ONLY = range(5)

    records = [None] * len(script_names)
    base_script_of = [None] * 0x110000

    line_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

    def parse(line):
        """Return (low, high, value) for a data line, or None for other lines."""
        found = line_re.match(line)
        if found is None:
            return None
        low = int(found.group(1), 16)
        high = low if found.group(2) is None else int(found.group(2), 16)
        return low, high, found.group(3)

    def emit(pattern, code_point, matches=True):
        """Write one pattern/subject pair and its expected result."""
        subject = to_string_char(code_point)
        write_both(pattern)
        write_both(" %s\n" % subject)
        # The expected result goes to the output file only.
        if matches:
            output_file.write(" 0: %s\n" % subject)
        else:
            output_file.write("No match\n")
        write_both("\n")

    # Pass 1: base script of every listed code point, plus per-script bounds.
    last_name = ""
    current = -1
    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            parsed = parse(line)
            if parsed is None:
                continue
            low, high, name = parsed

            # Only look the script up again when the name changes.
            if name != last_name:
                current = script_names.index(name)
                last_name = name

            base_script_of[low] = name
            for code_point in range(low + 1, high + 1):
                base_script_of[code_point] = name

            if records[current] is None:
                records[current] = [low, None, None, None, None]
            records[current][LAST] = high

    # Pass 2: extension bounds, and one code point per script that belongs to
    # it through its extensions only.
    seen_abbrevs = {}
    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            parsed = parse(line)
            if parsed is None:
                continue
            low, high, abbrev_list = parsed

            for abbrev in abbrev_list.split(" "):
                if abbrev in seen_abbrevs:
                    script = seen_abbrevs[abbrev]
                    rec = records[script]
                    rec[EXT_LOW] = min(rec[EXT_LOW], low)
                    rec[EXT_HIGH] = max(rec[EXT_HIGH], high)
                else:
                    script = script_abbrevs.index(abbrev)
                    seen_abbrevs[abbrev] = script
                    rec = records[script]
                    rec[EXT_LOW] = low
                    rec[EXT_HIGH] = high

                if rec[EXT_ONLY] is None:
                    name = script_names[script]
                    rec[EXT_ONLY] = next(
                        (code_point for code_point in range(low, high + 1)
                         if base_script_of[code_point] != name),
                        None)

    # Pass 3: emit the tests. The long and short spellings of the extension
    # property name alternate between scripts that have extensions.
    use_long_name = False
    for script, rec in enumerate(records):
        script_name = script_names[script]
        if script_name == "Unknown":
            continue
        script_abbrev = script_abbrevs[script]

        write_both("# Base script check\n")
        emit("/^\\p{sc=%s}/utf\n" % script_name, rec[FIRST])
        emit("/^\\p{Script=%s}/utf\n" % script_abbrev, rec[LAST])

        if rec[EXT_LOW] is not None:
            property_name = "Script_Extensions" if use_long_name else "scx"
            use_long_name = not use_long_name

            write_both("# Script extension check\n")
            emit("/^\\p{%s}/utf\n" % script_name, rec[EXT_LOW])
            emit("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev), rec[EXT_HIGH])

            # NOTE(review): pairing this diagnostic with the extension-only
            # check is inferred from its message - confirm against the
            # original layout.
            if rec[EXT_ONLY] is None:
                print("External character has not found for %s" % script_name)
            else:
                write_both("# Script extension only character\n")
                emit("/^\\p{%s}/utf\n" % script_name, rec[EXT_ONLY])
                emit("/^\\p{sc=%s}/utf\n" % script_name, rec[EXT_ONLY], matches=False)

        # The code point just past both the base and the extension bounds is
        # used as the subject that must not match.
        high = rec[LAST]
        if rec[EXT_HIGH] is not None and rec[EXT_HIGH] > high:
            high = rec[EXT_HIGH]
        write_both("# Character not in script\n")
        emit("/^\\p{%s}/utf\n" % script_name, high + 1, matches=False)
# Generate the tests, then close both files explicitly so buffered data is
# flushed even if generation fails part-way, instead of relying on
# interpreter shutdown to do it.
try:
    gen_script_tests()

    write_both("# End of testinput26\n")
finally:
    input_file.close()
    output_file.close()
|