aboutsummaryrefslogtreecommitdiff
path: root/maint/GenerateUcd.py
diff options
context:
space:
mode:
Diffstat (limited to 'maint/GenerateUcd.py')
-rwxr-xr-xmaint/GenerateUcd.py45
1 files changed, 34 insertions, 11 deletions
diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py
index 6081800d..1bde7187 100755
--- a/maint/GenerateUcd.py
+++ b/maint/GenerateUcd.py
@@ -109,6 +109,8 @@
# 10-January-2022: Addition of general Boolean property support
# 12-January-2022: Merge scriptx and bidiclass fields
# 14-January-2022: Enlarge Boolean property offset to 12 bits
+# 28-January-2023: Remove ASCII "other case" from non-ASCII character that
+# are present in caseless sets.
#
# ----------------------------------------------------------------------------
#
@@ -269,19 +271,27 @@ NOTACHAR = 0xffffffff
# ---------------------------------------------------------------------------
-# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
-# or DerivedGeneralCategory.txt
+# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
def make_get_names(enum):
return lambda chardata: enum.index(chardata[1])
+# Parse a line of DerivedBidiClass.txt
+
+def get_bidi(chardata):
+ if len(chardata[1]) > 3:
+ return bidi_classes_long.index(chardata[1])
+ else:
+ return bidi_classes_short.index(chardata[1])
+
+
# Parse a line of CaseFolding.txt
def get_other_case(chardata):
if chardata[1] == 'C' or chardata[1] == 'S':
return int(chardata[2], 16) - int(chardata[0], 16)
- return 0
+ return None
# Parse a line of ScriptExtensions.txt
@@ -316,11 +326,16 @@ def read_table(file_name, get_value, default_value):
table = [default_value] * MAX_UNICODE
for line in file:
+ if file_base == 'DerivedBidiClass':
+ line = re.sub(r'# @missing: ', '', line)
+
line = re.sub(r'#.*', '', line)
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
value = get_value(chardata)
+ if value is None:
+ continue
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
char = int(m.group(1), 16)
if m.group(3) is None:
@@ -328,11 +343,8 @@ def read_table(file_name, get_value, default_value):
else:
last = int(m.group(3), 16)
for i in range(char, last + 1):
- # It is important not to overwrite a previously set value because in the
- # CaseFolding file there are lines to be ignored (returning the default
- # value of 0) which often come after a line which has already set data.
- if table[i] == default_value:
- table[i] = value
+ table[i] = value
+
file.close()
return table
@@ -506,7 +518,8 @@ unicode_version = ""
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.
-bidi_classes = bidi_classes[::2]
+bidi_classes_short = bidi_classes[::2]
+bidi_classes_long = bidi_classes[1::2]
break_properties = break_properties[::2]
category_names = category_names[::2]
@@ -516,7 +529,7 @@ script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names),
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
-bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
+bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', get_bidi, bidi_classes_short.index('L'))
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
@@ -710,6 +723,16 @@ for s in caseless_sets:
# End of block of code for creating offsets for caseless matching sets.
+# Scan the caseless sets, and for any non-ASCII character that has an ASCII
+# character as its "base" other case, remove the other case. This makes it
+# easier to handle those characters when the PCRE2 option for not mixing ASCII
+# and non-ASCII is enabled. In principle one should perhaps scan for a
+# non-ASCII alternative, but in practice these don't exist.
+
+for s in caseless_sets:
+ for x in s:
+ if x > 127 and x + other_case[x] < 128:
+ other_case[x] = 0
# Combine all the tables
@@ -728,7 +751,7 @@ for block_size in [2 ** i for i in range(5,10)]:
size = len(records) * record_size
stage1, stage2 = compress_table(table, block_size)
size += get_tables_size(stage1, stage2)
- #print "/* block size %5d => %5d bytes */" % (block_size, size)
+ #print("/* block size {:3d} => {:5d} bytes */".format(block_size, size))
if size < min_size:
min_size = size
min_stage1, min_stage2 = stage1, stage2