diff options
Diffstat (limited to 'maint/GenerateUcd.py')
-rwxr-xr-x | maint/GenerateUcd.py | 45 |
1 files changed, 34 insertions, 11 deletions
diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py index 6081800d..1bde7187 100755 --- a/maint/GenerateUcd.py +++ b/maint/GenerateUcd.py @@ -109,6 +109,8 @@ # 10-January-2022: Addition of general Boolean property support # 12-January-2022: Merge scriptx and bidiclass fields # 14-January-2022: Enlarge Boolean property offset to 12 bits +# 28-January-2023: Remove ASCII "other case" from non-ASCII character that +# are present in caseless sets. # # ---------------------------------------------------------------------------- # @@ -269,19 +271,27 @@ NOTACHAR = 0xffffffff # --------------------------------------------------------------------------- -# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt -# or DerivedGeneralCategory.txt +# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt def make_get_names(enum): return lambda chardata: enum.index(chardata[1]) +# Parse a line of DerivedBidiClass.txt + +def get_bidi(chardata): + if len(chardata[1]) > 3: + return bidi_classes_long.index(chardata[1]) + else: + return bidi_classes_short.index(chardata[1]) + + # Parse a line of CaseFolding.txt def get_other_case(chardata): if chardata[1] == 'C' or chardata[1] == 'S': return int(chardata[2], 16) - int(chardata[0], 16) - return 0 + return None # Parse a line of ScriptExtensions.txt @@ -316,11 +326,16 @@ def read_table(file_name, get_value, default_value): table = [default_value] * MAX_UNICODE for line in file: + if file_base == 'DerivedBidiClass': + line = re.sub(r'# @missing: ', '', line) + line = re.sub(r'#.*', '', line) chardata = list(map(str.strip, line.split(';'))) if len(chardata) <= 1: continue value = get_value(chardata) + if value is None: + continue m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) char = int(m.group(1), 16) if m.group(3) is None: @@ -328,11 +343,8 @@ def read_table(file_name, get_value, default_value): else: last = int(m.group(3), 16) for i in range(char, last + 1): - # It is important not to overwrite a previously set value because in the - # CaseFolding file there are lines to be ignored (returning the default - # value of 0) which often come after a line which has already set data. - if table[i] == default_value: - table[i] = value + table[i] = value + file.close() return table @@ -506,7 +518,8 @@ unicode_version = "" # strings for use by GenerateUcpHeader. The comments are not wanted here, so # remove them. -bidi_classes = bidi_classes[::2] +bidi_classes_short = bidi_classes[::2] +bidi_classes_long = bidi_classes[1::2] break_properties = break_properties[::2] category_names = category_names[::2] @@ -516,7 +529,7 @@ script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other')) other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) -bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L')) +bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', get_bidi, bidi_classes_short.index('L')) # The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now # we need to find the Extended_Pictographic property for emoji characters. This @@ -710,6 +723,16 @@ for s in caseless_sets: # End of block of code for creating offsets for caseless matching sets. +# Scan the caseless sets, and for any non-ASCII character that has an ASCII +# character as its "base" other case, remove the other case. This makes it +# easier to handle those characters when the PCRE2 option for not mixing ASCII +# and non-ASCII is enabled. In principle one should perhaps scan for a +# non-ASCII alternative, but in practice these don't exist. + +for s in caseless_sets: + for x in s: + if x > 127 and x + other_case[x] < 128: + other_case[x] = 0 # Combine all the tables @@ -728,7 +751,7 @@ for block_size in [2 ** i for i in range(5,10)]: size = len(records) * record_size stage1, stage2 = compress_table(table, block_size) size += get_tables_size(stage1, stage2) - #print "/* block size %5d => %5d bytes */" % (block_size, size) + #print("/* block size {:3d} => {:5d} bytes */".format(block_size, size)) if size < min_size: min_size = size min_stage1, min_stage2 = stage1, stage2 |