Index: pcre_ucd.c =================================================================== --- pcre_ucd.c (revision 1687) +++ pcre_ucd.c (revision 1688) @@ -38,6 +38,20 @@ const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0}; #else +/* If the 32-bit library is run in non-32-bit mode, character values +greater than 0x10ffff may be encountered. For these we set up a +special record. */ + +#ifdef COMPILE_PCRE32 +const ucd_record PRIV(dummy_ucd_record)[] = {{ + ucp_Common, /* script */ + ucp_Cn, /* type unassigned */ + ucp_gbOther, /* grapheme break property */ + 0, /* case set */ + 0, /* other case */ + }}; +#endif + /* When recompiling tables with a new Unicode version, please check the types in this structure definition from pcre_internal.h (the actual field names will be different): Index: pcre_internal.h =================================================================== --- pcre_internal.h (revision 1687) +++ pcre_internal.h (revision 1688) @@ -2772,6 +2772,9 @@ extern const pcre_uint16 PRIV(ucd_stage2)[]; extern const pcre_uint32 PRIV(ucp_gentype)[]; extern const pcre_uint32 PRIV(ucp_gbtable)[]; +#ifdef COMPILE_PCRE32 +extern const ucd_record PRIV(dummy_ucd_record)[]; +#endif #ifdef SUPPORT_JIT extern const int PRIV(ucp_typerange)[]; #endif @@ -2780,9 +2783,15 @@ /* UCD access macros */ #define UCD_BLOCK_SIZE 128 -#define GET_UCD(ch) (PRIV(ucd_records) + \ +#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \ PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \ UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE]) + +#ifdef COMPILE_PCRE32 +#define GET_UCD(ch) (((ch) > 0x10ffff)? PRIV(dummy_ucd_record) : REAL_GET_UCD(ch)) +#else +#define GET_UCD(ch) REAL_GET_UCD(ch) +#endif #define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype #define UCD_SCRIPT(ch) GET_UCD(ch)->script Index: ChangeLog =================================================================== --- ChangeLog (revision 1687) +++ ChangeLog (revision 1688) @@ -10,7 +10,7 @@ 1. 
Fixed typo in CMakeLists.txt (wrong number of arguments for PCRE_STATIC_RUNTIME (affects MSVC only). -2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline +2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline mode with --only-matching matched several lines, it restarted scanning at the next line instead of moving on to the end of the matched string, which can be several lines after the start. @@ -29,6 +29,10 @@ (a) Check for values < 256 when calling isprint() in pcretest. (b) Give an error for too big a number after \O. + +7. In the 32-bit library in non-UTF mode, an attempt to find a Unicode +property for a character with a code point greater than 0x10ffff (the Unicode +maximum) caused a crash. Version 8.40 11-January-2017 Index: maint/MultiStage2.py =================================================================== --- maint/MultiStage2.py (revision 1687) +++ maint/MultiStage2.py (revision 1688) @@ -1,5 +1,7 @@ #! /usr/bin/python +# WARNING! This is a python 2 script. + # Multistage table builder # (c) Peter Kankowski, 2008 @@ -15,10 +17,10 @@ # ./MultiStage2.py >../pcre_ucd.c # # It requires four Unicode data tables, DerivedGeneralCategory.txt, -# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the -# Unicode.tables subdirectory. The first of these is found in the "extracted" -# subdirectory of the Unicode database (UCD) on the Unicode web site; the -# second is in the "auxiliary" subdirectory; the other two are directly in the +# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the +# Unicode.tables subdirectory. The first of these is found in the "extracted" +# subdirectory of the Unicode database (UCD) on the Unicode web site; the +# second is in the "auxiliary" subdirectory; the other two are directly in the # UCD directory. # # Minor modifications made to this script: @@ -42,7 +44,7 @@ # code scans CaseFolding.txt instead of UnicodeData.txt. 
# # The main tables generated by this script are used by macros defined in -# pcre_internal.h. They look up Unicode character properties using short +# pcre_internal.h. They look up Unicode character properties using short # sequences of code that contains no branches, which makes for greater speed. # # Conceptually, there is a table of records (of type ucd_record), containing a @@ -69,13 +71,13 @@ # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 # lookup 97 in the first table in stage2 yields 16 -# record 17 is { 33, 5, 11, 0, -32 } +# record 17 is { 33, 5, 11, 0, -32 } # 33 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 11 = ucp_gbOther => Grapheme break property "Other" # 0 => not part of a caseless set # -32 => Other case is U+0041 -# +# # Almost all lowercase latin characters resolve to the same record. One or two # are different because they are part of a multi-character caseless set (for # example, k, K and the Kelvin symbol are such a set). @@ -83,17 +85,17 @@ # Example: hiragana letter A (U+3042) is in block 96 (0x60) # lookup 96 in stage1 table yields 88 # lookup 66 in the 88th table in stage2 yields 467 -# record 470 is { 26, 7, 11, 0, 0 } +# record 470 is { 26, 7, 11, 0, 0 } # 26 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 11 = ucp_gbOther => Grapheme break property "Other" # 0 => not part of a caseless set -# 0 => No other case +# 0 => No other case # # In these examples, no other blocks resolve to the same "virtual" block, as it # happens, but plenty of other blocks do share "virtual" blocks. # -# There is a fourth table, maintained by hand, which translates from the +# There is a fourth table, maintained by hand, which translates from the # individual character types such as ucp_Cc to the general types like ucp_C. 
# # Philip Hazel, 03 July 2008 @@ -101,8 +103,8 @@ # 01-March-2010: Updated list of scripts for Unicode 5.2.0 # 30-April-2011: Updated list of scripts for Unicode 6.0.0 # July-2012: Updated list of scripts for Unicode 6.1.0 -# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new -# field in the record to hold the value. Luckily, the +# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new +# field in the record to hold the value. Luckily, the # structure had a hole in it, so the resulting table is # not much bigger than before. # 18-September-2012: Added code for multiple caseless sets. This uses the @@ -144,14 +146,14 @@ if m.group(3) is None: last = char else: - last = int(m.group(3), 16) + last = int(m.group(3), 16) for i in range(char, last + 1): # It is important not to overwrite a previously set # value because in the CaseFolding file there are lines - # to be ignored (returning the default value of 0) - # which often come after a line which has already set - # data. - if table[i] == default_value: + # to be ignored (returning the default value of 0) + # which often come after a line which has already set + # data. 
+ if table[i] == default_value: table[i] = value file.close() return table @@ -192,7 +194,7 @@ stage2 += block blocks[block] = start stage1.append(start) - + return stage1, stage2 # Print a table @@ -199,7 +201,7 @@ def print_table(table, table_name, block_size = None): type, size = get_type_size(table) ELEMS_PER_LINE = 16 - + s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) if block_size: s += ", block = %d" % block_size @@ -245,15 +247,15 @@ size = (size + slice_size - 1) & -slice_size size += slice_size structure += '%s property_%d;\n' % (slice_type, i) - + # round up to the first item of the next structure in array record_slice = map(lambda record: record[0], records) slice_type, slice_size = get_type_size(record_slice) size = (size + slice_size - 1) & -slice_size - + structure += '} ucd_record;\n*/\n\n' return size, structure - + def test_record_size(): tests = [ \ ( [(3,), (6,), (6,), (1,)], 1 ), \ @@ -305,7 +307,7 @@ 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi' ] - + category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] @@ -321,20 +323,20 @@ other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) -# This block of code was added by PH in September 2012. I am not a Python -# programmer, so the style is probably dreadful, but it does the job. It scans -# the other_case table to find sets of more than two characters that must all -# match each other caselessly. Later in this script a table of these sets is -# written out. However, we have to do this work here in order to compute the +# This block of code was added by PH in September 2012. I am not a Python +# programmer, so the style is probably dreadful, but it does the job. 
It scans +# the other_case table to find sets of more than two characters that must all +# match each other caselessly. Later in this script a table of these sets is +# written out. However, we have to do this work here in order to compute the # offsets in the table that are inserted into the main table. # The CaseFolding.txt file lists pairs, but the common logic for reading data -# sets only one value, so first we go through the table and set "return" +# sets only one value, so first we go through the table and set "return" # offsets for those that are not already set. for c in range(0x10ffff): if other_case[c] != 0 and other_case[c + other_case[c]] == 0: - other_case[c + other_case[c]] = -other_case[c] + other_case[c + other_case[c]] = -other_case[c] # Now scan again and create equivalence sets. @@ -344,25 +346,25 @@ o = c + other_case[c] # Trigger when this character's other case does not point back here. We - # now have three characters that are case-equivalent. - + # now have three characters that are case-equivalent. + if other_case[o] != -other_case[c]: t = o + other_case[o] - - # Scan the existing sets to see if any of the three characters are already + + # Scan the existing sets to see if any of the three characters are already # part of a set. If so, unite the existing set with the new set. - - appended = 0 + + appended = 0 for s in sets: - found = 0 + found = 0 for x in s: if x == c or x == o or x == t: found = 1 - + # Add new characters to an existing set - + if found: - found = 0 + found = 0 for y in [c, o, t]: for x in s: if x == y: @@ -370,10 +372,10 @@ if not found: s.append(y) appended = 1 - + # If we have not added to an existing set, create a new one. - if not appended: + if not appended: sets.append([c, o, t]) # End of loop looking for caseless sets. 
@@ -384,7 +386,7 @@ offset = 1; for s in sets: - for x in s: + for x in s: caseless_offsets[x] = offset offset += len(s) + 1 @@ -393,7 +395,7 @@ # Combine the tables -table, records = combine_tables(script, category, break_props, +table, records = combine_tables(script, category, break_props, caseless_offsets, other_case) record_size, record_struct = get_record_size_struct(records.keys()) @@ -450,6 +452,20 @@ print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};" print "#else" print +print "/* If the 32-bit library is run in non-32-bit mode, character values" +print "greater than 0x10ffff may be encountered. For these we set up a" +print "special record. */" +print +print "#ifdef COMPILE_PCRE32" +print "const ucd_record PRIV(dummy_ucd_record)[] = {{" +print " ucp_Common, /* script */" +print " ucp_Cn, /* type unassigned */" +print " ucp_gbOther, /* grapheme break property */" +print " 0, /* case set */" +print " 0, /* other case */" +print " }};" +print "#endif" +print print record_struct # --- Added by PH: output the table of caseless character sets --- @@ -460,7 +476,7 @@ s = sorted(s) for x in s: print ' 0x%04x,' % x, - print ' NOTACHAR,' + print ' NOTACHAR,' print '};' print