Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 614054
Collapse All | Expand All

(-)pcre_ucd.c (+14 lines)
Lines 38-43 Link Here
38
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
38
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
39
#else
39
#else
40
40
41
/* If the 32-bit library is run in non-32-bit mode, character values
42
greater than 0x10ffff may be encountered. For these we set up a
43
special record. */
44
45
#ifdef COMPILE_PCRE32
46
const ucd_record PRIV(dummy_ucd_record)[] = {{
47
  ucp_Common,    /* script */
48
  ucp_Cn,        /* type unassigned */
49
  ucp_gbOther,   /* grapheme break property */
50
  0,             /* case set */
51
  0,             /* other case */
52
  }};
53
#endif
54
41
/* When recompiling tables with a new Unicode version, please check the
55
/* When recompiling tables with a new Unicode version, please check the
42
types in this structure definition from pcre_internal.h (the actual
56
types in this structure definition from pcre_internal.h (the actual
43
field names will be different):
57
field names will be different):
(-)pcre_internal.h (-1 / +10 lines)
Lines 2772-2777 Link Here
2772
extern const pcre_uint16 PRIV(ucd_stage2)[];
2772
extern const pcre_uint16 PRIV(ucd_stage2)[];
2773
extern const pcre_uint32 PRIV(ucp_gentype)[];
2773
extern const pcre_uint32 PRIV(ucp_gentype)[];
2774
extern const pcre_uint32 PRIV(ucp_gbtable)[];
2774
extern const pcre_uint32 PRIV(ucp_gbtable)[];
2775
#ifdef COMPILE_PCRE32
2776
extern const ucd_record  PRIV(dummy_ucd_record)[];
2777
#endif
2775
#ifdef SUPPORT_JIT
2778
#ifdef SUPPORT_JIT
2776
extern const int         PRIV(ucp_typerange)[];
2779
extern const int         PRIV(ucp_typerange)[];
2777
#endif
2780
#endif
Lines 2780-2788 Link Here
2780
/* UCD access macros */
2783
/* UCD access macros */
2781
2784
2782
#define UCD_BLOCK_SIZE 128
2785
#define UCD_BLOCK_SIZE 128
2783
#define GET_UCD(ch) (PRIV(ucd_records) + \
2786
#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \
2784
        PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
2787
        PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
2785
        UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
2788
        UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
2789
        
2790
#ifdef COMPILE_PCRE32
2791
#define GET_UCD(ch) ((ch > 0x10ffff)? PRIV(dummy_ucd_record) : REAL_GET_UCD(ch))
2792
#else
2793
#define GET_UCD(ch) REAL_GET_UCD(ch)
2794
#endif 
2786
2795
2787
#define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
2796
#define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
2788
#define UCD_SCRIPT(ch)      GET_UCD(ch)->script
2797
#define UCD_SCRIPT(ch)      GET_UCD(ch)->script
(-)ChangeLog (-1 / +5 lines)
Lines 10-16 Link Here
10
1.  Fixed typo in CMakeLists.txt (wrong number of arguments for 
10
1.  Fixed typo in CMakeLists.txt (wrong number of arguments for 
11
PCRE_STATIC_RUNTIME (affects MSVC only).
11
PCRE_STATIC_RUNTIME (affects MSVC only).
12
12
13
2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline 
13
2.  Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline 
14
mode with --only-matching matched several lines, it restarted scanning at the 
14
mode with --only-matching matched several lines, it restarted scanning at the 
15
next line instead of moving on to the end of the matched string, which can be 
15
next line instead of moving on to the end of the matched string, which can be 
16
several lines after the start.
16
several lines after the start.
Lines 29-34 Link Here
29
29
30
    (a) Check for values < 256 when calling isprint() in pcretest.
30
    (a) Check for values < 256 when calling isprint() in pcretest.
31
    (b) Give an error for too big a number after \O. 
31
    (b) Give an error for too big a number after \O. 
32
    
33
7.  In the 32-bit library in non-UTF mode, an attempt to find a Unicode 
34
property for a character with a code point greater than 0x10ffff (the Unicode 
35
maximum) caused a crash.
32
36
33
37
34
Version 8.40 11-January-2017
38
Version 8.40 11-January-2017
(-)maint/MultiStage2.py (-45 / +61 lines)
Lines 1-5 Link Here
1
#! /usr/bin/python
1
#! /usr/bin/python
2
2
3
# WARNING! This is a python 2 script.
4
3
# Multistage table builder
5
# Multistage table builder
4
# (c) Peter Kankowski, 2008
6
# (c) Peter Kankowski, 2008
5
7
Lines 15-24 Link Here
15
# ./MultiStage2.py >../pcre_ucd.c
17
# ./MultiStage2.py >../pcre_ucd.c
16
#
18
#
17
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
19
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
18
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the 
20
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
19
# Unicode.tables subdirectory. The first of these is found in the "extracted" 
21
# Unicode.tables subdirectory. The first of these is found in the "extracted"
20
# subdirectory of the Unicode database (UCD) on the Unicode web site; the 
22
# subdirectory of the Unicode database (UCD) on the Unicode web site; the
21
# second is in the "auxiliary" subdirectory; the other two are directly in the 
23
# second is in the "auxiliary" subdirectory; the other two are directly in the
22
# UCD directory.
24
# UCD directory.
23
#
25
#
24
# Minor modifications made to this script:
26
# Minor modifications made to this script:
Lines 42-48 Link Here
42
#  code scans CaseFolding.txt instead of UnicodeData.txt.
44
#  code scans CaseFolding.txt instead of UnicodeData.txt.
43
#
45
#
44
# The main tables generated by this script are used by macros defined in
46
# The main tables generated by this script are used by macros defined in
45
# pcre_internal.h. They look up Unicode character properties using short 
47
# pcre_internal.h. They look up Unicode character properties using short
46
# sequences of code that contains no branches, which makes for greater speed.
48
# sequences of code that contains no branches, which makes for greater speed.
47
#
49
#
48
# Conceptually, there is a table of records (of type ucd_record), containing a
50
# Conceptually, there is a table of records (of type ucd_record), containing a
Lines 69-81 Link Here
69
# Example: lowercase "a" (U+0061) is in block 0
71
# Example: lowercase "a" (U+0061) is in block 0
70
#          lookup 0 in stage1 table yields 0
72
#          lookup 0 in stage1 table yields 0
71
#          lookup 97 in the first table in stage2 yields 16
73
#          lookup 97 in the first table in stage2 yields 16
72
#          record 17 is { 33, 5, 11, 0, -32 } 
74
#          record 17 is { 33, 5, 11, 0, -32 }
73
#            33 = ucp_Latin   => Latin script
75
#            33 = ucp_Latin   => Latin script
74
#             5 = ucp_Ll      => Lower case letter
76
#             5 = ucp_Ll      => Lower case letter
75
#            11 = ucp_gbOther => Grapheme break property "Other"
77
#            11 = ucp_gbOther => Grapheme break property "Other"
76
#             0               => not part of a caseless set
78
#             0               => not part of a caseless set
77
#           -32               => Other case is U+0041
79
#           -32               => Other case is U+0041
78
#         
80
#
79
# Almost all lowercase latin characters resolve to the same record. One or two
81
# Almost all lowercase latin characters resolve to the same record. One or two
80
# are different because they are part of a multi-character caseless set (for
82
# are different because they are part of a multi-character caseless set (for
81
# example, k, K and the Kelvin symbol are such a set).
83
# example, k, K and the Kelvin symbol are such a set).
Lines 83-99 Link Here
83
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
85
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
84
#          lookup 96 in stage1 table yields 88
86
#          lookup 96 in stage1 table yields 88
85
#          lookup 66 in the 88th table in stage2 yields 467
87
#          lookup 66 in the 88th table in stage2 yields 467
86
#          record 470 is { 26, 7, 11, 0, 0 } 
88
#          record 470 is { 26, 7, 11, 0, 0 }
87
#            26 = ucp_Hiragana => Hiragana script
89
#            26 = ucp_Hiragana => Hiragana script
88
#             7 = ucp_Lo       => Other letter
90
#             7 = ucp_Lo       => Other letter
89
#            11 = ucp_gbOther  => Grapheme break property "Other"
91
#            11 = ucp_gbOther  => Grapheme break property "Other"
90
#             0                => not part of a caseless set
92
#             0                => not part of a caseless set
91
#             0                => No other case 
93
#             0                => No other case
92
#
94
#
93
# In these examples, no other blocks resolve to the same "virtual" block, as it
95
# In these examples, no other blocks resolve to the same "virtual" block, as it
94
# happens, but plenty of other blocks do share "virtual" blocks.
96
# happens, but plenty of other blocks do share "virtual" blocks.
95
#
97
#
96
# There is a fourth table, maintained by hand, which translates from the 
98
# There is a fourth table, maintained by hand, which translates from the
97
# individual character types such as ucp_Cc to the general types like ucp_C.
99
# individual character types such as ucp_Cc to the general types like ucp_C.
98
#
100
#
99
#  Philip Hazel, 03 July 2008
101
#  Philip Hazel, 03 July 2008
Lines 101-108 Link Here
101
# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
103
# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
102
# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
104
# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
103
#     July-2012:     Updated list of scripts for Unicode 6.1.0
105
#     July-2012:     Updated list of scripts for Unicode 6.1.0
104
# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new 
106
# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
105
#                      field in the record to hold the value. Luckily, the 
107
#                      field in the record to hold the value. Luckily, the
106
#                      structure had a hole in it, so the resulting table is
108
#                      structure had a hole in it, so the resulting table is
107
#                      not much bigger than before.
109
#                      not much bigger than before.
108
# 18-September-2012: Added code for multiple caseless sets. This uses the
110
# 18-September-2012: Added code for multiple caseless sets. This uses the
Lines 144-157 Link Here
144
                if m.group(3) is None:
146
                if m.group(3) is None:
145
                        last = char
147
                        last = char
146
                else:
148
                else:
147
                        last = int(m.group(3), 16)            
149
                        last = int(m.group(3), 16)
148
                for i in range(char, last + 1):
150
                for i in range(char, last + 1):
149
                        # It is important not to overwrite a previously set
151
                        # It is important not to overwrite a previously set
150
                        # value because in the CaseFolding file there are lines
152
                        # value because in the CaseFolding file there are lines
151
                        # to be ignored (returning the default value of 0) 
153
                        # to be ignored (returning the default value of 0)
152
                        # which often come after a line which has already set 
154
                        # which often come after a line which has already set
153
                        # data.   
155
                        # data.
154
                        if table[i] == default_value: 
156
                        if table[i] == default_value:
155
                          table[i] = value
157
                          table[i] = value
156
        file.close()
158
        file.close()
157
        return table
159
        return table
Lines 192-198 Link Here
192
                        stage2 += block
194
                        stage2 += block
193
                        blocks[block] = start
195
                        blocks[block] = start
194
                stage1.append(start)
196
                stage1.append(start)
195
        
197
196
        return stage1, stage2
198
        return stage1, stage2
197
199
198
# Print a table
200
# Print a table
Lines 199-205 Link Here
199
def print_table(table, table_name, block_size = None):
201
def print_table(table, table_name, block_size = None):
200
        type, size = get_type_size(table)
202
        type, size = get_type_size(table)
201
        ELEMS_PER_LINE = 16
203
        ELEMS_PER_LINE = 16
202
        
204
203
        s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
205
        s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
204
        if block_size:
206
        if block_size:
205
                s += ", block = %d" % block_size
207
                s += ", block = %d" % block_size
Lines 245-259 Link Here
245
                size = (size + slice_size - 1) & -slice_size
247
                size = (size + slice_size - 1) & -slice_size
246
                size += slice_size
248
                size += slice_size
247
                structure += '%s property_%d;\n' % (slice_type, i)
249
                structure += '%s property_%d;\n' % (slice_type, i)
248
        
250
249
        # round up to the first item of the next structure in array
251
        # round up to the first item of the next structure in array
250
        record_slice = map(lambda record: record[0], records)
252
        record_slice = map(lambda record: record[0], records)
251
        slice_type, slice_size = get_type_size(record_slice)
253
        slice_type, slice_size = get_type_size(record_slice)
252
        size = (size + slice_size - 1) & -slice_size
254
        size = (size + slice_size - 1) & -slice_size
253
        
255
254
        structure += '} ucd_record;\n*/\n\n'
256
        structure += '} ucd_record;\n*/\n\n'
255
        return size, structure
257
        return size, structure
256
        
258
257
def test_record_size():
259
def test_record_size():
258
        tests = [ \
260
        tests = [ \
259
          ( [(3,), (6,), (6,), (1,)], 1 ), \
261
          ( [(3,), (6,), (6,), (1,)], 1 ), \
Lines 305-311 Link Here
305
 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
307
 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
306
 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
308
 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
307
 ]
309
 ]
308
 
310
309
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
311
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
310
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
312
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
311
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
313
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
Lines 321-340 Link Here
321
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
323
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
322
324
323
325
324
# This block of code was added by PH in September 2012. I am not a Python 
326
# This block of code was added by PH in September 2012. I am not a Python
325
# programmer, so the style is probably dreadful, but it does the job. It scans 
327
# programmer, so the style is probably dreadful, but it does the job. It scans
326
# the other_case table to find sets of more than two characters that must all 
328
# the other_case table to find sets of more than two characters that must all
327
# match each other caselessly. Later in this script a table of these sets is 
329
# match each other caselessly. Later in this script a table of these sets is
328
# written out. However, we have to do this work here in order to compute the 
330
# written out. However, we have to do this work here in order to compute the
329
# offsets in the table that are inserted into the main table.
331
# offsets in the table that are inserted into the main table.
330
332
331
# The CaseFolding.txt file lists pairs, but the common logic for reading data
333
# The CaseFolding.txt file lists pairs, but the common logic for reading data
332
# sets only one value, so first we go through the table and set "return" 
334
# sets only one value, so first we go through the table and set "return"
333
# offsets for those that are not already set.
335
# offsets for those that are not already set.
334
336
335
for c in range(0x10ffff):
337
for c in range(0x10ffff):
336
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
338
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
337
    other_case[c + other_case[c]] = -other_case[c] 
339
    other_case[c + other_case[c]] = -other_case[c]
338
340
339
# Now scan again and create equivalence sets.
341
# Now scan again and create equivalence sets.
340
342
Lines 344-368 Link Here
344
  o = c + other_case[c]
346
  o = c + other_case[c]
345
347
346
  # Trigger when this character's other case does not point back here. We
348
  # Trigger when this character's other case does not point back here. We
347
  # now have three characters that are case-equivalent. 
349
  # now have three characters that are case-equivalent.
348
 
350
349
  if other_case[o] != -other_case[c]:
351
  if other_case[o] != -other_case[c]:
350
    t = o + other_case[o]
352
    t = o + other_case[o]
351
    
353
352
    # Scan the existing sets to see if any of the three characters are already 
354
    # Scan the existing sets to see if any of the three characters are already
353
    # part of a set. If so, unite the existing set with the new set.
355
    # part of a set. If so, unite the existing set with the new set.
354
 
356
355
    appended = 0 
357
    appended = 0
356
    for s in sets:
358
    for s in sets:
357
      found = 0 
359
      found = 0
358
      for x in s:
360
      for x in s:
359
        if x == c or x == o or x == t:
361
        if x == c or x == o or x == t:
360
          found = 1
362
          found = 1
361
    
363
362
      # Add new characters to an existing set
364
      # Add new characters to an existing set
363
       
365
364
      if found:
366
      if found:
365
        found = 0 
367
        found = 0
366
        for y in [c, o, t]:
368
        for y in [c, o, t]:
367
          for x in s:
369
          for x in s:
368
            if x == y:
370
            if x == y:
Lines 370-379 Link Here
370
          if not found:
372
          if not found:
371
            s.append(y)
373
            s.append(y)
372
        appended = 1
374
        appended = 1
373
        
375
374
    # If we have not added to an existing set, create a new one.
376
    # If we have not added to an existing set, create a new one.
375
377
376
    if not appended:     
378
    if not appended:
377
      sets.append([c, o, t])
379
      sets.append([c, o, t])
378
380
379
# End of loop looking for caseless sets.
381
# End of loop looking for caseless sets.
Lines 384-390 Link Here
384
386
385
offset = 1;
387
offset = 1;
386
for s in sets:
388
for s in sets:
387
  for x in s:   
389
  for x in s:
388
    caseless_offsets[x] = offset
390
    caseless_offsets[x] = offset
389
  offset += len(s) + 1
391
  offset += len(s) + 1
390
392
Lines 393-399 Link Here
393
395
394
# Combine the tables
396
# Combine the tables
395
397
396
table, records = combine_tables(script, category, break_props, 
398
table, records = combine_tables(script, category, break_props,
397
  caseless_offsets, other_case)
399
  caseless_offsets, other_case)
398
400
399
record_size, record_struct = get_record_size_struct(records.keys())
401
record_size, record_struct = get_record_size_struct(records.keys())
Lines 450-455 Link Here
450
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
452
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
451
print "#else"
453
print "#else"
452
print
454
print
455
print "/* If the 32-bit library is run in non-32-bit mode, character values"
456
print "greater than 0x10ffff may be encountered. For these we set up a"
457
print "special record. */"
458
print
459
print "#ifdef COMPILE_PCRE32"
460
print "const ucd_record PRIV(dummy_ucd_record)[] = {{"
461
print "  ucp_Common,    /* script */"
462
print "  ucp_Cn,        /* type unassigned */"
463
print "  ucp_gbOther,   /* grapheme break property */"
464
print "  0,             /* case set */"
465
print "  0,             /* other case */"
466
print "  }};"
467
print "#endif"
468
print
453
print record_struct
469
print record_struct
454
470
455
# --- Added by PH: output the table of caseless character sets ---
471
# --- Added by PH: output the table of caseless character sets ---
Lines 460-466 Link Here
460
  s = sorted(s)
476
  s = sorted(s)
461
  for x in s:
477
  for x in s:
462
    print '  0x%04x,' % x,
478
    print '  0x%04x,' % x,
463
  print '  NOTACHAR,'   
479
  print '  NOTACHAR,'
464
print '};'
480
print '};'
465
print
481
print
466
482

Return to bug 614054