Lines 1-5
Link Here
|
1 |
#! /usr/bin/python |
1 |
#! /usr/bin/python |
2 |
|
2 |
|
|
|
3 |
# WARNING! This is a python 2 script. |
4 |
|
3 |
# Multistage table builder |
5 |
# Multistage table builder |
4 |
# (c) Peter Kankowski, 2008 |
6 |
# (c) Peter Kankowski, 2008 |
5 |
|
7 |
|
Lines 15-24
Link Here
|
15 |
# ./MultiStage2.py >../pcre_ucd.c |
17 |
# ./MultiStage2.py >../pcre_ucd.c |
16 |
# |
18 |
# |
17 |
# It requires four Unicode data tables, DerivedGeneralCategory.txt, |
19 |
# It requires four Unicode data tables, DerivedGeneralCategory.txt, |
18 |
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the |
20 |
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the |
19 |
# Unicode.tables subdirectory. The first of these is found in the "extracted" |
21 |
# Unicode.tables subdirectory. The first of these is found in the "extracted" |
20 |
# subdirectory of the Unicode database (UCD) on the Unicode web site; the |
22 |
# subdirectory of the Unicode database (UCD) on the Unicode web site; the |
21 |
# second is in the "auxiliary" subdirectory; the other two are directly in the |
23 |
# second is in the "auxiliary" subdirectory; the other two are directly in the |
22 |
# UCD directory. |
24 |
# UCD directory. |
23 |
# |
25 |
# |
24 |
# Minor modifications made to this script: |
26 |
# Minor modifications made to this script: |
Lines 42-48
Link Here
|
42 |
# code scans CaseFolding.txt instead of UnicodeData.txt. |
44 |
# code scans CaseFolding.txt instead of UnicodeData.txt. |
43 |
# |
45 |
# |
44 |
# The main tables generated by this script are used by macros defined in |
46 |
# The main tables generated by this script are used by macros defined in |
45 |
# pcre_internal.h. They look up Unicode character properties using short |
47 |
# pcre_internal.h. They look up Unicode character properties using short |
46 |
# sequences of code that contains no branches, which makes for greater speed. |
48 |
# sequences of code that contains no branches, which makes for greater speed. |
47 |
# |
49 |
# |
48 |
# Conceptually, there is a table of records (of type ucd_record), containing a |
50 |
# Conceptually, there is a table of records (of type ucd_record), containing a |
Lines 69-81
Link Here
|
69 |
# Example: lowercase "a" (U+0061) is in block 0 |
71 |
# Example: lowercase "a" (U+0061) is in block 0 |
70 |
# lookup 0 in stage1 table yields 0 |
72 |
# lookup 0 in stage1 table yields 0 |
71 |
# lookup 97 in the first table in stage2 yields 16 |
73 |
# lookup 97 in the first table in stage2 yields 16 |
72 |
# record 17 is { 33, 5, 11, 0, -32 } |
74 |
# record 17 is { 33, 5, 11, 0, -32 } |
73 |
# 33 = ucp_Latin => Latin script |
75 |
# 33 = ucp_Latin => Latin script |
74 |
# 5 = ucp_Ll => Lower case letter |
76 |
# 5 = ucp_Ll => Lower case letter |
75 |
# 11 = ucp_gbOther => Grapheme break property "Other" |
77 |
# 11 = ucp_gbOther => Grapheme break property "Other" |
76 |
# 0 => not part of a caseless set |
78 |
# 0 => not part of a caseless set |
77 |
# -32 => Other case is U+0041 |
79 |
# -32 => Other case is U+0041 |
78 |
# |
80 |
# |
79 |
# Almost all lowercase latin characters resolve to the same record. One or two |
81 |
# Almost all lowercase latin characters resolve to the same record. One or two |
80 |
# are different because they are part of a multi-character caseless set (for |
82 |
# are different because they are part of a multi-character caseless set (for |
81 |
# example, k, K and the Kelvin symbol are such a set). |
83 |
# example, k, K and the Kelvin symbol are such a set). |
Lines 83-99
Link Here
|
83 |
# Example: hiragana letter A (U+3042) is in block 96 (0x60) |
85 |
# Example: hiragana letter A (U+3042) is in block 96 (0x60) |
84 |
# lookup 96 in stage1 table yields 88 |
86 |
# lookup 96 in stage1 table yields 88 |
85 |
# lookup 66 in the 88th table in stage2 yields 467 |
87 |
# lookup 66 in the 88th table in stage2 yields 467 |
86 |
# record 470 is { 26, 7, 11, 0, 0 } |
88 |
# record 470 is { 26, 7, 11, 0, 0 } |
87 |
# 26 = ucp_Hiragana => Hiragana script |
89 |
# 26 = ucp_Hiragana => Hiragana script |
88 |
# 7 = ucp_Lo => Other letter |
90 |
# 7 = ucp_Lo => Other letter |
89 |
# 11 = ucp_gbOther => Grapheme break property "Other" |
91 |
# 11 = ucp_gbOther => Grapheme break property "Other" |
90 |
# 0 => not part of a caseless set |
92 |
# 0 => not part of a caseless set |
91 |
# 0 => No other case |
93 |
# 0 => No other case |
92 |
# |
94 |
# |
93 |
# In these examples, no other blocks resolve to the same "virtual" block, as it |
95 |
# In these examples, no other blocks resolve to the same "virtual" block, as it |
94 |
# happens, but plenty of other blocks do share "virtual" blocks. |
96 |
# happens, but plenty of other blocks do share "virtual" blocks. |
95 |
# |
97 |
# |
96 |
# There is a fourth table, maintained by hand, which translates from the |
98 |
# There is a fourth table, maintained by hand, which translates from the |
97 |
# individual character types such as ucp_Cc to the general types like ucp_C. |
99 |
# individual character types such as ucp_Cc to the general types like ucp_C. |
98 |
# |
100 |
# |
99 |
# Philip Hazel, 03 July 2008 |
101 |
# Philip Hazel, 03 July 2008 |
Lines 101-108
Link Here
|
101 |
# 01-March-2010: Updated list of scripts for Unicode 5.2.0 |
103 |
# 01-March-2010: Updated list of scripts for Unicode 5.2.0 |
102 |
# 30-April-2011: Updated list of scripts for Unicode 6.0.0 |
104 |
# 30-April-2011: Updated list of scripts for Unicode 6.0.0 |
103 |
# July-2012: Updated list of scripts for Unicode 6.1.0 |
105 |
# July-2012: Updated list of scripts for Unicode 6.1.0 |
104 |
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new |
106 |
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new |
105 |
# field in the record to hold the value. Luckily, the |
107 |
# field in the record to hold the value. Luckily, the |
106 |
# structure had a hole in it, so the resulting table is |
108 |
# structure had a hole in it, so the resulting table is |
107 |
# not much bigger than before. |
109 |
# not much bigger than before. |
108 |
# 18-September-2012: Added code for multiple caseless sets. This uses the |
110 |
# 18-September-2012: Added code for multiple caseless sets. This uses the |
Lines 144-157
Link Here
|
144 |
if m.group(3) is None: |
146 |
if m.group(3) is None: |
145 |
last = char |
147 |
last = char |
146 |
else: |
148 |
else: |
147 |
last = int(m.group(3), 16) |
149 |
last = int(m.group(3), 16) |
148 |
for i in range(char, last + 1): |
150 |
for i in range(char, last + 1): |
149 |
# It is important not to overwrite a previously set |
151 |
# It is important not to overwrite a previously set |
150 |
# value because in the CaseFolding file there are lines |
152 |
# value because in the CaseFolding file there are lines |
151 |
# to be ignored (returning the default value of 0) |
153 |
# to be ignored (returning the default value of 0) |
152 |
# which often come after a line which has already set |
154 |
# which often come after a line which has already set |
153 |
# data. |
155 |
# data. |
154 |
if table[i] == default_value: |
156 |
if table[i] == default_value: |
155 |
table[i] = value |
157 |
table[i] = value |
156 |
file.close() |
158 |
file.close() |
157 |
return table |
159 |
return table |
Lines 192-198
Link Here
|
192 |
stage2 += block |
194 |
stage2 += block |
193 |
blocks[block] = start |
195 |
blocks[block] = start |
194 |
stage1.append(start) |
196 |
stage1.append(start) |
195 |
|
197 |
|
196 |
return stage1, stage2 |
198 |
return stage1, stage2 |
197 |
|
199 |
|
198 |
# Print a table |
200 |
# Print a table |
Lines 199-205
Link Here
|
199 |
def print_table(table, table_name, block_size = None): |
201 |
def print_table(table, table_name, block_size = None): |
200 |
type, size = get_type_size(table) |
202 |
type, size = get_type_size(table) |
201 |
ELEMS_PER_LINE = 16 |
203 |
ELEMS_PER_LINE = 16 |
202 |
|
204 |
|
203 |
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) |
205 |
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) |
204 |
if block_size: |
206 |
if block_size: |
205 |
s += ", block = %d" % block_size |
207 |
s += ", block = %d" % block_size |
Lines 245-259
Link Here
|
245 |
size = (size + slice_size - 1) & -slice_size |
247 |
size = (size + slice_size - 1) & -slice_size |
246 |
size += slice_size |
248 |
size += slice_size |
247 |
structure += '%s property_%d;\n' % (slice_type, i) |
249 |
structure += '%s property_%d;\n' % (slice_type, i) |
248 |
|
250 |
|
249 |
# round up to the first item of the next structure in array |
251 |
# round up to the first item of the next structure in array |
250 |
record_slice = map(lambda record: record[0], records) |
252 |
record_slice = map(lambda record: record[0], records) |
251 |
slice_type, slice_size = get_type_size(record_slice) |
253 |
slice_type, slice_size = get_type_size(record_slice) |
252 |
size = (size + slice_size - 1) & -slice_size |
254 |
size = (size + slice_size - 1) & -slice_size |
253 |
|
255 |
|
254 |
structure += '} ucd_record;\n*/\n\n' |
256 |
structure += '} ucd_record;\n*/\n\n' |
255 |
return size, structure |
257 |
return size, structure |
256 |
|
258 |
|
257 |
def test_record_size(): |
259 |
def test_record_size(): |
258 |
tests = [ \ |
260 |
tests = [ \ |
259 |
( [(3,), (6,), (6,), (1,)], 1 ), \ |
261 |
( [(3,), (6,), (6,), (1,)], 1 ), \ |
Lines 305-311
Link Here
|
305 |
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', |
307 |
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', |
306 |
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi' |
308 |
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi' |
307 |
] |
309 |
] |
308 |
|
310 |
|
309 |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', |
311 |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', |
310 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', |
312 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', |
311 |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] |
313 |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] |
Lines 321-340
Link Here
|
321 |
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) |
323 |
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) |
322 |
|
324 |
|
323 |
|
325 |
|
324 |
# This block of code was added by PH in September 2012. I am not a Python |
326 |
# This block of code was added by PH in September 2012. I am not a Python |
325 |
# programmer, so the style is probably dreadful, but it does the job. It scans |
327 |
# programmer, so the style is probably dreadful, but it does the job. It scans |
326 |
# the other_case table to find sets of more than two characters that must all |
328 |
# the other_case table to find sets of more than two characters that must all |
327 |
# match each other caselessly. Later in this script a table of these sets is |
329 |
# match each other caselessly. Later in this script a table of these sets is |
328 |
# written out. However, we have to do this work here in order to compute the |
330 |
# written out. However, we have to do this work here in order to compute the |
329 |
# offsets in the table that are inserted into the main table. |
331 |
# offsets in the table that are inserted into the main table. |
330 |
|
332 |
|
331 |
# The CaseFolding.txt file lists pairs, but the common logic for reading data |
333 |
# The CaseFolding.txt file lists pairs, but the common logic for reading data |
332 |
# sets only one value, so first we go through the table and set "return" |
334 |
# sets only one value, so first we go through the table and set "return" |
333 |
# offsets for those that are not already set. |
335 |
# offsets for those that are not already set. |
334 |
|
336 |
|
335 |
for c in range(0x10ffff): |
337 |
for c in range(0x10ffff): |
336 |
if other_case[c] != 0 and other_case[c + other_case[c]] == 0: |
338 |
if other_case[c] != 0 and other_case[c + other_case[c]] == 0: |
337 |
other_case[c + other_case[c]] = -other_case[c] |
339 |
other_case[c + other_case[c]] = -other_case[c] |
338 |
|
340 |
|
339 |
# Now scan again and create equivalence sets. |
341 |
# Now scan again and create equivalence sets. |
340 |
|
342 |
|
Lines 344-368
Link Here
|
344 |
o = c + other_case[c] |
346 |
o = c + other_case[c] |
345 |
|
347 |
|
346 |
# Trigger when this character's other case does not point back here. We |
348 |
# Trigger when this character's other case does not point back here. We |
347 |
# now have three characters that are case-equivalent. |
349 |
# now have three characters that are case-equivalent. |
348 |
|
350 |
|
349 |
if other_case[o] != -other_case[c]: |
351 |
if other_case[o] != -other_case[c]: |
350 |
t = o + other_case[o] |
352 |
t = o + other_case[o] |
351 |
|
353 |
|
352 |
# Scan the existing sets to see if any of the three characters are already |
354 |
# Scan the existing sets to see if any of the three characters are already |
353 |
# part of a set. If so, unite the existing set with the new set. |
355 |
# part of a set. If so, unite the existing set with the new set. |
354 |
|
356 |
|
355 |
appended = 0 |
357 |
appended = 0 |
356 |
for s in sets: |
358 |
for s in sets: |
357 |
found = 0 |
359 |
found = 0 |
358 |
for x in s: |
360 |
for x in s: |
359 |
if x == c or x == o or x == t: |
361 |
if x == c or x == o or x == t: |
360 |
found = 1 |
362 |
found = 1 |
361 |
|
363 |
|
362 |
# Add new characters to an existing set |
364 |
# Add new characters to an existing set |
363 |
|
365 |
|
364 |
if found: |
366 |
if found: |
365 |
found = 0 |
367 |
found = 0 |
366 |
for y in [c, o, t]: |
368 |
for y in [c, o, t]: |
367 |
for x in s: |
369 |
for x in s: |
368 |
if x == y: |
370 |
if x == y: |
Lines 370-379
Link Here
|
370 |
if not found: |
372 |
if not found: |
371 |
s.append(y) |
373 |
s.append(y) |
372 |
appended = 1 |
374 |
appended = 1 |
373 |
|
375 |
|
374 |
# If we have not added to an existing set, create a new one. |
376 |
# If we have not added to an existing set, create a new one. |
375 |
|
377 |
|
376 |
if not appended: |
378 |
if not appended: |
377 |
sets.append([c, o, t]) |
379 |
sets.append([c, o, t]) |
378 |
|
380 |
|
379 |
# End of loop looking for caseless sets. |
381 |
# End of loop looking for caseless sets. |
Lines 384-390
Link Here
|
384 |
|
386 |
|
385 |
offset = 1; |
387 |
offset = 1; |
386 |
for s in sets: |
388 |
for s in sets: |
387 |
for x in s: |
389 |
for x in s: |
388 |
caseless_offsets[x] = offset |
390 |
caseless_offsets[x] = offset |
389 |
offset += len(s) + 1 |
391 |
offset += len(s) + 1 |
390 |
|
392 |
|
Lines 393-399
Link Here
|
393 |
|
395 |
|
394 |
# Combine the tables |
396 |
# Combine the tables |
395 |
|
397 |
|
396 |
table, records = combine_tables(script, category, break_props, |
398 |
table, records = combine_tables(script, category, break_props, |
397 |
caseless_offsets, other_case) |
399 |
caseless_offsets, other_case) |
398 |
|
400 |
|
399 |
record_size, record_struct = get_record_size_struct(records.keys()) |
401 |
record_size, record_struct = get_record_size_struct(records.keys()) |
Lines 450-455
Link Here
|
450 |
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};" |
452 |
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};" |
451 |
print "#else" |
453 |
print "#else" |
452 |
print |
454 |
print |
|
|
455 |
print "/* If the 32-bit library is run in non-32-bit mode, character values" |
456 |
print "greater than 0x10ffff may be encountered. For these we set up a" |
457 |
print "special record. */" |
458 |
print |
459 |
print "#ifdef COMPILE_PCRE32" |
460 |
print "const ucd_record PRIV(dummy_ucd_record)[] = {{" |
461 |
print " ucp_Common, /* script */" |
462 |
print " ucp_Cn, /* type unassigned */" |
463 |
print " ucp_gbOther, /* grapheme break property */" |
464 |
print " 0, /* case set */" |
465 |
print " 0, /* other case */" |
466 |
print " }};" |
467 |
print "#endif" |
468 |
print |
453 |
print record_struct |
469 |
print record_struct |
454 |
|
470 |
|
455 |
# --- Added by PH: output the table of caseless character sets --- |
471 |
# --- Added by PH: output the table of caseless character sets --- |
Lines 460-466
Link Here
|
460 |
s = sorted(s) |
476 |
s = sorted(s) |
461 |
for x in s: |
477 |
for x in s: |
462 |
print ' 0x%04x,' % x, |
478 |
print ' 0x%04x,' % x, |
463 |
print ' NOTACHAR,' |
479 |
print ' NOTACHAR,' |
464 |
print '};' |
480 |
print '};' |
465 |
print |
481 |
print |
466 |
|
482 |
|