Attachment #475122 for bug #614054

View | Details | Raw Unified | Return to bug 614054
Collapse All | Expand All




const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
#else

/* If the 32-bit library is run in non-UTF mode, character values
greater than 0x10ffff (the Unicode maximum) may be encountered. For
these we set up a special record. */

#ifdef COMPILE_PCRE32
/* Single-entry table, compiled only for the 32-bit library, that supplies the
property record for code points above 0x10ffff. The initializer is positional:
script, character type, grapheme break property, case set, other case. */
const ucd_record PRIV(dummy_ucd_record)[] = {
  {
    ucp_Common,    /* script */
    ucp_Cn,        /* type: unassigned */
    ucp_gbOther,   /* grapheme break property */
    0,             /* case set: 0 => not part of a caseless set */
    0              /* other case: 0 => no other case */
  }
};
#endif

/* When recompiling tables with a new Unicode version, please check the
types in this structure definition from pcre_internal.h (the actual
field names will be different):

Lines 2772-2777 Link Here

(-)pcre_internal.h (-1 / +10 lines)
2772	extern const pcre_uint16 PRIV(ucd_stage2)[];	2772	extern const pcre_uint16 PRIV(ucd_stage2)[];
2773	extern const pcre_uint32 PRIV(ucp_gentype)[];	2773	extern const pcre_uint32 PRIV(ucp_gentype)[];
2774	extern const pcre_uint32 PRIV(ucp_gbtable)[];	2774	extern const pcre_uint32 PRIV(ucp_gbtable)[];
		2775	#ifdef COMPILE_PCRE32
		2776	extern const ucd_record PRIV(dummy_ucd_record)[];
		2777	#endif
2775	#ifdef SUPPORT_JIT	2778	#ifdef SUPPORT_JIT
2776	extern const int PRIV(ucp_typerange)[];	2779	extern const int PRIV(ucp_typerange)[];
2777	#endif	2780	#endif
Lines 2780-2788 Link Here
2780	/* UCD access macros */	2783	/* UCD access macros */
2781		2784
2782	#define UCD_BLOCK_SIZE 128	2785	#define UCD_BLOCK_SIZE 128
2783	#define GET_UCD(ch) (PRIV(ucd_records) + \	2786	#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \
2784	PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \	2787	PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
2785	UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])	2788	UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
		2789
		2790	#ifdef COMPILE_PCRE32
		2791	#define GET_UCD(ch) ((ch > 0x10ffff)? PRIV(dummy_ucd_record) : REAL_GET_UCD(ch))
		2792	#else
		2793	#define GET_UCD(ch) REAL_GET_UCD(ch)
		2794	#endif
2786		2795
2787	#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype	2796	#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
2788	#define UCD_SCRIPT(ch) GET_UCD(ch)->script	2797	#define UCD_SCRIPT(ch) GET_UCD(ch)->script

Lines 10-16 Link Here

(-)ChangeLog (-1 / +5 lines)
10	1. Fixed typo in CMakeLists.txt (wrong number of arguments for	10	1. Fixed typo in CMakeLists.txt (wrong number of arguments for
11	PCRE_STATIC_RUNTIME (affects MSVC only).	11	PCRE_STATIC_RUNTIME (affects MSVC only).
12		12
13	2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline	13	2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline
14	mode with --only-matching matched several lines, it restarted scanning at the	14	mode with --only-matching matched several lines, it restarted scanning at the
15	next line instead of moving on to the end of the matched string, which can be	15	next line instead of moving on to the end of the matched string, which can be
16	several lines after the start.	16	several lines after the start.
Lines 29-34 Link Here
29		29
30	(a) Check for values < 256 when calling isprint() in pcretest.	30	(a) Check for values < 256 when calling isprint() in pcretest.
31	(b) Give an error for too big a number after \O.	31	(b) Give an error for too big a number after \O.
		32
		33	7. In the 32-bit library in non-UTF mode, an attempt to find a Unicode
		34	property for a character with a code point greater than 0x10ffff (the Unicode
		35	maximum) caused a crash.
32		36
33		37
34	Version 8.40 11-January-2017	38	Version 8.40 11-January-2017

Lines 1-5 Link Here

(-)maint/MultiStage2.py (-45 / +61 lines)
1	#! /usr/bin/python	1	#! /usr/bin/python
2		2
		3	# WARNING! This is a python 2 script.
		4
3	# Multistage table builder	5	# Multistage table builder
4	# (c) Peter Kankowski, 2008	6	# (c) Peter Kankowski, 2008
5		7
Lines 15-24 Link Here
15	# ./MultiStage2.py >../pcre_ucd.c	17	# ./MultiStage2.py >../pcre_ucd.c
16	#	18	#
17	# It requires four Unicode data tables, DerivedGeneralCategory.txt,	19	# It requires four Unicode data tables, DerivedGeneralCategory.txt,
18	# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the	20	# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
19	# Unicode.tables subdirectory. The first of these is found in the "extracted"	21	# Unicode.tables subdirectory. The first of these is found in the "extracted"
20	# subdirectory of the Unicode database (UCD) on the Unicode web site; the	22	# subdirectory of the Unicode database (UCD) on the Unicode web site; the
21	# second is in the "auxiliary" subdirectory; the other two are directly in the	23	# second is in the "auxiliary" subdirectory; the other two are directly in the
22	# UCD directory.	24	# UCD directory.
23	#	25	#
24	# Minor modifications made to this script:	26	# Minor modifications made to this script:
Lines 42-48 Link Here
42	# code scans CaseFolding.txt instead of UnicodeData.txt.	44	# code scans CaseFolding.txt instead of UnicodeData.txt.
43	#	45	#
44	# The main tables generated by this script are used by macros defined in	46	# The main tables generated by this script are used by macros defined in
45	# pcre_internal.h. They look up Unicode character properties using short	47	# pcre_internal.h. They look up Unicode character properties using short
46	# sequences of code that contains no branches, which makes for greater speed.	48	# sequences of code that contains no branches, which makes for greater speed.
47	#	49	#
48	# Conceptually, there is a table of records (of type ucd_record), containing a	50	# Conceptually, there is a table of records (of type ucd_record), containing a
Lines 69-81 Link Here
69	# Example: lowercase "a" (U+0061) is in block 0	71	# Example: lowercase "a" (U+0061) is in block 0
70	# lookup 0 in stage1 table yields 0	72	# lookup 0 in stage1 table yields 0
71	# lookup 97 in the first table in stage2 yields 16	73	# lookup 97 in the first table in stage2 yields 16
72	# record 17 is { 33, 5, 11, 0, -32 }	74	# record 17 is { 33, 5, 11, 0, -32 }
73	# 33 = ucp_Latin => Latin script	75	# 33 = ucp_Latin => Latin script
74	# 5 = ucp_Ll => Lower case letter	76	# 5 = ucp_Ll => Lower case letter
75	# 11 = ucp_gbOther => Grapheme break property "Other"	77	# 11 = ucp_gbOther => Grapheme break property "Other"
76	# 0 => not part of a caseless set	78	# 0 => not part of a caseless set
77	# -32 => Other case is U+0041	79	# -32 => Other case is U+0041
78	#	80	#
79	# Almost all lowercase latin characters resolve to the same record. One or two	81	# Almost all lowercase latin characters resolve to the same record. One or two
80	# are different because they are part of a multi-character caseless set (for	82	# are different because they are part of a multi-character caseless set (for
81	# example, k, K and the Kelvin symbol are such a set).	83	# example, k, K and the Kelvin symbol are such a set).
Lines 83-99 Link Here
83	# Example: hiragana letter A (U+3042) is in block 96 (0x60)	85	# Example: hiragana letter A (U+3042) is in block 96 (0x60)
84	# lookup 96 in stage1 table yields 88	86	# lookup 96 in stage1 table yields 88
85	# lookup 66 in the 88th table in stage2 yields 467	87	# lookup 66 in the 88th table in stage2 yields 467
86	# record 470 is { 26, 7, 11, 0, 0 }	88	# record 470 is { 26, 7, 11, 0, 0 }
87	# 26 = ucp_Hiragana => Hiragana script	89	# 26 = ucp_Hiragana => Hiragana script
88	# 7 = ucp_Lo => Other letter	90	# 7 = ucp_Lo => Other letter
89	# 11 = ucp_gbOther => Grapheme break property "Other"	91	# 11 = ucp_gbOther => Grapheme break property "Other"
90	# 0 => not part of a caseless set	92	# 0 => not part of a caseless set
91	# 0 => No other case	93	# 0 => No other case
92	#	94	#
93	# In these examples, no other blocks resolve to the same "virtual" block, as it	95	# In these examples, no other blocks resolve to the same "virtual" block, as it
94	# happens, but plenty of other blocks do share "virtual" blocks.	96	# happens, but plenty of other blocks do share "virtual" blocks.
95	#	97	#
96	# There is a fourth table, maintained by hand, which translates from the	98	# There is a fourth table, maintained by hand, which translates from the
97	# individual character types such as ucp_Cc to the general types like ucp_C.	99	# individual character types such as ucp_Cc to the general types like ucp_C.
98	#	100	#
99	# Philip Hazel, 03 July 2008	101	# Philip Hazel, 03 July 2008
Lines 101-108 Link Here
101	# 01-March-2010: Updated list of scripts for Unicode 5.2.0	103	# 01-March-2010: Updated list of scripts for Unicode 5.2.0
102	# 30-April-2011: Updated list of scripts for Unicode 6.0.0	104	# 30-April-2011: Updated list of scripts for Unicode 6.0.0
103	# July-2012: Updated list of scripts for Unicode 6.1.0	105	# July-2012: Updated list of scripts for Unicode 6.1.0
104	# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new	106	# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
105	# field in the record to hold the value. Luckily, the	107	# field in the record to hold the value. Luckily, the
106	# structure had a hole in it, so the resulting table is	108	# structure had a hole in it, so the resulting table is
107	# not much bigger than before.	109	# not much bigger than before.
108	# 18-September-2012: Added code for multiple caseless sets. This uses the	110	# 18-September-2012: Added code for multiple caseless sets. This uses the
Lines 144-157 Link Here
144	if m.group(3) is None:	146	if m.group(3) is None:
145	last = char	147	last = char
146	else:	148	else:
147	last = int(m.group(3), 16)	149	last = int(m.group(3), 16)
148	for i in range(char, last + 1):	150	for i in range(char, last + 1):
149	# It is important not to overwrite a previously set	151	# It is important not to overwrite a previously set
150	# value because in the CaseFolding file there are lines	152	# value because in the CaseFolding file there are lines
151	# to be ignored (returning the default value of 0)	153	# to be ignored (returning the default value of 0)
152	# which often come after a line which has already set	154	# which often come after a line which has already set
153	# data.	155	# data.
154	if table[i] == default_value:	156	if table[i] == default_value:
155	table[i] = value	157	table[i] = value
156	file.close()	158	file.close()
157	return table	159	return table
Lines 192-198 Link Here
192	stage2 += block	194	stage2 += block
193	blocks[block] = start	195	blocks[block] = start
194	stage1.append(start)	196	stage1.append(start)
195		197
196	return stage1, stage2	198	return stage1, stage2
197		199
198	# Print a table	200	# Print a table
Lines 199-205 Link Here
199	def print_table(table, table_name, block_size = None):	201	def print_table(table, table_name, block_size = None):
200	type, size = get_type_size(table)	202	type, size = get_type_size(table)
201	ELEMS_PER_LINE = 16	203	ELEMS_PER_LINE = 16
202		204
203	s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))	205	s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
204	if block_size:	206	if block_size:
205	s += ", block = %d" % block_size	207	s += ", block = %d" % block_size
Lines 245-259 Link Here
245	size = (size + slice_size - 1) & -slice_size	247	size = (size + slice_size - 1) & -slice_size
246	size += slice_size	248	size += slice_size
247	structure += '%s property_%d;\n' % (slice_type, i)	249	structure += '%s property_%d;\n' % (slice_type, i)
248		250
249	# round up to the first item of the next structure in array	251	# round up to the first item of the next structure in array
250	record_slice = map(lambda record: record[0], records)	252	record_slice = map(lambda record: record[0], records)
251	slice_type, slice_size = get_type_size(record_slice)	253	slice_type, slice_size = get_type_size(record_slice)
252	size = (size + slice_size - 1) & -slice_size	254	size = (size + slice_size - 1) & -slice_size
253		255
254	structure += '} ucd_record;\n*/\n\n'	256	structure += '} ucd_record;\n*/\n\n'
255	return size, structure	257	return size, structure
256		258
257	def test_record_size():	259	def test_record_size():
258	tests = [ \	260	tests = [ \
259	( [(3,), (6,), (6,), (1,)], 1 ), \	261	( [(3,), (6,), (6,), (1,)], 1 ), \
Lines 305-311 Link Here
305	'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',	307	'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
306	'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'	308	'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
307	]	309	]
308		310
309	category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',	311	category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
310	'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',	312	'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
311	'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]	313	'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
Lines 321-340 Link Here
321	other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)	323	other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
322		324
323		325
324	# This block of code was added by PH in September 2012. I am not a Python	326	# This block of code was added by PH in September 2012. I am not a Python
325	# programmer, so the style is probably dreadful, but it does the job. It scans	327	# programmer, so the style is probably dreadful, but it does the job. It scans
326	# the other_case table to find sets of more than two characters that must all	328	# the other_case table to find sets of more than two characters that must all
327	# match each other caselessly. Later in this script a table of these sets is	329	# match each other caselessly. Later in this script a table of these sets is
328	# written out. However, we have to do this work here in order to compute the	330	# written out. However, we have to do this work here in order to compute the
329	# offsets in the table that are inserted into the main table.	331	# offsets in the table that are inserted into the main table.
330		332
331	# The CaseFolding.txt file lists pairs, but the common logic for reading data	333	# The CaseFolding.txt file lists pairs, but the common logic for reading data
332	# sets only one value, so first we go through the table and set "return"	334	# sets only one value, so first we go through the table and set "return"
333	# offsets for those that are not already set.	335	# offsets for those that are not already set.
334		336
335	for c in range(0x10ffff):	337	for c in range(0x10ffff):
336	if other_case[c] != 0 and other_case[c + other_case[c]] == 0:	338	if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
337	other_case[c + other_case[c]] = -other_case[c]	339	other_case[c + other_case[c]] = -other_case[c]
338		340
339	# Now scan again and create equivalence sets.	341	# Now scan again and create equivalence sets.
340		342
Lines 344-368 Link Here
344	o = c + other_case[c]	346	o = c + other_case[c]
345		347
346	# Trigger when this character's other case does not point back here. We	348	# Trigger when this character's other case does not point back here. We
347	# now have three characters that are case-equivalent.	349	# now have three characters that are case-equivalent.
348		350
349	if other_case[o] != -other_case[c]:	351	if other_case[o] != -other_case[c]:
350	t = o + other_case[o]	352	t = o + other_case[o]
351		353
352	# Scan the existing sets to see if any of the three characters are already	354	# Scan the existing sets to see if any of the three characters are already
353	# part of a set. If so, unite the existing set with the new set.	355	# part of a set. If so, unite the existing set with the new set.
354		356
355	appended = 0	357	appended = 0
356	for s in sets:	358	for s in sets:
357	found = 0	359	found = 0
358	for x in s:	360	for x in s:
359	if x == c or x == o or x == t:	361	if x == c or x == o or x == t:
360	found = 1	362	found = 1
361		363
362	# Add new characters to an existing set	364	# Add new characters to an existing set
363		365
364	if found:	366	if found:
365	found = 0	367	found = 0
366	for y in [c, o, t]:	368	for y in [c, o, t]:
367	for x in s:	369	for x in s:
368	if x == y:	370	if x == y:
Lines 370-379 Link Here
370	if not found:	372	if not found:
371	s.append(y)	373	s.append(y)
372	appended = 1	374	appended = 1
373		375
374	# If we have not added to an existing set, create a new one.	376	# If we have not added to an existing set, create a new one.
375		377
376	if not appended:	378	if not appended:
377	sets.append([c, o, t])	379	sets.append([c, o, t])
378		380
379	# End of loop looking for caseless sets.	381	# End of loop looking for caseless sets.
Lines 384-390 Link Here
384		386
385	offset = 1;	387	offset = 1;
386	for s in sets:	388	for s in sets:
387	for x in s:	389	for x in s:
388	caseless_offsets[x] = offset	390	caseless_offsets[x] = offset
389	offset += len(s) + 1	391	offset += len(s) + 1
390		392
Lines 393-399 Link Here
393		395
394	# Combine the tables	396	# Combine the tables
395		397
396	table, records = combine_tables(script, category, break_props,	398	table, records = combine_tables(script, category, break_props,
397	caseless_offsets, other_case)	399	caseless_offsets, other_case)
398		400
399	record_size, record_struct = get_record_size_struct(records.keys())	401	record_size, record_struct = get_record_size_struct(records.keys())
Lines 450-455 Link Here
450	print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"	452	print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
451	print "#else"	453	print "#else"
452	print	454	print
		455	print "/* If the 32-bit library is run in non-32-bit mode, character values"
		456	print "greater than 0x10ffff may be encountered. For these we set up a"
		457	print "special record. */"
		458	print
		459	print "#ifdef COMPILE_PCRE32"
		460	print "const ucd_record PRIV(dummy_ucd_record)[] = {{"
		461	print " ucp_Common, /* script */"
		462	print " ucp_Cn, /* type unassigned */"
		463	print " ucp_gbOther, /* grapheme break property */"
		464	print " 0, /* case set */"
		465	print " 0, /* other case */"
		466	print " }};"
		467	print "#endif"
		468	print
453	print record_struct	469	print record_struct
454		470
455	# --- Added by PH: output the table of caseless character sets ---	471	# --- Added by PH: output the table of caseless character sets ---
Lines 460-466 Link Here
460	s = sorted(s)	476	s = sorted(s)
461	for x in s:	477	for x in s:
462	print ' 0x%04x,' % x,	478	print ' 0x%04x,' % x,
463	print ' NOTACHAR,'	479	print ' NOTACHAR,'
464	print '};'	480	print '};'
465	print	481	print
466		482

Return to bug 614054

Lines 38-43 Link Here

(-)pcre_ucd.c (+14 lines)
38	const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};	38	const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
39	#else	39	#else
40		40
		41	/* If the 32-bit library is run in non-32-bit mode, character values
		42	greater than 0x10ffff may be encountered. For these we set up a
		43	special record. */
		44
		45	#ifdef COMPILE_PCRE32
		46	const ucd_record PRIV(dummy_ucd_record)[] = {{
		47	ucp_Common, /* script */
		48	ucp_Cn, /* type unassigned */
		49	ucp_gbOther, /* grapheme break property */
		50	0, /* case set */
		51	0, /* other case */
		52	}};
		53	#endif
		54
41	/* When recompiling tables with a new Unicode version, please check the	55	/* When recompiling tables with a new Unicode version, please check the
42	types in this structure definition from pcre_internal.h (the actual	56	types in this structure definition from pcre_internal.h (the actual
43	field names will be different):	57	field names will be different):