Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 917618
Collapse All | Expand All

(-)a/configure.ac (-2 / +2 lines)
Lines 10803-10810 Link Here
10803
dnl ===================================================================
10803
dnl ===================================================================
10804
dnl Check for system icu
10804
dnl Check for system icu
10805
dnl ===================================================================
10805
dnl ===================================================================
10806
ICU_MAJOR=73
10806
ICU_MAJOR=74
10807
ICU_MINOR=2
10807
ICU_MINOR=1
10808
ICU_CFLAGS_internal="-I${WORKDIR}/UnpackedTarball/icu/source/i18n -I${WORKDIR}/UnpackedTarball/icu/source/common"
10808
ICU_CFLAGS_internal="-I${WORKDIR}/UnpackedTarball/icu/source/i18n -I${WORKDIR}/UnpackedTarball/icu/source/common"
10809
ICU_LIBS_internal="-L${WORKDIR}/UnpackedTarball/icu/source/lib"
10809
ICU_LIBS_internal="-L${WORKDIR}/UnpackedTarball/icu/source/lib"
10810
libo_CHECK_SYSTEM_MODULE([icu],[ICU],[icu-i18n >= 66])
10810
libo_CHECK_SYSTEM_MODULE([icu],[ICU],[icu-i18n >= 66])
(-)a/download.lst (-4 / +4 lines)
Lines 312-321 Link Here
312
# three static lines
312
# three static lines
313
# so that git cherry-pick
313
# so that git cherry-pick
314
# will not run into conflicts
314
# will not run into conflicts
315
ICU_SHA256SUM := 818a80712ed3caacd9b652305e01afc7fa167e6f2e94996da44b90c2ab604ce1
315
ICU_SHA256SUM := 86ce8e60681972e60e4dcb2490c697463fcec60dd400a5f9bffba26d0b52b8d0
316
ICU_TARBALL := icu4c-73_2-src.tgz
316
ICU_TARBALL := icu4c-74_1-src.tgz
317
ICU_DATA_SHA256SUM := ca1ee076163b438461e484421a7679fc33a64cd0a54f9d4b401893fa1eb42701
317
ICU_DATA_SHA256SUM := 67d5ab39c5187e1dd0fed60a3fe52794dce9784b4c045cb85e19f5d317fd783f
318
ICU_DATA_TARBALL := icu4c-73_2-data.zip
318
ICU_DATA_TARBALL := icu4c-74_1-data.zip
319
# three static lines
319
# three static lines
320
# so that git cherry-pick
320
# so that git cherry-pick
321
# will not run into conflicts
321
# will not run into conflicts
(-)a/external/icu/icu4c-khmerbreakengine.patch.1 (-1 lines)
Lines 796-802 Link Here
796
             if (wordCount < limit) {
796
             if (wordCount < limit) {
797
                 if (values != nullptr) {
797
                 if (values != nullptr) {
798
                     values[wordCount] = bt.getValue();
798
                     values[wordCount] = bt.getValue();
799
800
diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
799
diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
801
--- icu.org/source/common/dictionarydata.h	2023-06-14 06:23:55.000000000 +0900
800
--- icu.org/source/common/dictionarydata.h	2023-06-14 06:23:55.000000000 +0900
802
+++ icu/source/common/dictionarydata.h	2023-06-26 17:43:53.097724900 +0900
801
+++ icu/source/common/dictionarydata.h	2023-06-26 17:43:53.097724900 +0900
(-)a/i18npool/source/breakiterator/data/line.txt (-467 / +209 lines)
Lines 1-177 Link Here
1
# Copyright (c) 2002-2006  International Business Machines Corporation and
1
# Copyright (C) 2016 and later: Unicode, Inc. and others.
2
# License & terms of use: http://www.unicode.org/copyright.html
3
# Copyright (c) 2002-2016  International Business Machines Corporation and
2
# others. All Rights Reserved.
4
# others. All Rights Reserved.
3
#
5
#
4
#  file:  line.txt
6
#  file:  line.txt
5
#
7
#
6
#         Line Breaking Rules
8
#         Line Breaking Rules
7
#         Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0
9
#         Implement default line breaking as defined by
8
#         http://www.unicode.org/reports/tr14/
10
#         Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
9
11
#         for Unicode 14.0, with the following modification:
10
12
#
13
#         Boundaries between hyphens and following letters are suppressed when
14
#         there is a boundary preceding the hyphen. See rule 20.9
15
#
16
#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
17
#         It sets characters of class CJ to behave like NS.
11
18
12
#
19
#
13
#  Character Classes defined by TR 14.
20
#  Character Classes defined by TR 14.
14
#
21
#
15
22
16
!!chain;
23
!!chain;
17
!!LBCMNoChain;
24
!!quoted_literals_only;
18
19
20
!!lookAheadHardBreak;
21
#
22
#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
23
#                          and only used for the line break rules.
24
#
25
#           It is used in the implementation of the incredibly annoying rule LB 10
26
#           which says to treat any combining mark that is not attached to a base
27
#           character as if it were of class AL  (alphabetic).
28
#
29
#           The problem occurs in the reverse rules.
30
#
31
#           Consider a sequence like, with correct breaks as shown
32
#               LF  ID  CM  AL  AL
33
#                  ^       ^       ^
34
#           Then consider the sequence without the initial ID (ideographic)
35
#                 LF  CM  AL  AL
36
#                    ^           ^
37
#           Our CM, which in the first example was attached to the ideograph,
38
#           is now unattached, becomes an alpha, and joins in with the other
39
#           alphas.
40
#
41
#           When iterating forwards, these sequences do not present any problems
42
#           When iterating backwards, we need to look ahead when encountering
43
#           a CM to see whether it attaches to something further on or not.
44
#           (Look-ahead in a reverse rule is looking towards the start)
45
#
46
#           If the CM is unattached, we need to force a break.
47
#
48
#           !!lookAheadHardBreak forces the run time state machine to
49
#           stop immediately when a look ahead rule ( '/' operator) matches,
50
#           and set the match position to that of the look-ahead operator,
51
#           no matter what other rules may be in play at the time.
52
#
53
#           See rule LB 19 for an example.
54
#
55
25
56
$AI = [:LineBreak =  Ambiguous:];
26
$AI = [:LineBreak =  Ambiguous:];
57
$DG = \u00B0;
27
$AK = [:LineBreak =  Aksara:];
58
$AL = [[:LineBreak =  Alphabetic:] $DG];
28
$AL = [:LineBreak =  Alphabetic:];
29
$AP = [:LineBreak =  Aksara_Prebase:];
30
$AS = [:LineBreak =  Aksara_Start:];
59
$BA = [:LineBreak =  Break_After:];
31
$BA = [:LineBreak =  Break_After:];
32
$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
60
$BB = [:LineBreak =  Break_Before:];
33
$BB = [:LineBreak =  Break_Before:];
61
$BK = [:LineBreak =  Mandatory_Break:];
34
$BK = [:LineBreak =  Mandatory_Break:];
62
$B2 = [:LineBreak =  Break_Both:];
35
$B2 = [:LineBreak =  Break_Both:];
63
$CB = [:LineBreak =  Contingent_Break:];
36
$CB = [:LineBreak =  Contingent_Break:];
64
$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
37
$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
65
$CL = [[:LineBreak =  Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271
38
$CL = [:LineBreak =  Close_Punctuation:];
66
$CM = [:LineBreak =  Combining_Mark:];
39
# $CM = [:LineBreak =  Combining_Mark:];
40
$CP = [:LineBreak =  Close_Parenthesis:];
67
$CR = [:LineBreak =  Carriage_Return:];
41
$CR = [:LineBreak =  Carriage_Return:];
42
$EB = [:LineBreak =  EB:];
43
$EM = [:LineBreak =  EM:];
68
$EX = [:LineBreak =  Exclamation:];
44
$EX = [:LineBreak =  Exclamation:];
69
$GL = [:LineBreak =  Glue:];
45
$GL = [:LineBreak =  Glue:];
70
$HL = [:LineBreak =  Hebrew_Letter:];
46
$HL = [:LineBreak =  Hebrew_Letter:];
71
$HY = [:LineBreak =  Hyphen:];
47
$HY = [:LineBreak =  Hyphen:];
72
$H2 = [:LineBreak =  H2:];
48
$H2 = [:LineBreak =  H2:];
73
$H3 = [:LineBreak =  H3:];
49
$H3 = [:LineBreak =  H3:];
74
$ID = [[:LineBreak =  Ideographic:] - [\ufe30]];
50
$ID = [:LineBreak =  Ideographic:];
75
$IN = [:LineBreak =  Inseparable:];
51
$IN = [:LineBreak =  Inseperable:];
76
$IS = [[:LineBreak =  Infix_Numeric:] [\ufe30]];
52
$IS = [:LineBreak =  Infix_Numeric:];
77
$JL = [:LineBreak =  JL:];
53
$JL = [:LineBreak =  JL:];
78
$JV = [:LineBreak =  JV:];
54
$JV = [:LineBreak =  JV:];
79
$JT = [:LineBreak =  JT:];
55
$JT = [:LineBreak =  JT:];
80
$LF = [:LineBreak =  Line_Feed:];
56
$LF = [:LineBreak =  Line_Feed:];
81
$NL = [:LineBreak =  Next_Line:];
57
$NL = [:LineBreak =  Next_Line:];
58
# NS includes CJ for CSS strict line breaking.
82
$NS = [[:LineBreak =  Nonstarter:] $CJ];
59
$NS = [[:LineBreak =  Nonstarter:] $CJ];
83
$NU = [:LineBreak =  Numeric:];
60
$NU = [:LineBreak =  Numeric:];
84
$OP = [[:LineBreak =  Open_Punctuation:] - $DG];
61
$OP = [:LineBreak =  Open_Punctuation:];
85
$PO = [:LineBreak =  Postfix_Numeric:];
62
$PO = [:LineBreak =  Postfix_Numeric:];
86
$BS = \u005C;
63
$PR = [:LineBreak =  Prefix_Numeric:];
87
$PR = [[:LineBreak =  Prefix_Numeric:] - $BS];
88
$QU = [:LineBreak =  Quotation:];
64
$QU = [:LineBreak =  Quotation:];
65
$RI = [:LineBreak =  Regional_Indicator:];
89
$SA = [:LineBreak =  Complex_Context:];
66
$SA = [:LineBreak =  Complex_Context:];
90
$SG = [:LineBreak =  Surrogate:];
67
$SG = [:LineBreak =  Surrogate:];
91
$SP = [:LineBreak =  Space:];
68
$SP = [:LineBreak =  Space:];
92
$SY = [[:LineBreak =  Break_Symbols:] $BS];
69
$SY = [:LineBreak =  Break_Symbols:];
70
$VF = [:LineBreak =  Virama_Final:];
71
$VI = [:LineBreak =  Virama:];
93
$WJ = [:LineBreak =  Word_Joiner:];
72
$WJ = [:LineBreak =  Word_Joiner:];
94
$XX = [:LineBreak =  Unknown:];
73
$XX = [:LineBreak =  Unknown:];
95
$ZW = [:LineBreak =  ZWSpace:];
74
$ZW = [:LineBreak =  ZWSpace:];
75
$ZWJ = [:LineBreak = ZWJ:];
76
77
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
78
# without a formal name. Because ICU rules require multiple uses of the expressions,
79
# give them a single definition with a name
80
81
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
82
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
83
84
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
85
86
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
87
#         list it in the numerous rules that use CM.
88
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
89
90
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
91
$CMX = [[$CM] - [$ZWJ]];
96
92
97
#   Dictionary character set, for triggering language-based break engines. Currently
93
#   Dictionary character set, for triggering language-based break engines. Currently
98
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
94
#   limited to LineBreak=Complex_Context (SA).
99
#   5.0 or later as the definition of Complex_Context was corrected to include all
100
#   characters requiring dictionary break.
101
95
102
$dictionary = [:LineBreak = Complex_Context:];
96
$dictionary = [$SA];
103
97
104
#
98
#
105
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
99
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
106
#                               SA  (South East Asian: Thai, Lao, Khmer)
100
#                               SA  (Dictionary chars, excluding Mn and Mc)
107
#                               SG  (Unpaired Surrogates)
101
#                               SG  (Unpaired Surrogates)
108
#                               XX  (Unknown, unassigned)
102
#                               XX  (Unknown, unassigned)
109
#                         as $AL  (Alphabetic)
103
#                         as $AL  (Alphabetic)
110
#
104
#
111
$ALPlus = [$AL $AI $SA $SG $XX];
105
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
112
106
113
#
114
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
115
#
116
$ALcm = $ALPlus $CM*;
117
$BAcm = $BA $CM*;
118
$BBcm = $BB $CM*;
119
$B2cm = $B2 $CM*;
120
$CLcm = $CL $CM*;
121
$EXcm = $EX $CM*;
122
$GLcm = $GL $CM*;
123
$HLcm = $HL $CM*;
124
$HYcm = $HY $CM*;
125
$H2cm = $H2 $CM*;
126
$H3cm = $H3 $CM*;
127
$IDcm = $ID $CM*;
128
$INcm = $IN $CM*;
129
$IScm = $IS $CM*;
130
$JLcm = $JL $CM*;
131
$JVcm = $JV $CM*;
132
$JTcm = $JT $CM*;
133
$NScm = $NS $CM*;
134
$NUcm = $NU $CM*;
135
$OPcm = $OP $CM*;
136
$POcm = $PO $CM*;
137
$PRcm = $PR $CM*;
138
$QUcm = $QU $CM*;
139
$SYcm = $SY $CM*;
140
$WJcm = $WJ $CM*;
141
107
142
## -------------------------------------------------
108
## -------------------------------------------------
143
109
144
!!forward;
145
146
#
147
#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
148
#
149
$ALPlus $CM+;
150
$BA $CM+;
151
$BB $CM+;
152
$B2 $CM+;
153
$CL $CM+;
154
$EX $CM+;
155
$GL $CM+;
156
$HL $CM+;
157
$HY $CM+;
158
$H2 $CM+;
159
$H3 $CM+;
160
$ID $CM+;
161
$IN $CM+;
162
$IS $CM+;
163
$JL $CM+;
164
$JV $CM+;
165
$JT $CM+;
166
$NS $CM+;
167
$NU $CM+;
168
$OP $CM+;
169
$PO $CM+;
170
$PR $CM+;
171
$QU $CM+;
172
$SY $CM+;
173
$WJ $CM+;
174
175
#
110
#
176
# CAN_CM  is the set of characters that may combine with CM combining chars.
111
# CAN_CM  is the set of characters that may combine with CM combining chars.
177
#         Note that Linebreak UAX 14's concept of a combining char and the rules
112
#         Note that Linebreak UAX 14's concept of a combining char and the rules
Lines 186-204 Link Here
186
#
121
#
187
# AL_FOLLOW  set of chars that can unconditionally follow an AL
122
# AL_FOLLOW  set of chars that can unconditionally follow an AL
188
#            Needed in rules where stand-alone $CM s are treated as AL.
123
#            Needed in rules where stand-alone $CM s are treated as AL.
189
#            Chaining is disabled with CM because it causes other failures,
190
#            so for this one case we need to manually list out longer sequences.
191
#
124
#
192
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
125
$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
193
$AL_FOLLOW_CM   = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
194
$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
195
126
196
127
197
#
128
#
198
#  Rule LB 4, 5    Mandatory (Hard) breaks.
129
#  Rule LB 4, 5    Mandatory (Hard) breaks.
199
#
130
#
200
$LB4Breaks    = [$BK $CR $LF $NL];
131
$LB4Breaks    = [$BK $CR $LF $NL];
201
$LB4NonBreaks = [^$BK $CR $LF $NL];
132
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
202
$CR $LF {100};
133
$CR $LF {100};
203
134
204
#
135
#
Lines 206-296 Link Here
206
#
137
#
207
$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
138
$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
208
$CAN_CM $CM*    $LB4Breaks {100};
139
$CAN_CM $CM*    $LB4Breaks {100};
209
$CM+            $LB4Breaks {100};
140
^$CM+           $LB4Breaks {100};
210
141
211
# LB 7         x SP
142
# LB 7         x SP
212
#              x ZW
143
#              x ZW
213
$LB4NonBreaks [$SP $ZW];
144
$LB4NonBreaks [$SP $ZW];
214
$CAN_CM $CM*  [$SP $ZW];
145
$CAN_CM $CM*  [$SP $ZW];
215
$CM+          [$SP $ZW];
146
^$CM+         [$SP $ZW];
216
147
217
#
148
#
218
# LB 8         Break after zero width space
149
# LB 8         Break after zero width space
150
#              ZW SP* ÷
219
#
151
#
220
$LB8Breaks    = [$LB4Breaks $ZW];
152
$LB8Breaks    = [$LB4Breaks $ZW];
221
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
153
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
154
$ZW $SP* / [^$SP $ZW $LB4Breaks];
222
155
156
# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
157
#
158
$ZWJ [^$CM];
223
159
224
# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
160
# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
225
#                                $CM not covered by the above needs to behave like $AL   
161
#                                $CM not covered by the above needs to behave like $AL
226
#                                See definition of $CAN_CM.
162
#                                See definition of $CAN_CM.
227
163
228
$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
164
$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
229
$CM+;
165
^$CM+;
230
166
231
#
167
#
232
# LB 11  Do not break before or after WORD JOINER & related characters.
168
# LB 11  Do not break before or after WORD JOINER & related characters.
233
#
169
#
234
$CAN_CM $CM*  $WJcm;
170
$CAN_CM $CM*  $WJ;
235
$LB8NonBreaks $WJcm;
171
$LB8NonBreaks $WJ;
236
$CM+          $WJcm;
172
^$CM+         $WJ;
237
173
238
$WJcm [^$CAN_CM];
174
$WJ $CM* .;
239
$WJcm $CAN_CM $CM*;
240
175
241
#
176
#
242
# LB 12  Do not break before or after NBSP and related characters.
177
# LB 12  Do not break after NBSP and related characters.
243
#
244
#         (!SP) x GL
245
[$LB8NonBreaks-$SP] $CM* $GLcm;
246
$CM+               $GLcm;
247
248
#         GL  x
178
#         GL  x
249
$GLcm ($LB8Breaks | $SP);
179
#
250
$GLcm [$LB8NonBreaks-$SP] $CM*;     # Don't let a combining mark go onto $CR, $BK, etc.
180
$GL $CM* .;
251
                              #  TODO:  I don't think we need this rule.
252
                              #         All but $CM will chain off of preceding rule.
253
                              #         $GLcm will pick up the CM case by itself.
254
255
256
257
181
258
#
182
#
259
# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
183
# LB 12a  Do not break before NBSP and related characters ...
184
#            [^SP BA HY] x GL
185
#
186
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
187
^$CM+ $GL;
188
189
190
191
192
# LB 13   Don't break before ']' or '!' or '/', even after spaces.
260
#
193
#
261
$LB8NonBreaks $CL;
194
$LB8NonBreaks $CL;
262
$CAN_CM $CM*  $CL;
195
$CAN_CM $CM*  $CL;
263
$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
196
^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
197
198
$LB8NonBreaks $CP;
199
$CAN_CM $CM*  $CP;
200
^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
264
201
265
$LB8NonBreaks $EX;
202
$LB8NonBreaks $EX;
266
$CAN_CM $CM*  $EX;
203
$CAN_CM $CM*  $EX;
267
$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
204
^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
268
269
$LB8NonBreaks $IS;
270
$CAN_CM $CM*  $IS;
271
$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
272
205
273
$LB8NonBreaks $SY;
206
$LB8NonBreaks $SY;
274
$CAN_CM $CM*  $SY;
207
$CAN_CM $CM*  $SY;
275
$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
208
^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
276
209
277
210
278
#
211
#
279
# LB 14  Do not break after OP, even after spaced
212
# LB 14  Do not break after OP, even after spaces
213
#        Note subtle interaction with "SP IS /" rules in LB14a.
214
#        This rule consumes the SP, chaining happens on the IS, effectively overriding the  SP IS rules,
215
#        which is the desired behavior.
280
#
216
#
281
$OPcm $SP* $CAN_CM $CM*;
217
$OP $CM* $SP* .;
282
$OPcm $SP* $CANT_CM;
283
218
284
$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
219
$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
220
                                   # by rule 8, CM following a SP is stand-alone.
285
221
286
# LB 15
222
287
# $QUcm $SP* $OPcm;
223
# LB 15a
224
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
225
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
226
^([\p{Pi} & $QU] $CM* $SP*)+ .;
227
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
228
229
# LB 15b
230
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
231
$CAN_CM $CM*  [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
232
^$CM+  [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
233
234
# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
235
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
236
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
237
$CAN_CM $CM*  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
238
$CAN_CM $CM*  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
239
^$CM+  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
240
^$CM+  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
241
242
243
# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
244
#        Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
245
#        See issue ICU-20303
246
247
248
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
249
$SP $IS           / [^ $CanFollowIS $NU $CM];
250
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
251
252
#
253
# LB 15d Do not break before numeric separators (IS), even after spaces.
254
255
[$LB8NonBreaks - $SP] $IS;
256
$SP $IS $CM* [$CanFollowIS {eof}];
257
$SP $IS $CM* $ZWJ [^$CM $NU];
258
259
$CAN_CM $CM*  $IS;
260
^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
261
288
262
289
# LB 16
263
# LB 16
290
$CLcm $SP* $NScm;
264
($CL | $CP) $CM* $SP* $NS;
291
265
292
# LB 17
266
# LB 17
293
$B2cm $SP* $B2cm;
267
$B2 $CM* $SP* $B2;
294
268
295
#
269
#
296
# LB 18  Break after spaces.
270
# LB 18  Break after spaces.
Lines 301-647 Link Here
301
275
302
# LB 19
276
# LB 19
303
#         x QU
277
#         x QU
304
$LB18NonBreaks $CM* $QUcm;
278
$LB18NonBreaks $CM* $QU;
305
$CM+                $QUcm;
279
^$CM+               $QU;
306
280
307
#         QU  x
281
#         QU  x
308
$QUcm .?;
282
$QU $CM* .;
309
$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
310
                              #  TODO:  I don't think this rule is needed.
311
312
283
313
# LB 20
284
# LB 20
314
#        <break>  $CB
285
#        <break>  $CB
315
#        $CB   <break>
286
#        $CB   <break>
316
287
#
317
$LB20NonBreaks = [$LB18NonBreaks - $CB];
288
$LB20NonBreaks = [$LB18NonBreaks - $CB];
318
289
290
# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
291
#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
292
#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
293
#
294
^($HY | $HH) $CM* $ALPlus;
295
319
# LB 21        x   (BA | HY | NS)
296
# LB 21        x   (BA | HY | NS)
320
#           BB x
297
#           BB x
321
#
298
#
322
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
299
$LB20NonBreaks $CM* ($BA | $HY | $NS);
323
300
324
$BBcm [^$CB];                                  #  $BB  x
301
325
$BBcm $LB20NonBreaks $CM*;
302
^$CM+ ($BA | $HY | $NS);
303
304
$BB $CM* [^$CB];                                  #  $BB  x
305
$BB $CM* $LB20NonBreaks;
326
306
327
# LB 21a Don't break after Hebrew + Hyphen
307
# LB 21a Don't break after Hebrew + Hyphen
328
#   HL (HY | BA) x
308
#   HL (HY | BA) x
329
#  
309
#
330
$HLcm ($HYcm | $BAcm) [^$CB]?;
310
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
331
311
332
# LB 22
312
# LB 21b (forward) Don't break between SY and HL
333
($ALcm | $HLcm) $INcm;
313
# (break between HL and SY already disallowed by LB 13 above)
334
$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
314
$SY $CM* $HL;
335
$IDcm    $INcm;
315
336
$INcm    $INcm;
316
# LB 22  Do not break before ellipses
337
$NUcm    $INcm;
317
#
318
$LB20NonBreaks $CM*    $IN;
319
^$CM+ $IN;
338
320
339
321
340
# $LB 23
322
# LB 23
341
$IDcm  $POcm;
323
#
342
$ALcm  $NUcm;       # includes $LB19
324
($ALPlus | $HL) $CM* $NU;
343
$HLcm  $NUcm;
325
^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
344
$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
326
$NU $CM* ($ALPlus | $HL);
345
$NUcm  $ALcm;
327
346
$NUcm  $HLcm;
328
# LB 23a
329
#
330
$PR $CM* ($ID | $EB | $EM);
331
($ID | $EB | $EM) $CM*  $PO;
332
347
333
348
#
334
#
349
# LB 24
335
# LB 24
350
#
336
#
351
$PRcm $IDcm;
337
($PR | $PO) $CM* ($ALPlus | $HL);
352
$ALcm $PRcm;
338
($ALPlus | $HL) $CM* ($PR | $PO);
353
$PRcm ($ALcm | $HLcm);
339
^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
354
$POcm ($ALcm | $HLcm);
355
340
356
#
341
#
357
# LB 25   Numbers.
342
# LB 25   Numbers.
358
#
343
#
359
($PRcm | $POcm)? ($OPcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? ($PRcm | $POcm)?;
344
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
345
    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
360
346
361
# LB 26  Do not break a Korean syllable
347
# LB 26  Do not break a Korean syllable
362
#
348
#
363
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
349
$JL $CM* ($JL | $JV | $H2 | $H3);
364
($JVcm | $H2cm) ($JVcm | $JTcm);
350
($JV | $H2) $CM* ($JV | $JT);
365
($JTcm | $H3cm) $JTcm;
351
($JT | $H3) $CM* $JT;
366
352
367
# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
353
# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
368
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
354
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
369
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
355
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
370
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
371
356
372
357
373
# LB 28   Do not break between alphabetics
358
# LB 28   Do not break between alphabetics
374
#
359
#
375
($ALcm | $HLcm) ($ALcm | $HLcm);
360
($ALPlus | $HL) $CM* ($ALPlus | $HL);
376
$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
361
^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
362
363
#LB 28a  Do not break Orthographic syllables
364
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;
377
365
378
# LB 29
366
# LB 29
379
$IScm ($ALcm | $NUcm);
367
$IS $CM* ($ALPlus | $HL);
380
381
#
382
# Rule 30   Do not break between letters, numbers or ordinary symbols
383
#           and opening or closing punctuation
384
#
385
($ALcm | $HLcm | $NUcm) $OPcm;
386
$CM+ $OPcm;
387
$CLcm ($ALcm | $HLcm | $NUcm);
388
389
#
390
#  Reverse Rules.
391
#
392
## -------------------------------------------------
393
394
!!reverse;
395
396
$CM+ $ALPlus;
397
$CM+ $BA;
398
$CM+ $BB;
399
$CM+ $B2;
400
$CM+ $CL;
401
$CM+ $EX;
402
$CM+ $GL;
403
$CM+ $HL;
404
$CM+ $HY;
405
$CM+ $H2;
406
$CM+ $H3;
407
$CM+ $ID;
408
$CM+ $IN;
409
$CM+ $IS;
410
$CM+ $JL;
411
$CM+ $JV;
412
$CM+ $JT;
413
$CM+ $NS;
414
$CM+ $NU;
415
$CM+ $OP;
416
$CM+ $PO;
417
$CM+ $PR;
418
$CM+ $QU;
419
$CM+ $SY;
420
$CM+ $WJ;
421
$CM+;
422
423
424
#
425
#  Sequences of the form  (shown forwards)
426
#      [CANT_CM]  <break>  [CM]  [whatever]
427
#  The CM needs to behave as an AL
428
#
429
$AL_FOLLOW $CM+ / (
430
          [$BK $CR $LF $NL $ZW {eof}] |
431
          $SP+ $CM+ $SP |
432
          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
433
                                               #  LB14 says    OP SP* x .        
434
                                               #    becomes    OP SP* x AL
435
                                               #    becomes    OP SP* x CM+ AL_FOLLOW
436
                                               #
437
                                               # Further note:  the $AL in [$AL {eof}] is only to work around
438
                                               #                a rule compiler bug which complains about
439
                                               #                empty sets otherwise.
440
          
441
#
442
#  Sequences of the form  (shown forwards)
443
#      [CANT_CM]  <break> [CM]  <break>  [PR]
444
#  The CM needs to behave as an AL
445
#  This rule is concerned about getting the second of the two <breaks> in place.
446
#
447
448
[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
449
450
451
452
# LB 4, 5, 5
453
454
$LB4Breaks [$LB4NonBreaks-$CM];
455
$LB4Breaks $CM+ $CAN_CM;
456
$LF $CR;
457
458
459
# LB 7         x SP
460
#              x ZW
461
[$SP $ZW] [$LB4NonBreaks-$CM];
462
[$SP $ZW] $CM+ $CAN_CM;
463
464
# LB 8 Break after zero width space
465
466
467
# LB 9,10  Combining marks.
468
#    X   $CM needs to behave like X, where X is not $SP or controls.
469
#    $CM not covered by the above needs to behave like $AL
470
# Stick together any combining sequences that don't match other rules.
471
$CM+ $CAN_CM;
472
473
474
# LB 11
475
$CM* $WJ $CM* $CAN_CM;
476
$CM* $WJ      [$LB8NonBreaks-$CM];
477
478
     $CANT_CM $CM* $WJ;
479
$CM* $CAN_CM  $CM* $WJ;
480
481
# LB 12
482
#         x GL
483
#
484
$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP];
485
486
#
487
#     GL  x
488
#
489
$CANT_CM $CM* $GL;
490
$CM* $CAN_CM $CM* $GL;
491
492
493
# LB 13
494
$CL $CM+ $CAN_CM;
495
$EX $CM+ $CAN_CM;
496
$IS $CM+ $CAN_CM;
497
$SY $CM+ $CAN_CM;
498
499
$CL [$LB8NonBreaks-$CM];
500
$EX [$LB8NonBreaks-$CM];
501
$IS [$LB8NonBreaks-$CM];
502
$SY [$LB8NonBreaks-$CM];
503
504
# Rule 13 & 14 taken together for an edge case.
505
#   Match this, shown forward
506
#     OP SP+  ($CM+ behaving as $AL) (CL | EX | IS | IY)
507
#   This really wants to chain at the $CM+ (which is acting as an $AL)
508
#   except for $CM chaining being disabled.
509
[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
510
511
# LB 14    OP SP* x
512
#
513
$CM* $CAN_CM    $SP* $CM* $OP;
514
     $CANT_CM   $SP* $CM* $OP;
515
$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
516
     
517
     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
518
$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
519
$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
520
521
522
523
# LB 15
524
# $CM* $OP $SP* $CM* $QU;
525
526
# LB 16
527
$CM* $NS $SP* $CM* $CL;
528
529
# LB 17
530
$CM* $B2 $SP* $CM* $B2;
531
532
# LB 18  break after spaces
533
#        Nothing explicit needed here.
534
535
536
#
537
# LB 19
538
#
539
$CM* $QU $CM* $CAN_CM;                                #   . x QU
540
$CM* $QU      $LB18NonBreaks;
541
542
543
$CM* $CAN_CM  $CM* $QU;                               #   QU x .
544
     $CANT_CM $CM* $QU;
545
     
546
#
547
#  LB 20  Break before and after CB.
548
#         nothing needed here.
549
#
550
551
# LB 21
552
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
553
554
$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
555
[^$CB] $CM* $BB;                                      # 
556
557
# LB21a
558
[^$CB] $CM* ($HY | $BA) $CM* $HL;
559
560
# LB 22
561
$CM* $IN $CM* ($ALPlus | $HL);
562
$CM* $IN $CM* $ID;
563
$CM* $IN $CM* $IN;
564
$CM* $IN $CM* $NU;
565
566
# LB 23
567
$CM* $PO $CM* $ID;
568
$CM* $NU $CM* ($ALPlus | $HL);
569
$CM* ($ALPlus | $HL) $CM* $NU;
570
571
# LB 24
572
$CM* $ID $CM* $PR;
573
$CM* $PR $CM* $ALPlus;
574
$CM* ($ALPlus | $HL) $CM* $PR;
575
$CM* ($ALPlus | $HL) $CM* $PO;
576
577
$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP;
578
$CM* $NU+ $CM* $HY+ / $SP;
579
580
# LB 25
581
($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? ($CM* ($PR | $PO))?;
582
583
# LB 26
584
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
585
$CM* ($JT | $JV) $CM* ($H2 | $JV);
586
$CM* $JT $CM* ($H3 | $JT);
587
588
# LB 27
589
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
590
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
591
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
592
593
# LB 28
594
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
595
596
# LB 29
597
$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP];
598
368
599
# LB 30
369
# LB 30
600
$CM* $OP $CM* ($ALPlus | $HL | $NU);
370
($ALPlus | $HL | $NU) $CM* $OP30;
601
$CM* ($ALPlus | $HL | $NU) $CM* ($CL | $SY)+ [^$SP];
371
^$CM+ $OP30;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
372
$CP30 $CM* ($ALPlus | $HL | $NU);
602
373
374
# LB 30a  Do not break between regional indicators. Break after pairs of them.
375
#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
376
$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
377
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
378
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
379
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
380
#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
381
#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
603
382
604
## -------------------------------------------------
383
# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
384
$EB $CM* $EM;
385
$ExtPictUnassigned $CM* $EM;
605
386
606
!!safe_reverse;
387
# LB 31 Break everywhere else.
607
388
#       Match a single code point if no other rule applies.
608
# LB 7
389
.;
609
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
610
$CM+ $SP / .;
611
612
# LB 9
613
$SP+ $CM* $OP;
614
615
# LB 10
616
$SP+ $CM* $QU;
617
618
# LB 11
619
$SP+ $CM* $CL;
620
$SP+ $CM* $B2;
621
622
# LB 21
623
$CM* ($HY | $BA) $CM* $HL;
624
625
# LB 18
626
($CM* ($IS | $SY))+ $CM* $NU;
627
$CL $CM* ($NU | $IS | $SY);
628
629
# For dictionary-based break
630
$dictionary $dictionary;
631
632
## -------------------------------------------------
633
634
!!safe_forward;
635
636
# Skip forward over all character classes that are involved in
637
#   rules containing patterns with possibly more than one char
638
#   of context.
639
#
640
#  It might be slightly more efficient to have specific rules
641
#  instead of one generic one, but only if we could
642
#  turn off rule chaining.  We don't want to move more
643
#  than necessary.
644
#
645
[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary];
646
$dictionary $dictionary;
647
(-)a/i18npool/source/breakiterator/data/sent.txt (-66 / +21 lines)
Lines 1-43 Link Here
1
# Copyright (C) 2016 and later: Unicode, Inc. and others.
2
# License & terms of use: http://www.unicode.org/copyright.html
1
#
3
#
2
#   Copyright (C) 2002-2006, International Business Machines Corporation and others.
4
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
3
#       All Rights Reserved.
5
#       All Rights Reserved.
4
#
6
#
5
#   file:  sent.txt
7
#   file:  sent.txt
6
#
8
#
7
#   ICU Sentence Break Rules
9
#   ICU Sentence Break Rules
8
#      See Unicode Standard Annex #29.
10
#      See Unicode Standard Annex #29.
9
#      These rules are based on SA 29 version 5.0.0
11
#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
10
#      Includes post 5.0 changes to treat Japanese half width voicing marks
11
#        as Grapheme Extend.
12
#
12
#
13
13
14
14
!!quoted_literals_only;
15
$VoiceMarks   = [\uff9e\uff9f];
16
$Thai         = [:Script = Thai:];
17
15
18
#
16
#
19
# Character categories as defined in TR 29
17
# Character categories as defined in TR 29
20
#
18
#
19
$CR        = [\p{Sentence_Break = CR}];
20
$LF        = [\p{Sentence_Break = LF}];
21
$Extend    = [\p{Sentence_Break = Extend}];
21
$Sep       = [\p{Sentence_Break = Sep}];
22
$Sep       = [\p{Sentence_Break = Sep}];
22
$Format    = [\p{Sentence_Break = Format}];
23
$Format    = [\p{Sentence_Break = Format}];
23
$Sp        = [\p{Sentence_Break = Sp}];
24
$Sp        = [\p{Sentence_Break = Sp}];
24
$Lower     = [\p{Sentence_Break = Lower}];
25
$Lower     = [\p{Sentence_Break = Lower}];
25
$Upper     = [\p{Sentence_Break = Upper}];
26
$Upper     = [\p{Sentence_Break = Upper}];
26
$OLetter   = [\p{Sentence_Break = OLetter}-$VoiceMarks];
27
$OLetter   = [\p{Sentence_Break = OLetter}];
27
$Numeric   = [\p{Sentence_Break = Numeric}];
28
$Numeric   = [\p{Sentence_Break = Numeric}];
28
$ATerm     = [\p{Sentence_Break = ATerm}];
29
$ATerm     = [\p{Sentence_Break = ATerm}];
30
$SContinue = [\p{Sentence_Break = SContinue}];
29
$STerm     = [\p{Sentence_Break = STerm}];
31
$STerm     = [\p{Sentence_Break = STerm}];
30
$Close     = [\p{Sentence_Break = Close}];
32
$Close     = [\p{Sentence_Break = Close}];
31
33
32
#
34
#
33
# Define extended forms of the character classes,
35
# Define extended forms of the character classes,
34
#   incorporate grapheme cluster + format chars.
36
#   incorporate trailing Extend or Format chars.
35
#   Rules 4 and 5.  
37
#   Rules 4 and 5.
36
37
38
$CR         = \u000d;
39
$LF         = \u000a;
40
$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
41
38
42
$SpEx       = $Sp      ($Extend | $Format)*;
39
$SpEx       = $Sp      ($Extend | $Format)*;
43
$LowerEx    = $Lower   ($Extend | $Format)*;
40
$LowerEx    = $Lower   ($Extend | $Format)*;
Lines 45-50 Link Here
45
$OLetterEx  = $OLetter ($Extend | $Format)*;
42
$OLetterEx  = $OLetter ($Extend | $Format)*;
46
$NumericEx  = $Numeric ($Extend | $Format)*;
43
$NumericEx  = $Numeric ($Extend | $Format)*;
47
$ATermEx    = $ATerm   ($Extend | $Format)*;
44
$ATermEx    = $ATerm   ($Extend | $Format)*;
45
$SContinueEx= $SContinue ($Extend | $Format)*;
48
$STermEx    = $STerm   ($Extend | $Format)*;
46
$STermEx    = $STerm   ($Extend | $Format)*;
49
$CloseEx    = $Close   ($Extend | $Format)*;
47
$CloseEx    = $Close   ($Extend | $Format)*;
50
48
Lines 52-128 Link Here
52
## -------------------------------------------------
50
## -------------------------------------------------
53
51
54
!!chain;
52
!!chain;
55
!!forward;
56
53
57
# Rule 3 - break after separators.  Keep CR/LF together.
54
# Rule 3 - break after separators.  Keep CR/LF together.
58
#
55
#
59
$CR $LF;
56
$CR $LF;
60
57
61
$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
62
$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;
63
58
64
# Rule 4 - Break after $Sep.
59
# Rule 4 - Break after $Sep.
65
# Rule 5 - Ignore $Format and $Extend
60
# Rule 5 - Ignore $Format and $Extend
66
#
61
#
67
[^$Sep]? ($Extend | $Format)*;
62
[^$Sep $CR $LF]? ($Extend | $Format)*;
68
63
69
64
70
# Rule 6
65
# Rule 6
71
$ATermEx $NumericEx;
66
$ATermEx $NumericEx;
72
67
73
# Rule 7
68
# Rule 7
74
$UpperEx $ATermEx $UpperEx;
69
($UpperEx | $LowerEx) $ATermEx $UpperEx;
75
70
76
#Rule 8
71
#Rule 8
77
#  Note:  follows errata for Unicode 5.0 boundary rules.
72
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*;
78
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
79
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
73
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
80
74
81
# Rule 8a
75
# Rule 8a
82
($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);
76
($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx);
83
77
84
#Rule 9, 10, 11
78
#Rule 9, 10, 11
85
($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;
79
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
86
80
87
#Rule 12
81
#Rule 998
88
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai];
82
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
89
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};
83
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
90
91
## -------------------------------------------------
92
93
!!reverse;
94
95
$SpEx_R       = ($Extend | $Format)* $Sp;
96
$ATermEx_R    = ($Extend | $Format)* $ATerm;
97
$STermEx_R    = ($Extend | $Format)* $STerm;
98
$CloseEx_R    = ($Extend | $Format)* $Close;
99
100
#
101
#  Reverse rules.
102
#     For now, use the old style inexact reverse rules, which are easier
103
#     to write, but less efficient.
104
#     TODO:  exact reverse rules.  It appears that exact reverse rules
105
#            may require improving support for look-ahead breaks in the
106
#            builder.  Needs more investigation.
107
#
108
109
[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
110
#.*;
111
112
# Explanation for this rule:
113
#
114
#    It needs to back over
115
#        The $Sep at which we probably begin
116
#        All of the non $Sep chars leading to the preceding $Sep
117
#        The preceding $Sep, which will be the second one that the rule matches.
118
#        Any immediately preceding STerm or ATerm sequences.  We need to see these
119
#              to get the correct rule status when moving forwards again.
120
#        
121
# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
122
#                   the entire string.
123
#
124
# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
125
#                   at the beginning of the string at this point, and we don't want to fail.
126
#                   Can only use {eof} once, and it is used later.
127
#
128
(-)a/include/svx/strings.hrc (+1 lines)
Lines 1800-1805 Link Here
1800
#define RID_SUBSETSTR_KAKTOVIK_NUMERALS                     NC_("RID_SUBSETMAP", "Kaktovik Numerals")
1800
#define RID_SUBSETSTR_KAKTOVIK_NUMERALS                     NC_("RID_SUBSETMAP", "Kaktovik Numerals")
1801
#define RID_SUBSETSTR_KAWI                                  NC_("RID_SUBSETMAP", "Kawi")
1801
#define RID_SUBSETSTR_KAWI                                  NC_("RID_SUBSETMAP", "Kawi")
1802
#define RID_SUBSETSTR_NAG_MUNDARI                           NC_("RID_SUBSETMAP", "Nag Mundari")
1802
#define RID_SUBSETSTR_NAG_MUNDARI                           NC_("RID_SUBSETMAP", "Nag Mundari")
1803
#define RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I    NC_("RID_SUBSETMAP", "CJK Unified Ideographs Extension I")
1803
1804
1804
#define RID_SVXSTR_FRAMEDIR_LTR                             NC_("RID_SVXSTR_FRAMEDIR_LTR", "Left-to-right (LTR)")
1805
#define RID_SVXSTR_FRAMEDIR_LTR                             NC_("RID_SVXSTR_FRAMEDIR_LTR", "Left-to-right (LTR)")
1805
#define RID_SVXSTR_FRAMEDIR_RTL                             NC_("RID_SVXSTR_FRAMEDIR_RTL", "Right-to-left (RTL)")
1806
#define RID_SVXSTR_FRAMEDIR_RTL                             NC_("RID_SVXSTR_FRAMEDIR_RTL", "Right-to-left (RTL)")
(-)a/svx/source/dialog/charmap.cxx (+5 lines)
Lines 1924-1929 Link Here
1924
                    aAllSubsets.emplace_back( 0x1E4D0, 0x1E4FF, SvxResId(RID_SUBSETSTR_NAG_MUNDARI) );
1924
                    aAllSubsets.emplace_back( 0x1E4D0, 0x1E4FF, SvxResId(RID_SUBSETSTR_NAG_MUNDARI) );
1925
                    break;
1925
                    break;
1926
#endif
1926
#endif
1927
#if (U_ICU_VERSION_MAJOR_NUM >= 74)
1928
                case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I:
1929
                    aAllSubsets.emplace_back( 0x2EBF0, 0x2EE5F, SvxResId(RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I) );
1930
                    break;
1931
#endif
1927
            }
1932
            }
1928
1933
1929
#if OSL_DEBUG_LEVEL > 0 && !defined NDEBUG
1934
#if OSL_DEBUG_LEVEL > 0 && !defined NDEBUG

Return to bug 917618