Attachment #875923 for bug #917618

View | Details | Raw Unified | Return to bug 917618
Collapse All | Expand All

Lines 10803-10810 Link Here

(-)a/configure.ac (-2 / +2 lines)
10803	dnl ===================================================================	10803	dnl ===================================================================
10804	dnl Check for system icu	10804	dnl Check for system icu
10805	dnl ===================================================================	10805	dnl ===================================================================
10806	ICU_MAJOR=73	10806	ICU_MAJOR=74
10807	ICU_MINOR=2	10807	ICU_MINOR=1
10808	ICU_CFLAGS_internal="-I${WORKDIR}/UnpackedTarball/icu/source/i18n -I${WORKDIR}/UnpackedTarball/icu/source/common"	10808	ICU_CFLAGS_internal="-I${WORKDIR}/UnpackedTarball/icu/source/i18n -I${WORKDIR}/UnpackedTarball/icu/source/common"
10809	ICU_LIBS_internal="-L${WORKDIR}/UnpackedTarball/icu/source/lib"	10809	ICU_LIBS_internal="-L${WORKDIR}/UnpackedTarball/icu/source/lib"
10810	libo_CHECK_SYSTEM_MODULE([icu],[ICU],[icu-i18n >= 66])	10810	libo_CHECK_SYSTEM_MODULE([icu],[ICU],[icu-i18n >= 66])

Lines 312-321 Link Here

(-)a/download.lst (-4 / +4 lines)
312	# three static lines	312	# three static lines
313	# so that git cherry-pick	313	# so that git cherry-pick
314	# will not run into conflicts	314	# will not run into conflicts
315	ICU_SHA256SUM := 818a80712ed3caacd9b652305e01afc7fa167e6f2e94996da44b90c2ab604ce1	315	ICU_SHA256SUM := 86ce8e60681972e60e4dcb2490c697463fcec60dd400a5f9bffba26d0b52b8d0
316	ICU_TARBALL := icu4c-73_2-src.tgz	316	ICU_TARBALL := icu4c-74_1-src.tgz
317	ICU_DATA_SHA256SUM := ca1ee076163b438461e484421a7679fc33a64cd0a54f9d4b401893fa1eb42701	317	ICU_DATA_SHA256SUM := 67d5ab39c5187e1dd0fed60a3fe52794dce9784b4c045cb85e19f5d317fd783f
318	ICU_DATA_TARBALL := icu4c-73_2-data.zip	318	ICU_DATA_TARBALL := icu4c-74_1-data.zip
319	# three static lines	319	# three static lines
320	# so that git cherry-pick	320	# so that git cherry-pick
321	# will not run into conflicts	321	# will not run into conflicts

Lines 796-802 Link Here

(-)a/external/icu/icu4c-khmerbreakengine.patch.1 (-1 lines)
796	if (wordCount < limit) {	796	if (wordCount < limit) {
797	if (values != nullptr) {	797	if (values != nullptr) {
798	values[wordCount] = bt.getValue();	798	values[wordCount] = bt.getValue();
799
800	diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h	799	diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
801	--- icu.org/source/common/dictionarydata.h 2023-06-14 06:23:55.000000000 +0900	800	--- icu.org/source/common/dictionarydata.h 2023-06-14 06:23:55.000000000 +0900
802	+++ icu/source/common/dictionarydata.h 2023-06-26 17:43:53.097724900 +0900	801	+++ icu/source/common/dictionarydata.h 2023-06-26 17:43:53.097724900 +0900




# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016  International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line.txt
#
#         Line Breaking Rules
#         Implement default line breaking as defined by
#         Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
#         for Unicode 14.0, with the following modification:
#
#         Boundaries between hyphens and following letters are suppressed when
#         there is a boundary preceding the hyphen. See rule 20.9
#
#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
#         It sets characters of class CJ to behave like NS.

#
#  Character Classes defined by TR 14.
#

!!chain;
!!quoted_literals_only;


!!lookAheadHardBreak;
#
#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
#                          and only used for the line break rules.
#
#           It is used in the implementation of the incredibly annoying rule LB 10
#           which says to treat any combining mark that is not attached to a base
#           character as if it were of class AL  (alphabetic).
#
#           The problem occurs in the reverse rules.
#
#           Consider a sequence like the following, with correct breaks as shown
#               LF  ID  CM  AL  AL
#                  ^       ^       ^
#           Then consider the sequence without the initial ID (ideographic)
#                 LF  CM  AL  AL
#                    ^           ^
#           Our CM, which in the first example was attached to the ideograph,
#           is now unattached, becomes an alpha, and joins in with the other
#           alphas.
#
#           When iterating forwards, these sequences do not present any problems
#           When iterating backwards, we need to look ahead when encountering
#           a CM to see whether it attaches to something further on or not.
#           (Look-ahead in a reverse rule is looking towards the start)
#
#           If the CM is unattached, we need to force a break.
#
#           !!lookAheadHardBreak forces the run time state machine to
#           stop immediately when a look ahead rule ( '/' operator) matches,
#           and set the match position to that of the look-ahead operator,
#           no matter what other rules may be in play at the time.
#
#           See rule LB 19 for an example.
#

$AI = [:LineBreak =  Ambiguous:];
$AK = [:LineBreak =  Aksara:];
$AL = [:LineBreak =  Alphabetic:];
$AP = [:LineBreak =  Aksara_Prebase:];
$AS = [:LineBreak =  Aksara_Start:];
$BA = [:LineBreak =  Break_After:];
$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak =  Break_Before:];
$BK = [:LineBreak =  Mandatory_Break:];
$B2 = [:LineBreak =  Break_Both:];
$CB = [:LineBreak =  Contingent_Break:];
$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
$CL = [:LineBreak =  Close_Punctuation:];
# $CM = [:LineBreak =  Combining_Mark:];
$CP = [:LineBreak =  Close_Parenthesis:];
$CR = [:LineBreak =  Carriage_Return:];
$EB = [:LineBreak =  EB:];
$EM = [:LineBreak =  EM:];
$EX = [:LineBreak =  Exclamation:];
$GL = [:LineBreak =  Glue:];
$HL = [:LineBreak =  Hebrew_Letter:];
$HY = [:LineBreak =  Hyphen:];
$H2 = [:LineBreak =  H2:];
$H3 = [:LineBreak =  H3:];
$ID = [:LineBreak =  Ideographic:];
$IN = [:LineBreak =  Inseperable:];
$IS = [:LineBreak =  Infix_Numeric:];
$JL = [:LineBreak =  JL:];
$JV = [:LineBreak =  JV:];
$JT = [:LineBreak =  JT:];
$LF = [:LineBreak =  Line_Feed:];
$NL = [:LineBreak =  Next_Line:];
# NS includes CJ for CSS strict line breaking.
$NS = [[:LineBreak =  Nonstarter:] $CJ];
$NU = [:LineBreak =  Numeric:];
$OP = [:LineBreak =  Open_Punctuation:];
$PO = [:LineBreak =  Postfix_Numeric:];
# NOTE(review): $PR is assigned twice in a row, and $BS is not defined anywhere in this view.
#               This text comes from a rendered diff, so these look like the old and the new
#               version of the same line shown together -- confirm against the real line.txt.
$PR = [:LineBreak =  Prefix_Numeric:];
$PR = [[:LineBreak =  Prefix_Numeric:] - $BS];
$QU = [:LineBreak =  Quotation:];
$RI = [:LineBreak =  Regional_Indicator:];
$SA = [:LineBreak =  Complex_Context:];
$SG = [:LineBreak =  Surrogate:];
$SP = [:LineBreak =  Space:];
$SY = [:LineBreak =  Break_Symbols:];
$VF = [:LineBreak =  Virama_Final:];
$VI = [:LineBreak =  Virama:];
$WJ = [:LineBreak =  Word_Joiner:];
$XX = [:LineBreak =  Unknown:];
$ZW = [:LineBreak =  ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];

# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name

$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];

$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];

# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
#         list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.

$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
# $CMX is $CM with $ZWJ removed. The LB 15c rule uses it to match a trailing
#         combining mark that is not a ZWJ.
$CMX = [[$CM] - [$ZWJ]];

#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context (SA). This only works with Unicode
#   5.0 or later, as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$dictionary = [$SA];

#
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
#                               SA  (Dictionary chars, excluding Mn and Mc)
#                               SG  (Unpaired Surrogates)
#                               XX  (Unknown, unassigned)
#                         as $AL  (Alphabetic)
#
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
#
$ALcm = $ALPlus $CM*;
$BAcm = $BA $CM*;
$BBcm = $BB $CM*;
$B2cm = $B2 $CM*;
$CLcm = $CL $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
$HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$JVcm = $JV $CM*;
$JTcm = $JT $CM*;
$NScm = $NS $CM*;
$NUcm = $NU $CM*;
$OPcm = $OP $CM*;
$POcm = $PO $CM*;
$PRcm = $PR $CM*;
$QUcm = $QU $CM*;
$SYcm = $SY $CM*;
$WJcm = $WJ $CM*;

## -------------------------------------------------

!!forward;

#
#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
#
$ALPlus $CM+;
$BA $CM+;
$BB $CM+;
$B2 $CM+;
$CL $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
$HY $CM+;
$H2 $CM+;
$H3 $CM+;
$ID $CM+;
$IN $CM+;
$IS $CM+;
$JL $CM+;
$JV $CM+;
$JT $CM+;
$NS $CM+;
$NU $CM+;
$OP $CM+;
$PO $CM+;
$PR $CM+;
$QU $CM+;
$SY $CM+;
$WJ $CM+;

#
# CAN_CM  is the set of characters that may combine with CM combining chars.
#         Note that Linebreak UAX 14's concept of a combining char and the rules

#
# AL_FOLLOW  set of chars that can unconditionally follow an AL
#            Needed in rules where stand-alone $CM s are treated as AL.
#            Chaining is disabled with CM because it causes other failures,
#            so for this one case we need to manually list out longer sequences.
#
# NOTE(review): $AL_FOLLOW is assigned twice below (a flat list, then a union of
#            $AL_FOLLOW_NOCM and $AL_FOLLOW_CM), and $AL_FOLLOW_NOCM is not defined
#            anywhere in this view. This text comes from a rendered diff, so the two
#            forms are presumably the old and new versions shown together -- confirm
#            which one the target file is meant to keep.
#
$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW_CM   = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];


#
#  Rule LB 4, 5    Mandatory (Hard) breaks.
#
$LB4Breaks    = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};

#

#
$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
$CAN_CM $CM*    $LB4Breaks {100};
^$CM+           $LB4Breaks {100};

# LB 7         x SP
#              x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM*  [$SP $ZW];
^$CM+         [$SP $ZW];

#
# LB 8         Break after zero width space
#              ZW SP* ÷
#
$LB8Breaks    = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];

# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
#
$ZWJ [^$CM];

# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
#                                $CM not covered by the above needs to behave like $AL
#                                See definition of $CAN_CM.

$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
^$CM+;

#
# LB 11  Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM*  $WJ;
$LB8NonBreaks $WJ;
^$CM+         $WJ;

$WJ $CM* .;
$WJcm $CAN_CM $CM*;

#
# LB 12  Do not break after NBSP and related characters.
#
#         (!SP) x GL
[$LB8NonBreaks-$SP] $CM* $GLcm;
$CM+               $GLcm;

#         GL  x
#
$GL $CM* .;
                              #  TODO:  I don't think we need this rule.
                              #         All but $CM will chain off of preceding rule.
                              #         $GLcm will pick up the CM case by itself.




#
# LB 12a  Do not break before NBSP and related characters ...
#            [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;




# LB 13   Don't break before ']' or '!' or '/', even after spaces.
#
$LB8NonBreaks $CL;
$CAN_CM $CM*  $CL;
^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $CP;
$CAN_CM $CM*  $CP;
^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $EX;
$CAN_CM $CM*  $EX;
^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $IS;
$CAN_CM $CM*  $IS;
$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $SY;
$CAN_CM $CM*  $SY;
^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL


#
# LB 14  Do not break after OP, even after spaces
#        Note subtle interaction with "SP IS /" rules in LB14a.
#        This rule consumes the SP, chaining happens on the IS, effectively overriding the  SP IS rules,
#        which is the desired behavior.
#
$OP $CM* $SP* .;
$OPcm $SP* $CANT_CM;

$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
                                   # by rule 8, CM following a SP is stand-alone.


# LB 15a
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;

# LB 15b
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM*  [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+  [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];

# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM*  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM*  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+  [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;


# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
#        Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
#        See issue ICU-20303


$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$SP $IS           / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 15d Do not break before numeric separators (IS), even after spaces.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
$SP $IS $CM* $ZWJ [^$CM $NU];

$CAN_CM $CM*  $IS;
^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL


# LB 16
($CL | $CP) $CM* $SP* $NS;

# LB 17
$B2 $CM* $SP* $B2;

#
# LB 18  Break after spaces.


# LB 19
#         x QU
$LB18NonBreaks $CM* $QU;
^$CM+               $QU;

#         QU  x
$QU $CM* .;
$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
                              #  TODO:  I don't think this rule is needed.


# LB 20
#        <break>  $CB
#        $CB   <break>
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];

# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
#
^($HY | $HH) $CM* $ALPlus;

# LB 21        x   (BA | HY | NS)
#           BB x
#
$LB20NonBreaks $CM* ($BA | $HY | $NS);


^$CM+ ($BA | $HY | $NS);

$BB $CM* [^$CB];                                  #  $BB  x
$BB $CM* $LB20NonBreaks;

# LB 21a Don't break after Hebrew + Hyphen
#   HL (HY | BA) x
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;

# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;

# LB 22  Do not break before ellipses
#
$LB20NonBreaks $CM*    $IN;
^$CM+ $IN;


# LB 23
#
($ALPlus | $HL) $CM* $NU;
^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
$NU $CM* ($ALPlus | $HL);

# LB 23a
#
$PR $CM* ($ID | $EB | $EM);
($ID | $EB | $EM) $CM*  $PO;


#
# LB 24
#
($PR | $PO) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($PR | $PO);
^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
$POcm ($ALcm | $HLcm);

#
# LB 25   Numbers.
#
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;

# LB 26  Do not break a Korean syllable
#
$JL $CM* ($JL | $JV | $H2 | $H3);
($JV | $H2) $CM* ($JV | $JT);
($JT | $H3) $CM* $JT;

# LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);


# LB 28   Do not break between alphabetics
#
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL

#LB 28a  Do not break Orthographic syllables
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

#
# Rule 30   Do not break between letters, numbers or ordinary symbols
#           and opening or closing punctuation
#
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm;
$CLcm ($ALcm | $HLcm | $NUcm);

#
#  Reverse Rules.
#
## -------------------------------------------------

!!reverse;

$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $SY;
$CM+ $WJ;
$CM+;


#
#  Sequences of the form  (shown forwards)
#      [CANT_CM]  <break>  [CM]  [whatever]
#  The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
          [$BK $CR $LF $NL $ZW {eof}] |
          $SP+ $CM+ $SP |
          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
                                               #  LB14 says    OP SP* x .        
                                               #    becomes    OP SP* x AL
                                               #    becomes    OP SP* x CM+ AL_FOLLOW
                                               #
                                               # Further note:  the $AL in [$AL {eof}] is only to work around
                                               #                a rule compiler bug which complains about
                                               #                empty sets otherwise.
          
#
#  Sequences of the form  (shown forwards)
#      [CANT_CM]  <break> [CM]  <break>  [PR]
#  The CM needs to behave as an AL
#  This rule is concerned about getting the second of the two <breaks> in place.
#

[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];



# LB 4, 5, 5

$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;


# LB 7         x SP
#              x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;

# LB 8 Break after zero width space


# LB 9,10  Combining marks.
#    X   $CM needs to behave like X, where X is not $SP or controls.
#    $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;


# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ      [$LB8NonBreaks-$CM];

     $CANT_CM $CM* $WJ;
$CM* $CAN_CM  $CM* $WJ;

# LB 12
#         x GL
#
$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP];

#
#     GL  x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;


# LB 13
$CL $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;

$CL [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];

# Rule 13 & 14 taken together for an edge case.
#   Match this, shown forward
#     OP SP+  ($CM+ behaving as $AL) (CL | EX | IS | SY)
#   This really wants to chain at the $CM+ (which is acting as an $AL)
#   except for $CM chaining being disabled.
[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;  

# LB 14    OP SP* x
#
$CM* $CAN_CM    $SP* $CM* $OP;
     $CANT_CM   $SP* $CM* $OP;
$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
     
     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.



# LB 15
# $CM* $OP $SP* $CM* $QU;

# LB 16
$CM* $NS $SP* $CM* $CL;

# LB 17
$CM* $B2 $SP* $CM* $B2;

# LB 18  break after spaces
#        Nothing explicit needed here.


#
# LB 19
#
$CM* $QU $CM* $CAN_CM;                                #   . x QU
$CM* $QU      $LB18NonBreaks;


$CM* $CAN_CM  $CM* $QU;                               #   QU x .
     $CANT_CM $CM* $QU;
     
#
#  LB 20  Break before and after CB.
#         nothing needed here.
#

# LB 21
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)

$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
[^$CB] $CM* $BB;                                      # 

# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;

# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;

# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;

# LB 24
$CM* $ID $CM* $PR;
$CM* $PR $CM* $ALPlus;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;

$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP;
$CM* $NU+ $CM* $HY+ / $SP;

# LB 25
($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? ($CM* ($PR | $PO))?;

# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);

# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;

# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);

# LB 29
$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP];

# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);

# LB 30a  Do not break between regional indicators. Break after pairs of them.
#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.

# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
$EB $CM* $EM;
$ExtPictUnassigned $CM* $EM;

# LB 31 Break everywhere else.
#       Match a single code point if no other rule applies.
.;
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;

# LB 9
$SP+ $CM* $OP;

# LB 10
$SP+ $CM* $QU;

# LB 11
$SP+ $CM* $CL;
$SP+ $CM* $B2;

# LB 21
$CM* ($HY | $BA) $CM* $HL;

# LB 18
($CM* ($IS | $SY))+ $CM* $NU;
$CL $CM* ($NU | $IS | $SY);

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# Skip forward over all character classes that are involved in
#   rules containing patterns with possibly more than one char
#   of context.
#
#  It might be slightly more efficient to have specific rules
#  instead of one generic one, but only if we could
#  turn off rule chaining.  We don't want to move more
#  than necessary.
#
#  The rule consumes a run of the listed classes and then one terminating
#  character. The negated set names every class of the run except $SP, so the
#  terminator is either a $SP or a character outside all of the listed classes.
#
[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary];
# Two adjacent dictionary characters; the same pair rule appears earlier in
# these rules under "For dictionary-based break".
$dictionary $dictionary;





# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#      Includes post 5.0 changes to treat Japanese half width voicing marks
#        as Grapheme Extend.
#

!!quoted_literals_only;
$VoiceMarks   = [\uff9e\uff9f];
$Thai         = [:Script = Thai:];

#
# Character categories as defined in TR 29
#
$CR        = [\p{Sentence_Break = CR}];
$LF        = [\p{Sentence_Break = LF}];
$Extend    = [\p{Sentence_Break = Extend}];
$Sep       = [\p{Sentence_Break = Sep}];
$Format    = [\p{Sentence_Break = Format}];
$Sp        = [\p{Sentence_Break = Sp}];
$Lower     = [\p{Sentence_Break = Lower}];
$Upper     = [\p{Sentence_Break = Upper}];
$OLetter   = [\p{Sentence_Break = OLetter}];
$Numeric   = [\p{Sentence_Break = Numeric}];
$ATerm     = [\p{Sentence_Break = ATerm}];
$SContinue = [\p{Sentence_Break = SContinue}];
$STerm     = [\p{Sentence_Break = STerm}];
$Close     = [\p{Sentence_Break = Close}];

#
# Define extended forms of the character classes,
#   incorporate trailing Extend or Format chars.
#   Rules 4 and 5.


$CR         = \u000d;
$LF         = \u000a;
$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];

$SpEx       = $Sp      ($Extend | $Format)*;
$LowerEx    = $Lower   ($Extend | $Format)*;

$OLetterEx  = $OLetter ($Extend | $Format)*;
$NumericEx  = $Numeric ($Extend | $Format)*;
$ATermEx    = $ATerm   ($Extend | $Format)*;
$SContinueEx= $SContinue ($Extend | $Format)*;
$STermEx    = $STerm   ($Extend | $Format)*;
$CloseEx    = $Close   ($Extend | $Format)*;


## -------------------------------------------------

!!chain;
!!forward;

# Rule 3 - break after separators.  Keep CR/LF together.
#
$CR $LF;

# Thai handling: a run of letter-like characters containing a $Thai
# character, optionally followed by $ATerm / $Sp sequences.
# NOTE(review): $Thai is not assigned anywhere in this section; it must be
# defined earlier in the file - confirm.
$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;

# Rule 4 - Break after $Sep.
# Rule 5 - Ignore $Format and $Extend
#
[^$Sep $CR $LF]? ($Extend | $Format)*;


# Rule 6
$ATermEx $NumericEx;

# Rule 7
#   Requires $UpperEx to be assigned before this point.
($UpperEx | $LowerEx) $ATermEx $UpperEx;

#Rule 8
#   $NotLettersEx is assigned exactly once.  $CR and $LF are excluded
#   explicitly, in the same way the other rules in this section list them
#   next to $Sep.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*;
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;

# Rule 8a
($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx);

#Rule 9, 10, 11
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;

#Rule 998
#   Both rules start from a character that is none of
#   STerm/ATerm/Close/Sp/Sep/LF/CR/Format/Extend (or from {bof}), followed by
#   any run of Extend/Format/Close/Sp.  The first then accepts any single
#   character; the second accepts a line/paragraph end (or {eof}) and carries
#   the {100} tag.
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};

## -------------------------------------------------

# NOTE(review): newer ICU releases are documented to ignore the contents of a
# !!reverse section - confirm against the ICU version this file is built with.
!!reverse;

# Reverse-direction counterparts of the ...Ex classes: the optional run of
# Extend/Format characters is written before the base character instead of
# after it.
$SpEx_R       = ($Extend | $Format)* $Sp;
$ATermEx_R    = ($Extend | $Format)* $ATerm;
$STermEx_R    = ($Extend | $Format)* $STerm;
$CloseEx_R    = ($Extend | $Format)* $Close;

#
#  Reverse rules.
#     For now, use the old style inexact reverse rules, which are easier
#     to write, but less efficient.
#     TODO:  exact reverse rules.  It appears that exact reverse rules
#            may require improving support for look-ahead breaks in the
#            builder.  Needs more investigation.
#

# Single reverse rule; each of its parts is explained in the comment block
# that follows it.
[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
#.*;

# Explanation for this rule:
#
#    It needs to back over
#        The $Sep at which we probably begin
#        All of the non $Sep chars leading to the preceding $Sep
#        The preceding $Sep, which will be the second one that the rule matches.
#        Any immediately preceding STerm or ATerm sequences.  We need to see these
#              to get the correct rule status when moving forwards again.
#        
# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
#                   the entire string.
#
# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
#                   at the beginning of the string at this point, and we don't want to fail.
#                   Can only use {eof} once, and it is used later.
#


Lines 1800-1805 Link Here

(-)a/include/svx/strings.hrc (+1 lines)
1800	#define RID_SUBSETSTR_KAKTOVIK_NUMERALS NC_("RID_SUBSETMAP", "Kaktovik Numerals")	1800	#define RID_SUBSETSTR_KAKTOVIK_NUMERALS NC_("RID_SUBSETMAP", "Kaktovik Numerals")
1801	#define RID_SUBSETSTR_KAWI NC_("RID_SUBSETMAP", "Kawi")	1801	#define RID_SUBSETSTR_KAWI NC_("RID_SUBSETMAP", "Kawi")
1802	#define RID_SUBSETSTR_NAG_MUNDARI NC_("RID_SUBSETMAP", "Nag Mundari")	1802	#define RID_SUBSETSTR_NAG_MUNDARI NC_("RID_SUBSETMAP", "Nag Mundari")
		1803	#define RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I NC_("RID_SUBSETMAP", "CJK Unified Ideographs Extension I")
1803		1804
1804	#define RID_SVXSTR_FRAMEDIR_LTR NC_("RID_SVXSTR_FRAMEDIR_LTR", "Left-to-right (LTR)")	1805	#define RID_SVXSTR_FRAMEDIR_LTR NC_("RID_SVXSTR_FRAMEDIR_LTR", "Left-to-right (LTR)")
1805	#define RID_SVXSTR_FRAMEDIR_RTL NC_("RID_SVXSTR_FRAMEDIR_RTL", "Right-to-left (RTL)")	1806	#define RID_SVXSTR_FRAMEDIR_RTL NC_("RID_SVXSTR_FRAMEDIR_RTL", "Right-to-left (RTL)")

Lines 1924-1929 Link Here

(-)a/svx/source/dialog/charmap.cxx (+5 lines)
1924	aAllSubsets.emplace_back( 0x1E4D0, 0x1E4FF, SvxResId(RID_SUBSETSTR_NAG_MUNDARI) );	1924	aAllSubsets.emplace_back( 0x1E4D0, 0x1E4FF, SvxResId(RID_SUBSETSTR_NAG_MUNDARI) );
1925	break;	1925	break;
1926	#endif	1926	#endif
		1927	#if (U_ICU_VERSION_MAJOR_NUM >= 74)
		1928	case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I:
		1929	aAllSubsets.emplace_back( 0x2EBF0, 0x2EE5F, SvxResId(RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I) );
		1930	break;
		1931	#endif
1927	}	1932	}
1928		1933
1929	#if OSL_DEBUG_LEVEL > 0 && !defined NDEBUG	1934	#if OSL_DEBUG_LEVEL > 0 && !defined NDEBUG

Return to bug 917618

Lines 1-43 Link Here

(-)a/i18npool/source/breakiterator/data/sent.txt (-66 / +21 lines)
		1	# Copyright (C) 2016 and later: Unicode, Inc. and others.
		2	# License & terms of use: http://www.unicode.org/copyright.html
1	#	3	#
2	# Copyright (C) 2002-2006, International Business Machines Corporation and others.	4	# Copyright (C) 2002-2015, International Business Machines Corporation and others.
3	# All Rights Reserved.	5	# All Rights Reserved.
4	#	6	#
5	# file: sent.txt	7	# file: sent.txt
6	#	8	#
7	# ICU Sentence Break Rules	9	# ICU Sentence Break Rules
8	# See Unicode Standard Annex #29.	10	# See Unicode Standard Annex #29.
9	# These rules are based on SA 29 version 5.0.0	11	# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
10	# Includes post 5.0 changes to treat Japanese half width voicing marks
11	# as Grapheme Extend.
12	#	12	#
13		13
14		14	!!quoted_literals_only;
15	$VoiceMarks = [\uff9e\uff9f];
16	$Thai = [:Script = Thai:];
17		15
18	#	16	#
19	# Character categories as defined in TR 29	17	# Character categories as defined in TR 29
20	#	18	#
		19	$CR = [\p{Sentence_Break = CR}];
		20	$LF = [\p{Sentence_Break = LF}];
		21	$Extend = [\p{Sentence_Break = Extend}];
21	$Sep = [\p{Sentence_Break = Sep}];	22	$Sep = [\p{Sentence_Break = Sep}];
22	$Format = [\p{Sentence_Break = Format}];	23	$Format = [\p{Sentence_Break = Format}];
23	$Sp = [\p{Sentence_Break = Sp}];	24	$Sp = [\p{Sentence_Break = Sp}];
24	$Lower = [\p{Sentence_Break = Lower}];	25	$Lower = [\p{Sentence_Break = Lower}];
25	$Upper = [\p{Sentence_Break = Upper}];	26	$Upper = [\p{Sentence_Break = Upper}];
26	$OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks];	27	$OLetter = [\p{Sentence_Break = OLetter}];
27	$Numeric = [\p{Sentence_Break = Numeric}];	28	$Numeric = [\p{Sentence_Break = Numeric}];
28	$ATerm = [\p{Sentence_Break = ATerm}];	29	$ATerm = [\p{Sentence_Break = ATerm}];
		30	$SContinue = [\p{Sentence_Break = SContinue}];
29	$STerm = [\p{Sentence_Break = STerm}];	31	$STerm = [\p{Sentence_Break = STerm}];
30	$Close = [\p{Sentence_Break = Close}];	32	$Close = [\p{Sentence_Break = Close}];
31		33
32	#	34	#
33	# Define extended forms of the character classes,	35	# Define extended forms of the character classes,
34	# incorporate grapheme cluster + format chars.	36	# incorporate trailing Extend or Format chars.
35	# Rules 4 and 5.	37	# Rules 4 and 5.
36
37
38	$CR = \u000d;
39	$LF = \u000a;
40	$Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
41		38
42	$SpEx = $Sp ($Extend \| $Format)*;	39	$SpEx = $Sp ($Extend \| $Format)*;
43	$LowerEx = $Lower ($Extend \| $Format)*;	40	$LowerEx = $Lower ($Extend \| $Format)*;
Lines 45-50 Link Here
45	$OLetterEx = $OLetter ($Extend \| $Format)*;	42	$OLetterEx = $OLetter ($Extend \| $Format)*;
46	$NumericEx = $Numeric ($Extend \| $Format)*;	43	$NumericEx = $Numeric ($Extend \| $Format)*;
47	$ATermEx = $ATerm ($Extend \| $Format)*;	44	$ATermEx = $ATerm ($Extend \| $Format)*;
		45	$SContinueEx= $SContinue ($Extend \| $Format)*;
48	$STermEx = $STerm ($Extend \| $Format)*;	46	$STermEx = $STerm ($Extend \| $Format)*;
49	$CloseEx = $Close ($Extend \| $Format)*;	47	$CloseEx = $Close ($Extend \| $Format)*;
50		48
Lines 52-128 Link Here
52	## -------------------------------------------------	50	## -------------------------------------------------
53		51
54	!!chain;	52	!!chain;
55	!!forward;
56		53
57	# Rule 3 - break after separators. Keep CR/LF together.	54	# Rule 3 - break after separators. Keep CR/LF together.
58	#	55	#
59	$CR $LF;	56	$CR $LF;
60		57
61	$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend \| $Format)*;
62	$LettersEx* $Thai $LettersEx* ($ATermEx \| $SpEx)*;
63		58
64	# Rule 4 - Break after $Sep.	59	# Rule 4 - Break after $Sep.
65	# Rule 5 - Ignore $Format and $Extend	60	# Rule 5 - Ignore $Format and $Extend
66	#	61	#
67	[^$Sep]? ($Extend \| $Format)*;	62	[^$Sep $CR $LF]? ($Extend \| $Format)*;
68		63
69		64
70	# Rule 6	65	# Rule 6
71	$ATermEx $NumericEx;	66	$ATermEx $NumericEx;
72		67
73	# Rule 7	68	# Rule 7
74	$UpperEx $ATermEx $UpperEx;	69	($UpperEx \| $LowerEx) $ATermEx $UpperEx;
75		70
76	#Rule 8	71	#Rule 8
77	# Note: follows errata for Unicode 5.0 boundary rules.	72	$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend \| $Format)*;
78	$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend \| $Format)*;
79	$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;	73	$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
80		74
81	# Rule 8a	75	# Rule 8a
82	($STermEx \| $ATermEx) $CloseEx* $SpEx* ($STermEx \| $ATermEx);	76	($STermEx \| $ATermEx) $CloseEx* $SpEx* ($SContinueEx \| $STermEx \| $ATermEx);
83		77
84	#Rule 9, 10, 11	78	#Rule 9, 10, 11
85	($STermEx \| $ATermEx) $CloseEx* $SpEx* $Sep?;	79	($STermEx \| $ATermEx) $CloseEx* $SpEx* ($Sep \| $CR \| $LF)?;
86		80
87	#Rule 12	81	#Rule 998
88	[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend \| $Format \| $Close \| $Sp)* [^$Thai];	82	[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend \| $Format \| $Close \| $Sp)* .;
89	[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend \| $Format \| $Close \| $Sp)* ([$Sep{eof}] \| $CR $LF){100};	83	[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend \| $Format \| $Close \| $Sp)* ([$Sep $LF $CR {eof}] \| $CR $LF){100};
90
91	## -------------------------------------------------
92
93	!!reverse;
94
95	$SpEx_R = ($Extend \| $Format)* $Sp;
96	$ATermEx_R = ($Extend \| $Format)* $ATerm;
97	$STermEx_R = ($Extend \| $Format)* $STerm;
98	$CloseEx_R = ($Extend \| $Format)* $Close;
99
100	#
101	# Reverse rules.
102	# For now, use the old style inexact reverse rules, which are easier
103	# to write, but less efficient.
104	# TODO: exact reverse rules. It appears that exact reverse rules
105	# may require improving support for look-ahead breaks in the
106	# builder. Needs more investigation.
107	#
108
109	[{bof}] (.? \| $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R \| $ATermEx_R))*;
110	#.*;
111
112	# Explanation for this rule:
113	#
114	# It needs to back over
115	# The $Sep at which we probably begin
116	# All of the non $Sep chars leading to the preceding $Sep
117	# The preceding $Sep, which will be the second one that the rule matches.
118	# Any immediately preceding STerm or ATerm sequences. We need to see these
119	# to get the correct rule status when moving forwards again.
120	#
121	# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
122	# the entire string.
123	#
124	# (.? \| $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be
125	# at the beginning of the string at this point, and we don't want to fail.
126	# Can only use {eof} once, and it is used later.
127	#
128