Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 620014
Collapse All | Expand All

(-)common/utext.cpp.orig (-5 / +23 lines)
Lines 848-856 Link Here
848
848
849
// Chunk size.
849
// Chunk size.
850
//     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
850
//     Must be less than 42  (256/6), because of byte mapping from UChar indexes to native indexes.
851
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
851
//     Worst case there are six UTF-8 bytes per UChar.
852
//     to two UChars.)
852
//         obsolete 6 byte form fd + 5 trails maps to fffd
853
//         obsolete 5 byte form fc + 4 trails maps to fffd
854
//         non-shortest 4 byte forms maps to fffd
855
//         normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
856
//     mapToUChars array size must allow for the worst case, 6.
857
//     This could be brought down to 4, by treating fd and fc as pure illegal,
858
//     rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
853
//
859
//
854
enum { UTF8_TEXT_CHUNK_SIZE=32 };
860
enum { UTF8_TEXT_CHUNK_SIZE=32 };
861
enum { SIZEOF_MAPTOUCHARS=UTF8_TEXT_CHUNK_SIZE*6+6 };
855
862
856
//
863
//
Lines 890-894 Link Here
890
                                                     //    one for a supplementary starting in the last normal position,
897
                                                     //    one for a supplementary starting in the last normal position,
891
                                                     //    and one for an entry for the buffer limit position.
898
                                                     //    and one for an entry for the buffer limit position.
892
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
899
    uint8_t   mapToUChars[SIZEOF_MAPTOUCHARS];       // Map native offset from bufNativeStart to
893
                                                     //   corresponding offset in filled part of buf.
900
                                                     //   corresponding offset in filled part of buf.
894
    int32_t   align;
901
    int32_t   align;
Lines 1033-1036 Link Here
1033
            u8b = (UTF8Buf *)ut->p;   // the current buffer
1040
            u8b = (UTF8Buf *)ut->p;   // the current buffer
1034
            mapIndex = ix - u8b->toUCharsMapStart;
1041
            mapIndex = ix - u8b->toUCharsMapStart;
1042
            U_ASSERT(mapIndex < (int32_t)SIZEOF_MAPTOUCHARS);
1035
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1043
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1036
            return TRUE;
1044
            return TRUE;
Lines 1299-1302 Link Here
1299
        //   If index is at the end, there is no character there to look at.
1307
        //   If index is at the end, there is no character there to look at.
1300
        if (ix != ut->b) {
1308
        if (ix != ut->b) {
1309
            // Note: this function will only move the index back if it is on a trail byte
1310
            //       and there is a preceding lead byte and the sequence from the lead 
1311
            //       through this trail could be part of a valid UTF-8 sequence
1312
            //       Otherwise the index remains unchanged.
1301
            U8_SET_CP_START(s8, 0, ix);
1313
            U8_SET_CP_START(s8, 0, ix);
1302
        }
1314
        }
Lines 1312-1316 Link Here
1312
        uint8_t *mapToNative = u8b->mapToNative;
1324
        uint8_t *mapToNative = u8b->mapToNative;
1313
        uint8_t *mapToUChars = u8b->mapToUChars;
1325
        uint8_t *mapToUChars = u8b->mapToUChars;
1314
        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
1326
        int32_t  toUCharsMapStart = ix - SIZEOF_MAPTOUCHARS + 1;
1327
        // Note that toUCharsMapStart can be negative. Happens when the remaining
1328
        // text from current position to the beginning is less than the buffer size.
1329
        // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
1315
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
1330
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
1316
                                                    //   at end of buffer to leave room
1331
                                                    //   at end of buffer to leave room
Lines 1339-1342 Link Here
1339
                // Special case ASCII range for speed.
1354
                // Special case ASCII range for speed.
1340
                buf[destIx] = (UChar)c;
1355
                buf[destIx] = (UChar)c;
1356
                U_ASSERT(toUCharsMapStart <= srcIx);
1341
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1357
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1342
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1358
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
Lines 1368-1371 Link Here
1368
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
1384
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
1369
                } while (sIx >= srcIx);
1385
                } while (sIx >= srcIx);
1386
                U_ASSERT(toUCharsMapStart <= (srcIx+1));
1370
1387
1371
                // Set native indexing limit to be the current position.
1388
                // Set native indexing limit to be the current position.
Lines 1542-1545 Link Here
1542
    U_ASSERT(index<=ut->chunkNativeLimit);
1559
    U_ASSERT(index<=ut->chunkNativeLimit);
1543
    int32_t mapIndex = index - u8b->toUCharsMapStart;
1560
    int32_t mapIndex = index - u8b->toUCharsMapStart;
1561
    U_ASSERT(mapIndex < (int32_t)SIZEOF_MAPTOUCHARS);
1544
    int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1562
    int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1545
    U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1563
    U_ASSERT(offset>=0 && offset<=ut->chunkLength);
(-)test/intltest/utxttest.cpp (+62 lines)
Lines 68-71 Link Here
68
        case 7: name = "Ticket12130";
68
        case 7: name = "Ticket12130";
69
            if (exec) Ticket12130(); break;
69
            if (exec) Ticket12130(); break;
70
        case 8: name = "Ticket12888";
71
            if (exec) Ticket12888(); break;
70
        default: name = "";          break;
72
        default: name = "";          break;
71
    }
73
    }
Lines 1584-1585 Link Here
1584
    utext_close(&ut);
1586
    utext_close(&ut);
1585
}
1587
}
1588
1589
// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
1590
//               six byte utf-8 forms. Original implementation had an assumption that
1591
//               there would be at most three utf-8 bytes per UTF-16 code unit.
1592
//               The five and six byte sequences map to a single replacement character.
1593
1594
void UTextTest::Ticket12888() {
1595
    const char *badString = 
1596
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1597
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1598
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1599
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1600
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1601
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1602
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1603
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1604
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1605
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1606
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1607
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1608
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1609
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1610
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1611
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1612
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1613
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1614
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1615
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
1616
1617
    UErrorCode status = U_ZERO_ERROR;
1618
    LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
1619
    TEST_SUCCESS(status);
1620
    for (;;) {
1621
        UChar32 c = utext_next32(ut.getAlias());
1622
        if (c == U_SENTINEL) {
1623
            break;
1624
        }
1625
    }
1626
    int32_t endIdx = utext_getNativeIndex(ut.getAlias());
1627
    if (endIdx != (int32_t)strlen(badString)) {
1628
        errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
1629
        return;
1630
    }
1631
1632
    for (int32_t prevIndex = endIdx; prevIndex>0;) {
1633
        UChar32 c = utext_previous32(ut.getAlias());
1634
        int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
1635
        if (c != 0xfffd) {
1636
            errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
1637
                    __FILE__, __LINE__, 0xfffd, c, currentIndex);
1638
            break;
1639
        }
1640
        if (currentIndex != prevIndex - 6) {
1641
            errln("%s:%d: wrong index. Expected, actual = %d, %d",
1642
                    __FILE__, __LINE__, prevIndex - 6, currentIndex);
1643
            break;
1644
        }
1645
        prevIndex = currentIndex;
1646
    }
1647
}
(-)test/intltest/utxttest.h (+1 lines)
Lines 39-42 Link Here
39
    void Ticket10983();
39
    void Ticket10983();
40
    void Ticket12130();
40
    void Ticket12130();
41
    void Ticket12888();
41
42
42
private:
43
private:

Return to bug 620014