--- grep-2.5.1/src/search.c 2004-12-21 13:37:15.700555594 +0000 +++ grep-2.5.1/src/search.c 2004-12-21 13:49:05.873811016 +0000 @@ -21,6 +21,7 @@ #ifdef HAVE_CONFIG_H # include #endif +#include #include #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC /* We can handle multibyte string. */ @@ -39,6 +40,9 @@ #ifdef HAVE_LIBPCRE # include #endif +#ifdef HAVE_LANGINFO_CODESET +# include +#endif #define NCHAR (UCHAR_MAX + 1) @@ -70,9 +74,10 @@ call the regexp matcher at all. */ static int kwset_exact_matches; -#if defined(MBS_SUPPORT) -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); -#endif +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +static int using_utf8; + static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); @@ -84,6 +89,15 @@ static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); void +check_utf8 (void) +{ +#ifdef HAVE_LANGINFO_CODESET + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) + using_utf8 = 1; +#endif +} + +void dfaerror (char const *mesg) { error (2, 0, mesg); @@ -141,47 +155,6 @@ } } -#ifdef MBS_SUPPORT -/* This function allocate the array which correspond to "buf". - Then this check multibyte string and mark on the positions which - are not singlebyte character nor the first byte of a multibyte - character. Caller must free the array. */ -static char* -check_multibyte_string(char const *buf, size_t size) -{ - char *mb_properties = xmalloc(size); - mbstate_t cur_state; - wchar_t wc; - int i; - memset(&cur_state, 0, sizeof(mbstate_t)); - memset(mb_properties, 0, sizeof(char)*size); - for (i = 0; i < size ;) - { - size_t mbclen; - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); - - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) - { - /* An invalid sequence, or a truncated multibyte character. - We treat it as a singlebyte character. */ - mbclen = 1; - } - else if (match_icase) - { - if (iswupper((wint_t)wc)) - { - wc = towlower((wint_t)wc); - wcrtomb(buf + i, wc, &cur_state); - } - } - mb_properties[i] = mbclen; - i += mbclen; - } - - return mb_properties; -} -#endif - static void Gcompile (char const *pattern, size_t size) { @@ -190,6 +163,7 @@ size_t total = size; char const *motif = pattern; + check_utf8 (); re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); @@ -266,6 +240,7 @@ size_t total = size; char const *motif = pattern; + check_utf8 (); if (strcmp (matcher, "awk") == 0) { re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); @@ -350,18 +325,8 @@ struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT - char *mb_properties = NULL; - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = xmalloc(size); - memcpy(case_buf, buf, size); - buf = case_buf; - } - if (kwset) - mb_properties = check_multibyte_string(buf, size); - } + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); #endif /* MBS_SUPPORT */ buflim = buf + size; @@ -373,21 +338,63 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ + size_t offset; +#ifdef MBS_SUPPORT + /* kwsexec doesn't work with match_icase and multibyte input. */ + if (match_icase && MB_CUR_MAX > 1) + /* Avoid kwset */ + offset = 0; + else +#endif /* MBS_SUPPORT */ + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); if (offset == (size_t) -1) goto failure; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += len; + bytes_left -= len; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; /* Narrow down to the line containing the candidate, and run it through DFA. */ end = memchr(beg, eol, buflim - beg); end++; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) + if (MB_CUR_MAX > 1 && bytes_left) continue; -#endif +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; - if (kwsm.index < kwset_exact_matches) + if ( +#ifdef MBS_SUPPORT + !(match_icase && MB_CUR_MAX > 1) && +#endif /* MBS_SUPPORT */ + (kwsm.index < kwset_exact_matches)) goto success_in_beg_and_end; if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) continue; @@ -395,13 +402,47 @@ else { /* No good fixed strings; start with DFA. */ +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); if (offset == (size_t) -1) break; /* Narrow down to the line we've found. */ +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += len; + bytes_left -= len; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; end = memchr (beg, eol, buflim - beg); end++; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && bytes_left) + continue; +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; } @@ -469,15 +510,6 @@ } /* for (beg = end ..) */ failure: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (mb_properties) - free (mb_properties); - if (match_icase) - free ((char *) buf); - } -#endif /* MBS_SUPPORT */ return (size_t) -1; success_in_beg_and_end: @@ -486,24 +518,125 @@ /* FALLTHROUGH */ success_in_start_and_len: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (mb_properties) - free (mb_properties); - if (match_icase) - free ((char *) buf); - } -#endif /* MBS_SUPPORT */ *match_size = len; return start; } +static wchar_t **f_pattern; +static char *f_initial_byte; +static size_t f_pattern_count; +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ + static void Fcompile (char const *pattern, size_t size) { char const *beg, *lim, *err; + check_utf8 (); +#ifdef MBS_SUPPORT + /* Support -F -i for UTF-8 input. */ + if (match_icase && MB_CUR_MAX > 1) + { + size_t in = 0; + + while (f_i_multibyte != -1 && in < size) + { + wchar_t *f_this_pattern; + size_t f_this_pattern_allocated = sizeof (wchar_t) * 1000; + mbstate_t mbs; + size_t out = 0; + f_pattern_count++; + f_pattern = xrealloc (f_pattern, + sizeof (wchar_t *) * f_pattern_count); + f_initial_byte = xrealloc (f_initial_byte, + sizeof (char) * + (2 * f_pattern_count + 1)); + if (f_pattern_count == 1) + f_initial_byte[0] = '\0'; + + /* Convert pattern into wchar_t*, storing them in this_pattern. + Don't read more than we're given. */ + f_this_pattern = xmalloc (f_this_pattern_allocated); + memset (&mbs, '\0', sizeof (mbs)); + while (in < size) + { + size_t c; + wchar_t this_wc; + if (out == f_this_pattern_allocated) + { + f_this_pattern_allocated *= 2; + f_this_pattern = xrealloc (f_this_pattern, + f_this_pattern_allocated); + } + + c = mbrtowc (&this_wc, pattern + in, size - in, &mbs); + if (c < 1) + { + /* Fall back to old method. */ + f_i_multibyte = -1; + while (f_pattern_count--) + free (f_pattern[f_pattern_count]); + free (f_pattern); + f_pattern = NULL; + break; + } + + f_this_pattern[out] = towlower (this_wc); + if (out == 0) + { + /* First character. Work out the first byte of upper and + lower case multibyte strings for the first character. */ + wchar_t wc; + char mbs[MB_CUR_MAX]; + mbstate_t ps; + + if (iswupper (this_wc)) + { + wc = towlower (this_wc); + } + else + { + wc = towupper (this_wc); + } + + memset (&ps, '\0', sizeof (ps)); + wcrtomb (mbs, this_wc, &ps); + mbs[1] = '\0'; + strcat (f_initial_byte, mbs); + + memset (&ps, '\0', sizeof (ps)); + wcrtomb (mbs, wc, &ps); + mbs[1] = '\0'; + strcat (f_initial_byte, mbs); + } + + in += c; + + if (this_wc == L'\n') + break; + + out++; + } + + if (f_i_multibyte == -1) + break; + + /* Nul-terminate it. */ + if (out == f_this_pattern_allocated) + { + f_this_pattern_allocated++; + f_this_pattern = xrealloc (f_this_pattern, + f_this_pattern_allocated); + } + + f_this_pattern[out] = L'\0'; + f_pattern[f_pattern_count - 1] = f_this_pattern; + f_i_multibyte = 1; + } + } +#endif /* MBS_SUPPORT */ + + kwsinit (); beg = pattern; do @@ -523,6 +656,87 @@ } static size_t +Fimbexec (const char *buf, size_t size, size_t *plen) +{ + char const *beg; + size_t len; + mbstate_t mbs; + + assert (match_icase && f_i_multibyte == 1); + assert (MB_CUR_MAX > 1); + + memset (&mbs, '\0', sizeof (mbs)); + beg = buf; + len = 0; + while (beg < buf + size) + { + wchar_t wc; + char const *p; + char const *next_char; + unsigned char match[f_pattern_count]; + size_t i, letter; + int patterns_left; + + for (p = beg; + (p < buf + size) && !strchr (f_initial_byte, *p); + p++) + ; + + if (p == NULL || p == buf + size) + break; + + /* First byte matches, now check the rest */ + beg = p; + letter = len = 0; + memset (match, '\1', f_pattern_count); + patterns_left = 1; + while (patterns_left) + { + size_t c; + + patterns_left = 0; + + c = mbrtowc (&wc, beg + len, size - (beg - buf) - len, &mbs); + if (c < 1) + { + memset (&mbs, '\0', sizeof (mbs)); + next_char = beg + 1; + break; + } + + if (!len) + next_char = beg + c; + + wc = towlower (wc); + for (i = 0; i < f_pattern_count; i++) + { + if (match[i]) + { + if (f_pattern[i][letter] == L'\0') + { + /* Found a match. */ + *plen = len; + return beg - buf; + } + + if (f_pattern[i][letter] == wc) + patterns_left = 1; + else + match[i] = '\0'; + } + } + + len += c; + letter++; + } + + beg = next_char; + } + + return -1; +} + +static size_t Fexecute (char const *buf, size_t size, size_t *match_size, int exact) { register char const *beg, *try, *end; @@ -531,27 +745,50 @@ struct kwsmatch kwsmatch; size_t ret_val; #ifdef MBS_SUPPORT - char *mb_properties = NULL; - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = xmalloc(size); - memcpy(case_buf, buf, size); - buf = case_buf; - } - mb_properties = check_multibyte_string(buf, size); - } + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) { - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + size_t offset; +#ifdef MBS_SUPPORT + if (match_icase && f_i_multibyte == 1) + offset = Fimbexec (beg, buf + size - beg, &kwsmatch.size[0]); + else +#endif /* MBS_SUPPORT */ + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) goto failure; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) - continue; /* It is a part of multibyte character. */ + if (MB_CUR_MAX > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: no good. */ + break; + + beg += len; + bytes_left -= len; + } + + if (bytes_left) + continue; + } + else #endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; @@ -583,10 +820,46 @@ { /* Try a shorter length anchored at the same place. */ --len; +#ifdef MBS_SUPPORT + if (match_icase && f_i_multibyte == 1) + offset = Fimbexec (beg, len, &kwsmatch.size[0]); + else +#endif /* MBS_SUPPORT */ offset = kwsexec (kwset, beg, len, &kwsmatch); + if (offset == -1) { break; /* Try a different anchor. */ } +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += len; + bytes_left -= len; + } + + if (bytes_left) + break; /* Try a different anchor. */ + } + else +#endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; } @@ -597,19 +870,31 @@ } failure: + return -1; + + success: #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) + if (MB_CUR_MAX > 1 && !using_utf8) { - if (match_icase) - free((char *) buf); - if (mb_properties) - free(mb_properties); + end = beg + len; + while (end < buf + size) + { + size_t len = mbrlen (end, buf + size - end, &mbs); + if (len == (size_t) -1 || len == (size_t) -2 || len == 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + len = 1; + } + if (len == 1 && *end == eol) + break; + + end += len; + } } + else #endif /* MBS_SUPPORT */ - return -1; - - success: end = memchr (beg + len, eol, (buf + size) - (beg + len)); + end++; while (buf < beg && beg[-1] != eol) --beg; @@ -618,15 +903,6 @@ success_in_beg_and_len: *match_size = len; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (mb_properties) - free (mb_properties); - if (match_icase) - free ((char *) buf); - } -#endif /* MBS_SUPPORT */ return beg - buf; }