Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 83645 Details for
Bug 93443
grep 79x slowdown with LANG=en_us.utf8 LC_ALL=en_us.utf8
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
Take 2, Patch 1: Special-case UTF-8, massive speed-up (60x on my testcase)
grep-2.5.1-egf-speedup.patch (text/plain), 15.30 KB, created by
Alexey Spiridonov
on 2006-04-01 11:57:16 UTC
(
hide
)
Description:
Take 2, Patch 1: Special-case UTF-8, massive speed-up (60x on my testcase)
Filename:
MIME Type:
Creator:
Alexey Spiridonov
Created:
2006-04-01 11:57:16 UTC
Size:
15.30 KB
patch
obsolete
>--- grep-2.5.1/src/search.c 2004-12-21 13:37:15.700555594 +0000 >+++ grep-2.5.1/src/search.c 2004-12-21 13:49:05.873811016 +0000 >@@ -21,6 +21,7 @@ > #ifdef HAVE_CONFIG_H > # include <config.h> > #endif >+#include <assert.h> > #include <sys/types.h> > #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC > /* We can handle multibyte string. */ >@@ -39,6 +40,9 @@ > #ifdef HAVE_LIBPCRE > # include <pcre.h> > #endif >+#ifdef HAVE_LANGINFO_CODESET >+# include <langinfo.h> >+#endif > > #define NCHAR (UCHAR_MAX + 1) > >@@ -70,9 +74,10 @@ > call the regexp matcher at all. */ > static int kwset_exact_matches; > >-#if defined(MBS_SUPPORT) >-static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); >-#endif >+/* UTF-8 encoding allows some optimizations that we can't otherwise >+ assume in a multibyte encoding. */ >+static int using_utf8; >+ > static void kwsinit PARAMS ((void)); > static void kwsmusts PARAMS ((void)); > static void Gcompile PARAMS ((char const *, size_t)); >@@ -84,6 +89,15 @@ > static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); > > void >+check_utf8 (void) >+{ >+#ifdef HAVE_LANGINFO_CODESET >+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) >+ using_utf8 = 1; >+#endif >+} >+ >+void > dfaerror (char const *mesg) > { > error (2, 0, mesg); >@@ -141,47 +155,6 @@ > } > } > >-#ifdef MBS_SUPPORT >-/* This function allocate the array which correspond to "buf". >- Then this check multibyte string and mark on the positions which >- are not singlebyte character nor the first byte of a multibyte >- character. Caller must free the array. */ >-static char* >-check_multibyte_string(char const *buf, size_t size) >-{ >- char *mb_properties = xmalloc(size); >- mbstate_t cur_state; >- wchar_t wc; >- int i; >- memset(&cur_state, 0, sizeof(mbstate_t)); >- memset(mb_properties, 0, sizeof(char)*size); >- for (i = 0; i < size ;) >- { >- size_t mbclen; >- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); >- >- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) >- { >- /* An invalid sequence, or a truncated multibyte character. >- We treat it as a singlebyte character. */ >- mbclen = 1; >- } >- else if (match_icase) >- { >- if (iswupper((wint_t)wc)) >- { >- wc = towlower((wint_t)wc); >- wcrtomb(buf + i, wc, &cur_state); >- } >- } >- mb_properties[i] = mbclen; >- i += mbclen; >- } >- >- return mb_properties; >-} >-#endif >- > static void > Gcompile (char const *pattern, size_t size) > { >@@ -190,6 +163,7 @@ > size_t total = size; > char const *motif = pattern; > >+ check_utf8 (); > re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); > dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); > >@@ -266,6 +240,7 @@ > size_t total = size; > char const *motif = pattern; > >+ check_utf8 (); > if (strcmp (matcher, "awk") == 0) > { > re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); >@@ -350,18 +325,8 @@ > struct kwsmatch kwsm; > size_t i, ret_val; > #ifdef MBS_SUPPORT >- char *mb_properties = NULL; >- if (MB_CUR_MAX > 1) >- { >- if (match_icase) >- { >- char *case_buf = xmalloc(size); >- memcpy(case_buf, buf, size); >- buf = case_buf; >- } >- if (kwset) >- mb_properties = check_multibyte_string(buf, size); >- } >+ mbstate_t mbs; >+ memset (&mbs, '\0', sizeof (mbstate_t)); > #endif /* MBS_SUPPORT */ > > buflim = buf + size; >@@ -373,21 +338,63 @@ > if (kwset) > { > /* Find a possible match using the KWset matcher. */ >- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); >+#ifdef MBS_SUPPORT >+ size_t bytes_left = 0; >+#endif /* MBS_SUPPORT */ >+ size_t offset; >+#ifdef MBS_SUPPORT >+ /* kwsexec doesn't work with match_icase and multibyte input. */ >+ if (match_icase && MB_CUR_MAX > 1) >+ /* Avoid kwset */ >+ offset = 0; >+ else >+#endif /* MBS_SUPPORT */ >+ offset = kwsexec (kwset, beg, buflim - beg, &kwsm); > if (offset == (size_t) -1) > goto failure; >+#ifdef MBS_SUPPORT >+ if (MB_CUR_MAX > 1 && !using_utf8) >+ { >+ bytes_left = offset; >+ while (bytes_left) >+ { >+ size_t len = mbrlen (beg, bytes_left, &mbs); >+ if (len == (size_t) -1 || len == 0) >+ { >+ /* Incomplete character: treat as single-byte. */ >+ memset (&mbs, '\0', sizeof (mbstate_t)); >+ beg++; >+ bytes_left--; >+ continue; >+ } >+ >+ if (len == (size_t) -2) >+ /* Offset points inside multibyte character: >+ * no good. */ >+ break; >+ >+ beg += len; >+ bytes_left -= len; >+ } >+ } >+ else >+#endif /* MBS_SUPPORT */ > beg += offset; > /* Narrow down to the line containing the candidate, and > run it through DFA. */ > end = memchr(beg, eol, buflim - beg); > end++; > #ifdef MBS_SUPPORT >- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) >+ if (MB_CUR_MAX > 1 && bytes_left) > continue; >-#endif >+#endif /* MBS_SUPPORT */ > while (beg > buf && beg[-1] != eol) > --beg; >- if (kwsm.index < kwset_exact_matches) >+ if ( >+#ifdef MBS_SUPPORT >+ !(match_icase && MB_CUR_MAX > 1) && >+#endif /* MBS_SUPPORT */ >+ (kwsm.index < kwset_exact_matches)) > goto success_in_beg_and_end; > if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) > continue; >@@ -395,13 +402,47 @@ > else > { > /* No good fixed strings; start with DFA. */ >+#ifdef MBS_SUPPORT >+ size_t bytes_left = 0; >+#endif /* MBS_SUPPORT */ > size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); > if (offset == (size_t) -1) > break; > /* Narrow down to the line we've found. */ >+#ifdef MBS_SUPPORT >+ if (MB_CUR_MAX > 1 && !using_utf8) >+ { >+ bytes_left = offset; >+ while (bytes_left) >+ { >+ size_t len = mbrlen (beg, bytes_left, &mbs); >+ if (len == (size_t) -1 || len == 0) >+ { >+ /* Incomplete character: treat as single-byte. */ >+ memset (&mbs, '\0', sizeof (mbstate_t)); >+ beg++; >+ bytes_left--; >+ continue; >+ } >+ >+ if (len == (size_t) -2) >+ /* Offset points inside multibyte character: >+ * no good. */ >+ break; >+ >+ beg += len; >+ bytes_left -= len; >+ } >+ } >+ else >+#endif /* MBS_SUPPORT */ > beg += offset; > end = memchr (beg, eol, buflim - beg); > end++; >+#ifdef MBS_SUPPORT >+ if (MB_CUR_MAX > 1 && bytes_left) >+ continue; >+#endif /* MBS_SUPPORT */ > while (beg > buf && beg[-1] != eol) > --beg; > } >@@ -469,15 +510,6 @@ > } /* for (beg = end ..) */ > > failure: >-#ifdef MBS_SUPPORT >- if (MB_CUR_MAX > 1) >- { >- if (mb_properties) >- free (mb_properties); >- if (match_icase) >- free ((char *) buf); >- } >-#endif /* MBS_SUPPORT */ > return (size_t) -1; > > success_in_beg_and_end: >@@ -486,24 +518,125 @@ > /* FALLTHROUGH */ > > success_in_start_and_len: >-#ifdef MBS_SUPPORT >- if (MB_CUR_MAX > 1) >- { >- if (mb_properties) >- free (mb_properties); >- if (match_icase) >- free ((char *) buf); >- } >-#endif /* MBS_SUPPORT */ > *match_size = len; > return start; > } > >+static wchar_t **f_pattern; >+static char *f_initial_byte; >+static size_t f_pattern_count; >+static int f_i_multibyte; /* whether we're using the new -Fi MB method */ >+ > static void > Fcompile (char const *pattern, size_t size) > { > char const *beg, *lim, *err; > >+ check_utf8 (); >+#ifdef MBS_SUPPORT >+ /* Support -F -i for UTF-8 input. */ >+ if (match_icase && MB_CUR_MAX > 1) >+ { >+ size_t in = 0; >+ >+ while (f_i_multibyte != -1 && in < size) >+ { >+ wchar_t *f_this_pattern; >+ size_t f_this_pattern_allocated = sizeof (wchar_t) * 1000; >+ mbstate_t mbs; >+ size_t out = 0; >+ f_pattern_count++; >+ f_pattern = xrealloc (f_pattern, >+ sizeof (wchar_t *) * f_pattern_count); >+ f_initial_byte = xrealloc (f_initial_byte, >+ sizeof (char) * >+ (2 * f_pattern_count + 1)); >+ if (f_pattern_count == 1) >+ f_initial_byte[0] = '\0'; >+ >+ /* Convert pattern into wchar_t*, storing them in this_pattern. >+ Don't read more than we're given. */ >+ f_this_pattern = xmalloc (f_this_pattern_allocated); >+ memset (&mbs, '\0', sizeof (mbs)); >+ while (in < size) >+ { >+ size_t c; >+ wchar_t this_wc; >+ if (out == f_this_pattern_allocated) >+ { >+ f_this_pattern_allocated *= 2; >+ f_this_pattern = xrealloc (f_this_pattern, >+ f_this_pattern_allocated); >+ } >+ >+ c = mbrtowc (&this_wc, pattern + in, size - in, &mbs); >+ if (c < 1) >+ { >+ /* Fall back to old method. */ >+ f_i_multibyte = -1; >+ while (f_pattern_count--) >+ free (f_pattern[f_pattern_count]); >+ free (f_pattern); >+ f_pattern = NULL; >+ break; >+ } >+ >+ f_this_pattern[out] = towlower (this_wc); >+ if (out == 0) >+ { >+ /* First character. Work out the first byte of upper and >+ lower case multibyte strings for the first character. */ >+ wchar_t wc; >+ char mbs[MB_CUR_MAX]; >+ mbstate_t ps; >+ >+ if (iswupper (this_wc)) >+ { >+ wc = towlower (this_wc); >+ } >+ else >+ { >+ wc = towupper (this_wc); >+ } >+ >+ memset (&ps, '\0', sizeof (ps)); >+ wcrtomb (mbs, this_wc, &ps); >+ mbs[1] = '\0'; >+ strcat (f_initial_byte, mbs); >+ >+ memset (&ps, '\0', sizeof (ps)); >+ wcrtomb (mbs, wc, &ps); >+ mbs[1] = '\0'; >+ strcat (f_initial_byte, mbs); >+ } >+ >+ in += c; >+ >+ if (this_wc == L'\n') >+ break; >+ >+ out++; >+ } >+ >+ if (f_i_multibyte == -1) >+ break; >+ >+ /* Nul-terminate it. */ >+ if (out == f_this_pattern_allocated) >+ { >+ f_this_pattern_allocated++; >+ f_this_pattern = xrealloc (f_this_pattern, >+ f_this_pattern_allocated); >+ } >+ >+ f_this_pattern[out] = L'\0'; >+ f_pattern[f_pattern_count - 1] = f_this_pattern; >+ f_i_multibyte = 1; >+ } >+ } >+#endif /* MBS_SUPPORT */ >+ >+ > kwsinit (); > beg = pattern; > do >@@ -523,6 +656,87 @@ > } > > static size_t >+Fimbexec (const char *buf, size_t size, size_t *plen) >+{ >+ char const *beg; >+ size_t len; >+ mbstate_t mbs; >+ >+ assert (match_icase && f_i_multibyte == 1); >+ assert (MB_CUR_MAX > 1); >+ >+ memset (&mbs, '\0', sizeof (mbs)); >+ beg = buf; >+ len = 0; >+ while (beg < buf + size) >+ { >+ wchar_t wc; >+ char const *p; >+ char const *next_char; >+ unsigned char match[f_pattern_count]; >+ size_t i, letter; >+ int patterns_left; >+ >+ for (p = beg; >+ (p < buf + size) && !strchr (f_initial_byte, *p); >+ p++) >+ ; >+ >+ if (p == NULL || p == buf + size) >+ break; >+ >+ /* First byte matches, now check the rest */ >+ beg = p; >+ letter = len = 0; >+ memset (match, '\1', f_pattern_count); >+ patterns_left = 1; >+ while (patterns_left) >+ { >+ size_t c; >+ >+ patterns_left = 0; >+ >+ c = mbrtowc (&wc, beg + len, size - (beg - buf) - len, &mbs); >+ if (c < 1) >+ { >+ memset (&mbs, '\0', sizeof (mbs)); >+ next_char = beg + 1; >+ break; >+ } >+ >+ if (!len) >+ next_char = beg + c; >+ >+ wc = towlower (wc); >+ for (i = 0; i < f_pattern_count; i++) >+ { >+ if (match[i]) >+ { >+ if (f_pattern[i][letter] == L'\0') >+ { >+ /* Found a match. */ >+ *plen = len; >+ return beg - buf; >+ } >+ >+ if (f_pattern[i][letter] == wc) >+ patterns_left = 1; >+ else >+ match[i] = '\0'; >+ } >+ } >+ >+ len += c; >+ letter++; >+ } >+ >+ beg = next_char; >+ } >+ >+ return -1; >+} >+ >+static size_t > Fexecute (char const *buf, size_t size, size_t *match_size, int exact) > { > register char const *beg, *try, *end; >@@ -531,27 +745,50 @@ > struct kwsmatch kwsmatch; > size_t ret_val; > #ifdef MBS_SUPPORT >- char *mb_properties = NULL; >- if (MB_CUR_MAX > 1) >- { >- if (match_icase) >- { >- char *case_buf = xmalloc(size); >- memcpy(case_buf, buf, size); >- buf = case_buf; >- } >- mb_properties = check_multibyte_string(buf, size); >- } >+ mbstate_t mbs; >+ memset (&mbs, '\0', sizeof (mbstate_t)); > #endif /* MBS_SUPPORT */ > > for (beg = buf; beg <= buf + size; ++beg) > { >- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); >+ size_t offset; >+#ifdef MBS_SUPPORT >+ if (match_icase && f_i_multibyte == 1) >+ offset = Fimbexec (beg, buf + size - beg, &kwsmatch.size[0]); >+ else >+#endif /* MBS_SUPPORT */ >+ offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); >+ > if (offset == (size_t) -1) > goto failure; > #ifdef MBS_SUPPORT >- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) >- continue; /* It is a part of multibyte character. */ >+ if (MB_CUR_MAX > 1 && !using_utf8) >+ { >+ size_t bytes_left = offset; >+ while (bytes_left) >+ { >+ size_t len = mbrlen (beg, bytes_left, &mbs); >+ if (len == (size_t) -1 || len == 0) >+ { >+ /* Incomplete character: treat as single-byte. */ >+ memset (&mbs, '\0', sizeof (mbstate_t)); >+ beg++; >+ bytes_left--; >+ continue; >+ } >+ >+ if (len == (size_t) -2) >+ /* Offset points inside multibyte character: no good. */ >+ break; >+ >+ beg += len; >+ bytes_left -= len; >+ } >+ >+ if (bytes_left) >+ continue; >+ } >+ else > #endif /* MBS_SUPPORT */ > beg += offset; > len = kwsmatch.size[0]; >@@ -583,10 +820,46 @@ > { > /* Try a shorter length anchored at the same place. */ > --len; >+#ifdef MBS_SUPPORT >+ if (match_icase && f_i_multibyte == 1) >+ offset = Fimbexec (beg, len, &kwsmatch.size[0]); >+ else >+#endif /* MBS_SUPPORT */ > offset = kwsexec (kwset, beg, len, &kwsmatch); >+ > if (offset == -1) { > break; /* Try a different anchor. */ > } >+#ifdef MBS_SUPPORT >+ if (MB_CUR_MAX > 1 && !using_utf8) >+ { >+ size_t bytes_left = offset; >+ while (bytes_left) >+ { >+ size_t len = mbrlen (beg, bytes_left, &mbs); >+ if (len == (size_t) -1 || len == 0) >+ { >+ /* Incomplete character: treat as single-byte. */ >+ memset (&mbs, '\0', sizeof (mbstate_t)); >+ beg++; >+ bytes_left--; >+ continue; >+ } >+ >+ if (len == (size_t) -2) >+ /* Offset points inside multibyte character: >+ * no good. */ >+ break; >+ >+ beg += len; >+ bytes_left -= len; >+ } >+ >+ if (bytes_left) >+ break; /* Try a different anchor. */ >+ } >+ else >+#endif /* MBS_SUPPORT */ > beg += offset; > len = kwsmatch.size[0]; > } >@@ -597,19 +870,31 @@ > } > > failure: >+ return -1; >+ >+ success: > #ifdef MBS_SUPPORT >- if (MB_CUR_MAX > 1) >+ if (MB_CUR_MAX > 1 && !using_utf8) > { >- if (match_icase) >- free((char *) buf); >- if (mb_properties) >- free(mb_properties); >+ end = beg + len; >+ while (end < buf + size) >+ { >+ size_t len = mbrlen (end, buf + size - end, &mbs); >+ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) >+ { >+ memset (&mbs, '\0', sizeof (mbstate_t)); >+ len = 1; >+ } >+ if (len == 1 && *end == eol) >+ break; >+ >+ end += len; >+ } > } >+ else > #endif /* MBS_SUPPORT */ >- return -1; >- >- success: > end = memchr (beg + len, eol, (buf + size) - (beg + len)); >+ > end++; > while (buf < beg && beg[-1] != eol) > --beg; >@@ -618,15 +903,6 @@ > > success_in_beg_and_len: > *match_size = len; >-#ifdef MBS_SUPPORT >- if (MB_CUR_MAX > 1) >- { >- if (mb_properties) >- free (mb_properties); >- if (match_icase) >- free ((char *) buf); >- } >-#endif /* MBS_SUPPORT */ > return beg - buf; > } >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 93443
:
59486
|
59488
|
59490
|
59492
|
59493
|
59494
|
74453
|
74455
| 83645 |
83646
|
83647
|
83648