Walk every UTF-8 character of c_string when copying character properties,
and size the c_string read buffer with a local constant.

  * Hunks at original lines 59-76: the single utf8_step()/first_uni() lookup
    becomes a loop over the whole string (offset advances by each step).
    An invalid UTF-8 sequence (step == 0) now breaks out of the loop instead
    of returning. The iswdigit() check moves inside the loop; the original
    closing brace at line 74 is reused as the end of the new for-loop.
  * Hunk after original line 79: adds the kBufSize (4096) enum constant.
  * Hunk at original line 122: c_string is declared with kBufSize instead of
    kBoxReadBufSize.

NOTE: leading whitespace inside the hunks was reconstructed (2-space base
indent assumed). If a hunk is rejected on whitespace, apply with `patch -l`.

--- tesseract-2.03-orig/training/unicharset_extractor.cpp 2007-11-22 02:48:59.000000000 +0100
+++ tesseract-2.03/training/unicharset_extractor.cpp 2008-05-02 21:27:56.000000000 +0200
@@ -59,3 +59,6 @@
-  int step = UNICHAR::utf8_step(c_string);
-  if (step == 0)
-    return; // Invalid utf-8.
+  int step = 0;
+  int len = strlen(c_string);
+  for (int offset = 0; offset < len; offset += step) {
+    step = UNICHAR::utf8_step(c_string + offset);
+    if (step == 0)
+      break; // Invalid utf-8.
@@ -63,3 +66,3 @@
-  // Get the next Unicode cond point in the string.
-  UNICHAR ch(c_string, step);
-  wc = ch.first_uni();
+    // Get the next Unicode code point in the string.
+    UNICHAR ch(c_string + offset, step);
+    wc = ch.first_uni();
@@ -67,7 +70,10 @@
-  /* Copy the properties. */
-  if (iswalpha(wc)) {
-    unicharset->set_isalpha(id, 1);
-    if (iswlower(wc))
-      unicharset->set_islower(id, 1);
-    if (iswupper(wc))
-      unicharset->set_isupper(id, 1);
+    /* Copy the properties. */
+    if (iswalpha(wc)) {
+      unicharset->set_isalpha(id, 1);
+      if (iswlower(wc))
+        unicharset->set_islower(id, 1);
+      if (iswupper(wc))
+        unicharset->set_isupper(id, 1);
+    }
+    if (iswdigit(wc))
+      unicharset->set_isdigit(id, 1);
@@ -75,2 +80,0 @@
-  if (iswdigit(wc))
-    unicharset->set_isdigit(id, 1);
@@ -79,0 +84,4 @@
+enum {
+  kBufSize = 4096 //default buffer size
+};
+
@@ -122 +130 @@
-  char c_string[kBoxReadBufSize];
+  char c_string[kBufSize];