--- tesseract-2.03-orig/training/unicharset_extractor.cpp	2007-11-22 02:48:59.000000000 +0100
+++ tesseract-2.03/training/unicharset_extractor.cpp	2008-05-02 21:27:56.000000000 +0200
@@ -59,3 +59,6 @@
-  int step = UNICHAR::utf8_step(c_string);
-  if (step == 0)
-    return; // Invalid utf-8.
+  int step = 0;
+  int len = strlen(c_string);
+  for (int offset = 0; offset < len; offset += step) {
+    step = UNICHAR::utf8_step(c_string + offset);
+    if (step == 0)
+      break; // Invalid utf-8.
@@ -63,3 +66,3 @@
-  // Get the next Unicode cond point in the string.
-  UNICHAR ch(c_string, step);
-  wc = ch.first_uni();
+    // Get the next Unicode code point in the string.
+    UNICHAR ch(c_string + offset, step);
+    wc = ch.first_uni();
@@ -67,7 +70,10 @@
-  /* Copy the properties. */
-  if (iswalpha(wc)) {
-    unicharset->set_isalpha(id, 1);
-    if (iswlower(wc))
-      unicharset->set_islower(id, 1);
-    if (iswupper(wc))
-      unicharset->set_isupper(id, 1);
+    /* Copy the properties. */
+    if (iswalpha(wc)) {
+      unicharset->set_isalpha(id, 1);
+      if (iswlower(wc))
+        unicharset->set_islower(id, 1);
+      if (iswupper(wc))
+        unicharset->set_isupper(id, 1);
+    }
+    if (iswdigit(wc))
+      unicharset->set_isdigit(id, 1);
@@ -75,2 +80,0 @@
-  if (iswdigit(wc))
-    unicharset->set_isdigit(id, 1);
@@ -79,0 +84,4 @@
+enum {
+        kBufSize = 4096 // Default size, in bytes, of the c_string buffer.
+};
+
@@ -122 +130 @@
-    char c_string[kBoxReadBufSize];
+    char c_string[kBufSize];