Attachment #151673 for bug #197167

Lines 59-61 Link Here

(-)tesseract-2.03-orig/training/unicharset_extractor.cpp (-16 / +24 lines)
59	int step = UNICHAR::utf8_step(c_string);	59	int step = 0;
60	if (step == 0)	60	int len = strlen(c_string);
61	return; // Invalid utf-8.	61	for (int offset = 0; offset < len; offset += step) {
		62	step = UNICHAR::utf8_step(c_string + offset);
		63	if (step == 0)
		64	break; // Invalid utf-8.
Lines 63-65 Link Here
63	// Get the next Unicode code point in the string.	66	// Get the next Unicode code point in the string.
64	UNICHAR ch(c_string, step);	67	UNICHAR ch(c_string + offset, step);
65	wc = ch.first_uni();	68	wc = ch.first_uni();
Lines 67-73 Link Here
67	/* Copy the properties. */	70	/* Copy the properties. */
68	if (iswalpha(wc)) {	71	if (iswalpha(wc)) {
69	unicharset->set_isalpha(id, 1);	72	unicharset->set_isalpha(id, 1);
70	if (iswlower(wc))	73	if (iswlower(wc))
71	unicharset->set_islower(id, 1);	74	unicharset->set_islower(id, 1);
72	if (iswupper(wc))	75	if (iswupper(wc))
73	unicharset->set_isupper(id, 1);	76	unicharset->set_isupper(id, 1);
		77	}
		78	if (iswdigit(wc))
		79	unicharset->set_isdigit(id, 1);
Lines 75-76 Link Here
75	if (iswdigit(wc))
76	unicharset->set_isdigit(id, 1);
Line 79 Link Here
		84	enum {
		85	kBufSize = 4096 //default buffer size
		86	};
		87
Line 122 Link Here
122	char c_string[kBoxReadBufSize];	130	char c_string[kBufSize];

Return to bug 197167