When regular expression matching certain strings, uregex_find() is pinning the
CPU for minutes to hours before I finally give up a terminate the program.
Similar regular expression/string pairs return successfully. It is just certain
combinations that are running into some kind of looping condition within the
ICU.
The following C program recreates the issue for me on Windows. We are having
similar troubles on Solaris as well. We are using ICU4C 3.2, but I experienced
the problem under ICU4C 3.4.1 as well.
#include <stdlib.h>
#include "unicode/uchar.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "unicode/utypes.h"
/**
- Builds the UTF-16 equivalent of specified UTF-8 string. The resulting
string
- is dynamically allocated and must be freed by the caller.
*
- @param utf8String null-terminated UTF-8 string
- @return Null-terminated UTF-16 string.
*/
static UChar *
buildUTF16StringFromUTF8String(const char *utf8String)
{
UChar *utf16String;
UErrorCode errorCode = U_ZERO_ERROR;
int32_t utf8StringLength;
/* determine length of UTF-8 string */
u_strFromUTF8(NULL, 0, &utf8StringLength, utf8String, -1, &errorCode);
if (U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
return NULL;
}
/* allocate memory for UTF-16 string */
utf16String = (UChar *) malloc((utf8StringLength + 1) * sizeof(UChar));
if (utf16String == NULL) {
return NULL;
}
/* convert string from UTF-8 to UTF-16 */
errorCode = U_ZERO_ERROR;
u_strFromUTF8(utf16String, utf8StringLength + 1, NULL, utf8String, -1,
&errorCode);
if (U_FAILURE(errorCode)) {
free(utf16String);
return NULL;
}
return utf16String;
}
int
main(int argc, char **argv)
{
char *utf8RegularExpressionString =
"\\x19(([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?){0,}V(([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?){0,}U(([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?){0,}[[:L:][:N:][:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]{0,1}?(([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)([[:M:][:P:][:S:][[:Z:]-[\\x19\\x1B]]]|\\x19)*?){0,}\\x19";
UChar *utf16RegularExpressionString = NULL;
UErrorCode errorCode = U_ZERO_ERROR;
URegularExpression *regularExpression = NULL;
char *utf8String = "\x19LU\x19 \x19VUSTRU\x19 \x19SUKURUSU\x19
.................... \x19LU\x19 \x19NUSTRU\x19 \x19PRUFUSUNU\x19 \x1B";
UChar *utf16String = NULL;
(void) argc;
(void) argv;
utf16RegularExpressionString =
buildUTF16StringFromUTF8String(utf8RegularExpressionString);
regularExpression = uregex_open(utf16RegularExpressionString, -1, 0, NULL,
&errorCode);
free(utf16RegularExpressionString);
utf16String = buildUTF16StringFromUTF8String(utf8String);
uregex_setText(regularExpression, utf16String, -1, &errorCode);
uregex_find(regularExpression, 0, &errorCode);
free(utf16String);
uregex_close(regularExpression);
return 0;
}