I found a bug in the string search code. We are using ICU 3.2.1, but I confirmed that the bug also exists in ICU 3.8. Here is a self-contained program that reproduces the problem. I expect match to be 1, but I get a value of 4. Also, this bug only occurs when I use a Strength 1 collator. If I use a Strength 2 or greater collator, I get the correct value of 1.
#include <stdio.h>
#include "unicode/ucol.h"
#include "unicode/ubrk.h"
#include "unicode/usearch.h"
/*
 * Reproduction driver: searches `source` for `search` with a collator built
 * from the short-string definition "LDE_AN_CX_EX_FX_HX_NX_S1" and prints the
 * index returned by usearch_first() as "match=<index>".
 *
 * source[1..2] is a literal copy of the two-unit pattern, which is why the
 * reporter expects the printed index to be 1.
 *
 * Returns 0 when every ICU call succeeded and the result was printed,
 * 1 when any ICU call reported a failure (a diagnostic line is printed first).
 * Every ICU object opened here is closed before returning, on all paths.
 */
int main(void)
{
    const UChar search[] = { 0x00C2, 0x0303 };
    const UChar source[] = { 0x0020,
        0x00C2, 0x0303, 0x0020, 0x0041, 0x0061,
        0x1EAA, 0x0041, 0x0302, 0x0303, 0x00C2,
        0x0303, 0x1EAB, 0x0061, 0x0302, 0x0303,
        0x00E2, 0x0303, 0xD806, 0xDC01, 0x0300,
        0x0020, };
    /* Lengths are counted in UChar code units, not bytes. */
    const int32_t searchLen = (int32_t)(sizeof(search) / sizeof(search[0]));
    const int32_t sourceLen = (int32_t)(sizeof(source) / sizeof(source[0]));

    UErrorCode icuStatus = U_ZERO_ERROR;
    /* Handles start out NULL so the cleanup section can tell what was opened. */
    UCollator *coll = NULL;
    UBreakIterator *ubrk = NULL;
    UStringSearch *usearch = NULL;
    const char *locale = NULL;
    int32_t match = 0;
    int rc = 1; /* pessimistic: flipped to 0 only after the result is printed */

    /* Second argument is forceDefaults; a literal 0 is used instead of
     * `false` so the call compiles without <stdbool.h>. */
    coll = ucol_openFromShortString("LDE_AN_CX_EX_FX_HX_NX_S1",
                                    0,
                                    NULL,
                                    &icuStatus);
    if (U_FAILURE(icuStatus)) {
        printf("ucol_openFromShortString error\n");
        goto cleanup;
    }

    locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &icuStatus);
    if (U_FAILURE(icuStatus)) {
        printf("ucol_getLocaleByType error\n");
        goto cleanup;
    }

    /* NOTE(review): this character break iterator is opened but NULL is what
     * gets passed to usearch_openFromCollator() below, so the search does not
     * use it. Kept as in the original report so the reproduction is unchanged;
     * confirm whether passing `ubrk` was intended. */
    ubrk = ubrk_open(UBRK_CHARACTER, locale, source, sourceLen, &icuStatus);
    if (U_FAILURE(icuStatus)) {
        printf("ubrk_open error\n");
        goto cleanup;
    }

    usearch = usearch_openFromCollator(search, searchLen,
                                       source, sourceLen,
                                       coll,
                                       NULL,
                                       &icuStatus);
    if (U_FAILURE(icuStatus)) {
        printf("usearch_openFromCollator error\n");
        goto cleanup;
    }

    usearch_setAttribute(usearch, USEARCH_OVERLAP, USEARCH_ON, &icuStatus);
    if (U_FAILURE(icuStatus)) {
        printf("usearch_setAttribute error\n");
        goto cleanup;
    }

    match = usearch_first(usearch, &icuStatus);
    if (U_FAILURE(icuStatus)) {
        printf("usearch_first error\n");
        goto cleanup;
    }

    /* Cast keeps the argument type in step with %d even if int32_t != int. */
    printf("match=%d\n", (int)match);
    rc = 0;

cleanup:
    /* Release in reverse order of creation: the search was built from the
     * collator, so it is closed first. Explicit NULL checks avoid relying on
     * how a particular ICU release treats a NULL handle. */
    if (usearch != NULL) {
        usearch_close(usearch);
    }
    if (ubrk != NULL) {
        ubrk_close(ubrk);
    }
    if (coll != NULL) {
        ucol_close(coll);
    }
    return rc;
}