ICU4J has a Thai-Latin Transform. This should either be ported to ICU4C, or special
syntax needs to be added to Transform to invoke a break iterator.
<quote author="Alan">
The Thai transliterator uses a temporary break-iterator solution that Mark put
together. My guess is that we never ported it to C, since the "real" solution
was to implement "\b" syntax in transliterator (which would invoke a break
iterator). So this remained a Java-only implementation.
</quote>
=======Test Code====================
/**
 * Legality filter used by the Latin-Thai round-trip test.
 *
 * Anything is legal except a string whose last character has the
 * Logical_Order_Exception property.
 */
class LegalThai : public Legal {
private:
    // Thai word break iterator created in the constructor and released in
    // the destructor. NOTE(review): is() does not consult it in this version.
    BreakIterator* thaiBreak;

public:
    /**
     * @param status ICU error code (in/out). If it already signals failure,
     *               no break iterator is created.
     */
    LegalThai(UErrorCode& status) : thaiBreak(NULL) {
        // thaiBreak starts out NULL so that the destructor's delete is safe
        // even when we bail out here without creating the iterator.
        if (U_FAILURE(status)) {
            return;
        }
        thaiBreak = BreakIterator::createWordInstance(Locale("th", "TH"), status);
    }

    ~LegalThai() {
        delete thaiBreak;
    }

    /**
     * @param sourceString the string to check.
     * @return TRUE for an empty string, or when the last character does not
     *         have the Logical_Order_Exception property; FALSE otherwise.
     */
    UBool is(UnicodeString sourceString) {
        if (sourceString.length() == 0) return TRUE;
        // Only the final charAt() value is examined; don't worry about
        // surrogates.
        UChar32 ch = sourceString.charAt(sourceString.length() - 1);
        if (u_hasBinaryProperty(ch, UCHAR_LOGICAL_ORDER_EXCEPTION)) return FALSE;
        return TRUE;
    }
};
void TransliteratorRoundTripTest::TestThai() {
RTTest test("Latin-Thai");
UErrorCode status = U_ZERO_ERROR;
LegalThai lt(status);
if(U_FAILURE(status)){
errln("Could not create a LegalThai oject. Error : %s",
u_errorName(status));
return;
}
test.test("[a-zA-Z\\u0142\\u1ECD\\u00E6\\u0131\\u0268\\u02CC]",
"[\\u0E01-\\u0E3A\\u0E40-\\u0E5B]",
"[a-zA-Z\\u0142\\u1ECD\\u00E6\\u0131\\u0268\\u02B9\\u02CC]",
this, quick, <,50 );
}