Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the string search behavior when using ICU #57078

Merged
merged 8 commits into from
Aug 17, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 166 additions & 6 deletions src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,155 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl
return GetResultCode(err);
}

static const char* BreakIteratorRuleOld = // supported on ICU like versions 52
"$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \
"$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \
"$Control = [\\p{Grapheme_Cluster_Break = Control}]; \n" \
"$Extend = [\\p{Grapheme_Cluster_Break = Extend}]; \n" \
"$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \
"$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \
"$L = [\\p{Grapheme_Cluster_Break = L}]; \n" \
"$V = [\\p{Grapheme_Cluster_Break = V}]; \n" \
"$T = [\\p{Grapheme_Cluster_Break = T}]; \n" \
"$LV = [\\p{Grapheme_Cluster_Break = LV}]; \n" \
"$LVT = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \
"!!chain; \n" \
"!!forward; \n" \
"$L ($L | $V | $LV | $LVT); \n" \
"($LV | $V) ($V | $T); \n" \
"($LVT | $T) $T; \n" \
"$Regional_Indicator $Regional_Indicator; \n" \
"[^$Control $CR $LF] $Extend; \n" \
"[^$Control $CR $LF] $SpacingMark; \n" \
"!!reverse; \n" \
"($L | $V | $LV | $LVT) $L; \n" \
"($V | $T) ($LV | $V); \n" \
"$T ($LVT | $T); \n" \
"$Regional_Indicator $Regional_Indicator; \n" \
"$Extend [^$Control $CR $LF]; \n" \
"$SpacingMark [^$Control $CR $LF]; \n" \
"!!safe_reverse; \n" \
"!!safe_forward; \n";

static const char* BreakIteratorRuleNew = // supported on newer ICU versions like 62 and up
"!!quoted_literals_only; \n" \
"$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \
"$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \
"$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \
"$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \
"$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \
"$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \
"$Prepend = [\\p{Grapheme_Cluster_Break = Prepend}]; \n" \
"$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \
"$Virama = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]; \n" \
"$LinkingConsonant = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]; \n" \
"$ExtCccZwj = [[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]; \n" \
"$L = [\\p{Grapheme_Cluster_Break = L}]; \n" \
"$V = [\\p{Grapheme_Cluster_Break = V}]; \n" \
"$T = [\\p{Grapheme_Cluster_Break = T}]; \n" \
"$LV = [\\p{Grapheme_Cluster_Break = LV}]; \n" \
"$LVT = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \
"$Extended_Pict = [:ExtPict:]; \n" \
"!!chain; \n" \
"!!lookAheadHardBreak; \n" \
"$L ($L | $V | $LV | $LVT); \n" \
"($LV | $V) ($V | $T); \n" \
"($LVT | $T) $T; \n" \
"[^$Control $CR $LF] ($Extend | $ZWJ); \n" \
"[^$Control $CR $LF] $SpacingMark; \n" \
"$Prepend [^$Control $CR $LF]; \n" \
"$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant; \n" \
"$Extended_Pict $Extend* $ZWJ $Extended_Pict; \n" \
"^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator; \n" \
"^$Prepend* $Regional_Indicator $Regional_Indicator; \n" \
".;";

static UChar* s_breakIteratorRules = NULL;
static int32_t s_breakIteratorRulesLength = 0;

// When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to
// the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.
// Unfortunately, not all rules will have the desired behavior we need to get in .NET. For example, the rules don't allow breaking between CR '\r' and LF '\n' characters.
// When searching for "\n" in a string like "\r\n", will get not found result.
// We are customizing the break iterator to exclude the CRxLF rule which don't allow breaking between CR and LF.
// The general rules syntax explained in the doc https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html.
// The ICU latest rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt.
static UBreakIterator* CreateCustomizedBreakIterator()
{
static UChar emptyString[1];
UBreakIterator* breaker;

UErrorCode status = U_ZERO_ERROR;
if (s_breakIteratorRules != NULL)
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
{
assert(s_breakIteratorRulesLength > 0);
breaker = ubrk_openRules(s_breakIteratorRules, s_breakIteratorRulesLength, emptyString, 0, NULL, &status);
return U_FAILURE(status) ? NULL : breaker;
}

int32_t oldRulesLength = (int32_t)strlen(BreakIteratorRuleOld);
int32_t newRulesLength = (int32_t)strlen(BreakIteratorRuleNew);

int32_t breakIteratorRulesLength = newRulesLength > oldRulesLength ? newRulesLength : oldRulesLength;

UChar* rules = (UChar*)malloc(((size_t)breakIteratorRulesLength + 1) * sizeof(UChar));
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
if (rules == NULL)
{
return NULL;
}

u_uastrcpy(rules, BreakIteratorRuleNew);
tarekgh marked this conversation as resolved.
Show resolved Hide resolved

breaker = ubrk_openRules(rules, newRulesLength, emptyString, 0, NULL, &status);
if (U_FAILURE(status))
{
status = U_ZERO_ERROR;
u_uastrcpy(rules, BreakIteratorRuleOld);
breaker = ubrk_openRules(rules, oldRulesLength, emptyString, 0, NULL, &status);
s_breakIteratorRulesLength = oldRulesLength;
}
else
{
s_breakIteratorRulesLength = newRulesLength;
}

if (U_FAILURE(status))
{
free(rules);
return NULL;
}

UChar* pNull = NULL;
if (!pal_atomic_cas_ptr((void* volatile*)&s_breakIteratorRules, rules, pNull))
{
free(rules);
assert(s_breakIteratorRules != NULL);
}

return breaker;
}

static void CloseSearchIterator(UStringSearch* pSearch)
{
assert(pSearch != NULL);

#if !defined(TARGET_WINDOWS)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif
UBreakIterator* breakIterator = (UBreakIterator*)usearch_getBreakIterator(pSearch);
#if !defined(TARGET_WINDOWS)
#pragma GCC diagnostic pop
#endif

usearch_close(pSearch);

if (breakIterator != NULL)
{
ubrk_close(breakIterator);
}
}

void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
{
for (int i = 0; i <= CompareOptionsMask; i++)
Expand All @@ -460,7 +609,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
{
if (pSearch != USED_STRING_SEARCH)
{
usearch_close(pSearch);
CloseSearchIterator(pSearch);
}
pSortHandle->searchIteratorList[i].searchIterator = NULL;
SearchIteratorNode* pNext = pSortHandle->searchIteratorList[i].next;
Expand All @@ -470,7 +619,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
{
if (pNext->searchIterator != NULL && pNext->searchIterator != USED_STRING_SEARCH)
{
usearch_close(pNext->searchIterator);
CloseSearchIterator(pNext->searchIterator);
}
SearchIteratorNode* pCurrent = pNext;
pNext = pCurrent->next;
Expand Down Expand Up @@ -581,9 +730,14 @@ static int32_t GetSearchIteratorUsingCollator(

if (*pSearchIterator == NULL)
{
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err);
UBreakIterator* breakIterator = CreateCustomizedBreakIterator();
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err);
if (!U_SUCCESS(err))
{
if (breakIterator != NULL)
{
ubrk_close(breakIterator);
}
assert(false && "Couldn't open the search iterator.");
return -1;
}
Expand All @@ -593,7 +747,7 @@ static int32_t GetSearchIteratorUsingCollator(
{
if (!CreateNewSearchNode(pSortHandle, options))
{
usearch_close(*pSearchIterator);
CloseSearchIterator(*pSearchIterator);
return -1;
}
}
Expand All @@ -619,16 +773,22 @@ static int32_t GetSearchIteratorUsingCollator(

if (*pSearchIterator == NULL) // Couldn't find any available handle to borrow then create a new one.
{
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err);
UBreakIterator* breakIterator = CreateCustomizedBreakIterator();
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err);
if (!U_SUCCESS(err))
{
if (breakIterator != NULL)
{
ubrk_close(breakIterator);
}

assert(false && "Couldn't open a new search iterator.");
return -1;
}

if (!CreateNewSearchNode(pSortHandle, options))
{
usearch_close(*pSearchIterator);
CloseSearchIterator(*pSearchIterator);
return -1;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len,
PER_FUNCTION_BLOCK(u_tolower, libicuuc, true) \
PER_FUNCTION_BLOCK(u_toupper, libicuuc, true) \
PER_FUNCTION_BLOCK(u_uastrcpy, libicuuc, true) \
PER_FUNCTION_BLOCK(ubrk_close, libicui18n, true) \
PER_FUNCTION_BLOCK(ubrk_openRules, libicui18n, true) \
PER_FUNCTION_BLOCK(ucal_add, libicui18n, true) \
PER_FUNCTION_BLOCK(ucal_close, libicui18n, true) \
PER_FUNCTION_BLOCK(ucal_get, libicui18n, true) \
Expand Down Expand Up @@ -155,6 +157,7 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len,
PER_FUNCTION_BLOCK(ures_open, libicuuc, true) \
PER_FUNCTION_BLOCK(usearch_close, libicui18n, true) \
PER_FUNCTION_BLOCK(usearch_first, libicui18n, true) \
PER_FUNCTION_BLOCK(usearch_getBreakIterator, libicui18n, true) \
PER_FUNCTION_BLOCK(usearch_getMatchedLength, libicui18n, true) \
PER_FUNCTION_BLOCK(usearch_last, libicui18n, true) \
PER_FUNCTION_BLOCK(usearch_openFromCollator, libicui18n, true) \
Expand Down Expand Up @@ -216,6 +219,8 @@ FOR_ALL_ICU_FUNCTIONS
#define u_tolower(...) u_tolower_ptr(__VA_ARGS__)
#define u_toupper(...) u_toupper_ptr(__VA_ARGS__)
#define u_uastrcpy(...) u_uastrcpy_ptr(__VA_ARGS__)
#define ubrk_close(...) ubrk_close_ptr(__VA_ARGS__)
#define ubrk_openRules(...) ubrk_openRules_ptr(__VA_ARGS__)
#define ucal_add(...) ucal_add_ptr(__VA_ARGS__)
#define ucal_close(...) ucal_close_ptr(__VA_ARGS__)
#define ucal_get(...) ucal_get_ptr(__VA_ARGS__)
Expand Down Expand Up @@ -310,6 +315,7 @@ FOR_ALL_ICU_FUNCTIONS
#define ures_open(...) ures_open_ptr(__VA_ARGS__)
#define usearch_close(...) usearch_close_ptr(__VA_ARGS__)
#define usearch_first(...) usearch_first_ptr(__VA_ARGS__)
#define usearch_getBreakIterator(...) usearch_getBreakIterator_ptr(__VA_ARGS__)
#define usearch_getMatchedLength(...) usearch_getMatchedLength_ptr(__VA_ARGS__)
#define usearch_last(...) usearch_last_ptr(__VA_ARGS__)
#define usearch_openFromCollator(...) usearch_openFromCollator_ptr(__VA_ARGS__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,6 @@ typedef struct UFieldPosition {
int32_t endIndex;
} UFieldPosition;


void u_charsToUChars(const char * cs, UChar * us, int32_t length);
void u_getVersion(UVersionInfo versionArray);
int32_t u_strlen(const UChar * s);
Expand All @@ -443,6 +442,8 @@ UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n);
UChar32 u_tolower(UChar32 c);
UChar32 u_toupper(UChar32 c);
UChar* u_uastrcpy(UChar * dst, const char * src);
void ubrk_close(UBreakIterator * bi);
UBreakIterator* ubrk_openRules(const UChar * rules, int32_t rulesLength, const UChar * text, int32_t textLength, UParseError * parseErr, UErrorCode * status);
void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status);
void ucal_close(UCalendar * cal);
int32_t ucal_get(const UCalendar * cal, UCalendarDateFields field, UErrorCode * status);
Expand Down Expand Up @@ -532,6 +533,7 @@ const UChar * ures_getStringByIndex(const UResourceBundle * resourceBundle, int3
UResourceBundle * ures_open(const char * packageName, const char * locale, UErrorCode * status);
void usearch_close(UStringSearch * searchiter);
int32_t usearch_first(UStringSearch * strsrch, UErrorCode * status);
const UBreakIterator* usearch_getBreakIterator(const UStringSearch * strsrch);
int32_t usearch_getMatchedLength(const UStringSearch * strsrch);
int32_t usearch_last(UStringSearch * strsrch, UErrorCode * status);
UStringSearch * usearch_openFromCollator(const UChar * pattern, int32_t patternlength, const UChar * text, int32_t textlength, const UCollator * collator, UBreakIterator * breakiter, UErrorCode * status);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public static IEnumerable<object[]> IndexOf_TestData()
yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 0, 6, CompareOptions.Ordinal, -1, 0 };
yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 0, 11, CompareOptions.IgnoreNonSpace, 4, 7 };
yield return new object[] { s_invariantCompare, "o\u0308", "o", 0, 2, CompareOptions.None, -1, 0 };
yield return new object[] { s_invariantCompare, "\r\n", "\n", 0, 2, CompareOptions.None, 1, 1 };

// Weightless characters
yield return new object[] { s_invariantCompare, "", "\u200d", 0, 0, CompareOptions.None, 0, 0 };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ public static IEnumerable<object[]> LastIndexOf_TestData()
yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 5, 6, CompareOptions.Ordinal, -1, 0 };
yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 10, 11, CompareOptions.IgnoreNonSpace, 4, 7 };
yield return new object[] { s_invariantCompare, "o\u0308", "o", 1, 2, CompareOptions.None, -1, 0 };
yield return new object[] { s_invariantCompare, "\r\n", "\n", 1, 2, CompareOptions.None, 1, 1 };

// Weightless characters
// NLS matches weightless characters at the end of the string
Expand Down