From cd7fc5cff7a274335cd658540d567aaad36fd7c7 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 9 Aug 2021 12:36:19 -0700 Subject: [PATCH 1/7] Fix the string search behavior when using ICU --- .../pal_collation.c | 73 +++++++++++++++++-- .../pal_icushim_internal.h | 6 ++ .../pal_icushim_internal_android.h | 4 +- .../CompareInfo/CompareInfoTests.IndexOf.cs | 5 ++ .../CompareInfoTests.LastIndexOf.cs | 5 ++ 5 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 0e0a46c5b1c59..7e57f31d2c66a 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -449,6 +449,56 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl return GetResultCode(err); } +static const char* BreakIteratorRule = "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \ + "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ + "$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \ + "$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \ + "$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \ + "[^$Control $CR $LF] ($Extend | $ZWJ); \n"; + +// When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to +// the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules. +// Unfortunately, not all rules will have the desired behavior we need to get in .NET. For example, the rules don't allow breaking between CR '\r' and LF '\n' characters. +// When searching for "\n" in a string like "\r\n", will get not found result. +// We are customizing the break iterator to include only the rules which give the desired behavior. Mainly, we include the GB9 rule http://www.unicode.org/reports/tr29/#GB9 +// which doesn't allow breaking before the nonspace marks. +// The general rules syntax explained in the doc https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html. +// The ICU rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt. +static UBreakIterator* CreateCustomizedBreakIterator() +{ + UChar uRule[400]; + assert(strlen(BreakIteratorRule) < 400); + u_uastrcpy(uRule, BreakIteratorRule); + + static UChar emptyString[1]; + UErrorCode status = U_ZERO_ERROR; + + UBreakIterator* breaker = ubrk_openRules(uRule, (int32_t)strlen(BreakIteratorRule), emptyString, 0, NULL, &status); + + return U_FAILURE(status) ? NULL : breaker; +} + +static void CloseSearchIterator(UStringSearch* pSearch) +{ + assert(pSearch != NULL); + +#if !defined(TARGET_WINDOWS) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + UBreakIterator* breakIterator = (UBreakIterator*)usearch_getBreakIterator(pSearch); +#if !defined(TARGET_WINDOWS) +#pragma GCC diagnostic pop +#endif + + usearch_close(pSearch); + + if (breakIterator != NULL) + { + ubrk_close(breakIterator); + } +} + void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) { for (int i = 0; i <= CompareOptionsMask; i++) @@ -460,7 +510,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) { if (pSearch != USED_STRING_SEARCH) { - usearch_close(pSearch); + CloseSearchIterator(pSearch); } pSortHandle->searchIteratorList[i].searchIterator = NULL; SearchIteratorNode* pNext = pSortHandle->searchIteratorList[i].next; @@ -470,7 +520,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) { if (pNext->searchIterator != NULL && pNext->searchIterator != USED_STRING_SEARCH) { - usearch_close(pNext->searchIterator); + CloseSearchIterator(pNext->searchIterator); } SearchIteratorNode* pCurrent = pNext; pNext = pCurrent->next; @@ -581,9 +631,14 @@ static int32_t GetSearchIteratorUsingCollator( if (*pSearchIterator == NULL) { - *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err); + UBreakIterator* breakIterator = CreateCustomizedBreakIterator(); + *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err); if (!U_SUCCESS(err)) { + if (breakIterator != NULL) + { + ubrk_close(breakIterator); + } assert(false && "Couldn't open the search iterator."); return -1; } @@ -593,7 +648,7 @@ static int32_t GetSearchIteratorUsingCollator( { if (!CreateNewSearchNode(pSortHandle, options)) { - usearch_close(*pSearchIterator); + CloseSearchIterator(*pSearchIterator); return -1; } } @@ -619,16 +674,22 @@ static int32_t GetSearchIteratorUsingCollator( if (*pSearchIterator == NULL) // Couldn't find any available handle to borrow then create a new one. { - *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err); + UBreakIterator* breakIterator = CreateCustomizedBreakIterator(); + *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err); if (!U_SUCCESS(err)) { + if (breakIterator != NULL) + { + ubrk_close(breakIterator); + } + assert(false && "Couldn't open a new search iterator."); return -1; } if (!CreateNewSearchNode(pSortHandle, options)) { - usearch_close(*pSearchIterator); + CloseSearchIterator(*pSearchIterator); return -1; } diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h index b70e31fe9c0bb..e6acb61c74289 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h @@ -73,6 +73,8 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len, PER_FUNCTION_BLOCK(u_tolower, libicuuc, true) \ PER_FUNCTION_BLOCK(u_toupper, libicuuc, true) \ PER_FUNCTION_BLOCK(u_uastrcpy, libicuuc, true) \ + PER_FUNCTION_BLOCK(ubrk_close, libicui18n, true) \ + PER_FUNCTION_BLOCK(ubrk_openRules, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_add, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_close, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_get, libicui18n, true) \ @@ -155,6 +157,7 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len, PER_FUNCTION_BLOCK(ures_open, libicuuc, true) \ PER_FUNCTION_BLOCK(usearch_close, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_first, libicui18n, true) \ + PER_FUNCTION_BLOCK(usearch_getBreakIterator, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_getMatchedLength, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_last, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_openFromCollator, libicui18n, true) \ @@ -216,6 +219,8 @@ FOR_ALL_ICU_FUNCTIONS #define u_tolower(...) u_tolower_ptr(__VA_ARGS__) #define u_toupper(...) u_toupper_ptr(__VA_ARGS__) #define u_uastrcpy(...) u_uastrcpy_ptr(__VA_ARGS__) +#define ubrk_close(...) ubrk_close_ptr(__VA_ARGS__) +#define ubrk_openRules(...) ubrk_openRules_ptr(__VA_ARGS__) #define ucal_add(...) ucal_add_ptr(__VA_ARGS__) #define ucal_close(...) ucal_close_ptr(__VA_ARGS__) #define ucal_get(...) ucal_get_ptr(__VA_ARGS__) @@ -310,6 +315,7 @@ FOR_ALL_ICU_FUNCTIONS #define ures_open(...) ures_open_ptr(__VA_ARGS__) #define usearch_close(...) usearch_close_ptr(__VA_ARGS__) #define usearch_first(...) usearch_first_ptr(__VA_ARGS__) +#define usearch_getBreakIterator(...) usearch_getBreakIterator_ptr(__VA_ARGS__) #define usearch_getMatchedLength(...) usearch_getMatchedLength_ptr(__VA_ARGS__) #define usearch_last(...) usearch_last_ptr(__VA_ARGS__) #define usearch_openFromCollator(...) usearch_openFromCollator_ptr(__VA_ARGS__) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h index 4439787705c87..a1b42cdcdadc8 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h @@ -433,7 +433,6 @@ typedef struct UFieldPosition { int32_t endIndex; } UFieldPosition; - void u_charsToUChars(const char * cs, UChar * us, int32_t length); void u_getVersion(UVersionInfo versionArray); int32_t u_strlen(const UChar * s); @@ -443,6 +442,8 @@ UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n); UChar32 u_tolower(UChar32 c); UChar32 u_toupper(UChar32 c); UChar* u_uastrcpy(UChar * dst, const char * src); +void ubrk_close(UBreakIterator * bi); +UBreakIterator* ubrk_openRules(const UChar * rules, int32_t rulesLength, const UChar * text, int32_t textLength, UParseError * parseErr, UErrorCode * status); void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status); void ucal_close(UCalendar * cal); int32_t ucal_get(const UCalendar * cal, UCalendarDateFields field, UErrorCode * status); @@ -532,6 +533,7 @@ const UChar * ures_getStringByIndex(const UResourceBundle * resourceBundle, int3 UResourceBundle * ures_open(const char * packageName, const char * locale, UErrorCode * status); void usearch_close(UStringSearch * searchiter); int32_t usearch_first(UStringSearch * strsrch, UErrorCode * status); +const UBreakIterator* usearch_getBreakIterator(const UStringSearch * strsrch); int32_t usearch_getMatchedLength(const UStringSearch * strsrch); int32_t usearch_last(UStringSearch * strsrch, UErrorCode * status); UStringSearch * usearch_openFromCollator(const UChar * pattern, int32_t patternlength, const UChar * text, int32_t textlength, const UCollator * collator, UBreakIterator * breakiter, UErrorCode * status); diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs index 960db32be6622..2d8d3953e10ea 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs @@ -72,6 +72,11 @@ public static IEnumerable IndexOf_TestData() yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 0, 6, CompareOptions.Ordinal, -1, 0 }; yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 0, 11, CompareOptions.IgnoreNonSpace, 4, 7 }; yield return new object[] { s_invariantCompare, "o\u0308", "o", 0, 2, CompareOptions.None, -1, 0 }; + yield return new object[] { s_invariantCompare, "\r\n", "\n", 0, 2, CompareOptions.None, 1, 1 }; + yield return new object[] { s_invariantCompare, "\u0600x", "x", 0, 2, CompareOptions.None, 1, 1 }; + yield return new object[] { s_invariantCompare, "\u0601x", "x", 0, 2, CompareOptions.None, 1, 1 }; + yield return new object[] { s_invariantCompare, "x\u0600", "x", 0, 2, CompareOptions.None, 0, 1 }; + yield return new object[] { s_invariantCompare, "x\u0601", "x", 0, 2, CompareOptions.None, 0, 1 }; // Weightless characters yield return new object[] { s_invariantCompare, "", "\u200d", 0, 0, CompareOptions.None, 0, 0 }; diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs index a400dedbdca66..bc0c59ace3c3a 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs @@ -86,6 +86,11 @@ public static IEnumerable LastIndexOf_TestData() yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 5, 6, CompareOptions.Ordinal, -1, 0 }; yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 10, 11, CompareOptions.IgnoreNonSpace, 4, 7 }; yield return new object[] { s_invariantCompare, "o\u0308", "o", 1, 2, CompareOptions.None, -1, 0 }; + yield return new object[] { s_invariantCompare, "\r\n", "\n", 1, 2, CompareOptions.None, 1, 1 }; + yield return new object[] { s_invariantCompare, "x\u0600", "x", 1, 2, CompareOptions.None, 0, 1 }; + yield return new object[] { s_invariantCompare, "x\u0601", "x", 1, 2, CompareOptions.None, 0, 1 }; + yield return new object[] { s_invariantCompare, "\u0600x", "x", 1, 2, CompareOptions.None, 1, 1 }; + yield return new object[] { s_invariantCompare, "\u0601x", "x", 1, 2, CompareOptions.None, 1, 1 }; // Weightless characters // NLS matches weightless characters at the end of the string From 6eac118b49795de180cde12b4786a323782391af Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 10 Aug 2021 18:10:00 -0700 Subject: [PATCH 2/7] Fix the break with the old ICU versions --- .../Native/Unix/System.Globalization.Native/pal_collation.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 7e57f31d2c66a..4f52801376bb5 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -453,8 +453,7 @@ static const char* BreakIteratorRule = "$CR = [\\p{Grapheme_Cluster_Break = "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ "$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \ "$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \ - "$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \ - "[^$Control $CR $LF] ($Extend | $ZWJ); \n"; + "[^$Control $CR $LF] ($Extend | [\\u200D]); \n"; // When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to // the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules. From 131b1ef4ebe57a3b3c867a352abc293706441730 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 16 Aug 2021 21:26:46 -0700 Subject: [PATCH 3/7] Restrict the change to exclude only the CRxLF breaking rule --- .../pal_collation.c | 128 ++++++++++++++++-- .../CompareInfo/CompareInfoTests.IndexOf.cs | 4 - 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 4f52801376bb5..d8c462ae237c5 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -449,32 +449,132 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl return GetResultCode(err); } -static const char* BreakIteratorRule = "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \ - "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ - "$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \ - "$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \ - "[^$Control $CR $LF] ($Extend | [\\u200D]); \n"; +static const char* BreakIteratorRuleOld = // supported on ICU like versions 52 + "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \ + "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ + "$Control = [\\p{Grapheme_Cluster_Break = Control}]; \n" \ + "$Extend = [\\p{Grapheme_Cluster_Break = Extend}]; \n" \ + "$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \ + "$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \ + "$L = [\\p{Grapheme_Cluster_Break = L}]; \n" \ + "$V = [\\p{Grapheme_Cluster_Break = V}]; \n" \ + "$T = [\\p{Grapheme_Cluster_Break = T}]; \n" \ + "$LV = [\\p{Grapheme_Cluster_Break = LV}]; \n" \ + "$LVT = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \ + "!!chain; \n" \ + "!!forward; \n" \ + "$L ($L | $V | $LV | $LVT); \n" \ + "($LV | $V) ($V | $T); \n" \ + "($LVT | $T) $T; \n" \ + "$Regional_Indicator $Regional_Indicator; \n" \ + "[^$Control $CR $LF] $Extend; \n" \ + "[^$Control $CR $LF] $SpacingMark; \n" \ + "!!reverse; \n" \ + "($L | $V | $LV | $LVT) $L; \n" \ + "($V | $T) ($LV | $V); \n" \ + "$T ($LVT | $T); \n" \ + "$Regional_Indicator $Regional_Indicator; \n" \ + "$Extend [^$Control $CR $LF]; \n" \ + "$SpacingMark [^$Control $CR $LF]; \n" \ + "!!safe_reverse; \n" \ + "!!safe_forward; \n"; + +static const char* BreakIteratorRuleNew = // supported on newer ICU versions like 62 and up + "!!quoted_literals_only; \n" \ + "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \ + "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ + "$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \ + "$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \ + "$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \ + "$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \ + "$Prepend = [\\p{Grapheme_Cluster_Break = Prepend}]; \n" \ + "$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \ + "$Virama = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]; \n" \ + "$LinkingConsonant = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]; \n" \ + "$ExtCccZwj = [[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]; \n" \ + "$L = [\\p{Grapheme_Cluster_Break = L}]; \n" \ + "$V = [\\p{Grapheme_Cluster_Break = V}]; \n" \ + "$T = [\\p{Grapheme_Cluster_Break = T}]; \n" \ + "$LV = [\\p{Grapheme_Cluster_Break = LV}]; \n" \ + "$LVT = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \ + "$Extended_Pict = [:ExtPict:]; \n" \ + "!!chain; \n" \ + "!!lookAheadHardBreak; \n" \ + "$L ($L | $V | $LV | $LVT); \n" \ + "($LV | $V) ($V | $T); \n" \ + "($LVT | $T) $T; \n" \ + "[^$Control $CR $LF] ($Extend | $ZWJ); \n" \ + "[^$Control $CR $LF] $SpacingMark; \n" \ + "$Prepend [^$Control $CR $LF]; \n" \ + "$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant; \n" \ + "$Extended_Pict $Extend* $ZWJ $Extended_Pict; \n" \ + "^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator; \n" \ + "^$Prepend* $Regional_Indicator $Regional_Indicator; \n" \ + ".;"; + +static UChar* s_breakIteratorRules = NULL; +static int32_t s_breakIteratorRulesLength = 0; // When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to // the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules. // Unfortunately, not all rules will have the desired behavior we need to get in .NET. For example, the rules don't allow breaking between CR '\r' and LF '\n' characters. // When searching for "\n" in a string like "\r\n", will get not found result. -// We are customizing the break iterator to include only the rules which give the desired behavior. Mainly, we include the GB9 rule http://www.unicode.org/reports/tr29/#GB9 -// which doesn't allow breaking before the nonspace marks. +// We are customizing the break iterator to exclude the CRxLF rule which don't allow breaking between CR and LF. // The general rules syntax explained in the doc https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html. -// The ICU rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt. +// The ICU latest rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt. static UBreakIterator* CreateCustomizedBreakIterator() { - UChar uRule[400]; - assert(strlen(BreakIteratorRule) < 400); - u_uastrcpy(uRule, BreakIteratorRule); - static UChar emptyString[1]; + UBreakIterator* breaker; + UErrorCode status = U_ZERO_ERROR; + if (s_breakIteratorRules != NULL) + { + assert(s_breakIteratorRulesLength > 0); + breaker = ubrk_openRules(s_breakIteratorRules, s_breakIteratorRulesLength, emptyString, 0, NULL, &status); + return U_FAILURE(status) ? NULL : breaker; + } + + int32_t oldRulesLength = (int32_t)strlen(BreakIteratorRuleOld); + int32_t newRulesLength = (int32_t)strlen(BreakIteratorRuleNew); + + int32_t breakIteratorRulesLength = newRulesLength > oldRulesLength ? newRulesLength : oldRulesLength; - UBreakIterator* breaker = ubrk_openRules(uRule, (int32_t)strlen(BreakIteratorRule), emptyString, 0, NULL, &status); + UChar* rules = (UChar*)malloc((breakIteratorRulesLength + 1) * sizeof(UChar)); + if (rules == NULL) + { + return NULL; + } + + u_uastrcpy(rules, BreakIteratorRuleNew); + + breaker = ubrk_openRules(rules, newRulesLength, emptyString, 0, NULL, &status); + if (U_FAILURE(status)) + { + status = U_ZERO_ERROR; + u_uastrcpy(rules, BreakIteratorRuleOld); + breaker = ubrk_openRules(rules, oldRulesLength, emptyString, 0, NULL, &status); + s_breakIteratorRulesLength = oldRulesLength; + } + else + { + s_breakIteratorRulesLength = newRulesLength; + } + + if (U_FAILURE(status)) + { + free(rules); + return NULL; + } + + UChar* pNull = NULL; + if (!pal_atomic_cas_ptr((void* volatile*)&s_breakIteratorRules, rules, pNull)) + { + free(rules); + assert(s_breakIteratorRules != NULL); + } - return U_FAILURE(status) ? NULL : breaker; + return breaker; } static void CloseSearchIterator(UStringSearch* pSearch) diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs index 2d8d3953e10ea..8c1956754236a 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs @@ -73,10 +73,6 @@ public static IEnumerable IndexOf_TestData() yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 0, 11, CompareOptions.IgnoreNonSpace, 4, 7 }; yield return new object[] { s_invariantCompare, "o\u0308", "o", 0, 2, CompareOptions.None, -1, 0 }; yield return new object[] { s_invariantCompare, "\r\n", "\n", 0, 2, CompareOptions.None, 1, 1 }; - yield return new object[] { s_invariantCompare, "\u0600x", "x", 0, 2, CompareOptions.None, 1, 1 }; - yield return new object[] { s_invariantCompare, "\u0601x", "x", 0, 2, CompareOptions.None, 1, 1 }; - yield return new object[] { s_invariantCompare, "x\u0600", "x", 0, 2, CompareOptions.None, 0, 1 }; - yield return new object[] { s_invariantCompare, "x\u0601", "x", 0, 2, CompareOptions.None, 0, 1 }; // Weightless characters yield return new object[] { s_invariantCompare, "", "\u200d", 0, 0, CompareOptions.None, 0, 0 }; From 44bdc8b4252ad62a7fa206947aed50f4e7b6b885 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 16 Aug 2021 21:29:45 -0700 Subject: [PATCH 4/7] Remove previousely added test --- .../tests/CompareInfo/CompareInfoTests.LastIndexOf.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs index bc0c59ace3c3a..e5bd894e08f63 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs @@ -87,10 +87,6 @@ public static IEnumerable LastIndexOf_TestData() yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 10, 11, CompareOptions.IgnoreNonSpace, 4, 7 }; yield return new object[] { s_invariantCompare, "o\u0308", "o", 1, 2, CompareOptions.None, -1, 0 }; yield return new object[] { s_invariantCompare, "\r\n", "\n", 1, 2, CompareOptions.None, 1, 1 }; - yield return new object[] { s_invariantCompare, "x\u0600", "x", 1, 2, CompareOptions.None, 0, 1 }; - yield return new object[] { s_invariantCompare, "x\u0601", "x", 1, 2, CompareOptions.None, 0, 1 }; - yield return new object[] { s_invariantCompare, "\u0600x", "x", 1, 2, CompareOptions.None, 1, 1 }; - yield return new object[] { s_invariantCompare, "\u0601x", "x", 1, 2, CompareOptions.None, 1, 1 }; // Weightless characters // NLS matches weightless characters at the end of the string From a12e5c5310bcf3a3b3c3e1e5710f802438014299 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 16 Aug 2021 21:46:49 -0700 Subject: [PATCH 5/7] Fix Linux compile error --- .../Native/Unix/System.Globalization.Native/pal_collation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index d8c462ae237c5..3cea0ea89bb76 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -540,7 +540,7 @@ static UBreakIterator* CreateCustomizedBreakIterator() int32_t breakIteratorRulesLength = newRulesLength > oldRulesLength ? newRulesLength : oldRulesLength; - UChar* rules = (UChar*)malloc((breakIteratorRulesLength + 1) * sizeof(UChar)); + UChar* rules = (UChar*)malloc(((size_t)breakIteratorRulesLength + 1) * sizeof(UChar)); if (rules == NULL) { return NULL; From fc816ef31f1ac6efd29463057a13fc510a373fbc Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 17 Aug 2021 10:24:49 -0700 Subject: [PATCH 6/7] Address the feedback --- .../pal_collation.c | 21 +++++++------------ .../pal_icushim_internal.h | 4 ++-- .../pal_icushim_internal_android.h | 2 +- .../pal_timeZoneInfo.c | 3 ++- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index d8c462ae237c5..1d7cf97449a89 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -417,7 +417,7 @@ static int CanIgnoreAllCollationElements(const UCollator* pColl, const UChar* lp static void CreateSortHandle(SortHandle** ppSortHandle) { - *ppSortHandle = (SortHandle*)malloc(sizeof(SortHandle)); + *ppSortHandle = (SortHandle*)calloc(1, sizeof(SortHandle)); if ((*ppSortHandle) == NULL) { return; @@ -513,7 +513,6 @@ static const char* BreakIteratorRuleNew = // supported on newer ICU versions li ".;"; static UChar* s_breakIteratorRules = NULL; -static int32_t s_breakIteratorRulesLength = 0; // When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to // the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules. @@ -530,8 +529,7 @@ static UBreakIterator* CreateCustomizedBreakIterator() UErrorCode status = U_ZERO_ERROR; if (s_breakIteratorRules != NULL) { - assert(s_breakIteratorRulesLength > 0); - breaker = ubrk_openRules(s_breakIteratorRules, s_breakIteratorRulesLength, emptyString, 0, NULL, &status); + breaker = ubrk_openRules(s_breakIteratorRules, -1, emptyString, 0, NULL, &status); return U_FAILURE(status) ? NULL : breaker; } @@ -540,25 +538,22 @@ static UBreakIterator* CreateCustomizedBreakIterator() int32_t breakIteratorRulesLength = newRulesLength > oldRulesLength ? newRulesLength : oldRulesLength; - UChar* rules = (UChar*)malloc((breakIteratorRulesLength + 1) * sizeof(UChar)); + UChar* rules = (UChar*)calloc((breakIteratorRulesLength + 1), sizeof(UChar)); if (rules == NULL) { return NULL; } - u_uastrcpy(rules, BreakIteratorRuleNew); + u_uastrncpy(rules, BreakIteratorRuleNew, newRulesLength); + rules[newRulesLength] = '\0'; breaker = ubrk_openRules(rules, newRulesLength, emptyString, 0, NULL, &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; - u_uastrcpy(rules, BreakIteratorRuleOld); + u_uastrncpy(rules, BreakIteratorRuleOld, oldRulesLength); + rules[oldRulesLength] = '\0'; breaker = ubrk_openRules(rules, oldRulesLength, emptyString, 0, NULL, &status); - s_breakIteratorRulesLength = oldRulesLength; - } - else - { - s_breakIteratorRulesLength = newRulesLength; } if (U_FAILURE(status)) @@ -667,7 +662,7 @@ static const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32 // CreateNewSearchNode will create a new node in the linked list and mark this node search handle as borrowed handle. static inline int32_t CreateNewSearchNode(SortHandle* pSortHandle, int32_t options) { - SearchIteratorNode* node = (SearchIteratorNode*) malloc(sizeof(SearchIteratorNode)); + SearchIteratorNode* node = (SearchIteratorNode*)calloc(1, sizeof(SearchIteratorNode)); if (node == NULL) { return false; diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h index e6acb61c74289..92ded52d68724 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h @@ -72,7 +72,7 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len, PER_FUNCTION_BLOCK(u_strncpy, libicuuc, true) \ PER_FUNCTION_BLOCK(u_tolower, libicuuc, true) \ PER_FUNCTION_BLOCK(u_toupper, libicuuc, true) \ - PER_FUNCTION_BLOCK(u_uastrcpy, libicuuc, true) \ + PER_FUNCTION_BLOCK(u_uastrncpy, libicuuc, true) \ PER_FUNCTION_BLOCK(ubrk_close, libicui18n, true) \ PER_FUNCTION_BLOCK(ubrk_openRules, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_add, libicui18n, true) \ @@ -218,7 +218,7 @@ FOR_ALL_ICU_FUNCTIONS #define u_strncpy(...) u_strncpy_ptr(__VA_ARGS__) #define u_tolower(...) u_tolower_ptr(__VA_ARGS__) #define u_toupper(...) u_toupper_ptr(__VA_ARGS__) -#define u_uastrcpy(...) u_uastrcpy_ptr(__VA_ARGS__) +#define u_uastrncpy(...) u_uastrncpy_ptr(__VA_ARGS__) #define ubrk_close(...) ubrk_close_ptr(__VA_ARGS__) #define ubrk_openRules(...) ubrk_openRules_ptr(__VA_ARGS__) #define ucal_add(...) ucal_add_ptr(__VA_ARGS__) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h index a1b42cdcdadc8..1125ce91c3f80 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h @@ -441,7 +441,7 @@ UChar * u_strcpy(UChar * dst, const UChar * src); UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n); UChar32 u_tolower(UChar32 c); UChar32 u_toupper(UChar32 c); -UChar* u_uastrcpy(UChar * dst, const char * src); +UChar* u_uastrncpy(UChar * dst, const char * src, int32_t n); void ubrk_close(UBreakIterator * bi); UBreakIterator* ubrk_openRules(const UChar * rules, int32_t rulesLength, const UChar * text, int32_t textLength, UParseError * parseErr, UErrorCode * status); void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status); diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c index 2620c146b53a0..790fb3b3055b4 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "pal_errors_internal.h" #include "pal_locale_internal.h" @@ -223,7 +224,7 @@ static void FixupTimeZoneGenericDisplayName(const char* locale, const UChar* tim } // Make a UChar[] version of the test time zone id for use in the API calls. - u_uastrcpy(testTimeZoneId, testId); + u_uastrncpy(testTimeZoneId, testId, (int32_t)strlen(testId)); // Get the standard name from the test time zone. GetTimeZoneDisplayName_FromCalendar(locale, testTimeZoneId, timestamp, UCAL_STANDARD, testDisplayName, DISPLAY_NAME_LENGTH, err); From aa84853b03dda99ede20e05ad3e1b99fb6f4a788 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 17 Aug 2021 10:41:00 -0700 Subject: [PATCH 7/7] More feedback addressing --- .../Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c index 790fb3b3055b4..018746e972bc6 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c @@ -5,7 +5,6 @@ #include #include #include -#include #include "pal_errors_internal.h" #include "pal_locale_internal.h" @@ -224,7 +223,7 @@ static void FixupTimeZoneGenericDisplayName(const char* locale, const UChar* tim } // Make a UChar[] version of the test time zone id for use in the API calls. - u_uastrncpy(testTimeZoneId, testId, (int32_t)strlen(testId)); + u_uastrncpy(testTimeZoneId, testId, TZID_LENGTH); // Get the standard name from the test time zone. GetTimeZoneDisplayName_FromCalendar(locale, testTimeZoneId, timestamp, UCAL_STANDARD, testDisplayName, DISPLAY_NAME_LENGTH, err);