diff options
author | Victor Chang <vichang@google.com> | 2024-04-25 10:36:20 +0100 |
---|---|---|
committer | Victor Chang <vichang@google.com> | 2024-04-25 10:43:23 +0100 |
commit | 2d40414736f295d14f159284388b8c73f6c6ffaa (patch) | |
tree | 8ae187c87ea4410b9f831c6091f1e45553b21e74 | |
parent | 690a1de1dc8cee61c822c44c88f88fd6f66c4668 (diff) | |
download | icu-2d40414736f295d14f159284388b8c73f6c6ffaa.tar.gz |
Cherry-pick: ICU-22742 Fix handling of XA,XB,XC for addLikelySubtags
Upstream commit:
https://github.com/unicode-org/icu/pull/2977/commits/78502b61366731c5483e48d2e746742e5962632e
Upstream bug:
https://unicode-org.atlassian.net/browse/ICU-22742
Add more tests.
ICU-22742 Add PS... variants
ICU-22742 Add java tests
ICU-22742 extend tests
ICU-22742 wrap java
Bug: 331740612
Test: atest CtsIcuTestCases CtsIcu4cTestCases
Change-Id: I91546706f5918ac353c0bf779ba6f16c0a0a1c5d
8 files changed, 502 insertions, 151 deletions
diff --git a/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java b/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java index ffc3a7413..6c4b699c9 100644 --- a/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java +++ b/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java @@ -220,49 +220,42 @@ public final class LikelySubtags { // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK. // They should match only themselves, // not other locales with what looks like the same language and script subtags. - if (region.length() == 2 && region.charAt(0) == 'X') { - switch (region.charAt(1)) { - case 'A': - if (returnInputIfUnmatch) { - return new LSR(language, script, region, LSR.EXPLICIT_LSR); - } - return new LSR(PSEUDO_ACCENTS_PREFIX + language, - PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR); - case 'B': - if (returnInputIfUnmatch) { - return new LSR(language, script, region, LSR.EXPLICIT_LSR); - } - return new LSR(PSEUDO_BIDI_PREFIX + language, - PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR); - case 'C': - if (returnInputIfUnmatch) { - return new LSR(language, script, region, LSR.EXPLICIT_LSR); + if (!returnInputIfUnmatch) { + if (region.length() == 2 && region.charAt(0) == 'X') { + switch (region.charAt(1)) { + case 'A': + return new LSR(PSEUDO_ACCENTS_PREFIX + language, + PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR); + case 'B': + return new LSR(PSEUDO_BIDI_PREFIX + language, + PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR); + case 'C': + return new LSR(PSEUDO_CRACKED_PREFIX + language, + PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR); + default: // normal locale + break; } - return new LSR(PSEUDO_CRACKED_PREFIX + language, - PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR); - default: // normal locale - break; } - } - if (variant.startsWith("PS")) { - int lsrFlags = region.isEmpty() ? - LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR; - switch (variant) { - case "PSACCENT": - return new LSR(PSEUDO_ACCENTS_PREFIX + language, - PSEUDO_ACCENTS_PREFIX + script, - region.isEmpty() ? "XA" : region, lsrFlags); - case "PSBIDI": - return new LSR(PSEUDO_BIDI_PREFIX + language, - PSEUDO_BIDI_PREFIX + script, - region.isEmpty() ? "XB" : region, lsrFlags); - case "PSCRACK": - return new LSR(PSEUDO_CRACKED_PREFIX + language, - PSEUDO_CRACKED_PREFIX + script, - region.isEmpty() ? "XC" : region, lsrFlags); - default: // normal locale - break; + if (variant.startsWith("PS")) { + int lsrFlags = region.isEmpty() ? + LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR; + switch (variant) { + case "PSACCENT": + return new LSR(PSEUDO_ACCENTS_PREFIX + language, + PSEUDO_ACCENTS_PREFIX + script, + region.isEmpty() ? "XA" : region, lsrFlags); + case "PSBIDI": + return new LSR(PSEUDO_BIDI_PREFIX + language, + PSEUDO_BIDI_PREFIX + script, + region.isEmpty() ? "XB" : region, lsrFlags); + case "PSCRACK": + return new LSR(PSEUDO_CRACKED_PREFIX + language, + PSEUDO_CRACKED_PREFIX + script, + region.isEmpty() ? "XC" : region, lsrFlags); + default: // normal locale + break; + } } } diff --git a/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java b/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java index a8fb04a62..be223f7c8 100644 --- a/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java +++ b/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java @@ -1975,17 +1975,36 @@ public class ULocaleTest extends CoreTestFmwk { "zh_Hani", "zh_Hani_CN" // If change, please also update ULocale.java }, { - // ICU-22545 - "en_XA", + // ICU-22545 & ICU-22742 "en_XA", + "en_Latn_XA", + }, { + // ICU-22545 & ICU-22742 + "ar_XB", + "ar_Arab_XB", + }, { + // ICU-22545 & ICU-22742 + "ru_XC", + "ru_Cyrl_XC", + }, { + // ICU-22742 + "en_PSACCENT", + "en_Latn_US_PSACCENT", + }, { + "ar_PSBIDI", + "ar_Arab_EG_PSBIDI", + }, { + "ru_PSCRACK", + "ru_Cyrl_RU_PSCRACK", }, { - // ICU-22545 - "en_XB", - "en_XB", + "ar_PSACCENT", + "ar_Arab_EG_PSACCENT", }, { - // ICU-22545 - "en_XC", - "en_XC", + "ru_PSBIDI", + "ru_Cyrl_RU_PSBIDI", + }, { + "en_PSCRACK", + "en_Latn_US_PSCRACK", } }; @@ -5595,6 +5614,103 @@ public class ULocaleTest extends CoreTestFmwk { return tests; } + // ICU-22742, test addLikelySubtags with pseudo-locales + @Test + public void TestPseudoLocales() { + // input locale tag, expected locale tag + String[][] testCases = new String[][] { + // language + region, en + { "en-XA", "en-Latn-XA" }, + { "en-XB", "en-Latn-XB" }, + { "en-XC", "en-Latn-XC" }, + + // language + region, ar + { "ar-XA", "ar-Arab-XA" }, + { "ar-XB", "ar-Arab-XB" }, + { "ar-XC", "ar-Arab-XC" }, + + // language + region, something other than en, ar + { "ru-XA", "ru-Cyrl-XA" }, + { "el-XB", "el-Grek-XB" }, + + // undefined language - region + { "und-XA", "en-Latn-XA" }, + { "und-XB", "en-Latn-XB" }, + { "und-XC", "en-Latn-XC" }, + + // language + script + region + { "und-Latn-XA", "en-Latn-XA" }, + { "und-Latn-XB", "en-Latn-XB" }, + { "und-Latn-XC", "en-Latn-XC" }, + { "und-Arab-XA", "ar-Arab-XA" }, + { "und-Arab-XB", "ar-Arab-XB" }, + { "und-Arab-XC", "ar-Arab-XC" }, + { "und-Cyrl-XA", "ru-Cyrl-XA" }, + { "und-Grek-XB", "el-Grek-XB" }, + + // Make sure the script is not damaged, when correct + { "ru-Cyrl-XA", "ru-Cyrl-XA" }, + { "el-Grek-XB", "el-Grek-XB" }, + + // Make sure the script is not damaged, even if it is wrong + { "ru-Grek-XA", "ru-Grek-XA" }, + { "el-Cyrl-XB", "el-Cyrl-XB" }, + + // PS Variants + { "en-XA-PSACCENT", "en-Latn-XA-psaccent" }, + { "en-XA-PSBIDI", "en-Latn-XA-psbidi" }, + { "en-XA-PSCRACK", "en-Latn-XA-pscrack" }, + { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" }, + { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" }, + { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" }, + { "en-XC-PSACCENT", "en-Latn-XC-psaccent" }, + { "en-XC-PSBIDI", "en-Latn-XC-psbidi" }, + { "en-XC-PSCRACK", "en-Latn-XC-pscrack" }, + + { "en-US-PSACCENT", "en-Latn-US-psaccent" }, + { "en-US-PSBIDI", "en-Latn-US-psbidi" }, + { "en-US-PSCRACK", "en-Latn-US-pscrack" }, + { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" }, + { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" }, + { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "en-PSACCENT", "en-Latn-US-psaccent" }, + { "en-PSBIDI", "en-Latn-US-psbidi" }, + { "en-PSCRACK", "en-Latn-US-pscrack" }, + { "ar-PSACCENT", "ar-Arab-EG-psaccent" }, + { "ar-PSBIDI", "ar-Arab-EG-psbidi" }, + { "ar-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "und-US-PSACCENT", "en-Latn-US-psaccent" }, + { "und-US-PSBIDI", "en-Latn-US-psbidi" }, + { "und-US-PSCRACK", "en-Latn-US-pscrack" }, + { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" }, + { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" }, + { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "und-PSACCENT", "en-Latn-US-psaccent" }, + { "und-PSBIDI", "en-Latn-US-psbidi" }, + { "und-PSCRACK", "en-Latn-US-pscrack" }, + { "und-PSACCENT", "en-Latn-US-psaccent" }, + { "und-PSBIDI", "en-Latn-US-psbidi" }, + { "und-PSCRACK", "en-Latn-US-pscrack" }, + }; + String extensions = "-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-something-more"; + + for (String[] testCase : testCases) { + String inputTag = testCase[0]; + String expectedTag = testCase[1]; + ULocale result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag)); + ULocale expected = ULocale.forLanguageTag(expectedTag); + assertEquals("pseudo-locales(" + inputTag + ")", expected, result); + + // Make sure this also works with extensions. Kind of hacky... + result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag + extensions)); + expected = ULocale.forLanguageTag(expectedTag + extensions); + assertEquals("pseudo-locales(" + inputTag + ")", expected, result); + } + } + @Test @Parameters(method = "readLikelySubtagsTestCases") public void likelySubtagsDataDriven(TestCase test) { diff --git a/icu4c/source/common/loclikelysubtags.cpp b/icu4c/source/common/loclikelysubtags.cpp index c18219105..a750bb7b1 100644 --- a/icu4c/source/common/loclikelysubtags.cpp +++ b/icu4c/source/common/loclikelysubtags.cpp @@ -564,47 +564,40 @@ LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, co // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK. // They should match only themselves, // not other locales with what looks like the same language and script subtags. - char c1; - if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) { - switch (c1) { - case 'A': - if (returnInputIfUnmatch) { - return LSR(language, script, region, LSR::EXPLICIT_LSR); - } - return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, - LSR::EXPLICIT_LSR, errorCode); - case 'B': - if (returnInputIfUnmatch) { - return LSR(language, script, region, LSR::EXPLICIT_LSR); - } - return LSR(PSEUDO_BIDI_PREFIX, language, script, region, - LSR::EXPLICIT_LSR, errorCode); - case 'C': - if (returnInputIfUnmatch) { - return LSR(language, script, region, LSR::EXPLICIT_LSR); + if (!returnInputIfUnmatch) { + char c1; + if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) { + switch (c1) { + case 'A': + return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, + LSR::EXPLICIT_LSR, errorCode); + case 'B': + return LSR(PSEUDO_BIDI_PREFIX, language, script, region, + LSR::EXPLICIT_LSR, errorCode); + case 'C': + return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, + LSR::EXPLICIT_LSR, errorCode); + default: // normal locale + break; } - return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, - LSR::EXPLICIT_LSR, errorCode); - default: // normal locale - break; } - } - if (variant[0] == 'P' && variant[1] == 'S') { - int32_t lsrFlags = *region == 0 ? - LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR; - if (uprv_strcmp(variant, "PSACCENT") == 0) { - return LSR(PSEUDO_ACCENTS_PREFIX, language, script, - *region == 0 ? "XA" : region, lsrFlags, errorCode); - } else if (uprv_strcmp(variant, "PSBIDI") == 0) { - return LSR(PSEUDO_BIDI_PREFIX, language, script, - *region == 0 ? "XB" : region, lsrFlags, errorCode); - } else if (uprv_strcmp(variant, "PSCRACK") == 0) { - return LSR(PSEUDO_CRACKED_PREFIX, language, script, - *region == 0 ? "XC" : region, lsrFlags, errorCode); + if (variant[0] == 'P' && variant[1] == 'S') { + int32_t lsrFlags = *region == 0 ? + LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR; + if (uprv_strcmp(variant, "PSACCENT") == 0) { + return LSR(PSEUDO_ACCENTS_PREFIX, language, script, + *region == 0 ? "XA" : region, lsrFlags, errorCode); + } else if (uprv_strcmp(variant, "PSBIDI") == 0) { + return LSR(PSEUDO_BIDI_PREFIX, language, script, + *region == 0 ? "XB" : region, lsrFlags, errorCode); + } else if (uprv_strcmp(variant, "PSCRACK") == 0) { + return LSR(PSEUDO_CRACKED_PREFIX, language, script, + *region == 0 ? "XC" : region, lsrFlags, errorCode); + } + // else normal locale } - // else normal locale - } + } // end of if (!returnInputIfUnmatch) language = getCanonical(languageAliases, language); // (We have no script mappings.) diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 04efd5f37..5815b14f2 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -3920,17 +3920,17 @@ const char* const basic_maximize_data[][2] = { "zh_Hani", "zh_Hani_CN" // If change, please also update common/unicode/uloc.h }, { - // ICU-22545 + // ICU-22545 & ICU-22742 "en_XA", - "en_XA" + "en_Latn_XA" }, { - // ICU-22545 - "en_XB", - "en_XB" + // ICU-22545 & ICU-22742 + "ar_XB", + "ar_Arab_XB" }, { - // ICU-22545 - "en_XC", - "en_XC" + // ICU-22545 & ICU-22742 + "ru_XC", + "ru_Cyrl_XC" } }; diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index f39fb1ad5..dc0947bfd 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -233,6 +233,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c #endif TESTCASE_AUTO(TestSetIsBogus); TESTCASE_AUTO(TestParallelAPIValues); + TESTCASE_AUTO(TestPseudoLocales); TESTCASE_AUTO(TestAddLikelySubtags); TESTCASE_AUTO(TestMinimizeSubtags); TESTCASE_AUTO(TestAddLikelyAndMinimizeSubtags); @@ -1741,6 +1742,119 @@ LocaleTest::TestSetIsBogus() { } +void LocaleTest::TestPseudoLocales() { + // input locale tag, expected locale tag + static const struct { + const char* const input; + const char* const expected; + } test_cases[] = { + // language + region, en + { "en-XA", "en-Latn-XA" }, + { "en-XB", "en-Latn-XB" }, + { "en-XC", "en-Latn-XC" }, + + // language + region, ar + { "ar-XA", "ar-Arab-XA" }, + { "ar-XB", "ar-Arab-XB" }, + { "ar-XC", "ar-Arab-XC" }, + + // language + region, something other than en, ar + { "ru-XA", "ru-Cyrl-XA" }, + { "el-XB", "el-Grek-XB" }, + + // undefined language - region + { "und-XA", "en-Latn-XA" }, + { "und-XB", "en-Latn-XB" }, + { "und-XC", "en-Latn-XC" }, + + // language + script + region + { "und-Latn-XA", "en-Latn-XA" }, + { "und-Latn-XB", "en-Latn-XB" }, + { "und-Latn-XC", "en-Latn-XC" }, + { "und-Arab-XA", "ar-Arab-XA" }, + { "und-Arab-XB", "ar-Arab-XB" }, + { "und-Arab-XC", "ar-Arab-XC" }, + { "und-Cyrl-XA", "ru-Cyrl-XA" }, + { "und-Grek-XB", "el-Grek-XB" }, + + // Make sure the script is not damaged, when correct + { "ru-Cyrl-XA", "ru-Cyrl-XA" }, + { "el-Grek-XB", "el-Grek-XB" }, + + // Make sure the script is not damaged, even if it is wrong + { "ru-Grek-XA", "ru-Grek-XA" }, + { "el-Cyrl-XB", "el-Cyrl-XB" }, + + // PS Variants + { "en-XA-PSACCENT", "en-Latn-XA-psaccent" }, + { "en-XA-PSBIDI", "en-Latn-XA-psbidi" }, + { "en-XA-PSCRACK", "en-Latn-XA-pscrack" }, + { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" }, + { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" }, + { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" }, + { "en-XC-PSACCENT", "en-Latn-XC-psaccent" }, + { "en-XC-PSBIDI", "en-Latn-XC-psbidi" }, + { "en-XC-PSCRACK", "en-Latn-XC-pscrack" }, + + { "en-US-PSACCENT", "en-Latn-US-psaccent" }, + { "en-US-PSBIDI", "en-Latn-US-psbidi" }, + { "en-US-PSCRACK", "en-Latn-US-pscrack" }, + { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" }, + { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" }, + { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "en-PSACCENT", "en-Latn-US-psaccent" }, + { "en-PSBIDI", "en-Latn-US-psbidi" }, + { "en-PSCRACK", "en-Latn-US-pscrack" }, + { "ar-PSACCENT", "ar-Arab-EG-psaccent" }, + { "ar-PSBIDI", "ar-Arab-EG-psbidi" }, + { "ar-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "und-US-PSACCENT", "en-Latn-US-psaccent" }, + { "und-US-PSBIDI", "en-Latn-US-psbidi" }, + { "und-US-PSCRACK", "en-Latn-US-pscrack" }, + { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" }, + { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" }, + { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "und-PSACCENT", "en-Latn-US-psaccent" }, + { "und-PSBIDI", "en-Latn-US-psbidi" }, + { "und-PSCRACK", "en-Latn-US-pscrack" }, + { "und-PSACCENT", "en-Latn-US-psaccent" }, + { "und-PSBIDI", "en-Latn-US-psbidi" }, + { "und-PSCRACK", "en-Latn-US-pscrack" }, + }; + + std::string extensions("-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-somethin-more"); + + IcuTestErrorCode status(*this, "TestPseudoLocales()"); + for (const auto& item : test_cases) { + const char* const inputTag = item.input; + const char* const expectedTag = item.expected; + Locale result = Locale::forLanguageTag(inputTag, status); + result.addLikelySubtags(status); + status.errIfFailureAndReset("\"%s\"", inputTag); + Locale expected = Locale::forLanguageTag(expectedTag, status); + status.errIfFailureAndReset("\"%s\"", expectedTag); + assertEquals(inputTag, expected.getName(), result.getName()); + + // Test extension + std::string extendedTag(inputTag); + extendedTag.append(extensions); + + result = Locale::forLanguageTag(extendedTag, status); + result.addLikelySubtags(status); + status.errIfFailureAndReset(extendedTag.c_str()); + + std::string expectedExtendedTag(expectedTag); + expectedExtendedTag.append(extensions); + + expected = Locale::forLanguageTag(expectedExtendedTag, status); + status.errIfFailureAndReset(expectedExtendedTag.c_str()); + assertEquals(extendedTag.c_str(), expected.getName(), result.getName()); + } +} + void LocaleTest::TestAddLikelySubtags() { IcuTestErrorCode status(*this, "TestAddLikelySubtags()"); @@ -3974,20 +4088,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() { "zh_Hani_CN", // If change, please also update common/unicode/locid.h "zh_Hani" }, { - // ICU-22545 - "en_XA", + // ICU-22545 & ICU-22742 "en_XA", + "en_Latn_XA", "en_XA", }, { - // ICU-22545 - "en_XB", - "en_XB", - "en_XB", + // ICU-22545 & ICU-22742 + "ar_XB", + "ar_Arab_XB", + "ar_XB", + }, { + // ICU-22545 & ICU-22742 + "ru_XC", + "ru_Cyrl_XC", + "ru_XC", + }, { + // ICU-22742 + "en_PSACCENT", + "en_Latn_US_PSACCENT", + "en__PSACCENT" + }, { + "ar_PSBIDI", + "ar_Arab_EG_PSBIDI", + "ar__PSBIDI" + }, { + "ru_PSCRACK", + "ru_Cyrl_RU_PSCRACK", + "ru__PSCRACK" + }, { + "ar_PSACCENT", + "ar_Arab_EG_PSACCENT", + "ar__PSACCENT" + }, { + "ru_PSBIDI", + "ru_Cyrl_RU_PSBIDI", + "ru__PSBIDI" }, { - // ICU-22545 - "en_XC", - "en_XC", - "en_XC", + "en_PSCRACK", + "en_Latn_US_PSCRACK", + "en__PSCRACK" } }; diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h index cb79b456d..b3410242e 100644 --- a/icu4c/source/test/intltest/loctest.h +++ b/icu4c/source/test/intltest/loctest.h @@ -129,6 +129,7 @@ public: void TestKnownCanonicalizedListCorrect(); void TestConstructorAcceptsBCP47(); + void TestPseudoLocales(); void TestAddLikelySubtags(); void TestMinimizeSubtags(); void TestAddLikelyAndMinimizeSubtags(); diff --git a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java index 39b981370..3361782ec 100644 --- a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -1972,17 +1972,36 @@ public class ULocaleTest extends CoreTestFmwk { "zh_Hani", "zh_Hani_CN" // If change, please also update ULocale.java }, { - // ICU-22545 - "en_XA", + // ICU-22545 & ICU-22742 "en_XA", + "en_Latn_XA", + }, { + // ICU-22545 & ICU-22742 + "ar_XB", + "ar_Arab_XB", + }, { + // ICU-22545 & ICU-22742 + "ru_XC", + "ru_Cyrl_XC", + }, { + // ICU-22742 + "en_PSACCENT", + "en_Latn_US_PSACCENT", + }, { + "ar_PSBIDI", + "ar_Arab_EG_PSBIDI", + }, { + "ru_PSCRACK", + "ru_Cyrl_RU_PSCRACK", }, { - // ICU-22545 - "en_XB", - "en_XB", + "ar_PSACCENT", + "ar_Arab_EG_PSACCENT", }, { - // ICU-22545 - "en_XC", - "en_XC", + "ru_PSBIDI", + "ru_Cyrl_RU_PSBIDI", + }, { + "en_PSCRACK", + "en_Latn_US_PSCRACK", } }; @@ -5592,6 +5611,103 @@ public class ULocaleTest extends CoreTestFmwk { return tests; } + // ICU-22742, test addLikelySubtags with pseudo-locales + @Test + public void TestPseudoLocales() { + // input locale tag, expected locale tag + String[][] testCases = new String[][] { + // language + region, en + { "en-XA", "en-Latn-XA" }, + { "en-XB", "en-Latn-XB" }, + { "en-XC", "en-Latn-XC" }, + + // language + region, ar + { "ar-XA", "ar-Arab-XA" }, + { "ar-XB", "ar-Arab-XB" }, + { "ar-XC", "ar-Arab-XC" }, + + // language + region, something other than en, ar + { "ru-XA", "ru-Cyrl-XA" }, + { "el-XB", "el-Grek-XB" }, + + // undefined language - region + { "und-XA", "en-Latn-XA" }, + { "und-XB", "en-Latn-XB" }, + { "und-XC", "en-Latn-XC" }, + + // language + script + region + { "und-Latn-XA", "en-Latn-XA" }, + { "und-Latn-XB", "en-Latn-XB" }, + { "und-Latn-XC", "en-Latn-XC" }, + { "und-Arab-XA", "ar-Arab-XA" }, + { "und-Arab-XB", "ar-Arab-XB" }, + { "und-Arab-XC", "ar-Arab-XC" }, + { "und-Cyrl-XA", "ru-Cyrl-XA" }, + { "und-Grek-XB", "el-Grek-XB" }, + + // Make sure the script is not damaged, when correct + { "ru-Cyrl-XA", "ru-Cyrl-XA" }, + { "el-Grek-XB", "el-Grek-XB" }, + + // Make sure the script is not damaged, even if it is wrong + { "ru-Grek-XA", "ru-Grek-XA" }, + { "el-Cyrl-XB", "el-Cyrl-XB" }, + + // PS Variants + { "en-XA-PSACCENT", "en-Latn-XA-psaccent" }, + { "en-XA-PSBIDI", "en-Latn-XA-psbidi" }, + { "en-XA-PSCRACK", "en-Latn-XA-pscrack" }, + { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" }, + { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" }, + { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" }, + { "en-XC-PSACCENT", "en-Latn-XC-psaccent" }, + { "en-XC-PSBIDI", "en-Latn-XC-psbidi" }, + { "en-XC-PSCRACK", "en-Latn-XC-pscrack" }, + + { "en-US-PSACCENT", "en-Latn-US-psaccent" }, + { "en-US-PSBIDI", "en-Latn-US-psbidi" }, + { "en-US-PSCRACK", "en-Latn-US-pscrack" }, + { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" }, + { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" }, + { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "en-PSACCENT", "en-Latn-US-psaccent" }, + { "en-PSBIDI", "en-Latn-US-psbidi" }, + { "en-PSCRACK", "en-Latn-US-pscrack" }, + { "ar-PSACCENT", "ar-Arab-EG-psaccent" }, + { "ar-PSBIDI", "ar-Arab-EG-psbidi" }, + { "ar-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "und-US-PSACCENT", "en-Latn-US-psaccent" }, + { "und-US-PSBIDI", "en-Latn-US-psbidi" }, + { "und-US-PSCRACK", "en-Latn-US-pscrack" }, + { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" }, + { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" }, + { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" }, + + { "und-PSACCENT", "en-Latn-US-psaccent" }, + { "und-PSBIDI", "en-Latn-US-psbidi" }, + { "und-PSCRACK", "en-Latn-US-pscrack" }, + { "und-PSACCENT", "en-Latn-US-psaccent" }, + { "und-PSBIDI", "en-Latn-US-psbidi" }, + { "und-PSCRACK", "en-Latn-US-pscrack" }, + }; + String extensions = "-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-something-more"; + + for (String[] testCase : testCases) { + String inputTag = testCase[0]; + String expectedTag = testCase[1]; + ULocale result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag)); + ULocale expected = ULocale.forLanguageTag(expectedTag); + assertEquals("pseudo-locales(" + inputTag + ")", expected, result); + + // Make sure this also works with extensions. Kind of hacky... + result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag + extensions)); + expected = ULocale.forLanguageTag(expectedTag + extensions); + assertEquals("pseudo-locales(" + inputTag + ")", expected, result); + } + } + @Test @Parameters(method = "readLikelySubtagsTestCases") public void likelySubtagsDataDriven(TestCase test) { diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java b/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java index 6d5a35379..2a42c60c4 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java @@ -213,49 +213,42 @@ public final class LikelySubtags { // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK. // They should match only themselves, // not other locales with what looks like the same language and script subtags. - if (region.length() == 2 && region.charAt(0) == 'X') { - switch (region.charAt(1)) { - case 'A': - if (returnInputIfUnmatch) { - return new LSR(language, script, region, LSR.EXPLICIT_LSR); - } - return new LSR(PSEUDO_ACCENTS_PREFIX + language, - PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR); - case 'B': - if (returnInputIfUnmatch) { - return new LSR(language, script, region, LSR.EXPLICIT_LSR); - } - return new LSR(PSEUDO_BIDI_PREFIX + language, - PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR); - case 'C': - if (returnInputIfUnmatch) { - return new LSR(language, script, region, LSR.EXPLICIT_LSR); + if (!returnInputIfUnmatch) { + if (region.length() == 2 && region.charAt(0) == 'X') { + switch (region.charAt(1)) { + case 'A': + return new LSR(PSEUDO_ACCENTS_PREFIX + language, + PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR); + case 'B': + return new LSR(PSEUDO_BIDI_PREFIX + language, + PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR); + case 'C': + return new LSR(PSEUDO_CRACKED_PREFIX + language, + PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR); + default: // normal locale + break; } - return new LSR(PSEUDO_CRACKED_PREFIX + language, - PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR); - default: // normal locale - break; } - } - if (variant.startsWith("PS")) { - int lsrFlags = region.isEmpty() ? - LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR; - switch (variant) { - case "PSACCENT": - return new LSR(PSEUDO_ACCENTS_PREFIX + language, - PSEUDO_ACCENTS_PREFIX + script, - region.isEmpty() ? "XA" : region, lsrFlags); - case "PSBIDI": - return new LSR(PSEUDO_BIDI_PREFIX + language, - PSEUDO_BIDI_PREFIX + script, - region.isEmpty() ? "XB" : region, lsrFlags); - case "PSCRACK": - return new LSR(PSEUDO_CRACKED_PREFIX + language, - PSEUDO_CRACKED_PREFIX + script, - region.isEmpty() ? "XC" : region, lsrFlags); - default: // normal locale - break; + if (variant.startsWith("PS")) { + int lsrFlags = region.isEmpty() ? + LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR; + switch (variant) { + case "PSACCENT": + return new LSR(PSEUDO_ACCENTS_PREFIX + language, + PSEUDO_ACCENTS_PREFIX + script, + region.isEmpty() ? "XA" : region, lsrFlags); + case "PSBIDI": + return new LSR(PSEUDO_BIDI_PREFIX + language, + PSEUDO_BIDI_PREFIX + script, + region.isEmpty() ? "XB" : region, lsrFlags); + case "PSCRACK": + return new LSR(PSEUDO_CRACKED_PREFIX + language, + PSEUDO_CRACKED_PREFIX + script, + region.isEmpty() ? "XC" : region, lsrFlags); + default: // normal locale + break; + } } } |