12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142 |
- /*
- * Copyright 2005 - 2016 Zarafa and its licensors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- */
- /**
- @file
- Unicode String Utilities
- @defgroup ustringutil Unicode String Utilities
- @{
- The Unicode String Utilities provide some common string utilities aimed to be compliant with
- all (or at least most) of the Unicode quirks.
- The provided functions are:
- - str_equals, wcs_equals, u8_equals: Check if two strings are equal.
- - str_iequals, wcs_iequals, u8_iequals: Check if two strings are equal ignoring case.
- - str_startswith, wcs_startswith, u8_startswith: Check if one string starts with another.
- - str_istartswith, wcs_istartswith, u8_istartswith: Check if one string starts with another ignoring case.
- - str_icompare, wcs_icompare, u8_icompare: Compare two strings ignoring case.
- - str_contains, wcs_contains, u8_contains: Check if one string contains the other.
- - str_icontains, wcs_icontains, u8_icontains: Check if one string contains the other ignoring case.
- @par Normalization
- In order to compare Unicode strings, the data needs to be normalized first. This is needed because Unicode allows
- different binary representations of the same data. The functions provided in this module make no assumptions about
- the provided data and will always perform a normalization before doing a comparison.
- @par Case mapping
- The case insensitive functions need a way to match code points regardless of their case. ICU provides a few methods for
- this, but they use a method called case-folding to avoid the need for a locale (changing case is dependent on a locale).
- Since case-folding doesn't take a locale, it's a best guess method, which will produce wrong results in certain situations.
- The functions in this library apply a method called case-mapping, which basically means we perform a to-upper on all
- code-points with a provided locale.
- @par Collation
- The functions that try to match (sub)strings, have no interest in the order in which strings would appear if they would be
- sorted. However, the compare functions do produce a result that could be used for sorting. Since sorting is dependent on a
- locale as well, they would need a locale. However, ICU provides a Collator class that performs the actual comparison for a
- particular locale. Since we don't want to construct a Collator class for every string comparison, the string comparison
- functions take a Collator object as argument. This way the caller can reuse the Collator.
- @par Performance
- Performance of the current (21-05-2010) implementation is probably pretty bad. This is caused by all the conversions that are
- performed on the complete strings before the actual comparison is even started.
- At some point we need to rewrite these functions to do all the conversion on the fly to minimize processing.
- */
- #include "config.h"
- #include <kopano/platform.h>
- #include <kopano/ustringutil.h>
- #include <kopano/CommonUtil.h>
- #include "utf8/unchecked.h"
- #include <cassert>
- #include <memory>
- #include <unicode/unorm.h>
- #include <unicode/coll.h>
- #include <unicode/tblcoll.h>
- #include <unicode/coleitr.h>
- #include <unicode/normlzr.h>
- #include <unicode/ustring.h>
- #include "ustringutil/utfutil.h"
- typedef std::unique_ptr<Collator> unique_ptr_Collator;
- namespace KC {
- /**
- * US-ASCII version to find a case-insensitive string part in a
- * haystack.
- *
- * @param haystack search this haystack for a case-insensitive needle
- * @param needle search this needle in the case-insensitive haystack
- *
- * @return pointer where needle is found or NULL
- */
- const char* str_ifind(const char *haystack, const char *needle)
- {
- locale_t loc = createlocale(LC_CTYPE, "C");
- const char *needlepos = needle;
- const char *needlestart = haystack;
- while(*haystack) {
- if (toupper_l(*haystack, loc) == toupper_l(*needlepos, loc)) {
- ++needlepos;
- if(*needlepos == 0)
- goto exit;
- } else {
- haystack = needlestart++;
- needlepos = needle;
- }
- ++haystack;
- }
- needlestart = NULL;
- exit:
- freelocale(loc);
- return needlestart;
- }
- /**
- * Check if two strings are canonical equivalent.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool str_equals(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = StringToUnicode(s1);
- UnicodeString b = StringToUnicode(s2);
- return a.compare(b) == 0;
- }
- /**
- * Check if two strings are canonical equivalent when ignoring the case.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to convert the case of the strings.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool str_iequals(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = StringToUnicode(s1);
- UnicodeString b = StringToUnicode(s2);
- return a.caseCompare(b, 0) == 0;
- }
- /**
- * Check if the string s1 starts with s2.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool str_startswith(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = StringToUnicode(s1);
- UnicodeString b = StringToUnicode(s2);
- return a.compare(0, b.length(), b) == 0;
- }
- /**
- * Check if the string s1 starts with s2 when ignoring the case.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to convert the case of the strings.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool str_istartswith(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = StringToUnicode(s1);
- UnicodeString b = StringToUnicode(s2);
- return a.caseCompare(0, b.length(), b, 0) == 0;
- }
- /**
- * Compare two strings using the collator to determine the sort order.
- *
- * Both strings are expectes to be in the current locale. The comparison is
- * case insensitive. Effectively this only changes behavior compared to strcmp_unicode
- * if the two strings are the same if the case is discarded. It doesn't effect the
- * sorting in any other way.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] collator The collator used to determine which string precedes the other.
- *
- * @return An integer.
- * @retval -1 s1 is smaller than s2
- * @retval 0 s1 equals s2.
- * @retval 1 s1 is greater than s2
- */
- int str_icompare(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UErrorCode status = U_ZERO_ERROR;
- unique_ptr_Collator ptrCollator(Collator::createInstance(locale, status));
- UnicodeString a = StringToUnicode(s1);
- UnicodeString b = StringToUnicode(s2);
- a.foldCase();
- b.foldCase();
- return ptrCollator->compare(a,b,status);
- }
- /**
- * Find a string in another string.
- *
- * @param[in] haystack The string to search in
- * @param[in] needle The string to search for.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The needle was found
- * @retval false The needle wasn't found
- *
- * @note This function behaves different than strstr in that it returns a
- * a boolean instead of a pointer to the found substring. This is
- * because we search on a transformed string. Getting the correct
- * pointer would involve additional processing while we don't need
- * the result anyway.
- */
- bool str_contains(const char *haystack, const char *needle, const ECLocale &locale)
- {
- assert(haystack);
- assert(needle);
- UnicodeString a = StringToUnicode(haystack);
- UnicodeString b = StringToUnicode(needle);
- return u_strstr(a.getTerminatedBuffer(), b.getTerminatedBuffer());
- }
- /**
- * Find a string in another string while ignoreing case.
- *
- * @param[in] haystack The string to search in
- * @param[in] needle The string to search for.
- * @param[in] locale The locale used to convert the case of the strings.
- *
- * @return boolean
- * @retval true The needle was found
- * @retval false The needle wasn't found
- */
- bool str_icontains(const char *haystack, const char *needle, const ECLocale &locale)
- {
- assert(haystack);
- assert(needle);
- UnicodeString a = StringToUnicode(haystack);
- UnicodeString b = StringToUnicode(needle);
- a.foldCase();
- b.foldCase();
- return u_strstr(a.getTerminatedBuffer(), b.getTerminatedBuffer());
- }
- /**
- * Check if two strings are canonical equivalent.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool wcs_equals(const wchar_t *s1, const wchar_t *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = WCHARToUnicode(s1);
- UnicodeString b = WCHARToUnicode(s2);
- return a.compare(b) == 0;
- }
- /**
- * Check if two strings are canonical equivalent when ignoring the case.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to convert the case of the strings.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool wcs_iequals(const wchar_t *s1, const wchar_t *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = WCHARToUnicode(s1);
- UnicodeString b = WCHARToUnicode(s2);
- return a.caseCompare(b, 0) == 0;
- }
- /**
- * Check if s1 starts with s2.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool wcs_startswith(const wchar_t *s1, const wchar_t *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = WCHARToUnicode(s1);
- UnicodeString b = WCHARToUnicode(s2);
- return a.compare(0, b.length(), b) == 0;
- }
- /**
- * Check if s1 starts with s2 when ignoring the case.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to convert the case of the strings.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool wcs_istartswith(const wchar_t *s1, const wchar_t *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = WCHARToUnicode(s1);
- UnicodeString b = WCHARToUnicode(s2);
- return a.caseCompare(0, b.length(), b, 0) == 0;
- }
- /**
- * Compare two strings using the collator to determine the sort order.
- *
- * Both strings are expectes to be in the current locale. The comparison is
- * case insensitive. Effectively this only changes behavior compared to strcmp_unicode
- * if the two strings are the same if the case is discarded. It doesn't effect the
- * sorting in any other way.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] collator The collator used to determine which string precedes the other.
- *
- * @return An integer.
- * @retval -1 s1 is smaller than s2
- * @retval 0 s1 equals s2.
- * @retval 1 s1 is greater than s2
- */
- int wcs_icompare(const wchar_t *s1, const wchar_t *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UErrorCode status = U_ZERO_ERROR;
- unique_ptr_Collator ptrCollator(Collator::createInstance(locale, status));
- UnicodeString a = WCHARToUnicode(s1);
- UnicodeString b = WCHARToUnicode(s2);
- a.foldCase();
- b.foldCase();
- return ptrCollator->compare(a,b,status);
- }
- /**
- * Find a string in another string.
- *
- * @param[in] haystack The string to search in
- * @param[in] needle The string to search for.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The needle was found
- * @retval false The needle wasn't found
- *
- * @note This function behaves different than strstr in that it returns a
- * a boolean instead of a pointer to the found substring. This is
- * because we search on a transformed string. Getting the correct
- * pointer would involve additional processing while we don't need
- * the result anyway.
- */
- bool wcs_contains(const wchar_t *haystack, const wchar_t *needle, const ECLocale &locale)
- {
- assert(haystack);
- assert(needle);
- UnicodeString a = WCHARToUnicode(haystack);
- UnicodeString b = WCHARToUnicode(needle);
- return u_strstr(a.getTerminatedBuffer(), b.getTerminatedBuffer());
- }
- /**
- * Find a string in another string while ignoreing case.
- *
- * @param[in] haystack The string to search in
- * @param[in] needle The string to search for.
- * @param[in] locale The locale to use when converting case.
- *
- * @return boolean
- * @retval true The needle was found
- * @retval false The needle wasn't found
- *
- * @note This function behaves different than strstr in that it returns a
- * a boolean instead of a pointer to the found substring. This is
- * because we search on a transformed string. Getting the correct
- * pointer would involve additional processing while we don't need
- * the result anyway.
- */
- bool wcs_icontains(const wchar_t *haystack, const wchar_t *needle, const ECLocale &locale)
- {
- assert(haystack);
- assert(needle);
- UnicodeString a = WCHARToUnicode(haystack);
- UnicodeString b = WCHARToUnicode(needle);
- a.foldCase();
- b.foldCase();
- return u_strstr(a.getTerminatedBuffer(), b.getTerminatedBuffer());
- }
- /**
- * Check if two strings are canonical equivalent.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool u8_equals(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = UTF8ToUnicode(s1);
- UnicodeString b = UTF8ToUnicode(s2);
- return a.compare(b) == 0;
- }
- /**
- * Check if two strings are canonical equivalent when ignoring the case.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale to use when converting case.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool u8_iequals(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = UTF8ToUnicode(s1);
- UnicodeString b = UTF8ToUnicode(s2);
- return a.caseCompare(b, 0) == 0;
- }
- /**
- * Check if s1 starts with s2.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool u8_startswith(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = UTF8ToUnicode(s1);
- UnicodeString b = UTF8ToUnicode(s2);
- return a.compare(0, b.length(), b) == 0;
- }
- /**
- * Check if s1 starts with s2 when ignoring the case.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] locale The locale to use when converting case.
- *
- * @return boolean
- * @retval true The strings are canonical equivalent
- * @retval false The strings are not canonical equivalent
- */
- bool u8_istartswith(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UnicodeString a = UTF8ToUnicode(s1);
- UnicodeString b = UTF8ToUnicode(s2);
- return a.caseCompare(0, b.length(), b, 0) == 0;
- }
- /**
- * Compare two strings using the collator to determine the sort order.
- *
- * Both strings are expectes to be encoded in UTF-8. The comparison is
- * case insensitive. Effectively this only changes behavior compared to strcmp_unicode
- * if the two strings are the same if the case is discarded. It doesn't effect the
- * sorting in any other way.
- *
- * @param[in] s1 The string to compare s2 with.
- * @param[in] s2 The string to compare s1 with.
- * @param[in] collator The collator used to determine which string precedes the other.
- *
- * @return An integer.
- * @retval -1 s1 is smaller than s2
- * @retval 0 s1 equals s2.
- * @retval 1 s1 is greater than s2
- */
- int u8_icompare(const char *s1, const char *s2, const ECLocale &locale)
- {
- assert(s1);
- assert(s2);
- UErrorCode status = U_ZERO_ERROR;
- unique_ptr_Collator ptrCollator(Collator::createInstance(locale, status));
- UnicodeString a = UTF8ToUnicode(s1);
- UnicodeString b = UTF8ToUnicode(s2);
-
- a.foldCase();
- b.foldCase();
- return ptrCollator->compare(a,b,status);
- }
- /**
- * Find a string in another string.
- *
- * @param[in] haystack The string to search in
- * @param[in] needle The string to search for.
- * @param[in] locale The locale used to perform string collation.
- *
- * @return boolean
- * @retval true The needle was found
- * @retval false The needle wasn't found
- *
- * @note This function behaves different than strstr in that it returns a
- * a boolean instead of a pointer to the found substring. This is
- * because we search on a transformed string. Getting the correct
- * pointer would involve additional processing while we don't need
- * the result anyway.
- */
- bool u8_contains(const char *haystack, const char *needle, const ECLocale &locale)
- {
- assert(haystack);
- assert(needle);
- UnicodeString a = UTF8ToUnicode(haystack);
- UnicodeString b = UTF8ToUnicode(needle);
- return u_strstr(a.getTerminatedBuffer(), b.getTerminatedBuffer());
- }
- /**
- * Find a string in another string while ignoreing case.
- *
- * @param[in] haystack The string to search in
- * @param[in] needle The string to search for.
- * @param[in] locale The locale to use when converting case.
- *
- * @return boolean
- * @retval true The needle was found
- * @retval false The needle wasn't found
- */
- bool u8_icontains(const char *haystack, const char *needle, const ECLocale &locale)
- {
- assert(haystack);
- assert(needle);
- UnicodeString a = UTF8ToUnicode(haystack);
- UnicodeString b = UTF8ToUnicode(needle);
- a.foldCase();
- b.foldCase();
- return u_strstr(a.getTerminatedBuffer(), b.getTerminatedBuffer());
- }
- /**
- * Copy at most n characters from the utf8 string src to lpstrDest.
- *
- * @param[in] src The UTF-8 source data to copy
- * @param[in] n The maximum amount of characters to copy
- * @param[out] lpstrDest The copied data.
- *
- * @return The amount of characters copied.
- */
- unsigned u8_ncpy(const char *src, unsigned n, std::string *lpstrDest)
- {
- const char *it = src;
- unsigned len = 0;
- while (true) {
- const char *tmp = it;
- utf8::uint32_t cp = utf8::unchecked::next(tmp);
- if (cp == 0)
- break;
- it = tmp;
- if (++len == n)
- break;
- }
- lpstrDest->assign(src, it);
- return len;
- }
- /**
- * Returns the length in bytes of the string s when capped to a maximum of
- * max characters.
- *
- * @param[in] s The UTF-8 string to process
- * @param[in] max The maximum amount of characters for which to return
- * the length in bytes.
- *
- * @return The length in bytes of the capped string.
- */
- unsigned u8_cappedbytes(const char *s, unsigned max)
- {
- const char *it = s;
- unsigned len = 0;
- while (true) {
- const char *tmp = it;
- utf8::uint32_t cp = utf8::unchecked::next(tmp);
- if (cp == 0)
- break;
- it = tmp;
- if (++len == max)
- break;
- }
- return unsigned(it - s);
- }
- /**
- * Returns the length in characters of the passed UTF-8 string s
- *
- * @param[in] s The UTF-8 string to get length of.
- *
- * @return The length in characters of string s
- */
- unsigned u8_len(const char *s)
- {
- unsigned len = 0;
- while (true) {
- utf8::uint32_t cp = utf8::unchecked::next(s);
- if (cp == 0)
- break;
- ++len;
- }
- return len;
- }
- static const struct localemap {
- const char *lpszLocaleID; /*< Posix locale id */
- ULONG ulLCID; /*< Windows LCID */
- const char *lpszLocaleName; /*< Windows locale name */
- } localeMap[] = {
- {"af",54,"Afrikaans_South Africa"},
- {"af_NA",54,"Afrikaans_South Africa"},
- {"af_ZA",1078,"Afrikaans_South Africa"},
- {"ar",1,"Arabic_Saudi Arabia"},
- {"ar_BH",15361,"Arabic_Bahrain"},
- {"ar_DZ",5121,"Arabic_Algeria"},
- {"ar_EG",3073,"Arabic_Egypt"},
- {"ar_IQ",2049,"Arabic_Iraq"},
- {"ar_JO",11265,"Arabic_Jordan"},
- {"ar_KW",13313,"Arabic_Kuwait"},
- {"ar_LB",12289,"Arabic_Lebanon"},
- {"ar_LY",4097,"Arabic_Libya"},
- {"ar_MA",6145,"Arabic_Morocco"},
- {"ar_OM",8193,"Arabic_Oman"},
- {"ar_QA",16385,"Arabic_Qatar"},
- {"ar_SA",1025,"Arabic_Saudi Arabia"},
- {"ar_SD",1,"Arabic_Saudi Arabia"},
- {"ar_SY",10241,"Arabic_Syria"},
- {"ar_TN",7169,"Arabic_Tunisia"},
- {"ar_YE",9217,"Arabic_Yemen"},
- {"az",44,"Azeri (Latin)_Azerbaijan"},
- {"az_Cyrl_AZ",2092,"Azeri (Cyrillic)_Azerbaijan"},
- {"az_Latn_AZ",1068,"Azeri (Latin)_Azerbaijan"},
- {"be",35,"Belarusian_Belarus"},
- {"be_BY",1059,"Belarusian_Belarus"},
- {"bg",2,"Bulgarian_Bulgaria"},
- {"bg_BG",1026,"Bulgarian_Bulgaria"},
- {"ca",3,"Catalan_Spain"},
- {"ca_ES",1027,"Catalan_Spain"},
- {"cs",5,"Czech_Czech Republic"},
- {"cs_CZ",1029,"Czech_Czech Republic"},
- {"cy",82,"Welsh_United Kingdom"},
- {"cy_GB",1106,"Welsh_United Kingdom"},
- {"da",6,"Danish_Denmark"},
- {"da_DK",1030,"Danish_Denmark"},
- {"de",7,"German_Germany"},
- {"de_AT",3079,"German_Austria"},
- {"de_BE",7,"German_Germany"},
- {"de_CH",2055,"German_Switzerland"},
- {"de_DE",1031,"German_Germany"},
- {"de_LI",5127,"German_Liechtenstein"},
- {"de_LU",4103,"German_Luxembourg"},
- {"el",8,"Greek_Greece"},
- {"el_CY",8,"Greek_Greece"},
- {"el_GR",1032,"Greek_Greece"},
- {"en",9,"English_United States"},
- {"en_AU",3081,"English_Australia"},
- {"en_BE",9,"English_United States"},
- {"en_BW",9,"English_United States"},
- {"en_BZ",10249,"English_Belize"},
- {"en_CA",4105,"English_Canada"},
- {"en_GB",2057,"English_United Kingdom"},
- {"en_HK",9,"English_United States"},
- {"en_IE",6153,"English_Ireland"},
- {"en_JM",8201,"English_Jamaica"},
- {"en_MH",1033,"English_United States"},
- {"en_MT",9,"English_United States"},
- {"en_MU",9,"English_United States"},
- {"en_NA",9,"English_United States"},
- {"en_NZ",5129,"English_New Zealand"},
- {"en_PH",13321,"English_Republic of the Philippines"},
- {"en_PK",9,"English_United States"},
- {"en_TT",11273,"English_Trinidad and Tobago"},
- {"en_US",1033,"English_United States"},
- {"en_VI",9225,"English_Caribbean"},
- {"en_ZA",7177,"English_South Africa"},
- {"en_ZW",12297,"English_Zimbabwe"},
- {"es",10,"Spanish_Spain"},
- {"es_AR",11274,"Spanish_Argentina"},
- {"es_BO",16394,"Spanish_Bolivia"},
- {"es_CL",13322,"Spanish_Chile"},
- {"es_CO",9226,"Spanish_Colombia"},
- {"es_CR",5130,"Spanish_Costa Rica"},
- {"es_DO",7178,"Spanish_Dominican Republic"},
- {"es_EC",12298,"Spanish_Ecuador"},
- {"es_ES",3082,"Spanish_Spain"},
- {"es_GQ",10,"Spanish_Spain"},
- {"es_GT",4106,"Spanish_Guatemala"},
- {"es_HN",18442,"Spanish_Honduras"},
- {"es_MX",2058,"Spanish_Mexico"},
- {"es_NI",19466,"Spanish_Nicaragua"},
- {"es_PA",6154,"Spanish_Panama"},
- {"es_PE",10250,"Spanish_Peru"},
- {"es_PR",20490,"Spanish_Puerto Rico"},
- {"es_PY",15370,"Spanish_Paraguay"},
- {"es_SV",17418,"Spanish_El Salvador"},
- {"es_UY",14346,"Spanish_Uruguay"},
- {"es_VE",8202,"Spanish_Venezuela"},
- {"et",37,"Estonian_Estonia"},
- {"et_EE",1061,"Estonian_Estonia"},
- {"eu",45,"Basque_Spain"},
- {"eu_ES",1069,"Basque_Spain"},
- {"fa",41,"Farsi_Iran"},
- {"fa_IR",1065,"Farsi_Iran"},
- {"fi",11,"Finnish_Finland"},
- {"fi_FI",1035,"Finnish_Finland"},
- {"fil",100,"Filipino_Philippines"},
- {"fil_PH",1124,"Filipino_Philippines"},
- {"fo",56,"Faroese_Faroe Islands"},
- {"fo_FO",1080,"Faroese_Faroe Islands"},
- {"fr",12,"French_France"},
- {"fr_BE",2060,"French_Belgium"},
- {"fr_BL",12,"French_France"},
- {"fr_CA",3084,"French_Canada"},
- {"fr_CF",12,"French_France"},
- {"fr_CH",4108,"French_Switzerland"},
- {"fr_FR",1036,"French_France"},
- {"fr_GN",12,"French_France"},
- {"fr_GP",12,"French_France"},
- {"fr_LU",5132,"French_Luxembourg"},
- {"fr_MC",6156,"French_Principality of Monaco"},
- {"fr_MF",12,"French_France"},
- {"fr_MG",12,"French_France"},
- {"fr_MQ",12,"French_France"},
- {"fr_NE",12,"French_France"},
- {"ga_IE",2108,"Irish_Ireland"},
- {"gl",86,"Galician_Spain"},
- {"gl_ES",1110,"Galician_Spain"},
- {"gu",71,"Gujarati_India"},
- {"gu_IN",1095,"Gujarati_India"},
- {"he",13,"Hebrew_Israel"},
- {"he_IL",1037,"Hebrew_Israel"},
- {"hi",57,"Hindi_India"},
- {"hi_IN",1081,"Hindi_India"},
- {"hr",26,"Croatian_Croatia"},
- {"hr_HR",1050,"Croatian_Croatia"},
- {"hu",14,"Hungarian_Hungary"},
- {"hu_HU",1038,"Hungarian_Hungary"},
- {"hy",43,"Armenian_Armenia"},
- {"hy_AM",1067,"Armenian_Armenia"},
- {"id",33,"Indonesian_Indonesia"},
- {"id_ID",1057,"Indonesian_Indonesia"},
- {"is",15,"Icelandic_Iceland"},
- {"is_IS",1039,"Icelandic_Iceland"},
- {"it",16,"Italian_Italy"},
- {"it_CH",2064,"Italian_Switzerland"},
- {"it_IT",1040,"Italian_Italy"},
- {"ja",17,"Japanese_Japan"},
- {"ja_JP",1041,"Japanese_Japan"},
- {"ka",55,"Georgian_Georgia"},
- {"ka_GE",1079,"Georgian_Georgia"},
- {"kk",63,"Kazakh_Kazakhstan"},
- {"kk_Cyrl",63,"Kazakh_Kazakhstan"},
- {"kk_Cyrl_KZ",63,"Kazakh_Kazakhstan"},
- {"kn",75,"Kannada_India"},
- {"kn_IN",1099,"Kannada_India"},
- {"ko",18,"Korean_Korea"},
- {"ko_KR",1042,"Korean_Korea"},
- {"kok",87,"Konkani_India"},
- {"kok_IN",1111,"Konkani_India"},
- {"lt",39,"Lithuanian_Lithuania"},
- {"lt_LT",1063,"Lithuanian_Lithuania"},
- {"lv",38,"Latvian_Latvia"},
- {"lv_LV",1062,"Latvian_Latvia"},
- {"mk",47,"FYRO Macedonian_Former Yugoslav Republic of Macedonia"},
- {"mk_MK",1071,"FYRO Macedonian_Former Yugoslav Republic of Macedonia"},
- {"mr",78,"Marathi_India"},
- {"mr_IN",1102,"Marathi_India"},
- {"ms",62,"Malay_Malaysia"},
- {"ms_BN",2110,"Malay_Brunei Darussalam"},
- {"ms_MY",1086,"Malay_Malaysia"},
- {"mt",58,"Maltese_Malta"},
- {"mt_MT",1082,"Maltese_Malta"},
- {"nb_NO",1044,"Norwegian_Norway"},
- {"ne",97,"Nepali_Nepal"},
- {"ne_NP",1121,"Nepali_Nepal"},
- {"nl",19,"Dutch_Netherlands"},
- {"nl_BE",2067,"Dutch_Belgium"},
- {"nl_NL",1043,"Dutch_Netherlands"},
- {"nn_NO",2068,"Norwegian (Nynorsk)_Norway"},
- {"pa",70,"Punjabi_India"},
- {"pa_Arab",70,"Punjabi_India"},
- {"pa_Arab_PK",70,"Punjabi_India"},
- {"pa_Guru",70,"Punjabi_India"},
- {"pa_Guru_IN",70,"Punjabi_India"},
- {"pl",21,"Polish_Poland"},
- {"pl_PL",1045,"Polish_Poland"},
- {"ps",99,"Pashto_Afghanistan"},
- {"ps_AF",1123,"Pashto_Afghanistan"},
- {"pt",22,"Portuguese_Brazil"},
- {"pt_BR",1046,"Portuguese_Brazil"},
- {"pt_GW",22,"Portuguese_Brazil"},
- {"pt_MZ",22,"Portuguese_Brazil"},
- {"pt_PT",2070,"Portuguese_Portugal"},
- {"rm",23,"Romansh_Switzerland"},
- {"rm_CH",1047,"Romansh_Switzerland"},
- {"ro",24,"Romanian_Romania"},
- {"ro_MD",24,"Romanian_Romania"},
- {"ro_RO",1048,"Romanian_Romania"},
- {"ru",25,"Russian_Russia"},
- {"ru_MD",25,"Russian_Russia"},
- {"ru_RU",1049,"Russian_Russia"},
- {"ru_UA",25,"Russian_Russia"},
- {"sk",27,"Slovak_Slovakia"},
- {"sk_SK",1051,"Slovak_Slovakia"},
- {"sl",36,"Slovenian_Slovenia"},
- {"sl_SI",1060,"Slovenian_Slovenia"},
- {"sq",28,"Albanian_Albania"},
- {"sq_AL",1052,"Albanian_Albania"},
- {"sr_Cyrl_BA",7194,"Serbian (Cyrillic)_Bosnia and Herzegovina"},
- {"sr_Latn_BA",6170,"Serbian (Latin)_Bosnia and Herzegovina"},
- {"sv",29,"Swedish_Sweden"},
- {"sv_FI",2077,"Swedish_Finland"},
- {"sv_SE",1053,"Swedish_Sweden"},
- {"sw",65,"Swahili_Kenya"},
- {"sw_KE",1089,"Swahili_Kenya"},
- {"sw_TZ",65,"Swahili_Kenya"},
- {"ta",73,"Tamil_India"},
- {"ta_IN",1097,"Tamil_India"},
- {"ta_LK",73,"Tamil_India"},
- {"te",74,"Telugu_India"},
- {"te_IN",1098,"Telugu_India"},
- {"th",30,"Thai_Thailand"},
- {"th_TH",1054,"Thai_Thailand"},
- {"tr",31,"Turkish_Turkey"},
- {"tr_TR",1055,"Turkish_Turkey"},
- {"uk",34,"Ukrainian_Ukraine"},
- {"uk_UA",1058,"Ukrainian_Ukraine"},
- {"ur",32,"Urdu_Islamic Republic of Pakistan"},
- {"ur_PK",1056,"Urdu_Islamic Republic of Pakistan"},
- {"uz",67,"Uzbek (Latin)_Uzbekistan"},
- {"uz_Arab",67,"Uzbek (Latin)_Uzbekistan"},
- {"uz_Arab_AF",67,"Uzbek (Latin)_Uzbekistan"},
- {"uz_Cyrl_UZ",2115,"Uzbek (Cyrillic)_Uzbekistan"},
- {"uz_Latn_UZ",1091,"Uzbek (Latin)_Uzbekistan"},
- {"vi",42,"Vietnamese_Viet Nam"},
- {"vi_VN",1066,"Vietnamese_Viet Nam"},
- {"zh_Hans",4,"Chinese_Taiwan"},
- {"zh_Hans_CN",2052,"Chinese_People's Republic of China"},
- {"zh_Hans_HK",4,"Chinese_Taiwan"},
- {"zh_Hans_MO",4,"Chinese_Taiwan"},
- {"zh_Hans_SG",4100,"Chinese_Singapore"},
- {"zh_Hant_TW",1028,"Chinese_Taiwan"},
- {"zu",53,"Zulu_South Africa"},
- {"zu_ZA",1077,"Zulu_South Africa"},
- };
- ECLocale createLocaleFromName(const char *lpszLocale)
- {
- return Locale::createFromName(lpszLocale);
- }
- ECRESULT LocaleIdToLCID(const char *lpszLocaleID, ULONG *lpulLcid)
- {
- const struct localemap *lpMapEntry = NULL;
- assert(lpszLocaleID != NULL);
- assert(lpulLcid != NULL);
- for (size_t i = 0; lpMapEntry == nullptr && i < ARRAY_SIZE(localeMap); ++i)
- if (strcasecmp(localeMap[i].lpszLocaleID, lpszLocaleID) == 0)
- lpMapEntry = &localeMap[i];
- if (lpMapEntry == NULL)
- return KCERR_NOT_FOUND;
- *lpulLcid = lpMapEntry->ulLCID;
- return erSuccess;
- }
- ECRESULT LCIDToLocaleId(ULONG ulLcid, const char **lppszLocaleID)
- {
- const struct localemap *lpMapEntry = NULL;
- assert(lppszLocaleID != NULL);
- for (size_t i = 0; lpMapEntry == nullptr && i < ARRAY_SIZE(localeMap); ++i)
- if (localeMap[i].ulLCID == ulLcid)
- lpMapEntry = &localeMap[i];
- if (lpMapEntry == NULL)
- return KCERR_NOT_FOUND;
- *lppszLocaleID = lpMapEntry->lpszLocaleID;
- return erSuccess;
- }
- /**
- * Create a locale independant blob that can be used to sort
- * strings fast. This is used when a string would be compared
- * multiple times.
- *
- * @param[in] s The string to compare.
- * @param[in] nCap Base the key on the first nCap characters of s (if larger than 0).
- * @param[in] locale The locale used to create the sort key.
- *
- * @returns ECSortKey object containing the blob
- */
- static ECSortKey createSortKey(UnicodeString s, int nCap,
- const ECLocale &locale)
- {
- if (nCap > 1)
- s.truncate(nCap);
- // Quick workaround for sorting items starting with ' (like From and To) and ( and '(
- if (s.startsWith("'") || s.startsWith("("))
- s.remove(0, 1);
- CollationKey key;
- UErrorCode status = U_ZERO_ERROR;
- unique_ptr_Collator ptrCollator(Collator::createInstance(locale, status));
- ptrCollator->getCollationKey(s, key, status); // Create a collation key for sorting
- return key;
- }
- /**
- * Create a locale independant blob that can be used to sort
- * strings fast. This is used when a string would be compared
- * multiple times.
- *
- * @param[in] s The string to compare.
- * @param[in] nCap Base the key on the first nCap characters of s (if larger than 0).
- * @param[in] locale The locale used to create the sort key.
- * @param[out] lpcbKeys The size in bytes of the returned key.
- * @param[ou]t lppKey The returned key.
- */
- static void createSortKeyData(const UnicodeString &s, int nCap, const ECLocale &locale, unsigned int *lpcbKey, unsigned char **lppKey)
- {
- unsigned char *lpKey = NULL;
- CollationKey key = createSortKey(s, nCap, locale);
- int32_t cbKeyData = 0;
- const uint8_t *lpKeyData = key.getByteArray(cbKeyData);
- lpKey = new unsigned char[cbKeyData];
- memcpy(lpKey, lpKeyData, cbKeyData);
- *lpcbKey = cbKeyData;
- *lppKey = lpKey;
- }
- /**
- * Create a locale independant blob that can be used to sort
- * strings fast. This is used when a string would be compared
- * multiple times.
- *
- * @param[in] s The string to compare.
- * @param[in] nCap Base the key on the first nCap characters of s (if larger than 0).
- * @param[in] locale The locale used to create the sort key.
- * @param[out] lpcbKeys The size in bytes of the returned key.
- * @param[ou]t lppKey The returned key.
- */
- void createSortKeyData(const char *s, int nCap, const ECLocale &locale, unsigned int *lpcbKey, unsigned char **lppKey)
- {
- assert(s != NULL);
- assert(lpcbKey != NULL);
- assert(lppKey != NULL);
- createSortKeyData(UnicodeString(s), nCap, locale, lpcbKey, lppKey);
- }
- /**
- * Create a locale independant blob that can be used to sort
- * strings fast. This is used when a string would be compared
- * multiple times.
- *
- * @param[in] s The string to compare.
- * @param[in] locale The locale used to create the sort key.
- * @param[out] lpcbKeys The size in bytes of the returned key.
- * @param[ou]t lppKey The returned key.
- */
- void createSortKeyData(const wchar_t *s, int nCap, const ECLocale &locale, unsigned int *lpcbKey, unsigned char **lppKey)
- {
- assert(s != NULL);
- assert(lpcbKey != NULL);
- assert(lppKey != NULL);
- UnicodeString ustring;
- ustring = UTF32ToUnicode((const UChar32*)s);
- createSortKeyData(ustring, nCap, locale, lpcbKey, lppKey);
- }
- /**
- * Create a locale independant blob that can be used to sort
- * strings fast. This is used when a string would be compared
- * multiple times.
- *
- * @param[in] s The string to compare.
- * @param[in] nCap Base the key on the first nCap characters of s (if larger than 0).
- * @param[in] locale The locale used to create the sort key.
- * @param[out] lpcbKeys The size in bytes of the returned key.
- * @param[ou]t lppKey The returned key.
- */
- void createSortKeyDataFromUTF8(const char *s, int nCap, const ECLocale &locale, unsigned int *lpcbKey, unsigned char **lppKey)
- {
- assert(s != NULL);
- assert(lpcbKey != NULL);
- assert(lppKey != NULL);
- createSortKeyData(UTF8ToUnicode(s), nCap, locale, lpcbKey, lppKey);
- }
- /**
- * Create a locale independant blob that can be used to sort
- * strings fast. This is used when a string would be compared
- * multiple times.
- *
- * @param[in] s The string to compare.
- * @param[in] nCap Base the key on the first nCap characters of s (if larger than 0).
- * @param[in] locale The locale used to create the sort key.
- *
- * @returns The ECSortKey containing the blob.
- */
- ECSortKey createSortKeyFromUTF8(const char *s, int nCap, const ECLocale &locale)
- {
- assert(s != NULL);
- return createSortKey(UTF8ToUnicode(s), nCap, locale);
- }
- /**
- * Compare two sort keys previously created with createSortKey.
- *
- * @param[in] cbKey1 The size i nbytes of key 1.
- * @param[in] lpKey1 Key 1.
- * @param[in] cbKey2 The size i nbytes of key 2.
- * @param[in] lpKey2 Key 2.
- *
- * @retval <0 Key1 is smaller than key2
- * @retval 0 Key1 equals key2
- * @retval >0 Key1 is greater than key2
- */
- int compareSortKeys(unsigned int cbKey1, const unsigned char *lpKey1, unsigned int cbKey2, const unsigned char *lpKey2)
- {
- assert(!(cbKey1 != 0 && lpKey1 == NULL));
- assert(!(cbKey2 != 0 && lpKey2 == NULL));
- CollationKey ckA(lpKey1, cbKey1);
- CollationKey ckB(lpKey2, cbKey2);
- int cmp = 1;
- UErrorCode status = U_ZERO_ERROR;
- switch (ckA.compareTo(ckB, status)) {
- case UCOL_LESS: cmp = -1; break;
- case UCOL_EQUAL: cmp = 0; break;
- case UCOL_GREATER: cmp = 1; break;
- }
- return cmp;
- }
- } /* namespace */
- /** @} */
|