3.1.5.2.4 GetWindowsSortKey Pseudocode
This algorithm specifies the generation of sort keys for a specific UTF-16 string.<4>
-
//
// Weight record types used while building a sort key.
//

// Full set of weights for one character, as returned by GetCharacterWeights.
STRUCTURE CharacterWeightType
(
    ScriptMember: 8 bit integer
    PrimaryWeight: 8 bit integer
    DiacriticWeight: 8 bit integer
    CaseWeight: 8 bit integer
)

// Unicode weight written to the first section of the sort key.
// ThirdByteWeight is 0 when a 3-byte Unicode weight is not used.
STRUCTURE UnicodeWeightType
(
    ScriptMember: 8 bit integer
    PrimaryWeight: 8 bit integer
    ThirdByteWeight: 8 bit integer
)

// Special weight written to the last section of the sort key.
STRUCTURE SpecialWeightType
(
    Position: 16 bit integer
    ScriptMember: 8 bit integer
    PrimaryWeight: 8 bit integer
)

// East Asia extra weights (W6 and W7 groups of the sort key).
STRUCTURE ExtraWeightType
(
    W6: 8 bit integer
    W7: 8 bit integer
)

SET constant LCID_KOREAN to 0x0412
SET constant LCID_KOREAN_UNICODE_SORT to 0x010412
SET constant LCID_HUNGARIAN to 0x040e

SET constant SORTKEY_SEPARATOR to 0x01
SET constant SORTKEY_TERMINATOR to 0x00

SET global KoreanScriptMap to InitKoreanScriptMap

//
// Script Member Values.
//
SET constant UNSORTABLE to 0
SET constant NONSPACE_MARK to 1
SET constant EXPANSION to 2
SET constant EASTASIA_SPECIAL to 3
SET constant JAMO_SPECIAL to 4
SET constant EXTENSION_A to 5
SET constant PUNCTUATION to 6
SET constant SYMBOL_1 to 7
SET constant SYMBOL_2 to 8
SET constant SYMBOL_3 to 9
SET constant SYMBOL_4 to 10
SET constant SYMBOL_5 to 11
SET constant SYMBOL_6 to 12
SET constant DIGIT to 13
SET constant LATIN to 14
SET constant KANA to 34
SET constant IDEOGRAPH to 128

// Script members up to and including MAX_SPECIAL_CASE are handled
// by the special-case path of GetWindowsSortKey.
IF Windows version is Windows Vista, Windows Server 2008, Windows 7, or Windows Server 2008 R2 THEN
    SET constant MAX_SPECIAL_CASE to SYMBOL_6
ELSE
    SET constant MAX_SPECIAL_CASE to SYMBOL_5
ENDIF

// NOTE(review): the opening IF of this block was missing (only the
// closing ENDIF was present). The condition is reconstructed from the
// statement in GetWindowsSortKey that Windows 7 and Windows Server 2008 R2
// use 3-byte Unicode Weights for PUA and some CJK script members - confirm.
IF Windows version is Windows 7 or Windows Server 2008 R2 THEN
    COMMENT Set the constant for the first script member of the Unicode
    COMMENT Private Use Area (PUA) range
    SET constant PUA3BYTESTART to 0xA9

    COMMENT Set the constant for the last script member of the Unicode
    COMMENT Private Use Area (PUA) range
    SET constant PUA3BYTEEND to 0xAF

    COMMENT Set the constant for the first script member of CJK
    COMMENT (Chinese/Japanese/Korean) 3 byte weight range
    SET constant CJK3BYTESTART to 0xC0

    COMMENT Set the constant for the last script member of CJK
    COMMENT (Chinese/Japanese/Korean) 3 byte weight range
    SET constant CJK3BYTEEND to 0xF9
ENDIF

SET constant FIRST_SCRIPT to LATIN
SET constant MAX_SCRIPTS to 256

//
// Values for CJK Unified Ideographs Extension A range.
// 0x3400 thru 0x4dbf
//
SET constant SCRIPT_MEMBER_EXT_A to 254    // SM for Extension A
SET constant PRIMARY_WEIGHT_EXT_A to 255   // AW for Extension A

//
// Lowest weight values.
// Used to remove trailing DW and CW values.
// Also used to keep illegal values out of sort keys.
//
SET constant MIN_DW to 2
SET constant MIN_CW to 2

//
// Bit mask values.
//
// Case Weight (CW) - 8 bits:
//   bit 0   => width
//   bit 1,2 => small kana, sei-on
//   bit 3,4 => upper/lower case
//   bit 5   => kana
//   bit 6,7 => contraction
//

// NOTE(review): the opening IF of this block was missing (only the
// ELSE ... ENDIF was present). The condition is reconstructed from the
// comments in GetWindowsSortKey stating that only 2- and 3-character
// contractions exist for Windows NT 4.0 through Windows Server 2003 - confirm.
IF Windows version is Windows Vista, Windows Server 2008, Windows 7, or Windows Server 2008 R2 THEN
    COMMENT Contractions of 2 to 8 characters are supported.
    SET constant CONTRACTION_8_MASK to 0xc0
    SET constant CONTRACTION_7_MASK to 0xc0
    SET constant CONTRACTION_6_MASK to 0xc0
    SET constant CONTRACTION_5_MASK to 0x80
    SET constant CONTRACTION_4_MASK to 0x80
    SET constant CONTRACTION_3_MASK to 0x40
    SET constant CONTRACTION_2_MASK to 0x40
    SET constant CONTRACTION_MASK to 0xc0
ELSE
    COMMENT Otherwise, only 2-character or 3-character contractions
    COMMENT are supported.

    // Bit-mask to check 2 character contraction or 3 character contraction
    SET constant CONTRACTION_3_MASK to 0xc0

    // Bit-mask to check 2 character contraction
    SET constant CONTRACTION_2_MASK to 0x80
ENDIF

SET constant CASE_UPPER_MASK to 0xe7   // zero out case bits
SET constant CASE_KANA_MASK to 0xdf    // zero out kana bit
SET constant CASE_WIDTH_MASK to 0xfe   // zero out width bit

//
// Masks to isolate the various bits in the case weight.
//
// NOTE: Bit 2 needs to always equal 1 to avoid getting
//       a byte value of either 0 or 1.
//
SET constant CASE_EXTRA_WEIGHT_MASK to 0xc4
SET constant ISOLATE_KANA to (~CASE_KANA_MASK) | CASE_EXTRA_WEIGHT_MASK
SET constant ISOLATE_WIDTH to (~CASE_WIDTH_MASK) | CASE_EXTRA_WEIGHT_MASK

//
// Values for East Asia special case primary weights.
//
SET constant PW_REPEAT to 0
SET constant PW_CHO_ON to 1
SET constant MAX_SPECIAL_PW to PW_CHO_ON

//
// Values for weight 5 - East Asia Extra Weights.
//
SET constant WT_FIVE_KANA to 3
SET constant WT_FIVE_REPEAT to 4
SET constant WT_FIVE_CHO_ON to 5

//
// PW Mask for Cho-On:
// Leaves bit 7 on in PW, so it becomes Repeat
// if it follows Kana N.
//
SET constant CHO_ON_PW_MASK to 0x87

//
// Special weight values
//
SET constant MAP_INVALID_WEIGHT to 0xff

//
// Some Significant Values for Korean Jamo.
// The L, V & T syllables in the 0x1100 Unicode range
// can be composed to characters in the 0xac00 range.
// See The Unicode Standard for details.
//
SET constant NLS_CHAR_FIRST_JAMO to 0x1100            // Begin Jamo range
SET constant NLS_CHAR_LAST_JAMO to 0x11f9             // End Jamo range
SET constant NLS_CHAR_FIRST_VOWEL_JAMO to 0x1160      // First Vowel Jamo
SET constant NLS_CHAR_FIRST_TRAILING_JAMO to 0x11a8   // First Trailing Jamo

SET constant NLS_JAMO_VOWEL_COUNT to 21               // Number of vowel Jamo (V)
SET constant NLS_JAMO_TRAILING_COUNT to 28            // Number of trailing Jamo (T)
SET constant NLS_HANGUL_FIRST_COMPOSED to 0xac00      // Begin composed range

//
// Values for Unicode Weight extra weights (e.g. Jamo (old Hangul)).
// The following uses SM for extra UW weights.
//
SET constant ScriptMember_Extra_UnicodeWeight to 255

// Leading Weight / Vowel Weight / Trailing Weight
// according to the current Jamo class.
//
STRUCTURE JamoSortInfoType
(
    // true for an old Hangul sequence
    OldHangulFlag : Boolean
    // true if U+1160 (Hangul Jungseong Filler) used
    FillerUsed : Boolean
    // index to the prior modern Hangul syllable (L)
    LeadingIndex : 8 bit integer
    // index to the prior modern Hangul syllable (V)
    VowelIndex : 8 bit integer
    // index to the prior modern Hangul syllable (T)
    TrailingIndex : 8 bit integer
    // Weight to offset from other old hangul (L)
    LeadingWeight : 8 bit integer
    // Weight to offset from other old hangul (V)
    VowelWeight : 8 bit integer
    // Weight to offset from other old hangul (T)
    TrailingWeight : 8 bit integer
)

// This is the raw data record type from the data table
STRUCTURE JamoStateDataType
(
    // true for an old Hangul sequence
    OldHangulFlag : Boolean
    // index to the prior modern Hangul syllable (L)
    LeadingIndex : 8 bit integer
    // index to the prior modern Hangul syllable (V)
    VowelIndex : 8 bit integer
    // index to the prior modern Hangul syllable (T)
    TrailingIndex : 8 bit integer
    // weight to distinguish from old Hangul
    ExtraWeight : 8 bit integer
    // number of additional records in this state
    TransitionCount : 8 bit integer
    // Current record in unisort.txt Jamo table:
    JamoRecord : data record
    // SORTTABLES\JAMOSORT\[Character] section
)

COMMENT GetWindowsSortKey
COMMENT
COMMENT On Entry:  SourceString - Unicode String to compute a
COMMENT                           sort key for
COMMENT            SortLocale   - Locale to determine correct
COMMENT                           linguistic sort
COMMENT            Flags        - Bit Flag to control behavior
COMMENT                           of sort key generation.
COMMENT
COMMENT            NORM_IGNORENONSPACE: Ignore diacritic weight
COMMENT            NORM_IGNORECASE:     Ignore case weight
COMMENT            NORM_IGNOREKANATYPE: Ignore Japanese Katakana/Hiragana
COMMENT                                 difference
COMMENT            NORM_IGNOREWIDTH:    Ignore Chinese/Japanese/Korean
COMMENT                                 half-width and full-width difference.
COMMENT
COMMENT On Exit:   SortKey      - Byte array containing the
COMMENT                           computed sort key.
COMMENT
PROCEDURE GetWindowsSortKey(IN SourceString : Unicode String,
                            IN SortLocale : LCID,
                            IN Flags : 32 bit integer,
                            OUT SortKey : BYTE String)

    COMMENT Layout of the generated sort key, in order:
    COMMENT   Unicode weights, SORTKEY_SEPARATOR,
    COMMENT   diacritic weights, SORTKEY_SEPARATOR,
    COMMENT   case weights, SORTKEY_SEPARATOR,
    COMMENT   extra weights (East Asia special), SORTKEY_SEPARATOR,
    COMMENT   special weights, SORTKEY_TERMINATOR

    COMMENT Compute flags for sort conditions
    COMMENT Based on the case/kana/width flags,
    COMMENT turn off bits in case mask when comparing case weight.
    COMMENT NOTE(review): CaseMask is not referenced again in this
    COMMENT procedure; presumably it is consumed by GetCaseWeight - confirm.
    SET CaseMask to 0xff

    IF (NORM_IGNORECASE bit is on in Flags) THEN
        SET CaseMask to CaseMask LOGICAL AND with CASE_UPPER_MASK
    ENDIF

    IF (NORM_IGNOREKANATYPE bit is on in Flags) THEN
        SET CaseMask to CaseMask LOGICAL AND with CASE_KANA_MASK
    ENDIF

    IF (NORM_IGNOREWIDTH bit is on in Flags) THEN
        SET CaseMask to CaseMask LOGICAL AND with CASE_WIDTH_MASK
    ENDIF

    COMMENT Windows 7 and Windows Server 2008 R2 use 3-byte
    COMMENT (instead of 2-byte) sequence for Unicode Weights
    COMMENT for Private Use Area (PUA) and some Chinese/Japanese/Korean
    COMMENT (CJK) script members.
    COMMENT Does this sort have a 3-byte Unicode Weight (CJK sorts)?
    IF Windows version is Windows 7 or Windows Server 2008 R2 THEN
        COMMENT Check if the locale can have 3-byte Unicode weight
        SET Is3ByteWeightLocale to CALL Check3ByteWeightLocale(SortLocale)
    ENDIF

    IF Windows version is Windows Vista, Windows Server 2008, Windows 7, or Windows Server 2008 R2 THEN
        COMMENT For Windows Vista, Windows Server 2008, Windows 7, and
        COMMENT Windows Server 2008 R2, the algorithm
        COMMENT does not remap the script for Korean locale
        SET IsKoreanLocale to false
    ELSE
        IF SortLocale is LCID_KOREAN or SortLocale is LCID_KOREAN_UNICODE_SORT THEN
            SET IsKoreanLocale to true
            IF KoreanScriptMap is null THEN
                CALL InitKoreanScriptMap
            ENDIF
        ELSE
            SET IsKoreanLocale to false
        ENDIF
    ENDIF

    //
    // Allocate buffer to hold different levels of sort key weights.
    // UnicodeWeights/ExtraWeights/SpecialWeights will eventually
    // be collected together, in that order, into the returned
    // Sortkey byte string.
    //
    // Maximum expansion size is 3 times the input size
    //
    // Unicode Weight => 4 word (16 bit) length
    // (extension A and Jamo need extra words)
    SET UnicodeWeights to new empty string of UnicodeWeightType
    SET DiacriticWeights to new empty string of BYTE
    SET CaseWeights to new empty string of BYTE
    // Extra Weight=>4 byte length (4 weights, 1 byte each) FE Special
    SET ExtraWeights to new empty string of ExtraWeightType
    // Special Weight => dword length (2 words each of 16 bits)
    SET SpecialWeights to new empty string of SpecialWeightType

    //
    // Go through the string, code point by code point,
    // testing for contractions and Hungarian special character sequence
    //

    // loop presumes 0 based index for source string
    FOR SourceIndex is 0 to Length(SourceString) -1
        //
        // Get weights
        // CharacterWeight will contain all of the weight information
        // for the character tested.
        //
        SET CharacterWeight to CALL GetCharacterWeights WITH (SortLocale, SourceString[SourceIndex])
        SET ScriptMember to CharacterWeight.ScriptMember

        // Special case weights have script members less than or equal to
        // MAX_SPECIAL_CASE (SYMBOL_5 or SYMBOL_6, depending on the
        // Windows version).
        IF ScriptMember is greater than MAX_SPECIAL_CASE THEN
            //
            // No special case on character, but has to check for
            // contraction characters and Hungarian special
            // character sequence characters.
            //
            SET HasHungarianSpecialCharacterSequence to CALL TestHungarianCharacterSequences WITH (SortLocale, SourceString, SourceIndex)
            SET Result to CALL GetContractionType WITH (CharacterWeight)

            CASE Result OF
            "3-character Contraction":
                COMMENT This is only possible for Windows versions that
                COMMENT are Windows NT 4.0 through Windows Server 2003
                SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 3, UnicodeWeights, DiacriticWeights, CaseWeights)
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ENDIF
                COMMENT If no contraction is found, fall through into additional cases.
                FALLTHROUGH

            "2-character Contraction":
                COMMENT This is only possible for Windows versions that are
                COMMENT Windows NT 4.0 through Windows Server 2003
                SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 2, UnicodeWeights, DiacriticWeights, CaseWeights)
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ENDIF
                COMMENT If no contraction is found, fall through into the OTHER case.
                COMMENT Since "3-character contraction" or "2-character contraction"
                COMMENT are the only two possible values for
                COMMENT Windows NT 4.0 through Windows Server 2003, all calls to
                COMMENT SortkeyContractionHandler will return false.
                COMMENT So, the fallthrough will go directly to the OTHERS section
                FALLTHROUGH

            "6-character contraction, 7-character contraction, or 8-character contraction":
                COMMENT Try the longest contraction first, then progressively shorter ones.
                SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 8, UnicodeWeights, DiacriticWeights, CaseWeights)
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ELSE
                    SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 7, UnicodeWeights, DiacriticWeights, CaseWeights)
                ENDIF
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ELSE
                    SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 6, UnicodeWeights, DiacriticWeights, CaseWeights)
                ENDIF
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ENDIF
                COMMENT If no contraction is found, fall through into additional cases.
                FALLTHROUGH

            "4-character contraction or 5-character contraction":
                SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 5, UnicodeWeights, DiacriticWeights, CaseWeights)
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ELSE
                    SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 4, UnicodeWeights, DiacriticWeights, CaseWeights)
                ENDIF
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ENDIF
                COMMENT If no contraction is found, fall through into additional cases.
                FALLTHROUGH

            "2-character contraction or 3-character contraction":
                SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 3, UnicodeWeights, DiacriticWeights, CaseWeights)
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ELSE
                    SET ContractionFound to CALL SortkeyContractionHandler WITH (SortLocale, SourceString, SourceIndex, HasHungarianSpecialCharacterSequence, 2, UnicodeWeights, DiacriticWeights, CaseWeights)
                ENDIF
                IF ContractionFound is true THEN
                    COMMENT Break out of the case statement
                    BREAK
                ENDIF
                COMMENT If no contraction is found, fall through into additional cases.
                FALLTHROUGH

            OTHERS :
                COMMENT NOTE(review): this gate previously read "greater than
                COMMENT Windows Server 2008 R2 or Windows 7", which excluded the
                COMMENT versions for which Is3ByteWeightLocale is computed - confirm.
                IF Windows version is Windows 7 or Windows Server 2008 R2 THEN
                    COMMENT In Windows Server 2008 R2 or Windows 7,
                    COMMENT Private Use Area (PUA) code points
                    COMMENT and some CJK (Chinese/Japanese/Korean) sorts
                    COMMENT might need 3 byte weights

                    COMMENT Store normal Unicode weight first. Note that there is no
                    COMMENT adjustment of Korean weight anymore.
                    SET UnicodeWeight to CorrectUnicodeWeight(CharacterWeight, FALSE)

                    COMMENT Assume 3-byte Unicode Weight is not used first.
                    COMMENT The algorithm will check this later.
                    SET UnicodeWeight.ThirdByteWeight to 0

                    IF (ScriptMember is equal to or greater than PUA3BYTESTART) AND (ScriptMember is less than or equal to PUA3BYTEEND) THEN
                        SET IsScriptMemberPUA3ByteWeight to true
                    ELSE
                        SET IsScriptMemberPUA3ByteWeight to false
                    ENDIF

                    IF (ScriptMember is equal to or greater than CJK3BYTESTART) AND (ScriptMember is less than or equal to CJK3BYTEEND) THEN
                        SET IsScriptMemberCJK3ByteWeight to true
                    ELSE
                        SET IsScriptMemberCJK3ByteWeight to false
                    ENDIF

                    IF (IsScriptMemberPUA3ByteWeight is true) OR (Is3ByteWeightLocale AND IsScriptMemberCJK3ByteWeight is true) THEN
                        COMMENT PUA code points and some CJK sorts need 3 byte weights
                        COMMENT The diacritic weight is carried as the third byte of
                        COMMENT the Unicode weight, so no separate diacritic weight
                        COMMENT is appended for this character.
                        SET UnicodeWeight.ThirdByteWeight to CharacterWeight.DiacriticWeight
                    ELSE
                        COMMENT Normal Diacritic Weight
                        APPEND CharacterWeight.DiacriticWeight to DiacriticWeights as a BYTE
                    ENDIF

                    APPEND UnicodeWeight to UnicodeWeights
                    SET CaseWeight to GetCaseWeight(CharacterWeight)
                    COMMENT Append the value computed by GetCaseWeight.
                    APPEND CaseWeight to CaseWeights as a BYTE
                ELSE
                    SET UnicodeWeight to CorrectUnicodeWeight(CharacterWeight, IsKoreanLocale)
                    APPEND UnicodeWeight to UnicodeWeights
                    APPEND CharacterWeight.DiacriticWeight to DiacriticWeights as a BYTE
                    SET CaseWeight to GetCaseWeight(CharacterWeight)
                    COMMENT Append the value computed by GetCaseWeight.
                    APPEND CaseWeight to CaseWeights as a BYTE
                ENDIF
            ENDCASE
        ELSE
            CALL SpecialCaseHandler WITH (SourceString, SourceIndex, UnicodeWeights, ExtraWeights, SpecialWeights, SortLocale, IsKoreanLocale)
        ENDIF
    ENDFOR

    //
    // Store the Unicode Weights in the destination buffer.
    //
    FOR each UnicodeWeight in UnicodeWeights
        //
        // Copy Unicode weight to destination buffer.
        //
        APPEND UnicodeWeight.ScriptMember to SortKey as a BYTE
        APPEND UnicodeWeight.PrimaryWeight to SortKey as a BYTE
        IF Windows version is Windows 7 or Windows Server 2008 R2 THEN
            IF UnicodeWeight.ThirdByteWeight is not 0 THEN
                COMMENT When 3-byte Unicode Weight is used, append the additional
                COMMENT BYTE into SortKey
                APPEND UnicodeWeight.ThirdByteWeight to SortKey as a BYTE
            ENDIF
        ENDIF
    ENDFOR

    //
    // Copy Separator to destination buffer.
    //
    APPEND SORTKEY_SEPARATOR to SortKey as a BYTE

    //
    // Store Diacritic Weights in the destination buffer.
    //
    IF (NORM_IGNORENONSPACE bit is not turned on in Flags) THEN
        // NOTE(review): IsReverseDW is not assigned anywhere in this
        // procedure; presumably it is derived from SortLocale - confirm.
        IF (IsReverseDW is TRUE) THEN
            //
            // Reverse diacritics:
            // - remove diacritics from left to right.
            // - store diacritics from right to left.
            //
            FOR each DiacriticWeight in DiacriticWeights in the "first in first out" order
                IF DiacriticWeight <= MIN_DW THEN
                    REMOVE DiacriticWeight from DiacriticWeights
                ELSE
                    BREAK from the current FOR loop
                ENDIF
            ENDFOR
            FOR each DiacriticWeight in DiacriticWeights in the "last in first out" order
                //
                // Copy diacritic weight to destination buffer.
                //
                APPEND DiacriticWeight to SortKey as a BYTE
            ENDFOR
        ELSE
            //
            // Regular diacritics:
            // - remove diacritics from right to left.
            // - store diacritics from left to right.
            //
            FOR each DiacriticWeight in DiacriticWeights in the "last in first out" order
                IF DiacriticWeight <= MIN_DW THEN
                    REMOVE DiacriticWeight from DiacriticWeights
                ELSE
                    BREAK from the current FOR loop
                ENDIF
            ENDFOR
            FOR each DiacriticWeight in DiacriticWeights in the order of "first in first out"
                //
                // Copy diacritic weight to destination buffer.
                //
                APPEND DiacriticWeight to SortKey as a BYTE
            ENDFOR
        ENDIF
    ENDIF

    //
    // Copy Separator to destination buffer.
    //
    APPEND SORTKEY_SEPARATOR to SortKey as a BYTE

    //
    // Store case Weights
    //
    // - Eliminate minimum CW.
    // - Copy case weights to destination buffer.
    //
    IF (NORM_IGNORECASE bit is not turned on in Flags OR NORM_IGNOREWIDTH bit is not turned on in Flags) THEN
        FOR each CaseWeight in CaseWeights in the "last in first out" order
            IF CaseWeight <= MIN_CW THEN
                REMOVE CaseWeight from CaseWeights
            ELSE
                BREAK from the current FOR loop
            ENDIF
        ENDFOR
        FOR each CaseWeight in CaseWeights
            //
            // Copy case weight to destination buffer.
            //
            APPEND CaseWeight to SortKey as a BYTE
        ENDFOR
    ENDIF

    //
    // Copy Separator to destination buffer.
    //
    APPEND SORTKEY_SEPARATOR to SortKey as a BYTE

    //
    // Store the Extra Weights in the destination buffer for
    // EAST ASIA Special.
    //
    // - Eliminate unnecessary XW.
    // - Copy extra weights to destination buffer.
    //
    IF Length(ExtraWeights) is greater than 0 THEN
        IF (NORM_IGNORENONSPACE bit is turned on in Flags) THEN
            APPEND 0xff to SortKey as a BYTE
            APPEND 0x02 to SortKey as a BYTE
        ENDIF

        // Append W6 group to SortKey
        // Trim unused values (0xe4) from the end of the string;
        // the element at index 0 is always kept.
        SET EndExtraWeight to Length(ExtraWeights) - 1
        WHILE EndExtraWeight greater than 0 and ExtraWeights[EndExtraWeight].W6 == 0xe4
            DECREMENT EndExtraWeight
        ENDWHILE
        SET ExtraWeightIndex to 0
        WHILE ExtraWeightIndex is less than or equal to EndExtraWeight
            APPEND ExtraWeights[ExtraWeightIndex].W6 to SortKey as a BYTE
            INCREMENT ExtraWeightIndex
        ENDWHILE
        // Append W6 separator
        APPEND 0xff to SortKey as a BYTE

        // Append W7 group to SortKey
        // Trim unused values (0xe4) from the end of the string;
        // the element at index 0 is always kept.
        SET EndExtraWeight to Length(ExtraWeights) - 1
        WHILE EndExtraWeight greater than 0 and ExtraWeights[EndExtraWeight].W7 == 0xe4
            DECREMENT EndExtraWeight
        ENDWHILE
        SET ExtraWeightIndex to 0
        WHILE ExtraWeightIndex is less than or equal to EndExtraWeight
            APPEND ExtraWeights[ExtraWeightIndex].W7 to SortKey as a BYTE
            INCREMENT ExtraWeightIndex
        ENDWHILE
        // Append W7 separator
        APPEND 0xff to SortKey as a BYTE
    ENDIF

    //
    // Copy Separator to destination buffer.
    //
    APPEND SORTKEY_SEPARATOR to SortKey as a BYTE

    //
    // Store the Special Weights in the destination buffer.
    //
    // - Copy special weights to destination buffer.
    //
    FOR each SpecialWeight in SpecialWeights
        // High byte (most significant)
        SET Byte1 to SpecialWeight.Position >> 8
        // Low byte (least significant)
        SET Byte2 to SpecialWeight.Position & 0xff
        APPEND Byte1 to SortKey as a BYTE
        APPEND Byte2 to SortKey as a BYTE
        APPEND SpecialWeight.ScriptMember to SortKey as a BYTE
        APPEND SpecialWeight.PrimaryWeight to SortKey as a BYTE
    ENDFOR

    //
    // Copy terminator to destination buffer.
    //
    APPEND SORTKEY_TERMINATOR to SortKey as a BYTE

    RETURN SortKey