001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 020package org.apache.wiki.util.comparators; 021 022import org.apache.commons.lang3.StringUtils; 023 024import java.util.Comparator; 025 026/** 027 * A comparator that sorts Strings using "human" ordering, including decimal 028 * ordering. Only works for languages where every character is lexigraphically 029 * distinct and correctly unicode ordered (e.g. English). Other languages should use 030 * <code>CollatedHumanComparator</code>. Pretty efficient but still slower than 031 * String.compareTo(). 032 * 033 */ 034public class HumanComparator implements Comparator< String > { 035 036 // Constants for categorising characters and specifying category level 037 // ordering 038 public enum CharType 039 { 040 TYPE_OTHER, TYPE_DIGIT, TYPE_LETTER 041 } 042 043 // A special singleton instance for quick access 044 public static final Comparator<String> DEFAULT_HUMAN_COMPARATOR = new HumanComparator(); 045 046 /** 047 * Returns a singleton comparator that implements the default behaviour. 048 * 049 * @return the singleton comparator. 050 */ 051 public static Comparator<String> getInstance() 052 { 053 return DEFAULT_HUMAN_COMPARATOR; 054 } 055 056 private CharType[] sortOrder = { CharType.TYPE_OTHER, CharType.TYPE_DIGIT, CharType.TYPE_LETTER }; 057 058 /** 059 * Default constructor which does nothing. Here because it has a non-default 060 * constructor. 061 */ 062 public HumanComparator() 063 { 064 // Empty 065 } 066 067 /** 068 * Constructor specifying all the character type order. 069 * 070 * @param sortOrder see setSortOrder 071 */ 072 public HumanComparator(final CharType[] sortOrder ) 073 { 074 setSortOrder( sortOrder ); 075 } 076 077 /* 078 * (non-Javadoc) 079 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 080 */ 081 public int compare(final String str1, final String str2 ) 082 { 083 // Some quick and easy checks 084 if( StringUtils.equals( str1, str2 ) ) { 085 // they're identical, possibly both null 086 return 0; 087 } else if ( str1 == null ) { 088 // str1 is null and str2 isn't so str1 is smaller 089 return -1; 090 } else if ( str2 == null ) { 091 // str2 is null and str1 isn't so str1 is bigger 092 return 1; 093 } 094 095 final char[] s1 = str1.toCharArray(); 096 final char[] s2 = str2.toCharArray(); 097 final int len1 = s1.length; 098 final int len2 = s2.length; 099 int idx = 0; 100 // caseComparison used to defer a case sensitive comparison 101 int caseComparison = 0; 102 103 while ( idx < len1 && idx < len2 ) 104 { 105 char c1 = s1[idx]; 106 char c2 = s2[idx++]; 107 108 // Convert to lower case 109 final char lc1 = Character.toLowerCase( c1 ); 110 final char lc2 = Character.toLowerCase( c2 ); 111 112 // If case makes a difference, note the difference the first time 113 // it's encountered 114 if( caseComparison == 0 && c1 != c2 && lc1 == lc2 ) 115 { 116 if( Character.isLowerCase( c1 ) ) 117 caseComparison = 1; 118 else if( Character.isLowerCase( c2 ) ) 119 caseComparison = -1; 120 } 121 // Do the rest of the tests in lower case 122 c1 = lc1; 123 c2 = lc2; 124 125 // leading zeros are a special case 126 if( c1 != c2 || c1 == '0' ) 127 { 128 // They might be different, now we can do a comparison 129 final CharType type1 = mapCharTypes( c1 ); 130 final CharType type2 = mapCharTypes( c2 ); 131 132 // Do the character class check 133 int result = compareCharTypes( type1, type2 ); 134 if( result != 0 ) 135 { 136 // different character classes so that's sufficient 137 return result; 138 } 139 140 // If they're not digits, use character to character comparison 141 if( type1 != CharType.TYPE_DIGIT ) 142 { 143 final Character ch1 = Character.valueOf( c1 ); 144 final Character ch2 = Character.valueOf( c2 ); 145 return ch1.compareTo( ch2 ); 146 } 147 148 // The only way to get here is both characters are digits 149 assert( type1 == CharType.TYPE_DIGIT && type2 == CharType.TYPE_DIGIT ); 150 result = compareDigits( s1, s2, idx - 1 ); 151 if( result != 0 ) 152 { 153 // Got a result so return it 154 return result; 155 } 156 157 // No result yet, spin through the digits and continue trying 158 while ( idx < len1 && idx < len2 && Character.isDigit( s1[idx] ) ) { 159 idx++; 160 } 161 } 162 } 163 164 if( len1 == len2 ) 165 { 166 // identical so return any case dependency 167 return caseComparison; 168 } 169 170 // Shorter String is less 171 return len1 - len2; 172 } 173 174 /** 175 * Implements ordering based on broad categories (e.g. numbers are always 176 * less than digits) 177 * 178 * @param type1 first CharType 179 * @param type2 second CharType 180 * @return -1 if type1 < type2, 0 if type1 == type2, 1 if type1 > type2 181 */ 182 private int compareCharTypes(final CharType type1, final CharType type2 ) 183 { 184 if( type1 == type2 ) 185 { 186 // Same type so equal 187 return 0; 188 } else if( type1 == sortOrder[0] ) 189 { 190 // t1 is the lowest order and t2 isn't so t1 must be less 191 return -1; 192 } else if( type2 == sortOrder[0] ) 193 { 194 // t2 is the lowest order and t1 isn't so t1 must be more 195 return 1; 196 } else if( type1 == sortOrder[1] ) 197 { 198 // t1 is the middle order and t2 isn't so t1 must be less 199 return -1; 200 } else if( type2 == sortOrder[1] ) 201 { 202 // t2 is the middle order and t1 isn't so t1 must be more 203 return 1; 204 } else 205 { 206 // Can't possibly get here as that would mean they're both sortOrder[2] 207 assert( type1 != type2 ); 208 return 0; 209 } 210 } 211 212 /** 213 * Do a numeric comparison on two otherwise identical char arrays. 214 * 215 * @param left the left hand character array. 216 * @param offset the index of the first digit of the number in both char 217 * arrays. 218 * @return negative, zero or positive depending on the numeric comparison of 219 * left and right. 220 */ 221 private int compareDigits(final char[] left, final char[] right, final int offset ) 222 { 223 // Calculate the integer value of the left hand side 224 int idx = offset; 225 while ( idx < left.length && Character.isDigit( left[idx] ) ) { 226 idx++; 227 } 228 final int leftLen = idx - offset; 229 final int leftValue = Integer.valueOf( new String( left, offset, leftLen ) ); 230 231 // Calculate the integer value of the right hand side 232 idx = offset; 233 while ( idx < right.length && Character.isDigit( right[idx] ) ) { 234 idx++; 235 } 236 final int rightLen = idx - offset; 237 final int rightValue = Integer.valueOf( new String( right, offset, rightLen ) ); 238 239 if( leftValue == rightValue ) { 240 return leftLen - rightLen; // Same value so use the lengths 241 } 242 return leftValue - rightValue; // Otherwise compare the values 243 } 244 245 public CharType[] getSortOrder() 246 { 247 return sortOrder; 248 } 249 250 /** 251 * Very broadly characterises a character as a digit, a letter or a punctuation character. 252 * 253 * @param c <code>char</code> to be characterised 254 * @return <code>IS_DIGIT</code> if it's a digit, <code>IS_LETTER</code> if 255 * it's a letter, <code>IS_PUNC</code> otherwise. 256 */ 257 private CharType mapCharTypes(final char c ) { 258 if( Character.isDigit( c ) ) { 259 return CharType.TYPE_DIGIT; 260 } else if( Character.isLetter( c ) ) { 261 return CharType.TYPE_LETTER; 262 } else { 263 return CharType.TYPE_OTHER; 264 } 265 } 266 267 /** 268 * Set the order in which letters, numbers and everything else is presented. 269 * Default is other, digits and then letters. For example, the strings 270 * "abb", "a1b" and "a b" will sort in the order "a b", "a1b" then "abb" by 271 * default. 272 * 273 * @param sortOrder Must be an array of <code>CharType</code> containing 274 * exactly 3 elements each of which must be distinct. 275 * @throws IllegalArgumentException if being called on the result of 276 * <code>HumanStringComparator.getInstance()</code> or 277 * <code>sortOrder</code> is not exactly 3 different 278 * <code>CharType</code>. 279 */ 280 public void setSortOrder(final CharType[] sortOrder ) { 281 if( this == DEFAULT_HUMAN_COMPARATOR ) { 282 throw new IllegalArgumentException( "Can't call setters on default " + HumanComparator.class.getName() ); 283 } 284 285 // Sanity check the sort order 286 if( sortOrder == null || sortOrder.length != 3 ) { 287 throw new IllegalArgumentException( "There must be exactly three elements in the sort order" ); 288 } 289 if( sortOrder[0] == sortOrder[1] || sortOrder[0] == sortOrder[2] || sortOrder[1] == sortOrder[2] ) { 290 throw new IllegalArgumentException( "The sort order must contain EXACTLY one of each CharType" ); 291 } 292 this.sortOrder = sortOrder.clone(); 293 } 294 295}