001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 020package org.apache.wiki.util.comparators; 021 022import org.apache.commons.lang3.StringUtils; 023 024import java.util.Comparator; 025 026/** 027 * A comparator that sorts Strings using "human" ordering, including decimal 028 * ordering. Only works for languages where every character is lexicographically 029 * distinct and correctly unicode ordered (e.g. English). Other languages should use 030 * <code>CollatedHumanComparator</code>. Pretty efficient but still slower than 031 * String.compareTo(). 032 * 033 */ 034public class HumanComparator implements Comparator< String > { 035 036 // Constants for categorising characters and specifying category level 037 // ordering 038 public enum CharType 039 { 040 TYPE_OTHER, TYPE_DIGIT, TYPE_LETTER 041 } 042 043 // A special singleton instance for quick access 044 public static final Comparator<String> DEFAULT_HUMAN_COMPARATOR = new HumanComparator(); 045 046 /** 047 * Returns a singleton comparator that implements the default behaviour. 048 * 049 * @return the singleton comparator. 050 */ 051 public static Comparator<String> getInstance() 052 { 053 return DEFAULT_HUMAN_COMPARATOR; 054 } 055 056 private CharType[] sortOrder = { CharType.TYPE_OTHER, CharType.TYPE_DIGIT, CharType.TYPE_LETTER }; 057 058 /** 059 * Default constructor which does nothing. Here because it has a non-default 060 * constructor. 061 */ 062 public HumanComparator() 063 { 064 // Empty 065 } 066 067 /** 068 * Constructor specifying all the character type order. 069 * 070 * @param sortOrder see setSortOrder 071 */ 072 public HumanComparator(final CharType[] sortOrder ) 073 { 074 setSortOrder( sortOrder ); 075 } 076 077 /* 078 * (non-Javadoc) 079 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 080 */ 081 @Override 082 public int compare(final String str1, final String str2 ) 083 { 084 // Some quick and easy checks 085 if( StringUtils.equals( str1, str2 ) ) { 086 // they're identical, possibly both null 087 return 0; 088 } else if ( str1 == null ) { 089 // str1 is null and str2 isn't so str1 is smaller 090 return -1; 091 } else if ( str2 == null ) { 092 // str2 is null and str1 isn't so str1 is bigger 093 return 1; 094 } 095 096 final char[] s1 = str1.toCharArray(); 097 final char[] s2 = str2.toCharArray(); 098 final int len1 = s1.length; 099 final int len2 = s2.length; 100 int idx = 0; 101 // caseComparison used to defer a case sensitive comparison 102 int caseComparison = 0; 103 104 while ( idx < len1 && idx < len2 ) 105 { 106 char c1 = s1[idx]; 107 char c2 = s2[idx++]; 108 109 // Convert to lower case 110 final char lc1 = Character.toLowerCase( c1 ); 111 final char lc2 = Character.toLowerCase( c2 ); 112 113 // If case makes a difference, note the difference the first time 114 // it's encountered 115 if( caseComparison == 0 && c1 != c2 && lc1 == lc2 ) 116 { 117 if( Character.isLowerCase( c1 ) ) 118 caseComparison = 1; 119 else if( Character.isLowerCase( c2 ) ) 120 caseComparison = -1; 121 } 122 // Do the rest of the tests in lower case 123 c1 = lc1; 124 c2 = lc2; 125 126 // leading zeros are a special case 127 if( c1 != c2 || c1 == '0' ) 128 { 129 // They might be different, now we can do a comparison 130 final CharType type1 = mapCharTypes( c1 ); 131 final CharType type2 = mapCharTypes( c2 ); 132 133 // Do the character class check 134 int result = compareCharTypes( type1, type2 ); 135 if( result != 0 ) 136 { 137 // different character classes so that's sufficient 138 return result; 139 } 140 141 // If they're not digits, use character to character comparison 142 if( type1 != CharType.TYPE_DIGIT ) 143 { 144 final Character ch1 = c1; 145 final Character ch2 = c2; 146 return ch1.compareTo( ch2 ); 147 } 148 149 // The only way to get here is both characters are digits 150 assert( type1 == CharType.TYPE_DIGIT && type2 == CharType.TYPE_DIGIT ); 151 result = compareDigits( s1, s2, idx - 1 ); 152 if( result != 0 ) 153 { 154 // Got a result so return it 155 return result; 156 } 157 158 // No result yet, spin through the digits and continue trying 159 while ( idx < len1 && idx < len2 && Character.isDigit( s1[idx] ) ) { 160 idx++; 161 } 162 } 163 } 164 165 if( len1 == len2 ) 166 { 167 // identical so return any case dependency 168 return caseComparison; 169 } 170 171 // Shorter String is less 172 return len1 - len2; 173 } 174 175 /** 176 * Implements ordering based on broad categories (e.g. numbers are always 177 * less than digits) 178 * 179 * @param type1 first CharType 180 * @param type2 second CharType 181 * @return -1 if type1 < type2, 0 if type1 == type2, 1 if type1 > type2 182 */ 183 private int compareCharTypes(final CharType type1, final CharType type2 ) 184 { 185 if( type1 == type2 ) 186 { 187 // Same type so equal 188 return 0; 189 } else if( type1 == sortOrder[0] ) 190 { 191 // t1 is the lowest order and t2 isn't so t1 must be less 192 return -1; 193 } else if( type2 == sortOrder[0] ) 194 { 195 // t2 is the lowest order and t1 isn't so t1 must be more 196 return 1; 197 } else if( type1 == sortOrder[1] ) 198 { 199 // t1 is the middle order and t2 isn't so t1 must be less 200 return -1; 201 } else if( type2 == sortOrder[1] ) 202 { 203 // t2 is the middle order and t1 isn't so t1 must be more 204 return 1; 205 } else 206 { 207 // Can't possibly get here as that would mean they're both sortOrder[2] 208 assert( type1 != type2 ); 209 return 0; 210 } 211 } 212 213 /** 214 * Do a numeric comparison on two otherwise identical char arrays. 215 * 216 * @param left the left hand character array. 217 * @param offset the index of the first digit of the number in both char 218 * arrays. 219 * @return negative, zero or positive depending on the numeric comparison of 220 * left and right. 221 */ 222 private int compareDigits(final char[] left, final char[] right, final int offset ) 223 { 224 // Calculate the integer value of the left hand side 225 int idx = offset; 226 while ( idx < left.length && Character.isDigit( left[idx] ) ) { 227 idx++; 228 } 229 final int leftLen = idx - offset; 230 final int leftValue = Integer.parseInt( new String( left, offset, leftLen ) ); 231 232 // Calculate the integer value of the right hand side 233 idx = offset; 234 while ( idx < right.length && Character.isDigit( right[idx] ) ) { 235 idx++; 236 } 237 final int rightLen = idx - offset; 238 final int rightValue = Integer.parseInt( new String( right, offset, rightLen ) ); 239 240 if( leftValue == rightValue ) { 241 return leftLen - rightLen; // Same value so use the lengths 242 } 243 return leftValue - rightValue; // Otherwise compare the values 244 } 245 246 public CharType[] getSortOrder() 247 { 248 return sortOrder; 249 } 250 251 /** 252 * Very broadly characterises a character as a digit, a letter or a punctuation character. 253 * 254 * @param c <code>char</code> to be characterised 255 * @return <code>IS_DIGIT</code> if it's a digit, <code>IS_LETTER</code> if 256 * it's a letter, <code>IS_PUNC</code> otherwise. 257 */ 258 private CharType mapCharTypes(final char c ) { 259 if( Character.isDigit( c ) ) { 260 return CharType.TYPE_DIGIT; 261 } else if( Character.isLetter( c ) ) { 262 return CharType.TYPE_LETTER; 263 } else { 264 return CharType.TYPE_OTHER; 265 } 266 } 267 268 /** 269 * Set the order in which letters, numbers and everything else is presented. 270 * Default is other, digits and then letters. For example, the strings 271 * "abb", "a1b" and "a b" will sort in the order "a b", "a1b" then "abb" by 272 * default. 273 * 274 * @param sortOrder Must be an array of <code>CharType</code> containing 275 * exactly 3 elements each of which must be distinct. 276 * @throws IllegalArgumentException if being called on the result of 277 * <code>HumanStringComparator.getInstance()</code> or 278 * <code>sortOrder</code> is not exactly 3 different 279 * <code>CharType</code>. 280 */ 281 public void setSortOrder(final CharType[] sortOrder ) { 282 if( this == DEFAULT_HUMAN_COMPARATOR ) { 283 throw new IllegalArgumentException( "Can't call setters on default " + HumanComparator.class.getName() ); 284 } 285 286 // Sanity check the sort order 287 if( sortOrder == null || sortOrder.length != 3 ) { 288 throw new IllegalArgumentException( "There must be exactly three elements in the sort order" ); 289 } 290 if( sortOrder[0] == sortOrder[1] || sortOrder[0] == sortOrder[2] || sortOrder[1] == sortOrder[2] ) { 291 throw new IllegalArgumentException( "The sort order must contain EXACTLY one of each CharType" ); 292 } 293 this.sortOrder = sortOrder.clone(); 294 } 295 296}