001 /*
002 Licensed to the Apache Software Foundation (ASF) under one
003 or more contributor license agreements. See the NOTICE file
004 distributed with this work for additional information
005 regarding copyright ownership. The ASF licenses this file
006 to you under the Apache License, Version 2.0 (the
007 "License"); you may not use this file except in compliance
008 with the License. You may obtain a copy of the License at
009
010 http://www.apache.org/licenses/LICENSE-2.0
011
012 Unless required by applicable law or agreed to in writing,
013 software distributed under the License is distributed on an
014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 KIND, either express or implied. See the License for the
016 specific language governing permissions and limitations
017 under the License.
018 */
019
020 package org.apache.wiki.util.comparators;
021
022 import java.util.Comparator;
023
024 import org.apache.commons.lang.StringUtils;
025
026 /**
027 * A comparator that sorts Strings using "human" ordering, including decimal
028 * ordering. Only works for languages where every character is lexigraphically
029 * distinct and correctly unicode ordered (e.g. English). Other languages should use
030 * <code>CollatedHumanComparator</code>. Pretty efficient but still slower than
031 * String.compareTo().
032 *
033 */
034 public class HumanComparator implements Comparator< String > {
035
036 // Constants for categorising characters and specifying category level
037 // ordering
038 public enum CharType
039 {
040 TYPE_OTHER, TYPE_DIGIT, TYPE_LETTER
041 }
042
043 // A special singleton instance for quick access
044 public static final Comparator<String> DEFAULT_HUMAN_COMPARATOR = new HumanComparator();
045
046 /**
047 * Returns a singleton comparator that implements the default behaviour.
048 *
049 * @return the singleton comparator.
050 */
051 public static Comparator<String> getInstance()
052 {
053 return DEFAULT_HUMAN_COMPARATOR;
054 }
055
056 private CharType[] sortOrder = { CharType.TYPE_OTHER, CharType.TYPE_DIGIT, CharType.TYPE_LETTER };
057
058 /**
059 * Default constructor which does nothing. Here because it has a non-default
060 * constructor.
061 */
062 public HumanComparator()
063 {
064 // Empty
065 }
066
067 /**
068 * Constructor specifying all the character type order.
069 *
070 * @param sortOrder see setSortOrder
071 */
072 public HumanComparator( CharType[] sortOrder )
073 {
074 setSortOrder( sortOrder );
075 }
076
077 /*
078 * (non-Javadoc)
079 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
080 */
081 public int compare( String str1, String str2 )
082 {
083 // Some quick and easy checks
084 if( StringUtils.equals( str1, str2 ) ) {
085 // they're identical, possibly both null
086 return 0;
087 } else if ( str1 == null ) {
088 // str1 is null and str2 isn't so str1 is smaller
089 return -1;
090 } else if ( str2 == null ) {
091 // str2 is null and str1 isn't so str1 is bigger
092 return 1;
093 }
094
095 char[] s1 = str1.toCharArray();
096 char[] s2 = str2.toCharArray();
097 int len1 = s1.length;
098 int len2 = s2.length;
099 int idx = 0;
100 // caseComparison used to defer a case sensitive comparison
101 int caseComparison = 0;
102
103 while ( idx < len1 && idx < len2 )
104 {
105 char c1 = s1[idx];
106 char c2 = s2[idx++];
107
108 // Convert to lower case
109 char lc1 = Character.toLowerCase( c1 );
110 char lc2 = Character.toLowerCase( c2 );
111
112 // If case makes a difference, note the difference the first time
113 // it's encountered
114 if( caseComparison == 0 && c1 != c2 && lc1 == lc2 )
115 {
116 if( Character.isLowerCase( c1 ) )
117 caseComparison = 1;
118 else if( Character.isLowerCase( c2 ) )
119 caseComparison = -1;
120 }
121 // Do the rest of the tests in lower case
122 c1 = lc1;
123 c2 = lc2;
124
125 // leading zeros are a special case
126 if( c1 != c2 || c1 == '0' )
127 {
128 // They might be different, now we can do a comparison
129 CharType type1 = mapCharTypes( c1 );
130 CharType type2 = mapCharTypes( c2 );
131
132 // Do the character class check
133 int result = compareCharTypes( type1, type2 );
134 if( result != 0 )
135 {
136 // different character classes so that's sufficient
137 return result;
138 }
139
140 // If they're not digits, use character to character comparison
141 if( type1 != CharType.TYPE_DIGIT )
142 {
143 Character ch1 = Character.valueOf( c1 );
144 Character ch2 = Character.valueOf( c2 );
145 return ch1.compareTo( ch2 );
146 }
147
148 // The only way to get here is both characters are digits
149 assert( type1 == CharType.TYPE_DIGIT && type2 == CharType.TYPE_DIGIT );
150 result = compareDigits( s1, s2, idx - 1 );
151 if( result != 0 )
152 {
153 // Got a result so return it
154 return result;
155 }
156
157 // No result yet, spin through the digits and continue trying
158 while ( idx < len1 && idx < len2 && Character.isDigit( s1[idx] ) ) {
159 idx++;
160 }
161 }
162 }
163
164 if( len1 == len2 )
165 {
166 // identical so return any case dependency
167 return caseComparison;
168 }
169
170 // Shorter String is less
171 return len1 - len2;
172 }
173
174 /**
175 * Implements ordering based on broad categories (e.g. numbers are always
176 * less than digits)
177 *
178 * @param type1 first CharType
179 * @param type2 second CharType
180 * @return -1 if type1 < type2, 0 if type1 == type2, 1 if type1 > type2
181 */
182 private int compareCharTypes( CharType type1, CharType type2 )
183 {
184 if( type1 == type2 )
185 {
186 // Same type so equal
187 return 0;
188 } else if( type1 == sortOrder[0] )
189 {
190 // t1 is the lowest order and t2 isn't so t1 must be less
191 return -1;
192 } else if( type2 == sortOrder[0] )
193 {
194 // t2 is the lowest order and t1 isn't so t1 must be more
195 return 1;
196 } else if( type1 == sortOrder[1] )
197 {
198 // t1 is the middle order and t2 isn't so t1 must be less
199 return -1;
200 } else if( type2 == sortOrder[1] )
201 {
202 // t2 is the middle order and t1 isn't so t1 must be more
203 return 1;
204 } else
205 {
206 // Can't possibly get here as that would mean they're both sortOrder[2]
207 assert( type1 != type2 );
208 return 0;
209 }
210 }
211
212 /**
213 * Do a numeric comparison on two otherwise identical char arrays.
214 *
215 * @param left the left hand character array.
216 * @param offset the index of the first digit of the number in both char
217 * arrays.
218 * @return negative, zero or positive depending on the numeric comparison of
219 * left and right.
220 */
221 private int compareDigits( char[] left, char[] right, int offset )
222 {
223 // Calculate the integer value of the left hand side
224 int idx = offset;
225 while ( idx < left.length && Character.isDigit( left[idx] ) ) {
226 idx++;
227 }
228 int leftLen = idx - offset;
229 int leftValue = Integer.valueOf( new String( left, offset, leftLen ) );
230
231 // Calculate the integer value of the right hand side
232 idx = offset;
233 while ( idx < right.length && Character.isDigit( right[idx] ) ) {
234 idx++;
235 }
236 int rightLen = idx - offset;
237 int rightValue = Integer.valueOf( new String( right, offset, rightLen ) );
238
239 if( leftValue == rightValue ) {
240 return leftLen - rightLen; // Same value so use the lengths
241 }
242 return leftValue - rightValue; // Otherwise compare the values
243 }
244
245 public CharType[] getSortOrder()
246 {
247 return sortOrder;
248 }
249
250 /**
251 * Very broadly characterises a character as a digit, a letter or a punctuation character.
252 *
253 * @param c <code>char</code> to be characterised
254 * @return <code>IS_DIGIT</code> if it's a digit, <code>IS_LETTER</code> if
255 * it's a letter, <code>IS_PUNC</code> otherwise.
256 */
257 private CharType mapCharTypes( char c ) {
258 if( Character.isDigit( c ) ) {
259 return CharType.TYPE_DIGIT;
260 } else if( Character.isLetter( c ) ) {
261 return CharType.TYPE_LETTER;
262 } else {
263 return CharType.TYPE_OTHER;
264 }
265 }
266
267 /**
268 * Set the order in which letters, numbers and everything else is presented.
269 * Default is other, digits and then letters. For example, the strings
270 * "abb", "a1b" and "a b" will sort in the order "a b", "a1b" then "abb" by
271 * default.
272 *
273 * @param sortOrder Must be an array of <code>CharType</code> containing
274 * exactly 3 elements each of which must be distinct.
275 * @throws IllegalArgumentException if being called on the result of
276 * <code>HumanStringComparator.getInstance()</code> or
277 * <code>sortOrder</code> is not exactly 3 different
278 * <code>CharType</code>.
279 */
280 public void setSortOrder( CharType[] sortOrder ) {
281 if( this == DEFAULT_HUMAN_COMPARATOR ) {
282 throw new IllegalArgumentException( "Can't call setters on default " + HumanComparator.class.getName() );
283 }
284
285 // Sanity check the sort order
286 if( sortOrder == null || sortOrder.length != 3 ) {
287 throw new IllegalArgumentException( "There must be exactly three elements in the sort order" );
288 }
289 if( sortOrder[0] == sortOrder[1] || sortOrder[0] == sortOrder[2] || sortOrder[1] == sortOrder[2] ) {
290 throw new IllegalArgumentException( "The sort order must contain EXACTLY one of each CharType" );
291 }
292 this.sortOrder = sortOrder.clone();
293 }
294
295 }