001/* 
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019
020package org.apache.wiki.util.comparators;
021
022import org.apache.commons.lang3.StringUtils;
023
024import java.util.Comparator;
025
026/**
027 * A comparator that sorts Strings using "human" ordering, including decimal
028 * ordering. Only works for languages where every character is lexicographically
029 * distinct and correctly unicode ordered (e.g. English). Other languages should use
030 * <code>CollatedHumanComparator</code>. Pretty efficient but still slower than
031 * String.compareTo().
032 * 
033 */
034public class HumanComparator implements Comparator< String > {
035
036    // Constants for categorising characters and specifying category level
037    // ordering
038    public enum CharType
039    {
040        TYPE_OTHER, TYPE_DIGIT, TYPE_LETTER
041    }
042
043    // A special singleton instance for quick access
044    public static final Comparator<String> DEFAULT_HUMAN_COMPARATOR = new HumanComparator();
045
046    /**
047     * Returns a singleton comparator that implements the default behaviour.
048     * 
049     * @return the singleton comparator.
050     */
051    public static Comparator<String> getInstance()
052    {
053        return DEFAULT_HUMAN_COMPARATOR;
054    }
055
056    private CharType[] sortOrder = { CharType.TYPE_OTHER, CharType.TYPE_DIGIT, CharType.TYPE_LETTER };
057
058    /**
059     * Default constructor which does nothing. Here because it has a non-default
060     * constructor.
061     */
062    public HumanComparator()
063    {
064        // Empty
065    }
066
067    /**
068     * Constructor specifying all the character type order.
069     * 
070     * @param sortOrder see setSortOrder
071     */
072    public HumanComparator(final CharType[] sortOrder )
073    {
074        setSortOrder( sortOrder );
075    }
076
077    /*
078     * (non-Javadoc)
079     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
080     */
081    @Override
082    public int compare(final String str1, final String str2 )
083    {
084        // Some quick and easy checks
085        if( StringUtils.equals( str1, str2 ) ) {
086            // they're identical, possibly both null
087            return 0;
088        } else if ( str1 == null ) {
089            // str1 is null and str2 isn't so str1 is smaller
090            return -1;
091        } else if ( str2 == null ) {
092            // str2 is null and str1 isn't so str1 is bigger
093            return 1;
094        }
095
096        final char[] s1 = str1.toCharArray();
097        final char[] s2 = str2.toCharArray();
098        final int len1 = s1.length;
099        final int len2 = s2.length;
100        int idx = 0;
101        // caseComparison used to defer a case sensitive comparison
102        int caseComparison = 0;
103
104        while ( idx < len1 && idx < len2 )
105        {
106            char c1 = s1[idx];
107            char c2 = s2[idx++];
108
109            // Convert to lower case
110            final char lc1 = Character.toLowerCase( c1 );
111            final char lc2 = Character.toLowerCase( c2 );
112
113            // If case makes a difference, note the difference the first time
114            // it's encountered
115            if( caseComparison == 0 && c1 != c2 && lc1 == lc2 )
116            {
117                if( Character.isLowerCase( c1 ) )
118                    caseComparison = 1;
119                else if( Character.isLowerCase( c2 ) )
120                    caseComparison = -1;
121            }
122            // Do the rest of the tests in lower case
123            c1 = lc1;
124            c2 = lc2;
125
126            // leading zeros are a special case
127            if( c1 != c2 || c1 == '0' )
128            {
129                // They might be different, now we can do a comparison
130                final CharType type1 = mapCharTypes( c1 );
131                final CharType type2 = mapCharTypes( c2 );
132
133                // Do the character class check
134                int result = compareCharTypes( type1, type2 );
135                if( result != 0 )
136                {
137                    // different character classes so that's sufficient
138                    return result;
139                }
140
141                // If they're not digits, use character to character comparison
142                if( type1 != CharType.TYPE_DIGIT )
143                {
144                    final Character ch1 = c1;
145                    final Character ch2 = c2;
146                    return ch1.compareTo( ch2 );
147                }
148
149                // The only way to get here is both characters are digits
150                assert( type1 == CharType.TYPE_DIGIT && type2 == CharType.TYPE_DIGIT );
151                result = compareDigits( s1, s2, idx - 1 );
152                if( result != 0 )
153                {
154                    // Got a result so return it
155                    return result;
156                }
157
158                // No result yet, spin through the digits and continue trying
159                while ( idx < len1 && idx < len2 && Character.isDigit( s1[idx] ) ) {
160                    idx++;
161                }
162            }
163        }
164
165        if( len1 == len2 )
166        {
167            // identical so return any case dependency
168            return caseComparison;
169        }
170
171        // Shorter String is less
172        return len1 - len2;
173    }
174
175    /**
176     * Implements ordering based on broad categories (e.g. numbers are always
177     * less than digits)
178     * 
179     * @param type1 first CharType
180     * @param type2 second CharType
181     * @return -1 if type1 < type2, 0 if type1 == type2, 1 if type1 > type2
182     */
183    private int compareCharTypes(final CharType type1, final CharType type2 )
184    {
185        if( type1 == type2 )
186        {
187            // Same type so equal
188            return 0;
189        } else if( type1 == sortOrder[0] )
190        {
191            // t1 is the lowest order and t2 isn't so t1 must be less
192            return -1;
193        } else if( type2 == sortOrder[0] )
194        {
195            // t2 is the lowest order and t1 isn't so t1 must be more
196            return 1;
197        } else if( type1 == sortOrder[1] )
198        {
199            // t1 is the middle order and t2 isn't so t1 must be less
200            return -1;
201        } else if( type2 == sortOrder[1] )
202        {
203            // t2 is the middle order and t1 isn't so t1 must be more
204            return 1;
205        } else
206        {
207            // Can't possibly get here as that would mean they're both sortOrder[2]
208            assert( type1 != type2 );
209            return 0;
210        }
211    }
212
213    /**
214     * Do a numeric comparison on two otherwise identical char arrays.
215     * 
216     * @param left the left hand character array.
217     * @param offset the index of the first digit of the number in both char
218     *            arrays.
219     * @return negative, zero or positive depending on the numeric comparison of
220     *         left and right.
221     */
222    private int compareDigits(final char[] left, final char[] right, final int offset )
223    {
224        // Calculate the integer value of the left hand side
225        int idx = offset;
226        while ( idx < left.length && Character.isDigit( left[idx] ) ) {
227            idx++;
228        }
229        final int leftLen = idx - offset;
230        final int leftValue = Integer.parseInt( new String( left, offset, leftLen ) );
231
232        // Calculate the integer value of the right hand side
233        idx = offset;
234        while ( idx < right.length && Character.isDigit( right[idx] ) ) {
235            idx++;
236        }
237        final int rightLen = idx - offset;
238        final int rightValue = Integer.parseInt( new String( right, offset, rightLen ) );
239
240        if( leftValue == rightValue ) {
241            return leftLen - rightLen; // Same value so use the lengths
242        }
243        return leftValue - rightValue; // Otherwise compare the values
244    }
245
246    public CharType[] getSortOrder()
247    {
248        return sortOrder;
249    }
250
251    /**
252     * Very broadly characterises a character as a digit, a letter or a punctuation character.
253     * 
254     * @param c <code>char</code> to be characterised
255     * @return <code>IS_DIGIT</code> if it's a digit, <code>IS_LETTER</code> if
256     *         it's a letter, <code>IS_PUNC</code> otherwise.
257     */
258    private CharType mapCharTypes(final char c ) {
259        if( Character.isDigit( c ) ) {
260            return CharType.TYPE_DIGIT;
261        } else if( Character.isLetter( c ) ) {
262            return CharType.TYPE_LETTER;
263        } else {
264            return CharType.TYPE_OTHER;
265        }
266    }
267
268    /**
269     * Set the order in which letters, numbers and everything else is presented.
270     * Default is other, digits and then letters. For example, the strings
271     * "abb", "a1b" and "a b" will sort in the order "a b", "a1b" then "abb" by
272     * default.
273     * 
274     * @param sortOrder Must be an array of <code>CharType</code> containing
275     *            exactly 3 elements each of which must be distinct.
276     * @throws IllegalArgumentException if being called on the result of
277     *             <code>HumanStringComparator.getInstance()</code> or
278     *             <code>sortOrder</code> is not exactly 3 different
279     *             <code>CharType</code>.
280     */
281    public void setSortOrder(final CharType[] sortOrder ) {
282        if( this == DEFAULT_HUMAN_COMPARATOR ) {
283            throw new IllegalArgumentException( "Can't call setters on default " + HumanComparator.class.getName() );
284        }
285
286        // Sanity check the sort order
287        if( sortOrder == null || sortOrder.length != 3 ) {
288            throw new IllegalArgumentException( "There must be exactly three elements in the sort order" );
289        }
290        if( sortOrder[0] == sortOrder[1] || sortOrder[0] == sortOrder[2] || sortOrder[1] == sortOrder[2] ) {
291            throw new IllegalArgumentException( "The sort order must contain EXACTLY one of each CharType" );
292        }
293        this.sortOrder = sortOrder.clone();
294    }
295
296}