001/* 
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019
020package org.apache.wiki.util.comparators;
021
022import java.util.Comparator;
023
024import org.apache.commons.lang.StringUtils;
025
026/**
027 * A comparator that sorts Strings using "human" ordering, including decimal
028 * ordering. Only works for languages where every character is lexigraphically
029 * distinct and correctly unicode ordered (e.g. English). Other languages should use
030 * <code>CollatedHumanComparator</code>. Pretty efficient but still slower than
031 * String.compareTo().
032 * 
033 */
034public class HumanComparator implements Comparator< String > {
035
036    // Constants for categorising characters and specifying category level
037    // ordering
038    public enum CharType
039    {
040        TYPE_OTHER, TYPE_DIGIT, TYPE_LETTER
041    }
042
043    // A special singleton instance for quick access
044    public static final Comparator<String> DEFAULT_HUMAN_COMPARATOR = new HumanComparator();
045
046    /**
047     * Returns a singleton comparator that implements the default behaviour.
048     * 
049     * @return the singleton comparator.
050     */
051    public static Comparator<String> getInstance()
052    {
053        return DEFAULT_HUMAN_COMPARATOR;
054    }
055
056    private CharType[] sortOrder = { CharType.TYPE_OTHER, CharType.TYPE_DIGIT, CharType.TYPE_LETTER };
057
058    /**
059     * Default constructor which does nothing. Here because it has a non-default
060     * constructor.
061     */
062    public HumanComparator()
063    {
064        // Empty
065    }
066
067    /**
068     * Constructor specifying all the character type order.
069     * 
070     * @param sortOrder see setSortOrder
071     */
072    public HumanComparator( CharType[] sortOrder )
073    {
074        setSortOrder( sortOrder );
075    }
076
077    /*
078     * (non-Javadoc)
079     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
080     */
081    public int compare( String str1, String str2 )
082    {
083        // Some quick and easy checks
084        if( StringUtils.equals( str1, str2 ) ) {
085            // they're identical, possibly both null
086            return 0;
087        } else if ( str1 == null ) {
088            // str1 is null and str2 isn't so str1 is smaller
089            return -1;
090        } else if ( str2 == null ) {
091            // str2 is null and str1 isn't so str1 is bigger
092            return 1;
093        }
094
095        char[] s1 = str1.toCharArray();
096        char[] s2 = str2.toCharArray();
097        int len1 = s1.length;
098        int len2 = s2.length;
099        int idx = 0;
100        // caseComparison used to defer a case sensitive comparison
101        int caseComparison = 0;
102
103        while ( idx < len1 && idx < len2 )
104        {
105            char c1 = s1[idx];
106            char c2 = s2[idx++];
107
108            // Convert to lower case
109            char lc1 = Character.toLowerCase( c1 );
110            char lc2 = Character.toLowerCase( c2 );
111
112            // If case makes a difference, note the difference the first time
113            // it's encountered
114            if( caseComparison == 0 && c1 != c2 && lc1 == lc2 )
115            {
116                if( Character.isLowerCase( c1 ) )
117                    caseComparison = 1;
118                else if( Character.isLowerCase( c2 ) )
119                    caseComparison = -1;
120            }
121            // Do the rest of the tests in lower case
122            c1 = lc1;
123            c2 = lc2;
124
125            // leading zeros are a special case
126            if( c1 != c2 || c1 == '0' )
127            {
128                // They might be different, now we can do a comparison
129                CharType type1 = mapCharTypes( c1 );
130                CharType type2 = mapCharTypes( c2 );
131
132                // Do the character class check
133                int result = compareCharTypes( type1, type2 );
134                if( result != 0 )
135                {
136                    // different character classes so that's sufficient
137                    return result;
138                }
139
140                // If they're not digits, use character to character comparison
141                if( type1 != CharType.TYPE_DIGIT )
142                {
143                    Character ch1 = Character.valueOf( c1 );
144                    Character ch2 = Character.valueOf( c2 );
145                    return ch1.compareTo( ch2 );
146                }
147
148                // The only way to get here is both characters are digits
149                assert( type1 == CharType.TYPE_DIGIT && type2 == CharType.TYPE_DIGIT );
150                result = compareDigits( s1, s2, idx - 1 );
151                if( result != 0 )
152                {
153                    // Got a result so return it
154                    return result;
155                }
156
157                // No result yet, spin through the digits and continue trying
158                while ( idx < len1 && idx < len2 && Character.isDigit( s1[idx] ) ) {
159                    idx++;
160                }
161            }
162        }
163
164        if( len1 == len2 )
165        {
166            // identical so return any case dependency
167            return caseComparison;
168        }
169
170        // Shorter String is less
171        return len1 - len2;
172    }
173
174    /**
175     * Implements ordering based on broad categories (e.g. numbers are always
176     * less than digits)
177     * 
178     * @param type1 first CharType
179     * @param type2 second CharType
180     * @return -1 if type1 < type2, 0 if type1 == type2, 1 if type1 > type2
181     */
182    private int compareCharTypes( CharType type1, CharType type2 )
183    {
184        if( type1 == type2 )
185        {
186            // Same type so equal
187            return 0;
188        } else if( type1 == sortOrder[0] )
189        {
190            // t1 is the lowest order and t2 isn't so t1 must be less
191            return -1;
192        } else if( type2 == sortOrder[0] )
193        {
194            // t2 is the lowest order and t1 isn't so t1 must be more
195            return 1;
196        } else if( type1 == sortOrder[1] )
197        {
198            // t1 is the middle order and t2 isn't so t1 must be less
199            return -1;
200        } else if( type2 == sortOrder[1] )
201        {
202            // t2 is the middle order and t1 isn't so t1 must be more
203            return 1;
204        } else
205        {
206            // Can't possibly get here as that would mean they're both sortOrder[2]
207            assert( type1 != type2 );
208            return 0;
209        }
210    }
211
212    /**
213     * Do a numeric comparison on two otherwise identical char arrays.
214     * 
215     * @param left the left hand character array.
216     * @param offset the index of the first digit of the number in both char
217     *            arrays.
218     * @return negative, zero or positive depending on the numeric comparison of
219     *         left and right.
220     */
221    private int compareDigits( char[] left, char[] right, int offset )
222    {
223        // Calculate the integer value of the left hand side
224        int idx = offset;
225        while ( idx < left.length && Character.isDigit( left[idx] ) ) {
226            idx++;
227        }
228        int leftLen = idx - offset;
229        int leftValue = Integer.valueOf( new String( left, offset, leftLen ) );
230
231        // Calculate the integer value of the right hand side
232        idx = offset;
233        while ( idx < right.length && Character.isDigit( right[idx] ) ) {
234            idx++;
235        }
236        int rightLen = idx - offset;
237        int rightValue = Integer.valueOf( new String( right, offset, rightLen ) );
238
239        if( leftValue == rightValue ) {
240            return leftLen - rightLen; // Same value so use the lengths
241        }
242        return leftValue - rightValue; // Otherwise compare the values
243    }
244
245    public CharType[] getSortOrder()
246    {
247        return sortOrder;
248    }
249
250    /**
251     * Very broadly characterises a character as a digit, a letter or a punctuation character.
252     * 
253     * @param c <code>char</code> to be characterised
254     * @return <code>IS_DIGIT</code> if it's a digit, <code>IS_LETTER</code> if
255     *         it's a letter, <code>IS_PUNC</code> otherwise.
256     */
257    private CharType mapCharTypes( char c ) {
258        if( Character.isDigit( c ) ) {
259            return CharType.TYPE_DIGIT;
260        } else if( Character.isLetter( c ) ) {
261            return CharType.TYPE_LETTER;
262        } else {
263            return CharType.TYPE_OTHER;
264        }
265    }
266
267    /**
268     * Set the order in which letters, numbers and everything else is presented.
269     * Default is other, digits and then letters. For example, the strings
270     * "abb", "a1b" and "a b" will sort in the order "a b", "a1b" then "abb" by
271     * default.
272     * 
273     * @param sortOrder Must be an array of <code>CharType</code> containing
274     *            exactly 3 elements each of which must be distinct.
275     * @throws IllegalArgumentException if being called on the result of
276     *             <code>HumanStringComparator.getInstance()</code> or
277     *             <code>sortOrder</code> is not exactly 3 different
278     *             <code>CharType</code>.
279     */
280    public void setSortOrder( CharType[] sortOrder ) {
281        if( this == DEFAULT_HUMAN_COMPARATOR ) {
282            throw new IllegalArgumentException( "Can't call setters on default " + HumanComparator.class.getName() );
283        }
284
285        // Sanity check the sort order
286        if( sortOrder == null || sortOrder.length != 3 ) {
287            throw new IllegalArgumentException( "There must be exactly three elements in the sort order" );
288        }
289        if( sortOrder[0] == sortOrder[1] || sortOrder[0] == sortOrder[2] || sortOrder[1] == sortOrder[2] ) {
290            throw new IllegalArgumentException( "The sort order must contain EXACTLY one of each CharType" );
291        }
292        this.sortOrder = sortOrder.clone();
293    }
294
295}