001    /* 
002        Licensed to the Apache Software Foundation (ASF) under one
003        or more contributor license agreements.  See the NOTICE file
004        distributed with this work for additional information
005        regarding copyright ownership.  The ASF licenses this file
006        to you under the Apache License, Version 2.0 (the
007        "License"); you may not use this file except in compliance
008        with the License.  You may obtain a copy of the License at
009    
010           http://www.apache.org/licenses/LICENSE-2.0
011    
012        Unless required by applicable law or agreed to in writing,
013        software distributed under the License is distributed on an
014        "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015        KIND, either express or implied.  See the License for the
016        specific language governing permissions and limitations
017        under the License.  
018     */
019    
020    package org.apache.wiki.util.comparators;
021    
022    import java.util.Comparator;
023    
024    import org.apache.commons.lang.StringUtils;
025    
026    /**
027     * A comparator that sorts Strings using "human" ordering, including decimal
028     * ordering. Only works for languages where every character is lexigraphically
029     * distinct and correctly unicode ordered (e.g. English). Other languages should use
030     * <code>CollatedHumanComparator</code>. Pretty efficient but still slower than
031     * String.compareTo().
032     * 
033     */
034    public class HumanComparator implements Comparator< String > {
035    
036        // Constants for categorising characters and specifying category level
037        // ordering
038        public enum CharType
039        {
040            TYPE_OTHER, TYPE_DIGIT, TYPE_LETTER
041        }
042    
043        // A special singleton instance for quick access
044        public static final Comparator<String> DEFAULT_HUMAN_COMPARATOR = new HumanComparator();
045    
046        /**
047         * Returns a singleton comparator that implements the default behaviour.
048         * 
049         * @return the singleton comparator.
050         */
051        public static Comparator<String> getInstance()
052        {
053            return DEFAULT_HUMAN_COMPARATOR;
054        }
055    
056        private CharType[] sortOrder = { CharType.TYPE_OTHER, CharType.TYPE_DIGIT, CharType.TYPE_LETTER };
057    
058        /**
059         * Default constructor which does nothing. Here because it has a non-default
060         * constructor.
061         */
062        public HumanComparator()
063        {
064            // Empty
065        }
066    
067        /**
068         * Constructor specifying all the character type order.
069         * 
070         * @param sortOrder see setSortOrder
071         */
072        public HumanComparator( CharType[] sortOrder )
073        {
074            setSortOrder( sortOrder );
075        }
076    
077        /*
078         * (non-Javadoc)
079         * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
080         */
081        public int compare( String str1, String str2 )
082        {
083            // Some quick and easy checks
084            if( StringUtils.equals( str1, str2 ) ) {
085                // they're identical, possibly both null
086                return 0;
087            } else if ( str1 == null ) {
088                // str1 is null and str2 isn't so str1 is smaller
089                return -1;
090            } else if ( str2 == null ) {
091                // str2 is null and str1 isn't so str1 is bigger
092                return 1;
093            }
094    
095            char[] s1 = str1.toCharArray();
096            char[] s2 = str2.toCharArray();
097            int len1 = s1.length;
098            int len2 = s2.length;
099            int idx = 0;
100            // caseComparison used to defer a case sensitive comparison
101            int caseComparison = 0;
102    
103            while ( idx < len1 && idx < len2 )
104            {
105                char c1 = s1[idx];
106                char c2 = s2[idx++];
107    
108                // Convert to lower case
109                char lc1 = Character.toLowerCase( c1 );
110                char lc2 = Character.toLowerCase( c2 );
111    
112                // If case makes a difference, note the difference the first time
113                // it's encountered
114                if( caseComparison == 0 && c1 != c2 && lc1 == lc2 )
115                {
116                    if( Character.isLowerCase( c1 ) )
117                        caseComparison = 1;
118                    else if( Character.isLowerCase( c2 ) )
119                        caseComparison = -1;
120                }
121                // Do the rest of the tests in lower case
122                c1 = lc1;
123                c2 = lc2;
124    
125                // leading zeros are a special case
126                if( c1 != c2 || c1 == '0' )
127                {
128                    // They might be different, now we can do a comparison
129                    CharType type1 = mapCharTypes( c1 );
130                    CharType type2 = mapCharTypes( c2 );
131    
132                    // Do the character class check
133                    int result = compareCharTypes( type1, type2 );
134                    if( result != 0 )
135                    {
136                        // different character classes so that's sufficient
137                        return result;
138                    }
139    
140                    // If they're not digits, use character to character comparison
141                    if( type1 != CharType.TYPE_DIGIT )
142                    {
143                        Character ch1 = Character.valueOf( c1 );
144                        Character ch2 = Character.valueOf( c2 );
145                        return ch1.compareTo( ch2 );
146                    }
147    
148                    // The only way to get here is both characters are digits
149                    assert( type1 == CharType.TYPE_DIGIT && type2 == CharType.TYPE_DIGIT );
150                    result = compareDigits( s1, s2, idx - 1 );
151                    if( result != 0 )
152                    {
153                        // Got a result so return it
154                        return result;
155                    }
156    
157                    // No result yet, spin through the digits and continue trying
158                    while ( idx < len1 && idx < len2 && Character.isDigit( s1[idx] ) ) {
159                        idx++;
160                    }
161                }
162            }
163    
164            if( len1 == len2 )
165            {
166                // identical so return any case dependency
167                return caseComparison;
168            }
169    
170            // Shorter String is less
171            return len1 - len2;
172        }
173    
174        /**
175         * Implements ordering based on broad categories (e.g. numbers are always
176         * less than digits)
177         * 
178         * @param type1 first CharType
179         * @param type2 second CharType
180         * @return -1 if type1 < type2, 0 if type1 == type2, 1 if type1 > type2
181         */
182        private int compareCharTypes( CharType type1, CharType type2 )
183        {
184            if( type1 == type2 )
185            {
186                // Same type so equal
187                return 0;
188            } else if( type1 == sortOrder[0] )
189            {
190                // t1 is the lowest order and t2 isn't so t1 must be less
191                return -1;
192            } else if( type2 == sortOrder[0] )
193            {
194                // t2 is the lowest order and t1 isn't so t1 must be more
195                return 1;
196            } else if( type1 == sortOrder[1] )
197            {
198                // t1 is the middle order and t2 isn't so t1 must be less
199                return -1;
200            } else if( type2 == sortOrder[1] )
201            {
202                // t2 is the middle order and t1 isn't so t1 must be more
203                return 1;
204            } else
205            {
206                // Can't possibly get here as that would mean they're both sortOrder[2]
207                assert( type1 != type2 );
208                return 0;
209            }
210        }
211    
212        /**
213         * Do a numeric comparison on two otherwise identical char arrays.
214         * 
215         * @param left the left hand character array.
216         * @param offset the index of the first digit of the number in both char
217         *            arrays.
218         * @return negative, zero or positive depending on the numeric comparison of
219         *         left and right.
220         */
221        private int compareDigits( char[] left, char[] right, int offset )
222        {
223            // Calculate the integer value of the left hand side
224            int idx = offset;
225            while ( idx < left.length && Character.isDigit( left[idx] ) ) {
226                idx++;
227            }
228            int leftLen = idx - offset;
229            int leftValue = Integer.valueOf( new String( left, offset, leftLen ) );
230    
231            // Calculate the integer value of the right hand side
232            idx = offset;
233            while ( idx < right.length && Character.isDigit( right[idx] ) ) {
234                idx++;
235            }
236            int rightLen = idx - offset;
237            int rightValue = Integer.valueOf( new String( right, offset, rightLen ) );
238    
239            if( leftValue == rightValue ) {
240                return leftLen - rightLen; // Same value so use the lengths
241            }
242            return leftValue - rightValue; // Otherwise compare the values
243        }
244    
245        public CharType[] getSortOrder()
246        {
247            return sortOrder;
248        }
249    
250        /**
251         * Very broadly characterises a character as a digit, a letter or a punctuation character.
252         * 
253         * @param c <code>char</code> to be characterised
254         * @return <code>IS_DIGIT</code> if it's a digit, <code>IS_LETTER</code> if
255         *         it's a letter, <code>IS_PUNC</code> otherwise.
256         */
257        private CharType mapCharTypes( char c ) {
258            if( Character.isDigit( c ) ) {
259                return CharType.TYPE_DIGIT;
260            } else if( Character.isLetter( c ) ) {
261                return CharType.TYPE_LETTER;
262            } else {
263                return CharType.TYPE_OTHER;
264            }
265        }
266    
267        /**
268         * Set the order in which letters, numbers and everything else is presented.
269         * Default is other, digits and then letters. For example, the strings
270         * "abb", "a1b" and "a b" will sort in the order "a b", "a1b" then "abb" by
271         * default.
272         * 
273         * @param sortOrder Must be an array of <code>CharType</code> containing
274         *            exactly 3 elements each of which must be distinct.
275         * @throws IllegalArgumentException if being called on the result of
276         *             <code>HumanStringComparator.getInstance()</code> or
277         *             <code>sortOrder</code> is not exactly 3 different
278         *             <code>CharType</code>.
279         */
280        public void setSortOrder( CharType[] sortOrder ) {
281            if( this == DEFAULT_HUMAN_COMPARATOR ) {
282                throw new IllegalArgumentException( "Can't call setters on default " + HumanComparator.class.getName() );
283            }
284    
285            // Sanity check the sort order
286            if( sortOrder == null || sortOrder.length != 3 ) {
287                throw new IllegalArgumentException( "There must be exactly three elements in the sort order" );
288            }
289            if( sortOrder[0] == sortOrder[1] || sortOrder[0] == sortOrder[2] || sortOrder[1] == sortOrder[2] ) {
290                throw new IllegalArgumentException( "The sort order must contain EXACTLY one of each CharType" );
291            }
292            this.sortOrder = sortOrder.clone();
293        }
294    
295    }