001    /*
002        Licensed to the Apache Software Foundation (ASF) under one
003        or more contributor license agreements.  See the NOTICE file
004        distributed with this work for additional information
005        regarding copyright ownership.  The ASF licenses this file
006        to you under the Apache License, Version 2.0 (the
007        "License"); you may not use this file except in compliance
008        with the License.  You may obtain a copy of the License at
009    
010           http://www.apache.org/licenses/LICENSE-2.0
011    
012        Unless required by applicable law or agreed to in writing,
013        software distributed under the License is distributed on an
014        "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015        KIND, either express or implied.  See the License for the
016        specific language governing permissions and limitations
017        under the License.  
018     */
019    
020    package org.apache.wiki.parser;
021    
022    import java.util.*;
023    
024    import org.apache.log4j.Logger;
025    import org.jdom2.Attribute;
026    
027    /**
028     *  Parses JSPWiki-style "augmented" link markup into a Link object
029     *  containing the link text, link reference, and any optional link
030     *  attributes (as JDOM Attributes).
031     *  <p>
032     *  The parser recognizes three link forms:
033     *  </p>
034     *  <ol>
035     *    <li><tt> [Text] </tt></li>
036     *    <li><tt> [Text | Link] </tt></li>
037     *    <li><tt> [Text | Link | attributes] </tt></li>
038     *  </ol>
039     *  <p>
040     *  where the attributes are space-delimited, each in the form of
041     *  </p>
042     *  <pre>
043     *      name1='value1' name2='value2' name3='value3' (etc.) </pre>
044     *  <p>
045     *  If the attribute parsing fails, the parser will still return the
046     *  basic link, writing a warning to the log.
047     *  </p>
048     *
049     *  <h3>Permitted Attributes</h3>
050     *  <p>
051     *  Attributes that aren't declared on <tt>&lt;a&gt;</tt> or those that
052     *  permit scripting in HTML (as this is a security risk) are ignored
053     *  and have no effect on parsing, nor show up in the resulting attribute
054     *  list). The 'href' and 'name' attributes are also ignored as spurious.
055     *  The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
056     *  'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' ,
057     *  'title', and 'type'. The declared attributes that will be ignored
058     *  are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
059     *  of the other 'on*' event attributes.
060     *  </p>
061     *  <p>
062     *  The permitted attributes and target attribute values are static
063     *  String arrays ({@link #PERMITTED_ATTRIBUTES} and
064     *  {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
065     *  modified (i.e., predeclared).
066     *  </p>
067     *
068     *  <h3>Permitted Values on Target Attribute</h3>
069     *  <p>
070     *  The following target names are reserved in HTML 4 and have special
071     *  meanings. These are the only values permitted by the parser.
072     *  <dl>
073     *    <dt><b>_blank</b></dt>
074     *    <dd> The user agent should load the designated document in a new,
075     *    unnamed window. </dd>
076     *    <dt><b>_self</b></dt>
077     *    <dd> The user agent should load the document in the same frame as
078     *    the element that refers to this target. </dd>
079     *    <dt><b>_parent</b></dt>
080     *    <dd> The user agent should load the document into the immediate
081     *    FRAMESET parent of the current frame. This value is equivalent to
082     *    _self if the current frame has no parent. </dd>
083     *    <dt><b>_top</b></dt>
084     *    <dd> The user agent should load the document into the full,
085     *    original window (thus canceling all other frames). This value is
086     *    equivalent to _self if the current frame has no parent. </dd>
087     *  </dl>
088     *
089     *  <h3>Returned Value</h3>
090     *  <p>
091     *  This returns a <b>Link</b> object, a public inner class with methods:
092     *  <ul>
093     *    <li> <tt>getText()</tt> returns the link text. </li>
094     *    <li> <tt>getReference()</tt> returns the link reference value. </li>
095     *    <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
096     *    <li> <tt>getAttributes()</tt> returns an iterator over any validated
097     *        XHTML-compliant attributes, returned as JDOM Attributes.
098     *    </li>
099     *  </ul>
100     *  <p>
101     *  The <tt>attributeCount()</tt> method can be used to circumvent calling
102     *  <tt>getAttributes()</tt>, which will create an empty Iterator rather
103     *  than return a null.
104     *  </p>
105     *
106     *  <h3>Example: Link Form 1</h3>
107     *  <p>
108     *  From an incoming wikitext link of:
109     *  <pre>
110     *     [Acme] </pre>
111     *  returns:
112     *  <pre>
113     *    getText():         "Acme"
114     *    getReference():    "Acme"
115     *    attributeCount():  0
116     *    getAttributes():   an empty Iterator </pre>
117     *
118     *  <h3>Example: Link Form 2</h3>
119     *  <p>
120     *  From an incoming wikitext link of:
121     *  <pre>
122     *     [Acme | http://www.acme.com/] </pre>
123     *  returns:
124     *  <pre>
125     *    getText():         "Acme"
126     *    getReference():    "http://www.acme.com/"
127     *    attributeCount():  0
128     *    getAttributes():   an empty Iterator </pre>
129     *
130     *  <h3>Example: Link Form 3</h3>
131     *  <p>
132     *  From an incoming wikitext link of:
133     *  </p>
134     *  <pre>
135     *    [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
136     *  returns:
137     *  <pre>
138     *    getText():         "Acme"
139     *    getReference():    "http://www.acme.com/"
140     *    attributeCount():  2
141     *    getAttributes():   an Iterator containing:
142     *      JDOM Attribute:  id="foo"
143     *      JDOM Attribute:  rel="Next" </pre>
144     *
145     *
146     *  @since  2.5.10
147     */
148    public class LinkParser
149    {
150        private static Logger log = Logger.getLogger(LinkParser.class);
151    
152        /** Permitted attributes on links.  Keep this sorted. */
153        private static final String[] PERMITTED_ATTRIBUTES = new String[] {
154                "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
155                "rel", "rev", "style", "tabindex", "target", "title", "type" };
156    
157        /** Permitted values on the 'target' attribute. */
158        private static final String[] PERMITTED_TARGET_VALUES = new String[] {
159                "_blank", "_self", "_parent", "_top" };
160    
161        private static final String EQSQUO = "='";
162        private static final String SQUO   = "'";
163        private static final String EQ     = "=";
164        private static final String TARGET = "target";
165        private static final String DELIMS = " \t\n\r\f=";
166    
167        private static final List m_EMPTY = new ArrayList();
168    
169        // ............
170    
171    
172        /**
173         *  Processes incoming link text, separating out the link text, the link
174         *  URI, and then any specified attributes.
175         *
176         * @param  linktext  the wiki link text to be parsed
177         * @return a Link object containing the link text, reference, and any valid Attributes
178         * @throws ParseException if the parameter is null
179         */
180        public Link parse( String linktext ) throws ParseException
181        {
182            if( linktext == null )
183            {
184                throw new ParseException("null value passed to link parser");
185            }
186    
187            Link link = null;
188    
189            try
190            {
191                // establish link text and link ref
192                int cut1   = linktext.indexOf('|');
193                if( cut1 == -1 )
194                {
195                    //  link form 1:  [Acme]
196                    return new Link( linktext );
197                }
198    
199                int cut2 = cut1+1 < linktext.length()
200                        ? linktext.indexOf('|', cut1+1 )
201                        : -1 ;
202    
203                if ( cut2 == -1 )
204                {
205                    // link form 2:  [Acme | http://www.acme.com/]
206                    // text = Acme
207                    String text = linktext.substring( 0, cut1 ).trim();
208                    // ref = http://www.acme.com/
209                    String ref  = linktext.substring( cut1+1 ).trim();
210                    return new Link( text, ref );
211                }
212    
213                // link form 3:  [Acme | http://www.acme.com/ | id='foo' rel='Next']
214                String text    = linktext.substring( 0, cut1 ).trim();
215                String ref     = linktext.substring( cut1+1, cut2 ).trim();
216                // attribs = id='foo' rel='Next'
217                String attribs = linktext.substring( cut2+1 ).trim();
218    
219                link = new Link( text, ref );
220    
221                // parse attributes
222                // contains "='" that looks like attrib spec
223                if( attribs.indexOf(EQSQUO) != -1 )
224                {
225                    try
226                    {
227                        StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
228                        while ( tok.hasMoreTokens() )
229                        {
230                            // get attribute name token
231                            String token = tok.nextToken(DELIMS).trim();
232                            while ( isSpace(token) && tok.hasMoreTokens() )
233                            {
234                                // remove all whitespace
235                                token = tok.nextToken(DELIMS).trim();
236                            }
237    
238                            // eat '=', break after '='
239                            require( tok, EQ );
240                            // eat opening delim
241                            require( tok, SQUO );
242                            // using existing delim
243                            String value = tok.nextToken(SQUO);
244                            // eat closing delim
245                            require( tok, SQUO );
246    
247                            if( token != null && value != null )
248                            {
249                                if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 )
250                                {
251                                    // _blank _self _parent _top
252                                    if( !token.equals(TARGET)
253                                            || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 )
254                                    {
255                                        Attribute a = new Attribute(token,value);
256                                        link.addAttribute(a);
257                                    }
258                                    else
259                                    {
260                                        throw new ParseException("unknown target attribute value='"
261                                                                 + value + "' on link");
262                                    }
263                                }
264                                else
265                                {
266                                    throw new ParseException("unknown attribute name '"
267                                                             + token + "' on link");
268                                }
269                            }
270                            else
271                            {
272                                throw new ParseException("unable to parse link attributes '"
273                                                         + attribs + "'");
274    
275                            }
276                        }
277                    }
278                    catch( ParseException pe )
279                    {
280                        log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
281                    }
282                    catch( NoSuchElementException nse )
283                    {
284                        log.warn("expected more tokens while parsing link attributes '" + attribs + "'");
285                    }
286                }
287    
288            }
289            catch( Exception e )
290            {
291                log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
292            }
293    
294            return link;
295        }
296    
297    
298        private String require( StringTokenizer tok, String required )
299                throws ParseException, NoSuchElementException
300        {
301            String s = tok.nextToken(required);
302            if( !s.equals(required) )
303            {
304                throw new ParseException("expected '"+required+"' not '"+s+"'");
305            }
306            return s;
307        }
308    
309    
310        /** 
311         *  Returns true if the String <tt>s</tt> is completely
312         *  composed of whitespace.
313         * 
314         *  @param s The string to check
315         *  @return True, if "s" is all XML whitespace.
316         */
317        public static final boolean isSpace( String s )
318        {
319            for( int i = 0 ; i < s.length() ; i++ )
320            {
321                if( !isSpace( s.charAt(i)) ) return false;
322            }
323            return true;
324        }
325    
326    
327        /** 
328         *  Returns true if char <tt>c</tt> is a member of
329         *  <tt>S</tt> (space) [XML 1.1 production 3].
330         *  
331         *  @param c Character to check.
332         *  @return True, if the character is an XML space.
333         */
334        public static final boolean isSpace( char c )
335        {
336            // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
337            return
338               0x20 == c
339            || 0x0A == c
340            || 0x0D == c
341            || 0x09 == c
342            || 0x85 == c
343            || 0x2028 == c;
344        }
345    
346    
347        // .........................................................................
348    
349    
350        /**
351         *  Inner class serving as a struct containing the parsed
352         *  components of a link.
353         */
354        public static class Link
355        {
356            private String            m_text;
357            private String            m_ref = null;
358            private int               m_interwikiPoint = -1;
359            private List<Attribute>   m_attribs = null;
360    
361            /**
362             *  Create a new Link with text but no reference.
363             *  @param text The link text.
364             *  @throws ParseException If the link text is illegal.
365             */
366            protected Link( String text ) throws ParseException
367            {
368                setText(text);
369            }
370    
371            /**
372             *  Create a new link with a given text and hyperlink (reference).
373             *  
374             *  @param text The link text.
375             *  @param ref  The hypertext reference.
376             *  @throws ParseException If the link text or reference are illegal.
377             */
378            protected Link( String text, String ref ) throws ParseException
379            {
380                setText(text);
381                setReference(ref);
382            }
383    
384            /**
385             *  Sets the link text.
386             *  
387             *  @param text The link text.
388             *  @throws ParseException If the text is illegal (e.g. null).
389             */
390            protected void setText( String text ) throws ParseException
391            {
392                if( text == null )
393                {
394                    throw new ParseException("null link text");
395                }
396                m_text = text;
397            }
398    
399            /**
400             *  Returns the link text.
401             *  
402             *  @return Link text.
403             */
404            public String getText()
405            {
406                return m_text;
407            }
408    
409            /**
410             *  Sets the hypertext reference.  Typically, this is an URI or an interwiki link,
411             *  or a wikilink.
412             *  
413             *  @param ref The reference.
414             *  @throws ParseException If the reference is illegal.
415             */
416            protected void setReference( String ref ) throws ParseException
417            {
418                if( ref == null )
419                {
420                    throw new ParseException("null link reference value");
421                }
422                m_ref = ref;
423            }
424    
425            /**
426             *  Returns true, if there is a reference.
427             *  
428             *  @return True, if there's a reference; false otherwise.
429             */
430            public boolean hasReference()
431            {
432                return m_ref != null;
433            }
434    
435            /** 
436             *  Returns the link reference, or the link text if null. 
437             *  
438             *  @return A link reference.
439             */
440            public String getReference()
441            {
442                return m_ref != null
443                        ? m_ref
444                        : m_text ;
445            }
446    
447            /**
448             *  Returns true, if this Link represents an InterWiki link (of the form wiki:page).
449             * 
450             *  @return True, if this Link represents an InterWiki link.
451             */
452            public boolean isInterwikiLink()
453            {
454                if( !hasReference() ) m_ref = m_text;
455    
456                m_interwikiPoint = m_ref.indexOf(':');
457    
458                return m_interwikiPoint != -1;
459            }
460    
461            /**
462             *  Returns the name of the wiki if this is an interwiki link. 
463             *  <pre>
464             *    Link link = new Link("Foo","Wikipedia:Foobar");
465             *    assert( link.getExternalWikiPage(), "Wikipedia" );
466             *  </pre> 
467             *  
468             *  @return Name of the wiki, or null, if this is not an interwiki link.
469             */
470            public String getExternalWiki()
471            {
472                if( isInterwikiLink() )
473                {
474                    return m_ref.substring( 0, m_interwikiPoint );
475                }
476                
477                return null;
478            }
479    
480            /** 
481             *  Returns the wikiname part of an interwiki link. Used only with interwiki links.
482             *  <pre>
483             *    Link link = new Link("Foo","Wikipedia:Foobar");
484             *    assert( link.getExternalWikiPage(), "Foobar" );
485             *  </pre> 
486             *  
487             *  @return Wikiname part, or null, if this is not an interwiki link.
488             */
489            public String getExternalWikiPage()
490            {
491                if( isInterwikiLink() )
492                {
493                    return m_ref.substring( m_interwikiPoint+1 );
494                }
495                
496                return null;
497            }
498    
499            /**
500             *  Returns the number of attributes on this link.
501             *  
502             *  @return The number of attributes.
503             */
504            public int attributeCount()
505            {
506                return m_attribs != null
507                        ? m_attribs.size()
508                        : 0 ;
509            }
510    
511            /**
512             *  Adds another attribute to the link.
513             *  
514             *  @param attr A JDOM Attribute.
515             */
516            public void addAttribute( Attribute attr )
517            {
518                if( m_attribs == null )
519                {
520                    m_attribs = new ArrayList<Attribute>();
521                }
522                m_attribs.add(attr);
523            }
524    
525            /** 
526             *  Returns an Iterator over the list of JDOM Attributes.
527             *  
528             *  @return Iterator over the attributes.
529             */
530            public Iterator getAttributes()
531            {
532                return m_attribs != null
533                        ? m_attribs.iterator()
534                        : m_EMPTY.iterator() ;
535            }
536    
537            /** 
538             *  Returns a wikitext string representation of this Link. 
539             *  @return WikiText.
540             */
541            public String toString()
542            {
543                StringBuffer sb = new StringBuffer();
544                sb.append( '[' );
545                sb.append( m_text );
546    
547                if( m_ref != null )
548                {
549                    sb.append( ' ' );
550                    sb.append( '|' );
551                    sb.append( ' ' );
552                    sb.append( m_ref );
553                }
554    
555                if( m_attribs != null )
556                {
557                    sb.append( ' ' );
558                    sb.append( '|' );
559                    Iterator it = getAttributes();
560                    while ( it.hasNext() )
561                    {
562                        Attribute a = (Attribute)it.next();
563                        sb.append( ' ' );
564                        sb.append( a.getName() );
565                        sb.append( '=' );
566                        sb.append( '\'' );
567                        sb.append( a.getValue() );
568                        sb.append( '\'' );
569                    }
570                }
571                sb.append( ']' );
572                return sb.toString();
573            }
574    
575        }
576        // end inner class
577    
578    }