001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.
018 */
019
020package org.apache.wiki.parser;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Iterator;
025import java.util.List;
026import java.util.NoSuchElementException;
027import java.util.StringTokenizer;
028
029import org.apache.log4j.Logger;
030import org.jdom2.Attribute;
031
032/**
033 *  Parses JSPWiki-style "augmented" link markup into a Link object
034 *  containing the link text, link reference, and any optional link
035 *  attributes (as JDOM Attributes).
036 *  <p>
037 *  The parser recognizes three link forms:
038 *  </p>
039 *  <ol>
040 *    <li><tt> [Text] </tt></li>
041 *    <li><tt> [Text | Link] </tt></li>
042 *    <li><tt> [Text | Link | attributes] </tt></li>
043 *  </ol>
044 *  <p>
045 *  where the attributes are space-delimited, each in the form of
046 *  </p>
047 *  <pre>
048 *      name1='value1' name2='value2' name3='value3' (etc.) </pre>
049 *  <p>
050 *  If the attribute parsing fails, the parser will still return the
051 *  basic link, writing a warning to the log.
052 *  </p>
053 *
054 *  <h3>Permitted Attributes</h3>
055 *  <p>
 *  Attributes that aren't declared on <tt>&lt;a&gt;</tt>, or that
 *  permit scripting in HTML (a security risk), are rejected and never
 *  show up in the resulting attribute list. Note that a rejected
 *  attribute also ends attribute parsing for that link: a warning is
 *  logged and any attributes following it are dropped, while attributes
 *  accepted before it are kept. The 'href' and 'name' attributes are
 *  likewise rejected as spurious.
060 *  The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
061 *  'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' ,
062 *  'title', and 'type'. The declared attributes that will be ignored
063 *  are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
064 *  of the other 'on*' event attributes.
065 *  </p>
066 *  <p>
067 *  The permitted attributes and target attribute values are static
068 *  String arrays ({@link #PERMITTED_ATTRIBUTES} and
069 *  {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
070 *  modified (i.e., predeclared).
071 *  </p>
072 *
073 *  <h3>Permitted Values on Target Attribute</h3>
074 *  <p>
075 *  The following target names are reserved in HTML 4 and have special
076 *  meanings. These are the only values permitted by the parser.
077 *  <dl>
078 *    <dt><b>_blank</b></dt>
079 *    <dd> The user agent should load the designated document in a new,
080 *    unnamed window. </dd>
081 *    <dt><b>_self</b></dt>
082 *    <dd> The user agent should load the document in the same frame as
083 *    the element that refers to this target. </dd>
084 *    <dt><b>_parent</b></dt>
085 *    <dd> The user agent should load the document into the immediate
086 *    FRAMESET parent of the current frame. This value is equivalent to
087 *    _self if the current frame has no parent. </dd>
088 *    <dt><b>_top</b></dt>
089 *    <dd> The user agent should load the document into the full,
090 *    original window (thus canceling all other frames). This value is
091 *    equivalent to _self if the current frame has no parent. </dd>
092 *  </dl>
093 *
094 *  <h3>Returned Value</h3>
095 *  <p>
096 *  This returns a <b>Link</b> object, a public inner class with methods:
097 *  <ul>
098 *    <li> <tt>getText()</tt> returns the link text. </li>
099 *    <li> <tt>getReference()</tt> returns the link reference value. </li>
100 *    <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
101 *    <li> <tt>getAttributes()</tt> returns an iterator over any validated
102 *        XHTML-compliant attributes, returned as JDOM Attributes.
103 *    </li>
104 *  </ul>
105 *  <p>
106 *  The <tt>attributeCount()</tt> method can be used to circumvent calling
107 *  <tt>getAttributes()</tt>, which will create an empty Iterator rather
108 *  than return a null.
109 *  </p>
110 *
111 *  <h3>Example: Link Form 1</h3>
112 *  <p>
113 *  From an incoming wikitext link of:
114 *  <pre>
115 *     [Acme] </pre>
116 *  returns:
117 *  <pre>
118 *    getText():         "Acme"
119 *    getReference():    "Acme"
120 *    attributeCount():  0
121 *    getAttributes():   an empty Iterator </pre>
122 *
123 *  <h3>Example: Link Form 2</h3>
124 *  <p>
125 *  From an incoming wikitext link of:
126 *  <pre>
127 *     [Acme | http://www.acme.com/] </pre>
128 *  returns:
129 *  <pre>
130 *    getText():         "Acme"
131 *    getReference():    "http://www.acme.com/"
132 *    attributeCount():  0
133 *    getAttributes():   an empty Iterator </pre>
134 *
135 *  <h3>Example: Link Form 3</h3>
136 *  <p>
137 *  From an incoming wikitext link of:
138 *  </p>
139 *  <pre>
140 *    [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
141 *  returns:
142 *  <pre>
143 *    getText():         "Acme"
144 *    getReference():    "http://www.acme.com/"
145 *    attributeCount():  2
146 *    getAttributes():   an Iterator containing:
147 *      JDOM Attribute:  id="foo"
148 *      JDOM Attribute:  rel="Next" </pre>
149 *
150 *
151 *  @since  2.5.10
152 */
153public class LinkParser
154{
    /** Logger used to report (as warnings) problems met while parsing link markup. */
    private static Logger log = Logger.getLogger(LinkParser.class);
156
157    /** Permitted attributes on links.  Keep this sorted. */
158    private static final String[] PERMITTED_ATTRIBUTES = new String[] {
159            "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
160            "rel", "rev", "style", "tabindex", "target", "title", "type" };
161
162    /** Permitted values on the 'target' attribute. */
163    private static final String[] PERMITTED_TARGET_VALUES = new String[] {
164            "_blank", "_self", "_parent", "_top" };
165
166    private static final String EQSQUO = "='";
167    private static final String SQUO   = "'";
168    private static final String EQ     = "=";
169    private static final String TARGET = "target";
170    private static final String DELIMS = " \t\n\r\f=";
171
    /** Shared list that is never added to here; Link.getAttributes() returns its iterator when a link has no attributes. */
    private static final List< Attribute > m_EMPTY = new ArrayList< >();
173
174    // ............
175
176
177    /**
178     *  Processes incoming link text, separating out the link text, the link
179     *  URI, and then any specified attributes.
180     *
181     * @param  linktext  the wiki link text to be parsed
182     * @return a Link object containing the link text, reference, and any valid Attributes
183     * @throws ParseException if the parameter is null
184     */
185    public Link parse( String linktext ) throws ParseException
186    {
187        if( linktext == null )
188        {
189            throw new ParseException("null value passed to link parser");
190        }
191
192        Link link = null;
193
194        try
195        {
196            // establish link text and link ref
197            int cut1   = linktext.indexOf('|');
198            if( cut1 == -1 )
199            {
200                //  link form 1:  [Acme]
201                return new Link( linktext );
202            }
203
204            int cut2 = cut1+1 < linktext.length()
205                    ? linktext.indexOf('|', cut1+1 )
206                    : -1 ;
207
208            if ( cut2 == -1 )
209            {
210                // link form 2:  [Acme | http://www.acme.com/]
211                // text = Acme
212                String text = linktext.substring( 0, cut1 ).trim();
213                // ref = http://www.acme.com/
214                String ref  = linktext.substring( cut1+1 ).trim();
215                return new Link( text, ref );
216            }
217
218            // link form 3:  [Acme | http://www.acme.com/ | id='foo' rel='Next']
219            String text    = linktext.substring( 0, cut1 ).trim();
220            String ref     = linktext.substring( cut1+1, cut2 ).trim();
221            // attribs = id='foo' rel='Next'
222            String attribs = linktext.substring( cut2+1 ).trim();
223
224            link = new Link( text, ref );
225
226            // parse attributes
227            // contains "='" that looks like attrib spec
228            if( attribs.indexOf(EQSQUO) != -1 )
229            {
230                try
231                {
232                    StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
233                    while ( tok.hasMoreTokens() )
234                    {
235                        // get attribute name token
236                        String token = tok.nextToken(DELIMS).trim();
237                        while ( isSpace(token) && tok.hasMoreTokens() )
238                        {
239                            // remove all whitespace
240                            token = tok.nextToken(DELIMS).trim();
241                        }
242
243                        // eat '=', break after '='
244                        require( tok, EQ );
245                        // eat opening delim
246                        require( tok, SQUO );
247                        // using existing delim
248                        String value = tok.nextToken(SQUO);
249                        // eat closing delim
250                        require( tok, SQUO );
251
252                        if( token != null && value != null )
253                        {
254                            if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 )
255                            {
256                                // _blank _self _parent _top
257                                if( !token.equals(TARGET)
258                                        || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 )
259                                {
260                                    Attribute a = new Attribute(token,value);
261                                    link.addAttribute(a);
262                                }
263                                else
264                                {
265                                    throw new ParseException("unknown target attribute value='"
266                                                             + value + "' on link");
267                                }
268                            }
269                            else
270                            {
271                                throw new ParseException("unknown attribute name '"
272                                                         + token + "' on link");
273                            }
274                        }
275                        else
276                        {
277                            throw new ParseException("unable to parse link attributes '"
278                                                     + attribs + "'");
279
280                        }
281                    }
282                }
283                catch( ParseException pe )
284                {
285                    log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
286                }
287                catch( NoSuchElementException nse )
288                {
289                    log.warn("expected more tokens while parsing link attributes '" + attribs + "'");
290                }
291            }
292
293        }
294        catch( Exception e )
295        {
296            log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
297        }
298
299        return link;
300    }
301
302
303    private String require( StringTokenizer tok, String required )
304            throws ParseException, NoSuchElementException
305    {
306        String s = tok.nextToken(required);
307        if( !s.equals(required) )
308        {
309            throw new ParseException("expected '"+required+"' not '"+s+"'");
310        }
311        return s;
312    }
313
314
315    /**
316     *  Returns true if the String <tt>s</tt> is completely
317     *  composed of whitespace.
318     *
319     *  @param s The string to check
320     *  @return True, if "s" is all XML whitespace.
321     */
322    public static final boolean isSpace( String s )
323    {
324        for( int i = 0 ; i < s.length() ; i++ )
325        {
326            if( !isSpace( s.charAt(i)) ) return false;
327        }
328        return true;
329    }
330
331
332    /**
333     *  Returns true if char <tt>c</tt> is a member of
334     *  <tt>S</tt> (space) [XML 1.1 production 3].
335     *
336     *  @param c Character to check.
337     *  @return True, if the character is an XML space.
338     */
339    public static final boolean isSpace( char c )
340    {
341        // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
342        return
343           0x20 == c
344        || 0x0A == c
345        || 0x0D == c
346        || 0x09 == c
347        || 0x85 == c
348        || 0x2028 == c;
349    }
350
351
352    // .........................................................................
353
354
355    /**
356     *  Inner class serving as a struct containing the parsed
357     *  components of a link.
358     */
359    public static class Link
360    {
361        private String            m_text;
362        private String            m_ref = null;
363        private int               m_interwikiPoint = -1;
364        private List<Attribute>   m_attribs = null;
365
366        /**
367         *  Create a new Link with text but no reference.
368         *  @param text The link text.
369         *  @throws ParseException If the link text is illegal.
370         */
371        protected Link( String text ) throws ParseException
372        {
373            setText(text);
374        }
375
376        /**
377         *  Create a new link with a given text and hyperlink (reference).
378         *
379         *  @param text The link text.
380         *  @param ref  The hypertext reference.
381         *  @throws ParseException If the link text or reference are illegal.
382         */
383        protected Link( String text, String ref ) throws ParseException
384        {
385            setText(text);
386            setReference(ref);
387        }
388
389        /**
390         *  Sets the link text.
391         *
392         *  @param text The link text.
393         *  @throws ParseException If the text is illegal (e.g. null).
394         */
395        protected void setText( String text ) throws ParseException
396        {
397            if( text == null )
398            {
399                throw new ParseException("null link text");
400            }
401            m_text = text;
402        }
403
404        /**
405         *  Returns the link text.
406         *
407         *  @return Link text.
408         */
409        public String getText()
410        {
411            return m_text;
412        }
413
414        /**
415         *  Sets the hypertext reference.  Typically, this is an URI or an interwiki link,
416         *  or a wikilink.
417         *
418         *  @param ref The reference.
419         *  @throws ParseException If the reference is illegal.
420         */
421        protected void setReference( String ref ) throws ParseException
422        {
423            if( ref == null )
424            {
425                throw new ParseException("null link reference value");
426            }
427            m_ref = ref;
428        }
429
430        /**
431         *  Returns true, if there is a reference.
432         *
433         *  @return True, if there's a reference; false otherwise.
434         */
435        public boolean hasReference()
436        {
437            return m_ref != null;
438        }
439
440        /**
441         *  Returns the link reference, or the link text if null.
442         *
443         *  @return A link reference.
444         */
445        public String getReference()
446        {
447            return m_ref != null
448                    ? m_ref
449                    : m_text ;
450        }
451
452        /**
453         *  Returns true, if this Link represents an InterWiki link (of the form wiki:page).
454         *
455         *  @return True, if this Link represents an InterWiki link.
456         */
457        public boolean isInterwikiLink()
458        {
459            LinkParsingOperations lpo = new LinkParsingOperations( null );
460            if( !hasReference() ) m_ref = m_text;
461            m_interwikiPoint = lpo.interWikiLinkAt( m_ref );
462            return lpo.isInterWikiLink( m_ref );
463        }
464
465        /**
466         *  Returns the name of the wiki if this is an interwiki link.
467         *  <pre>
468         *    Link link = new Link("Foo","Wikipedia:Foobar");
469         *    assert( link.getExternalWikiPage(), "Wikipedia" );
470         *  </pre>
471         *
472         *  @return Name of the wiki, or null, if this is not an interwiki link.
473         */
474        public String getExternalWiki()
475        {
476            if( isInterwikiLink() )
477            {
478                return m_ref.substring( 0, m_interwikiPoint );
479            }
480
481            return null;
482        }
483
484        /**
485         *  Returns the wikiname part of an interwiki link. Used only with interwiki links.
486         *  <pre>
487         *    Link link = new Link("Foo","Wikipedia:Foobar");
488         *    assert( link.getExternalWikiPage(), "Foobar" );
489         *  </pre>
490         *
491         *  @return Wikiname part, or null, if this is not an interwiki link.
492         */
493        public String getExternalWikiPage()
494        {
495            if( isInterwikiLink() )
496            {
497                return m_ref.substring( m_interwikiPoint+1 );
498            }
499
500            return null;
501        }
502
503        /**
504         *  Returns the number of attributes on this link.
505         *
506         *  @return The number of attributes.
507         */
508        public int attributeCount()
509        {
510            return m_attribs != null
511                    ? m_attribs.size()
512                    : 0 ;
513        }
514
515        /**
516         *  Adds another attribute to the link.
517         *
518         *  @param attr A JDOM Attribute.
519         */
520        public void addAttribute( Attribute attr )
521        {
522            if( m_attribs == null )
523            {
524                m_attribs = new ArrayList<>();
525            }
526            m_attribs.add(attr);
527        }
528
529        /**
530         *  Returns an Iterator over the list of JDOM Attributes.
531         *
532         *  @return Iterator over the attributes.
533         */
534        public Iterator< Attribute > getAttributes()
535        {
536            return m_attribs != null
537                    ? m_attribs.iterator()
538                    : m_EMPTY.iterator() ;
539        }
540
541        /**
542         *  Returns a wikitext string representation of this Link.
543         *  @return WikiText.
544         */
545        @Override
546        public String toString()
547        {
548            StringBuilder sb = new StringBuilder();
549            sb.append( '[' );
550            sb.append( m_text );
551
552            if( m_ref != null )
553            {
554                sb.append( ' ' );
555                sb.append( '|' );
556                sb.append( ' ' );
557                sb.append( m_ref );
558            }
559
560            if( m_attribs != null )
561            {
562                sb.append( ' ' );
563                sb.append( '|' );
564                Iterator< Attribute > it = getAttributes();
565                while ( it.hasNext() )
566                {
567                    Attribute a = it.next();
568                    sb.append( ' ' );
569                    sb.append( a.getName() );
570                    sb.append( '=' );
571                    sb.append( '\'' );
572                    sb.append( a.getValue() );
573                    sb.append( '\'' );
574                }
575            }
576            sb.append( ']' );
577            return sb.toString();
578        }
579
580    }
581    // end inner class
582
583}