001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.
018 */
019
020package org.apache.wiki.parser;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Iterator;
025import java.util.List;
026import java.util.NoSuchElementException;
027import java.util.StringTokenizer;
028
029import org.apache.logging.log4j.LogManager;
030import org.apache.logging.log4j.Logger;
031import org.jdom2.Attribute;
032
033/**
034 *  Parses JSPWiki-style "augmented" link markup into a Link object
035 *  containing the link text, link reference, and any optional link
036 *  attributes (as JDOM Attributes).
037 *  <p>
038 *  The parser recognizes three link forms:
039 *  </p>
040 *  <ol>
041 *    <li><tt> [Text] </tt></li>
042 *    <li><tt> [Text | Link] </tt></li>
043 *    <li><tt> [Text | Link | attributes] </tt></li>
044 *  </ol>
045 *  <p>
046 *  where the attributes are space-delimited, each in the form of
047 *  </p>
048 *  <pre>
049 *      name1='value1' name2='value2' name3='value3' (etc.) </pre>
050 *  <p>
051 *  If the attribute parsing fails, the parser will still return the
052 *  basic link, writing a warning to the log.
053 *  </p>
054 *
055 *  <h3>Permitted Attributes</h3>
056 *  <p>
 *  Attributes that aren't declared on <tt>&lt;a&gt;</tt>, or that
 *  permit scripting in HTML (a security risk), are rejected and do not
 *  show up in the resulting attribute list. Attribute parsing stops at
 *  the first rejected attribute: a warning is logged and any attributes
 *  that follow it are dropped, while those already accepted are kept.
 *  The 'href' and 'name' attributes are also rejected as spurious.
061 *  The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
062 *  'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' ,
063 *  'title', and 'type'. The declared attributes that will be ignored
064 *  are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
065 *  of the other 'on*' event attributes.
066 *  </p>
067 *  <p>
068 *  The permitted attributes and target attribute values are static
069 *  String arrays ({@link #PERMITTED_ATTRIBUTES} and
070 *  {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
071 *  modified (i.e., predeclared).
072 *  </p>
073 *
074 *  <h3>Permitted Values on Target Attribute</h3>
075 *  <p>
076 *  The following target names are reserved in HTML 4 and have special
077 *  meanings. These are the only values permitted by the parser.
078 *  <dl>
079 *    <dt><b>_blank</b></dt>
080 *    <dd> The user agent should load the designated document in a new,
081 *    unnamed window. </dd>
082 *    <dt><b>_self</b></dt>
083 *    <dd> The user agent should load the document in the same frame as
084 *    the element that refers to this target. </dd>
085 *    <dt><b>_parent</b></dt>
086 *    <dd> The user agent should load the document into the immediate
087 *    FRAMESET parent of the current frame. This value is equivalent to
088 *    _self if the current frame has no parent. </dd>
089 *    <dt><b>_top</b></dt>
090 *    <dd> The user agent should load the document into the full,
091 *    original window (thus canceling all other frames). This value is
092 *    equivalent to _self if the current frame has no parent. </dd>
093 *  </dl>
094 *
095 *  <h3>Returned Value</h3>
096 *  <p>
097 *  This returns a <b>Link</b> object, a public inner class with methods:
098 *  <ul>
099 *    <li> <tt>getText()</tt> returns the link text. </li>
100 *    <li> <tt>getReference()</tt> returns the link reference value. </li>
101 *    <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
102 *    <li> <tt>getAttributes()</tt> returns an iterator over any validated
103 *        XHTML-compliant attributes, returned as JDOM Attributes.
104 *    </li>
105 *  </ul>
106 *  <p>
107 *  The <tt>attributeCount()</tt> method can be used to circumvent calling
108 *  <tt>getAttributes()</tt>, which will create an empty Iterator rather
109 *  than return a null.
110 *  </p>
111 *
112 *  <h3>Example: Link Form 1</h3>
113 *  <p>
114 *  From an incoming wikitext link of:
115 *  <pre>
116 *     [Acme] </pre>
117 *  returns:
118 *  <pre>
119 *    getText():         "Acme"
120 *    getReference():    "Acme"
121 *    attributeCount():  0
122 *    getAttributes():   an empty Iterator </pre>
123 *
124 *  <h3>Example: Link Form 2</h3>
125 *  <p>
126 *  From an incoming wikitext link of:
127 *  <pre>
128 *     [Acme | http://www.acme.com/] </pre>
129 *  returns:
130 *  <pre>
131 *    getText():         "Acme"
132 *    getReference():    "http://www.acme.com/"
133 *    attributeCount():  0
134 *    getAttributes():   an empty Iterator </pre>
135 *
136 *  <h3>Example: Link Form 3</h3>
137 *  <p>
138 *  From an incoming wikitext link of:
139 *  </p>
140 *  <pre>
141 *    [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
142 *  returns:
143 *  <pre>
144 *    getText():         "Acme"
145 *    getReference():    "http://www.acme.com/"
146 *    attributeCount():  2
147 *    getAttributes():   an Iterator containing:
148 *      JDOM Attribute:  id="foo"
149 *      JDOM Attribute:  rel="Next" </pre>
150 *
151 *
152 *  @since  2.5.10
153 */
154public class LinkParser
155{
156    private static final Logger log = LogManager.getLogger(LinkParser.class);
157
158    /** Permitted attributes on links.  Keep this sorted. */
159    private static final String[] PERMITTED_ATTRIBUTES = new String[] {
160            "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
161            "rel", "rev", "style", "tabindex", "target", "title", "type" };
162
163    /** Permitted values on the 'target' attribute. */
164    private static final String[] PERMITTED_TARGET_VALUES = new String[] {
165            "_blank", "_self", "_parent", "_top" };
166
167    /** Links with target="_blank" can expose your site to performance and security issues.
168        To fix, add rel="noopener" or rel="noreferrer" to these links.
169    */
170    private static final String REL = "rel";
171    private static final String NOREFERRER = "noreferrer";
172
173    private static final String EQSQUO = "='";
174    private static final String SQUO   = "'";
175    private static final String EQ     = "=";
176    private static final String TARGET = "target";
177    private static final String DELIMS = " \t\n\r\f=";
178
179    private static final List< Attribute > m_EMPTY = new ArrayList< >();
180
181    // ............
182
183
184    /**
185     *  Processes incoming link text, separating out the link text, the link
186     *  URI, and then any specified attributes.
187     *
188     * @param  linktext  the wiki link text to be parsed
189     * @return a Link object containing the link text, reference, and any valid Attributes
190     * @throws ParseException if the parameter is null
191     */
192    public Link parse(final String linktext ) throws ParseException
193    {
194        if( linktext == null )
195        {
196            throw new ParseException("null value passed to link parser");
197        }
198
199        Link link = null;
200
201        try
202        {
203            // establish link text and link ref
204            final int cut1   = linktext.indexOf('|');
205            if( cut1 == -1 )
206            {
207                //  link form 1:  [Acme]
208                return new Link( linktext );
209            }
210
211            final int cut2 = cut1+1 < linktext.length()
212                    ? linktext.indexOf('|', cut1+1 )
213                    : -1 ;
214
215            if ( cut2 == -1 )
216            {
217                // link form 2:  [Acme | http://www.acme.com/]
218                // text = Acme
219                final String text = linktext.substring( 0, cut1 ).trim();
220                // ref = http://www.acme.com/
221                final String ref  = linktext.substring( cut1+1 ).trim();
222                return new Link( text, ref );
223            }
224
225            // link form 3:  [Acme | http://www.acme.com/ | id='foo' rel='Next']
226            final String text    = linktext.substring( 0, cut1 ).trim();
227            final String ref     = linktext.substring( cut1+1, cut2 ).trim();
228            // attribs = id='foo' rel='Next'
229            final String attribs = linktext.substring( cut2+1 ).trim();
230
231            link = new Link( text, ref );
232
233            // parse attributes
234            // contains "='" that looks like attrib spec
235            if( attribs.indexOf(EQSQUO) != -1 )
236            {
237                try
238                {
239                    final StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
240                    while ( tok.hasMoreTokens() )
241                    {
242                        // get attribute name token
243                        String token = tok.nextToken(DELIMS).trim();
244                        while ( isSpace(token) && tok.hasMoreTokens() )
245                        {
246                            // remove all whitespace
247                            token = tok.nextToken(DELIMS).trim();
248                        }
249
250                        // eat '=', break after '='
251                        require( tok, EQ );
252                        // eat opening delim
253                        require( tok, SQUO );
254                        // using existing delim
255                        final String value = tok.nextToken(SQUO);
256                        // eat closing delim
257                        require( tok, SQUO );
258
259                        if( token != null && value != null )
260                        {
261                            if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 )
262                            {
263                                // _blank _self _parent _top
264                                if( !token.equals(TARGET)
265                                        || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 )
266                                {
267                                    final Attribute a = new Attribute(token,value);
268                                    link.addAttribute(a);
269
270                                    if( token.equals(TARGET) )
271                                    {
272                                        final Attribute rel = new Attribute(REL,NOREFERRER);
273                                        link.addAttribute(rel);
274                                   }
275
276                                }
277                                else
278                                {
279                                    throw new ParseException("unknown target attribute value='"
280                                                             + value + "' on link");
281                                }
282                            }
283                            else
284                            {
285                                throw new ParseException("unknown attribute name '"
286                                                         + token + "' on link");
287                            }
288                        }
289                        else
290                        {
291                            throw new ParseException("unable to parse link attributes '"
292                                                     + attribs + "'");
293
294                        }
295                    }
296                }
297                catch( final ParseException pe )
298                {
299                    log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
300                }
301                catch( final NoSuchElementException nse )
302                {
303                    log.warn("expected more tokens while parsing link attributes '" + attribs + "'");
304                }
305            }
306
307        }
308        catch( final Exception e )
309        {
310            log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
311        }
312
313        return link;
314    }
315
316
317    private String require(final StringTokenizer tok, final String required )
318            throws ParseException, NoSuchElementException
319    {
320        final String s = tok.nextToken(required);
321        if( !s.equals(required) )
322        {
323            throw new ParseException("expected '"+required+"' not '"+s+"'");
324        }
325        return s;
326    }
327
328
329    /**
330     *  Returns true if the String <tt>s</tt> is completely
331     *  composed of whitespace.
332     *
333     *  @param s The string to check
334     *  @return True, if "s" is all XML whitespace.
335     */
336    public static final boolean isSpace(final String s )
337    {
338        for( int i = 0 ; i < s.length() ; i++ )
339        {
340            if( !isSpace( s.charAt(i)) ) return false;
341        }
342        return true;
343    }
344
345
346    /**
347     *  Returns true if char <tt>c</tt> is a member of
348     *  <tt>S</tt> (space) [XML 1.1 production 3].
349     *
350     *  @param c Character to check.
351     *  @return True, if the character is an XML space.
352     */
353    public static final boolean isSpace(final char c )
354    {
355        // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
356        return
357           0x20 == c
358        || 0x0A == c
359        || 0x0D == c
360        || 0x09 == c
361        || 0x85 == c
362        || 0x2028 == c;
363    }
364
365
366    // .........................................................................
367
368
369    /**
370     *  Inner class serving as a struct containing the parsed
371     *  components of a link.
372     */
373    public static class Link
374    {
375        private String            m_text;
376        private String            m_ref;
377        private int               m_interwikiPoint = -1;
378        private List<Attribute>   m_attribs;
379
380        /**
381         *  Create a new Link with text but no reference.
382         *  @param text The link text.
383         *  @throws ParseException If the link text is illegal.
384         */
385        protected Link(final String text ) throws ParseException
386        {
387            setText(text);
388        }
389
390        /**
391         *  Create a new link with a given text and hyperlink (reference).
392         *
393         *  @param text The link text.
394         *  @param ref  The hypertext reference.
395         *  @throws ParseException If the link text or reference are illegal.
396         */
397        protected Link(final String text, final String ref ) throws ParseException
398        {
399            setText(text);
400            setReference(ref);
401        }
402
403        /**
404         *  Sets the link text.
405         *
406         *  @param text The link text.
407         *  @throws ParseException If the text is illegal (e.g. null).
408         */
409        protected void setText(final String text ) throws ParseException
410        {
411            if( text == null )
412            {
413                throw new ParseException("null link text");
414            }
415            m_text = text;
416        }
417
418        /**
419         *  Returns the link text.
420         *
421         *  @return Link text.
422         */
423        public String getText()
424        {
425            return m_text;
426        }
427
428        /**
429         *  Sets the hypertext reference.  Typically, this is an URI or an interwiki link,
430         *  or a wikilink.
431         *
432         *  @param ref The reference.
433         *  @throws ParseException If the reference is illegal.
434         */
435        protected void setReference(final String ref ) throws ParseException
436        {
437            if( ref == null )
438            {
439                throw new ParseException("null link reference value");
440            }
441            m_ref = ref;
442        }
443
444        /**
445         *  Returns true, if there is a reference.
446         *
447         *  @return True, if there's a reference; false otherwise.
448         */
449        public boolean hasReference()
450        {
451            return m_ref != null;
452        }
453
454        /**
455         *  Returns the link reference, or the link text if null.
456         *
457         *  @return A link reference.
458         */
459        public String getReference()
460        {
461            return m_ref != null
462                    ? m_ref
463                    : m_text ;
464        }
465
466        /**
467         *  Returns true, if this Link represents an InterWiki link (of the form wiki:page).
468         *
469         *  @return True, if this Link represents an InterWiki link.
470         */
471        public boolean isInterwikiLink()
472        {
473            final LinkParsingOperations lpo = new LinkParsingOperations( null );
474            if( !hasReference() ) m_ref = m_text;
475            m_interwikiPoint = lpo.interWikiLinkAt( m_ref );
476            return lpo.isInterWikiLink( m_ref );
477        }
478
479        /**
480         *  Returns the name of the wiki if this is an interwiki link.
481         *  <pre>
482         *    Link link = new Link("Foo","Wikipedia:Foobar");
483         *    assert( link.getExternalWikiPage(), "Wikipedia" );
484         *  </pre>
485         *
486         *  @return Name of the wiki, or null, if this is not an interwiki link.
487         */
488        public String getExternalWiki()
489        {
490            if( isInterwikiLink() )
491            {
492                return m_ref.substring( 0, m_interwikiPoint );
493            }
494
495            return null;
496        }
497
498        /**
499         *  Returns the wikiname part of an interwiki link. Used only with interwiki links.
500         *  <pre>
501         *    Link link = new Link("Foo","Wikipedia:Foobar");
502         *    assert( link.getExternalWikiPage(), "Foobar" );
503         *  </pre>
504         *
505         *  @return Wikiname part, or null, if this is not an interwiki link.
506         */
507        public String getExternalWikiPage()
508        {
509            if( isInterwikiLink() )
510            {
511                return m_ref.substring( m_interwikiPoint+1 );
512            }
513
514            return null;
515        }
516
517        /**
518         *  Returns the number of attributes on this link.
519         *
520         *  @return The number of attributes.
521         */
522        public int attributeCount()
523        {
524            return m_attribs != null
525                    ? m_attribs.size()
526                    : 0 ;
527        }
528
529        /**
530         *  Adds another attribute to the link.
531         *
532         *  @param attr A JDOM Attribute.
533         */
534        public void addAttribute(final Attribute attr )
535        {
536            if( m_attribs == null )
537            {
538                m_attribs = new ArrayList<>();
539            }
540            m_attribs.add(attr);
541        }
542
543        /**
544         *  Returns an Iterator over the list of JDOM Attributes.
545         *
546         *  @return Iterator over the attributes.
547         */
548        public Iterator< Attribute > getAttributes()
549        {
550            return m_attribs != null
551                    ? m_attribs.iterator()
552                    : m_EMPTY.iterator() ;
553        }
554
555        /**
556         *  Returns a wikitext string representation of this Link.
557         *  @return WikiText.
558         */
559        @Override
560        public String toString()
561        {
562            final StringBuilder sb = new StringBuilder();
563            sb.append( '[' );
564            sb.append( m_text );
565
566            if( m_ref != null )
567            {
568                sb.append( ' ' );
569                sb.append( '|' );
570                sb.append( ' ' );
571                sb.append( m_ref );
572            }
573
574            if( m_attribs != null )
575            {
576                sb.append( ' ' );
577                sb.append( '|' );
578                final Iterator< Attribute > it = getAttributes();
579                while ( it.hasNext() )
580                {
581                    final Attribute a = it.next();
582                    sb.append( ' ' );
583                    sb.append( a.getName() );
584                    sb.append( '=' );
585                    sb.append( '\'' );
586                    sb.append( a.getValue() );
587                    sb.append( '\'' );
588                }
589            }
590            sb.append( ']' );
591            return sb.toString();
592        }
593
594    }
595    // end inner class
596
597}