001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.
018 */
019
020package org.apache.wiki.parser;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Iterator;
025import java.util.List;
026import java.util.NoSuchElementException;
027import java.util.StringTokenizer;
028import java.util.stream.IntStream;
029
030import org.apache.logging.log4j.LogManager;
031import org.apache.logging.log4j.Logger;
032import org.jdom2.Attribute;
033
034/**
035 *  Parses JSPWiki-style "augmented" link markup into a Link object
036 *  containing the link text, link reference, and any optional link
037 *  attributes (as JDOM Attributes).
038 *  <p>
039 *  The parser recognizes three link forms:
040 *  </p>
041 *  <ol>
042 *    <li><tt> [Text] </tt></li>
043 *    <li><tt> [Text | Link] </tt></li>
044 *    <li><tt> [Text | Link | attributes] </tt></li>
045 *  </ol>
046 *  <p>
047 *  where the attributes are space-delimited, each in the form of
048 *  </p>
049 *  <pre>
050 *      name1='value1' name2='value2' name3='value3' (etc.) </pre>
051 *  <p>
052 *  If the attribute parsing fails, the parser will still return the
053 *  basic link, writing a warning to the log.
054 *  </p>
055 *
056 *  <h3>Permitted Attributes</h3>
057 *  <p>
 *  Attributes that aren't declared on <tt>&lt;a&gt;</tt> or those that
 *  permit scripting in HTML (as this is a security risk) are ignored
 *  and have no effect on parsing, nor show up in the resulting attribute
 *  list. The 'href' and 'name' attributes are also ignored as spurious.
 *  The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
 *  'id', 'lang', 'dir', 'rel', 'rev', 'style', 'tabindex', 'target',
 *  'title', and 'type'. The declared attributes that will be ignored
 *  are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
 *  of the other 'on*' event attributes.
067 *  </p>
068 *  <p>
069 *  The permitted attributes and target attribute values are static
070 *  String arrays ({@link #PERMITTED_ATTRIBUTES} and
071 *  {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
072 *  modified (i.e., predeclared).
073 *  </p>
074 *
075 *  <h3>Permitted Values on Target Attribute</h3>
076 *  <p>
077 *  The following target names are reserved in HTML 4 and have special
078 *  meanings. These are the only values permitted by the parser.
079 *  <dl>
080 *    <dt><b>_blank</b></dt>
081 *    <dd> The user agent should load the designated document in a new,
082 *    unnamed window. </dd>
083 *    <dt><b>_self</b></dt>
084 *    <dd> The user agent should load the document in the same frame as
085 *    the element that refers to this target. </dd>
086 *    <dt><b>_parent</b></dt>
087 *    <dd> The user agent should load the document into the immediate
088 *    FRAMESET parent of the current frame. This value is equivalent to
089 *    _self if the current frame has no parent. </dd>
090 *    <dt><b>_top</b></dt>
091 *    <dd> The user agent should load the document into the full,
092 *    original window (thus canceling all other frames). This value is
093 *    equivalent to _self if the current frame has no parent. </dd>
094 *  </dl>
095 *
096 *  <h3>Returned Value</h3>
097 *  <p>
098 *  This returns a <b>Link</b> object, a public inner class with methods:
099 *  <ul>
100 *    <li> <tt>getText()</tt> returns the link text. </li>
101 *    <li> <tt>getReference()</tt> returns the link reference value. </li>
102 *    <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
103 *    <li> <tt>getAttributes()</tt> returns an iterator over any validated
104 *        XHTML-compliant attributes, returned as JDOM Attributes.
105 *    </li>
106 *  </ul>
107 *  <p>
108 *  The <tt>attributeCount()</tt> method can be used to circumvent calling
109 *  <tt>getAttributes()</tt>, which will create an empty Iterator rather
110 *  than return a null.
111 *  </p>
112 *
113 *  <h3>Example: Link Form 1</h3>
114 *  <p>
115 *  From an incoming wikitext link of:
116 *  <pre>
117 *     [Acme] </pre>
118 *  returns:
119 *  <pre>
120 *    getText():         "Acme"
121 *    getReference():    "Acme"
122 *    attributeCount():  0
123 *    getAttributes():   an empty Iterator </pre>
124 *
125 *  <h3>Example: Link Form 2</h3>
126 *  <p>
127 *  From an incoming wikitext link of:
128 *  <pre>
129 *     [Acme | http://www.acme.com/] </pre>
130 *  returns:
131 *  <pre>
132 *    getText():         "Acme"
133 *    getReference():    "http://www.acme.com/"
134 *    attributeCount():  0
135 *    getAttributes():   an empty Iterator </pre>
136 *
137 *  <h3>Example: Link Form 3</h3>
138 *  <p>
139 *  From an incoming wikitext link of:
140 *  </p>
141 *  <pre>
142 *    [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
143 *  returns:
144 *  <pre>
145 *    getText():         "Acme"
146 *    getReference():    "http://www.acme.com/"
147 *    attributeCount():  2
148 *    getAttributes():   an Iterator containing:
149 *      JDOM Attribute:  id="foo"
150 *      JDOM Attribute:  rel="Next" </pre>
151 *
152 *
153 *  @since  2.5.10
154 */
155public class LinkParser
156{
157    private static final Logger LOG = LogManager.getLogger(LinkParser.class);
158
159    /** Permitted attributes on links.  Keep this sorted. */
160    private static final String[] PERMITTED_ATTRIBUTES = new String[] {
161            "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
162            "rel", "rev", "style", "tabindex", "target", "title", "type" };
163
164    /** Permitted values on the 'target' attribute. */
165    private static final String[] PERMITTED_TARGET_VALUES = new String[] {
166            "_blank", "_self", "_parent", "_top" };
167
168    /** Links with target="_blank" can expose your site to performance and security issues.
169        To fix, add rel="noopener" or rel="noreferrer" to these links.
170    */
171    private static final String REL = "rel";
172    private static final String NOREFERRER = "noreferrer";
173
174    private static final String EQSQUO = "='";
175    private static final String SQUO   = "'";
176    private static final String EQ     = "=";
177    private static final String TARGET = "target";
178    private static final String DELIMS = " \t\n\r\f=";
179
180    private static final List< Attribute > m_EMPTY = new ArrayList< >();
181
182    // ............
183
184
185    /**
186     *  Processes incoming link text, separating out the link text, the link
187     *  URI, and then any specified attributes.
188     *
189     * @param  linktext  the wiki link text to be parsed
190     * @return a Link object containing the link text, reference, and any valid Attributes
191     * @throws ParseException if the parameter is null
192     */
193    public Link parse(final String linktext ) throws ParseException
194    {
195        if( linktext == null )
196        {
197            throw new ParseException("null value passed to link parser");
198        }
199
200        Link link = null;
201
202        try
203        {
204            // establish link text and link ref
205            final int cut1   = linktext.indexOf('|');
206            if( cut1 == -1 )
207            {
208                //  link form 1:  [Acme]
209                return new Link( linktext );
210            }
211
212            final int cut2 = cut1+1 < linktext.length()
213                    ? linktext.indexOf('|', cut1+1 )
214                    : -1 ;
215
216            if ( cut2 == -1 )
217            {
218                // link form 2:  [Acme | http://www.acme.com/]
219                // text = Acme
220                final String text = linktext.substring( 0, cut1 ).trim();
221                // ref = http://www.acme.com/
222                final String ref  = linktext.substring( cut1+1 ).trim();
223                return new Link( text, ref );
224            }
225
226            // link form 3:  [Acme | http://www.acme.com/ | id='foo' rel='Next']
227            final String text    = linktext.substring( 0, cut1 ).trim();
228            final String ref     = linktext.substring( cut1+1, cut2 ).trim();
229            // attribs = id='foo' rel='Next'
230            final String attribs = linktext.substring( cut2+1 ).trim();
231
232            link = new Link( text, ref );
233
234            // parse attributes
235            // contains "='" that looks like attrib spec
236            if(attribs.contains(EQSQUO))
237            {
238                try
239                {
240                    final StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
241                    while ( tok.hasMoreTokens() )
242                    {
243                        // get attribute name token
244                        String token = tok.nextToken(DELIMS).trim();
245                        while ( isSpace(token) && tok.hasMoreTokens() )
246                        {
247                            // remove all whitespace
248                            token = tok.nextToken(DELIMS).trim();
249                        }
250
251                        // eat '=', break after '='
252                        require( tok, EQ );
253                        // eat opening delim
254                        require( tok, SQUO );
255                        // using existing delim
256                        final String value = tok.nextToken(SQUO);
257                        // eat closing delim
258                        require( tok, SQUO );
259
260                        if( token != null && value != null )
261                        {
262                            if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 )
263                            {
264                                // _blank _self _parent _top
265                                if( !token.equals(TARGET)
266                                        || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 )
267                                {
268                                    final Attribute a = new Attribute(token,value);
269                                    link.addAttribute(a);
270
271                                    if( token.equals(TARGET) )
272                                    {
273                                        final Attribute rel = new Attribute(REL,NOREFERRER);
274                                        link.addAttribute(rel);
275                                   }
276
277                                }
278                                else
279                                {
280                                    throw new ParseException("unknown target attribute value='"
281                                                             + value + "' on link");
282                                }
283                            }
284                            else
285                            {
286                                throw new ParseException("unknown attribute name '"
287                                                         + token + "' on link");
288                            }
289                        }
290                        else
291                        {
292                            throw new ParseException("unable to parse link attributes '"
293                                                     + attribs + "'");
294
295                        }
296                    }
297                }
298                catch( final ParseException pe )
299                {
300                    LOG.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
301                }
302                catch( final NoSuchElementException nse )
303                {
304                    LOG.warn("expected more tokens while parsing link attributes '" + attribs + "'");
305                }
306            }
307
308        }
309        catch( final Exception e )
310        {
311            LOG.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
312        }
313
314        return link;
315    }
316
317
318    private String require(final StringTokenizer tok, final String required )
319            throws ParseException, NoSuchElementException
320    {
321        final String s = tok.nextToken(required);
322        if( !s.equals(required) )
323        {
324            throw new ParseException("expected '"+required+"' not '"+s+"'");
325        }
326        return s;
327    }
328
329
330    /**
331     *  Returns true if the String <tt>s</tt> is completely
332     *  composed of whitespace.
333     *
334     *  @param s The string to check
335     *  @return True, if "s" is all XML whitespace.
336     */
337    public static final boolean isSpace(final String s )
338    {
339        return IntStream.range(0, s.length()).allMatch(i -> isSpace(s.charAt(i)));
340    }
341
342
343    /**
344     *  Returns true if char <tt>c</tt> is a member of
345     *  <tt>S</tt> (space) [XML 1.1 production 3].
346     *
347     *  @param c Character to check.
348     *  @return True, if the character is an XML space.
349     */
350    public static final boolean isSpace(final char c )
351    {
352        // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
353        return
354           0x20 == c
355        || 0x0A == c
356        || 0x0D == c
357        || 0x09 == c
358        || 0x85 == c
359        || 0x2028 == c;
360    }
361
362
363    // .........................................................................
364
365
366    /**
367     *  Inner class serving as a struct containing the parsed
368     *  components of a link.
369     */
370    public static class Link
371    {
372        private String            m_text;
373        private String            m_ref;
374        private int               m_interwikiPoint = -1;
375        private List<Attribute>   m_attribs;
376
377        /**
378         *  Create a new Link with text but no reference.
379         *  @param text The link text.
380         *  @throws ParseException If the link text is illegal.
381         */
382        protected Link(final String text ) throws ParseException
383        {
384            setText(text);
385        }
386
387        /**
388         *  Create a new link with a given text and hyperlink (reference).
389         *
390         *  @param text The link text.
391         *  @param ref  The hypertext reference.
392         *  @throws ParseException If the link text or reference are illegal.
393         */
394        protected Link(final String text, final String ref ) throws ParseException
395        {
396            setText(text);
397            setReference(ref);
398        }
399
400        /**
401         *  Sets the link text.
402         *
403         *  @param text The link text.
404         *  @throws ParseException If the text is illegal (e.g. null).
405         */
406        protected void setText(final String text ) throws ParseException
407        {
408            if( text == null )
409            {
410                throw new ParseException("null link text");
411            }
412            m_text = text;
413        }
414
415        /**
416         *  Returns the link text.
417         *
418         *  @return Link text.
419         */
420        public String getText()
421        {
422            return m_text;
423        }
424
425        /**
426         *  Sets the hypertext reference.  Typically, this is an URI or an interwiki link,
427         *  or a wikilink.
428         *
429         *  @param ref The reference.
430         *  @throws ParseException If the reference is illegal.
431         */
432        protected void setReference(final String ref ) throws ParseException
433        {
434            if( ref == null )
435            {
436                throw new ParseException("null link reference value");
437            }
438            m_ref = ref;
439        }
440
441        /**
442         *  Returns true, if there is a reference.
443         *
444         *  @return True, if there's a reference; false otherwise.
445         */
446        public boolean hasReference()
447        {
448            return m_ref != null;
449        }
450
451        /**
452         *  Returns the link reference, or the link text if null.
453         *
454         *  @return A link reference.
455         */
456        public String getReference()
457        {
458            return m_ref != null
459                    ? m_ref
460                    : m_text ;
461        }
462
463        /**
464         *  Returns true, if this Link represents an InterWiki link (of the form wiki:page).
465         *
466         *  @return True, if this Link represents an InterWiki link.
467         */
468        public boolean isInterwikiLink()
469        {
470            final LinkParsingOperations lpo = new LinkParsingOperations( null );
471            if( !hasReference() ) m_ref = m_text;
472            m_interwikiPoint = lpo.interWikiLinkAt( m_ref );
473            return lpo.isInterWikiLink( m_ref );
474        }
475
476        /**
477         *  Returns the name of the wiki if this is an interwiki link.
478         *  <pre>
479         *    Link link = new Link("Foo","Wikipedia:Foobar");
480         *    assert( link.getExternalWikiPage(), "Wikipedia" );
481         *  </pre>
482         *
483         *  @return Name of the wiki, or null, if this is not an interwiki link.
484         */
485        public String getExternalWiki()
486        {
487            if( isInterwikiLink() )
488            {
489                return m_ref.substring( 0, m_interwikiPoint );
490            }
491
492            return null;
493        }
494
495        /**
496         *  Returns the wikiname part of an interwiki link. Used only with interwiki links.
497         *  <pre>
498         *    Link link = new Link("Foo","Wikipedia:Foobar");
499         *    assert( link.getExternalWikiPage(), "Foobar" );
500         *  </pre>
501         *
502         *  @return Wikiname part, or null, if this is not an interwiki link.
503         */
504        public String getExternalWikiPage()
505        {
506            if( isInterwikiLink() )
507            {
508                return m_ref.substring( m_interwikiPoint+1 );
509            }
510
511            return null;
512        }
513
514        /**
515         *  Returns the number of attributes on this link.
516         *
517         *  @return The number of attributes.
518         */
519        public int attributeCount()
520        {
521            return m_attribs != null
522                    ? m_attribs.size()
523                    : 0 ;
524        }
525
526        /**
527         *  Adds another attribute to the link.
528         *
529         *  @param attr A JDOM Attribute.
530         */
531        public void addAttribute(final Attribute attr )
532        {
533            if( m_attribs == null )
534            {
535                m_attribs = new ArrayList<>();
536            }
537            m_attribs.add(attr);
538        }
539
540        /**
541         *  Returns an Iterator over the list of JDOM Attributes.
542         *
543         *  @return Iterator over the attributes.
544         */
545        public Iterator< Attribute > getAttributes()
546        {
547            return m_attribs != null
548                    ? m_attribs.iterator()
549                    : m_EMPTY.iterator() ;
550        }
551
552        /**
553         *  Returns a wikitext string representation of this Link.
554         *  @return WikiText.
555         */
556        @Override
557        public String toString()
558        {
559            final StringBuilder sb = new StringBuilder();
560            sb.append( '[' );
561            sb.append( m_text );
562
563            if( m_ref != null )
564            {
565                sb.append( ' ' );
566                sb.append( '|' );
567                sb.append( ' ' );
568                sb.append( m_ref );
569            }
570
571            if( m_attribs != null )
572            {
573                sb.append( ' ' );
574                sb.append( '|' );
575                final Iterator< Attribute > it = getAttributes();
576                while ( it.hasNext() )
577                {
578                    final Attribute a = it.next();
579                    sb.append( ' ' );
580                    sb.append( a.getName() );
581                    sb.append( '=' );
582                    sb.append( '\'' );
583                    sb.append( a.getValue() );
584                    sb.append( '\'' );
585                }
586            }
587            sb.append( ']' );
588            return sb.toString();
589        }
590
591    }
592    // end inner class
593
594}