001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.
018 */
019
020package org.apache.wiki.parser;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Iterator;
025import java.util.List;
026import java.util.NoSuchElementException;
027import java.util.StringTokenizer;
028
029import org.apache.log4j.Logger;
030import org.jdom2.Attribute;
031
032/**
033 *  Parses JSPWiki-style "augmented" link markup into a Link object
034 *  containing the link text, link reference, and any optional link
035 *  attributes (as JDOM Attributes).
036 *  <p>
037 *  The parser recognizes three link forms:
038 *  </p>
039 *  <ol>
040 *    <li><tt> [Text] </tt></li>
041 *    <li><tt> [Text | Link] </tt></li>
042 *    <li><tt> [Text | Link | attributes] </tt></li>
043 *  </ol>
044 *  <p>
045 *  where the attributes are space-delimited, each in the form of
046 *  </p>
047 *  <pre>
048 *      name1='value1' name2='value2' name3='value3' (etc.) </pre>
049 *  <p>
050 *  If the attribute parsing fails, the parser will still return the
051 *  basic link, writing a warning to the log.
052 *  </p>
053 *
054 *  <h3>Permitted Attributes</h3>
055 *  <p>
 *  Attributes that aren't declared on <tt>&lt;a&gt;</tt>, or that permit
 *  scripting in HTML (a security risk), are rejected and never show up in
 *  the resulting attribute list; the parser logs a warning and stops
 *  processing the remaining attributes. The 'href' and 'name' attributes
 *  are also rejected as spurious.
060 *  The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
061 *  'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' ,
062 *  'title', and 'type'. The declared attributes that will be ignored
063 *  are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
064 *  of the other 'on*' event attributes.
065 *  </p>
066 *  <p>
067 *  The permitted attributes and target attribute values are static
068 *  String arrays ({@link #PERMITTED_ATTRIBUTES} and
069 *  {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
070 *  modified (i.e., predeclared).
071 *  </p>
072 *
073 *  <h3>Permitted Values on Target Attribute</h3>
074 *  <p>
075 *  The following target names are reserved in HTML 4 and have special
076 *  meanings. These are the only values permitted by the parser.
077 *  <dl>
078 *    <dt><b>_blank</b></dt>
079 *    <dd> The user agent should load the designated document in a new,
080 *    unnamed window. </dd>
081 *    <dt><b>_self</b></dt>
082 *    <dd> The user agent should load the document in the same frame as
083 *    the element that refers to this target. </dd>
084 *    <dt><b>_parent</b></dt>
085 *    <dd> The user agent should load the document into the immediate
086 *    FRAMESET parent of the current frame. This value is equivalent to
087 *    _self if the current frame has no parent. </dd>
088 *    <dt><b>_top</b></dt>
089 *    <dd> The user agent should load the document into the full,
090 *    original window (thus canceling all other frames). This value is
091 *    equivalent to _self if the current frame has no parent. </dd>
092 *  </dl>
093 *
094 *  <h3>Returned Value</h3>
095 *  <p>
096 *  This returns a <b>Link</b> object, a public inner class with methods:
097 *  <ul>
098 *    <li> <tt>getText()</tt> returns the link text. </li>
099 *    <li> <tt>getReference()</tt> returns the link reference value. </li>
100 *    <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
101 *    <li> <tt>getAttributes()</tt> returns an iterator over any validated
102 *        XHTML-compliant attributes, returned as JDOM Attributes.
103 *    </li>
104 *  </ul>
105 *  <p>
106 *  The <tt>attributeCount()</tt> method can be used to circumvent calling
107 *  <tt>getAttributes()</tt>, which will create an empty Iterator rather
108 *  than return a null.
109 *  </p>
110 *
111 *  <h3>Example: Link Form 1</h3>
112 *  <p>
113 *  From an incoming wikitext link of:
114 *  <pre>
115 *     [Acme] </pre>
116 *  returns:
117 *  <pre>
118 *    getText():         "Acme"
119 *    getReference():    "Acme"
120 *    attributeCount():  0
121 *    getAttributes():   an empty Iterator </pre>
122 *
123 *  <h3>Example: Link Form 2</h3>
124 *  <p>
125 *  From an incoming wikitext link of:
126 *  <pre>
127 *     [Acme | http://www.acme.com/] </pre>
128 *  returns:
129 *  <pre>
130 *    getText():         "Acme"
131 *    getReference():    "http://www.acme.com/"
132 *    attributeCount():  0
133 *    getAttributes():   an empty Iterator </pre>
134 *
135 *  <h3>Example: Link Form 3</h3>
136 *  <p>
137 *  From an incoming wikitext link of:
138 *  </p>
139 *  <pre>
140 *    [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
141 *  returns:
142 *  <pre>
143 *    getText():         "Acme"
144 *    getReference():    "http://www.acme.com/"
145 *    attributeCount():  2
146 *    getAttributes():   an Iterator containing:
147 *      JDOM Attribute:  id="foo"
148 *      JDOM Attribute:  rel="Next" </pre>
149 *
150 *
151 *  @since  2.5.10
152 */
153public class LinkParser
154{
155    private static Logger log = Logger.getLogger(LinkParser.class);
156
157    /** Permitted attributes on links.  Keep this sorted. */
158    private static final String[] PERMITTED_ATTRIBUTES = new String[] {
159            "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
160            "rel", "rev", "style", "tabindex", "target", "title", "type" };
161
162    /** Permitted values on the 'target' attribute. */
163    private static final String[] PERMITTED_TARGET_VALUES = new String[] {
164            "_blank", "_self", "_parent", "_top" };
165
166    /** Links with target="_blank" can expose your site to performance and security issues.
167        To fix, add rel="noopener" or rel="noreferrer" to these links.
168    */
169    private static final String REL = "rel";
170    private static final String NOREFERRER = "noreferrer";
171
172    private static final String EQSQUO = "='";
173    private static final String SQUO   = "'";
174    private static final String EQ     = "=";
175    private static final String TARGET = "target";
176    private static final String DELIMS = " \t\n\r\f=";
177
178    private static final List< Attribute > m_EMPTY = new ArrayList< >();
179
180    // ............
181
182
183    /**
184     *  Processes incoming link text, separating out the link text, the link
185     *  URI, and then any specified attributes.
186     *
187     * @param  linktext  the wiki link text to be parsed
188     * @return a Link object containing the link text, reference, and any valid Attributes
189     * @throws ParseException if the parameter is null
190     */
191    public Link parse( String linktext ) throws ParseException
192    {
193        if( linktext == null )
194        {
195            throw new ParseException("null value passed to link parser");
196        }
197
198        Link link = null;
199
200        try
201        {
202            // establish link text and link ref
203            int cut1   = linktext.indexOf('|');
204            if( cut1 == -1 )
205            {
206                //  link form 1:  [Acme]
207                return new Link( linktext );
208            }
209
210            int cut2 = cut1+1 < linktext.length()
211                    ? linktext.indexOf('|', cut1+1 )
212                    : -1 ;
213
214            if ( cut2 == -1 )
215            {
216                // link form 2:  [Acme | http://www.acme.com/]
217                // text = Acme
218                String text = linktext.substring( 0, cut1 ).trim();
219                // ref = http://www.acme.com/
220                String ref  = linktext.substring( cut1+1 ).trim();
221                return new Link( text, ref );
222            }
223
224            // link form 3:  [Acme | http://www.acme.com/ | id='foo' rel='Next']
225            String text    = linktext.substring( 0, cut1 ).trim();
226            String ref     = linktext.substring( cut1+1, cut2 ).trim();
227            // attribs = id='foo' rel='Next'
228            String attribs = linktext.substring( cut2+1 ).trim();
229
230            link = new Link( text, ref );
231
232            // parse attributes
233            // contains "='" that looks like attrib spec
234            if( attribs.indexOf(EQSQUO) != -1 )
235            {
236                try
237                {
238                    StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
239                    while ( tok.hasMoreTokens() )
240                    {
241                        // get attribute name token
242                        String token = tok.nextToken(DELIMS).trim();
243                        while ( isSpace(token) && tok.hasMoreTokens() )
244                        {
245                            // remove all whitespace
246                            token = tok.nextToken(DELIMS).trim();
247                        }
248
249                        // eat '=', break after '='
250                        require( tok, EQ );
251                        // eat opening delim
252                        require( tok, SQUO );
253                        // using existing delim
254                        String value = tok.nextToken(SQUO);
255                        // eat closing delim
256                        require( tok, SQUO );
257
258                        if( token != null && value != null )
259                        {
260                            if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 )
261                            {
262                                // _blank _self _parent _top
263                                if( !token.equals(TARGET)
264                                        || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 )
265                                {
266                                    Attribute a = new Attribute(token,value);
267                                    link.addAttribute(a);
268
269                                    if( token.equals(TARGET) )
270                                    {
271                                        Attribute rel = new Attribute(REL,NOREFERRER);
272                                        link.addAttribute(rel);
273                                   }
274
275                                }
276                                else
277                                {
278                                    throw new ParseException("unknown target attribute value='"
279                                                             + value + "' on link");
280                                }
281                            }
282                            else
283                            {
284                                throw new ParseException("unknown attribute name '"
285                                                         + token + "' on link");
286                            }
287                        }
288                        else
289                        {
290                            throw new ParseException("unable to parse link attributes '"
291                                                     + attribs + "'");
292
293                        }
294                    }
295                }
296                catch( ParseException pe )
297                {
298                    log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
299                }
300                catch( NoSuchElementException nse )
301                {
302                    log.warn("expected more tokens while parsing link attributes '" + attribs + "'");
303                }
304            }
305
306        }
307        catch( Exception e )
308        {
309            log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
310        }
311
312        return link;
313    }
314
315
316    private String require( StringTokenizer tok, String required )
317            throws ParseException, NoSuchElementException
318    {
319        String s = tok.nextToken(required);
320        if( !s.equals(required) )
321        {
322            throw new ParseException("expected '"+required+"' not '"+s+"'");
323        }
324        return s;
325    }
326
327
328    /**
329     *  Returns true if the String <tt>s</tt> is completely
330     *  composed of whitespace.
331     *
332     *  @param s The string to check
333     *  @return True, if "s" is all XML whitespace.
334     */
335    public static final boolean isSpace( String s )
336    {
337        for( int i = 0 ; i < s.length() ; i++ )
338        {
339            if( !isSpace( s.charAt(i)) ) return false;
340        }
341        return true;
342    }
343
344
345    /**
346     *  Returns true if char <tt>c</tt> is a member of
347     *  <tt>S</tt> (space) [XML 1.1 production 3].
348     *
349     *  @param c Character to check.
350     *  @return True, if the character is an XML space.
351     */
352    public static final boolean isSpace( char c )
353    {
354        // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
355        return
356           0x20 == c
357        || 0x0A == c
358        || 0x0D == c
359        || 0x09 == c
360        || 0x85 == c
361        || 0x2028 == c;
362    }
363
364
365    // .........................................................................
366
367
368    /**
369     *  Inner class serving as a struct containing the parsed
370     *  components of a link.
371     */
372    public static class Link
373    {
374        private String            m_text;
375        private String            m_ref = null;
376        private int               m_interwikiPoint = -1;
377        private List<Attribute>   m_attribs = null;
378
379        /**
380         *  Create a new Link with text but no reference.
381         *  @param text The link text.
382         *  @throws ParseException If the link text is illegal.
383         */
384        protected Link( String text ) throws ParseException
385        {
386            setText(text);
387        }
388
389        /**
390         *  Create a new link with a given text and hyperlink (reference).
391         *
392         *  @param text The link text.
393         *  @param ref  The hypertext reference.
394         *  @throws ParseException If the link text or reference are illegal.
395         */
396        protected Link( String text, String ref ) throws ParseException
397        {
398            setText(text);
399            setReference(ref);
400        }
401
402        /**
403         *  Sets the link text.
404         *
405         *  @param text The link text.
406         *  @throws ParseException If the text is illegal (e.g. null).
407         */
408        protected void setText( String text ) throws ParseException
409        {
410            if( text == null )
411            {
412                throw new ParseException("null link text");
413            }
414            m_text = text;
415        }
416
417        /**
418         *  Returns the link text.
419         *
420         *  @return Link text.
421         */
422        public String getText()
423        {
424            return m_text;
425        }
426
427        /**
428         *  Sets the hypertext reference.  Typically, this is an URI or an interwiki link,
429         *  or a wikilink.
430         *
431         *  @param ref The reference.
432         *  @throws ParseException If the reference is illegal.
433         */
434        protected void setReference( String ref ) throws ParseException
435        {
436            if( ref == null )
437            {
438                throw new ParseException("null link reference value");
439            }
440            m_ref = ref;
441        }
442
443        /**
444         *  Returns true, if there is a reference.
445         *
446         *  @return True, if there's a reference; false otherwise.
447         */
448        public boolean hasReference()
449        {
450            return m_ref != null;
451        }
452
453        /**
454         *  Returns the link reference, or the link text if null.
455         *
456         *  @return A link reference.
457         */
458        public String getReference()
459        {
460            return m_ref != null
461                    ? m_ref
462                    : m_text ;
463        }
464
465        /**
466         *  Returns true, if this Link represents an InterWiki link (of the form wiki:page).
467         *
468         *  @return True, if this Link represents an InterWiki link.
469         */
470        public boolean isInterwikiLink()
471        {
472            LinkParsingOperations lpo = new LinkParsingOperations( null );
473            if( !hasReference() ) m_ref = m_text;
474            m_interwikiPoint = lpo.interWikiLinkAt( m_ref );
475            return lpo.isInterWikiLink( m_ref );
476        }
477
478        /**
479         *  Returns the name of the wiki if this is an interwiki link.
480         *  <pre>
481         *    Link link = new Link("Foo","Wikipedia:Foobar");
482         *    assert( link.getExternalWikiPage(), "Wikipedia" );
483         *  </pre>
484         *
485         *  @return Name of the wiki, or null, if this is not an interwiki link.
486         */
487        public String getExternalWiki()
488        {
489            if( isInterwikiLink() )
490            {
491                return m_ref.substring( 0, m_interwikiPoint );
492            }
493
494            return null;
495        }
496
497        /**
498         *  Returns the wikiname part of an interwiki link. Used only with interwiki links.
499         *  <pre>
500         *    Link link = new Link("Foo","Wikipedia:Foobar");
501         *    assert( link.getExternalWikiPage(), "Foobar" );
502         *  </pre>
503         *
504         *  @return Wikiname part, or null, if this is not an interwiki link.
505         */
506        public String getExternalWikiPage()
507        {
508            if( isInterwikiLink() )
509            {
510                return m_ref.substring( m_interwikiPoint+1 );
511            }
512
513            return null;
514        }
515
516        /**
517         *  Returns the number of attributes on this link.
518         *
519         *  @return The number of attributes.
520         */
521        public int attributeCount()
522        {
523            return m_attribs != null
524                    ? m_attribs.size()
525                    : 0 ;
526        }
527
528        /**
529         *  Adds another attribute to the link.
530         *
531         *  @param attr A JDOM Attribute.
532         */
533        public void addAttribute( Attribute attr )
534        {
535            if( m_attribs == null )
536            {
537                m_attribs = new ArrayList<>();
538            }
539            m_attribs.add(attr);
540        }
541
542        /**
543         *  Returns an Iterator over the list of JDOM Attributes.
544         *
545         *  @return Iterator over the attributes.
546         */
547        public Iterator< Attribute > getAttributes()
548        {
549            return m_attribs != null
550                    ? m_attribs.iterator()
551                    : m_EMPTY.iterator() ;
552        }
553
554        /**
555         *  Returns a wikitext string representation of this Link.
556         *  @return WikiText.
557         */
558        @Override
559        public String toString()
560        {
561            StringBuilder sb = new StringBuilder();
562            sb.append( '[' );
563            sb.append( m_text );
564
565            if( m_ref != null )
566            {
567                sb.append( ' ' );
568                sb.append( '|' );
569                sb.append( ' ' );
570                sb.append( m_ref );
571            }
572
573            if( m_attribs != null )
574            {
575                sb.append( ' ' );
576                sb.append( '|' );
577                Iterator< Attribute > it = getAttributes();
578                while ( it.hasNext() )
579                {
580                    Attribute a = it.next();
581                    sb.append( ' ' );
582                    sb.append( a.getName() );
583                    sb.append( '=' );
584                    sb.append( '\'' );
585                    sb.append( a.getValue() );
586                    sb.append( '\'' );
587                }
588            }
589            sb.append( ']' );
590            return sb.toString();
591        }
592
593    }
594    // end inner class
595
596}