001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 020package org.apache.wiki.parser; 021 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.Iterator; 025import java.util.List; 026import java.util.NoSuchElementException; 027import java.util.StringTokenizer; 028import java.util.stream.IntStream; 029 030import org.apache.logging.log4j.LogManager; 031import org.apache.logging.log4j.Logger; 032import org.jdom2.Attribute; 033 034/** 035 * Parses JSPWiki-style "augmented" link markup into a Link object 036 * containing the link text, link reference, and any optional link 037 * attributes (as JDOM Attributes). 038 * <p> 039 * The parser recognizes three link forms: 040 * </p> 041 * <ol> 042 * <li><tt> [Text] </tt></li> 043 * <li><tt> [Text | Link] </tt></li> 044 * <li><tt> [Text | Link | attributes] </tt></li> 045 * </ol> 046 * <p> 047 * where the attributes are space-delimited, each in the form of 048 * </p> 049 * <pre> 050 * name1='value1' name2='value2' name3='value3' (etc.) </pre> 051 * <p> 052 * If the attribute parsing fails, the parser will still return the 053 * basic link, writing a warning to the log. 054 * </p> 055 * 056 * <h3>Permitted Attributes</h3> 057 * <p> 058 * Attributes that aren't declared on <tt><a></tt> or those that 059 * permit scripting in HTML (as this is a security risk) are ignored 060 * and have no effect on parsing, nor show up in the resulting attribute 061 * list). The 'href' and 'name' attributes are also ignored as spurious. 062 * The permitted list is: 'accesskey', 'charset', 'class', 'hreflang', 063 * 'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' , 064 * 'title', and 'type'. The declared attributes that will be ignored 065 * are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any 066 * of the other 'on*' event attributes. 067 * </p> 068 * <p> 069 * The permitted attributes and target attribute values are static 070 * String arrays ({@link #PERMITTED_ATTRIBUTES} and 071 * {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time 072 * modified (i.e., predeclared). 073 * </p> 074 * 075 * <h3>Permitted Values on Target Attribute</h3> 076 * <p> 077 * The following target names are reserved in HTML 4 and have special 078 * meanings. These are the only values permitted by the parser. 079 * <dl> 080 * <dt><b>_blank</b></dt> 081 * <dd> The user agent should load the designated document in a new, 082 * unnamed window. </dd> 083 * <dt><b>_self</b></dt> 084 * <dd> The user agent should load the document in the same frame as 085 * the element that refers to this target. </dd> 086 * <dt><b>_parent</b></dt> 087 * <dd> The user agent should load the document into the immediate 088 * FRAMESET parent of the current frame. This value is equivalent to 089 * _self if the current frame has no parent. </dd> 090 * <dt><b>_top</b></dt> 091 * <dd> The user agent should load the document into the full, 092 * original window (thus canceling all other frames). This value is 093 * equivalent to _self if the current frame has no parent. </dd> 094 * </dl> 095 * 096 * <h3>Returned Value</h3> 097 * <p> 098 * This returns a <b>Link</b> object, a public inner class with methods: 099 * <ul> 100 * <li> <tt>getText()</tt> returns the link text. </li> 101 * <li> <tt>getReference()</tt> returns the link reference value. </li> 102 * <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li> 103 * <li> <tt>getAttributes()</tt> returns an iterator over any validated 104 * XHTML-compliant attributes, returned as JDOM Attributes. 105 * </li> 106 * </ul> 107 * <p> 108 * The <tt>attributeCount()</tt> method can be used to circumvent calling 109 * <tt>getAttributes()</tt>, which will create an empty Iterator rather 110 * than return a null. 111 * </p> 112 * 113 * <h3>Example: Link Form 1</h3> 114 * <p> 115 * From an incoming wikitext link of: 116 * <pre> 117 * [Acme] </pre> 118 * returns: 119 * <pre> 120 * getText(): "Acme" 121 * getReference(): "Acme" 122 * attributeCount(): 0 123 * getAttributes(): an empty Iterator </pre> 124 * 125 * <h3>Example: Link Form 2</h3> 126 * <p> 127 * From an incoming wikitext link of: 128 * <pre> 129 * [Acme | http://www.acme.com/] </pre> 130 * returns: 131 * <pre> 132 * getText(): "Acme" 133 * getReference(): "http://www.acme.com/" 134 * attributeCount(): 0 135 * getAttributes(): an empty Iterator </pre> 136 * 137 * <h3>Example: Link Form 3</h3> 138 * <p> 139 * From an incoming wikitext link of: 140 * </p> 141 * <pre> 142 * [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre> 143 * returns: 144 * <pre> 145 * getText(): "Acme" 146 * getReference(): "http://www.acme.com/" 147 * attributeCount(): 2 148 * getAttributes(): an Iterator containing: 149 * JDOM Attribute: id="foo" 150 * JDOM Attribute: rel="Next" </pre> 151 * 152 * 153 * @since 2.5.10 154 */ 155public class LinkParser 156{ 157 private static final Logger LOG = LogManager.getLogger(LinkParser.class); 158 159 /** Permitted attributes on links. Keep this sorted. */ 160 private static final String[] PERMITTED_ATTRIBUTES = new String[] { 161 "accesskey", "charset", "class", "dir", "hreflang", "id", "lang", 162 "rel", "rev", "style", "tabindex", "target", "title", "type" }; 163 164 /** Permitted values on the 'target' attribute. */ 165 private static final String[] PERMITTED_TARGET_VALUES = new String[] { 166 "_blank", "_self", "_parent", "_top" }; 167 168 /** Links with target="_blank" can expose your site to performance and security issues. 169 To fix, add rel="noopener" or rel="noreferrer" to these links. 170 */ 171 private static final String REL = "rel"; 172 private static final String NOREFERRER = "noreferrer"; 173 174 private static final String EQSQUO = "='"; 175 private static final String SQUO = "'"; 176 private static final String EQ = "="; 177 private static final String TARGET = "target"; 178 private static final String DELIMS = " \t\n\r\f="; 179 180 private static final List< Attribute > m_EMPTY = new ArrayList< >(); 181 182 // ............ 183 184 185 /** 186 * Processes incoming link text, separating out the link text, the link 187 * URI, and then any specified attributes. 188 * 189 * @param linktext the wiki link text to be parsed 190 * @return a Link object containing the link text, reference, and any valid Attributes 191 * @throws ParseException if the parameter is null 192 */ 193 public Link parse(final String linktext ) throws ParseException 194 { 195 if( linktext == null ) 196 { 197 throw new ParseException("null value passed to link parser"); 198 } 199 200 Link link = null; 201 202 try 203 { 204 // establish link text and link ref 205 final int cut1 = linktext.indexOf('|'); 206 if( cut1 == -1 ) 207 { 208 // link form 1: [Acme] 209 return new Link( linktext ); 210 } 211 212 final int cut2 = cut1+1 < linktext.length() 213 ? linktext.indexOf('|', cut1+1 ) 214 : -1 ; 215 216 if ( cut2 == -1 ) 217 { 218 // link form 2: [Acme | http://www.acme.com/] 219 // text = Acme 220 final String text = linktext.substring( 0, cut1 ).trim(); 221 // ref = http://www.acme.com/ 222 final String ref = linktext.substring( cut1+1 ).trim(); 223 return new Link( text, ref ); 224 } 225 226 // link form 3: [Acme | http://www.acme.com/ | id='foo' rel='Next'] 227 final String text = linktext.substring( 0, cut1 ).trim(); 228 final String ref = linktext.substring( cut1+1, cut2 ).trim(); 229 // attribs = id='foo' rel='Next' 230 final String attribs = linktext.substring( cut2+1 ).trim(); 231 232 link = new Link( text, ref ); 233 234 // parse attributes 235 // contains "='" that looks like attrib spec 236 if(attribs.contains(EQSQUO)) 237 { 238 try 239 { 240 final StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true); 241 while ( tok.hasMoreTokens() ) 242 { 243 // get attribute name token 244 String token = tok.nextToken(DELIMS).trim(); 245 while ( isSpace(token) && tok.hasMoreTokens() ) 246 { 247 // remove all whitespace 248 token = tok.nextToken(DELIMS).trim(); 249 } 250 251 // eat '=', break after '=' 252 require( tok, EQ ); 253 // eat opening delim 254 require( tok, SQUO ); 255 // using existing delim 256 final String value = tok.nextToken(SQUO); 257 // eat closing delim 258 require( tok, SQUO ); 259 260 if( token != null && value != null ) 261 { 262 if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 ) 263 { 264 // _blank _self _parent _top 265 if( !token.equals(TARGET) 266 || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 ) 267 { 268 final Attribute a = new Attribute(token,value); 269 link.addAttribute(a); 270 271 if( token.equals(TARGET) ) 272 { 273 final Attribute rel = new Attribute(REL,NOREFERRER); 274 link.addAttribute(rel); 275 } 276 277 } 278 else 279 { 280 throw new ParseException("unknown target attribute value='" 281 + value + "' on link"); 282 } 283 } 284 else 285 { 286 throw new ParseException("unknown attribute name '" 287 + token + "' on link"); 288 } 289 } 290 else 291 { 292 throw new ParseException("unable to parse link attributes '" 293 + attribs + "'"); 294 295 } 296 } 297 } 298 catch( final ParseException pe ) 299 { 300 LOG.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage()); 301 } 302 catch( final NoSuchElementException nse ) 303 { 304 LOG.warn("expected more tokens while parsing link attributes '" + attribs + "'"); 305 } 306 } 307 308 } 309 catch( final Exception e ) 310 { 311 LOG.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() ); 312 } 313 314 return link; 315 } 316 317 318 private String require(final StringTokenizer tok, final String required ) 319 throws ParseException, NoSuchElementException 320 { 321 final String s = tok.nextToken(required); 322 if( !s.equals(required) ) 323 { 324 throw new ParseException("expected '"+required+"' not '"+s+"'"); 325 } 326 return s; 327 } 328 329 330 /** 331 * Returns true if the String <tt>s</tt> is completely 332 * composed of whitespace. 333 * 334 * @param s The string to check 335 * @return True, if "s" is all XML whitespace. 336 */ 337 public static final boolean isSpace(final String s ) 338 { 339 return IntStream.range(0, s.length()).allMatch(i -> isSpace(s.charAt(i))); 340 } 341 342 343 /** 344 * Returns true if char <tt>c</tt> is a member of 345 * <tt>S</tt> (space) [XML 1.1 production 3]. 346 * 347 * @param c Character to check. 348 * @return True, if the character is an XML space. 349 */ 350 public static final boolean isSpace(final char c ) 351 { 352 // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator 353 return 354 0x20 == c 355 || 0x0A == c 356 || 0x0D == c 357 || 0x09 == c 358 || 0x85 == c 359 || 0x2028 == c; 360 } 361 362 363 // ......................................................................... 364 365 366 /** 367 * Inner class serving as a struct containing the parsed 368 * components of a link. 369 */ 370 public static class Link 371 { 372 private String m_text; 373 private String m_ref; 374 private int m_interwikiPoint = -1; 375 private List<Attribute> m_attribs; 376 377 /** 378 * Create a new Link with text but no reference. 379 * @param text The link text. 380 * @throws ParseException If the link text is illegal. 381 */ 382 protected Link(final String text ) throws ParseException 383 { 384 setText(text); 385 } 386 387 /** 388 * Create a new link with a given text and hyperlink (reference). 389 * 390 * @param text The link text. 391 * @param ref The hypertext reference. 392 * @throws ParseException If the link text or reference are illegal. 393 */ 394 protected Link(final String text, final String ref ) throws ParseException 395 { 396 setText(text); 397 setReference(ref); 398 } 399 400 /** 401 * Sets the link text. 402 * 403 * @param text The link text. 404 * @throws ParseException If the text is illegal (e.g. null). 405 */ 406 protected void setText(final String text ) throws ParseException 407 { 408 if( text == null ) 409 { 410 throw new ParseException("null link text"); 411 } 412 m_text = text; 413 } 414 415 /** 416 * Returns the link text. 417 * 418 * @return Link text. 419 */ 420 public String getText() 421 { 422 return m_text; 423 } 424 425 /** 426 * Sets the hypertext reference. Typically, this is an URI or an interwiki link, 427 * or a wikilink. 428 * 429 * @param ref The reference. 430 * @throws ParseException If the reference is illegal. 431 */ 432 protected void setReference(final String ref ) throws ParseException 433 { 434 if( ref == null ) 435 { 436 throw new ParseException("null link reference value"); 437 } 438 m_ref = ref; 439 } 440 441 /** 442 * Returns true, if there is a reference. 443 * 444 * @return True, if there's a reference; false otherwise. 445 */ 446 public boolean hasReference() 447 { 448 return m_ref != null; 449 } 450 451 /** 452 * Returns the link reference, or the link text if null. 453 * 454 * @return A link reference. 455 */ 456 public String getReference() 457 { 458 return m_ref != null 459 ? m_ref 460 : m_text ; 461 } 462 463 /** 464 * Returns true, if this Link represents an InterWiki link (of the form wiki:page). 465 * 466 * @return True, if this Link represents an InterWiki link. 467 */ 468 public boolean isInterwikiLink() 469 { 470 final LinkParsingOperations lpo = new LinkParsingOperations( null ); 471 if( !hasReference() ) m_ref = m_text; 472 m_interwikiPoint = lpo.interWikiLinkAt( m_ref ); 473 return lpo.isInterWikiLink( m_ref ); 474 } 475 476 /** 477 * Returns the name of the wiki if this is an interwiki link. 478 * <pre> 479 * Link link = new Link("Foo","Wikipedia:Foobar"); 480 * assert( link.getExternalWikiPage(), "Wikipedia" ); 481 * </pre> 482 * 483 * @return Name of the wiki, or null, if this is not an interwiki link. 484 */ 485 public String getExternalWiki() 486 { 487 if( isInterwikiLink() ) 488 { 489 return m_ref.substring( 0, m_interwikiPoint ); 490 } 491 492 return null; 493 } 494 495 /** 496 * Returns the wikiname part of an interwiki link. Used only with interwiki links. 497 * <pre> 498 * Link link = new Link("Foo","Wikipedia:Foobar"); 499 * assert( link.getExternalWikiPage(), "Foobar" ); 500 * </pre> 501 * 502 * @return Wikiname part, or null, if this is not an interwiki link. 503 */ 504 public String getExternalWikiPage() 505 { 506 if( isInterwikiLink() ) 507 { 508 return m_ref.substring( m_interwikiPoint+1 ); 509 } 510 511 return null; 512 } 513 514 /** 515 * Returns the number of attributes on this link. 516 * 517 * @return The number of attributes. 518 */ 519 public int attributeCount() 520 { 521 return m_attribs != null 522 ? m_attribs.size() 523 : 0 ; 524 } 525 526 /** 527 * Adds another attribute to the link. 528 * 529 * @param attr A JDOM Attribute. 530 */ 531 public void addAttribute(final Attribute attr ) 532 { 533 if( m_attribs == null ) 534 { 535 m_attribs = new ArrayList<>(); 536 } 537 m_attribs.add(attr); 538 } 539 540 /** 541 * Returns an Iterator over the list of JDOM Attributes. 542 * 543 * @return Iterator over the attributes. 544 */ 545 public Iterator< Attribute > getAttributes() 546 { 547 return m_attribs != null 548 ? m_attribs.iterator() 549 : m_EMPTY.iterator() ; 550 } 551 552 /** 553 * Returns a wikitext string representation of this Link. 554 * @return WikiText. 555 */ 556 @Override 557 public String toString() 558 { 559 final StringBuilder sb = new StringBuilder(); 560 sb.append( '[' ); 561 sb.append( m_text ); 562 563 if( m_ref != null ) 564 { 565 sb.append( ' ' ); 566 sb.append( '|' ); 567 sb.append( ' ' ); 568 sb.append( m_ref ); 569 } 570 571 if( m_attribs != null ) 572 { 573 sb.append( ' ' ); 574 sb.append( '|' ); 575 final Iterator< Attribute > it = getAttributes(); 576 while ( it.hasNext() ) 577 { 578 final Attribute a = it.next(); 579 sb.append( ' ' ); 580 sb.append( a.getName() ); 581 sb.append( '=' ); 582 sb.append( '\'' ); 583 sb.append( a.getValue() ); 584 sb.append( '\'' ); 585 } 586 } 587 sb.append( ']' ); 588 return sb.toString(); 589 } 590 591 } 592 // end inner class 593 594}