001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 020package org.apache.wiki.parser; 021 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.Iterator; 025import java.util.List; 026import java.util.NoSuchElementException; 027import java.util.StringTokenizer; 028 029import org.apache.logging.log4j.LogManager; 030import org.apache.logging.log4j.Logger; 031import org.jdom2.Attribute; 032 033/** 034 * Parses JSPWiki-style "augmented" link markup into a Link object 035 * containing the link text, link reference, and any optional link 036 * attributes (as JDOM Attributes). 037 * <p> 038 * The parser recognizes three link forms: 039 * </p> 040 * <ol> 041 * <li><tt> [Text] </tt></li> 042 * <li><tt> [Text | Link] </tt></li> 043 * <li><tt> [Text | Link | attributes] </tt></li> 044 * </ol> 045 * <p> 046 * where the attributes are space-delimited, each in the form of 047 * </p> 048 * <pre> 049 * name1='value1' name2='value2' name3='value3' (etc.) </pre> 050 * <p> 051 * If the attribute parsing fails, the parser will still return the 052 * basic link, writing a warning to the log. 053 * </p> 054 * 055 * <h3>Permitted Attributes</h3> 056 * <p> 057 * Attributes that aren't declared on <tt><a></tt> or those that 058 * permit scripting in HTML (as this is a security risk) are ignored 059 * and have no effect on parsing, nor show up in the resulting attribute 060 * list). The 'href' and 'name' attributes are also ignored as spurious. 061 * The permitted list is: 'accesskey', 'charset', 'class', 'hreflang', 062 * 'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' , 063 * 'title', and 'type'. The declared attributes that will be ignored 064 * are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any 065 * of the other 'on*' event attributes. 066 * </p> 067 * <p> 068 * The permitted attributes and target attribute values are static 069 * String arrays ({@link #PERMITTED_ATTRIBUTES} and 070 * {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time 071 * modified (i.e., predeclared). 072 * </p> 073 * 074 * <h3>Permitted Values on Target Attribute</h3> 075 * <p> 076 * The following target names are reserved in HTML 4 and have special 077 * meanings. These are the only values permitted by the parser. 078 * <dl> 079 * <dt><b>_blank</b></dt> 080 * <dd> The user agent should load the designated document in a new, 081 * unnamed window. </dd> 082 * <dt><b>_self</b></dt> 083 * <dd> The user agent should load the document in the same frame as 084 * the element that refers to this target. </dd> 085 * <dt><b>_parent</b></dt> 086 * <dd> The user agent should load the document into the immediate 087 * FRAMESET parent of the current frame. This value is equivalent to 088 * _self if the current frame has no parent. </dd> 089 * <dt><b>_top</b></dt> 090 * <dd> The user agent should load the document into the full, 091 * original window (thus canceling all other frames). This value is 092 * equivalent to _self if the current frame has no parent. </dd> 093 * </dl> 094 * 095 * <h3>Returned Value</h3> 096 * <p> 097 * This returns a <b>Link</b> object, a public inner class with methods: 098 * <ul> 099 * <li> <tt>getText()</tt> returns the link text. </li> 100 * <li> <tt>getReference()</tt> returns the link reference value. </li> 101 * <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li> 102 * <li> <tt>getAttributes()</tt> returns an iterator over any validated 103 * XHTML-compliant attributes, returned as JDOM Attributes. 104 * </li> 105 * </ul> 106 * <p> 107 * The <tt>attributeCount()</tt> method can be used to circumvent calling 108 * <tt>getAttributes()</tt>, which will create an empty Iterator rather 109 * than return a null. 110 * </p> 111 * 112 * <h3>Example: Link Form 1</h3> 113 * <p> 114 * From an incoming wikitext link of: 115 * <pre> 116 * [Acme] </pre> 117 * returns: 118 * <pre> 119 * getText(): "Acme" 120 * getReference(): "Acme" 121 * attributeCount(): 0 122 * getAttributes(): an empty Iterator </pre> 123 * 124 * <h3>Example: Link Form 2</h3> 125 * <p> 126 * From an incoming wikitext link of: 127 * <pre> 128 * [Acme | http://www.acme.com/] </pre> 129 * returns: 130 * <pre> 131 * getText(): "Acme" 132 * getReference(): "http://www.acme.com/" 133 * attributeCount(): 0 134 * getAttributes(): an empty Iterator </pre> 135 * 136 * <h3>Example: Link Form 3</h3> 137 * <p> 138 * From an incoming wikitext link of: 139 * </p> 140 * <pre> 141 * [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre> 142 * returns: 143 * <pre> 144 * getText(): "Acme" 145 * getReference(): "http://www.acme.com/" 146 * attributeCount(): 2 147 * getAttributes(): an Iterator containing: 148 * JDOM Attribute: id="foo" 149 * JDOM Attribute: rel="Next" </pre> 150 * 151 * 152 * @since 2.5.10 153 */ 154public class LinkParser 155{ 156 private static final Logger log = LogManager.getLogger(LinkParser.class); 157 158 /** Permitted attributes on links. Keep this sorted. */ 159 private static final String[] PERMITTED_ATTRIBUTES = new String[] { 160 "accesskey", "charset", "class", "dir", "hreflang", "id", "lang", 161 "rel", "rev", "style", "tabindex", "target", "title", "type" }; 162 163 /** Permitted values on the 'target' attribute. */ 164 private static final String[] PERMITTED_TARGET_VALUES = new String[] { 165 "_blank", "_self", "_parent", "_top" }; 166 167 /** Links with target="_blank" can expose your site to performance and security issues. 168 To fix, add rel="noopener" or rel="noreferrer" to these links. 169 */ 170 private static final String REL = "rel"; 171 private static final String NOREFERRER = "noreferrer"; 172 173 private static final String EQSQUO = "='"; 174 private static final String SQUO = "'"; 175 private static final String EQ = "="; 176 private static final String TARGET = "target"; 177 private static final String DELIMS = " \t\n\r\f="; 178 179 private static final List< Attribute > m_EMPTY = new ArrayList< >(); 180 181 // ............ 182 183 184 /** 185 * Processes incoming link text, separating out the link text, the link 186 * URI, and then any specified attributes. 187 * 188 * @param linktext the wiki link text to be parsed 189 * @return a Link object containing the link text, reference, and any valid Attributes 190 * @throws ParseException if the parameter is null 191 */ 192 public Link parse(final String linktext ) throws ParseException 193 { 194 if( linktext == null ) 195 { 196 throw new ParseException("null value passed to link parser"); 197 } 198 199 Link link = null; 200 201 try 202 { 203 // establish link text and link ref 204 final int cut1 = linktext.indexOf('|'); 205 if( cut1 == -1 ) 206 { 207 // link form 1: [Acme] 208 return new Link( linktext ); 209 } 210 211 final int cut2 = cut1+1 < linktext.length() 212 ? linktext.indexOf('|', cut1+1 ) 213 : -1 ; 214 215 if ( cut2 == -1 ) 216 { 217 // link form 2: [Acme | http://www.acme.com/] 218 // text = Acme 219 final String text = linktext.substring( 0, cut1 ).trim(); 220 // ref = http://www.acme.com/ 221 final String ref = linktext.substring( cut1+1 ).trim(); 222 return new Link( text, ref ); 223 } 224 225 // link form 3: [Acme | http://www.acme.com/ | id='foo' rel='Next'] 226 final String text = linktext.substring( 0, cut1 ).trim(); 227 final String ref = linktext.substring( cut1+1, cut2 ).trim(); 228 // attribs = id='foo' rel='Next' 229 final String attribs = linktext.substring( cut2+1 ).trim(); 230 231 link = new Link( text, ref ); 232 233 // parse attributes 234 // contains "='" that looks like attrib spec 235 if( attribs.indexOf(EQSQUO) != -1 ) 236 { 237 try 238 { 239 final StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true); 240 while ( tok.hasMoreTokens() ) 241 { 242 // get attribute name token 243 String token = tok.nextToken(DELIMS).trim(); 244 while ( isSpace(token) && tok.hasMoreTokens() ) 245 { 246 // remove all whitespace 247 token = tok.nextToken(DELIMS).trim(); 248 } 249 250 // eat '=', break after '=' 251 require( tok, EQ ); 252 // eat opening delim 253 require( tok, SQUO ); 254 // using existing delim 255 final String value = tok.nextToken(SQUO); 256 // eat closing delim 257 require( tok, SQUO ); 258 259 if( token != null && value != null ) 260 { 261 if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 ) 262 { 263 // _blank _self _parent _top 264 if( !token.equals(TARGET) 265 || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 ) 266 { 267 final Attribute a = new Attribute(token,value); 268 link.addAttribute(a); 269 270 if( token.equals(TARGET) ) 271 { 272 final Attribute rel = new Attribute(REL,NOREFERRER); 273 link.addAttribute(rel); 274 } 275 276 } 277 else 278 { 279 throw new ParseException("unknown target attribute value='" 280 + value + "' on link"); 281 } 282 } 283 else 284 { 285 throw new ParseException("unknown attribute name '" 286 + token + "' on link"); 287 } 288 } 289 else 290 { 291 throw new ParseException("unable to parse link attributes '" 292 + attribs + "'"); 293 294 } 295 } 296 } 297 catch( final ParseException pe ) 298 { 299 log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage()); 300 } 301 catch( final NoSuchElementException nse ) 302 { 303 log.warn("expected more tokens while parsing link attributes '" + attribs + "'"); 304 } 305 } 306 307 } 308 catch( final Exception e ) 309 { 310 log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() ); 311 } 312 313 return link; 314 } 315 316 317 private String require(final StringTokenizer tok, final String required ) 318 throws ParseException, NoSuchElementException 319 { 320 final String s = tok.nextToken(required); 321 if( !s.equals(required) ) 322 { 323 throw new ParseException("expected '"+required+"' not '"+s+"'"); 324 } 325 return s; 326 } 327 328 329 /** 330 * Returns true if the String <tt>s</tt> is completely 331 * composed of whitespace. 332 * 333 * @param s The string to check 334 * @return True, if "s" is all XML whitespace. 335 */ 336 public static final boolean isSpace(final String s ) 337 { 338 for( int i = 0 ; i < s.length() ; i++ ) 339 { 340 if( !isSpace( s.charAt(i)) ) return false; 341 } 342 return true; 343 } 344 345 346 /** 347 * Returns true if char <tt>c</tt> is a member of 348 * <tt>S</tt> (space) [XML 1.1 production 3]. 349 * 350 * @param c Character to check. 351 * @return True, if the character is an XML space. 352 */ 353 public static final boolean isSpace(final char c ) 354 { 355 // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator 356 return 357 0x20 == c 358 || 0x0A == c 359 || 0x0D == c 360 || 0x09 == c 361 || 0x85 == c 362 || 0x2028 == c; 363 } 364 365 366 // ......................................................................... 367 368 369 /** 370 * Inner class serving as a struct containing the parsed 371 * components of a link. 372 */ 373 public static class Link 374 { 375 private String m_text; 376 private String m_ref; 377 private int m_interwikiPoint = -1; 378 private List<Attribute> m_attribs; 379 380 /** 381 * Create a new Link with text but no reference. 382 * @param text The link text. 383 * @throws ParseException If the link text is illegal. 384 */ 385 protected Link(final String text ) throws ParseException 386 { 387 setText(text); 388 } 389 390 /** 391 * Create a new link with a given text and hyperlink (reference). 392 * 393 * @param text The link text. 394 * @param ref The hypertext reference. 395 * @throws ParseException If the link text or reference are illegal. 396 */ 397 protected Link(final String text, final String ref ) throws ParseException 398 { 399 setText(text); 400 setReference(ref); 401 } 402 403 /** 404 * Sets the link text. 405 * 406 * @param text The link text. 407 * @throws ParseException If the text is illegal (e.g. null). 408 */ 409 protected void setText(final String text ) throws ParseException 410 { 411 if( text == null ) 412 { 413 throw new ParseException("null link text"); 414 } 415 m_text = text; 416 } 417 418 /** 419 * Returns the link text. 420 * 421 * @return Link text. 422 */ 423 public String getText() 424 { 425 return m_text; 426 } 427 428 /** 429 * Sets the hypertext reference. Typically, this is an URI or an interwiki link, 430 * or a wikilink. 431 * 432 * @param ref The reference. 433 * @throws ParseException If the reference is illegal. 434 */ 435 protected void setReference(final String ref ) throws ParseException 436 { 437 if( ref == null ) 438 { 439 throw new ParseException("null link reference value"); 440 } 441 m_ref = ref; 442 } 443 444 /** 445 * Returns true, if there is a reference. 446 * 447 * @return True, if there's a reference; false otherwise. 448 */ 449 public boolean hasReference() 450 { 451 return m_ref != null; 452 } 453 454 /** 455 * Returns the link reference, or the link text if null. 456 * 457 * @return A link reference. 458 */ 459 public String getReference() 460 { 461 return m_ref != null 462 ? m_ref 463 : m_text ; 464 } 465 466 /** 467 * Returns true, if this Link represents an InterWiki link (of the form wiki:page). 468 * 469 * @return True, if this Link represents an InterWiki link. 470 */ 471 public boolean isInterwikiLink() 472 { 473 final LinkParsingOperations lpo = new LinkParsingOperations( null ); 474 if( !hasReference() ) m_ref = m_text; 475 m_interwikiPoint = lpo.interWikiLinkAt( m_ref ); 476 return lpo.isInterWikiLink( m_ref ); 477 } 478 479 /** 480 * Returns the name of the wiki if this is an interwiki link. 481 * <pre> 482 * Link link = new Link("Foo","Wikipedia:Foobar"); 483 * assert( link.getExternalWikiPage(), "Wikipedia" ); 484 * </pre> 485 * 486 * @return Name of the wiki, or null, if this is not an interwiki link. 487 */ 488 public String getExternalWiki() 489 { 490 if( isInterwikiLink() ) 491 { 492 return m_ref.substring( 0, m_interwikiPoint ); 493 } 494 495 return null; 496 } 497 498 /** 499 * Returns the wikiname part of an interwiki link. Used only with interwiki links. 500 * <pre> 501 * Link link = new Link("Foo","Wikipedia:Foobar"); 502 * assert( link.getExternalWikiPage(), "Foobar" ); 503 * </pre> 504 * 505 * @return Wikiname part, or null, if this is not an interwiki link. 506 */ 507 public String getExternalWikiPage() 508 { 509 if( isInterwikiLink() ) 510 { 511 return m_ref.substring( m_interwikiPoint+1 ); 512 } 513 514 return null; 515 } 516 517 /** 518 * Returns the number of attributes on this link. 519 * 520 * @return The number of attributes. 521 */ 522 public int attributeCount() 523 { 524 return m_attribs != null 525 ? m_attribs.size() 526 : 0 ; 527 } 528 529 /** 530 * Adds another attribute to the link. 531 * 532 * @param attr A JDOM Attribute. 533 */ 534 public void addAttribute(final Attribute attr ) 535 { 536 if( m_attribs == null ) 537 { 538 m_attribs = new ArrayList<>(); 539 } 540 m_attribs.add(attr); 541 } 542 543 /** 544 * Returns an Iterator over the list of JDOM Attributes. 545 * 546 * @return Iterator over the attributes. 547 */ 548 public Iterator< Attribute > getAttributes() 549 { 550 return m_attribs != null 551 ? m_attribs.iterator() 552 : m_EMPTY.iterator() ; 553 } 554 555 /** 556 * Returns a wikitext string representation of this Link. 557 * @return WikiText. 558 */ 559 @Override 560 public String toString() 561 { 562 final StringBuilder sb = new StringBuilder(); 563 sb.append( '[' ); 564 sb.append( m_text ); 565 566 if( m_ref != null ) 567 { 568 sb.append( ' ' ); 569 sb.append( '|' ); 570 sb.append( ' ' ); 571 sb.append( m_ref ); 572 } 573 574 if( m_attribs != null ) 575 { 576 sb.append( ' ' ); 577 sb.append( '|' ); 578 final Iterator< Attribute > it = getAttributes(); 579 while ( it.hasNext() ) 580 { 581 final Attribute a = it.next(); 582 sb.append( ' ' ); 583 sb.append( a.getName() ); 584 sb.append( '=' ); 585 sb.append( '\'' ); 586 sb.append( a.getValue() ); 587 sb.append( '\'' ); 588 } 589 } 590 sb.append( ']' ); 591 return sb.toString(); 592 } 593 594 } 595 // end inner class 596 597}