/*
    Licensed to the Apache Software Foundation (ASF) under one
    or more contributor license agreements.  See the NOTICE file
    distributed with this work for additional information
    regarding copyright ownership.  The ASF licenses this file
    to you under the Apache License, Version 2.0 (the
    "License"); you may not use this file except in compliance
    with the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing,
    software distributed under the License is distributed on an
    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    KIND, either express or implied.  See the License for the
    specific language governing permissions and limitations
    under the License.
 */

package org.apache.wiki.parser;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import org.apache.log4j.Logger;
import org.jdom2.Attribute;

/**
 *  Parses JSPWiki-style "augmented" link markup into a Link object
 *  containing the link text, link reference, and any optional link
 *  attributes (as JDOM Attributes).
 *  <p>
 *  The parser recognizes three link forms:
 *  </p>
 *  <ol>
 *    <li><tt> [Text] </tt></li>
 *    <li><tt> [Text | Link] </tt></li>
 *    <li><tt> [Text | Link | attributes] </tt></li>
 *  </ol>
 *  <p>
 *  where the attributes are space-delimited, each in the form of
 *  </p>
 *  <pre>
 *      name1='value1' name2='value2' name3='value3' (etc.) </pre>
 *  <p>
 *  If the attribute parsing fails, the parser will still return the
 *  basic link, writing a warning to the log.
 *  </p>
 *
 *  <h3>Permitted Attributes</h3>
 *  <p>
 *  Attributes that aren't declared on <tt>&lt;a&gt;</tt> or those that
 *  permit scripting in HTML (as this is a security risk) are ignored
 *  and have no effect on parsing, nor show up in the resulting attribute
 *  list. The 'href' and 'name' attributes are also ignored as spurious.
 *  The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
 *  'id', 'lang', 'dir', 'rel', 'rev', 'style', 'tabindex', 'target',
 *  'title', and 'type'. The declared attributes that will be ignored
 *  are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
 *  of the other 'on*' event attributes.
 *  </p>
 *  <p>
 *  The permitted attributes and target attribute values are static
 *  String arrays ({@link #PERMITTED_ATTRIBUTES} and
 *  {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
 *  modified (i.e., predeclared). Both arrays are looked up with a binary
 *  search and therefore must be kept in sorted order.
 *  </p>
 *
 *  <h3>Permitted Values on Target Attribute</h3>
 *  <p>
 *  The following target names are reserved in HTML 4 and have special
 *  meanings. These are the only values permitted by the parser.
 *  </p>
 *  <dl>
 *    <dt><b>_blank</b></dt>
 *    <dd> The user agent should load the designated document in a new,
 *    unnamed window. </dd>
 *    <dt><b>_self</b></dt>
 *    <dd> The user agent should load the document in the same frame as
 *    the element that refers to this target. </dd>
 *    <dt><b>_parent</b></dt>
 *    <dd> The user agent should load the document into the immediate
 *    FRAMESET parent of the current frame. This value is equivalent to
 *    _self if the current frame has no parent. </dd>
 *    <dt><b>_top</b></dt>
 *    <dd> The user agent should load the document into the full,
 *    original window (thus canceling all other frames). This value is
 *    equivalent to _self if the current frame has no parent. </dd>
 *  </dl>
 *
 *  <h3>Returned Value</h3>
 *  <p>
 *  This returns a <b>Link</b> object, a public inner class with methods:
 *  </p>
 *  <ul>
 *    <li> <tt>getText()</tt> returns the link text. </li>
 *    <li> <tt>getReference()</tt> returns the link reference value. </li>
 *    <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
 *    <li> <tt>getAttributes()</tt> returns an iterator over any validated
 *        XHTML-compliant attributes, returned as JDOM Attributes.
 *    </li>
 *  </ul>
 *  <p>
 *  The <tt>attributeCount()</tt> method can be used to circumvent calling
 *  <tt>getAttributes()</tt>, which will create an empty Iterator rather
 *  than return a null.
 *  </p>
 *
 *  <h3>Example: Link Form 1</h3>
 *  <p>
 *  From an incoming wikitext link of:
 *  </p>
 *  <pre>
 *     [Acme] </pre>
 *  returns:
 *  <pre>
 *    getText():         "Acme"
 *    getReference():    "Acme"
 *    attributeCount():  0
 *    getAttributes():   an empty Iterator </pre>
 *
 *  <h3>Example: Link Form 2</h3>
 *  <p>
 *  From an incoming wikitext link of:
 *  </p>
 *  <pre>
 *     [Acme | http://www.acme.com/] </pre>
 *  returns:
 *  <pre>
 *    getText():         "Acme"
 *    getReference():    "http://www.acme.com/"
 *    attributeCount():  0
 *    getAttributes():   an empty Iterator </pre>
 *
 *  <h3>Example: Link Form 3</h3>
 *  <p>
 *  From an incoming wikitext link of:
 *  </p>
 *  <pre>
 *    [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
 *  returns:
 *  <pre>
 *    getText():         "Acme"
 *    getReference():    "http://www.acme.com/"
 *    attributeCount():  2
 *    getAttributes():   an Iterator containing:
 *      JDOM Attribute:  id="foo"
 *      JDOM Attribute:  rel="Next" </pre>
 *
 *
 *  @since 2.5.10
 */
public class LinkParser
{
    private static final Logger log = Logger.getLogger(LinkParser.class);

    /** Permitted attributes on links.  Keep this sorted: it is looked up with a binary search. */
    private static final String[] PERMITTED_ATTRIBUTES = new String[] {
            "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
            "rel", "rev", "style", "tabindex", "target", "title", "type" };

    /**
     *  Permitted values on the 'target' attribute.  Keep this sorted: it is
     *  looked up with a binary search, and an unsorted array makes some of
     *  the values (previously "_parent") impossible to find.
     */
    private static final String[] PERMITTED_TARGET_VALUES = new String[] {
            "_blank", "_parent", "_self", "_top" };

    /** Links with target="_blank" can expose your site to performance and security issues.
        To fix, add rel="noopener" or rel="noreferrer" to these links.
        The parser appends rel="noreferrer" whenever it accepts a 'target' attribute.
    */
    private static final String REL = "rel";
    private static final String NOREFERRER = "noreferrer";

    private static final String EQSQUO = "='";
    private static final String SQUO   = "'";
    private static final String EQ     = "=";
    private static final String TARGET = "target";
    private static final String DELIMS = " \t\n\r\f=";

    private static final List< Attribute > m_EMPTY = new ArrayList< >();

    // ............


    /**
     *  Processes incoming link text, separating out the link text, the link
     *  URI, and then any specified attributes.
     *  <p>
     *  Any exception raised while splitting the link or building attributes is
     *  logged as a warning and not propagated; in that case whatever Link was
     *  built so far is returned, which may be <tt>null</tt>.
     *  </p>
     *
     * @param  linktext  the wiki link text to be parsed
     * @return a Link object containing the link text, reference, and any valid Attributes
     * @throws ParseException if the parameter is null
     */
    public Link parse( String linktext ) throws ParseException
    {
        if( linktext == null )
        {
            throw new ParseException("null value passed to link parser");
        }

        Link link = null;

        try
        {
            // establish link text and link ref
            int cut1 = linktext.indexOf('|');
            if( cut1 == -1 )
            {
                // link form 1:  [Acme]
                return new Link( linktext );
            }

            int cut2 = cut1+1 < linktext.length()
                    ? linktext.indexOf('|', cut1+1 )
                    : -1 ;

            // text = Acme (common to link forms 2 and 3)
            String text = linktext.substring( 0, cut1 ).trim();

            if( cut2 == -1 )
            {
                // link form 2:  [Acme | http://www.acme.com/]
                // ref = http://www.acme.com/
                String ref = linktext.substring( cut1+1 ).trim();
                return new Link( text, ref );
            }

            // link form 3:  [Acme | http://www.acme.com/ | id='foo' rel='Next']
            String ref = linktext.substring( cut1+1, cut2 ).trim();
            // attribs = id='foo' rel='Next'
            String attribs = linktext.substring( cut2+1 ).trim();

            link = new Link( text, ref );

            // only attempt attribute parsing when the tail contains "='",
            // i.e. something that looks like an attribute specification
            if( attribs.indexOf(EQSQUO) != -1 )
            {
                parseAttributes( link, attribs );
            }
        }
        catch( Exception e )
        {
            log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
        }

        return link;
    }


    /**
     *  Tokenizes an attribute specification of the form
     *  <tt>name1='value1' name2='value2'</tt> and adds every permitted
     *  attribute to the given link.  Parsing stops at the first syntax error,
     *  unknown attribute name or unpermitted target value; the problem is
     *  logged as a warning and attributes added before it are kept.
     *
     *  @param link    the link receiving the attributes
     *  @param attribs the trimmed attribute part of the wiki link
     */
    private void parseAttributes( Link link, String attribs )
    {
        try
        {
            StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
            while ( tok.hasMoreTokens() )
            {
                // get attribute name token, skipping whitespace tokens
                // (delimiters are returned as tokens and trim to "")
                String token = tok.nextToken(DELIMS).trim();
                while ( isSpace(token) && tok.hasMoreTokens() )
                {
                    token = tok.nextToken(DELIMS).trim();
                }

                // eat '='
                require( tok, EQ );
                // eat opening delim
                require( tok, SQUO );
                // value runs up to the closing delim
                String value = tok.nextToken(SQUO);
                // eat closing delim
                require( tok, SQUO );

                if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) < 0 )
                {
                    throw new ParseException("unknown attribute name '"
                                             + token + "' on link");
                }

                boolean isTarget = token.equals(TARGET);

                // _blank _parent _self _top
                if( isTarget && Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) < 0 )
                {
                    throw new ParseException("unknown target attribute value='"
                                             + value + "' on link");
                }

                link.addAttribute( new Attribute(token,value) );

                if( isTarget )
                {
                    // every accepted target is accompanied by rel="noreferrer"
                    link.addAttribute( new Attribute(REL,NOREFERRER) );
                }
            }
        }
        catch( ParseException pe )
        {
            log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
        }
        catch( NoSuchElementException nse )
        {
            log.warn("expected more tokens while parsing link attributes '" + attribs + "'");
        }
    }


    /**
     *  Reads the next token using <tt>required</tt> as the delimiter set and
     *  checks that the token is exactly <tt>required</tt>.
     *
     *  @param tok      the tokenizer to read from
     *  @param required the expected token, also used as the delimiter set
     *  @return the token that was read
     *  @throws ParseException if the token read differs from <tt>required</tt>
     *  @throws NoSuchElementException if the tokenizer has no more tokens
     */
    private String require( StringTokenizer tok, String required )
            throws ParseException, NoSuchElementException
    {
        String s = tok.nextToken(required);
        if( !s.equals(required) )
        {
            throw new ParseException("expected '"+required+"' not '"+s+"'");
        }
        return s;
    }


    /**
     *  Returns true if the String <tt>s</tt> is completely
     *  composed of whitespace.  An empty String yields true.
     *
     *  @param s The string to check
     *  @return True, if "s" is all XML whitespace.
     */
    public static final boolean isSpace( String s )
    {
        for( int i = 0 ; i < s.length() ; i++ )
        {
            if( !isSpace( s.charAt(i)) ) return false;
        }
        return true;
    }


    /**
     *  Returns true if char <tt>c</tt> is a member of
     *  <tt>S</tt> (space) [XML 1.1 production 3].
     *
     *  @param c Character to check.
     *  @return True, if the character is an XML space.
     */
    public static final boolean isSpace( char c )
    {
        // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
        return
            0x20 == c
            || 0x0A == c
            || 0x0D == c
            || 0x09 == c
            || 0x85 == c
            || 0x2028 == c;
    }


    // .........................................................................


    /**
     *  Inner class serving as a struct containing the parsed
     *  components of a link.
     */
    public static class Link
    {
        private String            m_text;
        private String            m_ref = null;
        private int               m_interwikiPoint = -1;
        private List<Attribute>   m_attribs = null;

        /**
         *  Create a new Link with text but no reference.
         *  @param text The link text.
         *  @throws ParseException If the link text is illegal.
         */
        protected Link( String text ) throws ParseException
        {
            setText(text);
        }

        /**
         *  Create a new link with a given text and hyperlink (reference).
         *
         *  @param text The link text.
         *  @param ref  The hypertext reference.
         *  @throws ParseException If the link text or reference are illegal.
         */
        protected Link( String text, String ref ) throws ParseException
        {
            setText(text);
            setReference(ref);
        }

        /**
         *  Sets the link text.
         *
         *  @param text The link text.
         *  @throws ParseException If the text is illegal (e.g. null).
         */
        protected void setText( String text ) throws ParseException
        {
            if( text == null )
            {
                throw new ParseException("null link text");
            }
            m_text = text;
        }

        /**
         *  Returns the link text.
         *
         *  @return Link text.
         */
        public String getText()
        {
            return m_text;
        }

        /**
         *  Sets the hypertext reference.  Typically, this is an URI or an interwiki link,
         *  or a wikilink.
         *
         *  @param ref The reference.
         *  @throws ParseException If the reference is illegal (e.g. null).
         */
        protected void setReference( String ref ) throws ParseException
        {
            if( ref == null )
            {
                throw new ParseException("null link reference value");
            }
            m_ref = ref;
        }

        /**
         *  Returns true, if there is a reference.
         *
         *  @return True, if there's a reference; false otherwise.
         */
        public boolean hasReference()
        {
            return m_ref != null;
        }

        /**
         *  Returns the link reference, or the link text if null.
         *
         *  @return A link reference.
         */
        public String getReference()
        {
            return m_ref != null
                    ? m_ref
                    : m_text ;
        }

        /**
         *  Returns true, if this Link represents an InterWiki link (of the form wiki:page).
         *  <p>
         *  Side effect: if the link has no reference yet, the link text is copied
         *  into the reference, so {@link #hasReference()} returns true afterwards.
         *  The position reported by the parsing operations is also remembered for
         *  {@link #getExternalWiki()} and {@link #getExternalWikiPage()}.
         *  </p>
         *
         *  @return True, if this Link represents an InterWiki link.
         */
        public boolean isInterwikiLink()
        {
            LinkParsingOperations lpo = new LinkParsingOperations( null );
            if( !hasReference() ) m_ref = m_text;
            m_interwikiPoint = lpo.interWikiLinkAt( m_ref );
            return lpo.isInterWikiLink( m_ref );
        }

        /**
         *  Returns the name of the wiki if this is an interwiki link.
         *  <pre>
         *    Link link = new Link("Foo","Wikipedia:Foobar");
         *    assert( link.getExternalWiki(), "Wikipedia" );
         *  </pre>
         *
         *  @return Name of the wiki, or null, if this is not an interwiki link.
         */
        public String getExternalWiki()
        {
            if( isInterwikiLink() )
            {
                return m_ref.substring( 0, m_interwikiPoint );
            }

            return null;
        }

        /**
         *  Returns the wikiname part of an interwiki link. Used only with interwiki links.
         *  <pre>
         *    Link link = new Link("Foo","Wikipedia:Foobar");
         *    assert( link.getExternalWikiPage(), "Foobar" );
         *  </pre>
         *
         *  @return Wikiname part, or null, if this is not an interwiki link.
         */
        public String getExternalWikiPage()
        {
            if( isInterwikiLink() )
            {
                return m_ref.substring( m_interwikiPoint+1 );
            }

            return null;
        }

        /**
         *  Returns the number of attributes on this link.
         *
         *  @return The number of attributes.
         */
        public int attributeCount()
        {
            return m_attribs != null
                    ? m_attribs.size()
                    : 0 ;
        }

        /**
         *  Adds another attribute to the link.
         *
         *  @param attr A JDOM Attribute.
         */
        public void addAttribute( Attribute attr )
        {
            if( m_attribs == null )
            {
                m_attribs = new ArrayList<>();
            }
            m_attribs.add(attr);
        }

        /**
         *  Returns an Iterator over the list of JDOM Attributes.
         *  Never returns null; a link without attributes yields an empty Iterator.
         *
         *  @return Iterator over the attributes.
         */
        public Iterator< Attribute > getAttributes()
        {
            return m_attribs != null
                    ? m_attribs.iterator()
                    : m_EMPTY.iterator() ;
        }

        /**
         *  Returns a wikitext string representation of this Link.
         *  @return WikiText.
         */
        @Override
        public String toString()
        {
            StringBuilder sb = new StringBuilder();
            sb.append( '[' );
            sb.append( m_text );

            if( m_ref != null )
            {
                sb.append( ' ' );
                sb.append( '|' );
                sb.append( ' ' );
                sb.append( m_ref );
            }

            if( m_attribs != null )
            {
                sb.append( ' ' );
                sb.append( '|' );
                Iterator< Attribute > it = getAttributes();
                while ( it.hasNext() )
                {
                    Attribute a = it.next();
                    sb.append( ' ' );
                    sb.append( a.getName() );
                    sb.append( '=' );
                    sb.append( '\'' );
                    sb.append( a.getValue() );
                    sb.append( '\'' );
                }
            }
            sb.append( ']' );
            return sb.toString();
        }

    }
    // end inner class

}