001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 020package org.apache.wiki.parser; 021 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.Iterator; 025import java.util.List; 026import java.util.NoSuchElementException; 027import java.util.StringTokenizer; 028 029import org.apache.log4j.Logger; 030import org.jdom2.Attribute; 031 032/** 033 * Parses JSPWiki-style "augmented" link markup into a Link object 034 * containing the link text, link reference, and any optional link 035 * attributes (as JDOM Attributes). 036 * <p> 037 * The parser recognizes three link forms: 038 * </p> 039 * <ol> 040 * <li><tt> [Text] </tt></li> 041 * <li><tt> [Text | Link] </tt></li> 042 * <li><tt> [Text | Link | attributes] </tt></li> 043 * </ol> 044 * <p> 045 * where the attributes are space-delimited, each in the form of 046 * </p> 047 * <pre> 048 * name1='value1' name2='value2' name3='value3' (etc.) </pre> 049 * <p> 050 * If the attribute parsing fails, the parser will still return the 051 * basic link, writing a warning to the log. 052 * </p> 053 * 054 * <h3>Permitted Attributes</h3> 055 * <p> 056 * Attributes that aren't declared on <tt><a></tt> or those that 057 * permit scripting in HTML (as this is a security risk) are ignored 058 * and have no effect on parsing, nor show up in the resulting attribute 059 * list). The 'href' and 'name' attributes are also ignored as spurious. 060 * The permitted list is: 'accesskey', 'charset', 'class', 'hreflang', 061 * 'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' , 062 * 'title', and 'type'. The declared attributes that will be ignored 063 * are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any 064 * of the other 'on*' event attributes. 065 * </p> 066 * <p> 067 * The permitted attributes and target attribute values are static 068 * String arrays ({@link #PERMITTED_ATTRIBUTES} and 069 * {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time 070 * modified (i.e., predeclared). 071 * </p> 072 * 073 * <h3>Permitted Values on Target Attribute</h3> 074 * <p> 075 * The following target names are reserved in HTML 4 and have special 076 * meanings. These are the only values permitted by the parser. 077 * <dl> 078 * <dt><b>_blank</b></dt> 079 * <dd> The user agent should load the designated document in a new, 080 * unnamed window. </dd> 081 * <dt><b>_self</b></dt> 082 * <dd> The user agent should load the document in the same frame as 083 * the element that refers to this target. </dd> 084 * <dt><b>_parent</b></dt> 085 * <dd> The user agent should load the document into the immediate 086 * FRAMESET parent of the current frame. This value is equivalent to 087 * _self if the current frame has no parent. </dd> 088 * <dt><b>_top</b></dt> 089 * <dd> The user agent should load the document into the full, 090 * original window (thus canceling all other frames). This value is 091 * equivalent to _self if the current frame has no parent. </dd> 092 * </dl> 093 * 094 * <h3>Returned Value</h3> 095 * <p> 096 * This returns a <b>Link</b> object, a public inner class with methods: 097 * <ul> 098 * <li> <tt>getText()</tt> returns the link text. </li> 099 * <li> <tt>getReference()</tt> returns the link reference value. </li> 100 * <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li> 101 * <li> <tt>getAttributes()</tt> returns an iterator over any validated 102 * XHTML-compliant attributes, returned as JDOM Attributes. 103 * </li> 104 * </ul> 105 * <p> 106 * The <tt>attributeCount()</tt> method can be used to circumvent calling 107 * <tt>getAttributes()</tt>, which will create an empty Iterator rather 108 * than return a null. 109 * </p> 110 * 111 * <h3>Example: Link Form 1</h3> 112 * <p> 113 * From an incoming wikitext link of: 114 * <pre> 115 * [Acme] </pre> 116 * returns: 117 * <pre> 118 * getText(): "Acme" 119 * getReference(): "Acme" 120 * attributeCount(): 0 121 * getAttributes(): an empty Iterator </pre> 122 * 123 * <h3>Example: Link Form 2</h3> 124 * <p> 125 * From an incoming wikitext link of: 126 * <pre> 127 * [Acme | http://www.acme.com/] </pre> 128 * returns: 129 * <pre> 130 * getText(): "Acme" 131 * getReference(): "http://www.acme.com/" 132 * attributeCount(): 0 133 * getAttributes(): an empty Iterator </pre> 134 * 135 * <h3>Example: Link Form 3</h3> 136 * <p> 137 * From an incoming wikitext link of: 138 * </p> 139 * <pre> 140 * [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre> 141 * returns: 142 * <pre> 143 * getText(): "Acme" 144 * getReference(): "http://www.acme.com/" 145 * attributeCount(): 2 146 * getAttributes(): an Iterator containing: 147 * JDOM Attribute: id="foo" 148 * JDOM Attribute: rel="Next" </pre> 149 * 150 * 151 * @since 2.5.10 152 */ 153public class LinkParser 154{ 155 private static Logger log = Logger.getLogger(LinkParser.class); 156 157 /** Permitted attributes on links. Keep this sorted. */ 158 private static final String[] PERMITTED_ATTRIBUTES = new String[] { 159 "accesskey", "charset", "class", "dir", "hreflang", "id", "lang", 160 "rel", "rev", "style", "tabindex", "target", "title", "type" }; 161 162 /** Permitted values on the 'target' attribute. */ 163 private static final String[] PERMITTED_TARGET_VALUES = new String[] { 164 "_blank", "_self", "_parent", "_top" }; 165 166 private static final String EQSQUO = "='"; 167 private static final String SQUO = "'"; 168 private static final String EQ = "="; 169 private static final String TARGET = "target"; 170 private static final String DELIMS = " \t\n\r\f="; 171 172 private static final List< Attribute > m_EMPTY = new ArrayList< >(); 173 174 // ............ 175 176 177 /** 178 * Processes incoming link text, separating out the link text, the link 179 * URI, and then any specified attributes. 180 * 181 * @param linktext the wiki link text to be parsed 182 * @return a Link object containing the link text, reference, and any valid Attributes 183 * @throws ParseException if the parameter is null 184 */ 185 public Link parse( String linktext ) throws ParseException 186 { 187 if( linktext == null ) 188 { 189 throw new ParseException("null value passed to link parser"); 190 } 191 192 Link link = null; 193 194 try 195 { 196 // establish link text and link ref 197 int cut1 = linktext.indexOf('|'); 198 if( cut1 == -1 ) 199 { 200 // link form 1: [Acme] 201 return new Link( linktext ); 202 } 203 204 int cut2 = cut1+1 < linktext.length() 205 ? linktext.indexOf('|', cut1+1 ) 206 : -1 ; 207 208 if ( cut2 == -1 ) 209 { 210 // link form 2: [Acme | http://www.acme.com/] 211 // text = Acme 212 String text = linktext.substring( 0, cut1 ).trim(); 213 // ref = http://www.acme.com/ 214 String ref = linktext.substring( cut1+1 ).trim(); 215 return new Link( text, ref ); 216 } 217 218 // link form 3: [Acme | http://www.acme.com/ | id='foo' rel='Next'] 219 String text = linktext.substring( 0, cut1 ).trim(); 220 String ref = linktext.substring( cut1+1, cut2 ).trim(); 221 // attribs = id='foo' rel='Next' 222 String attribs = linktext.substring( cut2+1 ).trim(); 223 224 link = new Link( text, ref ); 225 226 // parse attributes 227 // contains "='" that looks like attrib spec 228 if( attribs.indexOf(EQSQUO) != -1 ) 229 { 230 try 231 { 232 StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true); 233 while ( tok.hasMoreTokens() ) 234 { 235 // get attribute name token 236 String token = tok.nextToken(DELIMS).trim(); 237 while ( isSpace(token) && tok.hasMoreTokens() ) 238 { 239 // remove all whitespace 240 token = tok.nextToken(DELIMS).trim(); 241 } 242 243 // eat '=', break after '=' 244 require( tok, EQ ); 245 // eat opening delim 246 require( tok, SQUO ); 247 // using existing delim 248 String value = tok.nextToken(SQUO); 249 // eat closing delim 250 require( tok, SQUO ); 251 252 if( token != null && value != null ) 253 { 254 if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 ) 255 { 256 // _blank _self _parent _top 257 if( !token.equals(TARGET) 258 || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 ) 259 { 260 Attribute a = new Attribute(token,value); 261 link.addAttribute(a); 262 } 263 else 264 { 265 throw new ParseException("unknown target attribute value='" 266 + value + "' on link"); 267 } 268 } 269 else 270 { 271 throw new ParseException("unknown attribute name '" 272 + token + "' on link"); 273 } 274 } 275 else 276 { 277 throw new ParseException("unable to parse link attributes '" 278 + attribs + "'"); 279 280 } 281 } 282 } 283 catch( ParseException pe ) 284 { 285 log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage()); 286 } 287 catch( NoSuchElementException nse ) 288 { 289 log.warn("expected more tokens while parsing link attributes '" + attribs + "'"); 290 } 291 } 292 293 } 294 catch( Exception e ) 295 { 296 log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() ); 297 } 298 299 return link; 300 } 301 302 303 private String require( StringTokenizer tok, String required ) 304 throws ParseException, NoSuchElementException 305 { 306 String s = tok.nextToken(required); 307 if( !s.equals(required) ) 308 { 309 throw new ParseException("expected '"+required+"' not '"+s+"'"); 310 } 311 return s; 312 } 313 314 315 /** 316 * Returns true if the String <tt>s</tt> is completely 317 * composed of whitespace. 318 * 319 * @param s The string to check 320 * @return True, if "s" is all XML whitespace. 321 */ 322 public static final boolean isSpace( String s ) 323 { 324 for( int i = 0 ; i < s.length() ; i++ ) 325 { 326 if( !isSpace( s.charAt(i)) ) return false; 327 } 328 return true; 329 } 330 331 332 /** 333 * Returns true if char <tt>c</tt> is a member of 334 * <tt>S</tt> (space) [XML 1.1 production 3]. 335 * 336 * @param c Character to check. 337 * @return True, if the character is an XML space. 338 */ 339 public static final boolean isSpace( char c ) 340 { 341 // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator 342 return 343 0x20 == c 344 || 0x0A == c 345 || 0x0D == c 346 || 0x09 == c 347 || 0x85 == c 348 || 0x2028 == c; 349 } 350 351 352 // ......................................................................... 353 354 355 /** 356 * Inner class serving as a struct containing the parsed 357 * components of a link. 358 */ 359 public static class Link 360 { 361 private String m_text; 362 private String m_ref = null; 363 private int m_interwikiPoint = -1; 364 private List<Attribute> m_attribs = null; 365 366 /** 367 * Create a new Link with text but no reference. 368 * @param text The link text. 369 * @throws ParseException If the link text is illegal. 370 */ 371 protected Link( String text ) throws ParseException 372 { 373 setText(text); 374 } 375 376 /** 377 * Create a new link with a given text and hyperlink (reference). 378 * 379 * @param text The link text. 380 * @param ref The hypertext reference. 381 * @throws ParseException If the link text or reference are illegal. 382 */ 383 protected Link( String text, String ref ) throws ParseException 384 { 385 setText(text); 386 setReference(ref); 387 } 388 389 /** 390 * Sets the link text. 391 * 392 * @param text The link text. 393 * @throws ParseException If the text is illegal (e.g. null). 394 */ 395 protected void setText( String text ) throws ParseException 396 { 397 if( text == null ) 398 { 399 throw new ParseException("null link text"); 400 } 401 m_text = text; 402 } 403 404 /** 405 * Returns the link text. 406 * 407 * @return Link text. 408 */ 409 public String getText() 410 { 411 return m_text; 412 } 413 414 /** 415 * Sets the hypertext reference. Typically, this is an URI or an interwiki link, 416 * or a wikilink. 417 * 418 * @param ref The reference. 419 * @throws ParseException If the reference is illegal. 420 */ 421 protected void setReference( String ref ) throws ParseException 422 { 423 if( ref == null ) 424 { 425 throw new ParseException("null link reference value"); 426 } 427 m_ref = ref; 428 } 429 430 /** 431 * Returns true, if there is a reference. 432 * 433 * @return True, if there's a reference; false otherwise. 434 */ 435 public boolean hasReference() 436 { 437 return m_ref != null; 438 } 439 440 /** 441 * Returns the link reference, or the link text if null. 442 * 443 * @return A link reference. 444 */ 445 public String getReference() 446 { 447 return m_ref != null 448 ? m_ref 449 : m_text ; 450 } 451 452 /** 453 * Returns true, if this Link represents an InterWiki link (of the form wiki:page). 454 * 455 * @return True, if this Link represents an InterWiki link. 456 */ 457 public boolean isInterwikiLink() 458 { 459 LinkParsingOperations lpo = new LinkParsingOperations( null ); 460 if( !hasReference() ) m_ref = m_text; 461 m_interwikiPoint = lpo.interWikiLinkAt( m_ref ); 462 return lpo.isInterWikiLink( m_ref ); 463 } 464 465 /** 466 * Returns the name of the wiki if this is an interwiki link. 467 * <pre> 468 * Link link = new Link("Foo","Wikipedia:Foobar"); 469 * assert( link.getExternalWikiPage(), "Wikipedia" ); 470 * </pre> 471 * 472 * @return Name of the wiki, or null, if this is not an interwiki link. 473 */ 474 public String getExternalWiki() 475 { 476 if( isInterwikiLink() ) 477 { 478 return m_ref.substring( 0, m_interwikiPoint ); 479 } 480 481 return null; 482 } 483 484 /** 485 * Returns the wikiname part of an interwiki link. Used only with interwiki links. 486 * <pre> 487 * Link link = new Link("Foo","Wikipedia:Foobar"); 488 * assert( link.getExternalWikiPage(), "Foobar" ); 489 * </pre> 490 * 491 * @return Wikiname part, or null, if this is not an interwiki link. 492 */ 493 public String getExternalWikiPage() 494 { 495 if( isInterwikiLink() ) 496 { 497 return m_ref.substring( m_interwikiPoint+1 ); 498 } 499 500 return null; 501 } 502 503 /** 504 * Returns the number of attributes on this link. 505 * 506 * @return The number of attributes. 507 */ 508 public int attributeCount() 509 { 510 return m_attribs != null 511 ? m_attribs.size() 512 : 0 ; 513 } 514 515 /** 516 * Adds another attribute to the link. 517 * 518 * @param attr A JDOM Attribute. 519 */ 520 public void addAttribute( Attribute attr ) 521 { 522 if( m_attribs == null ) 523 { 524 m_attribs = new ArrayList<>(); 525 } 526 m_attribs.add(attr); 527 } 528 529 /** 530 * Returns an Iterator over the list of JDOM Attributes. 531 * 532 * @return Iterator over the attributes. 533 */ 534 public Iterator< Attribute > getAttributes() 535 { 536 return m_attribs != null 537 ? m_attribs.iterator() 538 : m_EMPTY.iterator() ; 539 } 540 541 /** 542 * Returns a wikitext string representation of this Link. 543 * @return WikiText. 544 */ 545 @Override 546 public String toString() 547 { 548 StringBuilder sb = new StringBuilder(); 549 sb.append( '[' ); 550 sb.append( m_text ); 551 552 if( m_ref != null ) 553 { 554 sb.append( ' ' ); 555 sb.append( '|' ); 556 sb.append( ' ' ); 557 sb.append( m_ref ); 558 } 559 560 if( m_attribs != null ) 561 { 562 sb.append( ' ' ); 563 sb.append( '|' ); 564 Iterator< Attribute > it = getAttributes(); 565 while ( it.hasNext() ) 566 { 567 Attribute a = it.next(); 568 sb.append( ' ' ); 569 sb.append( a.getName() ); 570 sb.append( '=' ); 571 sb.append( '\'' ); 572 sb.append( a.getValue() ); 573 sb.append( '\'' ); 574 } 575 } 576 sb.append( ']' ); 577 return sb.toString(); 578 } 579 580 } 581 // end inner class 582 583}