001 /* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 020 package org.apache.wiki.parser; 021 022 import java.util.*; 023 024 import org.apache.log4j.Logger; 025 import org.jdom2.Attribute; 026 027 /** 028 * Parses JSPWiki-style "augmented" link markup into a Link object 029 * containing the link text, link reference, and any optional link 030 * attributes (as JDOM Attributes). 031 * <p> 032 * The parser recognizes three link forms: 033 * </p> 034 * <ol> 035 * <li><tt> [Text] </tt></li> 036 * <li><tt> [Text | Link] </tt></li> 037 * <li><tt> [Text | Link | attributes] </tt></li> 038 * </ol> 039 * <p> 040 * where the attributes are space-delimited, each in the form of 041 * </p> 042 * <pre> 043 * name1='value1' name2='value2' name3='value3' (etc.) </pre> 044 * <p> 045 * If the attribute parsing fails, the parser will still return the 046 * basic link, writing a warning to the log. 047 * </p> 048 * 049 * <h3>Permitted Attributes</h3> 050 * <p> 051 * Attributes that aren't declared on <tt><a></tt> or those that 052 * permit scripting in HTML (as this is a security risk) are ignored 053 * and have no effect on parsing, nor show up in the resulting attribute 054 * list). The 'href' and 'name' attributes are also ignored as spurious. 055 * The permitted list is: 'accesskey', 'charset', 'class', 'hreflang', 056 * 'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' , 057 * 'title', and 'type'. The declared attributes that will be ignored 058 * are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any 059 * of the other 'on*' event attributes. 060 * </p> 061 * <p> 062 * The permitted attributes and target attribute values are static 063 * String arrays ({@link #PERMITTED_ATTRIBUTES} and 064 * {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time 065 * modified (i.e., predeclared). 066 * </p> 067 * 068 * <h3>Permitted Values on Target Attribute</h3> 069 * <p> 070 * The following target names are reserved in HTML 4 and have special 071 * meanings. These are the only values permitted by the parser. 072 * <dl> 073 * <dt><b>_blank</b></dt> 074 * <dd> The user agent should load the designated document in a new, 075 * unnamed window. </dd> 076 * <dt><b>_self</b></dt> 077 * <dd> The user agent should load the document in the same frame as 078 * the element that refers to this target. </dd> 079 * <dt><b>_parent</b></dt> 080 * <dd> The user agent should load the document into the immediate 081 * FRAMESET parent of the current frame. This value is equivalent to 082 * _self if the current frame has no parent. </dd> 083 * <dt><b>_top</b></dt> 084 * <dd> The user agent should load the document into the full, 085 * original window (thus canceling all other frames). This value is 086 * equivalent to _self if the current frame has no parent. </dd> 087 * </dl> 088 * 089 * <h3>Returned Value</h3> 090 * <p> 091 * This returns a <b>Link</b> object, a public inner class with methods: 092 * <ul> 093 * <li> <tt>getText()</tt> returns the link text. </li> 094 * <li> <tt>getReference()</tt> returns the link reference value. </li> 095 * <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li> 096 * <li> <tt>getAttributes()</tt> returns an iterator over any validated 097 * XHTML-compliant attributes, returned as JDOM Attributes. 098 * </li> 099 * </ul> 100 * <p> 101 * The <tt>attributeCount()</tt> method can be used to circumvent calling 102 * <tt>getAttributes()</tt>, which will create an empty Iterator rather 103 * than return a null. 104 * </p> 105 * 106 * <h3>Example: Link Form 1</h3> 107 * <p> 108 * From an incoming wikitext link of: 109 * <pre> 110 * [Acme] </pre> 111 * returns: 112 * <pre> 113 * getText(): "Acme" 114 * getReference(): "Acme" 115 * attributeCount(): 0 116 * getAttributes(): an empty Iterator </pre> 117 * 118 * <h3>Example: Link Form 2</h3> 119 * <p> 120 * From an incoming wikitext link of: 121 * <pre> 122 * [Acme | http://www.acme.com/] </pre> 123 * returns: 124 * <pre> 125 * getText(): "Acme" 126 * getReference(): "http://www.acme.com/" 127 * attributeCount(): 0 128 * getAttributes(): an empty Iterator </pre> 129 * 130 * <h3>Example: Link Form 3</h3> 131 * <p> 132 * From an incoming wikitext link of: 133 * </p> 134 * <pre> 135 * [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre> 136 * returns: 137 * <pre> 138 * getText(): "Acme" 139 * getReference(): "http://www.acme.com/" 140 * attributeCount(): 2 141 * getAttributes(): an Iterator containing: 142 * JDOM Attribute: id="foo" 143 * JDOM Attribute: rel="Next" </pre> 144 * 145 * 146 * @since 2.5.10 147 */ 148 public class LinkParser 149 { 150 private static Logger log = Logger.getLogger(LinkParser.class); 151 152 /** Permitted attributes on links. Keep this sorted. */ 153 private static final String[] PERMITTED_ATTRIBUTES = new String[] { 154 "accesskey", "charset", "class", "dir", "hreflang", "id", "lang", 155 "rel", "rev", "style", "tabindex", "target", "title", "type" }; 156 157 /** Permitted values on the 'target' attribute. */ 158 private static final String[] PERMITTED_TARGET_VALUES = new String[] { 159 "_blank", "_self", "_parent", "_top" }; 160 161 private static final String EQSQUO = "='"; 162 private static final String SQUO = "'"; 163 private static final String EQ = "="; 164 private static final String TARGET = "target"; 165 private static final String DELIMS = " \t\n\r\f="; 166 167 private static final List m_EMPTY = new ArrayList(); 168 169 // ............ 170 171 172 /** 173 * Processes incoming link text, separating out the link text, the link 174 * URI, and then any specified attributes. 175 * 176 * @param linktext the wiki link text to be parsed 177 * @return a Link object containing the link text, reference, and any valid Attributes 178 * @throws ParseException if the parameter is null 179 */ 180 public Link parse( String linktext ) throws ParseException 181 { 182 if( linktext == null ) 183 { 184 throw new ParseException("null value passed to link parser"); 185 } 186 187 Link link = null; 188 189 try 190 { 191 // establish link text and link ref 192 int cut1 = linktext.indexOf('|'); 193 if( cut1 == -1 ) 194 { 195 // link form 1: [Acme] 196 return new Link( linktext ); 197 } 198 199 int cut2 = cut1+1 < linktext.length() 200 ? linktext.indexOf('|', cut1+1 ) 201 : -1 ; 202 203 if ( cut2 == -1 ) 204 { 205 // link form 2: [Acme | http://www.acme.com/] 206 // text = Acme 207 String text = linktext.substring( 0, cut1 ).trim(); 208 // ref = http://www.acme.com/ 209 String ref = linktext.substring( cut1+1 ).trim(); 210 return new Link( text, ref ); 211 } 212 213 // link form 3: [Acme | http://www.acme.com/ | id='foo' rel='Next'] 214 String text = linktext.substring( 0, cut1 ).trim(); 215 String ref = linktext.substring( cut1+1, cut2 ).trim(); 216 // attribs = id='foo' rel='Next' 217 String attribs = linktext.substring( cut2+1 ).trim(); 218 219 link = new Link( text, ref ); 220 221 // parse attributes 222 // contains "='" that looks like attrib spec 223 if( attribs.indexOf(EQSQUO) != -1 ) 224 { 225 try 226 { 227 StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true); 228 while ( tok.hasMoreTokens() ) 229 { 230 // get attribute name token 231 String token = tok.nextToken(DELIMS).trim(); 232 while ( isSpace(token) && tok.hasMoreTokens() ) 233 { 234 // remove all whitespace 235 token = tok.nextToken(DELIMS).trim(); 236 } 237 238 // eat '=', break after '=' 239 require( tok, EQ ); 240 // eat opening delim 241 require( tok, SQUO ); 242 // using existing delim 243 String value = tok.nextToken(SQUO); 244 // eat closing delim 245 require( tok, SQUO ); 246 247 if( token != null && value != null ) 248 { 249 if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 ) 250 { 251 // _blank _self _parent _top 252 if( !token.equals(TARGET) 253 || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 ) 254 { 255 Attribute a = new Attribute(token,value); 256 link.addAttribute(a); 257 } 258 else 259 { 260 throw new ParseException("unknown target attribute value='" 261 + value + "' on link"); 262 } 263 } 264 else 265 { 266 throw new ParseException("unknown attribute name '" 267 + token + "' on link"); 268 } 269 } 270 else 271 { 272 throw new ParseException("unable to parse link attributes '" 273 + attribs + "'"); 274 275 } 276 } 277 } 278 catch( ParseException pe ) 279 { 280 log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage()); 281 } 282 catch( NoSuchElementException nse ) 283 { 284 log.warn("expected more tokens while parsing link attributes '" + attribs + "'"); 285 } 286 } 287 288 } 289 catch( Exception e ) 290 { 291 log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() ); 292 } 293 294 return link; 295 } 296 297 298 private String require( StringTokenizer tok, String required ) 299 throws ParseException, NoSuchElementException 300 { 301 String s = tok.nextToken(required); 302 if( !s.equals(required) ) 303 { 304 throw new ParseException("expected '"+required+"' not '"+s+"'"); 305 } 306 return s; 307 } 308 309 310 /** 311 * Returns true if the String <tt>s</tt> is completely 312 * composed of whitespace. 313 * 314 * @param s The string to check 315 * @return True, if "s" is all XML whitespace. 316 */ 317 public static final boolean isSpace( String s ) 318 { 319 for( int i = 0 ; i < s.length() ; i++ ) 320 { 321 if( !isSpace( s.charAt(i)) ) return false; 322 } 323 return true; 324 } 325 326 327 /** 328 * Returns true if char <tt>c</tt> is a member of 329 * <tt>S</tt> (space) [XML 1.1 production 3]. 330 * 331 * @param c Character to check. 332 * @return True, if the character is an XML space. 333 */ 334 public static final boolean isSpace( char c ) 335 { 336 // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator 337 return 338 0x20 == c 339 || 0x0A == c 340 || 0x0D == c 341 || 0x09 == c 342 || 0x85 == c 343 || 0x2028 == c; 344 } 345 346 347 // ......................................................................... 348 349 350 /** 351 * Inner class serving as a struct containing the parsed 352 * components of a link. 353 */ 354 public static class Link 355 { 356 private String m_text; 357 private String m_ref = null; 358 private int m_interwikiPoint = -1; 359 private List<Attribute> m_attribs = null; 360 361 /** 362 * Create a new Link with text but no reference. 363 * @param text The link text. 364 * @throws ParseException If the link text is illegal. 365 */ 366 protected Link( String text ) throws ParseException 367 { 368 setText(text); 369 } 370 371 /** 372 * Create a new link with a given text and hyperlink (reference). 373 * 374 * @param text The link text. 375 * @param ref The hypertext reference. 376 * @throws ParseException If the link text or reference are illegal. 377 */ 378 protected Link( String text, String ref ) throws ParseException 379 { 380 setText(text); 381 setReference(ref); 382 } 383 384 /** 385 * Sets the link text. 386 * 387 * @param text The link text. 388 * @throws ParseException If the text is illegal (e.g. null). 389 */ 390 protected void setText( String text ) throws ParseException 391 { 392 if( text == null ) 393 { 394 throw new ParseException("null link text"); 395 } 396 m_text = text; 397 } 398 399 /** 400 * Returns the link text. 401 * 402 * @return Link text. 403 */ 404 public String getText() 405 { 406 return m_text; 407 } 408 409 /** 410 * Sets the hypertext reference. Typically, this is an URI or an interwiki link, 411 * or a wikilink. 412 * 413 * @param ref The reference. 414 * @throws ParseException If the reference is illegal. 415 */ 416 protected void setReference( String ref ) throws ParseException 417 { 418 if( ref == null ) 419 { 420 throw new ParseException("null link reference value"); 421 } 422 m_ref = ref; 423 } 424 425 /** 426 * Returns true, if there is a reference. 427 * 428 * @return True, if there's a reference; false otherwise. 429 */ 430 public boolean hasReference() 431 { 432 return m_ref != null; 433 } 434 435 /** 436 * Returns the link reference, or the link text if null. 437 * 438 * @return A link reference. 439 */ 440 public String getReference() 441 { 442 return m_ref != null 443 ? m_ref 444 : m_text ; 445 } 446 447 /** 448 * Returns true, if this Link represents an InterWiki link (of the form wiki:page). 449 * 450 * @return True, if this Link represents an InterWiki link. 451 */ 452 public boolean isInterwikiLink() 453 { 454 if( !hasReference() ) m_ref = m_text; 455 456 m_interwikiPoint = m_ref.indexOf(':'); 457 458 return m_interwikiPoint != -1; 459 } 460 461 /** 462 * Returns the name of the wiki if this is an interwiki link. 463 * <pre> 464 * Link link = new Link("Foo","Wikipedia:Foobar"); 465 * assert( link.getExternalWikiPage(), "Wikipedia" ); 466 * </pre> 467 * 468 * @return Name of the wiki, or null, if this is not an interwiki link. 469 */ 470 public String getExternalWiki() 471 { 472 if( isInterwikiLink() ) 473 { 474 return m_ref.substring( 0, m_interwikiPoint ); 475 } 476 477 return null; 478 } 479 480 /** 481 * Returns the wikiname part of an interwiki link. Used only with interwiki links. 482 * <pre> 483 * Link link = new Link("Foo","Wikipedia:Foobar"); 484 * assert( link.getExternalWikiPage(), "Foobar" ); 485 * </pre> 486 * 487 * @return Wikiname part, or null, if this is not an interwiki link. 488 */ 489 public String getExternalWikiPage() 490 { 491 if( isInterwikiLink() ) 492 { 493 return m_ref.substring( m_interwikiPoint+1 ); 494 } 495 496 return null; 497 } 498 499 /** 500 * Returns the number of attributes on this link. 501 * 502 * @return The number of attributes. 503 */ 504 public int attributeCount() 505 { 506 return m_attribs != null 507 ? m_attribs.size() 508 : 0 ; 509 } 510 511 /** 512 * Adds another attribute to the link. 513 * 514 * @param attr A JDOM Attribute. 515 */ 516 public void addAttribute( Attribute attr ) 517 { 518 if( m_attribs == null ) 519 { 520 m_attribs = new ArrayList<Attribute>(); 521 } 522 m_attribs.add(attr); 523 } 524 525 /** 526 * Returns an Iterator over the list of JDOM Attributes. 527 * 528 * @return Iterator over the attributes. 529 */ 530 public Iterator getAttributes() 531 { 532 return m_attribs != null 533 ? m_attribs.iterator() 534 : m_EMPTY.iterator() ; 535 } 536 537 /** 538 * Returns a wikitext string representation of this Link. 539 * @return WikiText. 540 */ 541 public String toString() 542 { 543 StringBuffer sb = new StringBuffer(); 544 sb.append( '[' ); 545 sb.append( m_text ); 546 547 if( m_ref != null ) 548 { 549 sb.append( ' ' ); 550 sb.append( '|' ); 551 sb.append( ' ' ); 552 sb.append( m_ref ); 553 } 554 555 if( m_attribs != null ) 556 { 557 sb.append( ' ' ); 558 sb.append( '|' ); 559 Iterator it = getAttributes(); 560 while ( it.hasNext() ) 561 { 562 Attribute a = (Attribute)it.next(); 563 sb.append( ' ' ); 564 sb.append( a.getName() ); 565 sb.append( '=' ); 566 sb.append( '\'' ); 567 sb.append( a.getValue() ); 568 sb.append( '\'' ); 569 } 570 } 571 sb.append( ']' ); 572 return sb.toString(); 573 } 574 575 } 576 // end inner class 577 578 }