001 /*
002 Licensed to the Apache Software Foundation (ASF) under one
003 or more contributor license agreements. See the NOTICE file
004 distributed with this work for additional information
005 regarding copyright ownership. The ASF licenses this file
006 to you under the Apache License, Version 2.0 (the
007 "License"); you may not use this file except in compliance
008 with the License. You may obtain a copy of the License at
009
010 http://www.apache.org/licenses/LICENSE-2.0
011
012 Unless required by applicable law or agreed to in writing,
013 software distributed under the License is distributed on an
014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 KIND, either express or implied. See the License for the
016 specific language governing permissions and limitations
017 under the License.
018 */
019
020 package org.apache.wiki.parser;
021
022 import java.util.*;
023
024 import org.apache.log4j.Logger;
025 import org.jdom2.Attribute;
026
027 /**
028 * Parses JSPWiki-style "augmented" link markup into a Link object
029 * containing the link text, link reference, and any optional link
030 * attributes (as JDOM Attributes).
031 * <p>
032 * The parser recognizes three link forms:
033 * </p>
034 * <ol>
035 * <li><tt> [Text] </tt></li>
036 * <li><tt> [Text | Link] </tt></li>
037 * <li><tt> [Text | Link | attributes] </tt></li>
038 * </ol>
039 * <p>
040 * where the attributes are space-delimited, each in the form of
041 * </p>
042 * <pre>
043 * name1='value1' name2='value2' name3='value3' (etc.) </pre>
044 * <p>
045 * If the attribute parsing fails, the parser will still return the
046 * basic link, writing a warning to the log.
047 * </p>
048 *
049 * <h3>Permitted Attributes</h3>
050 * <p>
 * Attributes that aren't declared on <tt>&lt;a&gt;</tt> or those that
 * permit scripting in HTML (as this is a security risk) are ignored
 * and have no effect on parsing, nor do they show up in the resulting
 * attribute list. The 'href' and 'name' attributes are also ignored as spurious.
055 * The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
056 * 'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' ,
057 * 'title', and 'type'. The declared attributes that will be ignored
058 * are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
059 * of the other 'on*' event attributes.
060 * </p>
061 * <p>
062 * The permitted attributes and target attribute values are static
063 * String arrays ({@link #PERMITTED_ATTRIBUTES} and
064 * {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
065 * modified (i.e., predeclared).
066 * </p>
067 *
068 * <h3>Permitted Values on Target Attribute</h3>
069 * <p>
070 * The following target names are reserved in HTML 4 and have special
071 * meanings. These are the only values permitted by the parser.
072 * <dl>
073 * <dt><b>_blank</b></dt>
074 * <dd> The user agent should load the designated document in a new,
075 * unnamed window. </dd>
076 * <dt><b>_self</b></dt>
077 * <dd> The user agent should load the document in the same frame as
078 * the element that refers to this target. </dd>
079 * <dt><b>_parent</b></dt>
080 * <dd> The user agent should load the document into the immediate
081 * FRAMESET parent of the current frame. This value is equivalent to
082 * _self if the current frame has no parent. </dd>
083 * <dt><b>_top</b></dt>
084 * <dd> The user agent should load the document into the full,
085 * original window (thus canceling all other frames). This value is
086 * equivalent to _self if the current frame has no parent. </dd>
087 * </dl>
088 *
089 * <h3>Returned Value</h3>
090 * <p>
091 * This returns a <b>Link</b> object, a public inner class with methods:
092 * <ul>
093 * <li> <tt>getText()</tt> returns the link text. </li>
094 * <li> <tt>getReference()</tt> returns the link reference value. </li>
095 * <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
096 * <li> <tt>getAttributes()</tt> returns an iterator over any validated
097 * XHTML-compliant attributes, returned as JDOM Attributes.
098 * </li>
099 * </ul>
100 * <p>
101 * The <tt>attributeCount()</tt> method can be used to circumvent calling
102 * <tt>getAttributes()</tt>, which will create an empty Iterator rather
103 * than return a null.
104 * </p>
105 *
106 * <h3>Example: Link Form 1</h3>
107 * <p>
108 * From an incoming wikitext link of:
109 * <pre>
110 * [Acme] </pre>
111 * returns:
112 * <pre>
113 * getText(): "Acme"
114 * getReference(): "Acme"
115 * attributeCount(): 0
116 * getAttributes(): an empty Iterator </pre>
117 *
118 * <h3>Example: Link Form 2</h3>
119 * <p>
120 * From an incoming wikitext link of:
121 * <pre>
122 * [Acme | http://www.acme.com/] </pre>
123 * returns:
124 * <pre>
125 * getText(): "Acme"
126 * getReference(): "http://www.acme.com/"
127 * attributeCount(): 0
128 * getAttributes(): an empty Iterator </pre>
129 *
130 * <h3>Example: Link Form 3</h3>
131 * <p>
132 * From an incoming wikitext link of:
133 * </p>
134 * <pre>
135 * [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
136 * returns:
137 * <pre>
138 * getText(): "Acme"
139 * getReference(): "http://www.acme.com/"
140 * attributeCount(): 2
141 * getAttributes(): an Iterator containing:
142 * JDOM Attribute: id="foo"
143 * JDOM Attribute: rel="Next" </pre>
144 *
145 *
146 * @since 2.5.10
147 */
148 public class LinkParser
149 {
150 private static Logger log = Logger.getLogger(LinkParser.class);
151
152 /** Permitted attributes on links. Keep this sorted. */
153 private static final String[] PERMITTED_ATTRIBUTES = new String[] {
154 "accesskey", "charset", "class", "dir", "hreflang", "id", "lang",
155 "rel", "rev", "style", "tabindex", "target", "title", "type" };
156
157 /** Permitted values on the 'target' attribute. */
158 private static final String[] PERMITTED_TARGET_VALUES = new String[] {
159 "_blank", "_self", "_parent", "_top" };
160
161 private static final String EQSQUO = "='";
162 private static final String SQUO = "'";
163 private static final String EQ = "=";
164 private static final String TARGET = "target";
165 private static final String DELIMS = " \t\n\r\f=";
166
167 private static final List m_EMPTY = new ArrayList();
168
169 // ............
170
171
172 /**
173 * Processes incoming link text, separating out the link text, the link
174 * URI, and then any specified attributes.
175 *
176 * @param linktext the wiki link text to be parsed
177 * @return a Link object containing the link text, reference, and any valid Attributes
178 * @throws ParseException if the parameter is null
179 */
180 public Link parse( String linktext ) throws ParseException
181 {
182 if( linktext == null )
183 {
184 throw new ParseException("null value passed to link parser");
185 }
186
187 Link link = null;
188
189 try
190 {
191 // establish link text and link ref
192 int cut1 = linktext.indexOf('|');
193 if( cut1 == -1 )
194 {
195 // link form 1: [Acme]
196 return new Link( linktext );
197 }
198
199 int cut2 = cut1+1 < linktext.length()
200 ? linktext.indexOf('|', cut1+1 )
201 : -1 ;
202
203 if ( cut2 == -1 )
204 {
205 // link form 2: [Acme | http://www.acme.com/]
206 // text = Acme
207 String text = linktext.substring( 0, cut1 ).trim();
208 // ref = http://www.acme.com/
209 String ref = linktext.substring( cut1+1 ).trim();
210 return new Link( text, ref );
211 }
212
213 // link form 3: [Acme | http://www.acme.com/ | id='foo' rel='Next']
214 String text = linktext.substring( 0, cut1 ).trim();
215 String ref = linktext.substring( cut1+1, cut2 ).trim();
216 // attribs = id='foo' rel='Next'
217 String attribs = linktext.substring( cut2+1 ).trim();
218
219 link = new Link( text, ref );
220
221 // parse attributes
222 // contains "='" that looks like attrib spec
223 if( attribs.indexOf(EQSQUO) != -1 )
224 {
225 try
226 {
227 StringTokenizer tok = new StringTokenizer(attribs,DELIMS,true);
228 while ( tok.hasMoreTokens() )
229 {
230 // get attribute name token
231 String token = tok.nextToken(DELIMS).trim();
232 while ( isSpace(token) && tok.hasMoreTokens() )
233 {
234 // remove all whitespace
235 token = tok.nextToken(DELIMS).trim();
236 }
237
238 // eat '=', break after '='
239 require( tok, EQ );
240 // eat opening delim
241 require( tok, SQUO );
242 // using existing delim
243 String value = tok.nextToken(SQUO);
244 // eat closing delim
245 require( tok, SQUO );
246
247 if( token != null && value != null )
248 {
249 if( Arrays.binarySearch( PERMITTED_ATTRIBUTES, token ) >= 0 )
250 {
251 // _blank _self _parent _top
252 if( !token.equals(TARGET)
253 || Arrays.binarySearch( PERMITTED_TARGET_VALUES, value ) >= 0 )
254 {
255 Attribute a = new Attribute(token,value);
256 link.addAttribute(a);
257 }
258 else
259 {
260 throw new ParseException("unknown target attribute value='"
261 + value + "' on link");
262 }
263 }
264 else
265 {
266 throw new ParseException("unknown attribute name '"
267 + token + "' on link");
268 }
269 }
270 else
271 {
272 throw new ParseException("unable to parse link attributes '"
273 + attribs + "'");
274
275 }
276 }
277 }
278 catch( ParseException pe )
279 {
280 log.warn("syntax error parsing link attributes '"+attribs+"': " + pe.getMessage());
281 }
282 catch( NoSuchElementException nse )
283 {
284 log.warn("expected more tokens while parsing link attributes '" + attribs + "'");
285 }
286 }
287
288 }
289 catch( Exception e )
290 {
291 log.warn( e.getClass().getName() + " thrown by link parser: " + e.getMessage() );
292 }
293
294 return link;
295 }
296
297
298 private String require( StringTokenizer tok, String required )
299 throws ParseException, NoSuchElementException
300 {
301 String s = tok.nextToken(required);
302 if( !s.equals(required) )
303 {
304 throw new ParseException("expected '"+required+"' not '"+s+"'");
305 }
306 return s;
307 }
308
309
310 /**
311 * Returns true if the String <tt>s</tt> is completely
312 * composed of whitespace.
313 *
314 * @param s The string to check
315 * @return True, if "s" is all XML whitespace.
316 */
317 public static final boolean isSpace( String s )
318 {
319 for( int i = 0 ; i < s.length() ; i++ )
320 {
321 if( !isSpace( s.charAt(i)) ) return false;
322 }
323 return true;
324 }
325
326
327 /**
328 * Returns true if char <tt>c</tt> is a member of
329 * <tt>S</tt> (space) [XML 1.1 production 3].
330 *
331 * @param c Character to check.
332 * @return True, if the character is an XML space.
333 */
334 public static final boolean isSpace( char c )
335 {
336 // 0x20 = SPACE, 0x0A = LF, 0x0D = CR, 0x09 = TAB, 0x85 = NEL, 0x2028 = Line separator
337 return
338 0x20 == c
339 || 0x0A == c
340 || 0x0D == c
341 || 0x09 == c
342 || 0x85 == c
343 || 0x2028 == c;
344 }
345
346
347 // .........................................................................
348
349
350 /**
351 * Inner class serving as a struct containing the parsed
352 * components of a link.
353 */
354 public static class Link
355 {
356 private String m_text;
357 private String m_ref = null;
358 private int m_interwikiPoint = -1;
359 private List<Attribute> m_attribs = null;
360
361 /**
362 * Create a new Link with text but no reference.
363 * @param text The link text.
364 * @throws ParseException If the link text is illegal.
365 */
366 protected Link( String text ) throws ParseException
367 {
368 setText(text);
369 }
370
371 /**
372 * Create a new link with a given text and hyperlink (reference).
373 *
374 * @param text The link text.
375 * @param ref The hypertext reference.
376 * @throws ParseException If the link text or reference are illegal.
377 */
378 protected Link( String text, String ref ) throws ParseException
379 {
380 setText(text);
381 setReference(ref);
382 }
383
384 /**
385 * Sets the link text.
386 *
387 * @param text The link text.
388 * @throws ParseException If the text is illegal (e.g. null).
389 */
390 protected void setText( String text ) throws ParseException
391 {
392 if( text == null )
393 {
394 throw new ParseException("null link text");
395 }
396 m_text = text;
397 }
398
399 /**
400 * Returns the link text.
401 *
402 * @return Link text.
403 */
404 public String getText()
405 {
406 return m_text;
407 }
408
409 /**
410 * Sets the hypertext reference. Typically, this is an URI or an interwiki link,
411 * or a wikilink.
412 *
413 * @param ref The reference.
414 * @throws ParseException If the reference is illegal.
415 */
416 protected void setReference( String ref ) throws ParseException
417 {
418 if( ref == null )
419 {
420 throw new ParseException("null link reference value");
421 }
422 m_ref = ref;
423 }
424
425 /**
426 * Returns true, if there is a reference.
427 *
428 * @return True, if there's a reference; false otherwise.
429 */
430 public boolean hasReference()
431 {
432 return m_ref != null;
433 }
434
435 /**
436 * Returns the link reference, or the link text if null.
437 *
438 * @return A link reference.
439 */
440 public String getReference()
441 {
442 return m_ref != null
443 ? m_ref
444 : m_text ;
445 }
446
447 /**
448 * Returns true, if this Link represents an InterWiki link (of the form wiki:page).
449 *
450 * @return True, if this Link represents an InterWiki link.
451 */
452 public boolean isInterwikiLink()
453 {
454 if( !hasReference() ) m_ref = m_text;
455
456 m_interwikiPoint = m_ref.indexOf(':');
457
458 return m_interwikiPoint != -1;
459 }
460
461 /**
462 * Returns the name of the wiki if this is an interwiki link.
463 * <pre>
464 * Link link = new Link("Foo","Wikipedia:Foobar");
465 * assert( link.getExternalWikiPage(), "Wikipedia" );
466 * </pre>
467 *
468 * @return Name of the wiki, or null, if this is not an interwiki link.
469 */
470 public String getExternalWiki()
471 {
472 if( isInterwikiLink() )
473 {
474 return m_ref.substring( 0, m_interwikiPoint );
475 }
476
477 return null;
478 }
479
480 /**
481 * Returns the wikiname part of an interwiki link. Used only with interwiki links.
482 * <pre>
483 * Link link = new Link("Foo","Wikipedia:Foobar");
484 * assert( link.getExternalWikiPage(), "Foobar" );
485 * </pre>
486 *
487 * @return Wikiname part, or null, if this is not an interwiki link.
488 */
489 public String getExternalWikiPage()
490 {
491 if( isInterwikiLink() )
492 {
493 return m_ref.substring( m_interwikiPoint+1 );
494 }
495
496 return null;
497 }
498
499 /**
500 * Returns the number of attributes on this link.
501 *
502 * @return The number of attributes.
503 */
504 public int attributeCount()
505 {
506 return m_attribs != null
507 ? m_attribs.size()
508 : 0 ;
509 }
510
511 /**
512 * Adds another attribute to the link.
513 *
514 * @param attr A JDOM Attribute.
515 */
516 public void addAttribute( Attribute attr )
517 {
518 if( m_attribs == null )
519 {
520 m_attribs = new ArrayList<Attribute>();
521 }
522 m_attribs.add(attr);
523 }
524
525 /**
526 * Returns an Iterator over the list of JDOM Attributes.
527 *
528 * @return Iterator over the attributes.
529 */
530 public Iterator getAttributes()
531 {
532 return m_attribs != null
533 ? m_attribs.iterator()
534 : m_EMPTY.iterator() ;
535 }
536
537 /**
538 * Returns a wikitext string representation of this Link.
539 * @return WikiText.
540 */
541 public String toString()
542 {
543 StringBuffer sb = new StringBuffer();
544 sb.append( '[' );
545 sb.append( m_text );
546
547 if( m_ref != null )
548 {
549 sb.append( ' ' );
550 sb.append( '|' );
551 sb.append( ' ' );
552 sb.append( m_ref );
553 }
554
555 if( m_attribs != null )
556 {
557 sb.append( ' ' );
558 sb.append( '|' );
559 Iterator it = getAttributes();
560 while ( it.hasNext() )
561 {
562 Attribute a = (Attribute)it.next();
563 sb.append( ' ' );
564 sb.append( a.getName() );
565 sb.append( '=' );
566 sb.append( '\'' );
567 sb.append( a.getValue() );
568 sb.append( '\'' );
569 }
570 }
571 sb.append( ']' );
572 return sb.toString();
573 }
574
575 }
576 // end inner class
577
578 }