001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018*/ 019package org.apache.wiki.parser; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.PushbackReader; 024import java.io.Reader; 025import java.util.ArrayList; 026import java.util.Collection; 027import java.util.Collections; 028import java.util.Iterator; 029import java.util.List; 030 031import org.apache.log4j.Logger; 032import org.apache.oro.text.GlobCompiler; 033import org.apache.oro.text.regex.MalformedPatternException; 034import org.apache.oro.text.regex.Pattern; 035import org.apache.oro.text.regex.PatternCompiler; 036import org.apache.wiki.StringTransmutator; 037import org.apache.wiki.WikiContext; 038import org.apache.wiki.WikiEngine; 039import org.jdom2.Element; 040 041/** 042 * Provides an abstract class for the parser instances. 043 * 044 * @since 2.4 045 */ 046public abstract class MarkupParser 047{ 048 /** Allow this many characters to be pushed back in the stream. In effect, 049 this limits the size of a single line. */ 050 protected static final int PUSHBACK_BUFFER_SIZE = 10*1024; 051 protected PushbackReader m_in; 052 private int m_pos = -1; // current position in reader stream 053 054 protected WikiEngine m_engine; 055 protected WikiContext m_context; 056 057 /** Optionally stores internal wikilinks */ 058 protected ArrayList<StringTransmutator> m_localLinkMutatorChain = new ArrayList<>(); 059 protected ArrayList<StringTransmutator> m_externalLinkMutatorChain = new ArrayList<>(); 060 protected ArrayList<StringTransmutator> m_attachmentLinkMutatorChain = new ArrayList<>(); 061 protected ArrayList<HeadingListener> m_headingListenerChain = new ArrayList<>(); 062 protected ArrayList<StringTransmutator> m_linkMutators = new ArrayList<>(); 063 064 protected boolean m_inlineImages = true; 065 protected boolean m_parseAccessRules = true; 066 /** Keeps image regexp Patterns */ 067 protected List< Pattern > m_inlineImagePatterns = null; 068 protected LinkParsingOperations m_linkParsingOperations; 069 070 private static Logger log = Logger.getLogger( MarkupParser.class ); 071 072 /** If set to "true", allows using raw HTML within Wiki text. Be warned, 073 this is a VERY dangerous option to set - never turn this on in a publicly 074 allowable Wiki, unless you are absolutely certain of what you're doing. */ 075 public static final String PROP_ALLOWHTML = "jspwiki.translatorReader.allowHTML"; 076 /** If set to "true", enables plugins during parsing */ 077 public static final String PROP_RUNPLUGINS = "jspwiki.translatorReader.runPlugins"; 078 079 /** Lists all punctuation characters allowed in WikiMarkup. These 080 will not be cleaned away. This is for compatibility for older versions 081 of JSPWiki. */ 082 protected static final String LEGACY_CHARS_ALLOWED = "._"; 083 084 /** Lists all punctuation characters allowed in page names. */ 085 public static final String PUNCTUATION_CHARS_ALLOWED = " ()&+,-=._$"; 086 087 public static final String HASHLINK = "hashlink"; 088 089 /** Name of the outlink image; relative path to the JSPWiki directory. */ 090 public static final String OUTLINK_IMAGE = "images/out.png"; 091 /** Outlink css class. */ 092 public static final String OUTLINK = "outlink"; 093 094 /** If true, all outward links (external links) have a small link image appended. */ 095 public static final String PROP_USEOUTLINKIMAGE = "jspwiki.translatorReader.useOutlinkImage"; 096 097 private static final String INLINE_IMAGE_PATTERNS = "JSPWikiMarkupParser.inlineImagePatterns"; 098 099 /** If set to "true", all external links are tagged with 'rel="nofollow"' */ 100 public static final String PROP_USERELNOFOLLOW = "jspwiki.translatorReader.useRelNofollow"; 101 102 /** The value for anchor element <tt>class</tt> attributes when used 103 * for wiki page (normal) links. The value is "wikipage". */ 104 public static final String CLASS_WIKIPAGE = "wikipage"; 105 106 /** The value for anchor element <tt>class</tt> attributes when used 107 * for edit page links. The value is "createpage". */ 108 public static final String CLASS_EDITPAGE = "createpage"; 109 110 /** The value for anchor element <tt>class</tt> attributes when used 111 * for interwiki page links. The value is "interwiki". */ 112 public static final String CLASS_INTERWIKI = "interwiki"; 113 114 /** The value for anchor element <tt>class</tt> attributes when used 115 * for footnote links. The value is "footnote". */ 116 public static final String CLASS_FOOTNOTE = "footnote"; 117 118 /** The value for anchor element <tt>class</tt> attributes when used 119 * for footnote links. The value is "footnote". */ 120 public static final String CLASS_FOOTNOTE_REF = "footnoteref"; 121 122 /** The value for anchor element <tt>class</tt> attributes when used 123 * for external links. The value is "external". */ 124 public static final String CLASS_EXTERNAL = "external"; 125 126 /** The value for anchor element <tt>class</tt> attributes when used 127 * for attachments. The value is "attachment". */ 128 public static final String CLASS_ATTACHMENT = "attachment"; 129 130 public static final String[] CLASS_TYPES = 131 { 132 CLASS_WIKIPAGE, 133 CLASS_EDITPAGE, 134 "", 135 CLASS_FOOTNOTE, 136 CLASS_FOOTNOTE_REF, 137 "", 138 CLASS_EXTERNAL, 139 CLASS_INTERWIKI, 140 CLASS_EXTERNAL, 141 CLASS_WIKIPAGE, 142 CLASS_ATTACHMENT 143 }; 144 145 /** 146 * Constructs a MarkupParser. The subclass must call this constructor 147 * to set up the necessary bits and pieces. 148 * 149 * @param context The WikiContext. 150 * @param in The reader from which we are reading the bytes from. 151 */ 152 protected MarkupParser( WikiContext context, Reader in ) 153 { 154 m_engine = context.getEngine(); 155 m_context = context; 156 m_linkParsingOperations = new LinkParsingOperations( m_context ); 157 setInputReader( in ); 158 } 159 160 /** 161 * Replaces the current input character stream with a new one. 162 * @param in New source for input. If null, this method does nothing. 163 * @return the old stream 164 */ 165 public Reader setInputReader( Reader in ) 166 { 167 Reader old = m_in; 168 169 if( in != null ) 170 { 171 m_in = new PushbackReader( new BufferedReader( in ), 172 PUSHBACK_BUFFER_SIZE ); 173 } 174 175 return old; 176 } 177 178 /** 179 * Adds a hook for processing link texts. This hook is called 180 * when the link text is written into the output stream, and 181 * you may use it to modify the text. It does not affect the 182 * actual link, only the user-visible text. 183 * 184 * @param mutator The hook to call. Null is safe. 185 */ 186 public void addLinkTransmutator( StringTransmutator mutator ) 187 { 188 if( mutator != null ) 189 { 190 m_linkMutators.add( mutator ); 191 } 192 } 193 194 /** 195 * Adds a hook for processing local links. The engine 196 * transforms both non-existing and existing page links. 197 * 198 * @param mutator The hook to call. Null is safe. 199 */ 200 public void addLocalLinkHook( StringTransmutator mutator ) 201 { 202 if( mutator != null ) 203 { 204 m_localLinkMutatorChain.add( mutator ); 205 } 206 } 207 208 /** 209 * Adds a hook for processing external links. This includes 210 * all http:// ftp://, etc. links, including inlined images. 211 * 212 * @param mutator The hook to call. Null is safe. 213 */ 214 public void addExternalLinkHook( StringTransmutator mutator ) 215 { 216 if( mutator != null ) 217 { 218 m_externalLinkMutatorChain.add( mutator ); 219 } 220 } 221 222 /** 223 * Adds a hook for processing attachment links. 224 * 225 * @param mutator The hook to call. Null is safe. 226 */ 227 public void addAttachmentLinkHook( StringTransmutator mutator ) 228 { 229 if( mutator != null ) 230 { 231 m_attachmentLinkMutatorChain.add( mutator ); 232 } 233 } 234 235 /** 236 * Adds a HeadingListener to the parser chain. It will be called whenever 237 * a parsed header is found. 238 * 239 * @param listener The listener to add. 240 */ 241 public void addHeadingListener( HeadingListener listener ) 242 { 243 if( listener != null ) 244 { 245 m_headingListenerChain.add( listener ); 246 } 247 } 248 249 /** 250 * Disables access rule parsing. 251 */ 252 public void disableAccessRules() 253 { 254 m_parseAccessRules = false; 255 } 256 257 public boolean isParseAccessRules() 258 { 259 return m_parseAccessRules; 260 } 261 262 /** 263 * Use this to turn on or off image inlining. 264 * @param toggle If true, images are inlined (as per set in jspwiki.properties) 265 * If false, then images won't be inlined; instead, they will be 266 * treated as standard hyperlinks. 267 * @since 2.2.9 268 */ 269 public void enableImageInlining( boolean toggle ) 270 { 271 m_inlineImages = toggle; 272 } 273 274 public boolean isImageInlining() { 275 return m_inlineImages; 276 } 277 278 @SuppressWarnings( "unchecked" ) 279 protected final void initInlineImagePatterns() { 280 PatternCompiler compiler = new GlobCompiler(); 281 // 282 // We cache compiled patterns in the engine, since their creation is really expensive 283 // 284 List< Pattern > compiledpatterns = ( List< Pattern > )m_engine.getAttribute( INLINE_IMAGE_PATTERNS ); 285 286 if( compiledpatterns == null ) { 287 compiledpatterns = new ArrayList< >( 20 ); 288 Collection< String > ptrns = m_engine.getAllInlinedImagePatterns(); 289 290 // 291 // Make them into Regexp Patterns. Unknown patterns are ignored. 292 // 293 for( Iterator< String > i = ptrns.iterator(); i.hasNext(); ) { 294 String pattern = i.next(); 295 try { 296 compiledpatterns.add( compiler.compile( pattern, 297 GlobCompiler.DEFAULT_MASK | GlobCompiler.READ_ONLY_MASK ) ); 298 } catch( MalformedPatternException e ) { 299 log.error( "Malformed pattern [" + pattern + "] in properties: ", e ); 300 } 301 } 302 303 m_engine.setAttribute( INLINE_IMAGE_PATTERNS, compiledpatterns ); 304 } 305 306 m_inlineImagePatterns = Collections.unmodifiableList( compiledpatterns ); 307 } 308 309 public List< Pattern > getInlineImagePatterns() { 310 if( m_inlineImagePatterns == null ) { 311 initInlineImagePatterns(); 312 } 313 return m_inlineImagePatterns; 314 } 315 316 /** 317 * Parses the document. 318 * @return the parsed document, as a WikiDocument 319 * @throws IOException If something goes wrong. 320 */ 321 public abstract WikiDocument parse() 322 throws IOException; 323 324 /** 325 * Return the current position in the reader stream. 326 * The value will be -1 prior to reading. 327 * @return the reader position as an int. 328 */ 329 public int getPosition() 330 { 331 return m_pos; 332 } 333 334 /** 335 * Returns the next token in the stream. This is the most called method 336 * in the entire parser, so it needs to be lean and mean. 337 * 338 * @return The next token in the stream; or, if the stream is ended, -1. 339 * @throws IOException If something bad happens 340 * @throws NullPointerException If you have not yet created an input document. 341 */ 342 protected final int nextToken() 343 throws IOException, NullPointerException 344 { 345 // if( m_in == null ) return -1; 346 m_pos++; 347 return m_in.read(); 348 } 349 350 /** 351 * Push back any character to the current input. Does not 352 * push back a read EOF, though. 353 * 354 * @param c Character to push back. 355 * @throws IOException In case the character cannot be pushed back. 356 */ 357 protected void pushBack( int c ) 358 throws IOException 359 { 360 if( c != -1 && m_in != null ) 361 { 362 m_pos--; 363 m_in.unread( c ); 364 } 365 } 366 367 /** 368 * Writes HTML for error message. Does not add it to the document, you 369 * have to do it yourself. 370 * 371 * @param error The error string. 372 * @return An Element containing the error. 373 */ 374 375 public static Element makeError( String error ) 376 { 377 return new Element("span").setAttribute("class","error").addContent(error); 378 } 379 380 /** 381 * Cleans a Wiki name. The functionality of this method was changed in 2.6 382 * so that the list of allowed characters is much larger. Use wikifyLink() 383 * to get the legacy behaviour. 384 * <P> 385 * [ This is a link ] -> This is a link 386 * 387 * @param link Link to be cleared. Null is safe, and causes this to return null. 388 * @return A cleaned link. 389 * 390 * @since 2.0 391 */ 392 public static String cleanLink( String link ) 393 { 394 return cleanLink(link, PUNCTUATION_CHARS_ALLOWED); 395 } 396 397 /** 398 * Cleans a Wiki name based on a list of characters. Also, any multiple 399 * whitespace is collapsed into a single space, and any leading or trailing 400 * space is removed. 401 * 402 * @param link Link to be cleared. Null is safe, and causes this to return null. 403 * @param allowedChars Characters which are allowed in the string. 404 * @return A cleaned link. 405 * 406 * @since 2.6 407 */ 408 public static String cleanLink( String link, String allowedChars ) 409 { 410 if( link == null ) return null; 411 412 link = link.trim(); 413 StringBuilder clean = new StringBuilder(link.length()); 414 415 // 416 // Remove non-alphanumeric characters that should not 417 // be put inside WikiNames. Note that all valid 418 // Unicode letters are considered okay for WikiNames. 419 // It is the problem of the WikiPageProvider to take 420 // care of actually storing that information. 421 // 422 // Also capitalize things, if necessary. 423 // 424 425 boolean isWord = true; // If true, we've just crossed a word boundary 426 boolean wasSpace = false; 427 428 for( int i = 0; i < link.length(); i++ ) 429 { 430 char ch = link.charAt(i); 431 432 // 433 // Cleans away repetitive whitespace and only uses the first one. 434 // 435 if( Character.isWhitespace(ch) ) 436 { 437 if( wasSpace ) 438 continue; 439 440 wasSpace = true; 441 } 442 else 443 { 444 wasSpace = false; 445 } 446 447 // 448 // Check if it is allowed to use this char, and capitalize, if necessary. 449 // 450 if( Character.isLetterOrDigit( ch ) || allowedChars.indexOf(ch) != -1 ) 451 { 452 // Is a letter 453 454 if( isWord ) ch = Character.toUpperCase( ch ); 455 clean.append( ch ); 456 isWord = false; 457 } 458 else 459 { 460 isWord = true; 461 } 462 } 463 464 return clean.toString(); 465 } 466 467 /** 468 * Cleans away extra legacy characters. This method functions exactly 469 * like pre-2.6 cleanLink() 470 * <P> 471 * [ This is a link ] -> ThisIsALink 472 * 473 * @param link Link to be cleared. Null is safe, and causes this to return null. 474 * @return A cleaned link. 475 * @since 2.6 476 */ 477 public static String wikifyLink(String link) 478 { 479 return MarkupParser.cleanLink(link, MarkupParser.LEGACY_CHARS_ALLOWED); 480 } 481 482}