001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.
018*/
019package org.apache.wiki.parser;
020
021import java.io.BufferedReader;
022import java.io.IOException;
023import java.io.PushbackReader;
024import java.io.Reader;
025import java.util.ArrayList;
026import java.util.Collection;
027import java.util.Collections;
028import java.util.Iterator;
029import java.util.List;
030
031import org.apache.log4j.Logger;
032import org.apache.oro.text.GlobCompiler;
033import org.apache.oro.text.regex.MalformedPatternException;
034import org.apache.oro.text.regex.Pattern;
035import org.apache.oro.text.regex.PatternCompiler;
036import org.apache.wiki.StringTransmutator;
037import org.apache.wiki.WikiContext;
038import org.apache.wiki.WikiEngine;
039import org.jdom2.Element;
040
041/**
042 *   Provides an abstract class for the parser instances.
043 *
044 *   @since  2.4
045 */
046public abstract class MarkupParser
047{
048    /** Allow this many characters to be pushed back in the stream.  In effect,
049        this limits the size of a single line.  */
050    protected static final int              PUSHBACK_BUFFER_SIZE = 10*1024;
051    protected PushbackReader                m_in;
052    private int              m_pos = -1; // current position in reader stream
053
054    protected WikiEngine     m_engine;
055    protected WikiContext    m_context;
056
057    /** Optionally stores internal wikilinks */
058    protected ArrayList<StringTransmutator>      m_localLinkMutatorChain    = new ArrayList<>();
059    protected ArrayList<StringTransmutator>      m_externalLinkMutatorChain = new ArrayList<>();
060    protected ArrayList<StringTransmutator>      m_attachmentLinkMutatorChain = new ArrayList<>();
061    protected ArrayList<HeadingListener>         m_headingListenerChain     = new ArrayList<>();
062    protected ArrayList<StringTransmutator>      m_linkMutators             = new ArrayList<>();
063
064    protected boolean        m_inlineImages     = true;
065    protected boolean        m_parseAccessRules = true;
066    /** Keeps image regexp Patterns */
067    protected List< Pattern > m_inlineImagePatterns = null;
068    protected LinkParsingOperations m_linkParsingOperations;
069
070    private static Logger log = Logger.getLogger( MarkupParser.class );
071
    /** If set to "true", allows using raw HTML within Wiki text.  Be warned,
        this is a VERY dangerous option to set - never turn this on in a publicly
        accessible Wiki, unless you are absolutely certain of what you're doing. */
075    public static final String     PROP_ALLOWHTML        = "jspwiki.translatorReader.allowHTML";
076    /** If set to "true", enables plugins during parsing */
077    public static final String     PROP_RUNPLUGINS       = "jspwiki.translatorReader.runPlugins";
078
079    /** Lists all punctuation characters allowed in WikiMarkup. These
080        will not be cleaned away. This is for compatibility for older versions
081        of JSPWiki. */
082    protected static final String LEGACY_CHARS_ALLOWED      = "._";
083
084    /** Lists all punctuation characters allowed in page names. */
085    public    static final String PUNCTUATION_CHARS_ALLOWED = " ()&+,-=._$";
086
087    public    static final String HASHLINK = "hashlink";
088
089    /** Name of the outlink image; relative path to the JSPWiki directory. */
090    public    static final String OUTLINK_IMAGE = "images/out.png";
091    /** Outlink css class. */
092    public    static final String OUTLINK = "outlink";
093
094    /** If true, all outward links (external links) have a small link image appended. */
095    public    static final String PROP_USEOUTLINKIMAGE  = "jspwiki.translatorReader.useOutlinkImage";
096
097    private   static final String INLINE_IMAGE_PATTERNS = "JSPWikiMarkupParser.inlineImagePatterns";
098
099    /** If set to "true", all external links are tagged with 'rel="nofollow"' */
100    public static final String     PROP_USERELNOFOLLOW   = "jspwiki.translatorReader.useRelNofollow";
101
102    /** The value for anchor element <tt>class</tt> attributes when used
103     * for wiki page (normal) links. The value is "wikipage". */
104   public static final String CLASS_WIKIPAGE = "wikipage";
105
106   /** The value for anchor element <tt>class</tt> attributes when used
107     * for edit page links. The value is "createpage". */
108   public static final String CLASS_EDITPAGE = "createpage";
109
110   /** The value for anchor element <tt>class</tt> attributes when used
111     * for interwiki page links. The value is "interwiki". */
112   public static final String CLASS_INTERWIKI = "interwiki";
113
114   /** The value for anchor element <tt>class</tt> attributes when used
115     * for footnote links. The value is "footnote". */
116   public static final String CLASS_FOOTNOTE = "footnote";
117
   /** The value for anchor element <tt>class</tt> attributes when used
     * for footnote reference links. The value is "footnoteref". */
120   public static final String CLASS_FOOTNOTE_REF = "footnoteref";
121
122   /** The value for anchor element <tt>class</tt> attributes when used
123     * for external links. The value is "external". */
124   public static final String CLASS_EXTERNAL = "external";
125
126   /** The value for anchor element <tt>class</tt> attributes when used
127     * for attachments. The value is "attachment". */
128   public static final String CLASS_ATTACHMENT = "attachment";
129
130   public static final String[] CLASS_TYPES =
131   {
132      CLASS_WIKIPAGE,
133      CLASS_EDITPAGE,
134      "",
135      CLASS_FOOTNOTE,
136      CLASS_FOOTNOTE_REF,
137      "",
138      CLASS_EXTERNAL,
139      CLASS_INTERWIKI,
140      CLASS_EXTERNAL,
141      CLASS_WIKIPAGE,
142      CLASS_ATTACHMENT
143   };
144
145    /**
146     *  Constructs a MarkupParser.  The subclass must call this constructor
147     *  to set up the necessary bits and pieces.
148     *
149     *  @param context The WikiContext.
150     *  @param in The reader from which we are reading the bytes from.
151     */
152    protected MarkupParser( WikiContext context, Reader in )
153    {
154        m_engine = context.getEngine();
155        m_context = context;
156        m_linkParsingOperations = new LinkParsingOperations( m_context );
157        setInputReader( in );
158    }
159
160    /**
161     *  Replaces the current input character stream with a new one.
162     *  @param in New source for input.  If null, this method does nothing.
163     *  @return the old stream
164     */
165    public Reader setInputReader( Reader in )
166    {
167        Reader old = m_in;
168
169        if( in != null )
170        {
171            m_in = new PushbackReader( new BufferedReader( in ),
172                                       PUSHBACK_BUFFER_SIZE );
173        }
174
175        return old;
176    }
177
178    /**
179     *  Adds a hook for processing link texts.  This hook is called
180     *  when the link text is written into the output stream, and
181     *  you may use it to modify the text.  It does not affect the
182     *  actual link, only the user-visible text.
183     *
184     *  @param mutator The hook to call.  Null is safe.
185     */
186    public void addLinkTransmutator( StringTransmutator mutator )
187    {
188        if( mutator != null )
189        {
190            m_linkMutators.add( mutator );
191        }
192    }
193
194    /**
195     *  Adds a hook for processing local links.  The engine
196     *  transforms both non-existing and existing page links.
197     *
198     *  @param mutator The hook to call.  Null is safe.
199     */
200    public void addLocalLinkHook( StringTransmutator mutator )
201    {
202        if( mutator != null )
203        {
204            m_localLinkMutatorChain.add( mutator );
205        }
206    }
207
208    /**
209     *  Adds a hook for processing external links.  This includes
210     *  all http:// ftp://, etc. links, including inlined images.
211     *
212     *  @param mutator The hook to call.  Null is safe.
213     */
214    public void addExternalLinkHook( StringTransmutator mutator )
215    {
216        if( mutator != null )
217        {
218            m_externalLinkMutatorChain.add( mutator );
219        }
220    }
221
222    /**
223     *  Adds a hook for processing attachment links.
224     *
225     *  @param mutator The hook to call.  Null is safe.
226     */
227    public void addAttachmentLinkHook( StringTransmutator mutator )
228    {
229        if( mutator != null )
230        {
231            m_attachmentLinkMutatorChain.add( mutator );
232        }
233    }
234
235    /**
236     *  Adds a HeadingListener to the parser chain.  It will be called whenever
237     *  a parsed header is found.
238     *
239     *  @param listener The listener to add.
240     */
241    public void addHeadingListener( HeadingListener listener )
242    {
243        if( listener != null )
244        {
245            m_headingListenerChain.add( listener );
246        }
247    }
248
249    /**
250     *  Disables access rule parsing.
251     */
252    public void disableAccessRules()
253    {
254        m_parseAccessRules = false;
255    }
256
257    public boolean isParseAccessRules()
258    {
259        return m_parseAccessRules;
260    }
261
262    /**
263     *  Use this to turn on or off image inlining.
264     *  @param toggle If true, images are inlined (as per set in jspwiki.properties)
265     *                If false, then images won't be inlined; instead, they will be
266     *                treated as standard hyperlinks.
267     *  @since 2.2.9
268     */
269    public void enableImageInlining( boolean toggle )
270    {
271        m_inlineImages = toggle;
272    }
273
274    public boolean isImageInlining() {
275        return m_inlineImages;
276    }
277
278    @SuppressWarnings( "unchecked" )
279    protected final void initInlineImagePatterns() {
280        PatternCompiler compiler = new GlobCompiler();
281        //
282        //  We cache compiled patterns in the engine, since their creation is really expensive
283        //
284        List< Pattern > compiledpatterns = ( List< Pattern > )m_engine.getAttribute( INLINE_IMAGE_PATTERNS );
285
286        if( compiledpatterns == null ) {
287            compiledpatterns = new ArrayList< >( 20 );
288            Collection< String > ptrns = m_engine.getAllInlinedImagePatterns();
289
290            //
291            //  Make them into Regexp Patterns.  Unknown patterns are ignored.
292            //
293            for( Iterator< String > i = ptrns.iterator(); i.hasNext(); ) {
294                String pattern = i.next();
295                try {
296                    compiledpatterns.add( compiler.compile( pattern,
297                                                            GlobCompiler.DEFAULT_MASK | GlobCompiler.READ_ONLY_MASK ) );
298                } catch( MalformedPatternException e ) {
299                    log.error( "Malformed pattern [" + pattern + "] in properties: ", e );
300                }
301            }
302
303            m_engine.setAttribute( INLINE_IMAGE_PATTERNS, compiledpatterns );
304        }
305
306        m_inlineImagePatterns = Collections.unmodifiableList( compiledpatterns );
307    }
308
309    public List< Pattern > getInlineImagePatterns() {
310        if( m_inlineImagePatterns == null ) {
311            initInlineImagePatterns();
312        }
313        return m_inlineImagePatterns;
314    }
315
316    /**
317     *  Parses the document.
318     *  @return the parsed document, as a WikiDocument
319     *  @throws IOException If something goes wrong.
320     */
321    public abstract WikiDocument parse()
322         throws IOException;
323
324    /**
325     *  Return the current position in the reader stream.
326     *  The value will be -1 prior to reading.
327     * @return the reader position as an int.
328     */
329    public int getPosition()
330    {
331        return m_pos;
332    }
333
334    /**
335     * Returns the next token in the stream.  This is the most called method
336     * in the entire parser, so it needs to be lean and mean.
337     *
338     * @return The next token in the stream; or, if the stream is ended, -1.
339     * @throws IOException If something bad happens
340     * @throws NullPointerException If you have not yet created an input document.
341     */
342    protected final int nextToken()
343        throws IOException, NullPointerException
344    {
345        // if( m_in == null ) return -1;
346        m_pos++;
347        return m_in.read();
348    }
349
350    /**
351     *  Push back any character to the current input.  Does not
352     *  push back a read EOF, though.
353     *
354     *  @param c Character to push back.
355     *  @throws IOException In case the character cannot be pushed back.
356     */
357    protected void pushBack( int c )
358        throws IOException
359    {
360        if( c != -1 && m_in != null )
361        {
362            m_pos--;
363            m_in.unread( c );
364        }
365    }
366
367    /**
368     *  Writes HTML for error message.  Does not add it to the document, you
369     *  have to do it yourself.
370     *
371     *  @param error The error string.
372     *  @return An Element containing the error.
373     */
374
375    public static Element makeError( String error )
376    {
377        return new Element("span").setAttribute("class","error").addContent(error);
378    }
379
380    /**
381     *  Cleans a Wiki name.  The functionality of this method was changed in 2.6
382     *  so that the list of allowed characters is much larger.  Use wikifyLink()
383     *  to get the legacy behaviour.
384     *  <P>
385     *  [ This is a link ] -&gt; This is a link
386     *
387     *  @param link Link to be cleared. Null is safe, and causes this to return null.
388     *  @return A cleaned link.
389     *
390     *  @since 2.0
391     */
392    public static String cleanLink( String link )
393    {
394        return cleanLink(link, PUNCTUATION_CHARS_ALLOWED);
395    }
396
397    /**
398     *  Cleans a Wiki name based on a list of characters.  Also, any multiple
399     *  whitespace is collapsed into a single space, and any leading or trailing
400     *  space is removed.
401     *
402     *  @param link Link to be cleared. Null is safe, and causes this to return null.
403     *  @param allowedChars Characters which are allowed in the string.
404     *  @return A cleaned link.
405     *
406     *  @since 2.6
407     */
408    public static String cleanLink( String link, String allowedChars )
409    {
410        if( link == null ) return null;
411
412        link = link.trim();
413        StringBuilder clean = new StringBuilder(link.length());
414
415        //
416        //  Remove non-alphanumeric characters that should not
417        //  be put inside WikiNames.  Note that all valid
418        //  Unicode letters are considered okay for WikiNames.
419        //  It is the problem of the WikiPageProvider to take
420        //  care of actually storing that information.
421        //
422        //  Also capitalize things, if necessary.
423        //
424
425        boolean isWord = true;  // If true, we've just crossed a word boundary
426        boolean wasSpace = false;
427
428        for( int i = 0; i < link.length(); i++ )
429        {
430            char ch = link.charAt(i);
431
432            //
433            //  Cleans away repetitive whitespace and only uses the first one.
434            //
435            if( Character.isWhitespace(ch) )
436            {
437                if( wasSpace )
438                    continue;
439
440                wasSpace = true;
441            }
442            else
443            {
444                wasSpace = false;
445            }
446
447            //
448            //  Check if it is allowed to use this char, and capitalize, if necessary.
449            //
450            if( Character.isLetterOrDigit( ch ) || allowedChars.indexOf(ch) != -1 )
451            {
452                // Is a letter
453
454                if( isWord ) ch = Character.toUpperCase( ch );
455                clean.append( ch );
456                isWord = false;
457            }
458            else
459            {
460                isWord = true;
461            }
462        }
463
464        return clean.toString();
465    }
466
467    /**
468     *  Cleans away extra legacy characters.  This method functions exactly
469     *  like pre-2.6 cleanLink()
470     *  <P>
471     *  [ This is a link ] -&gt; ThisIsALink
472     *
473     *  @param link Link to be cleared. Null is safe, and causes this to return null.
474     *  @return A cleaned link.
475     *  @since 2.6
476     */
477    public static String wikifyLink(String link)
478    {
479        return MarkupParser.cleanLink(link, MarkupParser.LEGACY_CHARS_ALLOWED);
480    }
481
482}