001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import net.sf.akismet.Akismet;
022import org.apache.commons.lang3.StringUtils;
023import org.apache.commons.lang3.time.StopWatch;
024import org.apache.logging.log4j.LogManager;
025import org.apache.logging.log4j.Logger;
026import org.apache.oro.text.regex.MalformedPatternException;
027import org.apache.oro.text.regex.MatchResult;
028import org.apache.oro.text.regex.Pattern;
029import org.apache.oro.text.regex.PatternCompiler;
030import org.apache.oro.text.regex.PatternMatcher;
031import org.apache.oro.text.regex.Perl5Compiler;
032import org.apache.oro.text.regex.Perl5Matcher;
033import org.apache.wiki.InternalWikiException;
034import org.apache.wiki.api.core.Attachment;
035import org.apache.wiki.api.core.Context;
036import org.apache.wiki.api.core.ContextEnum;
037import org.apache.wiki.api.core.Engine;
038import org.apache.wiki.api.core.Page;
039import org.apache.wiki.api.exceptions.ProviderException;
040import org.apache.wiki.api.exceptions.RedirectException;
041import org.apache.wiki.api.filters.BasePageFilter;
042import org.apache.wiki.api.providers.WikiProvider;
043import org.apache.wiki.attachment.AttachmentManager;
044import org.apache.wiki.auth.user.UserProfile;
045import org.apache.wiki.pages.PageManager;
046import org.apache.wiki.ui.EditorManager;
047import org.apache.wiki.util.FileUtil;
048import org.apache.wiki.util.HttpUtil;
049import org.apache.wiki.util.TextUtil;
050import org.suigeneris.jrcs.diff.Diff;
051import org.suigeneris.jrcs.diff.DifferentiationFailedException;
052import org.suigeneris.jrcs.diff.Revision;
053import org.suigeneris.jrcs.diff.delta.AddDelta;
054import org.suigeneris.jrcs.diff.delta.ChangeDelta;
055import org.suigeneris.jrcs.diff.delta.DeleteDelta;
056import org.suigeneris.jrcs.diff.delta.Delta;
057import org.suigeneris.jrcs.diff.myers.MyersDiff;
058
059import javax.servlet.http.HttpServletRequest;
060import javax.servlet.http.HttpServletResponse;
061import javax.servlet.jsp.PageContext;
062import java.io.BufferedReader;
063import java.io.IOException;
064import java.io.InputStream;
065import java.io.InputStreamReader;
066import java.io.StringReader;
067import java.io.StringWriter;
068import java.nio.charset.StandardCharsets;
069import java.util.ArrayList;
070import java.util.Arrays;
071import java.util.Collection;
072import java.util.Date;
073import java.util.Iterator;
074import java.util.List;
075import java.util.Properties;
076import java.util.Random;
077import java.util.StringTokenizer;
078import java.util.Vector;
079import java.util.concurrent.ThreadLocalRandom;
080
081
082/**
083 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
084 *
085 *  Parameters:
086 *  <ul>
087 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
088 *     that page.  Default is "SpamFilterWordList".
089 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
090 *     that page.  Default is "SpamFilterIPList".
091 *    <li>maxpagenamelength - Maximum page name length. Default is 100.
092 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
093 *        "SpamFilterWordList/blacklist.txt"</li>
094 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".
095 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
096 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
097 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
099 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
100 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
101 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
102 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
103 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
104 *        and calculates a score for the spam, which is then compared to a filter level value.
105 *  </ul>
106 *
107 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
108 *  with the editor system.</p>
109 *  
110 *  <p>Changes by admin users are ignored in any case.</p>
111 *
112 *  @since 2.1.112
113 */
114public class SpamFilter extends BasePageFilter {
115    
    /** Name of the context variable that holds the accumulated spam score (read and written through getVariable()/setVariable()). */
    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";

    // Reason codes.  The ones used in this part of the file are passed to log() as the "source" column of the
    // spam log entry and to checkStrategy() as the error identifier.
    private static final String REASON_REGEXP = "Regexp";
    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
    private static final String REASON_BOT_TRAP = "BotTrap";
    private static final String REASON_AKISMET = "Akismet";
    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
    private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong";
    private static final String REASON_UTF8_TRAP = "UTF8Trap";

    // Presumably the names of the page variables ("[{SET spamwords=...}]" and "[{SET ips=...}]") described in the
    // class documentation; they are not referenced in this part of the file -- confirm against the list refresh code.
    private static final String LISTVAR = "spamwords";
    private static final String LISTIPVAR = "ips";
130
131    private static final Random RANDOM = ThreadLocalRandom.current();
132
    /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */
    public static final String  PROP_WORDLIST              = "wordlist";

    /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */
    public static final String  PROP_IPLIST                = "IPlist";

    /** The filter property name for specifying the maximum page name length.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_MAX_PAGENAME_LENGTH   = "maxpagenamelength";

    /** The filter property name for the page to which you are directed if Herb rejects your edit.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_ERRORPAGE             = "errorpage";
    
    /** The filter property name for specifying how many changes is any given IP address
     *  allowed to do per minute.  Value is <tt>{@value}</tt>.
     */
    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
    
    /** The filter property name for specifying how many similar changes are allowed before a host is banned.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
    
    /** The filter property name for specifying how long (in minutes) a host is banned.  Value is <tt>{@value}</tt>.*/
    public static final String  PROP_BANTIME               = "bantime";
    
    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
    public static final String  PROP_BLACKLIST             = "blacklist";
    
    /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */
    public static final String  PROP_MAXURLS               = "maxurls";
    
    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
    
    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";

    /**
     *  The filter property name for specifying groups allowed to bypass the spam filter; the value is read as a
     *  comma-separated list. Value is <tt>{@value}</tt>.
     */
    public static final String PROP_ALLOWED_GROUPS = "jspwiki.filters.spamfilter.allowedgroups";
    
    /**
     *  The filter property name for specifying which captcha technology should be used; only the value "asirra"
     *  switches the captcha on. Value is <tt>{@value}</tt>.
     */
    public static final String  PROP_CAPTCHA               = "captcha";
    
    /**
     *  The filter property name for specifying which filter strategy should be used; {@link #STRATEGY_EAGER} is
     *  assumed when the property is missing.  Value is <tt>{@value}</tt>.
     */
    public static final String  PROP_FILTERSTRATEGY        = "strategy";

    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
    public static final String  STRATEGY_EAGER             = "eager";
    
    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
    public static final String  STRATEGY_SCORE             = "score";

    /** Perl5 regular expression recognising http, https and mailto links; used to count the URLs added by a change. */
    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
184
    /** Name of the wiki page holding the spam word patterns; overridden by {@link #PROP_WORDLIST}. */
    private String          m_forbiddenWordsPage = "SpamFilterWordList";
    /** Name of the wiki page holding the banned IP patterns; overridden by {@link #PROP_IPLIST}. */
    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
    /** Maximum page name length; kept as text and parsed with Integer.parseInt() whenever a page name is checked. */
    private String          m_pageNameMaxLength  = "100";
    /** Name of the error page; overridden by {@link #PROP_ERRORPAGE}. */
    private String          m_errorPage          = "RejectedMessage";
    /** Name of the blacklist attachment; overridden by {@link #PROP_BLACKLIST}. */
    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";

    // Jakarta ORO Perl5 matcher and compiler used for the URL pattern and for the word list / blacklist patterns.
    private final PatternMatcher  m_matcher = new Perl5Matcher();
    private final PatternCompiler m_compiler = new Perl5Compiler();

    // Compiled spam word and IP patterns.  NOTE(review): they are populated outside this part of the file,
    // presumably by the blacklist refresh -- confirm.
    private Collection<Pattern> m_spamPatterns;
    private Collection<Pattern> m_IPPatterns;

    /** Compared against the last-modified time of the word list page to decide whether the patterns need rebuilding. */
    private Date m_lastRebuild = new Date( 0L );

    /** Dedicated "SpamLog" logger; receives one line per call of log(). */
    private static final Logger C_SPAMLOG = LogManager.getLogger( "SpamLog" );
    private static final Logger LOG = LogManager.getLogger( SpamFilter.class );

    /** Hosts that are currently banned; expired entries are dropped by cleanBanList(). */
    private final Vector<Host>    m_temporaryBanList = new Vector<>();

    private int             m_banTime = 60; // minutes

    /** Modifications seen recently; entries older than one minute are dropped while a new change is being checked. */
    private final Vector<Host>    m_lastModifications = new Vector<>();

    /** How many times a single IP address can change a page per minute? */
    private int             m_limitSinglePageChanges = 5;

    /** How many times can you add the exact same string to a page? */
    private int             m_limitSimilarChanges = 2;

    /** How many URLs can be added at maximum. */
    private int             m_maxUrls = 10;

    /** Compiled form of URL_REGEXP; set in initialize(). */
    private Pattern         m_urlPattern;
    /** Akismet client; created lazily on the first Akismet check and reset to null if the API key cannot be verified. */
    private Akismet         m_akismet;

    /** Akismet API key; null disables the Akismet check. */
    private String          m_akismetAPIKey;

    /** True when the captcha property is set to "asirra". */
    private boolean         m_useCaptcha;

    /** The limit at which we consider something to be spam. */
    private final int             m_scoreLimit = 1;

    /** If set to true, will ignore anyone who is in Authenticated role. */
    private boolean         m_ignoreAuthenticated;

    /** Groups allowed to bypass the filter */
    private String[]         m_allowedGroups;

    /** True for the "eager" strategy: the first failed check rejects the change instead of adding to a score. */
    private boolean         m_stopAtFirstMatch = true;

    // NOTE(review): presumably the current hash field name and the time it was last regenerated (see HASH_DELAY
    // below); both are only used outside this part of the file -- confirm.
    private static String   c_hashName;
    private static long     c_lastUpdate;

    /** The HASH_DELAY value is a maximum amount of time that an user can keep
     *  a session open, because after the value has expired, we will invent a new
     *  hash field name.  By default this is {@value} hours, which should be ample
     *  time for someone.
     */
    private static final long HASH_DELAY = 24;
244
245
246    /**
247     *  {@inheritDoc}
248     */
249    @Override
250    public void initialize( final Engine engine, final Properties properties ) {
251        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
252        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
253        m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength);
254        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
255        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges );
256        
257        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges );
258
259        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
260        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
261        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
262
263        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated );
264        m_allowedGroups = StringUtils.split( StringUtils.defaultString( properties.getProperty( PROP_ALLOWED_GROUPS, m_blacklist ) ), ',' );
265
266        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
267
268        try {
269            m_urlPattern = m_compiler.compile( URL_REGEXP );
270        } catch( final MalformedPatternException e ) {
271            LOG.fatal( "Internal error: Someone put in a faulty pattern.", e );
272            throw new InternalWikiException( "Faulty pattern." , e);
273        }
274
275        m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey );
276        m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER );
277
278        LOG.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
279                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
280    }
281
282    private static final int REJECT = 0;
283    private static final int ACCEPT = 1;
284    private static final int NOTE   = 2;
285
286    private static String log( final Context ctx, final int type, final String source, String message ) {
287        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
288        message = TextUtil.replaceString( message, "\"", "\\\"" );
289
290        final String uid = getUniqueID();
291        final String page   = ctx.getPage().getName();
292        final String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
293        final String reason;
294        switch( type ) {
295            case REJECT: reason = "REJECTED";
296                break;
297            case ACCEPT: reason = "ACCEPTED";
298                break;
299            case NOTE: reason = "NOTE";
300                break;
301            default: throw new InternalWikiException( "Illegal type " + type );
302        }
303        C_SPAMLOG.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
304
305        return uid;
306    }
307
308    /** {@inheritDoc} */
309    @Override
310    public String preSave( final Context context, final String content ) throws RedirectException {
311        cleanBanList();
312        refreshBlacklists( context );
313        final Change change = getChange( context, content );
314
315        if( !ignoreThisUser( context ) ) {
316            checkBanList( context, change );
317            checkSinglePageChange( context, content, change );
318            checkIPList( context );
319            checkPatternList( context, content, change );
320            checkPageName( context, content, change);
321        }
322
323        if( !m_stopAtFirstMatch ) {
324            final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
325
326            if( score != null && score >= m_scoreLimit ) {
327                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
328            }
329        }
330
331        log( context, ACCEPT, "-", change.toString() );
332        return content;
333    }
334
335    private void checkPageName( final Context context, final String content, final Change change ) throws RedirectException {
336        final Page page = context.getPage();
337        final String pageName = page.getName();
338        final int maxlength = Integer.parseInt(m_pageNameMaxLength);
339        if ( pageName.length() > maxlength) {
340            //
341            //  Spam filter has a match.
342            //
343
344            final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName);
345
346            LOG.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")");
347            checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" );
348
349        }
350    }
351
352    private void checkStrategy( final Context context, final String error, final String message ) throws RedirectException {
353        if( m_stopAtFirstMatch ) {
354            throw new RedirectException( message, getRedirectPage( context ) );
355        }
356
357        Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
358        if( score != null ) {
359            score = score + 1;
360        } else {
361            score = 1;
362        }
363
364        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
365    }
366    
367    /**
368     *  Parses a list of patterns and returns a Collection of compiled Pattern objects.
369     *
370     * @param source page containing the list of patterns.
371     * @param list list of patterns.
372     * @return A Collection of the Patterns that were found from the lists.
373     */
374    private Collection< Pattern > parseWordList( final Page source, final String list ) {
375        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
376
377        if( list != null ) {
378            final StringTokenizer tok = new StringTokenizer( list, " \t\n" );
379
380            while( tok.hasMoreTokens() ) {
381                final String pattern = tok.nextToken();
382
383                try {
384                    compiledpatterns.add( m_compiler.compile( pattern ) );
385                } catch( final MalformedPatternException e ) {
386                    LOG.debug( "Malformed spam filter pattern " + pattern );
387                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
388                }
389            }
390        }
391
392        return compiledpatterns;
393    }
394
395    /**
396     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
397     *
398     *  @param list list of patterns.
399     *  @return The parsed blacklist patterns.
400     */
401    private Collection< Pattern > parseBlacklist( final String list ) {
402        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
403
404        if( list != null ) {
405            try {
406                final BufferedReader in = new BufferedReader( new StringReader(list) );
407                String line;
408                while( (line = in.readLine() ) != null ) {
409                    line = line.trim();
410                    if( line.isEmpty() ) continue; // Empty line
411                    if( line.startsWith("#") ) continue; // It's a comment
412
413                    int ws = line.indexOf( ' ' );
414                    if( ws == -1 ) ws = line.indexOf( '\t' );
415                    if( ws != -1 ) line = line.substring( 0, ws );
416
417                    try {
418                        compiledpatterns.add( m_compiler.compile( line ) );
419                    } catch( final MalformedPatternException e ) {
420                        LOG.debug( "Malformed spam filter pattern " + line );
421                    }
422                }
423            } catch( final IOException e ) {
424                LOG.info( "Could not read patterns; returning what I got" , e );
425            }
426        }
427
428        return compiledpatterns;
429    }
430
431    /**
432     * Takes a single page change and performs a load of tests on the content change. An admin can modify anything.
433     *
434     * @param context page Context
435     * @param content page content
436     * @param change page change
437     * @throws RedirectException spam filter rejects the page change.
438     */
439    private synchronized void checkSinglePageChange( final Context context, final String content, final Change change )
440            throws RedirectException {
441        final HttpServletRequest req = context.getHttpRequest();
442
443        if( req != null ) {
444            final String addr = HttpUtil.getRemoteAddress( req );
445            int hostCounter = 0;
446            int changeCounter = 0;
447
448            LOG.debug( "Change is " + change.m_change );
449
450            final long time = System.currentTimeMillis() - 60*1000L; // 1 minute
451
452            for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
453                final Host host = i.next();
454
455                //  Check if this item is invalid
456                if( host.getAddedTime() < time ) {
457                    LOG.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
458                    i.remove();
459                    continue;
460                }
461
462                // Check if this IP address has been seen before
463                if( host.getAddress().equals( addr ) ) {
464                    hostCounter++;
465                }
466
467                //  Check, if this change has been seen before
468                if( host.getChange() != null && host.getChange().equals( change ) ) {
469                    changeCounter++;
470                }
471            }
472
473            //  Now, let's check against the limits.
474            if( hostCounter >= m_limitSinglePageChanges ) {
475                final Host host = new Host( addr, null );
476                m_temporaryBanList.add( host );
477
478                final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
479                LOG.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
480                checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
481            }
482
483            if( changeCounter >= m_limitSimilarChanges ) {
484                final Host host = new Host( addr, null );
485                m_temporaryBanList.add( host );
486
487                final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
488                LOG.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
489                checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
490            }
491
492            //  Calculate the number of links in the addition.
493            String tstChange  = change.toString();
494            int urlCounter = 0;
495            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
496                final MatchResult m = m_matcher.getMatch();
497                tstChange = tstChange.substring( m.endOffset(0) );
498                urlCounter++;
499            }
500
501            if( urlCounter > m_maxUrls ) {
502                final Host host = new Host( addr, null );
503                m_temporaryBanList.add( host );
504
505                final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
506                LOG.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
507                checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
508            }
509
510            //  Check bot trap
511            checkBotTrap( context, change );
512
513            //  Check UTF-8 mangling
514            checkUTF8( context, change );
515
516            //  Do Akismet check.  This is good to be the last, because this is the most expensive operation.
517            checkAkismet( context, change );
518
519            m_lastModifications.add( new Host( addr, change ) );
520        }
521    }
522
523
524    /**
525     *  Checks against the akismet system.
526     *
527     * @param context page Context
528     * @throws RedirectException spam filter rejects the page change.
529     */
530    private void checkAkismet( final Context context, final Change change ) throws RedirectException {
531        if( m_akismetAPIKey != null ) {
532            if( m_akismet == null ) {
533                LOG.info( "Initializing Akismet spam protection." );
534                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
535
536                if( !m_akismet.verifyAPIKey() ) {
537                    LOG.error( "Akismet API key cannot be verified.  Please check your config." );
538                    m_akismetAPIKey = null;
539                    m_akismet = null;
540                }
541            }
542
543            final HttpServletRequest req = context.getHttpRequest();
544
545            //  Akismet will mark all empty statements as spam, so we'll just ignore them.
546            if( change.m_adds == 0 && change.m_removals > 0 ) {
547                return;
548            }
549            
550            if( req != null && m_akismet != null ) {
551                LOG.debug( "Calling Akismet to check for spam..." );
552
553                final StopWatch sw = new StopWatch();
554                sw.start();
555
556                final String ipAddress     = HttpUtil.getRemoteAddress( req );
557                final String userAgent     = req.getHeader( "User-Agent" );
558                final String referrer      = req.getHeader( "Referer");
559                final String permalink     = context.getViewURL( context.getPage().getName() );
560                final String commentType   = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit";
561                final String commentAuthor = context.getCurrentUser().getName();
562                final String commentAuthorEmail = null;
563                final String commentAuthorURL   = null;
564
565                final boolean isSpam = m_akismet.commentCheck( ipAddress,
566                                                               userAgent,
567                                                               referrer,
568                                                               permalink,
569                                                               commentType,
570                                                               commentAuthor,
571                                                               commentAuthorEmail,
572                                                               commentAuthorURL,
573                                                               change.toString(),
574                                                               null );
575
576                sw.stop();
577                LOG.debug( "Akismet request done in: " + sw );
578
579                if( isSpam ) {
580                    // Host host = new Host( ipAddress, null );
581                    // m_temporaryBanList.add( host );
582
583                    final String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
584                    LOG.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
585                    checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
586                }
587            }
588        }
589    }
590
591    /**
592     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
593     *
594     * @return A string
595     */
596    public static String getBotFieldName() {
597        return "submit_auth";
598    }
599
600    /**
601     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
602     *
603     * @param context page Context
604     * @param change page change
605     * @throws RedirectException spam filter rejects the page change.
606     */
607    private void checkBotTrap( final Context context, final Change change ) throws RedirectException {
608        final HttpServletRequest request = context.getHttpRequest();
609        if( request != null ) {
610            final String unspam = request.getParameter( getBotFieldName() );
611            if( unspam != null && !unspam.isEmpty() ) {
612                final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
613
614                LOG.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
615                checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
616            }
617        }
618    }
619
620    private void checkUTF8( final Context context, final Change change ) throws RedirectException {
621        final HttpServletRequest request = context.getHttpRequest();
622        if( request != null ) {
623            final String utf8field = request.getParameter( "encodingcheck" );
624            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
625                final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
626
627                LOG.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
628                checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
629            }
630        }
631    }
632
633    /** Goes through the ban list and cleans away any host which has expired from it. */
634    private synchronized void cleanBanList() {
635        final long now = System.currentTimeMillis();
636        for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
637            final Host host = i.next();
638
639            if( host.getReleaseTime() < now ) {
640                LOG.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
641                i.remove();
642            }
643        }
644    }
645
646    /**
647     *  Checks the ban list if the IP address of the changer is already on it.
648     *
649     *  @param context page context
650     *  @throws RedirectException spam filter rejects the page change.
651     */
652    private void checkBanList( final Context context, final Change change ) throws RedirectException {
653        final HttpServletRequest req = context.getHttpRequest();
654
655        if( req != null ) {
656            final String remote = HttpUtil.getRemoteAddress(req);
657            final long now = System.currentTimeMillis();
658
659            for( final Host host : m_temporaryBanList ) {
660                if( host.getAddress().equals( remote ) ) {
661                    final long timeleft = ( host.getReleaseTime() - now ) / 1000L;
662
663                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
664                    checkStrategy( context, REASON_IP_BANNED_TEMPORARILY,
665                            "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
666                }
667            }
668        }
669    }
670
671    /**
672     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
673     *
674     *  @param context associated WikiContext
675     */
676    private void refreshBlacklists( final Context context ) {
677        try {
678            boolean rebuild = false;
679
680            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
681            final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage );
682            if( sourceSpam != null ) {
683                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
684                    rebuild = true;
685                }
686            }
687
688            final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist );
689            if( att != null ) {
690                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
691                    rebuild = true;
692                }
693            }
694
695            final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage );
696            if( sourceIPs != null ) {
697                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
698                    rebuild = true;
699                }
700            }
701
702            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete filter list regardless of what changed.
703            if( rebuild ) {
704                m_lastRebuild = new Date();
705                m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null );
706
707                LOG.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
708
709                m_IPPatterns = parseWordList( sourceIPs,  ( sourceIPs != null ) ? sourceIPs.getAttribute( LISTIPVAR ) : null );
710                LOG.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
711
712                if( att != null ) {
713                    final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att);
714                    final StringWriter out = new StringWriter();
715                    FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out );
716                    final Collection< Pattern > blackList = parseBlacklist( out.toString() );
717                    LOG.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
718                    m_spamPatterns.addAll( blackList );
719                }
720            }
721        } catch( final IOException ex ) {
722            LOG.info( "Unable to read attachment data, continuing...", ex );
723        } catch( final ProviderException ex ) {
724            LOG.info( "Failed to read spam filter attachment, continuing...", ex );
725        }
726    }
727
728    /**
729     * Does a check against a known pattern list.
730     *
731     * @param context page Context
732     * @param content page content
733     * @param change page change
734     * @throws RedirectException spam filter rejects the page change.
735     */
736    private void checkPatternList( final Context context, final String content, final Change change ) throws RedirectException {
737        // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return.
738        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
739            return;
740        }
741
742        String ch = change.toString();
743        if( context.getHttpRequest() != null ) {
744            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
745        }
746
747        for( final Pattern p : m_spamPatterns ) {
748            // LOG.debug("Attempting to match page contents with "+p.getPattern());
749
750            if( m_matcher.contains( ch, p ) ) {
751                //  Spam filter has a match.
752                final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
753
754                LOG.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
755                checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
756            }
757        }
758    }
759
760
761    /**
762     *  Does a check against a pattern list of IPs.
763     *
764     *  @param context page context
765     *  @throws RedirectException spam filter rejects the page change.
766     */
767    private void checkIPList( final Context context ) throws RedirectException {
768        //  If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return.
769        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
770            return;
771        }
772
773        final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
774        LOG.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
775
776        for( final Pattern p : m_IPPatterns ) {
777             LOG.debug("Attempting to match remoteIP with " + p.getPattern());
778
779            if( m_matcher.contains( remoteIP, p ) ) {
780
781                //  IP filter has a match.
782                //
783                final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
784
785                LOG.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
786                checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
787            }
788        }
789    }
790
791    private void checkPatternList( final Context context, final String content, final String change ) throws RedirectException {
792        final Change c = new Change();
793        c.m_change = change;
794        checkPatternList( context, content, c );
795    }
796 
797    /**
798     *  Creates a simple text string describing the added content.
799     *
800     *  @param context page context
801     *  @param newText added content
802     *  @return Empty string, if there is no change.
803     */
804    private static Change getChange( final Context context, final String newText ) {
805        final Page page = context.getPage();
806        final StringBuffer change = new StringBuffer();
807        final Engine engine = context.getEngine();
808        // Get current page version
809
810        final Change ch = new Change();
811        
812        try {
813            final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION );
814            final String[] first  = Diff.stringToArray( oldText );
815            final String[] second = Diff.stringToArray( newText );
816            final Revision rev = Diff.diff( first, second, new MyersDiff() );
817
818            if( rev == null || rev.size() == 0 ) {
819                return ch;
820            }
821            
822            for( int i = 0; i < rev.size(); i++ ) {
823                final Delta d = rev.getDelta( i );
824
825                if( d instanceof AddDelta ) {
826                    d.getRevised().toString( change, "", "\r\n" );
827                    ch.m_adds++;
828                    
829                } else if( d instanceof ChangeDelta ) {
830                    d.getRevised().toString( change, "", "\r\n" );
831                    ch.m_adds++;
832                    
833                } else if( d instanceof DeleteDelta ) {
834                    ch.m_removals++;
835                }
836            }
837        } catch( final DifferentiationFailedException e ) {
838            LOG.error( "Diff failed", e );
839        }
840
841        //  Don't forget to include the change note, too
842        final String changeNote = page.getAttribute( Page.CHANGENOTE );
843        if( changeNote != null ) {
844            change.append( "\r\n" );
845            change.append( changeNote );
846        }
847
848        //  And author as well
849        if( page.getAuthor() != null ) {
850            change.append( "\r\n" ).append( page.getAuthor() );
851        }
852
853        ch.m_change = change.toString();
854        return ch;
855    }
856
857    /**
858     * Returns true, if this user should be ignored.  For example, admin users.
859     *
860     * @param context page context
861     * @return True, if this user should be ignored.
862     */
863    private boolean ignoreThisUser( final Context context ) {
864        if( context.hasAdminPermissions() ) {
865            return true;
866        }
867
868        final List< String > groups = Arrays.asList( m_allowedGroups );
869        if( Arrays.stream( context.getWikiSession().getRoles() ).anyMatch( role -> groups.contains( role.getName() ) ) ) {
870            return true;
871        }
872
873        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
874            return true;
875        }
876
877        return context.getVariable("captcha") != null;
878    }
879
880    /**
881     *  Returns a random string of six uppercase characters.
882     *
883     *  @return A random string
884     */
885    private static String getUniqueID() {
886        final StringBuilder sb = new StringBuilder();
887        for( int i = 0; i < 6; i++ ) {
888            final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) );
889            sb.append( x );
890        }
891
892        return sb.toString();
893    }
894
895    /**
896     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
897     *
898     *  @param ctx WikiContext
899     *  @return An URL to redirect to
900     */
901    private String getRedirectPage( final Context ctx ) {
902        if( m_useCaptcha ) {
903            return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) );
904        }
905
906        return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage );
907    }
908
909    /**
910     *  Checks whether the UserProfile matches certain checks.
911     *
912     *  @param profile The profile to check
913     *  @param context The WikiContext
914     *  @return False, if this userprofile is suspect and should not be allowed to be added.
915     *  @since 2.6.1
916     */
917    public boolean isValidUserProfile( final Context context, final UserProfile profile ) {
918        try {
919            checkPatternList( context, profile.getEmail(), profile.getEmail() );
920            checkPatternList( context, profile.getFullname(), profile.getFullname() );
921            checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
922        } catch( final RedirectException e ) {
923            LOG.info("Detected attempt to create a spammer user account (see above for rejection reason)");
924            return false;
925        }
926
927        return true;
928    }
929
930    /**
931     *  This method is used to calculate an unique code when submitting the page to detect edit conflicts.  
932     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
933     *
934     *  @param page The WikiPage under edit
935     *  @param request The HTTP Request
936     *  @since 2.6
937     *  @return A hash value for this page and session
938     */
939    public static String getSpamHash( final Page page, final HttpServletRequest request ) {
940        long lastModified = 0;
941
942        if( page.getLastModified() != null ) {
943            lastModified = page.getLastModified().getTime();
944        }
945        final long remote = HttpUtil.getRemoteAddress( request ).hashCode();
946
947        return Long.toString( lastModified ^ remote );
948    }
949
950    /**
951     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
952     *  the session has expired, you cannot edit anymore.
953     *
954     *  @param request The page request
955     *  @return The name to be used in the hash field
956     *  @since  2.6
957     */
958    public static String getHashFieldName( final HttpServletRequest request ) {
959        String hash = null;
960
961        if( request.getSession() != null ) {
962            hash = ( String )request.getSession().getAttribute( "_hash" );
963
964            if( hash == null ) {
965                hash = c_hashName;
966                request.getSession().setAttribute( "_hash", hash );
967            }
968        }
969
970        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
971            c_hashName = getUniqueID().toLowerCase();
972            c_lastUpdate = System.currentTimeMillis();
973        }
974
975        return hash != null ? hash : c_hashName;
976    }
977
978
979    /**
980     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
981     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
982     *  and their session has expired.
983     *  <p>
984     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
985     *  the spam log (it may or may not be spam, but it's rather likely that it is).
986     *
987     *  @param context The WikiContext
988     *  @param pageContext The JSP PageContext.
989     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
990     *  @throws IOException If redirection fails
991     *  @since 2.6
992     */
993    public static boolean checkHash( final Context context, final PageContext pageContext ) throws IOException {
994        final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
995        if( pageContext.getRequest().getParameter(hashName) == null ) {
996            if( pageContext.getAttribute( hashName ) == null ) {
997                final Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
998                log( context, REJECT, "MissingHash", change.m_change );
999
1000                final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" );
1001                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
1002                return false;
1003            }
1004        }
1005
1006        return true;
1007    }
1008
1009    /**
1010     * This helper method adds all the input fields to your editor that the SpamFilter requires
1011     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1012     *  
1013     * @param pageContext The PageContext
1014     * @return A HTML string which contains input fields for the SpamFilter.
1015     */
1016    public static String insertInputFields( final PageContext pageContext ) {
1017        final Context ctx = Context.findContext( pageContext );
1018        final Engine engine = ctx.getEngine();
1019        final StringBuilder sb = new StringBuilder();
1020        if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) {
1021            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1022        }
1023
1024        return sb.toString();
1025    }
1026    
1027    /**
1028     *  A local class for storing host information.
1029     */
1030    private class Host {
1031
1032        private final long m_addedTime = System.currentTimeMillis();
1033        private final long m_releaseTime;
1034        private final String m_address;
1035        private final Change m_change;
1036
1037        public String getAddress() {
1038            return m_address;
1039        }
1040
1041        public long getReleaseTime() {
1042            return m_releaseTime;
1043        }
1044
1045        public long getAddedTime() {
1046            return m_addedTime;
1047        }
1048
1049        public Change getChange() {
1050            return m_change;
1051        }
1052
1053        public Host( final String ipaddress, final Change change ) {
1054            m_address = ipaddress;
1055            m_change = change;
1056            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1057        }
1058        
1059    }
1060    
1061    private static class Change {
1062        
1063        public String m_change;
1064        public int    m_adds;
1065        public int    m_removals;
1066
1067        @Override
1068        public String toString() {
1069            return m_change;
1070        }
1071
1072        @Override
1073        public boolean equals( final Object o ) {
1074            if( o instanceof Change ) {
1075                return m_change.equals( ( ( Change )o ).m_change );
1076            }
1077            return false;
1078        }
1079
1080        @Override
1081        public int hashCode() {
1082            return m_change.hashCode() + 17;
1083        }
1084        
1085    }
1086
1087}