001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import java.io.BufferedReader;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.StringReader;
026import java.io.StringWriter;
027import java.util.ArrayList;
028import java.util.Collection;
029import java.util.Date;
030import java.util.Iterator;
031import java.util.Properties;
032import java.util.Random;
033import java.util.StringTokenizer;
034import java.util.Vector;
035
036import javax.servlet.http.HttpServletRequest;
037import javax.servlet.http.HttpServletResponse;
038import javax.servlet.jsp.PageContext;
039
040import org.apache.commons.lang.time.StopWatch;
041import org.apache.log4j.Logger;
042import org.apache.oro.text.regex.MalformedPatternException;
043import org.apache.oro.text.regex.MatchResult;
044import org.apache.oro.text.regex.Pattern;
045import org.apache.oro.text.regex.PatternCompiler;
046import org.apache.oro.text.regex.PatternMatcher;
047import org.apache.oro.text.regex.Perl5Compiler;
048import org.apache.oro.text.regex.Perl5Matcher;
049import org.apache.wiki.InternalWikiException;
050import org.apache.wiki.WikiContext;
051import org.apache.wiki.WikiEngine;
052import org.apache.wiki.WikiPage;
053import org.apache.wiki.WikiProvider;
054import org.apache.wiki.api.exceptions.ProviderException;
055import org.apache.wiki.api.exceptions.RedirectException;
056import org.apache.wiki.api.filters.BasicPageFilter;
057import org.apache.wiki.attachment.Attachment;
058import org.apache.wiki.auth.user.UserProfile;
059import org.apache.wiki.ui.EditorManager;
060import org.apache.wiki.util.FileUtil;
061import org.apache.wiki.util.HttpUtil;
062import org.apache.wiki.util.TextUtil;
063import org.suigeneris.jrcs.diff.Diff;
064import org.suigeneris.jrcs.diff.DifferentiationFailedException;
065import org.suigeneris.jrcs.diff.Revision;
066import org.suigeneris.jrcs.diff.delta.AddDelta;
067import org.suigeneris.jrcs.diff.delta.ChangeDelta;
068import org.suigeneris.jrcs.diff.delta.DeleteDelta;
069import org.suigeneris.jrcs.diff.delta.Delta;
070import org.suigeneris.jrcs.diff.myers.MyersDiff;
071
072import net.sf.akismet.Akismet;
073
074
075/**
076 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
077 *
 *  Parameters:
 *  <ul>
 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
 *     that page.  Default is "SpamFilterWordList".</li>
 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
 *     that page.  Default is "SpamFilterIPList".</li>
 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
 *        "SpamFilterWordList/blacklist.txt"</li>
 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".</li>
 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *        and calculate a score for the spam, which is then compared to a filter level value.</li>
 *  </ul>
098 *
099 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
100 *  with the editor system.</p>
101 *  
102 *  <p>Changes by admin users are ignored in any case.</p>
103 *
104 *  @since 2.1.112
105 */
106public class SpamFilter extends BasicPageFilter {
107    
108    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";
109    private static final String REASON_REGEXP = "Regexp";
110    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
111    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
112    private static final String REASON_BOT_TRAP = "BotTrap";
113    private static final String REASON_AKISMET = "Akismet";
114    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
115    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
116    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
117    private static final String REASON_UTF8_TRAP = "UTF8Trap";
118
119    private static final String LISTVAR = "spamwords";
120    private static final String LISTIPVAR = "ips";
121
122    /** The filter property name for specifying the page which contains the list of spamwords.
123     *  Value is <tt>{@value}</tt>. */
124    public static final String  PROP_WORDLIST              = "wordlist";
125
126    /** The filter property name for specifying the page which contains the list of IPs to ban.
127     *  Value is <tt>{@value}</tt>. */
128    public static final String  PROP_IPLIST              = "IPlist";
129
130    /** The filter property name for the page to which you are directed if Herb rejects your
131     *  edit.  Value is <tt>{@value}</tt>. */
132    public static final String  PROP_ERRORPAGE             = "errorpage";
133    
134    /** The filter property name for specifying how many changes is any given IP address
135     *  allowed to do per minute.  Value is <tt>{@value}</tt>.
136     */
137    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
138    
139    /** The filter property name for specifying how many similar changes are allowed
140     *  before a host is banned.  Value is <tt>{@value}</tt>.
141     */
142    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
143    
144    /** The filter property name for specifying how long a host is banned.  Value is <tt>{@value}</tt>.*/
145    public static final String  PROP_BANTIME               = "bantime";
146    
147    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
148    public static final String  PROP_BLACKLIST             = "blacklist";
149    
150    /** The filter property name for specifying how many URLs can any given edit contain.  
151     *  Value is <tt>{@value}</tt> */
152    public static final String  PROP_MAXURLS               = "maxurls";
153    
154    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
155    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
156    
157    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
158    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";
159    
160    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
161    public static final String  PROP_CAPTCHA               = "captcha";
162    
163    /** The filter property name for specifying which filter strategy should be used.  Value is <tt>{@value}</tt>. */
164    public static final String  PROP_FILTERSTRATEGY        = "strategy";
165
166    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
167    public static final String  STRATEGY_EAGER             = "eager";
168    
169    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
170    public static final String  STRATEGY_SCORE             = "score";
171
172    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
173
174    private String          m_forbiddenWordsPage = "SpamFilterWordList";
175    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
176    private String          m_errorPage          = "RejectedMessage";
177    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";
178
179    private PatternMatcher  m_matcher = new Perl5Matcher();
180    private PatternCompiler m_compiler = new Perl5Compiler();
181
182    private Collection<Pattern> m_spamPatterns = null;
183    private Collection<Pattern> m_IPPatterns = null;
184
185    private Date            m_lastRebuild = new Date( 0L );
186
187    private static  Logger  c_spamlog = Logger.getLogger( "SpamLog" );
188    private static  Logger  log = Logger.getLogger( SpamFilter.class );
189
190
191    private Vector<Host>    m_temporaryBanList = new Vector<Host>();
192
193    private int             m_banTime = 60; // minutes
194
195    private Vector<Host>    m_lastModifications = new Vector<Host>();
196
197    /**
198     *  How many times a single IP address can change a page per minute?
199     */
200    private int             m_limitSinglePageChanges = 5;
201
202    /**
203     *  How many times can you add the exact same string to a page?
204     */
205    private int             m_limitSimilarChanges = 2;
206
207    /**
208     *  How many URLs can be added at maximum.
209     */
210    private int             m_maxUrls = 10;
211
212    private Pattern         m_urlPattern;
213    private Akismet         m_akismet;
214
215    private String          m_akismetAPIKey = null;
216
217    private boolean         m_useCaptcha = false;
218
219    /** The limit at which we consider something to be spam. */
220    private int             m_scoreLimit = 1;
221
222    /**
223     * If set to true, will ignore anyone who is in Authenticated role.
224     */
225    private boolean         m_ignoreAuthenticated = false;
226
227    private boolean         m_stopAtFirstMatch = true;
228
229    private static String   c_hashName;
230    private static long     c_lastUpdate;
231
232    /** The HASH_DELAY value is a maximum amount of time that an user can keep
233     *  a session open, because after the value has expired, we will invent a new
234     *  hash field name.  By default this is {@value} hours, which should be ample
235     *  time for someone.
236     */
237    private static final long HASH_DELAY = 24;
238
239
240    /**
241     *  {@inheritDoc}
242     */
243    @Override
244    public void initialize( WikiEngine engine, Properties properties ) {
245        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
246        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
247        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
248        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties,
249                                                                PROP_PAGECHANGES,
250                                                                m_limitSinglePageChanges );
251        
252        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties,
253                                                             PROP_SIMILARCHANGES,
254                                                             m_limitSimilarChanges );
255
256        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
257        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
258        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
259
260        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties,
261                                                             PROP_IGNORE_AUTHENTICATED,
262                                                             m_ignoreAuthenticated );
263
264        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
265
266        try {
267            m_urlPattern = m_compiler.compile( URL_REGEXP );
268        } catch( MalformedPatternException e ) {
269            log.fatal( "Internal error: Someone put in a faulty pattern.", e );
270            throw new InternalWikiException( "Faulty pattern." );
271        }
272
273        m_akismetAPIKey = TextUtil.getStringProperty( properties,
274                                                      PROP_AKISMET_API_KEY,
275                                                      m_akismetAPIKey );
276
277        m_stopAtFirstMatch = TextUtil.getStringProperty( properties,
278                                                         PROP_FILTERSTRATEGY,
279                                                         STRATEGY_EAGER ).equals( STRATEGY_EAGER );
280
281        log.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
282                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
283
284
285    }
286
287    private static final int REJECT = 0;
288    private static final int ACCEPT = 1;
289    private static final int NOTE   = 2;
290
291    private static String log( WikiContext ctx, int type, String source, String message ) {
292        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
293        message = TextUtil.replaceString( message, "\"", "\\\"" );
294
295        String uid = getUniqueID();
296
297        String page   = ctx.getPage().getName();
298        String reason = "UNKNOWN";
299        String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
300
301        switch( type ) {
302            case REJECT:
303                reason = "REJECTED";
304                break;
305            case ACCEPT:
306                reason = "ACCEPTED";
307                break;
308            case NOTE:
309                reason = "NOTE";
310                break;
311            default:
312                throw new InternalWikiException( "Illegal type " + type );
313        }
314        c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
315
316        return uid;
317    }
318
319    /** {@inheritDoc} */
320    public String preSave( WikiContext context, String content ) throws RedirectException {
321        cleanBanList();
322        refreshBlacklists( context );
323        Change change = getChange( context, content );
324
325        if( !ignoreThisUser( context ) ) {
326            checkBanList( context, change );
327            checkSinglePageChange( context, content, change );
328            checkIPList( context );
329            checkPatternList( context, content, change );
330        }
331
332        if( !m_stopAtFirstMatch ) {
333            Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
334
335            if( score != null && score.intValue() >= m_scoreLimit ) {
336                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
337            }
338        }
339
340        log( context, ACCEPT, "-", change.toString() );
341        return content;
342    }
343
344    private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException {
345        if( m_stopAtFirstMatch ) {
346            throw new RedirectException( message, getRedirectPage( context ) );
347        }
348
349        Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
350        if( score != null ) {
351            score = score + 1;
352        } else {
353            score = 1;
354        }
355
356        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
357    }
358    
359    /**
360     *  Parses a list of patterns and returns a Collection of compiled Pattern
361     *  objects.
362     *
363     * @param source
364     * @param list
365     * @return A Collection of the Patterns that were found from the lists.
366     */
367    private Collection< Pattern > parseWordList( WikiPage source, String list ) {
368        ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
369
370        if( list != null ) {
371            StringTokenizer tok = new StringTokenizer( list, " \t\n" );
372
373            while( tok.hasMoreTokens() ) {
374                String pattern = tok.nextToken();
375
376                try {
377                    compiledpatterns.add( m_compiler.compile( pattern ) );
378                } catch( MalformedPatternException e ) {
379                    log.debug( "Malformed spam filter pattern " + pattern );
380                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
381                }
382            }
383        }
384
385        return compiledpatterns;
386    }
387
388    /**
389     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
390     *
391     *  @param list
392     *  @return The parsed blacklist patterns.
393     */
394    private Collection< Pattern > parseBlacklist( String list ) {
395        ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
396
397        if( list != null ) {
398            try {
399                BufferedReader in = new BufferedReader( new StringReader(list) );
400                String line;
401                while( (line = in.readLine() ) != null ) {
402                    line = line.trim();
403                    if( line.length() == 0 ) continue; // Empty line
404                    if( line.startsWith("#") ) continue; // It's a comment
405
406                    int ws = line.indexOf( ' ' );
407                    if( ws == -1 ) ws = line.indexOf( '\t' );
408                    if( ws != -1 ) line = line.substring( 0, ws );
409
410                    try {
411                        compiledpatterns.add( m_compiler.compile( line ) );
412                    } catch( MalformedPatternException e ) {
413                        log.debug( "Malformed spam filter pattern " + line );
414                    }
415                }
416            } catch( IOException e ) {
417                log.info( "Could not read patterns; returning what I got" , e );
418            }
419        }
420
421        return compiledpatterns;
422    }
423
424    /**
425     *  Takes a single page change and performs a load of tests on the content change.
426     *  An admin can modify anything.
427     *
428     *  @param context
429     *  @param content
430     *  @throws RedirectException
431     */
432    private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 
433            throws RedirectException {
434        HttpServletRequest req = context.getHttpRequest();
435
436        if( req != null ) {
437            String addr = HttpUtil.getRemoteAddress( req );
438            int hostCounter = 0;
439            int changeCounter = 0;
440
441            log.debug( "Change is " + change.m_change );
442
443            long time = System.currentTimeMillis() - 60*1000L; // 1 minute
444
445            for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
446                Host host = i.next();
447
448                //
449                //  Check if this item is invalid
450                //
451                if( host.getAddedTime() < time ) {
452                    log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
453                    i.remove();
454                    continue;
455                }
456
457                //
458                // Check if this IP address has been seen before
459                //
460
461                if( host.getAddress().equals( addr ) ) {
462                    hostCounter++;
463                }
464
465                //
466                //  Check, if this change has been seen before
467                //
468
469                if( host.getChange() != null && host.getChange().equals( change ) ) {
470                    changeCounter++;
471                }
472            }
473
474            //
475            //  Now, let's check against the limits.
476            //
477            if( hostCounter >= m_limitSinglePageChanges ) {
478                Host host = new Host( addr, null );
479                m_temporaryBanList.add( host );
480
481                String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
482                log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
483                checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
484            }
485
486            if( changeCounter >= m_limitSimilarChanges ) {
487                Host host = new Host( addr, null );
488                m_temporaryBanList.add( host );
489
490                String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
491                log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
492                checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
493            }
494
495            //
496            //  Calculate the number of links in the addition.
497            //
498            String tstChange  = change.toString();
499            int    urlCounter = 0;
500            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
501                MatchResult m = m_matcher.getMatch();
502                tstChange = tstChange.substring( m.endOffset(0) );
503                urlCounter++;
504            }
505
506            if( urlCounter > m_maxUrls ) {
507                Host host = new Host( addr, null );
508                m_temporaryBanList.add( host );
509
510                String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
511                log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
512                checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
513            }
514
515            //
516            //  Check bot trap
517            //
518            checkBotTrap( context, change );
519
520            //
521            //  Check UTF-8 mangling
522            //
523            checkUTF8( context, change );
524
525            //
526            //  Do Akismet check.  This is good to be the last, because this is the most
527            //  expensive operation.
528            //
529            checkAkismet( context, change );
530
531            m_lastModifications.add( new Host( addr, change ) );
532        }
533    }
534
535
536    /**
537     *  Checks against the akismet system.
538     *
539     * @param context
540     * @param change
541     * @throws RedirectException
542     */
543    private void checkAkismet( WikiContext context, Change change ) throws RedirectException {
544        if( m_akismetAPIKey != null ) {
545            if( m_akismet == null ) {
546                log.info( "Initializing Akismet spam protection." );
547                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
548
549                if( !m_akismet.verifyAPIKey() ) {
550                    log.error( "Akismet API key cannot be verified.  Please check your config." );
551                    m_akismetAPIKey = null;
552                    m_akismet = null;
553                }
554            }
555
556            HttpServletRequest req = context.getHttpRequest();
557
558            //
559            //  Akismet will mark all empty statements as spam, so we'll just
560            //  ignore them.
561            //
562            if( change.m_adds == 0 && change.m_removals > 0 ) {
563                return;
564            }
565            
566            if( req != null && m_akismet != null ) {
567                log.debug( "Calling Akismet to check for spam..." );
568
569                StopWatch sw = new StopWatch();
570                sw.start();
571
572                String ipAddress     = HttpUtil.getRemoteAddress( req );
573                String userAgent     = req.getHeader( "User-Agent" );
574                String referrer      = req.getHeader( "Referer");
575                String permalink     = context.getViewURL( context.getPage().getName() );
576                String commentType   = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit";
577                String commentAuthor = context.getCurrentUser().getName();
578                String commentAuthorEmail = null;
579                String commentAuthorURL   = null;
580
581                boolean isSpam = m_akismet.commentCheck( ipAddress,
582                                                         userAgent,
583                                                         referrer,
584                                                         permalink,
585                                                         commentType,
586                                                         commentAuthor,
587                                                         commentAuthorEmail,
588                                                         commentAuthorURL,
589                                                         change.toString(),
590                                                         null );
591
592                sw.stop();
593                log.debug( "Akismet request done in: " + sw );
594
595                if( isSpam ) {
596                    // Host host = new Host( ipAddress, null );
597                    // m_temporaryBanList.add( host );
598
599                    String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
600                    log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
601                    checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
602                }
603            }
604        }
605    }
606
607    /**
608     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
609     *
610     * @return A string
611     */
612    public static String getBotFieldName() {
613        return "submit_auth";
614    }
615
616    /**
617     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
618     *
619     * @param context
620     * @param change
621     * @throws RedirectException
622     */
623    private void checkBotTrap( WikiContext context, Change change ) throws RedirectException {
624        HttpServletRequest request = context.getHttpRequest();
625
626        if( request != null ) {
627            String unspam = request.getParameter( getBotFieldName() );
628            if( unspam != null && unspam.length() > 0 ) {
629                String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
630
631                log.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
632                checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
633            }
634        }
635    }
636
637    private void checkUTF8( WikiContext context, Change change ) throws RedirectException {
638        HttpServletRequest request = context.getHttpRequest();
639
640        if( request != null ) {
641            String utf8field = request.getParameter( "encodingcheck" );
642
643            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
644                String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
645
646                log.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
647                checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
648            }
649        }
650    }
651
652    /** Goes through the ban list and cleans away any host which has expired from it. */
653    private synchronized void cleanBanList() {
654        long now = System.currentTimeMillis();
655
656        for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
657            Host host = i.next();
658
659            if( host.getReleaseTime() < now ) {
660                log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
661                i.remove();
662            }
663        }
664    }
665
666    /**
667     *  Checks the ban list if the IP address of the changer is already on it.
668     *
669     *  @param context
670     *  @throws RedirectException
671     */
672    private void checkBanList( WikiContext context, Change change ) throws RedirectException {
673        HttpServletRequest req = context.getHttpRequest();
674
675        if( req != null ) {
676            String remote = HttpUtil.getRemoteAddress(req);
677            long now = System.currentTimeMillis();
678
679            for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
680                Host host = i.next();
681
682                if( host.getAddress().equals( remote ) ) {
683                    long timeleft = ( host.getReleaseTime() - now ) / 1000L;
684
685                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
686                    checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
687                }
688            }
689        }
690    }
691
692    /**
693     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
694     *
695     *  @param context
696     */
697    private void refreshBlacklists( WikiContext context ) {
698        try {
699
700            boolean rebuild = false;
701
702            //
703            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
704            //
705            WikiPage sourceSpam = context.getEngine().getPage( m_forbiddenWordsPage );
706            if( sourceSpam != null ) {
707                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
708                    rebuild = true;
709                }
710            }
711
712            Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist );
713            if( att != null ) {
714                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
715                    rebuild = true;
716                }
717            }
718
719            WikiPage sourceIPs = context.getEngine().getPage( m_forbiddenIPsPage );
720            if( sourceIPs != null ) {
721                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
722                    rebuild = true;
723                }
724            }
725
726            //
727            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete
728            //  filter list regardless of what changed.
729            //
730            if( rebuild ) {
731                m_lastRebuild = new Date();
732                m_spamPatterns = parseWordList( sourceSpam,
733                                                ( sourceSpam != null ) ? ( String )sourceSpam.getAttribute( LISTVAR ) : null );
734
735                log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
736
737                m_IPPatterns = parseWordList( sourceIPs,
738                        ( sourceIPs != null ) ? ( String )sourceIPs.getAttribute( LISTIPVAR ) : null );
739                log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
740
741                if( att != null ) {
742                    InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att);
743                    StringWriter out = new StringWriter();
744                    FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out );
745                    Collection< Pattern > blackList = parseBlacklist( out.toString() );
746                    log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
747                    m_spamPatterns.addAll( blackList );
748                }
749            }
750        } catch( IOException ex ) {
751            log.info( "Unable to read attachment data, continuing...", ex );
752        } catch( ProviderException ex ) {
753            log.info( "Failed to read spam filter attachment, continuing...", ex );
754        }
755    }
756
757    /**
758     *  Does a check against a known pattern list.
759     *
760     *  @param context
761     *  @param content
762     *  @param change
763     *  @throws RedirectException
764     */
765    private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException {
766        //
767        //  If we have no spam patterns defined, or we're trying to save
768        //  the page containing the patterns, just return.
769        //
770        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
771            return;
772        }
773
774        String ch = change.toString();
775        if( context.getHttpRequest() != null ) {
776            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
777        }
778
779        for( Pattern p : m_spamPatterns ) {
780            // log.debug("Attempting to match page contents with "+p.getPattern());
781
782            if( m_matcher.contains( ch, p ) ) {
783                //
784                //  Spam filter has a match.
785                //
786                String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
787
788                log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
789                checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
790            }
791        }
792    }
793
794
795    /**
796     *  Does a check against a pattern list of IPs.
797     *
798     *  @param context
799     *  @throws RedirectException
800     */
801    private void checkIPList( WikiContext context ) throws RedirectException {
802        //
803        //  If we have no IP patterns defined, or we're trying to save
804        //  the page containing the IP patterns, just return.
805        //
806        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
807            return;
808        }
809
810        String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
811        log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
812
813        for( Pattern p : m_IPPatterns ) {
814             log.debug("Attempting to match remoteIP with " + p.getPattern());
815
816            if( m_matcher.contains( remoteIP, p ) ) {
817
818                //  IP filter has a match.
819                //
820                String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
821
822                log.info( "SPAM:Regexp (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
823                checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
824            }
825        }
826    }
827
828    private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException {
829        Change c = new Change();
830        c.m_change = change;
831        checkPatternList( context, content, c );
832    }
833 
834    /**
835     *  Creates a simple text string describing the added content.
836     *
837     *  @param context
838     *  @param newText
     *  @return A Change holding the added text and the add/removal counts; its text is left unset (null) if there is no change.
840     */
841    private static Change getChange( WikiContext context, String newText ) {
842        WikiPage page = context.getPage();
843        StringBuffer change = new StringBuffer();
844        WikiEngine engine = context.getEngine();
845        // Get current page version
846
847        Change ch = new Change();
848        
849        try {
850            String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION );
851
852            String[] first  = Diff.stringToArray( oldText );
853            String[] second = Diff.stringToArray( newText );
854            Revision rev = Diff.diff( first, second, new MyersDiff() );
855
856            if( rev == null || rev.size() == 0 ) {
857                return ch;
858            }
859            
860            for( int i = 0; i < rev.size(); i++ ) {
861                Delta d = rev.getDelta( i );
862
863                if( d instanceof AddDelta ) {
864                    d.getRevised().toString( change, "", "\r\n" );
865                    ch.m_adds++;
866                    
867                } else if( d instanceof ChangeDelta ) {
868                    d.getRevised().toString( change, "", "\r\n" );
869                    ch.m_adds++;
870                    
871                } else if( d instanceof DeleteDelta ) {
872                    ch.m_removals++;
873                }
874            }
875        } catch( DifferentiationFailedException e ) {
876            log.error( "Diff failed", e );
877        }
878
879        //
880        //  Don't forget to include the change note, too
881        //
882        String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE );
883
884        if( changeNote != null ) {
885            change.append( "\r\n" );
886            change.append( changeNote );
887        }
888
889        //
890        //  And author as well
891        //
892        if( page.getAuthor() != null ) {
893            change.append( "\r\n" + page.getAuthor() );
894        }
895
896        ch.m_change = change.toString();
897        return ch;
898    }
899
900    /**
901     *  Returns true, if this user should be ignored.  For example, admin users.
902     *
903     * @param context
904     * @return True, if this users should be ignored.
905     */
906    private boolean ignoreThisUser( WikiContext context ) {
907        if( context.hasAdminPermissions() ) {
908            return true;
909        }
910
911        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
912            return true;
913        }
914
915        if( context.getVariable( "captcha" ) != null ) {
916            return true;
917        }
918
919        return false;
920    }
921
922    /**
923     *  Returns a random string of six uppercase characters.
924     *
925     *  @return A random string
926     */
927    private static String getUniqueID() {
928        StringBuilder sb = new StringBuilder();
929        Random rand = new Random();
930
931        for( int i = 0; i < 6; i++ ) {
932            char x = ( char )( 'A' + rand.nextInt( 26 ) );
933            sb.append( x );
934        }
935
936        return sb.toString();
937    }
938
939    /**
940     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
941     *
942     *  @param ctx WikiContext
943     *  @return An URL to redirect to
944     */
945    private String getRedirectPage( WikiContext ctx ) {
946        if( m_useCaptcha ) {
947            return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) );
948        }
949
950        return ctx.getURL( WikiContext.VIEW, m_errorPage );
951    }
952
953    /**
954     *  Checks whether the UserProfile matches certain checks.
955     *
956     *  @param profile The profile to check
957     *  @param context The WikiContext
958     *  @return False, if this userprofile is suspect and should not be allowed to be added.
959     *  @since 2.6.1
960     */
961    public boolean isValidUserProfile( WikiContext context, UserProfile profile ) {
962        try {
963            checkPatternList( context, profile.getEmail(), profile.getEmail() );
964            checkPatternList( context, profile.getFullname(), profile.getFullname() );
965            checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
966        } catch( RedirectException e ) {
967            log.info("Detected attempt to create a spammer user account (see above for rejection reason)");
968            return false;
969        }
970
971        return true;
972    }
973
974    /**
     *  This method is used to calculate a unique code when submitting the page to detect edit conflicts.
     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
977     *
978     *  @param page The WikiPage under edit
979     *  @param request The HTTP Request
980     *  @since 2.6
981     *  @return A hash value for this page and session
982     */
983    public static final String getSpamHash( WikiPage page, HttpServletRequest request ) {
984        long lastModified = 0;
985
986        if( page.getLastModified() != null ) {
987            lastModified = page.getLastModified().getTime();
988        }
989        long remote = HttpUtil.getRemoteAddress( request ).hashCode();
990
991        return Long.toString( lastModified ^ remote );
992    }
993
994    /**
995     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
996     *  the session has expired, you cannot edit anymore.
997     *
998     *  @param request The page request
999     *  @return The name to be used in the hash field
1000     *  @since  2.6
1001     */
1002    public static final String getHashFieldName( HttpServletRequest request ) {
1003        String hash = null;
1004
1005        if( request.getSession() != null ) {
1006            hash = ( String )request.getSession().getAttribute( "_hash" );
1007
1008            if( hash == null ) {
1009                hash = c_hashName;
1010                request.getSession().setAttribute( "_hash", hash );
1011            }
1012        }
1013
1014        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
1015            c_hashName = getUniqueID().toLowerCase();
1016            c_lastUpdate = System.currentTimeMillis();
1017        }
1018
1019        return hash != null ? hash : c_hashName;
1020    }
1021
1022
1023    /**
1024     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
1025     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
1026     *  and their session has expired.
1027     *  <p>
1028     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
1029     *  the spam log (it may or may not be spam, but it's rather likely that it is).
1030     *
1031     *  @param context The WikiContext
1032     *  @param pageContext The JSP PageContext.
1033     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
1034     *  @throws IOException If redirection fails
1035     *  @since 2.6
1036     */
1037    public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException {
1038        String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
1039
1040        if( pageContext.getRequest().getParameter(hashName) == null ) {
1041            if( pageContext.getAttribute( hashName ) == null ) {
1042                Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
1043                log( context, REJECT, "MissingHash", change.m_change );
1044
1045                String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" );
1046                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
1047                return false;
1048            }
1049        }
1050
1051        return true;
1052    }
1053
1054    /**
1055     * This helper method adds all the input fields to your editor that the SpamFilter requires
1056     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1057     *  
1058     * @param pageContext The PageContext
1059     * @return A HTML string which contains input fields for the SpamFilter.
1060     */
1061    public static final String insertInputFields( PageContext pageContext ) {
1062        WikiContext ctx = WikiContext.findContext( pageContext );
1063        WikiEngine engine = ctx.getEngine();
1064
1065        StringBuilder sb = new StringBuilder();
1066        if( engine.getContentEncoding().equals( "UTF-8" ) ) {
1067            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1068        }
1069
1070        return sb.toString();
1071    }
1072    
1073    /**
     *  A local class for storing host information: the address, the change associated with it,
     *  the time the entry was created and the time at which it is released.
1077     */
1078    private class Host {
1079        
1080        private long   m_addedTime = System.currentTimeMillis();
1081        private long   m_releaseTime;
1082        private String m_address;
1083        private Change m_change;
1084
1085        public String getAddress() {
1086            return m_address;
1087        }
1088
1089        public long getReleaseTime() {
1090            return m_releaseTime;
1091        }
1092
1093        public long getAddedTime() {
1094            return m_addedTime;
1095        }
1096
1097        public Change getChange() {
1098            return m_change;
1099        }
1100
1101        public Host( String ipaddress, Change change ) {
1102            m_address = ipaddress;
1103            m_change  = change;
1104            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1105        }
1106        
1107    }
1108    
1109    private static class Change {
1110        
1111        public String m_change;
1112        public int    m_adds;
1113        public int    m_removals;
1114        
1115        public String toString() {
1116            return m_change;
1117        }
1118        
1119        public boolean equals( Object o ) {
1120            if( o instanceof Change ) {
1121                return m_change.equals( ( ( Change )o ).m_change );
1122            }
1123            return false;
1124        }
1125        
1126        public int hashCode() {
1127            return m_change.hashCode() + 17;
1128        }
1129        
1130    }
1131
1132}