001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import java.io.BufferedReader;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.StringReader;
026import java.io.StringWriter;
027import java.util.ArrayList;
028import java.util.Collection;
029import java.util.Date;
030import java.util.Iterator;
031import java.util.Properties;
032import java.util.Random;
033import java.util.StringTokenizer;
034import java.util.Vector;
035
036import javax.servlet.http.HttpServletRequest;
037import javax.servlet.http.HttpServletResponse;
038import javax.servlet.jsp.PageContext;
039
040import org.apache.commons.lang.time.StopWatch;
041import org.apache.log4j.Logger;
042import org.apache.oro.text.regex.MalformedPatternException;
043import org.apache.oro.text.regex.MatchResult;
044import org.apache.oro.text.regex.Pattern;
045import org.apache.oro.text.regex.PatternCompiler;
046import org.apache.oro.text.regex.PatternMatcher;
047import org.apache.oro.text.regex.Perl5Compiler;
048import org.apache.oro.text.regex.Perl5Matcher;
049import org.apache.wiki.InternalWikiException;
050import org.apache.wiki.WikiContext;
051import org.apache.wiki.WikiEngine;
052import org.apache.wiki.WikiPage;
053import org.apache.wiki.WikiProvider;
054import org.apache.wiki.api.exceptions.ProviderException;
055import org.apache.wiki.api.exceptions.RedirectException;
056import org.apache.wiki.api.filters.BasicPageFilter;
057import org.apache.wiki.attachment.Attachment;
058import org.apache.wiki.auth.user.UserProfile;
059import org.apache.wiki.ui.EditorManager;
060import org.apache.wiki.util.FileUtil;
061import org.apache.wiki.util.HttpUtil;
062import org.apache.wiki.util.TextUtil;
063import org.suigeneris.jrcs.diff.Diff;
064import org.suigeneris.jrcs.diff.DifferentiationFailedException;
065import org.suigeneris.jrcs.diff.Revision;
066import org.suigeneris.jrcs.diff.delta.AddDelta;
067import org.suigeneris.jrcs.diff.delta.ChangeDelta;
068import org.suigeneris.jrcs.diff.delta.DeleteDelta;
069import org.suigeneris.jrcs.diff.delta.Delta;
070import org.suigeneris.jrcs.diff.myers.MyersDiff;
071
072import net.sf.akismet.Akismet;
073
074
075/**
076 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
077 *
078 *  Parameters:
079 *  <ul>
080 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
081 *     that page.  Default is "SpamFilterWordList".
082 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
083 *     that page.  Default is "SpamFilterIPList".
084 *    <li>maxpagenamelength - Maximum page name length. Default is 100.
085 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
086 *        "SpamFilterWordList/blacklist.txt"</li>
087 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".
088 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
089 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
090 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
091 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
092 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
093 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
094 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
095 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
096 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
097 *        and calculates a score for the spam, which is then compared to a filter level value.
098 *  </ul>
099 *
100 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
101 *  with the editor system.</p>
102 *  
103 *  <p>Changes by admin users are ignored in any case.</p>
104 *
105 *  @since 2.1.112
106 */
107public class SpamFilter extends BasicPageFilter {
108    
109    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";
110    private static final String REASON_REGEXP = "Regexp";
111    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
112    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
113    private static final String REASON_BOT_TRAP = "BotTrap";
114    private static final String REASON_AKISMET = "Akismet";
115    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
116    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
117    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
118    private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong";
119    private static final String REASON_UTF8_TRAP = "UTF8Trap";
120
121    private static final String LISTVAR = "spamwords";
122    private static final String LISTIPVAR = "ips";
123
124    /** The filter property name for specifying the page which contains the list of spamwords.
125     *  Value is <tt>{@value}</tt>. */
126    public static final String  PROP_WORDLIST              = "wordlist";
127
128    /** The filter property name for specifying the page which contains the list of IPs to ban.
129     *  Value is <tt>{@value}</tt>. */
130    public static final String  PROP_IPLIST                = "IPlist";
131
132    /** The filter property name for specifying the maximum page name length.
133     *  Value is <tt>{@value}</tt>. */
134    public static final String  PROP_MAX_PAGENAME_LENGTH   = "maxpagenamelength";
135
136    /** The filter property name for the page to which you are directed if Herb rejects your
137     *  edit.  Value is <tt>{@value}</tt>. */
138    public static final String  PROP_ERRORPAGE             = "errorpage";
139    
140    /** The filter property name for specifying how many changes is any given IP address
141     *  allowed to do per minute.  Value is <tt>{@value}</tt>.
142     */
143    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
144    
145    /** The filter property name for specifying how many similar changes are allowed
146     *  before a host is banned.  Value is <tt>{@value}</tt>.
147     */
148    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
149    
150    /** The filter property name for specifying how long a host is banned.  Value is <tt>{@value}</tt>.*/
151    public static final String  PROP_BANTIME               = "bantime";
152    
153    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
154    public static final String  PROP_BLACKLIST             = "blacklist";
155    
156    /** The filter property name for specifying how many URLs can any given edit contain.  
157     *  Value is <tt>{@value}</tt> */
158    public static final String  PROP_MAXURLS               = "maxurls";
159    
160    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
161    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
162    
163    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
164    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";
165    
166    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
167    public static final String  PROP_CAPTCHA               = "captcha";
168    
169    /** The filter property name for specifying which filter strategy should be used.  Value is <tt>{@value}</tt>. */
170    public static final String  PROP_FILTERSTRATEGY        = "strategy";
171
172    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
173    public static final String  STRATEGY_EAGER             = "eager";
174    
175    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
176    public static final String  STRATEGY_SCORE             = "score";
177
178    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
179
180    private String          m_forbiddenWordsPage = "SpamFilterWordList";
181    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
182    private String          m_pageNameMaxLength  = "100";
183    private String          m_errorPage          = "RejectedMessage";
184    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";
185
186    private PatternMatcher  m_matcher = new Perl5Matcher();
187    private PatternCompiler m_compiler = new Perl5Compiler();
188
189    private Collection<Pattern> m_spamPatterns = null;
190    private Collection<Pattern> m_IPPatterns = null;
191
192    private Date            m_lastRebuild = new Date( 0L );
193
194    private static  Logger  c_spamlog = Logger.getLogger( "SpamLog" );
195    private static  Logger  log = Logger.getLogger( SpamFilter.class );
196
197
198    private Vector<Host>    m_temporaryBanList = new Vector<Host>();
199
200    private int             m_banTime = 60; // minutes
201
202    private Vector<Host>    m_lastModifications = new Vector<Host>();
203
204    /**
205     *  How many times a single IP address can change a page per minute?
206     */
207    private int             m_limitSinglePageChanges = 5;
208
209    /**
210     *  How many times can you add the exact same string to a page?
211     */
212    private int             m_limitSimilarChanges = 2;
213
214    /**
215     *  How many URLs can be added at maximum.
216     */
217    private int             m_maxUrls = 10;
218
219    private Pattern         m_urlPattern;
220    private Akismet         m_akismet;
221
222    private String          m_akismetAPIKey = null;
223
224    private boolean         m_useCaptcha = false;
225
226    /** The limit at which we consider something to be spam. */
227    private int             m_scoreLimit = 1;
228
229    /**
230     * If set to true, will ignore anyone who is in Authenticated role.
231     */
232    private boolean         m_ignoreAuthenticated = false;
233
234    private boolean         m_stopAtFirstMatch = true;
235
236    private static String   c_hashName;
237    private static long     c_lastUpdate;
238
239    /** The HASH_DELAY value is a maximum amount of time that an user can keep
240     *  a session open, because after the value has expired, we will invent a new
241     *  hash field name.  By default this is {@value} hours, which should be ample
242     *  time for someone.
243     */
244    private static final long HASH_DELAY = 24;
245
246
247    /**
248     *  {@inheritDoc}
249     */
250    @Override
251    public void initialize( WikiEngine engine, Properties properties ) {
252        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
253        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
254        m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength);
255        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
256        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties,
257                                                                PROP_PAGECHANGES,
258                                                                m_limitSinglePageChanges );
259        
260        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties,
261                                                             PROP_SIMILARCHANGES,
262                                                             m_limitSimilarChanges );
263
264        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
265        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
266        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
267
268        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties,
269                                                             PROP_IGNORE_AUTHENTICATED,
270                                                             m_ignoreAuthenticated );
271
272        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
273
274        try {
275            m_urlPattern = m_compiler.compile( URL_REGEXP );
276        } catch( MalformedPatternException e ) {
277            log.fatal( "Internal error: Someone put in a faulty pattern.", e );
278            throw new InternalWikiException( "Faulty pattern." , e);
279        }
280
281        m_akismetAPIKey = TextUtil.getStringProperty( properties,
282                                                      PROP_AKISMET_API_KEY,
283                                                      m_akismetAPIKey );
284
285        m_stopAtFirstMatch = TextUtil.getStringProperty( properties,
286                                                         PROP_FILTERSTRATEGY,
287                                                         STRATEGY_EAGER ).equals( STRATEGY_EAGER );
288
289        log.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
290                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
291
292
293    }
294
295    private static final int REJECT = 0;
296    private static final int ACCEPT = 1;
297    private static final int NOTE   = 2;
298
299    private static String log( WikiContext ctx, int type, String source, String message ) {
300        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
301        message = TextUtil.replaceString( message, "\"", "\\\"" );
302
303        String uid = getUniqueID();
304
305        String page   = ctx.getPage().getName();
306        String reason = "UNKNOWN";
307        String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
308
309        switch( type ) {
310            case REJECT:
311                reason = "REJECTED";
312                break;
313            case ACCEPT:
314                reason = "ACCEPTED";
315                break;
316            case NOTE:
317                reason = "NOTE";
318                break;
319            default:
320                throw new InternalWikiException( "Illegal type " + type );
321        }
322        c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
323
324        return uid;
325    }
326
327    /** {@inheritDoc} */
328    public String preSave( WikiContext context, String content ) throws RedirectException {
329        cleanBanList();
330        refreshBlacklists( context );
331        Change change = getChange( context, content );
332
333        if( !ignoreThisUser( context ) ) {
334            checkBanList( context, change );
335            checkSinglePageChange( context, content, change );
336            checkIPList( context );
337            checkPatternList( context, content, change );
338            checkPageName( context, content, change);
339        }
340
341        if( !m_stopAtFirstMatch ) {
342            Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
343
344            if( score != null && score.intValue() >= m_scoreLimit ) {
345                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
346            }
347        }
348
349        log( context, ACCEPT, "-", change.toString() );
350        return content;
351    }
352
353    private void checkPageName(WikiContext context, String content, Change change) throws RedirectException {
354        WikiPage page = context.getPage();
355        String pageName = page.getName();
356        int maxlength = Integer.valueOf(m_pageNameMaxLength);
357        if ( pageName.length() > maxlength) {
358            //
359            //  Spam filter has a match.
360            //
361
362            String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName);
363
364            log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")");
365            checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" );
366
367        }
368    }
369
370    private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException {
371        if( m_stopAtFirstMatch ) {
372            throw new RedirectException( message, getRedirectPage( context ) );
373        }
374
375        Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
376        if( score != null ) {
377            score = score + 1;
378        } else {
379            score = 1;
380        }
381
382        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
383    }
384    
385    /**
386     *  Parses a list of patterns and returns a Collection of compiled Pattern
387     *  objects.
388     *
389     * @param source
390     * @param list
391     * @return A Collection of the Patterns that were found from the lists.
392     */
393    private Collection< Pattern > parseWordList( WikiPage source, String list ) {
394        ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
395
396        if( list != null ) {
397            StringTokenizer tok = new StringTokenizer( list, " \t\n" );
398
399            while( tok.hasMoreTokens() ) {
400                String pattern = tok.nextToken();
401
402                try {
403                    compiledpatterns.add( m_compiler.compile( pattern ) );
404                } catch( MalformedPatternException e ) {
405                    log.debug( "Malformed spam filter pattern " + pattern );
406                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
407                }
408            }
409        }
410
411        return compiledpatterns;
412    }
413
414    /**
415     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
416     *
417     *  @param list
418     *  @return The parsed blacklist patterns.
419     */
420    private Collection< Pattern > parseBlacklist( String list ) {
421        ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
422
423        if( list != null ) {
424            try {
425                BufferedReader in = new BufferedReader( new StringReader(list) );
426                String line;
427                while( (line = in.readLine() ) != null ) {
428                    line = line.trim();
429                    if( line.length() == 0 ) continue; // Empty line
430                    if( line.startsWith("#") ) continue; // It's a comment
431
432                    int ws = line.indexOf( ' ' );
433                    if( ws == -1 ) ws = line.indexOf( '\t' );
434                    if( ws != -1 ) line = line.substring( 0, ws );
435
436                    try {
437                        compiledpatterns.add( m_compiler.compile( line ) );
438                    } catch( MalformedPatternException e ) {
439                        log.debug( "Malformed spam filter pattern " + line );
440                    }
441                }
442            } catch( IOException e ) {
443                log.info( "Could not read patterns; returning what I got" , e );
444            }
445        }
446
447        return compiledpatterns;
448    }
449
450    /**
451     *  Takes a single page change and performs a load of tests on the content change.
452     *  An admin can modify anything.
453     *
454     *  @param context
455     *  @param content
456     *  @throws RedirectException
457     */
458    private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 
459            throws RedirectException {
460        HttpServletRequest req = context.getHttpRequest();
461
462        if( req != null ) {
463            String addr = HttpUtil.getRemoteAddress( req );
464            int hostCounter = 0;
465            int changeCounter = 0;
466
467            log.debug( "Change is " + change.m_change );
468
469            long time = System.currentTimeMillis() - 60*1000L; // 1 minute
470
471            for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
472                Host host = i.next();
473
474                //
475                //  Check if this item is invalid
476                //
477                if( host.getAddedTime() < time ) {
478                    log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
479                    i.remove();
480                    continue;
481                }
482
483                //
484                // Check if this IP address has been seen before
485                //
486
487                if( host.getAddress().equals( addr ) ) {
488                    hostCounter++;
489                }
490
491                //
492                //  Check, if this change has been seen before
493                //
494
495                if( host.getChange() != null && host.getChange().equals( change ) ) {
496                    changeCounter++;
497                }
498            }
499
500            //
501            //  Now, let's check against the limits.
502            //
503            if( hostCounter >= m_limitSinglePageChanges ) {
504                Host host = new Host( addr, null );
505                m_temporaryBanList.add( host );
506
507                String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
508                log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
509                checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
510            }
511
512            if( changeCounter >= m_limitSimilarChanges ) {
513                Host host = new Host( addr, null );
514                m_temporaryBanList.add( host );
515
516                String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
517                log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
518                checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
519            }
520
521            //
522            //  Calculate the number of links in the addition.
523            //
524            String tstChange  = change.toString();
525            int    urlCounter = 0;
526            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
527                MatchResult m = m_matcher.getMatch();
528                tstChange = tstChange.substring( m.endOffset(0) );
529                urlCounter++;
530            }
531
532            if( urlCounter > m_maxUrls ) {
533                Host host = new Host( addr, null );
534                m_temporaryBanList.add( host );
535
536                String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
537                log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
538                checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
539            }
540
541            //
542            //  Check bot trap
543            //
544            checkBotTrap( context, change );
545
546            //
547            //  Check UTF-8 mangling
548            //
549            checkUTF8( context, change );
550
551            //
552            //  Do Akismet check.  This is good to be the last, because this is the most
553            //  expensive operation.
554            //
555            checkAkismet( context, change );
556
557            m_lastModifications.add( new Host( addr, change ) );
558        }
559    }
560
561
562    /**
563     *  Checks against the akismet system.
564     *
565     * @param context
566     * @param change
567     * @throws RedirectException
568     */
569    private void checkAkismet( WikiContext context, Change change ) throws RedirectException {
570        if( m_akismetAPIKey != null ) {
571            if( m_akismet == null ) {
572                log.info( "Initializing Akismet spam protection." );
573                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
574
575                if( !m_akismet.verifyAPIKey() ) {
576                    log.error( "Akismet API key cannot be verified.  Please check your config." );
577                    m_akismetAPIKey = null;
578                    m_akismet = null;
579                }
580            }
581
582            HttpServletRequest req = context.getHttpRequest();
583
584            //
585            //  Akismet will mark all empty statements as spam, so we'll just
586            //  ignore them.
587            //
588            if( change.m_adds == 0 && change.m_removals > 0 ) {
589                return;
590            }
591            
592            if( req != null && m_akismet != null ) {
593                log.debug( "Calling Akismet to check for spam..." );
594
595                StopWatch sw = new StopWatch();
596                sw.start();
597
598                String ipAddress     = HttpUtil.getRemoteAddress( req );
599                String userAgent     = req.getHeader( "User-Agent" );
600                String referrer      = req.getHeader( "Referer");
601                String permalink     = context.getViewURL( context.getPage().getName() );
602                String commentType   = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit";
603                String commentAuthor = context.getCurrentUser().getName();
604                String commentAuthorEmail = null;
605                String commentAuthorURL   = null;
606
607                boolean isSpam = m_akismet.commentCheck( ipAddress,
608                                                         userAgent,
609                                                         referrer,
610                                                         permalink,
611                                                         commentType,
612                                                         commentAuthor,
613                                                         commentAuthorEmail,
614                                                         commentAuthorURL,
615                                                         change.toString(),
616                                                         null );
617
618                sw.stop();
619                log.debug( "Akismet request done in: " + sw );
620
621                if( isSpam ) {
622                    // Host host = new Host( ipAddress, null );
623                    // m_temporaryBanList.add( host );
624
625                    String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
626                    log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
627                    checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
628                }
629            }
630        }
631    }
632
633    /**
634     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
635     *
636     * @return A string
637     */
638    public static String getBotFieldName() {
639        return "submit_auth";
640    }
641
642    /**
643     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
644     *
645     * @param context
646     * @param change
647     * @throws RedirectException
648     */
649    private void checkBotTrap( WikiContext context, Change change ) throws RedirectException {
650        HttpServletRequest request = context.getHttpRequest();
651
652        if( request != null ) {
653            String unspam = request.getParameter( getBotFieldName() );
654            if( unspam != null && unspam.length() > 0 ) {
655                String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
656
657                log.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
658                checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
659            }
660        }
661    }
662
663    private void checkUTF8( WikiContext context, Change change ) throws RedirectException {
664        HttpServletRequest request = context.getHttpRequest();
665
666        if( request != null ) {
667            String utf8field = request.getParameter( "encodingcheck" );
668
669            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
670                String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
671
672                log.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
673                checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
674            }
675        }
676    }
677
678    /** Goes through the ban list and cleans away any host which has expired from it. */
679    private synchronized void cleanBanList() {
680        long now = System.currentTimeMillis();
681
682        for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
683            Host host = i.next();
684
685            if( host.getReleaseTime() < now ) {
686                log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
687                i.remove();
688            }
689        }
690    }
691
692    /**
693     *  Checks the ban list if the IP address of the changer is already on it.
694     *
695     *  @param context
696     *  @throws RedirectException
697     */
698    private void checkBanList( WikiContext context, Change change ) throws RedirectException {
699        HttpServletRequest req = context.getHttpRequest();
700
701        if( req != null ) {
702            String remote = HttpUtil.getRemoteAddress(req);
703            long now = System.currentTimeMillis();
704
705            for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
706                Host host = i.next();
707
708                if( host.getAddress().equals( remote ) ) {
709                    long timeleft = ( host.getReleaseTime() - now ) / 1000L;
710
711                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
712                    checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
713                }
714            }
715        }
716    }
717
718    /**
719     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
720     *
721     *  @param context
722     */
723    private void refreshBlacklists( WikiContext context ) {
724        try {
725
726            boolean rebuild = false;
727
728            //
729            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
730            //
731            WikiPage sourceSpam = context.getEngine().getPage( m_forbiddenWordsPage );
732            if( sourceSpam != null ) {
733                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
734                    rebuild = true;
735                }
736            }
737
738            Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist );
739            if( att != null ) {
740                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
741                    rebuild = true;
742                }
743            }
744
745            WikiPage sourceIPs = context.getEngine().getPage( m_forbiddenIPsPage );
746            if( sourceIPs != null ) {
747                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
748                    rebuild = true;
749                }
750            }
751
752            //
753            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete
754            //  filter list regardless of what changed.
755            //
756            if( rebuild ) {
757                m_lastRebuild = new Date();
758                m_spamPatterns = parseWordList( sourceSpam,
759                                                ( sourceSpam != null ) ? ( String )sourceSpam.getAttribute( LISTVAR ) : null );
760
761                log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
762
763                m_IPPatterns = parseWordList( sourceIPs,
764                        ( sourceIPs != null ) ? ( String )sourceIPs.getAttribute( LISTIPVAR ) : null );
765                log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
766
767                if( att != null ) {
768                    InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att);
769                    StringWriter out = new StringWriter();
770                    FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out );
771                    Collection< Pattern > blackList = parseBlacklist( out.toString() );
772                    log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
773                    m_spamPatterns.addAll( blackList );
774                }
775            }
776        } catch( IOException ex ) {
777            log.info( "Unable to read attachment data, continuing...", ex );
778        } catch( ProviderException ex ) {
779            log.info( "Failed to read spam filter attachment, continuing...", ex );
780        }
781    }
782
783    /**
784     *  Does a check against a known pattern list.
785     *
786     *  @param context
787     *  @param content
788     *  @param change
789     *  @throws RedirectException
790     */
791    private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException {
792        //
793        //  If we have no spam patterns defined, or we're trying to save
794        //  the page containing the patterns, just return.
795        //
796        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
797            return;
798        }
799
800        String ch = change.toString();
801        if( context.getHttpRequest() != null ) {
802            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
803        }
804
805        for( Pattern p : m_spamPatterns ) {
806            // log.debug("Attempting to match page contents with "+p.getPattern());
807
808            if( m_matcher.contains( ch, p ) ) {
809                //
810                //  Spam filter has a match.
811                //
812                String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
813
814                log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
815                checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
816            }
817        }
818    }
819
820
821    /**
822     *  Does a check against a pattern list of IPs.
823     *
824     *  @param context
825     *  @throws RedirectException
826     */
827    private void checkIPList( WikiContext context ) throws RedirectException {
828        //
829        //  If we have no IP patterns defined, or we're trying to save
830        //  the page containing the IP patterns, just return.
831        //
832        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
833            return;
834        }
835
836        String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
837        log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
838
839        for( Pattern p : m_IPPatterns ) {
840             log.debug("Attempting to match remoteIP with " + p.getPattern());
841
842            if( m_matcher.contains( remoteIP, p ) ) {
843
844                //  IP filter has a match.
845                //
846                String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
847
848                log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
849                checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
850            }
851        }
852    }
853
854    private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException {
855        Change c = new Change();
856        c.m_change = change;
857        checkPatternList( context, content, c );
858    }
859 
860    /**
861     *  Creates a simple text string describing the added content.
862     *
863     *  @param context
864     *  @param newText
865     *  @return Empty string, if there is no change.
866     */
867    private static Change getChange( WikiContext context, String newText ) {
868        WikiPage page = context.getPage();
869        StringBuffer change = new StringBuffer();
870        WikiEngine engine = context.getEngine();
871        // Get current page version
872
873        Change ch = new Change();
874        
875        try {
876            String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION );
877
878            String[] first  = Diff.stringToArray( oldText );
879            String[] second = Diff.stringToArray( newText );
880            Revision rev = Diff.diff( first, second, new MyersDiff() );
881
882            if( rev == null || rev.size() == 0 ) {
883                return ch;
884            }
885            
886            for( int i = 0; i < rev.size(); i++ ) {
887                Delta d = rev.getDelta( i );
888
889                if( d instanceof AddDelta ) {
890                    d.getRevised().toString( change, "", "\r\n" );
891                    ch.m_adds++;
892                    
893                } else if( d instanceof ChangeDelta ) {
894                    d.getRevised().toString( change, "", "\r\n" );
895                    ch.m_adds++;
896                    
897                } else if( d instanceof DeleteDelta ) {
898                    ch.m_removals++;
899                }
900            }
901        } catch( DifferentiationFailedException e ) {
902            log.error( "Diff failed", e );
903        }
904
905        //
906        //  Don't forget to include the change note, too
907        //
908        String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE );
909
910        if( changeNote != null ) {
911            change.append( "\r\n" );
912            change.append( changeNote );
913        }
914
915        //
916        //  And author as well
917        //
918        if( page.getAuthor() != null ) {
919            change.append( "\r\n" + page.getAuthor() );
920        }
921
922        ch.m_change = change.toString();
923        return ch;
924    }
925
926    /**
927     *  Returns true, if this user should be ignored.  For example, admin users.
928     *
929     * @param context
930     * @return True, if this users should be ignored.
931     */
932    private boolean ignoreThisUser( WikiContext context ) {
933        if( context.hasAdminPermissions() ) {
934            return true;
935        }
936
937        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
938            return true;
939        }
940
941        if( context.getVariable( "captcha" ) != null ) {
942            return true;
943        }
944
945        return false;
946    }
947
948    /**
949     *  Returns a random string of six uppercase characters.
950     *
951     *  @return A random string
952     */
953    private static String getUniqueID() {
954        StringBuilder sb = new StringBuilder();
955        Random rand = new Random();
956
957        for( int i = 0; i < 6; i++ ) {
958            char x = ( char )( 'A' + rand.nextInt( 26 ) );
959            sb.append( x );
960        }
961
962        return sb.toString();
963    }
964
965    /**
966     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
967     *
968     *  @param ctx WikiContext
969     *  @return An URL to redirect to
970     */
971    private String getRedirectPage( WikiContext ctx ) {
972        if( m_useCaptcha ) {
973            return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) );
974        }
975
976        return ctx.getURL( WikiContext.VIEW, m_errorPage );
977    }
978
979    /**
980     *  Checks whether the UserProfile matches certain checks.
981     *
982     *  @param profile The profile to check
983     *  @param context The WikiContext
984     *  @return False, if this userprofile is suspect and should not be allowed to be added.
985     *  @since 2.6.1
986     */
987    public boolean isValidUserProfile( WikiContext context, UserProfile profile ) {
988        try {
989            checkPatternList( context, profile.getEmail(), profile.getEmail() );
990            checkPatternList( context, profile.getFullname(), profile.getFullname() );
991            checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
992        } catch( RedirectException e ) {
993            log.info("Detected attempt to create a spammer user account (see above for rejection reason)");
994            return false;
995        }
996
997        return true;
998    }
999
1000    /**
1001     *  This method is used to calculate an unique code when submitting the page to detect edit conflicts.  
1002     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
1003     *
1004     *  @param page The WikiPage under edit
1005     *  @param request The HTTP Request
1006     *  @since 2.6
1007     *  @return A hash value for this page and session
1008     */
1009    public static final String getSpamHash( WikiPage page, HttpServletRequest request ) {
1010        long lastModified = 0;
1011
1012        if( page.getLastModified() != null ) {
1013            lastModified = page.getLastModified().getTime();
1014        }
1015        long remote = HttpUtil.getRemoteAddress( request ).hashCode();
1016
1017        return Long.toString( lastModified ^ remote );
1018    }
1019
1020    /**
1021     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
1022     *  the session has expired, you cannot edit anymore.
1023     *
1024     *  @param request The page request
1025     *  @return The name to be used in the hash field
1026     *  @since  2.6
1027     */
1028    public static final String getHashFieldName( HttpServletRequest request ) {
1029        String hash = null;
1030
1031        if( request.getSession() != null ) {
1032            hash = ( String )request.getSession().getAttribute( "_hash" );
1033
1034            if( hash == null ) {
1035                hash = c_hashName;
1036                request.getSession().setAttribute( "_hash", hash );
1037            }
1038        }
1039
1040        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
1041            c_hashName = getUniqueID().toLowerCase();
1042            c_lastUpdate = System.currentTimeMillis();
1043        }
1044
1045        return hash != null ? hash : c_hashName;
1046    }
1047
1048
1049    /**
1050     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
1051     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
1052     *  and their session has expired.
1053     *  <p>
1054     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
1055     *  the spam log (it may or may not be spam, but it's rather likely that it is).
1056     *
1057     *  @param context The WikiContext
1058     *  @param pageContext The JSP PageContext.
1059     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
1060     *  @throws IOException If redirection fails
1061     *  @since 2.6
1062     */
1063    public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException {
1064        String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
1065
1066        if( pageContext.getRequest().getParameter(hashName) == null ) {
1067            if( pageContext.getAttribute( hashName ) == null ) {
1068                Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
1069                log( context, REJECT, "MissingHash", change.m_change );
1070
1071                String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" );
1072                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
1073                return false;
1074            }
1075        }
1076
1077        return true;
1078    }
1079
1080    /**
1081     * This helper method adds all the input fields to your editor that the SpamFilter requires
1082     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1083     *  
1084     * @param pageContext The PageContext
1085     * @return A HTML string which contains input fields for the SpamFilter.
1086     */
1087    public static final String insertInputFields( PageContext pageContext ) {
1088        WikiContext ctx = WikiContext.findContext( pageContext );
1089        WikiEngine engine = ctx.getEngine();
1090
1091        StringBuilder sb = new StringBuilder();
1092        if( engine.getContentEncoding().equals( "UTF-8" ) ) {
1093            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1094        }
1095
1096        return sb.toString();
1097    }
1098    
1099    /**
1100     *  A local class for storing host information.
1101     *
1102     *  @since
1103     */
1104    private class Host {
1105        
1106        private long   m_addedTime = System.currentTimeMillis();
1107        private long   m_releaseTime;
1108        private String m_address;
1109        private Change m_change;
1110
1111        public String getAddress() {
1112            return m_address;
1113        }
1114
1115        public long getReleaseTime() {
1116            return m_releaseTime;
1117        }
1118
1119        public long getAddedTime() {
1120            return m_addedTime;
1121        }
1122
1123        public Change getChange() {
1124            return m_change;
1125        }
1126
1127        public Host( String ipaddress, Change change ) {
1128            m_address = ipaddress;
1129            m_change  = change;
1130            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1131        }
1132        
1133    }
1134    
1135    private static class Change {
1136        
1137        public String m_change;
1138        public int    m_adds;
1139        public int    m_removals;
1140        
1141        public String toString() {
1142            return m_change;
1143        }
1144        
1145        public boolean equals( Object o ) {
1146            if( o instanceof Change ) {
1147                return m_change.equals( ( ( Change )o ).m_change );
1148            }
1149            return false;
1150        }
1151        
1152        public int hashCode() {
1153            return m_change.hashCode() + 17;
1154        }
1155        
1156    }
1157
1158}