001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import net.sf.akismet.Akismet;
022import org.apache.commons.lang3.time.StopWatch;
023import org.apache.log4j.Logger;
024import org.apache.oro.text.regex.MalformedPatternException;
025import org.apache.oro.text.regex.MatchResult;
026import org.apache.oro.text.regex.Pattern;
027import org.apache.oro.text.regex.PatternCompiler;
028import org.apache.oro.text.regex.PatternMatcher;
029import org.apache.oro.text.regex.Perl5Compiler;
030import org.apache.oro.text.regex.Perl5Matcher;
031import org.apache.wiki.InternalWikiException;
032import org.apache.wiki.WikiContext;
033import org.apache.wiki.WikiEngine;
034import org.apache.wiki.WikiPage;
035import org.apache.wiki.WikiProvider;
036import org.apache.wiki.api.exceptions.ProviderException;
037import org.apache.wiki.api.exceptions.RedirectException;
038import org.apache.wiki.api.filters.BasicPageFilter;
039import org.apache.wiki.attachment.Attachment;
040import org.apache.wiki.auth.user.UserProfile;
041import org.apache.wiki.ui.EditorManager;
042import org.apache.wiki.util.FileUtil;
043import org.apache.wiki.util.HttpUtil;
044import org.apache.wiki.util.TextUtil;
045import org.suigeneris.jrcs.diff.Diff;
046import org.suigeneris.jrcs.diff.DifferentiationFailedException;
047import org.suigeneris.jrcs.diff.Revision;
048import org.suigeneris.jrcs.diff.delta.AddDelta;
049import org.suigeneris.jrcs.diff.delta.ChangeDelta;
050import org.suigeneris.jrcs.diff.delta.DeleteDelta;
051import org.suigeneris.jrcs.diff.delta.Delta;
052import org.suigeneris.jrcs.diff.myers.MyersDiff;
053
054import javax.servlet.http.HttpServletRequest;
055import javax.servlet.http.HttpServletResponse;
056import javax.servlet.jsp.PageContext;
057import java.io.BufferedReader;
058import java.io.IOException;
059import java.io.InputStream;
060import java.io.InputStreamReader;
061import java.io.StringReader;
062import java.io.StringWriter;
063import java.nio.charset.StandardCharsets;
064import java.util.ArrayList;
065import java.util.Collection;
066import java.util.Date;
067import java.util.Iterator;
068import java.util.Properties;
069import java.util.Random;
070import java.util.StringTokenizer;
071import java.util.Vector;
072
073
074/**
075 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
076 *
077 *  Parameters:
078 *  <ul>
079 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
080 *     that page.  Default is "SpamFilterWordList".
081 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
082 *     that page.  Default is "SpamFilterIPList".
083 *    <li>maxpagenamelength - Maximum page name length. Default is 100.
084 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
085 *        "SpamFilterWordList/blacklist.txt"</li>
086 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".
087 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
088 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
089 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
091 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
092 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
093 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
094 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
095 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
096 *        and calculates a score for the spam, which is then compared to a filter level value.
097 *  </ul>
098 *
099 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
100 *  with the editor system.</p>
101 *  
102 *  <p>Changes by admin users are ignored in any case.</p>
103 *
104 *  @since 2.1.112
105 */
106public class SpamFilter extends BasicPageFilter {
107    
    /** Name of the WikiContext variable that carries the accumulated spam score when not stopping at the first match. */
    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";

    // Reason codes handed to log() and checkStrategy() to identify which test rejected a change.
    private static final String REASON_REGEXP = "Regexp";
    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
    private static final String REASON_BOT_TRAP = "BotTrap";
    private static final String REASON_AKISMET = "Akismet";
    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
    private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong";
    private static final String REASON_UTF8_TRAP = "UTF8Trap";

    // Page variable names: per the class documentation, set with [{SET spamwords='...'}] on the
    // word-list page and [{SET ips='...'}] on the IP-list page.
    private static final String LISTVAR = "spamwords";
    private static final String LISTIPVAR = "ips";

    /** The filter property name for specifying the page which contains the list of spamwords.
     *  Value is <tt>{@value}</tt>. */
    public static final String  PROP_WORDLIST              = "wordlist";

    /** The filter property name for specifying the page which contains the list of IPs to ban.
     *  Value is <tt>{@value}</tt>. */
    public static final String  PROP_IPLIST                = "IPlist";

    /** The filter property name for specifying the maximum page name length.
     *  Value is <tt>{@value}</tt>. */
    public static final String  PROP_MAX_PAGENAME_LENGTH   = "maxpagenamelength";

    /** The filter property name for the page to which you are directed if Herb rejects your
     *  edit.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_ERRORPAGE             = "errorpage";
    
    /** The filter property name for specifying how many changes any given IP address
     *  is allowed to make per minute.  Value is <tt>{@value}</tt>.
     */
    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
    
    /** The filter property name for specifying how many similar changes are allowed
     *  before a host is banned.  Value is <tt>{@value}</tt>.
     */
    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
    
    /** The filter property name for specifying how long a host is banned.  Value is <tt>{@value}</tt>.*/
    public static final String  PROP_BANTIME               = "bantime";
    
    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
    public static final String  PROP_BLACKLIST             = "blacklist";
    
    /** The filter property name for specifying how many URLs any given edit may contain.
     *  Value is <tt>{@value}</tt> */
    public static final String  PROP_MAXURLS               = "maxurls";
    
    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
    
    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";
    
    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
    public static final String  PROP_CAPTCHA               = "captcha";
    
    /** The filter property name for specifying which filter strategy should be used.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_FILTERSTRATEGY        = "strategy";

    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
    public static final String  STRATEGY_EAGER             = "eager";
    
    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
    public static final String  STRATEGY_SCORE             = "score";

    /** Regular expression compiled in initialize() and used to count the links (http, https, mailto) in a change. */
    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
178
    // Configurable names and limits; the initial values below double as the defaults
    // that initialize() falls back to when a property is absent.
    private String          m_forbiddenWordsPage = "SpamFilterWordList";
    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
    // Kept as a String; checkPageName() converts it to a number on each check.
    private String          m_pageNameMaxLength  = "100";
    private String          m_errorPage          = "RejectedMessage";
    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";

    // Jakarta ORO matcher/compiler used for the URL pattern and for the spam word patterns.
    private PatternMatcher  m_matcher = new Perl5Matcher();
    private PatternCompiler m_compiler = new Perl5Compiler();

    // Compiled spam word and IP patterns; null until the blacklists have been built.
    private Collection<Pattern> m_spamPatterns = null;
    private Collection<Pattern> m_IPPatterns = null;

    // Compared against the last-modified time of the blacklist sources to decide on a rebuild.
    private Date            m_lastRebuild = new Date( 0L );

    // "SpamLog" receives one line per accepted/rejected change; "log" is the regular class logger.
    private static  Logger  c_spamlog = Logger.getLogger( "SpamLog" );
    private static  Logger  log = Logger.getLogger( SpamFilter.class );


    // Hosts that are currently banned; expired entries are dropped by cleanBanList().
    private Vector<Host>    m_temporaryBanList = new Vector<Host>();

    private int             m_banTime = 60; // minutes

    // Recent modifications (host + change); entries older than one minute are dropped
    // by checkSinglePageChange().
    private Vector<Host>    m_lastModifications = new Vector<Host>();

    /**
     *  How many times a single IP address can change a page per minute?
     */
    private int             m_limitSinglePageChanges = 5;

    /**
     *  How many times can you add the exact same string to a page?
     */
    private int             m_limitSimilarChanges = 2;

    /**
     *  How many URLs can be added at maximum.
     */
    private int             m_maxUrls = 10;

    // Compiled form of URL_REGEXP, set in initialize().
    private Pattern         m_urlPattern;
    // Created lazily by checkAkismet() once an API key is configured.
    private Akismet         m_akismet;

    private String          m_akismetAPIKey = null;

    // Set in initialize(): true only when the captcha property equals "asirra".
    private boolean         m_useCaptcha = false;

    /** The limit at which we consider something to be spam. */
    private int             m_scoreLimit = 1;

    /**
     * If set to true, will ignore anyone who is in Authenticated role.
     */
    private boolean         m_ignoreAuthenticated = false;

    // True for the "eager" strategy: the first failed test redirects immediately.
    // False for scoring: failed tests accumulate points that preSave() compares to m_scoreLimit.
    private boolean         m_stopAtFirstMatch = true;

    // NOTE(review): presumably the current hash field name and the time it was last
    // regenerated (see HASH_DELAY); their use is outside this part of the file - confirm.
    private static String   c_hashName;
    private static long     c_lastUpdate;

    /** The HASH_DELAY value is a maximum amount of time that a user can keep
     *  a session open, because after the value has expired, we will invent a new
     *  hash field name.  By default this is {@value} hours, which should be ample
     *  time for someone.
     */
    private static final long HASH_DELAY = 24;
244
245
246    /**
247     *  {@inheritDoc}
248     */
249    @Override
250    public void initialize( WikiEngine engine, Properties properties ) {
251        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
252        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
253        m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength);
254        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
255        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties,
256                                                                PROP_PAGECHANGES,
257                                                                m_limitSinglePageChanges );
258        
259        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties,
260                                                             PROP_SIMILARCHANGES,
261                                                             m_limitSimilarChanges );
262
263        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
264        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
265        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
266
267        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties,
268                                                             PROP_IGNORE_AUTHENTICATED,
269                                                             m_ignoreAuthenticated );
270
271        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
272
273        try {
274            m_urlPattern = m_compiler.compile( URL_REGEXP );
275        } catch( MalformedPatternException e ) {
276            log.fatal( "Internal error: Someone put in a faulty pattern.", e );
277            throw new InternalWikiException( "Faulty pattern." , e);
278        }
279
280        m_akismetAPIKey = TextUtil.getStringProperty( properties,
281                                                      PROP_AKISMET_API_KEY,
282                                                      m_akismetAPIKey );
283
284        m_stopAtFirstMatch = TextUtil.getStringProperty( properties,
285                                                         PROP_FILTERSTRATEGY,
286                                                         STRATEGY_EAGER ).equals( STRATEGY_EAGER );
287
288        log.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
289                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
290
291
292    }
293
294    private static final int REJECT = 0;
295    private static final int ACCEPT = 1;
296    private static final int NOTE   = 2;
297
298    private static String log( WikiContext ctx, int type, String source, String message ) {
299        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
300        message = TextUtil.replaceString( message, "\"", "\\\"" );
301
302        String uid = getUniqueID();
303
304        String page   = ctx.getPage().getName();
305        String reason = "UNKNOWN";
306        String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
307
308        switch( type ) {
309            case REJECT:
310                reason = "REJECTED";
311                break;
312            case ACCEPT:
313                reason = "ACCEPTED";
314                break;
315            case NOTE:
316                reason = "NOTE";
317                break;
318            default:
319                throw new InternalWikiException( "Illegal type " + type );
320        }
321        c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
322
323        return uid;
324    }
325
326    /** {@inheritDoc} */
327    public String preSave( WikiContext context, String content ) throws RedirectException {
328        cleanBanList();
329        refreshBlacklists( context );
330        Change change = getChange( context, content );
331
332        if( !ignoreThisUser( context ) ) {
333            checkBanList( context, change );
334            checkSinglePageChange( context, content, change );
335            checkIPList( context );
336            checkPatternList( context, content, change );
337            checkPageName( context, content, change);
338        }
339
340        if( !m_stopAtFirstMatch ) {
341            Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
342
343            if( score != null && score.intValue() >= m_scoreLimit ) {
344                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
345            }
346        }
347
348        log( context, ACCEPT, "-", change.toString() );
349        return content;
350    }
351
352    private void checkPageName(WikiContext context, String content, Change change) throws RedirectException {
353        WikiPage page = context.getPage();
354        String pageName = page.getName();
355        int maxlength = Integer.valueOf(m_pageNameMaxLength);
356        if ( pageName.length() > maxlength) {
357            //
358            //  Spam filter has a match.
359            //
360
361            String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName);
362
363            log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")");
364            checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" );
365
366        }
367    }
368
369    private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException {
370        if( m_stopAtFirstMatch ) {
371            throw new RedirectException( message, getRedirectPage( context ) );
372        }
373
374        Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
375        if( score != null ) {
376            score = score + 1;
377        } else {
378            score = 1;
379        }
380
381        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
382    }
383    
384    /**
385     *  Parses a list of patterns and returns a Collection of compiled Pattern
386     *  objects.
387     *
388     * @param source
389     * @param list
390     * @return A Collection of the Patterns that were found from the lists.
391     */
392    private Collection< Pattern > parseWordList( WikiPage source, String list ) {
393        ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
394
395        if( list != null ) {
396            StringTokenizer tok = new StringTokenizer( list, " \t\n" );
397
398            while( tok.hasMoreTokens() ) {
399                String pattern = tok.nextToken();
400
401                try {
402                    compiledpatterns.add( m_compiler.compile( pattern ) );
403                } catch( MalformedPatternException e ) {
404                    log.debug( "Malformed spam filter pattern " + pattern );
405                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
406                }
407            }
408        }
409
410        return compiledpatterns;
411    }
412
413    /**
414     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
415     *
416     *  @param list
417     *  @return The parsed blacklist patterns.
418     */
419    private Collection< Pattern > parseBlacklist( String list ) {
420        ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
421
422        if( list != null ) {
423            try {
424                BufferedReader in = new BufferedReader( new StringReader(list) );
425                String line;
426                while( (line = in.readLine() ) != null ) {
427                    line = line.trim();
428                    if( line.length() == 0 ) continue; // Empty line
429                    if( line.startsWith("#") ) continue; // It's a comment
430
431                    int ws = line.indexOf( ' ' );
432                    if( ws == -1 ) ws = line.indexOf( '\t' );
433                    if( ws != -1 ) line = line.substring( 0, ws );
434
435                    try {
436                        compiledpatterns.add( m_compiler.compile( line ) );
437                    } catch( MalformedPatternException e ) {
438                        log.debug( "Malformed spam filter pattern " + line );
439                    }
440                }
441            } catch( IOException e ) {
442                log.info( "Could not read patterns; returning what I got" , e );
443            }
444        }
445
446        return compiledpatterns;
447    }
448
449    /**
450     *  Takes a single page change and performs a load of tests on the content change.
451     *  An admin can modify anything.
452     *
453     *  @param context
454     *  @param content
455     *  @throws RedirectException
456     */
457    private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 
458            throws RedirectException {
459        HttpServletRequest req = context.getHttpRequest();
460
461        if( req != null ) {
462            String addr = HttpUtil.getRemoteAddress( req );
463            int hostCounter = 0;
464            int changeCounter = 0;
465
466            log.debug( "Change is " + change.m_change );
467
468            long time = System.currentTimeMillis() - 60*1000L; // 1 minute
469
470            for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
471                Host host = i.next();
472
473                //
474                //  Check if this item is invalid
475                //
476                if( host.getAddedTime() < time ) {
477                    log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
478                    i.remove();
479                    continue;
480                }
481
482                //
483                // Check if this IP address has been seen before
484                //
485
486                if( host.getAddress().equals( addr ) ) {
487                    hostCounter++;
488                }
489
490                //
491                //  Check, if this change has been seen before
492                //
493
494                if( host.getChange() != null && host.getChange().equals( change ) ) {
495                    changeCounter++;
496                }
497            }
498
499            //
500            //  Now, let's check against the limits.
501            //
502            if( hostCounter >= m_limitSinglePageChanges ) {
503                Host host = new Host( addr, null );
504                m_temporaryBanList.add( host );
505
506                String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
507                log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
508                checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
509            }
510
511            if( changeCounter >= m_limitSimilarChanges ) {
512                Host host = new Host( addr, null );
513                m_temporaryBanList.add( host );
514
515                String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
516                log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
517                checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
518            }
519
520            //
521            //  Calculate the number of links in the addition.
522            //
523            String tstChange  = change.toString();
524            int    urlCounter = 0;
525            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
526                MatchResult m = m_matcher.getMatch();
527                tstChange = tstChange.substring( m.endOffset(0) );
528                urlCounter++;
529            }
530
531            if( urlCounter > m_maxUrls ) {
532                Host host = new Host( addr, null );
533                m_temporaryBanList.add( host );
534
535                String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
536                log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
537                checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
538            }
539
540            //
541            //  Check bot trap
542            //
543            checkBotTrap( context, change );
544
545            //
546            //  Check UTF-8 mangling
547            //
548            checkUTF8( context, change );
549
550            //
551            //  Do Akismet check.  This is good to be the last, because this is the most
552            //  expensive operation.
553            //
554            checkAkismet( context, change );
555
556            m_lastModifications.add( new Host( addr, change ) );
557        }
558    }
559
560
561    /**
562     *  Checks against the akismet system.
563     *
564     * @param context
565     * @param change
566     * @throws RedirectException
567     */
568    private void checkAkismet( WikiContext context, Change change ) throws RedirectException {
569        if( m_akismetAPIKey != null ) {
570            if( m_akismet == null ) {
571                log.info( "Initializing Akismet spam protection." );
572                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
573
574                if( !m_akismet.verifyAPIKey() ) {
575                    log.error( "Akismet API key cannot be verified.  Please check your config." );
576                    m_akismetAPIKey = null;
577                    m_akismet = null;
578                }
579            }
580
581            HttpServletRequest req = context.getHttpRequest();
582
583            //
584            //  Akismet will mark all empty statements as spam, so we'll just
585            //  ignore them.
586            //
587            if( change.m_adds == 0 && change.m_removals > 0 ) {
588                return;
589            }
590            
591            if( req != null && m_akismet != null ) {
592                log.debug( "Calling Akismet to check for spam..." );
593
594                StopWatch sw = new StopWatch();
595                sw.start();
596
597                String ipAddress     = HttpUtil.getRemoteAddress( req );
598                String userAgent     = req.getHeader( "User-Agent" );
599                String referrer      = req.getHeader( "Referer");
600                String permalink     = context.getViewURL( context.getPage().getName() );
601                String commentType   = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit";
602                String commentAuthor = context.getCurrentUser().getName();
603                String commentAuthorEmail = null;
604                String commentAuthorURL   = null;
605
606                boolean isSpam = m_akismet.commentCheck( ipAddress,
607                                                         userAgent,
608                                                         referrer,
609                                                         permalink,
610                                                         commentType,
611                                                         commentAuthor,
612                                                         commentAuthorEmail,
613                                                         commentAuthorURL,
614                                                         change.toString(),
615                                                         null );
616
617                sw.stop();
618                log.debug( "Akismet request done in: " + sw );
619
620                if( isSpam ) {
621                    // Host host = new Host( ipAddress, null );
622                    // m_temporaryBanList.add( host );
623
624                    String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
625                    log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
626                    checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
627                }
628            }
629        }
630    }
631
632    /**
633     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
634     *
635     * @return A string
636     */
637    public static String getBotFieldName() {
638        return "submit_auth";
639    }
640
641    /**
642     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
643     *
644     * @param context
645     * @param change
646     * @throws RedirectException
647     */
648    private void checkBotTrap( WikiContext context, Change change ) throws RedirectException {
649        HttpServletRequest request = context.getHttpRequest();
650
651        if( request != null ) {
652            String unspam = request.getParameter( getBotFieldName() );
653            if( unspam != null && unspam.length() > 0 ) {
654                String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
655
656                log.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
657                checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
658            }
659        }
660    }
661
662    private void checkUTF8( WikiContext context, Change change ) throws RedirectException {
663        HttpServletRequest request = context.getHttpRequest();
664
665        if( request != null ) {
666            String utf8field = request.getParameter( "encodingcheck" );
667
668            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
669                String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
670
671                log.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
672                checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
673            }
674        }
675    }
676
677    /** Goes through the ban list and cleans away any host which has expired from it. */
678    private synchronized void cleanBanList() {
679        long now = System.currentTimeMillis();
680
681        for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
682            Host host = i.next();
683
684            if( host.getReleaseTime() < now ) {
685                log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
686                i.remove();
687            }
688        }
689    }
690
691    /**
692     *  Checks the ban list if the IP address of the changer is already on it.
693     *
694     *  @param context
695     *  @throws RedirectException
696     */
697    private void checkBanList( WikiContext context, Change change ) throws RedirectException {
698        HttpServletRequest req = context.getHttpRequest();
699
700        if( req != null ) {
701            String remote = HttpUtil.getRemoteAddress(req);
702            long now = System.currentTimeMillis();
703
704            for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
705                Host host = i.next();
706
707                if( host.getAddress().equals( remote ) ) {
708                    long timeleft = ( host.getReleaseTime() - now ) / 1000L;
709
710                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
711                    checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
712                }
713            }
714        }
715    }
716
717    /**
718     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
719     *
720     *  @param context
721     */
722    private void refreshBlacklists( WikiContext context ) {
723        try {
724
725            boolean rebuild = false;
726
727            //
728            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
729            //
730            WikiPage sourceSpam = context.getEngine().getPage( m_forbiddenWordsPage );
731            if( sourceSpam != null ) {
732                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
733                    rebuild = true;
734                }
735            }
736
737            Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist );
738            if( att != null ) {
739                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
740                    rebuild = true;
741                }
742            }
743
744            WikiPage sourceIPs = context.getEngine().getPage( m_forbiddenIPsPage );
745            if( sourceIPs != null ) {
746                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
747                    rebuild = true;
748                }
749            }
750
751            //
752            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete
753            //  filter list regardless of what changed.
754            //
755            if( rebuild ) {
756                m_lastRebuild = new Date();
757                m_spamPatterns = parseWordList( sourceSpam,
758                                                ( sourceSpam != null ) ? ( String )sourceSpam.getAttribute( LISTVAR ) : null );
759
760                log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
761
762                m_IPPatterns = parseWordList( sourceIPs,
763                        ( sourceIPs != null ) ? ( String )sourceIPs.getAttribute( LISTIPVAR ) : null );
764                log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
765
766                if( att != null ) {
767                    InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att);
768                    StringWriter out = new StringWriter();
769                    FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out );
770                    Collection< Pattern > blackList = parseBlacklist( out.toString() );
771                    log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
772                    m_spamPatterns.addAll( blackList );
773                }
774            }
775        } catch( IOException ex ) {
776            log.info( "Unable to read attachment data, continuing...", ex );
777        } catch( ProviderException ex ) {
778            log.info( "Failed to read spam filter attachment, continuing...", ex );
779        }
780    }
781
782    /**
783     *  Does a check against a known pattern list.
784     *
785     *  @param context
786     *  @param content
787     *  @param change
788     *  @throws RedirectException
789     */
790    private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException {
791        //
792        //  If we have no spam patterns defined, or we're trying to save
793        //  the page containing the patterns, just return.
794        //
795        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
796            return;
797        }
798
799        String ch = change.toString();
800        if( context.getHttpRequest() != null ) {
801            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
802        }
803
804        for( Pattern p : m_spamPatterns ) {
805            // log.debug("Attempting to match page contents with "+p.getPattern());
806
807            if( m_matcher.contains( ch, p ) ) {
808                //
809                //  Spam filter has a match.
810                //
811                String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
812
813                log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
814                checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
815            }
816        }
817    }
818
819
820    /**
821     *  Does a check against a pattern list of IPs.
822     *
823     *  @param context
824     *  @throws RedirectException
825     */
826    private void checkIPList( WikiContext context ) throws RedirectException {
827        //
828        //  If we have no IP patterns defined, or we're trying to save
829        //  the page containing the IP patterns, just return.
830        //
831        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
832            return;
833        }
834
835        String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
836        log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
837
838        for( Pattern p : m_IPPatterns ) {
839             log.debug("Attempting to match remoteIP with " + p.getPattern());
840
841            if( m_matcher.contains( remoteIP, p ) ) {
842
843                //  IP filter has a match.
844                //
845                String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
846
847                log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
848                checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
849            }
850        }
851    }
852
853    private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException {
854        Change c = new Change();
855        c.m_change = change;
856        checkPatternList( context, content, c );
857    }
858 
859    /**
860     *  Creates a simple text string describing the added content.
861     *
862     *  @param context
863     *  @param newText
864     *  @return Empty string, if there is no change.
865     */
866    private static Change getChange( WikiContext context, String newText ) {
867        WikiPage page = context.getPage();
868        StringBuffer change = new StringBuffer();
869        WikiEngine engine = context.getEngine();
870        // Get current page version
871
872        Change ch = new Change();
873        
874        try {
875            String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION );
876
877            String[] first  = Diff.stringToArray( oldText );
878            String[] second = Diff.stringToArray( newText );
879            Revision rev = Diff.diff( first, second, new MyersDiff() );
880
881            if( rev == null || rev.size() == 0 ) {
882                return ch;
883            }
884            
885            for( int i = 0; i < rev.size(); i++ ) {
886                Delta d = rev.getDelta( i );
887
888                if( d instanceof AddDelta ) {
889                    d.getRevised().toString( change, "", "\r\n" );
890                    ch.m_adds++;
891                    
892                } else if( d instanceof ChangeDelta ) {
893                    d.getRevised().toString( change, "", "\r\n" );
894                    ch.m_adds++;
895                    
896                } else if( d instanceof DeleteDelta ) {
897                    ch.m_removals++;
898                }
899            }
900        } catch( DifferentiationFailedException e ) {
901            log.error( "Diff failed", e );
902        }
903
904        //
905        //  Don't forget to include the change note, too
906        //
907        String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE );
908
909        if( changeNote != null ) {
910            change.append( "\r\n" );
911            change.append( changeNote );
912        }
913
914        //
915        //  And author as well
916        //
917        if( page.getAuthor() != null ) {
918            change.append( "\r\n" + page.getAuthor() );
919        }
920
921        ch.m_change = change.toString();
922        return ch;
923    }
924
925    /**
926     *  Returns true, if this user should be ignored.  For example, admin users.
927     *
928     * @param context
929     * @return True, if this users should be ignored.
930     */
931    private boolean ignoreThisUser( WikiContext context ) {
932        if( context.hasAdminPermissions() ) {
933            return true;
934        }
935
936        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
937            return true;
938        }
939
940        if( context.getVariable( "captcha" ) != null ) {
941            return true;
942        }
943
944        return false;
945    }
946
947    /**
948     *  Returns a random string of six uppercase characters.
949     *
950     *  @return A random string
951     */
952    private static String getUniqueID() {
953        StringBuilder sb = new StringBuilder();
954        Random rand = new Random();
955
956        for( int i = 0; i < 6; i++ ) {
957            char x = ( char )( 'A' + rand.nextInt( 26 ) );
958            sb.append( x );
959        }
960
961        return sb.toString();
962    }
963
964    /**
965     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
966     *
967     *  @param ctx WikiContext
968     *  @return An URL to redirect to
969     */
970    private String getRedirectPage( WikiContext ctx ) {
971        if( m_useCaptcha ) {
972            return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) );
973        }
974
975        return ctx.getURL( WikiContext.VIEW, m_errorPage );
976    }
977
978    /**
979     *  Checks whether the UserProfile matches certain checks.
980     *
981     *  @param profile The profile to check
982     *  @param context The WikiContext
983     *  @return False, if this userprofile is suspect and should not be allowed to be added.
984     *  @since 2.6.1
985     */
986    public boolean isValidUserProfile( WikiContext context, UserProfile profile ) {
987        try {
988            checkPatternList( context, profile.getEmail(), profile.getEmail() );
989            checkPatternList( context, profile.getFullname(), profile.getFullname() );
990            checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
991        } catch( RedirectException e ) {
992            log.info("Detected attempt to create a spammer user account (see above for rejection reason)");
993            return false;
994        }
995
996        return true;
997    }
998
999    /**
1000     *  This method is used to calculate an unique code when submitting the page to detect edit conflicts.  
1001     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
1002     *
1003     *  @param page The WikiPage under edit
1004     *  @param request The HTTP Request
1005     *  @since 2.6
1006     *  @return A hash value for this page and session
1007     */
1008    public static final String getSpamHash( WikiPage page, HttpServletRequest request ) {
1009        long lastModified = 0;
1010
1011        if( page.getLastModified() != null ) {
1012            lastModified = page.getLastModified().getTime();
1013        }
1014        long remote = HttpUtil.getRemoteAddress( request ).hashCode();
1015
1016        return Long.toString( lastModified ^ remote );
1017    }
1018
1019    /**
1020     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
1021     *  the session has expired, you cannot edit anymore.
1022     *
1023     *  @param request The page request
1024     *  @return The name to be used in the hash field
1025     *  @since  2.6
1026     */
1027    public static final String getHashFieldName( HttpServletRequest request ) {
1028        String hash = null;
1029
1030        if( request.getSession() != null ) {
1031            hash = ( String )request.getSession().getAttribute( "_hash" );
1032
1033            if( hash == null ) {
1034                hash = c_hashName;
1035                request.getSession().setAttribute( "_hash", hash );
1036            }
1037        }
1038
1039        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
1040            c_hashName = getUniqueID().toLowerCase();
1041            c_lastUpdate = System.currentTimeMillis();
1042        }
1043
1044        return hash != null ? hash : c_hashName;
1045    }
1046
1047
1048    /**
1049     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
1050     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
1051     *  and their session has expired.
1052     *  <p>
1053     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
1054     *  the spam log (it may or may not be spam, but it's rather likely that it is).
1055     *
1056     *  @param context The WikiContext
1057     *  @param pageContext The JSP PageContext.
1058     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
1059     *  @throws IOException If redirection fails
1060     *  @since 2.6
1061     */
1062    public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException {
1063        String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
1064
1065        if( pageContext.getRequest().getParameter(hashName) == null ) {
1066            if( pageContext.getAttribute( hashName ) == null ) {
1067                Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
1068                log( context, REJECT, "MissingHash", change.m_change );
1069
1070                String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" );
1071                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
1072                return false;
1073            }
1074        }
1075
1076        return true;
1077    }
1078
1079    /**
1080     * This helper method adds all the input fields to your editor that the SpamFilter requires
1081     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1082     *  
1083     * @param pageContext The PageContext
1084     * @return A HTML string which contains input fields for the SpamFilter.
1085     */
1086    public static final String insertInputFields( final PageContext pageContext ) {
1087        final WikiContext ctx = WikiContext.findContext( pageContext );
1088        final WikiEngine engine = ctx.getEngine();
1089        final StringBuilder sb = new StringBuilder();
1090        if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) {
1091            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1092        }
1093
1094        return sb.toString();
1095    }
1096    
1097    /**
1098     *  A local class for storing host information.
1099     *
1100     *  @since
1101     */
1102    private class Host {
1103        
1104        private long   m_addedTime = System.currentTimeMillis();
1105        private long   m_releaseTime;
1106        private String m_address;
1107        private Change m_change;
1108
1109        public String getAddress() {
1110            return m_address;
1111        }
1112
1113        public long getReleaseTime() {
1114            return m_releaseTime;
1115        }
1116
1117        public long getAddedTime() {
1118            return m_addedTime;
1119        }
1120
1121        public Change getChange() {
1122            return m_change;
1123        }
1124
1125        public Host( String ipaddress, Change change ) {
1126            m_address = ipaddress;
1127            m_change  = change;
1128            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1129        }
1130        
1131    }
1132    
1133    private static class Change {
1134        
1135        public String m_change;
1136        public int    m_adds;
1137        public int    m_removals;
1138        
1139        public String toString() {
1140            return m_change;
1141        }
1142        
1143        public boolean equals( Object o ) {
1144            if( o instanceof Change ) {
1145                return m_change.equals( ( ( Change )o ).m_change );
1146            }
1147            return false;
1148        }
1149        
1150        public int hashCode() {
1151            return m_change.hashCode() + 17;
1152        }
1153        
1154    }
1155
1156}