001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import net.sf.akismet.Akismet;
022import org.apache.commons.lang3.time.StopWatch;
023import org.apache.log4j.Logger;
024import org.apache.oro.text.regex.MalformedPatternException;
025import org.apache.oro.text.regex.MatchResult;
026import org.apache.oro.text.regex.Pattern;
027import org.apache.oro.text.regex.PatternCompiler;
028import org.apache.oro.text.regex.PatternMatcher;
029import org.apache.oro.text.regex.Perl5Compiler;
030import org.apache.oro.text.regex.Perl5Matcher;
031import org.apache.wiki.InternalWikiException;
032import org.apache.wiki.api.core.Attachment;
033import org.apache.wiki.api.core.Context;
034import org.apache.wiki.api.core.ContextEnum;
035import org.apache.wiki.api.core.Engine;
036import org.apache.wiki.api.core.Page;
037import org.apache.wiki.api.exceptions.ProviderException;
038import org.apache.wiki.api.exceptions.RedirectException;
039import org.apache.wiki.api.filters.BasePageFilter;
040import org.apache.wiki.api.providers.WikiProvider;
041import org.apache.wiki.attachment.AttachmentManager;
042import org.apache.wiki.auth.user.UserProfile;
043import org.apache.wiki.pages.PageManager;
044import org.apache.wiki.ui.EditorManager;
045import org.apache.wiki.util.FileUtil;
046import org.apache.wiki.util.HttpUtil;
047import org.apache.wiki.util.TextUtil;
048import org.suigeneris.jrcs.diff.Diff;
049import org.suigeneris.jrcs.diff.DifferentiationFailedException;
050import org.suigeneris.jrcs.diff.Revision;
051import org.suigeneris.jrcs.diff.delta.AddDelta;
052import org.suigeneris.jrcs.diff.delta.ChangeDelta;
053import org.suigeneris.jrcs.diff.delta.DeleteDelta;
054import org.suigeneris.jrcs.diff.delta.Delta;
055import org.suigeneris.jrcs.diff.myers.MyersDiff;
056
057import javax.servlet.http.HttpServletRequest;
058import javax.servlet.http.HttpServletResponse;
059import javax.servlet.jsp.PageContext;
060import java.io.BufferedReader;
061import java.io.IOException;
062import java.io.InputStream;
063import java.io.InputStreamReader;
064import java.io.StringReader;
065import java.io.StringWriter;
066import java.nio.charset.StandardCharsets;
067import java.util.ArrayList;
068import java.util.Collection;
069import java.util.Date;
070import java.util.Iterator;
071import java.util.Properties;
072import java.util.Random;
073import java.util.StringTokenizer;
074import java.util.Vector;
075import java.util.concurrent.ThreadLocalRandom;
076
077
078/**
079 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
080 *
081 *  Parameters:
082 *  <ul>
083 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
084 *     that page.  Default is "SpamFilterWordList".
085 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
086 *     that page.  Default is "SpamFilterIPList".
087 *    <li>maxpagenamelength - Maximum page name length. Default is 100.
088 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
089 *        "SpamFilterWordList/blacklist.txt"</li>
090 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".
091 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
092 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
093 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
095 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
096 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
097 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
098 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
099 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
100 *        and calculates a score for the spam, which is then compared to a filter level value.
101 *  </ul>
102 *
103 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
104 *  with the editor system.</p>
105 *  
106 *  <p>Changes by admin users are ignored in any case.</p>
107 *
108 *  @since 2.1.112
109 */
110public class SpamFilter extends BasePageFilter {
111    
112    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";
113    private static final String REASON_REGEXP = "Regexp";
114    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
115    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
116    private static final String REASON_BOT_TRAP = "BotTrap";
117    private static final String REASON_AKISMET = "Akismet";
118    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
119    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
120    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
121    private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong";
122    private static final String REASON_UTF8_TRAP = "UTF8Trap";
123
124    private static final String LISTVAR = "spamwords";
125    private static final String LISTIPVAR = "ips";
126
127    private static final Random RANDOM = ThreadLocalRandom.current();
128
    /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */
    public static final String  PROP_WORDLIST              = "wordlist";

    /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */
    public static final String  PROP_IPLIST                = "IPlist";

    /** The filter property name for specifying the maximum page name length.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_MAX_PAGENAME_LENGTH   = "maxpagenamelength";

    /** The filter property name for the page to which you are directed if Herb rejects your edit.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_ERRORPAGE             = "errorpage";
    
    /** The filter property name for specifying how many changes any given IP address
     *  is allowed to make per minute.  Value is <tt>{@value}</tt>.
     */
    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
    
    /** The filter property name for specifying how many similar changes are allowed before a host is banned.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
    
    /** The filter property name for specifying how long (in minutes) a host is banned.  Value is <tt>{@value}</tt>.*/
    public static final String  PROP_BANTIME               = "bantime";
    
    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
    public static final String  PROP_BLACKLIST             = "blacklist";
    
    /** The filter property name for specifying how many URLs any given edit can contain. Value is <tt>{@value}</tt>. */
    public static final String  PROP_MAXURLS               = "maxurls";
    
    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
    
    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";
    
    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
    public static final String  PROP_CAPTCHA               = "captcha";
    
    /** The filter property name for specifying which filter strategy should be used.  Value is <tt>{@value}</tt>. */
    public static final String  PROP_FILTERSTRATEGY        = "strategy";

    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
    public static final String  STRATEGY_EAGER             = "eager";
    
    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
    public static final String  STRATEGY_SCORE             = "score";

    /**
     *  Regular expression recognising http://, https:// and mailto: links.  It is compiled into m_urlPattern by
     *  initialize() and used to count the links contained in a change.
     */
    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
177
    //  Configurable names; the values below are the defaults used when initialize() finds no matching property.
    private String          m_forbiddenWordsPage = "SpamFilterWordList";
    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
    //  Kept as a string; it is parsed to an int each time the page name is checked.
    private String          m_pageNameMaxLength  = "100";
    private String          m_errorPage          = "RejectedMessage";
    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";

    //  Jakarta ORO matcher and compiler used for the URL pattern and the spam patterns.
    private final PatternMatcher  m_matcher = new Perl5Matcher();
    private final PatternCompiler m_compiler = new Perl5Compiler();

    //  Compiled pattern lists; null until they have been built.
    private Collection<Pattern> m_spamPatterns = null;
    private Collection<Pattern> m_IPPatterns = null;

    //  Compared against the last-modified time of the list pages and the blacklist attachment to decide on a rebuild.
    private Date m_lastRebuild = new Date( 0L );

    //  c_spamlog receives one line per accepted/rejected change; log is the ordinary class logger.
    private static final Logger c_spamlog = Logger.getLogger( "SpamLog" );
    private static final Logger log = Logger.getLogger( SpamFilter.class );

    /** Hosts which are currently banned; expired entries are removed by cleanBanList(). */
    private Vector<Host>    m_temporaryBanList = new Vector<>();

    private int             m_banTime = 60; // minutes

    /** Modifications seen recently; entries older than one minute are dropped when the next change is checked. */
    private Vector<Host>    m_lastModifications = new Vector<>();

    /** How many times a single IP address can change a page per minute? */
    private int             m_limitSinglePageChanges = 5;

    /** How many times can you add the exact same string to a page? */
    private int             m_limitSimilarChanges = 2;

    /** How many URLs can be added at maximum. */
    private int             m_maxUrls = 10;

    /** Compiled form of URL_REGEXP; set by initialize(). */
    private Pattern         m_urlPattern;

    /** Akismet client; created on the first Akismet check and reset to null if the API key cannot be verified. */
    private Akismet         m_akismet;

    /** The Akismet API key; null means that Akismet checks are not performed. */
    private String          m_akismetAPIKey = null;

    /** True when the "captcha" filter property is set to "asirra". */
    private boolean         m_useCaptcha = false;

    /** The limit at which we consider something to be spam. */
    private int             m_scoreLimit = 1;

    /** If set to true, will ignore anyone who is in Authenticated role. */
    private boolean         m_ignoreAuthenticated = false;

    /** True for the "eager" strategy: the first failed test rejects the change instead of adding to the score. */
    private boolean         m_stopAtFirstMatch = true;

    //  NOTE(review): presumably the current name of the hash field and the time it was last regenerated - confirm
    //  against the code which uses them.
    private static String   c_hashName;
    private static long     c_lastUpdate;

    /** The HASH_DELAY value is a maximum amount of time that a user can keep
     *  a session open, because after the value has expired, we will invent a new
     *  hash field name.  By default this is {@value} hours, which should be ample
     *  time for someone.
     */
    private static final long HASH_DELAY = 24;
234
235
236    /**
237     *  {@inheritDoc}
238     */
239    @Override
240    public void initialize( final Engine engine, final Properties properties ) {
241        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
242        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
243        m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength);
244        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
245        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges );
246        
247        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges );
248
249        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
250        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
251        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
252
253        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated );
254
255        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
256
257        try {
258            m_urlPattern = m_compiler.compile( URL_REGEXP );
259        } catch( final MalformedPatternException e ) {
260            log.fatal( "Internal error: Someone put in a faulty pattern.", e );
261            throw new InternalWikiException( "Faulty pattern." , e);
262        }
263
264        m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey );
265        m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER );
266
267        log.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
268                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
269    }
270
271    private static final int REJECT = 0;
272    private static final int ACCEPT = 1;
273    private static final int NOTE   = 2;
274
275    private static String log( final Context ctx, final int type, final String source, String message ) {
276        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
277        message = TextUtil.replaceString( message, "\"", "\\\"" );
278
279        final String uid = getUniqueID();
280        final String page   = ctx.getPage().getName();
281        final String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
282        final String reason;
283        switch( type ) {
284            case REJECT: reason = "REJECTED";
285                break;
286            case ACCEPT: reason = "ACCEPTED";
287                break;
288            case NOTE: reason = "NOTE";
289                break;
290            default: throw new InternalWikiException( "Illegal type " + type );
291        }
292        c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
293
294        return uid;
295    }
296
297    /** {@inheritDoc} */
298    @Override
299    public String preSave( final Context context, final String content ) throws RedirectException {
300        cleanBanList();
301        refreshBlacklists( context );
302        final Change change = getChange( context, content );
303
304        if( !ignoreThisUser( context ) ) {
305            checkBanList( context, change );
306            checkSinglePageChange( context, content, change );
307            checkIPList( context );
308            checkPatternList( context, content, change );
309            checkPageName( context, content, change);
310        }
311
312        if( !m_stopAtFirstMatch ) {
313            final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
314
315            if( score != null && score >= m_scoreLimit ) {
316                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
317            }
318        }
319
320        log( context, ACCEPT, "-", change.toString() );
321        return content;
322    }
323
324    private void checkPageName( final Context context, final String content, final Change change ) throws RedirectException {
325        final Page page = context.getPage();
326        final String pageName = page.getName();
327        final int maxlength = Integer.parseInt(m_pageNameMaxLength);
328        if ( pageName.length() > maxlength) {
329            //
330            //  Spam filter has a match.
331            //
332
333            final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName);
334
335            log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")");
336            checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" );
337
338        }
339    }
340
341    private void checkStrategy( final Context context, final String error, final String message ) throws RedirectException {
342        if( m_stopAtFirstMatch ) {
343            throw new RedirectException( message, getRedirectPage( context ) );
344        }
345
346        Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
347        if( score != null ) {
348            score = score + 1;
349        } else {
350            score = 1;
351        }
352
353        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
354    }
355    
356    /**
357     *  Parses a list of patterns and returns a Collection of compiled Pattern objects.
358     *
359     * @param source page containing the list of patterns.
360     * @param list list of patterns.
361     * @return A Collection of the Patterns that were found from the lists.
362     */
363    private Collection< Pattern > parseWordList( final Page source, final String list ) {
364        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
365
366        if( list != null ) {
367            final StringTokenizer tok = new StringTokenizer( list, " \t\n" );
368
369            while( tok.hasMoreTokens() ) {
370                final String pattern = tok.nextToken();
371
372                try {
373                    compiledpatterns.add( m_compiler.compile( pattern ) );
374                } catch( final MalformedPatternException e ) {
375                    log.debug( "Malformed spam filter pattern " + pattern );
376                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
377                }
378            }
379        }
380
381        return compiledpatterns;
382    }
383
384    /**
385     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
386     *
387     *  @param list list of patterns.
388     *  @return The parsed blacklist patterns.
389     */
390    private Collection< Pattern > parseBlacklist( final String list ) {
391        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
392
393        if( list != null ) {
394            try {
395                final BufferedReader in = new BufferedReader( new StringReader(list) );
396                String line;
397                while( (line = in.readLine() ) != null ) {
398                    line = line.trim();
399                    if( line.length() == 0 ) continue; // Empty line
400                    if( line.startsWith("#") ) continue; // It's a comment
401
402                    int ws = line.indexOf( ' ' );
403                    if( ws == -1 ) ws = line.indexOf( '\t' );
404                    if( ws != -1 ) line = line.substring( 0, ws );
405
406                    try {
407                        compiledpatterns.add( m_compiler.compile( line ) );
408                    } catch( final MalformedPatternException e ) {
409                        log.debug( "Malformed spam filter pattern " + line );
410                    }
411                }
412            } catch( final IOException e ) {
413                log.info( "Could not read patterns; returning what I got" , e );
414            }
415        }
416
417        return compiledpatterns;
418    }
419
420    /**
421     * Takes a single page change and performs a load of tests on the content change. An admin can modify anything.
422     *
423     * @param context page Context
424     * @param content page content
425     * @param change page change
426     * @throws RedirectException spam filter rejects the page change.
427     */
428    private synchronized void checkSinglePageChange( final Context context, final String content, final Change change )
429            throws RedirectException {
430        final HttpServletRequest req = context.getHttpRequest();
431
432        if( req != null ) {
433            final String addr = HttpUtil.getRemoteAddress( req );
434            int hostCounter = 0;
435            int changeCounter = 0;
436
437            log.debug( "Change is " + change.m_change );
438
439            final long time = System.currentTimeMillis() - 60*1000L; // 1 minute
440
441            for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
442                final Host host = i.next();
443
444                //  Check if this item is invalid
445                if( host.getAddedTime() < time ) {
446                    log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
447                    i.remove();
448                    continue;
449                }
450
451                // Check if this IP address has been seen before
452                if( host.getAddress().equals( addr ) ) {
453                    hostCounter++;
454                }
455
456                //  Check, if this change has been seen before
457                if( host.getChange() != null && host.getChange().equals( change ) ) {
458                    changeCounter++;
459                }
460            }
461
462            //  Now, let's check against the limits.
463            if( hostCounter >= m_limitSinglePageChanges ) {
464                final Host host = new Host( addr, null );
465                m_temporaryBanList.add( host );
466
467                final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
468                log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
469                checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
470            }
471
472            if( changeCounter >= m_limitSimilarChanges ) {
473                final Host host = new Host( addr, null );
474                m_temporaryBanList.add( host );
475
476                final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
477                log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
478                checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
479            }
480
481            //  Calculate the number of links in the addition.
482            String tstChange  = change.toString();
483            int urlCounter = 0;
484            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
485                final MatchResult m = m_matcher.getMatch();
486                tstChange = tstChange.substring( m.endOffset(0) );
487                urlCounter++;
488            }
489
490            if( urlCounter > m_maxUrls ) {
491                final Host host = new Host( addr, null );
492                m_temporaryBanList.add( host );
493
494                final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
495                log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
496                checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
497            }
498
499            //  Check bot trap
500            checkBotTrap( context, change );
501
502            //  Check UTF-8 mangling
503            checkUTF8( context, change );
504
505            //  Do Akismet check.  This is good to be the last, because this is the most expensive operation.
506            checkAkismet( context, change );
507
508            m_lastModifications.add( new Host( addr, change ) );
509        }
510    }
511
512
513    /**
514     *  Checks against the akismet system.
515     *
516     * @param context page Context
517     * @throws RedirectException spam filter rejects the page change.
518     */
519    private void checkAkismet( final Context context, final Change change ) throws RedirectException {
520        if( m_akismetAPIKey != null ) {
521            if( m_akismet == null ) {
522                log.info( "Initializing Akismet spam protection." );
523                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
524
525                if( !m_akismet.verifyAPIKey() ) {
526                    log.error( "Akismet API key cannot be verified.  Please check your config." );
527                    m_akismetAPIKey = null;
528                    m_akismet = null;
529                }
530            }
531
532            final HttpServletRequest req = context.getHttpRequest();
533
534            //  Akismet will mark all empty statements as spam, so we'll just ignore them.
535            if( change.m_adds == 0 && change.m_removals > 0 ) {
536                return;
537            }
538            
539            if( req != null && m_akismet != null ) {
540                log.debug( "Calling Akismet to check for spam..." );
541
542                final StopWatch sw = new StopWatch();
543                sw.start();
544
545                final String ipAddress     = HttpUtil.getRemoteAddress( req );
546                final String userAgent     = req.getHeader( "User-Agent" );
547                final String referrer      = req.getHeader( "Referer");
548                final String permalink     = context.getViewURL( context.getPage().getName() );
549                final String commentType   = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit";
550                final String commentAuthor = context.getCurrentUser().getName();
551                final String commentAuthorEmail = null;
552                final String commentAuthorURL   = null;
553
554                final boolean isSpam = m_akismet.commentCheck( ipAddress,
555                                                               userAgent,
556                                                               referrer,
557                                                               permalink,
558                                                               commentType,
559                                                               commentAuthor,
560                                                               commentAuthorEmail,
561                                                               commentAuthorURL,
562                                                               change.toString(),
563                                                               null );
564
565                sw.stop();
566                log.debug( "Akismet request done in: " + sw );
567
568                if( isSpam ) {
569                    // Host host = new Host( ipAddress, null );
570                    // m_temporaryBanList.add( host );
571
572                    final String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
573                    log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
574                    checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
575                }
576            }
577        }
578    }
579
580    /**
581     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
582     *
583     * @return A string
584     */
585    public static String getBotFieldName() {
586        return "submit_auth";
587    }
588
589    /**
590     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
591     *
592     * @param context page Context
593     * @param change page change
594     * @throws RedirectException spam filter rejects the page change.
595     */
596    private void checkBotTrap( final Context context, final Change change ) throws RedirectException {
597        final HttpServletRequest request = context.getHttpRequest();
598        if( request != null ) {
599            final String unspam = request.getParameter( getBotFieldName() );
600            if( unspam != null && unspam.length() > 0 ) {
601                final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
602
603                log.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
604                checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
605            }
606        }
607    }
608
609    private void checkUTF8( final Context context, final Change change ) throws RedirectException {
610        final HttpServletRequest request = context.getHttpRequest();
611        if( request != null ) {
612            final String utf8field = request.getParameter( "encodingcheck" );
613            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
614                final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
615
616                log.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
617                checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
618            }
619        }
620    }
621
622    /** Goes through the ban list and cleans away any host which has expired from it. */
623    private synchronized void cleanBanList() {
624        final long now = System.currentTimeMillis();
625        for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
626            final Host host = i.next();
627
628            if( host.getReleaseTime() < now ) {
629                log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
630                i.remove();
631            }
632        }
633    }
634
635    /**
636     *  Checks the ban list if the IP address of the changer is already on it.
637     *
638     *  @param context page context
639     *  @throws RedirectException spam filter rejects the page change.
640     */
641    private void checkBanList( final Context context, final Change change ) throws RedirectException {
642        final HttpServletRequest req = context.getHttpRequest();
643
644        if( req != null ) {
645            final String remote = HttpUtil.getRemoteAddress(req);
646            final long now = System.currentTimeMillis();
647
648            for( final Host host : m_temporaryBanList ) {
649                if( host.getAddress().equals( remote ) ) {
650                    final long timeleft = ( host.getReleaseTime() - now ) / 1000L;
651
652                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
653                    checkStrategy( context, REASON_IP_BANNED_TEMPORARILY,
654                            "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
655                }
656            }
657        }
658    }
659
660    /**
661     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
662     *
663     *  @param context associated WikiContext
664     */
665    private void refreshBlacklists( final Context context ) {
666        try {
667            boolean rebuild = false;
668
669            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
670            final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage );
671            if( sourceSpam != null ) {
672                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
673                    rebuild = true;
674                }
675            }
676
677            final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist );
678            if( att != null ) {
679                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
680                    rebuild = true;
681                }
682            }
683
684            final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage );
685            if( sourceIPs != null ) {
686                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
687                    rebuild = true;
688                }
689            }
690
691            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete filter list regardless of what changed.
692            if( rebuild ) {
693                m_lastRebuild = new Date();
694                m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null );
695
696                log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
697
698                m_IPPatterns = parseWordList( sourceIPs,  ( sourceIPs != null ) ? sourceIPs.getAttribute( LISTIPVAR ) : null );
699                log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
700
701                if( att != null ) {
702                    final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att);
703                    final StringWriter out = new StringWriter();
704                    FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out );
705                    final Collection< Pattern > blackList = parseBlacklist( out.toString() );
706                    log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
707                    m_spamPatterns.addAll( blackList );
708                }
709            }
710        } catch( final IOException ex ) {
711            log.info( "Unable to read attachment data, continuing...", ex );
712        } catch( final ProviderException ex ) {
713            log.info( "Failed to read spam filter attachment, continuing...", ex );
714        }
715    }
716
717    /**
718     * Does a check against a known pattern list.
719     *
720     * @param context page Context
721     * @param content page content
722     * @param change page change
723     * @throws RedirectException spam filter rejects the page change.
724     */
725    private void checkPatternList( final Context context, final String content, final Change change ) throws RedirectException {
726        // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return.
727        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
728            return;
729        }
730
731        String ch = change.toString();
732        if( context.getHttpRequest() != null ) {
733            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
734        }
735
736        for( final Pattern p : m_spamPatterns ) {
737            // log.debug("Attempting to match page contents with "+p.getPattern());
738
739            if( m_matcher.contains( ch, p ) ) {
740                //  Spam filter has a match.
741                final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
742
743                log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
744                checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
745            }
746        }
747    }
748
749
750    /**
751     *  Does a check against a pattern list of IPs.
752     *
753     *  @param context page context
754     *  @throws RedirectException spam filter rejects the page change.
755     */
756    private void checkIPList( final Context context ) throws RedirectException {
757        //  If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return.
758        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
759            return;
760        }
761
762        final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
763        log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
764
765        for( final Pattern p : m_IPPatterns ) {
766             log.debug("Attempting to match remoteIP with " + p.getPattern());
767
768            if( m_matcher.contains( remoteIP, p ) ) {
769
770                //  IP filter has a match.
771                //
772                final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
773
774                log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
775                checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
776            }
777        }
778    }
779
780    private void checkPatternList( final Context context, final String content, final String change ) throws RedirectException {
781        final Change c = new Change();
782        c.m_change = change;
783        checkPatternList( context, content, c );
784    }
785 
786    /**
787     *  Creates a simple text string describing the added content.
788     *
789     *  @param context page context
790     *  @param newText added content
791     *  @return Empty string, if there is no change.
792     */
793    private static Change getChange( final Context context, final String newText ) {
794        final Page page = context.getPage();
795        final StringBuffer change = new StringBuffer();
796        final Engine engine = context.getEngine();
797        // Get current page version
798
799        final Change ch = new Change();
800        
801        try {
802            final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION );
803            final String[] first  = Diff.stringToArray( oldText );
804            final String[] second = Diff.stringToArray( newText );
805            final Revision rev = Diff.diff( first, second, new MyersDiff() );
806
807            if( rev == null || rev.size() == 0 ) {
808                return ch;
809            }
810            
811            for( int i = 0; i < rev.size(); i++ ) {
812                final Delta d = rev.getDelta( i );
813
814                if( d instanceof AddDelta ) {
815                    d.getRevised().toString( change, "", "\r\n" );
816                    ch.m_adds++;
817                    
818                } else if( d instanceof ChangeDelta ) {
819                    d.getRevised().toString( change, "", "\r\n" );
820                    ch.m_adds++;
821                    
822                } else if( d instanceof DeleteDelta ) {
823                    ch.m_removals++;
824                }
825            }
826        } catch( final DifferentiationFailedException e ) {
827            log.error( "Diff failed", e );
828        }
829
830        //  Don't forget to include the change note, too
831        final String changeNote = page.getAttribute( Page.CHANGENOTE );
832        if( changeNote != null ) {
833            change.append( "\r\n" );
834            change.append( changeNote );
835        }
836
837        //  And author as well
838        if( page.getAuthor() != null ) {
839            change.append( "\r\n" + page.getAuthor() );
840        }
841
842        ch.m_change = change.toString();
843        return ch;
844    }
845
846    /**
847     * Returns true, if this user should be ignored.  For example, admin users.
848     *
849     * @param context page context
850     * @return True, if this users should be ignored.
851     */
852    private boolean ignoreThisUser( final Context context ) {
853        if( context.hasAdminPermissions() ) {
854            return true;
855        }
856
857        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
858            return true;
859        }
860
861        return context.getVariable("captcha") != null;
862    }
863
864    /**
865     *  Returns a random string of six uppercase characters.
866     *
867     *  @return A random string
868     */
869    private static String getUniqueID() {
870        final StringBuilder sb = new StringBuilder();
871        for( int i = 0; i < 6; i++ ) {
872            final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) );
873            sb.append( x );
874        }
875
876        return sb.toString();
877    }
878
879    /**
880     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
881     *
882     *  @param ctx WikiContext
883     *  @return An URL to redirect to
884     */
885    private String getRedirectPage( final Context ctx ) {
886        if( m_useCaptcha ) {
887            return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) );
888        }
889
890        return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage );
891    }
892
893    /**
894     *  Checks whether the UserProfile matches certain checks.
895     *
896     *  @param profile The profile to check
897     *  @param context The WikiContext
898     *  @return False, if this userprofile is suspect and should not be allowed to be added.
899     *  @since 2.6.1
900     */
901    public boolean isValidUserProfile( final Context context, final UserProfile profile ) {
902        try {
903            checkPatternList( context, profile.getEmail(), profile.getEmail() );
904            checkPatternList( context, profile.getFullname(), profile.getFullname() );
905            checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
906        } catch( final RedirectException e ) {
907            log.info("Detected attempt to create a spammer user account (see above for rejection reason)");
908            return false;
909        }
910
911        return true;
912    }
913
914    /**
915     *  This method is used to calculate an unique code when submitting the page to detect edit conflicts.  
916     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
917     *
918     *  @param page The WikiPage under edit
919     *  @param request The HTTP Request
920     *  @since 2.6
921     *  @return A hash value for this page and session
922     */
923    public static String getSpamHash( final Page page, final HttpServletRequest request ) {
924        long lastModified = 0;
925
926        if( page.getLastModified() != null ) {
927            lastModified = page.getLastModified().getTime();
928        }
929        final long remote = HttpUtil.getRemoteAddress( request ).hashCode();
930
931        return Long.toString( lastModified ^ remote );
932    }
933
934    /**
935     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
936     *  the session has expired, you cannot edit anymore.
937     *
938     *  @param request The page request
939     *  @return The name to be used in the hash field
940     *  @since  2.6
941     */
942    public static String getHashFieldName( final HttpServletRequest request ) {
943        String hash = null;
944
945        if( request.getSession() != null ) {
946            hash = ( String )request.getSession().getAttribute( "_hash" );
947
948            if( hash == null ) {
949                hash = c_hashName;
950                request.getSession().setAttribute( "_hash", hash );
951            }
952        }
953
954        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
955            c_hashName = getUniqueID().toLowerCase();
956            c_lastUpdate = System.currentTimeMillis();
957        }
958
959        return hash != null ? hash : c_hashName;
960    }
961
962
963    /**
964     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
965     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
966     *  and their session has expired.
967     *  <p>
968     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
969     *  the spam log (it may or may not be spam, but it's rather likely that it is).
970     *
971     *  @param context The WikiContext
972     *  @param pageContext The JSP PageContext.
973     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
974     *  @throws IOException If redirection fails
975     *  @since 2.6
976     */
977    public static boolean checkHash( final Context context, final PageContext pageContext ) throws IOException {
978        final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
979        if( pageContext.getRequest().getParameter(hashName) == null ) {
980            if( pageContext.getAttribute( hashName ) == null ) {
981                final Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
982                log( context, REJECT, "MissingHash", change.m_change );
983
984                final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" );
985                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
986                return false;
987            }
988        }
989
990        return true;
991    }
992
993    /**
994     * This helper method adds all the input fields to your editor that the SpamFilter requires
995     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
996     *  
997     * @param pageContext The PageContext
998     * @return A HTML string which contains input fields for the SpamFilter.
999     */
1000    public static String insertInputFields( final PageContext pageContext ) {
1001        final Context ctx = Context.findContext( pageContext );
1002        final Engine engine = ctx.getEngine();
1003        final StringBuilder sb = new StringBuilder();
1004        if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) {
1005            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1006        }
1007
1008        return sb.toString();
1009    }
1010    
1011    /**
1012     *  A local class for storing host information.
1013     */
1014    private class Host {
1015
1016        private final long m_addedTime = System.currentTimeMillis();
1017        private final long m_releaseTime;
1018        private final String m_address;
1019        private final Change m_change;
1020
1021        public String getAddress() {
1022            return m_address;
1023        }
1024
1025        public long getReleaseTime() {
1026            return m_releaseTime;
1027        }
1028
1029        public long getAddedTime() {
1030            return m_addedTime;
1031        }
1032
1033        public Change getChange() {
1034            return m_change;
1035        }
1036
1037        public Host( final String ipaddress, final Change change ) {
1038            m_address = ipaddress;
1039            m_change = change;
1040            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1041        }
1042        
1043    }
1044    
1045    private static class Change {
1046        
1047        public String m_change;
1048        public int    m_adds;
1049        public int    m_removals;
1050
1051        @Override
1052        public String toString() {
1053            return m_change;
1054        }
1055
1056        @Override
1057        public boolean equals( final Object o ) {
1058            if( o instanceof Change ) {
1059                return m_change.equals( ( ( Change )o ).m_change );
1060            }
1061            return false;
1062        }
1063
1064        @Override
1065        public int hashCode() {
1066            return m_change.hashCode() + 17;
1067        }
1068        
1069    }
1070
1071}