001 /*
002 Licensed to the Apache Software Foundation (ASF) under one
003 or more contributor license agreements. See the NOTICE file
004 distributed with this work for additional information
005 regarding copyright ownership. The ASF licenses this file
006 to you under the Apache License, Version 2.0 (the
007 "License"); you may not use this file except in compliance
008 with the License. You may obtain a copy of the License at
009
010 http://www.apache.org/licenses/LICENSE-2.0
011
012 Unless required by applicable law or agreed to in writing,
013 software distributed under the License is distributed on an
014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 KIND, either express or implied. See the License for the
016 specific language governing permissions and limitations
017 under the License.
018 */
019 package org.apache.wiki.filters;
020
021 import java.io.BufferedReader;
022 import java.io.IOException;
023 import java.io.InputStream;
024 import java.io.InputStreamReader;
025 import java.io.StringReader;
026 import java.io.StringWriter;
027 import java.util.ArrayList;
028 import java.util.Collection;
029 import java.util.Date;
030 import java.util.Iterator;
031 import java.util.Properties;
032 import java.util.Random;
033 import java.util.StringTokenizer;
034 import java.util.Vector;
035
036 import javax.servlet.http.HttpServletRequest;
037 import javax.servlet.http.HttpServletResponse;
038 import javax.servlet.jsp.PageContext;
039
040 import net.sf.akismet.Akismet;
041
042 import org.apache.commons.lang.time.StopWatch;
043 import org.apache.log4j.Logger;
044 import org.apache.oro.text.regex.MalformedPatternException;
045 import org.apache.oro.text.regex.MatchResult;
046 import org.apache.oro.text.regex.Pattern;
047 import org.apache.oro.text.regex.PatternCompiler;
048 import org.apache.oro.text.regex.PatternMatcher;
049 import org.apache.oro.text.regex.Perl5Compiler;
050 import org.apache.oro.text.regex.Perl5Matcher;
051 import org.apache.wiki.InternalWikiException;
052 import org.apache.wiki.WikiContext;
053 import org.apache.wiki.WikiEngine;
054 import org.apache.wiki.WikiPage;
055 import org.apache.wiki.WikiProvider;
056 import org.apache.wiki.api.exceptions.ProviderException;
057 import org.apache.wiki.api.exceptions.RedirectException;
058 import org.apache.wiki.api.filters.BasicPageFilter;
059 import org.apache.wiki.attachment.Attachment;
060 import org.apache.wiki.auth.user.UserProfile;
061 import org.apache.wiki.ui.EditorManager;
062 import org.apache.wiki.util.FileUtil;
063 import org.apache.wiki.util.HttpUtil;
064 import org.apache.wiki.util.TextUtil;
065 import org.suigeneris.jrcs.diff.Diff;
066 import org.suigeneris.jrcs.diff.DifferentiationFailedException;
067 import org.suigeneris.jrcs.diff.Revision;
068 import org.suigeneris.jrcs.diff.delta.AddDelta;
069 import org.suigeneris.jrcs.diff.delta.ChangeDelta;
070 import org.suigeneris.jrcs.diff.delta.DeleteDelta;
071 import org.suigeneris.jrcs.diff.delta.Delta;
072 import org.suigeneris.jrcs.diff.myers.MyersDiff;
073
074
075 /**
076 * This is Herb, the JSPWiki spamfilter that can also do choke modifications.
077 *
078 * Parameters:
079 * <ul>
 * <li>wordlist - Page name where the regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on
 * that page. Default is "SpamFilterWordList".</li>
 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
 * "SpamFilterWordList/blacklist.txt"</li>
 * <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage".</li>
 * <li>pagechangesinminute - How many page changes are allowed/minute. Default is 5.</li>
 * <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 * <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 * <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
 * <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 * <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 * <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 * <li>strategy - Sets the filtering strategy to use. If set to "eager", the filter stops at the first probable
 * match and does not consider any other tests. This is the default, as it is considerably lighter. If set to "score", the filter goes through all of the tests
 * and calculates a score for the spam, which is then compared to a filter level value.</li>
095 * </ul>
096 *
097 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
098 * with the editor system.</p>
099 *
100 * <p>Changes by admin users are ignored in any case.</p>
101 *
102 * @since 2.1.112
103 */
104 public class SpamFilter extends BasicPageFilter {
105
106 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";
107 private static final String REASON_REGEXP = "Regexp";
108 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
109 private static final String REASON_BOT_TRAP = "BotTrap";
110 private static final String REASON_AKISMET = "Akismet";
111 private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
112 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
113 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
114 private static final String REASON_UTF8_TRAP = "UTF8Trap";
115
116 private static final String LISTVAR = "spamwords";
117
118 /** The filter property name for specifying the page which contains the list of spamwords.
119 * Value is <tt>{@value}</tt>. */
120 public static final String PROP_WORDLIST = "wordlist";
121
122 /** The filter property name for the page to which you are directed if Herb rejects your
123 * edit. Value is <tt>{@value}</tt>. */
124 public static final String PROP_ERRORPAGE = "errorpage";
125
126 /** The filter property name for specifying how many changes is any given IP address
127 * allowed to do per minute. Value is <tt>{@value}</tt>.
128 */
129 public static final String PROP_PAGECHANGES = "pagechangesinminute";
130
131 /** The filter property name for specifying how many similar changes are allowed
132 * before a host is banned. Value is <tt>{@value}</tt>.
133 */
134 public static final String PROP_SIMILARCHANGES = "similarchanges";
135
136 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/
137 public static final String PROP_BANTIME = "bantime";
138
139 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/
140 public static final String PROP_BLACKLIST = "blacklist";
141
142 /** The filter property name for specifying how many URLs can any given edit contain.
143 * Value is <tt>{@value}</tt> */
144 public static final String PROP_MAXURLS = "maxurls";
145
146 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */
147 public static final String PROP_AKISMET_API_KEY = "akismet-apikey";
148
149 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
150 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated";
151
152 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
153 public static final String PROP_CAPTCHA = "captcha";
154
155 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */
156 public static final String PROP_FILTERSTRATEGY = "strategy";
157
158 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
159 public static final String STRATEGY_EAGER = "eager";
160
161 /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
162 public static final String STRATEGY_SCORE = "score";
163
164 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
165
166 private String m_forbiddenWordsPage = "SpamFilterWordList";
167 private String m_errorPage = "RejectedMessage";
168 private String m_blacklist = "SpamFilterWordList/blacklist.txt";
169
170 private PatternMatcher m_matcher = new Perl5Matcher();
171 private PatternCompiler m_compiler = new Perl5Compiler();
172
173 private Collection<Pattern> m_spamPatterns = null;
174
175 private Date m_lastRebuild = new Date( 0L );
176
177 private static Logger c_spamlog = Logger.getLogger( "SpamLog" );
178 private static Logger log = Logger.getLogger( SpamFilter.class );
179
180
181 private Vector<Host> m_temporaryBanList = new Vector<Host>();
182
183 private int m_banTime = 60; // minutes
184
185 private Vector<Host> m_lastModifications = new Vector<Host>();
186
187 /**
188 * How many times a single IP address can change a page per minute?
189 */
190 private int m_limitSinglePageChanges = 5;
191
192 /**
193 * How many times can you add the exact same string to a page?
194 */
195 private int m_limitSimilarChanges = 2;
196
197 /**
198 * How many URLs can be added at maximum.
199 */
200 private int m_maxUrls = 10;
201
202 private Pattern m_urlPattern;
203 private Akismet m_akismet;
204
205 private String m_akismetAPIKey = null;
206
207 private boolean m_useCaptcha = false;
208
209 /** The limit at which we consider something to be spam. */
210 private int m_scoreLimit = 1;
211
212 /**
213 * If set to true, will ignore anyone who is in Authenticated role.
214 */
215 private boolean m_ignoreAuthenticated = false;
216
217 private boolean m_stopAtFirstMatch = true;
218
219 private static String c_hashName;
220 private static long c_lastUpdate;
221
/** The HASH_DELAY value is the maximum amount of time that a user can keep
 * a session open, because after the value has expired, we will invent a new
 * hash field name. By default this is {@value} hours, which should be ample
 * time for someone.
 */
227 private static final long HASH_DELAY = 24;
228
229
230 /**
231 * {@inheritDoc}
232 */
233 @Override
234 public void initialize( WikiEngine engine, Properties properties ) {
235 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
236 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
237 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties,
238 PROP_PAGECHANGES,
239 m_limitSinglePageChanges );
240
241 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties,
242 PROP_SIMILARCHANGES,
243 m_limitSimilarChanges );
244
245 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
246 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
247 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
248
249 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties,
250 PROP_IGNORE_AUTHENTICATED,
251 m_ignoreAuthenticated );
252
253 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
254
255 try {
256 m_urlPattern = m_compiler.compile( URL_REGEXP );
257 } catch( MalformedPatternException e ) {
258 log.fatal( "Internal error: Someone put in a faulty pattern.", e );
259 throw new InternalWikiException( "Faulty pattern." );
260 }
261
262 m_akismetAPIKey = TextUtil.getStringProperty( properties,
263 PROP_AKISMET_API_KEY,
264 m_akismetAPIKey );
265
266 m_stopAtFirstMatch = TextUtil.getStringProperty( properties,
267 PROP_FILTERSTRATEGY,
268 STRATEGY_EAGER ).equals( STRATEGY_EAGER );
269
270 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime +
271 " mins, max page changes/minute: " + m_limitSinglePageChanges );
272
273
274 }
275
276 private static final int REJECT = 0;
277 private static final int ACCEPT = 1;
278 private static final int NOTE = 2;
279
280 private static String log( WikiContext ctx, int type, String source, String message ) {
281 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
282 message = TextUtil.replaceString( message, "\"", "\\\"" );
283
284 String uid = getUniqueID();
285
286 String page = ctx.getPage().getName();
287 String reason = "UNKNOWN";
288 String addr = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
289
290 switch( type ) {
291 case REJECT:
292 reason = "REJECTED";
293 break;
294 case ACCEPT:
295 reason = "ACCEPTED";
296 break;
297 case NOTE:
298 reason = "NOTE";
299 break;
300 default:
301 throw new InternalWikiException( "Illegal type " + type );
302 }
303 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
304
305 return uid;
306 }
307
308 /** {@inheritDoc} */
309 public String preSave( WikiContext context, String content ) throws RedirectException {
310 cleanBanList();
311 refreshBlacklists( context );
312 Change change = getChange( context, content );
313
314 if( !ignoreThisUser( context ) ) {
315 checkBanList( context, change );
316 checkSinglePageChange( context, content, change );
317 checkPatternList( context, content, change );
318 }
319
320 if( !m_stopAtFirstMatch ) {
321 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
322
323 if( score != null && score.intValue() >= m_scoreLimit ) {
324 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
325 }
326 }
327
328 log( context, ACCEPT, "-", change.toString() );
329 return content;
330 }
331
332 private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException {
333 if( m_stopAtFirstMatch ) {
334 throw new RedirectException( message, getRedirectPage( context ) );
335 }
336
337 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE );
338 if( score != null ) {
339 score = score + 1;
340 } else {
341 score = 1;
342 }
343
344 context.setVariable( ATTR_SPAMFILTER_SCORE, score );
345 }
346
347 /**
348 * Parses a list of patterns and returns a Collection of compiled Pattern
349 * objects.
350 *
351 * @param source
352 * @param list
353 * @return A Collection of the Patterns that were found from the lists.
354 */
355 private Collection< Pattern > parseWordList( WikiPage source, String list ) {
356 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
357
358 if( list != null ) {
359 StringTokenizer tok = new StringTokenizer( list, " \t\n" );
360
361 while( tok.hasMoreTokens() ) {
362 String pattern = tok.nextToken();
363
364 try {
365 compiledpatterns.add( m_compiler.compile( pattern ) );
366 } catch( MalformedPatternException e ) {
367 log.debug( "Malformed spam filter pattern " + pattern );
368 source.setAttribute("error", "Malformed spam filter pattern " + pattern);
369 }
370 }
371 }
372
373 return compiledpatterns;
374 }
375
376 /**
377 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
378 *
379 * @param list
380 * @return The parsed blacklist patterns.
381 */
382 private Collection< Pattern > parseBlacklist( String list ) {
383 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >();
384
385 if( list != null ) {
386 try {
387 BufferedReader in = new BufferedReader( new StringReader(list) );
388 String line;
389 while( (line = in.readLine() ) != null ) {
390 line = line.trim();
391 if( line.length() == 0 ) continue; // Empty line
392 if( line.startsWith("#") ) continue; // It's a comment
393
394 int ws = line.indexOf( ' ' );
395 if( ws == -1 ) ws = line.indexOf( '\t' );
396 if( ws != -1 ) line = line.substring( 0, ws );
397
398 try {
399 compiledpatterns.add( m_compiler.compile( line ) );
400 } catch( MalformedPatternException e ) {
401 log.debug( "Malformed spam filter pattern " + line );
402 }
403 }
404 } catch( IOException e ) {
405 log.info( "Could not read patterns; returning what I got" , e );
406 }
407 }
408
409 return compiledpatterns;
410 }
411
412 /**
413 * Takes a single page change and performs a load of tests on the content change.
414 * An admin can modify anything.
415 *
416 * @param context
417 * @param content
418 * @throws RedirectException
419 */
420 private synchronized void checkSinglePageChange( WikiContext context, String content, Change change )
421 throws RedirectException {
422 HttpServletRequest req = context.getHttpRequest();
423
424 if( req != null ) {
425 String addr = HttpUtil.getRemoteAddress( req );
426 int hostCounter = 0;
427 int changeCounter = 0;
428
429 log.debug( "Change is " + change.m_change );
430
431 long time = System.currentTimeMillis() - 60*1000L; // 1 minute
432
433 for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
434 Host host = i.next();
435
436 //
437 // Check if this item is invalid
438 //
439 if( host.getAddedTime() < time ) {
440 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
441 i.remove();
442 continue;
443 }
444
445 //
446 // Check if this IP address has been seen before
447 //
448
449 if( host.getAddress().equals( addr ) ) {
450 hostCounter++;
451 }
452
453 //
454 // Check, if this change has been seen before
455 //
456
457 if( host.getChange() != null && host.getChange().equals( change ) ) {
458 changeCounter++;
459 }
460 }
461
462 //
463 // Now, let's check against the limits.
464 //
465 if( hostCounter >= m_limitSinglePageChanges ) {
466 Host host = new Host( addr, null );
467 m_temporaryBanList.add( host );
468
469 String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
470 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
471 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
472 }
473
474 if( changeCounter >= m_limitSimilarChanges ) {
475 Host host = new Host( addr, null );
476 m_temporaryBanList.add( host );
477
478 String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
479 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
480 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
481 }
482
483 //
484 // Calculate the number of links in the addition.
485 //
486 String tstChange = change.toString();
487 int urlCounter = 0;
488 while( m_matcher.contains( tstChange,m_urlPattern ) ) {
489 MatchResult m = m_matcher.getMatch();
490 tstChange = tstChange.substring( m.endOffset(0) );
491 urlCounter++;
492 }
493
494 if( urlCounter > m_maxUrls ) {
495 Host host = new Host( addr, null );
496 m_temporaryBanList.add( host );
497
498 String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
499 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
500 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
501 }
502
503 //
504 // Check bot trap
505 //
506 checkBotTrap( context, change );
507
508 //
509 // Check UTF-8 mangling
510 //
511 checkUTF8( context, change );
512
513 //
514 // Do Akismet check. This is good to be the last, because this is the most
515 // expensive operation.
516 //
517 checkAkismet( context, change );
518
519 m_lastModifications.add( new Host( addr, change ) );
520 }
521 }
522
523
524 /**
525 * Checks against the akismet system.
526 *
527 * @param context
528 * @param change
529 * @throws RedirectException
530 */
531 private void checkAkismet( WikiContext context, Change change ) throws RedirectException {
532 if( m_akismetAPIKey != null ) {
533 if( m_akismet == null ) {
534 log.info( "Initializing Akismet spam protection." );
535 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
536
537 if( !m_akismet.verifyAPIKey() ) {
538 log.error( "Akismet API key cannot be verified. Please check your config." );
539 m_akismetAPIKey = null;
540 m_akismet = null;
541 }
542 }
543
544 HttpServletRequest req = context.getHttpRequest();
545
546 //
547 // Akismet will mark all empty statements as spam, so we'll just
548 // ignore them.
549 //
550 if( change.m_adds == 0 && change.m_removals > 0 ) {
551 return;
552 }
553
554 if( req != null && m_akismet != null ) {
555 log.debug( "Calling Akismet to check for spam..." );
556
557 StopWatch sw = new StopWatch();
558 sw.start();
559
560 String ipAddress = HttpUtil.getRemoteAddress( req );
561 String userAgent = req.getHeader( "User-Agent" );
562 String referrer = req.getHeader( "Referer");
563 String permalink = context.getViewURL( context.getPage().getName() );
564 String commentType = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit";
565 String commentAuthor = context.getCurrentUser().getName();
566 String commentAuthorEmail = null;
567 String commentAuthorURL = null;
568
569 boolean isSpam = m_akismet.commentCheck( ipAddress,
570 userAgent,
571 referrer,
572 permalink,
573 commentType,
574 commentAuthor,
575 commentAuthorEmail,
576 commentAuthorURL,
577 change.toString(),
578 null );
579
580 sw.stop();
581 log.debug( "Akismet request done in: " + sw );
582
583 if( isSpam ) {
584 // Host host = new Host( ipAddress, null );
585 // m_temporaryBanList.add( host );
586
587 String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
588 log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
589 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
590 }
591 }
592 }
593 }
594
595 /**
596 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
597 *
598 * @return A string
599 */
600 public static String getBotFieldName() {
601 return "submit_auth";
602 }
603
604 /**
605 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
606 *
607 * @param context
608 * @param change
609 * @throws RedirectException
610 */
611 private void checkBotTrap( WikiContext context, Change change ) throws RedirectException {
612 HttpServletRequest request = context.getHttpRequest();
613
614 if( request != null ) {
615 String unspam = request.getParameter( getBotFieldName() );
616 if( unspam != null && unspam.length() > 0 ) {
617 String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
618
619 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." );
620 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
621 }
622 }
623 }
624
625 private void checkUTF8( WikiContext context, Change change ) throws RedirectException {
626 HttpServletRequest request = context.getHttpRequest();
627
628 if( request != null ) {
629 String utf8field = request.getParameter( "encodingcheck" );
630
631 if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
632 String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
633
634 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." );
635 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
636 }
637 }
638 }
639
640 /** Goes through the ban list and cleans away any host which has expired from it. */
641 private synchronized void cleanBanList() {
642 long now = System.currentTimeMillis();
643
644 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
645 Host host = i.next();
646
647 if( host.getReleaseTime() < now ) {
648 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
649 i.remove();
650 }
651 }
652 }
653
654 /**
655 * Checks the ban list if the IP address of the changer is already on it.
656 *
657 * @param context
658 * @throws RedirectException
659 */
660 private void checkBanList( WikiContext context, Change change ) throws RedirectException {
661 HttpServletRequest req = context.getHttpRequest();
662
663 if( req != null ) {
664 String remote = HttpUtil.getRemoteAddress(req);
665 long now = System.currentTimeMillis();
666
667 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
668 Host host = i.next();
669
670 if( host.getAddress().equals( remote ) ) {
671 long timeleft = ( host.getReleaseTime() - now ) / 1000L;
672
673 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
674 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
675 }
676 }
677 }
678 }
679
680 /**
681 * If the spam filter notices changes in the black list page, it will refresh them automatically.
682 *
683 * @param context
684 */
685 private void refreshBlacklists( WikiContext context ) {
686 try {
687 WikiPage source = context.getEngine().getPage( m_forbiddenWordsPage );
688 Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist );
689
690 boolean rebuild = false;
691
692 //
693 // Rebuild, if the page or the attachment has changed since.
694 //
695 if( source != null ) {
696 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || source.getLastModified().after( m_lastRebuild ) ) {
697 rebuild = true;
698 }
699 }
700
701 if( att != null ) {
702 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
703 rebuild = true;
704 }
705 }
706
707 //
708 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete
709 // filter list regardless of what changed.
710 //
711 if( rebuild ) {
712 m_lastRebuild = new Date();
713 m_spamPatterns = parseWordList( source,
714 ( source != null ) ? ( String )source.getAttribute( LISTVAR ) : null );
715
716 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
717
718 if( att != null ) {
719 InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att);
720 StringWriter out = new StringWriter();
721 FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out );
722 Collection< Pattern > blackList = parseBlacklist( out.toString() );
723 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
724 m_spamPatterns.addAll( blackList );
725 }
726 }
727 } catch( IOException ex ) {
728 log.info( "Unable to read attachment data, continuing...", ex );
729 } catch( ProviderException ex ) {
730 log.info( "Failed to read spam filter attachment, continuing...", ex );
731 }
732 }
733
734 /**
735 * Does a check against a known pattern list.
736 *
737 * @param context
738 * @param content
739 * @param change
740 * @throws RedirectException
741 */
742 private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException {
743 //
744 // If we have no spam patterns defined, or we're trying to save
745 // the page containing the patterns, just return.
746 //
747 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
748 return;
749 }
750
751 String ch = change.toString();
752 if( context.getHttpRequest() != null ) {
753 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
754 }
755
756 for( Pattern p : m_spamPatterns ) {
757 // log.debug("Attempting to match page contents with "+p.getPattern());
758
759 if( m_matcher.contains( ch, p ) ) {
760 //
761 // Spam filter has a match.
762 //
763 String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
764
765 log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
766 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
767 }
768 }
769 }
770
771 private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException {
772 Change c = new Change();
773 c.m_change = change;
774 checkPatternList( context, content, c );
775 }
776
777 /**
778 * Creates a simple text string describing the added content.
779 *
780 * @param context
781 * @param newText
782 * @return Empty string, if there is no change.
783 */
784 private static Change getChange( WikiContext context, String newText ) {
785 WikiPage page = context.getPage();
786 StringBuffer change = new StringBuffer();
787 WikiEngine engine = context.getEngine();
788 // Get current page version
789
790 Change ch = new Change();
791
792 try {
793 String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION );
794
795 String[] first = Diff.stringToArray( oldText );
796 String[] second = Diff.stringToArray( newText );
797 Revision rev = Diff.diff( first, second, new MyersDiff() );
798
799 if( rev == null || rev.size() == 0 ) {
800 return ch;
801 }
802
803 for( int i = 0; i < rev.size(); i++ ) {
804 Delta d = rev.getDelta( i );
805
806 if( d instanceof AddDelta ) {
807 d.getRevised().toString( change, "", "\r\n" );
808 ch.m_adds++;
809
810 } else if( d instanceof ChangeDelta ) {
811 d.getRevised().toString( change, "", "\r\n" );
812 ch.m_adds++;
813
814 } else if( d instanceof DeleteDelta ) {
815 ch.m_removals++;
816 }
817 }
818 } catch( DifferentiationFailedException e ) {
819 log.error( "Diff failed", e );
820 }
821
822 //
823 // Don't forget to include the change note, too
824 //
825 String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE );
826
827 if( changeNote != null ) {
828 change.append( "\r\n" );
829 change.append( changeNote );
830 }
831
832 //
833 // And author as well
834 //
835 if( page.getAuthor() != null ) {
836 change.append( "\r\n" + page.getAuthor() );
837 }
838
839 ch.m_change = change.toString();
840 return ch;
841 }
842
843 /**
844 * Returns true, if this user should be ignored. For example, admin users.
845 *
846 * @param context
847 * @return True, if this users should be ignored.
848 */
849 private boolean ignoreThisUser( WikiContext context ) {
850 if( context.hasAdminPermissions() ) {
851 return true;
852 }
853
854 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
855 return true;
856 }
857
858 if( context.getVariable( "captcha" ) != null ) {
859 return true;
860 }
861
862 return false;
863 }
864
865 /**
866 * Returns a random string of six uppercase characters.
867 *
868 * @return A random string
869 */
870 private static String getUniqueID() {
871 StringBuilder sb = new StringBuilder();
872 Random rand = new Random();
873
874 for( int i = 0; i < 6; i++ ) {
875 char x = ( char )( 'A' + rand.nextInt( 26 ) );
876 sb.append( x );
877 }
878
879 return sb.toString();
880 }
881
882 /**
883 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
884 *
885 * @param ctx WikiContext
886 * @return An URL to redirect to
887 */
888 private String getRedirectPage( WikiContext ctx ) {
889 if( m_useCaptcha ) {
890 return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) );
891 }
892
893 return ctx.getURL( WikiContext.VIEW, m_errorPage );
894 }
895
896 /**
897 * Checks whether the UserProfile matches certain checks.
898 *
899 * @param profile The profile to check
900 * @param context The WikiContext
901 * @return False, if this userprofile is suspect and should not be allowed to be added.
902 * @since 2.6.1
903 */
904 public boolean isValidUserProfile( WikiContext context, UserProfile profile ) {
905 try {
906 checkPatternList( context, profile.getEmail(), profile.getEmail() );
907 checkPatternList( context, profile.getFullname(), profile.getFullname() );
908 checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
909 } catch( RedirectException e ) {
910 log.info("Detected attempt to create a spammer user account (see above for rejection reason)");
911 return false;
912 }
913
914 return true;
915 }
916
917 /**
918 * This method is used to calculate an unique code when submitting the page to detect edit conflicts.
919 * It currently incorporates the last-modified date of the page, and the IP address of the submitter.
920 *
921 * @param page The WikiPage under edit
922 * @param request The HTTP Request
923 * @since 2.6
924 * @return A hash value for this page and session
925 */
926 public static final String getSpamHash( WikiPage page, HttpServletRequest request ) {
927 long lastModified = 0;
928
929 if( page.getLastModified() != null ) {
930 lastModified = page.getLastModified().getTime();
931 }
932 long remote = HttpUtil.getRemoteAddress( request ).hashCode();
933
934 return Long.toString( lastModified ^ remote );
935 }
936
937 /**
938 * Returns the name of the hash field to be used in this request. The value is unique per session, and once
939 * the session has expired, you cannot edit anymore.
940 *
941 * @param request The page request
942 * @return The name to be used in the hash field
943 * @since 2.6
944 */
945 public static final String getHashFieldName( HttpServletRequest request ) {
946 String hash = null;
947
948 if( request.getSession() != null ) {
949 hash = ( String )request.getSession().getAttribute( "_hash" );
950
951 if( hash == null ) {
952 hash = c_hashName;
953 request.getSession().setAttribute( "_hash", hash );
954 }
955 }
956
957 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
958 c_hashName = getUniqueID().toLowerCase();
959 c_lastUpdate = System.currentTimeMillis();
960 }
961
962 return hash != null ? hash : c_hashName;
963 }
964
965
966 /**
967 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases:
968 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long,
969 * and their session has expired.
970 * <p>
971 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in
972 * the spam log (it may or may not be spam, but it's rather likely that it is).
973 *
974 * @param context The WikiContext
975 * @param pageContext The JSP PageContext.
976 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect.
977 * @throws IOException If redirection fails
978 * @since 2.6
979 */
980 public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException {
981 String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
982
983 if( pageContext.getRequest().getParameter(hashName) == null ) {
984 if( pageContext.getAttribute( hashName ) == null ) {
985 Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
986 log( context, REJECT, "MissingHash", change.m_change );
987
988 String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" );
989 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
990 return false;
991 }
992 }
993
994 return true;
995 }
996
997 /**
998 * This helper method adds all the input fields to your editor that the SpamFilter requires
999 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1000 *
1001 * @param pageContext The PageContext
1002 * @return A HTML string which contains input fields for the SpamFilter.
1003 */
1004 public static final String insertInputFields( PageContext pageContext ) {
1005 WikiContext ctx = WikiContext.findContext( pageContext );
1006 WikiEngine engine = ctx.getEngine();
1007
1008 StringBuilder sb = new StringBuilder();
1009 if( engine.getContentEncoding().equals( "UTF-8" ) ) {
1010 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1011 }
1012
1013 return sb.toString();
1014 }
1015
1016 /**
1017 * A local class for storing host information.
1018 *
1019 * @since
1020 */
1021 private class Host {
1022
1023 private long m_addedTime = System.currentTimeMillis();
1024 private long m_releaseTime;
1025 private String m_address;
1026 private Change m_change;
1027
1028 public String getAddress() {
1029 return m_address;
1030 }
1031
1032 public long getReleaseTime() {
1033 return m_releaseTime;
1034 }
1035
1036 public long getAddedTime() {
1037 return m_addedTime;
1038 }
1039
1040 public Change getChange() {
1041 return m_change;
1042 }
1043
1044 public Host( String ipaddress, Change change ) {
1045 m_address = ipaddress;
1046 m_change = change;
1047 m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1048 }
1049
1050 }
1051
1052 private static class Change {
1053
1054 public String m_change;
1055 public int m_adds;
1056 public int m_removals;
1057
1058 public String toString() {
1059 return m_change;
1060 }
1061
1062 public boolean equals( Object o ) {
1063 if( o instanceof Change ) {
1064 return m_change.equals( ( ( Change )o ).m_change );
1065 }
1066 return false;
1067 }
1068
1069 public int hashCode() {
1070 return m_change.hashCode() + 17;
1071 }
1072
1073 }
1074
1075 }