001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.StringReader; 026import java.io.StringWriter; 027import java.util.ArrayList; 028import java.util.Collection; 029import java.util.Date; 030import java.util.Iterator; 031import java.util.Properties; 032import java.util.Random; 033import java.util.StringTokenizer; 034import java.util.Vector; 035 036import javax.servlet.http.HttpServletRequest; 037import javax.servlet.http.HttpServletResponse; 038import javax.servlet.jsp.PageContext; 039 040import org.apache.commons.lang.time.StopWatch; 041import org.apache.log4j.Logger; 042import org.apache.oro.text.regex.MalformedPatternException; 043import org.apache.oro.text.regex.MatchResult; 044import org.apache.oro.text.regex.Pattern; 045import org.apache.oro.text.regex.PatternCompiler; 046import org.apache.oro.text.regex.PatternMatcher; 047import org.apache.oro.text.regex.Perl5Compiler; 048import org.apache.oro.text.regex.Perl5Matcher; 049import org.apache.wiki.InternalWikiException; 050import org.apache.wiki.WikiContext; 051import 
org.apache.wiki.WikiEngine; 052import org.apache.wiki.WikiPage; 053import org.apache.wiki.WikiProvider; 054import org.apache.wiki.api.exceptions.ProviderException; 055import org.apache.wiki.api.exceptions.RedirectException; 056import org.apache.wiki.api.filters.BasicPageFilter; 057import org.apache.wiki.attachment.Attachment; 058import org.apache.wiki.auth.user.UserProfile; 059import org.apache.wiki.ui.EditorManager; 060import org.apache.wiki.util.FileUtil; 061import org.apache.wiki.util.HttpUtil; 062import org.apache.wiki.util.TextUtil; 063import org.suigeneris.jrcs.diff.Diff; 064import org.suigeneris.jrcs.diff.DifferentiationFailedException; 065import org.suigeneris.jrcs.diff.Revision; 066import org.suigeneris.jrcs.diff.delta.AddDelta; 067import org.suigeneris.jrcs.diff.delta.ChangeDelta; 068import org.suigeneris.jrcs.diff.delta.DeleteDelta; 069import org.suigeneris.jrcs.diff.delta.Delta; 070import org.suigeneris.jrcs.diff.myers.MyersDiff; 071 072import net.sf.akismet.Akismet; 073 074 075/** 076 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 077 * 078 * Parameters: 079 * <ul> 080 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 081 * that page. Default is "SpamFilterWordList". 082 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 083 * that page. Default is "SpamFilterIPList". 084 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is 085 * "SpamFilterWordList/blacklist.txt"</li> 086 * <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage". 087 * <li>pagechangesinminute - How many page changes are allowed/minute. Default is 5.</li> 088 * <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. 
(since 2.4.72)</li>
 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 *    <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 *                   match, and won't consider any other tests. This is the default, as it's considerably lighter.
 *                   If set to "score", will go through all of the tests and calculates a score for the spam,
 *                   which is then compared to a filter level value.</li>
 *  </ul>
 *
 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
 *  with the editor system.</p>
 *
 *  <p>Changes by admin users are ignored in any case.</p>
 *
 *  @since 2.1.112
 */
public class SpamFilter extends BasicPageFilter {

    // Name of the WikiContext variable in which the accumulated spam score is kept ("score" strategy).
    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";

    // Reason codes; they are passed as the "source" field of the spam log records.
    private static final String REASON_REGEXP = "Regexp";
    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
    private static final String REASON_BOT_TRAP = "BotTrap";
    private static final String REASON_AKISMET = "Akismet";
    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
    private static final String REASON_UTF8_TRAP = "UTF8Trap";

    // Page attribute names from which the spam word list and the IP list are read.
    private static final String LISTVAR = "spamwords";
    private static final String LISTIPVAR = "ips";

    /** The filter property name for specifying the page which contains the list of spamwords.
     *  Value is <tt>{@value}</tt>. */
    public static final String PROP_WORDLIST = "wordlist";

    /** The filter property name for specifying the page which contains the list of IPs to ban.
     *  Value is <tt>{@value}</tt>. */
    public static final String PROP_IPLIST = "IPlist";

    /** The filter property name for the page to which you are directed if Herb rejects your
     *  edit. Value is <tt>{@value}</tt>. */
    public static final String PROP_ERRORPAGE = "errorpage";

    /** The filter property name for specifying how many changes is any given IP address
     *  allowed to do per minute. Value is <tt>{@value}</tt>.
     */
    public static final String PROP_PAGECHANGES = "pagechangesinminute";

    /** The filter property name for specifying how many similar changes are allowed
     *  before a host is banned. Value is <tt>{@value}</tt>.
     */
    public static final String PROP_SIMILARCHANGES = "similarchanges";

    /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/
    public static final String PROP_BANTIME = "bantime";

    /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/
    public static final String PROP_BLACKLIST = "blacklist";

    /** The filter property name for specifying how many URLs can any given edit contain.
     *  Value is <tt>{@value}</tt> */
    public static final String PROP_MAXURLS = "maxurls";

    /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */
    public static final String PROP_AKISMET_API_KEY = "akismet-apikey";

    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
    public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated";

    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
    public static final String PROP_CAPTCHA = "captcha";

    /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */
    public static final String PROP_FILTERSTRATEGY = "strategy";

    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
    public static final String STRATEGY_EAGER = "eager";

    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
    public static final String STRATEGY_SCORE = "score";

    // Regexp used to count the links (http, https and mailto) contained in a change.
    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";

    // Defaults for the configurable page and attachment names; overridden in initialize().
    private String m_forbiddenWordsPage = "SpamFilterWordList";
    private String m_forbiddenIPsPage = "SpamFilterIPList";
    private String m_errorPage = "RejectedMessage";
    private String m_blacklist = "SpamFilterWordList/blacklist.txt";

    // Jakarta ORO matcher and compiler shared by all pattern checks of this filter instance.
    private PatternMatcher m_matcher = new Perl5Matcher();
    private PatternCompiler m_compiler = new Perl5Compiler();

    // Compiled spam word and IP patterns; null until the lists have been built for the first time.
    private Collection<Pattern> m_spamPatterns = null;
    private Collection<Pattern> m_IPPatterns = null;

    // Time of the last pattern rebuild; the epoch start means "never".
    private Date m_lastRebuild = new Date( 0L );

    // "SpamLog" receives one record per accept/reject decision; "log" is the ordinary class logger.
    private static Logger c_spamlog = Logger.getLogger( "SpamLog" );
    private static Logger log = Logger.getLogger( SpamFilter.class );


    // Hosts which are currently banned temporarily.
    private Vector<Host> m_temporaryBanList = new Vector<Host>();

    private int m_banTime = 60; // minutes

    // Recent modifications (host and change); entries older than one minute are dropped on each check.
    private Vector<Host> m_lastModifications = new Vector<Host>();

    /**
     *  How many times a single IP address can change a page per minute?
     */
    private int m_limitSinglePageChanges = 5;

    /**
     *  How many times can you add the exact same string to a page?
204 */ 205 private int m_limitSimilarChanges = 2; 206 207 /** 208 * How many URLs can be added at maximum. 209 */ 210 private int m_maxUrls = 10; 211 212 private Pattern m_urlPattern; 213 private Akismet m_akismet; 214 215 private String m_akismetAPIKey = null; 216 217 private boolean m_useCaptcha = false; 218 219 /** The limit at which we consider something to be spam. */ 220 private int m_scoreLimit = 1; 221 222 /** 223 * If set to true, will ignore anyone who is in Authenticated role. 224 */ 225 private boolean m_ignoreAuthenticated = false; 226 227 private boolean m_stopAtFirstMatch = true; 228 229 private static String c_hashName; 230 private static long c_lastUpdate; 231 232 /** The HASH_DELAY value is a maximum amount of time that an user can keep 233 * a session open, because after the value has expired, we will invent a new 234 * hash field name. By default this is {@value} hours, which should be ample 235 * time for someone. 236 */ 237 private static final long HASH_DELAY = 24; 238 239 240 /** 241 * {@inheritDoc} 242 */ 243 @Override 244 public void initialize( WikiEngine engine, Properties properties ) { 245 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 246 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 247 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 248 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, 249 PROP_PAGECHANGES, 250 m_limitSinglePageChanges ); 251 252 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, 253 PROP_SIMILARCHANGES, 254 m_limitSimilarChanges ); 255 256 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 257 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 258 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 259 260 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, 261 PROP_IGNORE_AUTHENTICATED, 262 
m_ignoreAuthenticated ); 263 264 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra"); 265 266 try { 267 m_urlPattern = m_compiler.compile( URL_REGEXP ); 268 } catch( MalformedPatternException e ) { 269 log.fatal( "Internal error: Someone put in a faulty pattern.", e ); 270 throw new InternalWikiException( "Faulty pattern." ); 271 } 272 273 m_akismetAPIKey = TextUtil.getStringProperty( properties, 274 PROP_AKISMET_API_KEY, 275 m_akismetAPIKey ); 276 277 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, 278 PROP_FILTERSTRATEGY, 279 STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 280 281 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 282 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 283 284 285 } 286 287 private static final int REJECT = 0; 288 private static final int ACCEPT = 1; 289 private static final int NOTE = 2; 290 291 private static String log( WikiContext ctx, int type, String source, String message ) { 292 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 293 message = TextUtil.replaceString( message, "\"", "\\\"" ); 294 295 String uid = getUniqueID(); 296 297 String page = ctx.getPage().getName(); 298 String reason = "UNKNOWN"; 299 String addr = ctx.getHttpRequest() != null ? 
HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 300 301 switch( type ) { 302 case REJECT: 303 reason = "REJECTED"; 304 break; 305 case ACCEPT: 306 reason = "ACCEPTED"; 307 break; 308 case NOTE: 309 reason = "NOTE"; 310 break; 311 default: 312 throw new InternalWikiException( "Illegal type " + type ); 313 } 314 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 315 316 return uid; 317 } 318 319 /** {@inheritDoc} */ 320 public String preSave( WikiContext context, String content ) throws RedirectException { 321 cleanBanList(); 322 refreshBlacklists( context ); 323 Change change = getChange( context, content ); 324 325 if( !ignoreThisUser( context ) ) { 326 checkBanList( context, change ); 327 checkSinglePageChange( context, content, change ); 328 checkIPList( context ); 329 checkPatternList( context, content, change ); 330 } 331 332 if( !m_stopAtFirstMatch ) { 333 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 334 335 if( score != null && score.intValue() >= m_scoreLimit ) { 336 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 337 } 338 } 339 340 log( context, ACCEPT, "-", change.toString() ); 341 return content; 342 } 343 344 private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException { 345 if( m_stopAtFirstMatch ) { 346 throw new RedirectException( message, getRedirectPage( context ) ); 347 } 348 349 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 350 if( score != null ) { 351 score = score + 1; 352 } else { 353 score = 1; 354 } 355 356 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 357 } 358 359 /** 360 * Parses a list of patterns and returns a Collection of compiled Pattern 361 * objects. 362 * 363 * @param source 364 * @param list 365 * @return A Collection of the Patterns that were found from the lists. 
366 */ 367 private Collection< Pattern > parseWordList( WikiPage source, String list ) { 368 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 369 370 if( list != null ) { 371 StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 372 373 while( tok.hasMoreTokens() ) { 374 String pattern = tok.nextToken(); 375 376 try { 377 compiledpatterns.add( m_compiler.compile( pattern ) ); 378 } catch( MalformedPatternException e ) { 379 log.debug( "Malformed spam filter pattern " + pattern ); 380 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 381 } 382 } 383 } 384 385 return compiledpatterns; 386 } 387 388 /** 389 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 390 * 391 * @param list 392 * @return The parsed blacklist patterns. 393 */ 394 private Collection< Pattern > parseBlacklist( String list ) { 395 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 396 397 if( list != null ) { 398 try { 399 BufferedReader in = new BufferedReader( new StringReader(list) ); 400 String line; 401 while( (line = in.readLine() ) != null ) { 402 line = line.trim(); 403 if( line.length() == 0 ) continue; // Empty line 404 if( line.startsWith("#") ) continue; // It's a comment 405 406 int ws = line.indexOf( ' ' ); 407 if( ws == -1 ) ws = line.indexOf( '\t' ); 408 if( ws != -1 ) line = line.substring( 0, ws ); 409 410 try { 411 compiledpatterns.add( m_compiler.compile( line ) ); 412 } catch( MalformedPatternException e ) { 413 log.debug( "Malformed spam filter pattern " + line ); 414 } 415 } 416 } catch( IOException e ) { 417 log.info( "Could not read patterns; returning what I got" , e ); 418 } 419 } 420 421 return compiledpatterns; 422 } 423 424 /** 425 * Takes a single page change and performs a load of tests on the content change. 426 * An admin can modify anything. 
427 * 428 * @param context 429 * @param content 430 * @throws RedirectException 431 */ 432 private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 433 throws RedirectException { 434 HttpServletRequest req = context.getHttpRequest(); 435 436 if( req != null ) { 437 String addr = HttpUtil.getRemoteAddress( req ); 438 int hostCounter = 0; 439 int changeCounter = 0; 440 441 log.debug( "Change is " + change.m_change ); 442 443 long time = System.currentTimeMillis() - 60*1000L; // 1 minute 444 445 for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 446 Host host = i.next(); 447 448 // 449 // Check if this item is invalid 450 // 451 if( host.getAddedTime() < time ) { 452 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 453 i.remove(); 454 continue; 455 } 456 457 // 458 // Check if this IP address has been seen before 459 // 460 461 if( host.getAddress().equals( addr ) ) { 462 hostCounter++; 463 } 464 465 // 466 // Check, if this change has been seen before 467 // 468 469 if( host.getChange() != null && host.getChange().equals( change ) ) { 470 changeCounter++; 471 } 472 } 473 474 // 475 // Now, let's check against the limits. 476 // 477 if( hostCounter >= m_limitSinglePageChanges ) { 478 Host host = new Host( addr, null ); 479 m_temporaryBanList.add( host ); 480 481 String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 482 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 483 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! 
(Incident code " + uid + ")" ); 484 } 485 486 if( changeCounter >= m_limitSimilarChanges ) { 487 Host host = new Host( addr, null ); 488 m_temporaryBanList.add( host ); 489 490 String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 491 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 492 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 493 } 494 495 // 496 // Calculate the number of links in the addition. 497 // 498 String tstChange = change.toString(); 499 int urlCounter = 0; 500 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 501 MatchResult m = m_matcher.getMatch(); 502 tstChange = tstChange.substring( m.endOffset(0) ); 503 urlCounter++; 504 } 505 506 if( urlCounter > m_maxUrls ) { 507 Host host = new Host( addr, null ); 508 m_temporaryBanList.add( host ); 509 510 String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 511 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 512 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 513 } 514 515 // 516 // Check bot trap 517 // 518 checkBotTrap( context, change ); 519 520 // 521 // Check UTF-8 mangling 522 // 523 checkUTF8( context, change ); 524 525 // 526 // Do Akismet check. This is good to be the last, because this is the most 527 // expensive operation. 528 // 529 checkAkismet( context, change ); 530 531 m_lastModifications.add( new Host( addr, change ) ); 532 } 533 } 534 535 536 /** 537 * Checks against the akismet system. 
538 * 539 * @param context 540 * @param change 541 * @throws RedirectException 542 */ 543 private void checkAkismet( WikiContext context, Change change ) throws RedirectException { 544 if( m_akismetAPIKey != null ) { 545 if( m_akismet == null ) { 546 log.info( "Initializing Akismet spam protection." ); 547 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 548 549 if( !m_akismet.verifyAPIKey() ) { 550 log.error( "Akismet API key cannot be verified. Please check your config." ); 551 m_akismetAPIKey = null; 552 m_akismet = null; 553 } 554 } 555 556 HttpServletRequest req = context.getHttpRequest(); 557 558 // 559 // Akismet will mark all empty statements as spam, so we'll just 560 // ignore them. 561 // 562 if( change.m_adds == 0 && change.m_removals > 0 ) { 563 return; 564 } 565 566 if( req != null && m_akismet != null ) { 567 log.debug( "Calling Akismet to check for spam..." ); 568 569 StopWatch sw = new StopWatch(); 570 sw.start(); 571 572 String ipAddress = HttpUtil.getRemoteAddress( req ); 573 String userAgent = req.getHeader( "User-Agent" ); 574 String referrer = req.getHeader( "Referer"); 575 String permalink = context.getViewURL( context.getPage().getName() ); 576 String commentType = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit"; 577 String commentAuthor = context.getCurrentUser().getName(); 578 String commentAuthorEmail = null; 579 String commentAuthorURL = null; 580 581 boolean isSpam = m_akismet.commentCheck( ipAddress, 582 userAgent, 583 referrer, 584 permalink, 585 commentType, 586 commentAuthor, 587 commentAuthorEmail, 588 commentAuthorURL, 589 change.toString(), 590 null ); 591 592 sw.stop(); 593 log.debug( "Akismet request done in: " + sw ); 594 595 if( isSpam ) { 596 // Host host = new Host( ipAddress, null ); 597 // m_temporaryBanList.add( host ); 598 599 String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 600 log.info( "SPAM:Akismet (" + uid + "). 
Akismet thinks this change is spam; added host to temporary ban list." ); 601 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 602 } 603 } 604 } 605 } 606 607 /** 608 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 609 * 610 * @return A string 611 */ 612 public static String getBotFieldName() { 613 return "submit_auth"; 614 } 615 616 /** 617 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 618 * 619 * @param context 620 * @param change 621 * @throws RedirectException 622 */ 623 private void checkBotTrap( WikiContext context, Change change ) throws RedirectException { 624 HttpServletRequest request = context.getHttpRequest(); 625 626 if( request != null ) { 627 String unspam = request.getParameter( getBotFieldName() ); 628 if( unspam != null && unspam.length() > 0 ) { 629 String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 630 631 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 632 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 633 } 634 } 635 } 636 637 private void checkUTF8( WikiContext context, Change change ) throws RedirectException { 638 HttpServletRequest request = context.getHttpRequest(); 639 640 if( request != null ) { 641 String utf8field = request.getParameter( "encodingcheck" ); 642 643 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 644 String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 645 646 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 647 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 648 } 649 } 650 } 651 652 /** Goes through the ban list and cleans away any host which has expired from it. 
*/ 653 private synchronized void cleanBanList() { 654 long now = System.currentTimeMillis(); 655 656 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 657 Host host = i.next(); 658 659 if( host.getReleaseTime() < now ) { 660 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 661 i.remove(); 662 } 663 } 664 } 665 666 /** 667 * Checks the ban list if the IP address of the changer is already on it. 668 * 669 * @param context 670 * @throws RedirectException 671 */ 672 private void checkBanList( WikiContext context, Change change ) throws RedirectException { 673 HttpServletRequest req = context.getHttpRequest(); 674 675 if( req != null ) { 676 String remote = HttpUtil.getRemoteAddress(req); 677 long now = System.currentTimeMillis(); 678 679 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 680 Host host = i.next(); 681 682 if( host.getAddress().equals( remote ) ) { 683 long timeleft = ( host.getReleaseTime() - now ) / 1000L; 684 685 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 686 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 687 } 688 } 689 } 690 } 691 692 /** 693 * If the spam filter notices changes in the black list page, it will refresh them automatically. 694 * 695 * @param context 696 */ 697 private void refreshBlacklists( WikiContext context ) { 698 try { 699 700 boolean rebuild = false; 701 702 // 703 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 
704 // 705 WikiPage sourceSpam = context.getEngine().getPage( m_forbiddenWordsPage ); 706 if( sourceSpam != null ) { 707 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 708 rebuild = true; 709 } 710 } 711 712 Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist ); 713 if( att != null ) { 714 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 715 rebuild = true; 716 } 717 } 718 719 WikiPage sourceIPs = context.getEngine().getPage( m_forbiddenIPsPage ); 720 if( sourceIPs != null ) { 721 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 722 rebuild = true; 723 } 724 } 725 726 // 727 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete 728 // filter list regardless of what changed. 729 // 730 if( rebuild ) { 731 m_lastRebuild = new Date(); 732 m_spamPatterns = parseWordList( sourceSpam, 733 ( sourceSpam != null ) ? ( String )sourceSpam.getAttribute( LISTVAR ) : null ); 734 735 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 736 737 m_IPPatterns = parseWordList( sourceIPs, 738 ( sourceIPs != null ) ? 
( String )sourceIPs.getAttribute( LISTIPVAR ) : null ); 739 log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 740 741 if( att != null ) { 742 InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att); 743 StringWriter out = new StringWriter(); 744 FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out ); 745 Collection< Pattern > blackList = parseBlacklist( out.toString() ); 746 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 747 m_spamPatterns.addAll( blackList ); 748 } 749 } 750 } catch( IOException ex ) { 751 log.info( "Unable to read attachment data, continuing...", ex ); 752 } catch( ProviderException ex ) { 753 log.info( "Failed to read spam filter attachment, continuing...", ex ); 754 } 755 } 756 757 /** 758 * Does a check against a known pattern list. 759 * 760 * @param context 761 * @param content 762 * @param change 763 * @throws RedirectException 764 */ 765 private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException { 766 // 767 // If we have no spam patterns defined, or we're trying to save 768 // the page containing the patterns, just return. 769 // 770 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 771 return; 772 } 773 774 String ch = change.toString(); 775 if( context.getHttpRequest() != null ) { 776 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 777 } 778 779 for( Pattern p : m_spamPatterns ) { 780 // log.debug("Attempting to match page contents with "+p.getPattern()); 781 782 if( m_matcher.contains( ch, p ) ) { 783 // 784 // Spam filter has a match. 785 // 786 String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 787 788 log.info( "SPAM:Regexp (" + uid + "). 
Content matches the spam filter '" + p.getPattern() + "'" ); 789 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 790 } 791 } 792 } 793 794 795 /** 796 * Does a check against a pattern list of IPs. 797 * 798 * @param context 799 * @throws RedirectException 800 */ 801 private void checkIPList( WikiContext context ) throws RedirectException { 802 // 803 // If we have no IP patterns defined, or we're trying to save 804 // the page containing the IP patterns, just return. 805 // 806 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 807 return; 808 } 809 810 String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 811 log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 812 813 for( Pattern p : m_IPPatterns ) { 814 log.debug("Attempting to match remoteIP with " + p.getPattern()); 815 816 if( m_matcher.contains( remoteIP, p ) ) { 817 818 // IP filter has a match. 819 // 820 String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 821 822 log.info( "SPAM:Regexp (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 823 checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 824 } 825 } 826 } 827 828 private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException { 829 Change c = new Change(); 830 c.m_change = change; 831 checkPatternList( context, content, c ); 832 } 833 834 /** 835 * Creates a simple text string describing the added content. 836 * 837 * @param context 838 * @param newText 839 * @return Empty string, if there is no change. 
840 */ 841 private static Change getChange( WikiContext context, String newText ) { 842 WikiPage page = context.getPage(); 843 StringBuffer change = new StringBuffer(); 844 WikiEngine engine = context.getEngine(); 845 // Get current page version 846 847 Change ch = new Change(); 848 849 try { 850 String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 851 852 String[] first = Diff.stringToArray( oldText ); 853 String[] second = Diff.stringToArray( newText ); 854 Revision rev = Diff.diff( first, second, new MyersDiff() ); 855 856 if( rev == null || rev.size() == 0 ) { 857 return ch; 858 } 859 860 for( int i = 0; i < rev.size(); i++ ) { 861 Delta d = rev.getDelta( i ); 862 863 if( d instanceof AddDelta ) { 864 d.getRevised().toString( change, "", "\r\n" ); 865 ch.m_adds++; 866 867 } else if( d instanceof ChangeDelta ) { 868 d.getRevised().toString( change, "", "\r\n" ); 869 ch.m_adds++; 870 871 } else if( d instanceof DeleteDelta ) { 872 ch.m_removals++; 873 } 874 } 875 } catch( DifferentiationFailedException e ) { 876 log.error( "Diff failed", e ); 877 } 878 879 // 880 // Don't forget to include the change note, too 881 // 882 String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE ); 883 884 if( changeNote != null ) { 885 change.append( "\r\n" ); 886 change.append( changeNote ); 887 } 888 889 // 890 // And author as well 891 // 892 if( page.getAuthor() != null ) { 893 change.append( "\r\n" + page.getAuthor() ); 894 } 895 896 ch.m_change = change.toString(); 897 return ch; 898 } 899 900 /** 901 * Returns true, if this user should be ignored. For example, admin users. 902 * 903 * @param context 904 * @return True, if this users should be ignored. 
905 */ 906 private boolean ignoreThisUser( WikiContext context ) { 907 if( context.hasAdminPermissions() ) { 908 return true; 909 } 910 911 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 912 return true; 913 } 914 915 if( context.getVariable( "captcha" ) != null ) { 916 return true; 917 } 918 919 return false; 920 } 921 922 /** 923 * Returns a random string of six uppercase characters. 924 * 925 * @return A random string 926 */ 927 private static String getUniqueID() { 928 StringBuilder sb = new StringBuilder(); 929 Random rand = new Random(); 930 931 for( int i = 0; i < 6; i++ ) { 932 char x = ( char )( 'A' + rand.nextInt( 26 ) ); 933 sb.append( x ); 934 } 935 936 return sb.toString(); 937 } 938 939 /** 940 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 941 * 942 * @param ctx WikiContext 943 * @return An URL to redirect to 944 */ 945 private String getRedirectPage( WikiContext ctx ) { 946 if( m_useCaptcha ) { 947 return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 948 } 949 950 return ctx.getURL( WikiContext.VIEW, m_errorPage ); 951 } 952 953 /** 954 * Checks whether the UserProfile matches certain checks. 955 * 956 * @param profile The profile to check 957 * @param context The WikiContext 958 * @return False, if this userprofile is suspect and should not be allowed to be added. 
959 * @since 2.6.1 960 */ 961 public boolean isValidUserProfile( WikiContext context, UserProfile profile ) { 962 try { 963 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 964 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 965 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 966 } catch( RedirectException e ) { 967 log.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 968 return false; 969 } 970 971 return true; 972 } 973 974 /** 975 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 976 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 977 * 978 * @param page The WikiPage under edit 979 * @param request The HTTP Request 980 * @since 2.6 981 * @return A hash value for this page and session 982 */ 983 public static final String getSpamHash( WikiPage page, HttpServletRequest request ) { 984 long lastModified = 0; 985 986 if( page.getLastModified() != null ) { 987 lastModified = page.getLastModified().getTime(); 988 } 989 long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 990 991 return Long.toString( lastModified ^ remote ); 992 } 993 994 /** 995 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 996 * the session has expired, you cannot edit anymore. 
997 * 998 * @param request The page request 999 * @return The name to be used in the hash field 1000 * @since 2.6 1001 */ 1002 public static final String getHashFieldName( HttpServletRequest request ) { 1003 String hash = null; 1004 1005 if( request.getSession() != null ) { 1006 hash = ( String )request.getSession().getAttribute( "_hash" ); 1007 1008 if( hash == null ) { 1009 hash = c_hashName; 1010 request.getSession().setAttribute( "_hash", hash ); 1011 } 1012 } 1013 1014 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 1015 c_hashName = getUniqueID().toLowerCase(); 1016 c_lastUpdate = System.currentTimeMillis(); 1017 } 1018 1019 return hash != null ? hash : c_hashName; 1020 } 1021 1022 1023 /** 1024 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 1025 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 1026 * and their session has expired. 1027 * <p> 1028 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 1029 * the spam log (it may or may not be spam, but it's rather likely that it is). 1030 * 1031 * @param context The WikiContext 1032 * @param pageContext The JSP PageContext. 1033 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
1034 * @throws IOException If redirection fails 1035 * @since 2.6 1036 */ 1037 public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException { 1038 String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 1039 1040 if( pageContext.getRequest().getParameter(hashName) == null ) { 1041 if( pageContext.getAttribute( hashName ) == null ) { 1042 Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 1043 log( context, REJECT, "MissingHash", change.m_change ); 1044 1045 String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" ); 1046 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 1047 return false; 1048 } 1049 } 1050 1051 return true; 1052 } 1053 1054 /** 1055 * This helper method adds all the input fields to your editor that the SpamFilter requires 1056 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1057 * 1058 * @param pageContext The PageContext 1059 * @return A HTML string which contains input fields for the SpamFilter. 1060 */ 1061 public static final String insertInputFields( PageContext pageContext ) { 1062 WikiContext ctx = WikiContext.findContext( pageContext ); 1063 WikiEngine engine = ctx.getEngine(); 1064 1065 StringBuilder sb = new StringBuilder(); 1066 if( engine.getContentEncoding().equals( "UTF-8" ) ) { 1067 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1068 } 1069 1070 return sb.toString(); 1071 } 1072 1073 /** 1074 * A local class for storing host information. 
*  <p>
     *  Every entry remembers the address, the Change that was submitted from it, the time the
     *  entry was created, and the time at which it is released again (creation time plus
     *  m_banTime minutes of the enclosing filter).
     */
    private class Host {

        private final long   m_addedTime = System.currentTimeMillis();
        private final long   m_releaseTime;
        private final String m_address;
        private final Change m_change;

        /** @return The address this entry was created for. */
        public String getAddress() {
            return m_address;
        }

        /** @return The release time, in milliseconds since the epoch. */
        public long getReleaseTime() {
            return m_releaseTime;
        }

        /** @return The creation time of this entry, in milliseconds since the epoch. */
        public long getAddedTime() {
            return m_addedTime;
        }

        /** @return The Change recorded together with the address. */
        public Change getChange() {
            return m_change;
        }

        /**
         *  Creates an entry whose release time is "now" plus m_banTime minutes.
         *
         *  @param ipaddress The address to record
         *  @param change The Change submitted from that address
         */
        public Host( String ipaddress, Change change ) {
            m_address = ipaddress;
            m_change = change;
            // Widen to long before the first multiplication so a large ban time cannot
            // overflow int arithmetic.
            m_releaseTime = System.currentTimeMillis() + m_banTime * 60L * 1000L;
        }

    }

    /**
     *  Holder for the text of a change plus the number of additions and removals it contains.
     *  Two Changes are equal when their texts are equal; the counters do not take part in
     *  equals() or hashCode().
     */
    private static class Change {

        /** The text of the change; null until somebody assigns it. */
        public String m_change;
        public int    m_adds;
        public int    m_removals;

        /** @return The text of the change, exactly as stored (null if it was never assigned). */
        @Override
        public String toString() {
            return m_change;
        }

        /**
         *  Compares the change texts.  A text that was never assigned (null) is only equal to
         *  another unassigned text; this no longer fails with a NullPointerException.
         */
        @Override
        public boolean equals( Object o ) {
            if( !( o instanceof Change ) ) {
                return false;
            }

            String other = ( ( Change )o ).m_change;
            return m_change == null ? other == null : m_change.equals( other );
        }

        /** Consistent with equals(); an unassigned text contributes 0 instead of throwing. */
        @Override
        public int hashCode() {
            return ( m_change == null ? 0 : m_change.hashCode() ) + 17;
        }

    }

}