001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.StringReader; 026import java.io.StringWriter; 027import java.util.ArrayList; 028import java.util.Collection; 029import java.util.Date; 030import java.util.Iterator; 031import java.util.Properties; 032import java.util.Random; 033import java.util.StringTokenizer; 034import java.util.Vector; 035 036import javax.servlet.http.HttpServletRequest; 037import javax.servlet.http.HttpServletResponse; 038import javax.servlet.jsp.PageContext; 039 040import org.apache.commons.lang.time.StopWatch; 041import org.apache.log4j.Logger; 042import org.apache.oro.text.regex.MalformedPatternException; 043import org.apache.oro.text.regex.MatchResult; 044import org.apache.oro.text.regex.Pattern; 045import org.apache.oro.text.regex.PatternCompiler; 046import org.apache.oro.text.regex.PatternMatcher; 047import org.apache.oro.text.regex.Perl5Compiler; 048import org.apache.oro.text.regex.Perl5Matcher; 049import org.apache.wiki.InternalWikiException; 050import org.apache.wiki.WikiContext; 051import 
org.apache.wiki.WikiEngine; 052import org.apache.wiki.WikiPage; 053import org.apache.wiki.WikiProvider; 054import org.apache.wiki.api.exceptions.ProviderException; 055import org.apache.wiki.api.exceptions.RedirectException; 056import org.apache.wiki.api.filters.BasicPageFilter; 057import org.apache.wiki.attachment.Attachment; 058import org.apache.wiki.auth.user.UserProfile; 059import org.apache.wiki.ui.EditorManager; 060import org.apache.wiki.util.FileUtil; 061import org.apache.wiki.util.HttpUtil; 062import org.apache.wiki.util.TextUtil; 063import org.suigeneris.jrcs.diff.Diff; 064import org.suigeneris.jrcs.diff.DifferentiationFailedException; 065import org.suigeneris.jrcs.diff.Revision; 066import org.suigeneris.jrcs.diff.delta.AddDelta; 067import org.suigeneris.jrcs.diff.delta.ChangeDelta; 068import org.suigeneris.jrcs.diff.delta.DeleteDelta; 069import org.suigeneris.jrcs.diff.delta.Delta; 070import org.suigeneris.jrcs.diff.myers.MyersDiff; 071 072import net.sf.akismet.Akismet; 073 074 075/** 076 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 077 * 078 * Parameters: 079 * <ul> 080 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 081 * that page. Default is "SpamFilterWordList". 082 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 083 * that page. Default is "SpamFilterIPList". 084 * <li>maxpagenamelength - Maximum page name length. Default is 100. 085 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is 086 * "SpamFilterWordList/blacklist.txt"</li> 087 * <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage". 088 * <li>pagechangesinminute - How many page changes are allowed/minute. 
Default is 5.</li>
 *    <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 *    <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 *        match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *        and calculate a score for the spam, which is then compared to a filter level value.</li>
098 * </ul> 099 * 100 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 101 * with the editor system.</p> 102 * 103 * <p>Changes by admin users are ignored in any case.</p> 104 * 105 * @since 2.1.112 106 */ 107public class SpamFilter extends BasicPageFilter { 108 109 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 110 private static final String REASON_REGEXP = "Regexp"; 111 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 112 private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently"; 113 private static final String REASON_BOT_TRAP = "BotTrap"; 114 private static final String REASON_AKISMET = "Akismet"; 115 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 116 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 117 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 118 private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong"; 119 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 120 121 private static final String LISTVAR = "spamwords"; 122 private static final String LISTIPVAR = "ips"; 123 124 /** The filter property name for specifying the page which contains the list of spamwords. 125 * Value is <tt>{@value}</tt>. */ 126 public static final String PROP_WORDLIST = "wordlist"; 127 128 /** The filter property name for specifying the page which contains the list of IPs to ban. 129 * Value is <tt>{@value}</tt>. */ 130 public static final String PROP_IPLIST = "IPlist"; 131 132 /** The filter property name for specifying the maximum page name length. 133 * Value is <tt>{@value}</tt>. */ 134 public static final String PROP_MAX_PAGENAME_LENGTH = "maxpagenamelength"; 135 136 /** The filter property name for the page to which you are directed if Herb rejects your 137 * edit. Value is <tt>{@value}</tt>. 
*/ 138 public static final String PROP_ERRORPAGE = "errorpage"; 139 140 /** The filter property name for specifying how many changes is any given IP address 141 * allowed to do per minute. Value is <tt>{@value}</tt>. 142 */ 143 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 144 145 /** The filter property name for specifying how many similar changes are allowed 146 * before a host is banned. Value is <tt>{@value}</tt>. 147 */ 148 public static final String PROP_SIMILARCHANGES = "similarchanges"; 149 150 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 151 public static final String PROP_BANTIME = "bantime"; 152 153 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 154 public static final String PROP_BLACKLIST = "blacklist"; 155 156 /** The filter property name for specifying how many URLs can any given edit contain. 157 * Value is <tt>{@value}</tt> */ 158 public static final String PROP_MAXURLS = "maxurls"; 159 160 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 161 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 162 163 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 164 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 165 166 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 167 public static final String PROP_CAPTCHA = "captcha"; 168 169 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */ 170 public static final String PROP_FILTERSTRATEGY = "strategy"; 171 172 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. 
*/ 173 public static final String STRATEGY_EAGER = "eager"; 174 175 /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */ 176 public static final String STRATEGY_SCORE = "score"; 177 178 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 179 180 private String m_forbiddenWordsPage = "SpamFilterWordList"; 181 private String m_forbiddenIPsPage = "SpamFilterIPList"; 182 private String m_pageNameMaxLength = "100"; 183 private String m_errorPage = "RejectedMessage"; 184 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 185 186 private PatternMatcher m_matcher = new Perl5Matcher(); 187 private PatternCompiler m_compiler = new Perl5Compiler(); 188 189 private Collection<Pattern> m_spamPatterns = null; 190 private Collection<Pattern> m_IPPatterns = null; 191 192 private Date m_lastRebuild = new Date( 0L ); 193 194 private static Logger c_spamlog = Logger.getLogger( "SpamLog" ); 195 private static Logger log = Logger.getLogger( SpamFilter.class ); 196 197 198 private Vector<Host> m_temporaryBanList = new Vector<Host>(); 199 200 private int m_banTime = 60; // minutes 201 202 private Vector<Host> m_lastModifications = new Vector<Host>(); 203 204 /** 205 * How many times a single IP address can change a page per minute? 206 */ 207 private int m_limitSinglePageChanges = 5; 208 209 /** 210 * How many times can you add the exact same string to a page? 211 */ 212 private int m_limitSimilarChanges = 2; 213 214 /** 215 * How many URLs can be added at maximum. 216 */ 217 private int m_maxUrls = 10; 218 219 private Pattern m_urlPattern; 220 private Akismet m_akismet; 221 222 private String m_akismetAPIKey = null; 223 224 private boolean m_useCaptcha = false; 225 226 /** The limit at which we consider something to be spam. */ 227 private int m_scoreLimit = 1; 228 229 /** 230 * If set to true, will ignore anyone who is in Authenticated role. 
231 */ 232 private boolean m_ignoreAuthenticated = false; 233 234 private boolean m_stopAtFirstMatch = true; 235 236 private static String c_hashName; 237 private static long c_lastUpdate; 238 239 /** The HASH_DELAY value is a maximum amount of time that an user can keep 240 * a session open, because after the value has expired, we will invent a new 241 * hash field name. By default this is {@value} hours, which should be ample 242 * time for someone. 243 */ 244 private static final long HASH_DELAY = 24; 245 246 247 /** 248 * {@inheritDoc} 249 */ 250 @Override 251 public void initialize( WikiEngine engine, Properties properties ) { 252 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 253 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 254 m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength); 255 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 256 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, 257 PROP_PAGECHANGES, 258 m_limitSinglePageChanges ); 259 260 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, 261 PROP_SIMILARCHANGES, 262 m_limitSimilarChanges ); 263 264 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 265 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 266 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 267 268 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, 269 PROP_IGNORE_AUTHENTICATED, 270 m_ignoreAuthenticated ); 271 272 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra"); 273 274 try { 275 m_urlPattern = m_compiler.compile( URL_REGEXP ); 276 } catch( MalformedPatternException e ) { 277 log.fatal( "Internal error: Someone put in a faulty pattern.", e ); 278 throw new InternalWikiException( "Faulty pattern." 
, e); 279 } 280 281 m_akismetAPIKey = TextUtil.getStringProperty( properties, 282 PROP_AKISMET_API_KEY, 283 m_akismetAPIKey ); 284 285 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, 286 PROP_FILTERSTRATEGY, 287 STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 288 289 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 290 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 291 292 293 } 294 295 private static final int REJECT = 0; 296 private static final int ACCEPT = 1; 297 private static final int NOTE = 2; 298 299 private static String log( WikiContext ctx, int type, String source, String message ) { 300 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 301 message = TextUtil.replaceString( message, "\"", "\\\"" ); 302 303 String uid = getUniqueID(); 304 305 String page = ctx.getPage().getName(); 306 String reason = "UNKNOWN"; 307 String addr = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 308 309 switch( type ) { 310 case REJECT: 311 reason = "REJECTED"; 312 break; 313 case ACCEPT: 314 reason = "ACCEPTED"; 315 break; 316 case NOTE: 317 reason = "NOTE"; 318 break; 319 default: 320 throw new InternalWikiException( "Illegal type " + type ); 321 } 322 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 323 324 return uid; 325 } 326 327 /** {@inheritDoc} */ 328 public String preSave( WikiContext context, String content ) throws RedirectException { 329 cleanBanList(); 330 refreshBlacklists( context ); 331 Change change = getChange( context, content ); 332 333 if( !ignoreThisUser( context ) ) { 334 checkBanList( context, change ); 335 checkSinglePageChange( context, content, change ); 336 checkIPList( context ); 337 checkPatternList( context, content, change ); 338 checkPageName( context, content, change); 339 } 340 341 if( !m_stopAtFirstMatch ) { 342 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 343 
344 if( score != null && score.intValue() >= m_scoreLimit ) { 345 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 346 } 347 } 348 349 log( context, ACCEPT, "-", change.toString() ); 350 return content; 351 } 352 353 private void checkPageName(WikiContext context, String content, Change change) throws RedirectException { 354 WikiPage page = context.getPage(); 355 String pageName = page.getName(); 356 int maxlength = Integer.valueOf(m_pageNameMaxLength); 357 if ( pageName.length() > maxlength) { 358 // 359 // Spam filter has a match. 360 // 361 362 String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName); 363 364 log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")"); 365 checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" ); 366 367 } 368 } 369 370 private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException { 371 if( m_stopAtFirstMatch ) { 372 throw new RedirectException( message, getRedirectPage( context ) ); 373 } 374 375 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 376 if( score != null ) { 377 score = score + 1; 378 } else { 379 score = 1; 380 } 381 382 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 383 } 384 385 /** 386 * Parses a list of patterns and returns a Collection of compiled Pattern 387 * objects. 388 * 389 * @param source 390 * @param list 391 * @return A Collection of the Patterns that were found from the lists. 
392 */ 393 private Collection< Pattern > parseWordList( WikiPage source, String list ) { 394 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 395 396 if( list != null ) { 397 StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 398 399 while( tok.hasMoreTokens() ) { 400 String pattern = tok.nextToken(); 401 402 try { 403 compiledpatterns.add( m_compiler.compile( pattern ) ); 404 } catch( MalformedPatternException e ) { 405 log.debug( "Malformed spam filter pattern " + pattern ); 406 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 407 } 408 } 409 } 410 411 return compiledpatterns; 412 } 413 414 /** 415 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 416 * 417 * @param list 418 * @return The parsed blacklist patterns. 419 */ 420 private Collection< Pattern > parseBlacklist( String list ) { 421 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 422 423 if( list != null ) { 424 try { 425 BufferedReader in = new BufferedReader( new StringReader(list) ); 426 String line; 427 while( (line = in.readLine() ) != null ) { 428 line = line.trim(); 429 if( line.length() == 0 ) continue; // Empty line 430 if( line.startsWith("#") ) continue; // It's a comment 431 432 int ws = line.indexOf( ' ' ); 433 if( ws == -1 ) ws = line.indexOf( '\t' ); 434 if( ws != -1 ) line = line.substring( 0, ws ); 435 436 try { 437 compiledpatterns.add( m_compiler.compile( line ) ); 438 } catch( MalformedPatternException e ) { 439 log.debug( "Malformed spam filter pattern " + line ); 440 } 441 } 442 } catch( IOException e ) { 443 log.info( "Could not read patterns; returning what I got" , e ); 444 } 445 } 446 447 return compiledpatterns; 448 } 449 450 /** 451 * Takes a single page change and performs a load of tests on the content change. 452 * An admin can modify anything. 
453 * 454 * @param context 455 * @param content 456 * @throws RedirectException 457 */ 458 private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 459 throws RedirectException { 460 HttpServletRequest req = context.getHttpRequest(); 461 462 if( req != null ) { 463 String addr = HttpUtil.getRemoteAddress( req ); 464 int hostCounter = 0; 465 int changeCounter = 0; 466 467 log.debug( "Change is " + change.m_change ); 468 469 long time = System.currentTimeMillis() - 60*1000L; // 1 minute 470 471 for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 472 Host host = i.next(); 473 474 // 475 // Check if this item is invalid 476 // 477 if( host.getAddedTime() < time ) { 478 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 479 i.remove(); 480 continue; 481 } 482 483 // 484 // Check if this IP address has been seen before 485 // 486 487 if( host.getAddress().equals( addr ) ) { 488 hostCounter++; 489 } 490 491 // 492 // Check, if this change has been seen before 493 // 494 495 if( host.getChange() != null && host.getChange().equals( change ) ) { 496 changeCounter++; 497 } 498 } 499 500 // 501 // Now, let's check against the limits. 502 // 503 if( hostCounter >= m_limitSinglePageChanges ) { 504 Host host = new Host( addr, null ); 505 m_temporaryBanList.add( host ); 506 507 String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 508 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 509 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! 
(Incident code " + uid + ")" ); 510 } 511 512 if( changeCounter >= m_limitSimilarChanges ) { 513 Host host = new Host( addr, null ); 514 m_temporaryBanList.add( host ); 515 516 String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 517 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 518 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 519 } 520 521 // 522 // Calculate the number of links in the addition. 523 // 524 String tstChange = change.toString(); 525 int urlCounter = 0; 526 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 527 MatchResult m = m_matcher.getMatch(); 528 tstChange = tstChange.substring( m.endOffset(0) ); 529 urlCounter++; 530 } 531 532 if( urlCounter > m_maxUrls ) { 533 Host host = new Host( addr, null ); 534 m_temporaryBanList.add( host ); 535 536 String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 537 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 538 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 539 } 540 541 // 542 // Check bot trap 543 // 544 checkBotTrap( context, change ); 545 546 // 547 // Check UTF-8 mangling 548 // 549 checkUTF8( context, change ); 550 551 // 552 // Do Akismet check. This is good to be the last, because this is the most 553 // expensive operation. 554 // 555 checkAkismet( context, change ); 556 557 m_lastModifications.add( new Host( addr, change ) ); 558 } 559 } 560 561 562 /** 563 * Checks against the akismet system. 
564 * 565 * @param context 566 * @param change 567 * @throws RedirectException 568 */ 569 private void checkAkismet( WikiContext context, Change change ) throws RedirectException { 570 if( m_akismetAPIKey != null ) { 571 if( m_akismet == null ) { 572 log.info( "Initializing Akismet spam protection." ); 573 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 574 575 if( !m_akismet.verifyAPIKey() ) { 576 log.error( "Akismet API key cannot be verified. Please check your config." ); 577 m_akismetAPIKey = null; 578 m_akismet = null; 579 } 580 } 581 582 HttpServletRequest req = context.getHttpRequest(); 583 584 // 585 // Akismet will mark all empty statements as spam, so we'll just 586 // ignore them. 587 // 588 if( change.m_adds == 0 && change.m_removals > 0 ) { 589 return; 590 } 591 592 if( req != null && m_akismet != null ) { 593 log.debug( "Calling Akismet to check for spam..." ); 594 595 StopWatch sw = new StopWatch(); 596 sw.start(); 597 598 String ipAddress = HttpUtil.getRemoteAddress( req ); 599 String userAgent = req.getHeader( "User-Agent" ); 600 String referrer = req.getHeader( "Referer"); 601 String permalink = context.getViewURL( context.getPage().getName() ); 602 String commentType = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit"; 603 String commentAuthor = context.getCurrentUser().getName(); 604 String commentAuthorEmail = null; 605 String commentAuthorURL = null; 606 607 boolean isSpam = m_akismet.commentCheck( ipAddress, 608 userAgent, 609 referrer, 610 permalink, 611 commentType, 612 commentAuthor, 613 commentAuthorEmail, 614 commentAuthorURL, 615 change.toString(), 616 null ); 617 618 sw.stop(); 619 log.debug( "Akismet request done in: " + sw ); 620 621 if( isSpam ) { 622 // Host host = new Host( ipAddress, null ); 623 // m_temporaryBanList.add( host ); 624 625 String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 626 log.info( "SPAM:Akismet (" + uid + "). 
Akismet thinks this change is spam; added host to temporary ban list." ); 627 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 628 } 629 } 630 } 631 } 632 633 /** 634 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 635 * 636 * @return A string 637 */ 638 public static String getBotFieldName() { 639 return "submit_auth"; 640 } 641 642 /** 643 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 644 * 645 * @param context 646 * @param change 647 * @throws RedirectException 648 */ 649 private void checkBotTrap( WikiContext context, Change change ) throws RedirectException { 650 HttpServletRequest request = context.getHttpRequest(); 651 652 if( request != null ) { 653 String unspam = request.getParameter( getBotFieldName() ); 654 if( unspam != null && unspam.length() > 0 ) { 655 String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 656 657 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 658 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 659 } 660 } 661 } 662 663 private void checkUTF8( WikiContext context, Change change ) throws RedirectException { 664 HttpServletRequest request = context.getHttpRequest(); 665 666 if( request != null ) { 667 String utf8field = request.getParameter( "encodingcheck" ); 668 669 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 670 String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 671 672 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 673 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 674 } 675 } 676 } 677 678 /** Goes through the ban list and cleans away any host which has expired from it. 
*/ 679 private synchronized void cleanBanList() { 680 long now = System.currentTimeMillis(); 681 682 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 683 Host host = i.next(); 684 685 if( host.getReleaseTime() < now ) { 686 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 687 i.remove(); 688 } 689 } 690 } 691 692 /** 693 * Checks the ban list if the IP address of the changer is already on it. 694 * 695 * @param context 696 * @throws RedirectException 697 */ 698 private void checkBanList( WikiContext context, Change change ) throws RedirectException { 699 HttpServletRequest req = context.getHttpRequest(); 700 701 if( req != null ) { 702 String remote = HttpUtil.getRemoteAddress(req); 703 long now = System.currentTimeMillis(); 704 705 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 706 Host host = i.next(); 707 708 if( host.getAddress().equals( remote ) ) { 709 long timeleft = ( host.getReleaseTime() - now ) / 1000L; 710 711 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 712 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 713 } 714 } 715 } 716 } 717 718 /** 719 * If the spam filter notices changes in the black list page, it will refresh them automatically. 720 * 721 * @param context 722 */ 723 private void refreshBlacklists( WikiContext context ) { 724 try { 725 726 boolean rebuild = false; 727 728 // 729 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 
730 // 731 WikiPage sourceSpam = context.getEngine().getPage( m_forbiddenWordsPage ); 732 if( sourceSpam != null ) { 733 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 734 rebuild = true; 735 } 736 } 737 738 Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist ); 739 if( att != null ) { 740 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 741 rebuild = true; 742 } 743 } 744 745 WikiPage sourceIPs = context.getEngine().getPage( m_forbiddenIPsPage ); 746 if( sourceIPs != null ) { 747 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 748 rebuild = true; 749 } 750 } 751 752 // 753 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete 754 // filter list regardless of what changed. 755 // 756 if( rebuild ) { 757 m_lastRebuild = new Date(); 758 m_spamPatterns = parseWordList( sourceSpam, 759 ( sourceSpam != null ) ? ( String )sourceSpam.getAttribute( LISTVAR ) : null ); 760 761 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 762 763 m_IPPatterns = parseWordList( sourceIPs, 764 ( sourceIPs != null ) ? 
( String )sourceIPs.getAttribute( LISTIPVAR ) : null ); 765 log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 766 767 if( att != null ) { 768 InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att); 769 StringWriter out = new StringWriter(); 770 FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out ); 771 Collection< Pattern > blackList = parseBlacklist( out.toString() ); 772 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 773 m_spamPatterns.addAll( blackList ); 774 } 775 } 776 } catch( IOException ex ) { 777 log.info( "Unable to read attachment data, continuing...", ex ); 778 } catch( ProviderException ex ) { 779 log.info( "Failed to read spam filter attachment, continuing...", ex ); 780 } 781 } 782 783 /** 784 * Does a check against a known pattern list. 785 * 786 * @param context 787 * @param content 788 * @param change 789 * @throws RedirectException 790 */ 791 private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException { 792 // 793 // If we have no spam patterns defined, or we're trying to save 794 // the page containing the patterns, just return. 795 // 796 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 797 return; 798 } 799 800 String ch = change.toString(); 801 if( context.getHttpRequest() != null ) { 802 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 803 } 804 805 for( Pattern p : m_spamPatterns ) { 806 // log.debug("Attempting to match page contents with "+p.getPattern()); 807 808 if( m_matcher.contains( ch, p ) ) { 809 // 810 // Spam filter has a match. 811 // 812 String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 813 814 log.info( "SPAM:Regexp (" + uid + "). 
Content matches the spam filter '" + p.getPattern() + "'" ); 815 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 816 } 817 } 818 } 819 820 821 /** 822 * Does a check against a pattern list of IPs. 823 * 824 * @param context 825 * @throws RedirectException 826 */ 827 private void checkIPList( WikiContext context ) throws RedirectException { 828 // 829 // If we have no IP patterns defined, or we're trying to save 830 // the page containing the IP patterns, just return. 831 // 832 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 833 return; 834 } 835 836 String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 837 log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 838 839 for( Pattern p : m_IPPatterns ) { 840 log.debug("Attempting to match remoteIP with " + p.getPattern()); 841 842 if( m_matcher.contains( remoteIP, p ) ) { 843 844 // IP filter has a match. 845 // 846 String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 847 848 log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 849 checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 850 } 851 } 852 } 853 854 private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException { 855 Change c = new Change(); 856 c.m_change = change; 857 checkPatternList( context, content, c ); 858 } 859 860 /** 861 * Creates a simple text string describing the added content. 862 * 863 * @param context 864 * @param newText 865 * @return Empty string, if there is no change. 
866 */ 867 private static Change getChange( WikiContext context, String newText ) { 868 WikiPage page = context.getPage(); 869 StringBuffer change = new StringBuffer(); 870 WikiEngine engine = context.getEngine(); 871 // Get current page version 872 873 Change ch = new Change(); 874 875 try { 876 String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 877 878 String[] first = Diff.stringToArray( oldText ); 879 String[] second = Diff.stringToArray( newText ); 880 Revision rev = Diff.diff( first, second, new MyersDiff() ); 881 882 if( rev == null || rev.size() == 0 ) { 883 return ch; 884 } 885 886 for( int i = 0; i < rev.size(); i++ ) { 887 Delta d = rev.getDelta( i ); 888 889 if( d instanceof AddDelta ) { 890 d.getRevised().toString( change, "", "\r\n" ); 891 ch.m_adds++; 892 893 } else if( d instanceof ChangeDelta ) { 894 d.getRevised().toString( change, "", "\r\n" ); 895 ch.m_adds++; 896 897 } else if( d instanceof DeleteDelta ) { 898 ch.m_removals++; 899 } 900 } 901 } catch( DifferentiationFailedException e ) { 902 log.error( "Diff failed", e ); 903 } 904 905 // 906 // Don't forget to include the change note, too 907 // 908 String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE ); 909 910 if( changeNote != null ) { 911 change.append( "\r\n" ); 912 change.append( changeNote ); 913 } 914 915 // 916 // And author as well 917 // 918 if( page.getAuthor() != null ) { 919 change.append( "\r\n" + page.getAuthor() ); 920 } 921 922 ch.m_change = change.toString(); 923 return ch; 924 } 925 926 /** 927 * Returns true, if this user should be ignored. For example, admin users. 928 * 929 * @param context 930 * @return True, if this users should be ignored. 
931 */ 932 private boolean ignoreThisUser( WikiContext context ) { 933 if( context.hasAdminPermissions() ) { 934 return true; 935 } 936 937 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 938 return true; 939 } 940 941 if( context.getVariable( "captcha" ) != null ) { 942 return true; 943 } 944 945 return false; 946 } 947 948 /** 949 * Returns a random string of six uppercase characters. 950 * 951 * @return A random string 952 */ 953 private static String getUniqueID() { 954 StringBuilder sb = new StringBuilder(); 955 Random rand = new Random(); 956 957 for( int i = 0; i < 6; i++ ) { 958 char x = ( char )( 'A' + rand.nextInt( 26 ) ); 959 sb.append( x ); 960 } 961 962 return sb.toString(); 963 } 964 965 /** 966 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 967 * 968 * @param ctx WikiContext 969 * @return An URL to redirect to 970 */ 971 private String getRedirectPage( WikiContext ctx ) { 972 if( m_useCaptcha ) { 973 return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 974 } 975 976 return ctx.getURL( WikiContext.VIEW, m_errorPage ); 977 } 978 979 /** 980 * Checks whether the UserProfile matches certain checks. 981 * 982 * @param profile The profile to check 983 * @param context The WikiContext 984 * @return False, if this userprofile is suspect and should not be allowed to be added. 
985 * @since 2.6.1 986 */ 987 public boolean isValidUserProfile( WikiContext context, UserProfile profile ) { 988 try { 989 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 990 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 991 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 992 } catch( RedirectException e ) { 993 log.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 994 return false; 995 } 996 997 return true; 998 } 999 1000 /** 1001 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 1002 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 1003 * 1004 * @param page The WikiPage under edit 1005 * @param request The HTTP Request 1006 * @since 2.6 1007 * @return A hash value for this page and session 1008 */ 1009 public static final String getSpamHash( WikiPage page, HttpServletRequest request ) { 1010 long lastModified = 0; 1011 1012 if( page.getLastModified() != null ) { 1013 lastModified = page.getLastModified().getTime(); 1014 } 1015 long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 1016 1017 return Long.toString( lastModified ^ remote ); 1018 } 1019 1020 /** 1021 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 1022 * the session has expired, you cannot edit anymore. 
1023 * 1024 * @param request The page request 1025 * @return The name to be used in the hash field 1026 * @since 2.6 1027 */ 1028 public static final String getHashFieldName( HttpServletRequest request ) { 1029 String hash = null; 1030 1031 if( request.getSession() != null ) { 1032 hash = ( String )request.getSession().getAttribute( "_hash" ); 1033 1034 if( hash == null ) { 1035 hash = c_hashName; 1036 request.getSession().setAttribute( "_hash", hash ); 1037 } 1038 } 1039 1040 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 1041 c_hashName = getUniqueID().toLowerCase(); 1042 c_lastUpdate = System.currentTimeMillis(); 1043 } 1044 1045 return hash != null ? hash : c_hashName; 1046 } 1047 1048 1049 /** 1050 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 1051 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 1052 * and their session has expired. 1053 * <p> 1054 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 1055 * the spam log (it may or may not be spam, but it's rather likely that it is). 1056 * 1057 * @param context The WikiContext 1058 * @param pageContext The JSP PageContext. 1059 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
1060 * @throws IOException If redirection fails 1061 * @since 2.6 1062 */ 1063 public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException { 1064 String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 1065 1066 if( pageContext.getRequest().getParameter(hashName) == null ) { 1067 if( pageContext.getAttribute( hashName ) == null ) { 1068 Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 1069 log( context, REJECT, "MissingHash", change.m_change ); 1070 1071 String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" ); 1072 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 1073 return false; 1074 } 1075 } 1076 1077 return true; 1078 } 1079 1080 /** 1081 * This helper method adds all the input fields to your editor that the SpamFilter requires 1082 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1083 * 1084 * @param pageContext The PageContext 1085 * @return A HTML string which contains input fields for the SpamFilter. 1086 */ 1087 public static final String insertInputFields( PageContext pageContext ) { 1088 WikiContext ctx = WikiContext.findContext( pageContext ); 1089 WikiEngine engine = ctx.getEngine(); 1090 1091 StringBuilder sb = new StringBuilder(); 1092 if( engine.getContentEncoding().equals( "UTF-8" ) ) { 1093 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1094 } 1095 1096 return sb.toString(); 1097 } 1098 1099 /** 1100 * A local class for storing host information. 
1101 * 1102 * @since 1103 */ 1104 private class Host { 1105 1106 private long m_addedTime = System.currentTimeMillis(); 1107 private long m_releaseTime; 1108 private String m_address; 1109 private Change m_change; 1110 1111 public String getAddress() { 1112 return m_address; 1113 } 1114 1115 public long getReleaseTime() { 1116 return m_releaseTime; 1117 } 1118 1119 public long getAddedTime() { 1120 return m_addedTime; 1121 } 1122 1123 public Change getChange() { 1124 return m_change; 1125 } 1126 1127 public Host( String ipaddress, Change change ) { 1128 m_address = ipaddress; 1129 m_change = change; 1130 m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L; 1131 } 1132 1133 } 1134 1135 private static class Change { 1136 1137 public String m_change; 1138 public int m_adds; 1139 public int m_removals; 1140 1141 public String toString() { 1142 return m_change; 1143 } 1144 1145 public boolean equals( Object o ) { 1146 if( o instanceof Change ) { 1147 return m_change.equals( ( ( Change )o ).m_change ); 1148 } 1149 return false; 1150 } 1151 1152 public int hashCode() { 1153 return m_change.hashCode() + 17; 1154 } 1155 1156 } 1157 1158}