001 /* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019 package org.apache.wiki.filters; 020 021 import java.io.BufferedReader; 022 import java.io.IOException; 023 import java.io.InputStream; 024 import java.io.InputStreamReader; 025 import java.io.StringReader; 026 import java.io.StringWriter; 027 import java.util.ArrayList; 028 import java.util.Collection; 029 import java.util.Date; 030 import java.util.Iterator; 031 import java.util.Properties; 032 import java.util.Random; 033 import java.util.StringTokenizer; 034 import java.util.Vector; 035 036 import javax.servlet.http.HttpServletRequest; 037 import javax.servlet.http.HttpServletResponse; 038 import javax.servlet.jsp.PageContext; 039 040 import net.sf.akismet.Akismet; 041 042 import org.apache.commons.lang.time.StopWatch; 043 import org.apache.log4j.Logger; 044 import org.apache.oro.text.regex.MalformedPatternException; 045 import org.apache.oro.text.regex.MatchResult; 046 import org.apache.oro.text.regex.Pattern; 047 import org.apache.oro.text.regex.PatternCompiler; 048 import org.apache.oro.text.regex.PatternMatcher; 049 import org.apache.oro.text.regex.Perl5Compiler; 050 import org.apache.oro.text.regex.Perl5Matcher; 051 import org.apache.wiki.InternalWikiException; 052 import org.apache.wiki.WikiContext; 053 import org.apache.wiki.WikiEngine; 054 import org.apache.wiki.WikiPage; 055 import org.apache.wiki.WikiProvider; 056 import org.apache.wiki.api.exceptions.ProviderException; 057 import org.apache.wiki.api.exceptions.RedirectException; 058 import org.apache.wiki.api.filters.BasicPageFilter; 059 import org.apache.wiki.attachment.Attachment; 060 import org.apache.wiki.auth.user.UserProfile; 061 import org.apache.wiki.ui.EditorManager; 062 import org.apache.wiki.util.FileUtil; 063 import org.apache.wiki.util.HttpUtil; 064 import org.apache.wiki.util.TextUtil; 065 import org.suigeneris.jrcs.diff.Diff; 066 import org.suigeneris.jrcs.diff.DifferentiationFailedException; 067 import org.suigeneris.jrcs.diff.Revision; 068 import org.suigeneris.jrcs.diff.delta.AddDelta; 069 import org.suigeneris.jrcs.diff.delta.ChangeDelta; 070 import org.suigeneris.jrcs.diff.delta.DeleteDelta; 071 import org.suigeneris.jrcs.diff.delta.Delta; 072 import org.suigeneris.jrcs.diff.myers.MyersDiff; 073 074 075 /** 076 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 077 * 078 * Parameters: 079 * <ul> 080 * <li>wordlist - Page name where the regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 081 * that page. Default is "SpamFilterWordList". 082 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is 083 * "SpamFilterWordList/blacklist.txt"</li> 084 * <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage". 085 * <li>pagechangesinminute - How many page changes are allowed/minute. Default is 5.</li> 086 * <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li> 087 * <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li> 088 * <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 5)</li> 089 * <li>akismet-apikey - The Akismet API key (see akismet.org)</li> 090 * <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li> 091 * <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li> 092 * <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable 093 * match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests 094 * and calculates a score for the spam, which is then compared to a filter level value. 095 * </ul> 096 * 097 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 098 * with the editor system.</p> 099 * 100 * <p>Changes by admin users are ignored in any case.</p> 101 * 102 * @since 2.1.112 103 */ 104 public class SpamFilter extends BasicPageFilter { 105 106 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 107 private static final String REASON_REGEXP = "Regexp"; 108 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 109 private static final String REASON_BOT_TRAP = "BotTrap"; 110 private static final String REASON_AKISMET = "Akismet"; 111 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 112 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 113 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 114 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 115 116 private static final String LISTVAR = "spamwords"; 117 118 /** The filter property name for specifying the page which contains the list of spamwords. 119 * Value is <tt>{@value}</tt>. */ 120 public static final String PROP_WORDLIST = "wordlist"; 121 122 /** The filter property name for the page to which you are directed if Herb rejects your 123 * edit. Value is <tt>{@value}</tt>. */ 124 public static final String PROP_ERRORPAGE = "errorpage"; 125 126 /** The filter property name for specifying how many changes is any given IP address 127 * allowed to do per minute. Value is <tt>{@value}</tt>. 128 */ 129 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 130 131 /** The filter property name for specifying how many similar changes are allowed 132 * before a host is banned. Value is <tt>{@value}</tt>. 133 */ 134 public static final String PROP_SIMILARCHANGES = "similarchanges"; 135 136 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 137 public static final String PROP_BANTIME = "bantime"; 138 139 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 140 public static final String PROP_BLACKLIST = "blacklist"; 141 142 /** The filter property name for specifying how many URLs can any given edit contain. 143 * Value is <tt>{@value}</tt> */ 144 public static final String PROP_MAXURLS = "maxurls"; 145 146 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 147 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 148 149 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 150 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 151 152 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 153 public static final String PROP_CAPTCHA = "captcha"; 154 155 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */ 156 public static final String PROP_FILTERSTRATEGY = "strategy"; 157 158 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */ 159 public static final String STRATEGY_EAGER = "eager"; 160 161 /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */ 162 public static final String STRATEGY_SCORE = "score"; 163 164 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 165 166 private String m_forbiddenWordsPage = "SpamFilterWordList"; 167 private String m_errorPage = "RejectedMessage"; 168 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 169 170 private PatternMatcher m_matcher = new Perl5Matcher(); 171 private PatternCompiler m_compiler = new Perl5Compiler(); 172 173 private Collection<Pattern> m_spamPatterns = null; 174 175 private Date m_lastRebuild = new Date( 0L ); 176 177 private static Logger c_spamlog = Logger.getLogger( "SpamLog" ); 178 private static Logger log = Logger.getLogger( SpamFilter.class ); 179 180 181 private Vector<Host> m_temporaryBanList = new Vector<Host>(); 182 183 private int m_banTime = 60; // minutes 184 185 private Vector<Host> m_lastModifications = new Vector<Host>(); 186 187 /** 188 * How many times a single IP address can change a page per minute? 189 */ 190 private int m_limitSinglePageChanges = 5; 191 192 /** 193 * How many times can you add the exact same string to a page? 194 */ 195 private int m_limitSimilarChanges = 2; 196 197 /** 198 * How many URLs can be added at maximum. 199 */ 200 private int m_maxUrls = 10; 201 202 private Pattern m_urlPattern; 203 private Akismet m_akismet; 204 205 private String m_akismetAPIKey = null; 206 207 private boolean m_useCaptcha = false; 208 209 /** The limit at which we consider something to be spam. */ 210 private int m_scoreLimit = 1; 211 212 /** 213 * If set to true, will ignore anyone who is in Authenticated role. 214 */ 215 private boolean m_ignoreAuthenticated = false; 216 217 private boolean m_stopAtFirstMatch = true; 218 219 private static String c_hashName; 220 private static long c_lastUpdate; 221 222 /** The HASH_DELAY value is a maximum amount of time that an user can keep 223 * a session open, because after the value has expired, we will invent a new 224 * hash field name. By default this is {@value} hours, which should be ample 225 * time for someone. 226 */ 227 private static final long HASH_DELAY = 24; 228 229 230 /** 231 * {@inheritDoc} 232 */ 233 @Override 234 public void initialize( WikiEngine engine, Properties properties ) { 235 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 236 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 237 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, 238 PROP_PAGECHANGES, 239 m_limitSinglePageChanges ); 240 241 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, 242 PROP_SIMILARCHANGES, 243 m_limitSimilarChanges ); 244 245 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 246 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 247 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 248 249 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, 250 PROP_IGNORE_AUTHENTICATED, 251 m_ignoreAuthenticated ); 252 253 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra"); 254 255 try { 256 m_urlPattern = m_compiler.compile( URL_REGEXP ); 257 } catch( MalformedPatternException e ) { 258 log.fatal( "Internal error: Someone put in a faulty pattern.", e ); 259 throw new InternalWikiException( "Faulty pattern." ); 260 } 261 262 m_akismetAPIKey = TextUtil.getStringProperty( properties, 263 PROP_AKISMET_API_KEY, 264 m_akismetAPIKey ); 265 266 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, 267 PROP_FILTERSTRATEGY, 268 STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 269 270 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 271 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 272 273 274 } 275 276 private static final int REJECT = 0; 277 private static final int ACCEPT = 1; 278 private static final int NOTE = 2; 279 280 private static String log( WikiContext ctx, int type, String source, String message ) { 281 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 282 message = TextUtil.replaceString( message, "\"", "\\\"" ); 283 284 String uid = getUniqueID(); 285 286 String page = ctx.getPage().getName(); 287 String reason = "UNKNOWN"; 288 String addr = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 289 290 switch( type ) { 291 case REJECT: 292 reason = "REJECTED"; 293 break; 294 case ACCEPT: 295 reason = "ACCEPTED"; 296 break; 297 case NOTE: 298 reason = "NOTE"; 299 break; 300 default: 301 throw new InternalWikiException( "Illegal type " + type ); 302 } 303 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 304 305 return uid; 306 } 307 308 /** {@inheritDoc} */ 309 public String preSave( WikiContext context, String content ) throws RedirectException { 310 cleanBanList(); 311 refreshBlacklists( context ); 312 Change change = getChange( context, content ); 313 314 if( !ignoreThisUser( context ) ) { 315 checkBanList( context, change ); 316 checkSinglePageChange( context, content, change ); 317 checkPatternList( context, content, change ); 318 } 319 320 if( !m_stopAtFirstMatch ) { 321 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 322 323 if( score != null && score.intValue() >= m_scoreLimit ) { 324 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 325 } 326 } 327 328 log( context, ACCEPT, "-", change.toString() ); 329 return content; 330 } 331 332 private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException { 333 if( m_stopAtFirstMatch ) { 334 throw new RedirectException( message, getRedirectPage( context ) ); 335 } 336 337 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 338 if( score != null ) { 339 score = score + 1; 340 } else { 341 score = 1; 342 } 343 344 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 345 } 346 347 /** 348 * Parses a list of patterns and returns a Collection of compiled Pattern 349 * objects. 350 * 351 * @param source 352 * @param list 353 * @return A Collection of the Patterns that were found from the lists. 354 */ 355 private Collection< Pattern > parseWordList( WikiPage source, String list ) { 356 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 357 358 if( list != null ) { 359 StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 360 361 while( tok.hasMoreTokens() ) { 362 String pattern = tok.nextToken(); 363 364 try { 365 compiledpatterns.add( m_compiler.compile( pattern ) ); 366 } catch( MalformedPatternException e ) { 367 log.debug( "Malformed spam filter pattern " + pattern ); 368 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 369 } 370 } 371 } 372 373 return compiledpatterns; 374 } 375 376 /** 377 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 378 * 379 * @param list 380 * @return The parsed blacklist patterns. 381 */ 382 private Collection< Pattern > parseBlacklist( String list ) { 383 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 384 385 if( list != null ) { 386 try { 387 BufferedReader in = new BufferedReader( new StringReader(list) ); 388 String line; 389 while( (line = in.readLine() ) != null ) { 390 line = line.trim(); 391 if( line.length() == 0 ) continue; // Empty line 392 if( line.startsWith("#") ) continue; // It's a comment 393 394 int ws = line.indexOf( ' ' ); 395 if( ws == -1 ) ws = line.indexOf( '\t' ); 396 if( ws != -1 ) line = line.substring( 0, ws ); 397 398 try { 399 compiledpatterns.add( m_compiler.compile( line ) ); 400 } catch( MalformedPatternException e ) { 401 log.debug( "Malformed spam filter pattern " + line ); 402 } 403 } 404 } catch( IOException e ) { 405 log.info( "Could not read patterns; returning what I got" , e ); 406 } 407 } 408 409 return compiledpatterns; 410 } 411 412 /** 413 * Takes a single page change and performs a load of tests on the content change. 414 * An admin can modify anything. 415 * 416 * @param context 417 * @param content 418 * @throws RedirectException 419 */ 420 private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 421 throws RedirectException { 422 HttpServletRequest req = context.getHttpRequest(); 423 424 if( req != null ) { 425 String addr = HttpUtil.getRemoteAddress( req ); 426 int hostCounter = 0; 427 int changeCounter = 0; 428 429 log.debug( "Change is " + change.m_change ); 430 431 long time = System.currentTimeMillis() - 60*1000L; // 1 minute 432 433 for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 434 Host host = i.next(); 435 436 // 437 // Check if this item is invalid 438 // 439 if( host.getAddedTime() < time ) { 440 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 441 i.remove(); 442 continue; 443 } 444 445 // 446 // Check if this IP address has been seen before 447 // 448 449 if( host.getAddress().equals( addr ) ) { 450 hostCounter++; 451 } 452 453 // 454 // Check, if this change has been seen before 455 // 456 457 if( host.getChange() != null && host.getChange().equals( change ) ) { 458 changeCounter++; 459 } 460 } 461 462 // 463 // Now, let's check against the limits. 464 // 465 if( hostCounter >= m_limitSinglePageChanges ) { 466 Host host = new Host( addr, null ); 467 m_temporaryBanList.add( host ); 468 469 String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 470 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 471 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 472 } 473 474 if( changeCounter >= m_limitSimilarChanges ) { 475 Host host = new Host( addr, null ); 476 m_temporaryBanList.add( host ); 477 478 String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 479 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 480 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 481 } 482 483 // 484 // Calculate the number of links in the addition. 485 // 486 String tstChange = change.toString(); 487 int urlCounter = 0; 488 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 489 MatchResult m = m_matcher.getMatch(); 490 tstChange = tstChange.substring( m.endOffset(0) ); 491 urlCounter++; 492 } 493 494 if( urlCounter > m_maxUrls ) { 495 Host host = new Host( addr, null ); 496 m_temporaryBanList.add( host ); 497 498 String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 499 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 500 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 501 } 502 503 // 504 // Check bot trap 505 // 506 checkBotTrap( context, change ); 507 508 // 509 // Check UTF-8 mangling 510 // 511 checkUTF8( context, change ); 512 513 // 514 // Do Akismet check. This is good to be the last, because this is the most 515 // expensive operation. 516 // 517 checkAkismet( context, change ); 518 519 m_lastModifications.add( new Host( addr, change ) ); 520 } 521 } 522 523 524 /** 525 * Checks against the akismet system. 526 * 527 * @param context 528 * @param change 529 * @throws RedirectException 530 */ 531 private void checkAkismet( WikiContext context, Change change ) throws RedirectException { 532 if( m_akismetAPIKey != null ) { 533 if( m_akismet == null ) { 534 log.info( "Initializing Akismet spam protection." ); 535 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 536 537 if( !m_akismet.verifyAPIKey() ) { 538 log.error( "Akismet API key cannot be verified. Please check your config." ); 539 m_akismetAPIKey = null; 540 m_akismet = null; 541 } 542 } 543 544 HttpServletRequest req = context.getHttpRequest(); 545 546 // 547 // Akismet will mark all empty statements as spam, so we'll just 548 // ignore them. 549 // 550 if( change.m_adds == 0 && change.m_removals > 0 ) { 551 return; 552 } 553 554 if( req != null && m_akismet != null ) { 555 log.debug( "Calling Akismet to check for spam..." ); 556 557 StopWatch sw = new StopWatch(); 558 sw.start(); 559 560 String ipAddress = HttpUtil.getRemoteAddress( req ); 561 String userAgent = req.getHeader( "User-Agent" ); 562 String referrer = req.getHeader( "Referer"); 563 String permalink = context.getViewURL( context.getPage().getName() ); 564 String commentType = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit"; 565 String commentAuthor = context.getCurrentUser().getName(); 566 String commentAuthorEmail = null; 567 String commentAuthorURL = null; 568 569 boolean isSpam = m_akismet.commentCheck( ipAddress, 570 userAgent, 571 referrer, 572 permalink, 573 commentType, 574 commentAuthor, 575 commentAuthorEmail, 576 commentAuthorURL, 577 change.toString(), 578 null ); 579 580 sw.stop(); 581 log.debug( "Akismet request done in: " + sw ); 582 583 if( isSpam ) { 584 // Host host = new Host( ipAddress, null ); 585 // m_temporaryBanList.add( host ); 586 587 String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 588 log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." ); 589 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 590 } 591 } 592 } 593 } 594 595 /** 596 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 597 * 598 * @return A string 599 */ 600 public static String getBotFieldName() { 601 return "submit_auth"; 602 } 603 604 /** 605 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 606 * 607 * @param context 608 * @param change 609 * @throws RedirectException 610 */ 611 private void checkBotTrap( WikiContext context, Change change ) throws RedirectException { 612 HttpServletRequest request = context.getHttpRequest(); 613 614 if( request != null ) { 615 String unspam = request.getParameter( getBotFieldName() ); 616 if( unspam != null && unspam.length() > 0 ) { 617 String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 618 619 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 620 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 621 } 622 } 623 } 624 625 private void checkUTF8( WikiContext context, Change change ) throws RedirectException { 626 HttpServletRequest request = context.getHttpRequest(); 627 628 if( request != null ) { 629 String utf8field = request.getParameter( "encodingcheck" ); 630 631 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 632 String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 633 634 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 635 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 636 } 637 } 638 } 639 640 /** Goes through the ban list and cleans away any host which has expired from it. */ 641 private synchronized void cleanBanList() { 642 long now = System.currentTimeMillis(); 643 644 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 645 Host host = i.next(); 646 647 if( host.getReleaseTime() < now ) { 648 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 649 i.remove(); 650 } 651 } 652 } 653 654 /** 655 * Checks the ban list if the IP address of the changer is already on it. 656 * 657 * @param context 658 * @throws RedirectException 659 */ 660 private void checkBanList( WikiContext context, Change change ) throws RedirectException { 661 HttpServletRequest req = context.getHttpRequest(); 662 663 if( req != null ) { 664 String remote = HttpUtil.getRemoteAddress(req); 665 long now = System.currentTimeMillis(); 666 667 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 668 Host host = i.next(); 669 670 if( host.getAddress().equals( remote ) ) { 671 long timeleft = ( host.getReleaseTime() - now ) / 1000L; 672 673 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 674 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 675 } 676 } 677 } 678 } 679 680 /** 681 * If the spam filter notices changes in the black list page, it will refresh them automatically. 682 * 683 * @param context 684 */ 685 private void refreshBlacklists( WikiContext context ) { 686 try { 687 WikiPage source = context.getEngine().getPage( m_forbiddenWordsPage ); 688 Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist ); 689 690 boolean rebuild = false; 691 692 // 693 // Rebuild, if the page or the attachment has changed since. 694 // 695 if( source != null ) { 696 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || source.getLastModified().after( m_lastRebuild ) ) { 697 rebuild = true; 698 } 699 } 700 701 if( att != null ) { 702 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 703 rebuild = true; 704 } 705 } 706 707 // 708 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete 709 // filter list regardless of what changed. 710 // 711 if( rebuild ) { 712 m_lastRebuild = new Date(); 713 m_spamPatterns = parseWordList( source, 714 ( source != null ) ? ( String )source.getAttribute( LISTVAR ) : null ); 715 716 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 717 718 if( att != null ) { 719 InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att); 720 StringWriter out = new StringWriter(); 721 FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out ); 722 Collection< Pattern > blackList = parseBlacklist( out.toString() ); 723 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 724 m_spamPatterns.addAll( blackList ); 725 } 726 } 727 } catch( IOException ex ) { 728 log.info( "Unable to read attachment data, continuing...", ex ); 729 } catch( ProviderException ex ) { 730 log.info( "Failed to read spam filter attachment, continuing...", ex ); 731 } 732 } 733 734 /** 735 * Does a check against a known pattern list. 736 * 737 * @param context 738 * @param content 739 * @param change 740 * @throws RedirectException 741 */ 742 private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException { 743 // 744 // If we have no spam patterns defined, or we're trying to save 745 // the page containing the patterns, just return. 746 // 747 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 748 return; 749 } 750 751 String ch = change.toString(); 752 if( context.getHttpRequest() != null ) { 753 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 754 } 755 756 for( Pattern p : m_spamPatterns ) { 757 // log.debug("Attempting to match page contents with "+p.getPattern()); 758 759 if( m_matcher.contains( ch, p ) ) { 760 // 761 // Spam filter has a match. 762 // 763 String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 764 765 log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" ); 766 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 767 } 768 } 769 } 770 771 private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException { 772 Change c = new Change(); 773 c.m_change = change; 774 checkPatternList( context, content, c ); 775 } 776 777 /** 778 * Creates a simple text string describing the added content. 779 * 780 * @param context 781 * @param newText 782 * @return Empty string, if there is no change. 783 */ 784 private static Change getChange( WikiContext context, String newText ) { 785 WikiPage page = context.getPage(); 786 StringBuffer change = new StringBuffer(); 787 WikiEngine engine = context.getEngine(); 788 // Get current page version 789 790 Change ch = new Change(); 791 792 try { 793 String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 794 795 String[] first = Diff.stringToArray( oldText ); 796 String[] second = Diff.stringToArray( newText ); 797 Revision rev = Diff.diff( first, second, new MyersDiff() ); 798 799 if( rev == null || rev.size() == 0 ) { 800 return ch; 801 } 802 803 for( int i = 0; i < rev.size(); i++ ) { 804 Delta d = rev.getDelta( i ); 805 806 if( d instanceof AddDelta ) { 807 d.getRevised().toString( change, "", "\r\n" ); 808 ch.m_adds++; 809 810 } else if( d instanceof ChangeDelta ) { 811 d.getRevised().toString( change, "", "\r\n" ); 812 ch.m_adds++; 813 814 } else if( d instanceof DeleteDelta ) { 815 ch.m_removals++; 816 } 817 } 818 } catch( DifferentiationFailedException e ) { 819 log.error( "Diff failed", e ); 820 } 821 822 // 823 // Don't forget to include the change note, too 824 // 825 String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE ); 826 827 if( changeNote != null ) { 828 change.append( "\r\n" ); 829 change.append( changeNote ); 830 } 831 832 // 833 // And author as well 834 // 835 if( page.getAuthor() != null ) { 836 change.append( "\r\n" + page.getAuthor() ); 837 } 838 839 ch.m_change = change.toString(); 840 return ch; 841 } 842 843 /** 844 * Returns true, if this user should be ignored. For example, admin users. 845 * 846 * @param context 847 * @return True, if this users should be ignored. 848 */ 849 private boolean ignoreThisUser( WikiContext context ) { 850 if( context.hasAdminPermissions() ) { 851 return true; 852 } 853 854 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 855 return true; 856 } 857 858 if( context.getVariable( "captcha" ) != null ) { 859 return true; 860 } 861 862 return false; 863 } 864 865 /** 866 * Returns a random string of six uppercase characters. 867 * 868 * @return A random string 869 */ 870 private static String getUniqueID() { 871 StringBuilder sb = new StringBuilder(); 872 Random rand = new Random(); 873 874 for( int i = 0; i < 6; i++ ) { 875 char x = ( char )( 'A' + rand.nextInt( 26 ) ); 876 sb.append( x ); 877 } 878 879 return sb.toString(); 880 } 881 882 /** 883 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 884 * 885 * @param ctx WikiContext 886 * @return An URL to redirect to 887 */ 888 private String getRedirectPage( WikiContext ctx ) { 889 if( m_useCaptcha ) { 890 return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 891 } 892 893 return ctx.getURL( WikiContext.VIEW, m_errorPage ); 894 } 895 896 /** 897 * Checks whether the UserProfile matches certain checks. 898 * 899 * @param profile The profile to check 900 * @param context The WikiContext 901 * @return False, if this userprofile is suspect and should not be allowed to be added. 902 * @since 2.6.1 903 */ 904 public boolean isValidUserProfile( WikiContext context, UserProfile profile ) { 905 try { 906 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 907 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 908 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 909 } catch( RedirectException e ) { 910 log.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 911 return false; 912 } 913 914 return true; 915 } 916 917 /** 918 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 919 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 920 * 921 * @param page The WikiPage under edit 922 * @param request The HTTP Request 923 * @since 2.6 924 * @return A hash value for this page and session 925 */ 926 public static final String getSpamHash( WikiPage page, HttpServletRequest request ) { 927 long lastModified = 0; 928 929 if( page.getLastModified() != null ) { 930 lastModified = page.getLastModified().getTime(); 931 } 932 long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 933 934 return Long.toString( lastModified ^ remote ); 935 } 936 937 /** 938 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 939 * the session has expired, you cannot edit anymore. 940 * 941 * @param request The page request 942 * @return The name to be used in the hash field 943 * @since 2.6 944 */ 945 public static final String getHashFieldName( HttpServletRequest request ) { 946 String hash = null; 947 948 if( request.getSession() != null ) { 949 hash = ( String )request.getSession().getAttribute( "_hash" ); 950 951 if( hash == null ) { 952 hash = c_hashName; 953 request.getSession().setAttribute( "_hash", hash ); 954 } 955 } 956 957 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 958 c_hashName = getUniqueID().toLowerCase(); 959 c_lastUpdate = System.currentTimeMillis(); 960 } 961 962 return hash != null ? hash : c_hashName; 963 } 964 965 966 /** 967 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 968 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 969 * and their session has expired. 970 * <p> 971 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 972 * the spam log (it may or may not be spam, but it's rather likely that it is). 973 * 974 * @param context The WikiContext 975 * @param pageContext The JSP PageContext. 976 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 977 * @throws IOException If redirection fails 978 * @since 2.6 979 */ 980 public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException { 981 String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 982 983 if( pageContext.getRequest().getParameter(hashName) == null ) { 984 if( pageContext.getAttribute( hashName ) == null ) { 985 Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 986 log( context, REJECT, "MissingHash", change.m_change ); 987 988 String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" ); 989 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 990 return false; 991 } 992 } 993 994 return true; 995 } 996 997 /** 998 * This helper method adds all the input fields to your editor that the SpamFilter requires 999 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1000 * 1001 * @param pageContext The PageContext 1002 * @return A HTML string which contains input fields for the SpamFilter. 1003 */ 1004 public static final String insertInputFields( PageContext pageContext ) { 1005 WikiContext ctx = WikiContext.findContext( pageContext ); 1006 WikiEngine engine = ctx.getEngine(); 1007 1008 StringBuilder sb = new StringBuilder(); 1009 if( engine.getContentEncoding().equals( "UTF-8" ) ) { 1010 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1011 } 1012 1013 return sb.toString(); 1014 } 1015 1016 /** 1017 * A local class for storing host information. 1018 * 1019 * @since 1020 */ 1021 private class Host { 1022 1023 private long m_addedTime = System.currentTimeMillis(); 1024 private long m_releaseTime; 1025 private String m_address; 1026 private Change m_change; 1027 1028 public String getAddress() { 1029 return m_address; 1030 } 1031 1032 public long getReleaseTime() { 1033 return m_releaseTime; 1034 } 1035 1036 public long getAddedTime() { 1037 return m_addedTime; 1038 } 1039 1040 public Change getChange() { 1041 return m_change; 1042 } 1043 1044 public Host( String ipaddress, Change change ) { 1045 m_address = ipaddress; 1046 m_change = change; 1047 m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L; 1048 } 1049 1050 } 1051 1052 private static class Change { 1053 1054 public String m_change; 1055 public int m_adds; 1056 public int m_removals; 1057 1058 public String toString() { 1059 return m_change; 1060 } 1061 1062 public boolean equals( Object o ) { 1063 if( o instanceof Change ) { 1064 return m_change.equals( ( ( Change )o ).m_change ); 1065 } 1066 return false; 1067 } 1068 1069 public int hashCode() { 1070 return m_change.hashCode() + 17; 1071 } 1072 1073 } 1074 1075 }