001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import net.sf.akismet.Akismet; 022import org.apache.commons.lang3.time.StopWatch; 023import org.apache.log4j.Logger; 024import org.apache.oro.text.regex.MalformedPatternException; 025import org.apache.oro.text.regex.MatchResult; 026import org.apache.oro.text.regex.Pattern; 027import org.apache.oro.text.regex.PatternCompiler; 028import org.apache.oro.text.regex.PatternMatcher; 029import org.apache.oro.text.regex.Perl5Compiler; 030import org.apache.oro.text.regex.Perl5Matcher; 031import org.apache.wiki.InternalWikiException; 032import org.apache.wiki.WikiContext; 033import org.apache.wiki.WikiEngine; 034import org.apache.wiki.WikiPage; 035import org.apache.wiki.WikiProvider; 036import org.apache.wiki.api.exceptions.ProviderException; 037import org.apache.wiki.api.exceptions.RedirectException; 038import org.apache.wiki.api.filters.BasicPageFilter; 039import org.apache.wiki.attachment.Attachment; 040import org.apache.wiki.auth.user.UserProfile; 041import org.apache.wiki.ui.EditorManager; 042import org.apache.wiki.util.FileUtil; 043import org.apache.wiki.util.HttpUtil; 044import org.apache.wiki.util.TextUtil; 045import 
org.suigeneris.jrcs.diff.Diff; 046import org.suigeneris.jrcs.diff.DifferentiationFailedException; 047import org.suigeneris.jrcs.diff.Revision; 048import org.suigeneris.jrcs.diff.delta.AddDelta; 049import org.suigeneris.jrcs.diff.delta.ChangeDelta; 050import org.suigeneris.jrcs.diff.delta.DeleteDelta; 051import org.suigeneris.jrcs.diff.delta.Delta; 052import org.suigeneris.jrcs.diff.myers.MyersDiff; 053 054import javax.servlet.http.HttpServletRequest; 055import javax.servlet.http.HttpServletResponse; 056import javax.servlet.jsp.PageContext; 057import java.io.BufferedReader; 058import java.io.IOException; 059import java.io.InputStream; 060import java.io.InputStreamReader; 061import java.io.StringReader; 062import java.io.StringWriter; 063import java.nio.charset.StandardCharsets; 064import java.util.ArrayList; 065import java.util.Collection; 066import java.util.Date; 067import java.util.Iterator; 068import java.util.Properties; 069import java.util.Random; 070import java.util.StringTokenizer; 071import java.util.Vector; 072 073 074/** 075 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 076 * 077 * Parameters: 078 * <ul> 079 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 080 * that page. Default is "SpamFilterWordList". 081 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 082 * that page. Default is "SpamFilterIPList". 083 * <li>maxpagenamelength - Maximum page name length. Default is 100. 084 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is 085 * "SpamFilterWordList/blacklist.txt"</li> 086 * <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage". 087 * <li>pagechangesinminute - How many page changes are allowed/minute. 
Default is 5.</li>
 * <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 * <li>bantime - How long an IP address stays on the temporary ban list, in minutes. Default is 60.</li>
 * <li>maxurls - How many URLs can be added to the page before it is considered spam. Default is 10.</li>
 * <li>akismet-apikey - The Akismet API key (see akismet.org).</li>
 * <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter.</li>
 * <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 * <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 * match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 * and calculate a score for the spam, which is then compared to a filter level value.</li>
097 * </ul> 098 * 099 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 100 * with the editor system.</p> 101 * 102 * <p>Changes by admin users are ignored in any case.</p> 103 * 104 * @since 2.1.112 105 */ 106public class SpamFilter extends BasicPageFilter { 107 108 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 109 private static final String REASON_REGEXP = "Regexp"; 110 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 111 private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently"; 112 private static final String REASON_BOT_TRAP = "BotTrap"; 113 private static final String REASON_AKISMET = "Akismet"; 114 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 115 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 116 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 117 private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong"; 118 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 119 120 private static final String LISTVAR = "spamwords"; 121 private static final String LISTIPVAR = "ips"; 122 123 /** The filter property name for specifying the page which contains the list of spamwords. 124 * Value is <tt>{@value}</tt>. */ 125 public static final String PROP_WORDLIST = "wordlist"; 126 127 /** The filter property name for specifying the page which contains the list of IPs to ban. 128 * Value is <tt>{@value}</tt>. */ 129 public static final String PROP_IPLIST = "IPlist"; 130 131 /** The filter property name for specifying the maximum page name length. 132 * Value is <tt>{@value}</tt>. */ 133 public static final String PROP_MAX_PAGENAME_LENGTH = "maxpagenamelength"; 134 135 /** The filter property name for the page to which you are directed if Herb rejects your 136 * edit. Value is <tt>{@value}</tt>. 
*/ 137 public static final String PROP_ERRORPAGE = "errorpage"; 138 139 /** The filter property name for specifying how many changes is any given IP address 140 * allowed to do per minute. Value is <tt>{@value}</tt>. 141 */ 142 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 143 144 /** The filter property name for specifying how many similar changes are allowed 145 * before a host is banned. Value is <tt>{@value}</tt>. 146 */ 147 public static final String PROP_SIMILARCHANGES = "similarchanges"; 148 149 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 150 public static final String PROP_BANTIME = "bantime"; 151 152 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 153 public static final String PROP_BLACKLIST = "blacklist"; 154 155 /** The filter property name for specifying how many URLs can any given edit contain. 156 * Value is <tt>{@value}</tt> */ 157 public static final String PROP_MAXURLS = "maxurls"; 158 159 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 160 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 161 162 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 163 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 164 165 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 166 public static final String PROP_CAPTCHA = "captcha"; 167 168 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */ 169 public static final String PROP_FILTERSTRATEGY = "strategy"; 170 171 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. 
*/ 172 public static final String STRATEGY_EAGER = "eager"; 173 174 /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */ 175 public static final String STRATEGY_SCORE = "score"; 176 177 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 178 179 private String m_forbiddenWordsPage = "SpamFilterWordList"; 180 private String m_forbiddenIPsPage = "SpamFilterIPList"; 181 private String m_pageNameMaxLength = "100"; 182 private String m_errorPage = "RejectedMessage"; 183 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 184 185 private PatternMatcher m_matcher = new Perl5Matcher(); 186 private PatternCompiler m_compiler = new Perl5Compiler(); 187 188 private Collection<Pattern> m_spamPatterns = null; 189 private Collection<Pattern> m_IPPatterns = null; 190 191 private Date m_lastRebuild = new Date( 0L ); 192 193 private static Logger c_spamlog = Logger.getLogger( "SpamLog" ); 194 private static Logger log = Logger.getLogger( SpamFilter.class ); 195 196 197 private Vector<Host> m_temporaryBanList = new Vector<Host>(); 198 199 private int m_banTime = 60; // minutes 200 201 private Vector<Host> m_lastModifications = new Vector<Host>(); 202 203 /** 204 * How many times a single IP address can change a page per minute? 205 */ 206 private int m_limitSinglePageChanges = 5; 207 208 /** 209 * How many times can you add the exact same string to a page? 210 */ 211 private int m_limitSimilarChanges = 2; 212 213 /** 214 * How many URLs can be added at maximum. 215 */ 216 private int m_maxUrls = 10; 217 218 private Pattern m_urlPattern; 219 private Akismet m_akismet; 220 221 private String m_akismetAPIKey = null; 222 223 private boolean m_useCaptcha = false; 224 225 /** The limit at which we consider something to be spam. */ 226 private int m_scoreLimit = 1; 227 228 /** 229 * If set to true, will ignore anyone who is in Authenticated role. 
230 */ 231 private boolean m_ignoreAuthenticated = false; 232 233 private boolean m_stopAtFirstMatch = true; 234 235 private static String c_hashName; 236 private static long c_lastUpdate; 237 238 /** The HASH_DELAY value is a maximum amount of time that an user can keep 239 * a session open, because after the value has expired, we will invent a new 240 * hash field name. By default this is {@value} hours, which should be ample 241 * time for someone. 242 */ 243 private static final long HASH_DELAY = 24; 244 245 246 /** 247 * {@inheritDoc} 248 */ 249 @Override 250 public void initialize( WikiEngine engine, Properties properties ) { 251 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 252 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 253 m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength); 254 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 255 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, 256 PROP_PAGECHANGES, 257 m_limitSinglePageChanges ); 258 259 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, 260 PROP_SIMILARCHANGES, 261 m_limitSimilarChanges ); 262 263 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 264 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 265 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 266 267 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, 268 PROP_IGNORE_AUTHENTICATED, 269 m_ignoreAuthenticated ); 270 271 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra"); 272 273 try { 274 m_urlPattern = m_compiler.compile( URL_REGEXP ); 275 } catch( MalformedPatternException e ) { 276 log.fatal( "Internal error: Someone put in a faulty pattern.", e ); 277 throw new InternalWikiException( "Faulty pattern." 
, e); 278 } 279 280 m_akismetAPIKey = TextUtil.getStringProperty( properties, 281 PROP_AKISMET_API_KEY, 282 m_akismetAPIKey ); 283 284 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, 285 PROP_FILTERSTRATEGY, 286 STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 287 288 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 289 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 290 291 292 } 293 294 private static final int REJECT = 0; 295 private static final int ACCEPT = 1; 296 private static final int NOTE = 2; 297 298 private static String log( WikiContext ctx, int type, String source, String message ) { 299 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 300 message = TextUtil.replaceString( message, "\"", "\\\"" ); 301 302 String uid = getUniqueID(); 303 304 String page = ctx.getPage().getName(); 305 String reason = "UNKNOWN"; 306 String addr = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 307 308 switch( type ) { 309 case REJECT: 310 reason = "REJECTED"; 311 break; 312 case ACCEPT: 313 reason = "ACCEPTED"; 314 break; 315 case NOTE: 316 reason = "NOTE"; 317 break; 318 default: 319 throw new InternalWikiException( "Illegal type " + type ); 320 } 321 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 322 323 return uid; 324 } 325 326 /** {@inheritDoc} */ 327 public String preSave( WikiContext context, String content ) throws RedirectException { 328 cleanBanList(); 329 refreshBlacklists( context ); 330 Change change = getChange( context, content ); 331 332 if( !ignoreThisUser( context ) ) { 333 checkBanList( context, change ); 334 checkSinglePageChange( context, content, change ); 335 checkIPList( context ); 336 checkPatternList( context, content, change ); 337 checkPageName( context, content, change); 338 } 339 340 if( !m_stopAtFirstMatch ) { 341 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 342 
343 if( score != null && score.intValue() >= m_scoreLimit ) { 344 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 345 } 346 } 347 348 log( context, ACCEPT, "-", change.toString() ); 349 return content; 350 } 351 352 private void checkPageName(WikiContext context, String content, Change change) throws RedirectException { 353 WikiPage page = context.getPage(); 354 String pageName = page.getName(); 355 int maxlength = Integer.valueOf(m_pageNameMaxLength); 356 if ( pageName.length() > maxlength) { 357 // 358 // Spam filter has a match. 359 // 360 361 String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName); 362 363 log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")"); 364 checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" ); 365 366 } 367 } 368 369 private void checkStrategy( WikiContext context, String error, String message ) throws RedirectException { 370 if( m_stopAtFirstMatch ) { 371 throw new RedirectException( message, getRedirectPage( context ) ); 372 } 373 374 Integer score = ( Integer )context.getVariable( ATTR_SPAMFILTER_SCORE ); 375 if( score != null ) { 376 score = score + 1; 377 } else { 378 score = 1; 379 } 380 381 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 382 } 383 384 /** 385 * Parses a list of patterns and returns a Collection of compiled Pattern 386 * objects. 387 * 388 * @param source 389 * @param list 390 * @return A Collection of the Patterns that were found from the lists. 
391 */ 392 private Collection< Pattern > parseWordList( WikiPage source, String list ) { 393 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 394 395 if( list != null ) { 396 StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 397 398 while( tok.hasMoreTokens() ) { 399 String pattern = tok.nextToken(); 400 401 try { 402 compiledpatterns.add( m_compiler.compile( pattern ) ); 403 } catch( MalformedPatternException e ) { 404 log.debug( "Malformed spam filter pattern " + pattern ); 405 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 406 } 407 } 408 } 409 410 return compiledpatterns; 411 } 412 413 /** 414 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 415 * 416 * @param list 417 * @return The parsed blacklist patterns. 418 */ 419 private Collection< Pattern > parseBlacklist( String list ) { 420 ArrayList< Pattern > compiledpatterns = new ArrayList< Pattern >(); 421 422 if( list != null ) { 423 try { 424 BufferedReader in = new BufferedReader( new StringReader(list) ); 425 String line; 426 while( (line = in.readLine() ) != null ) { 427 line = line.trim(); 428 if( line.length() == 0 ) continue; // Empty line 429 if( line.startsWith("#") ) continue; // It's a comment 430 431 int ws = line.indexOf( ' ' ); 432 if( ws == -1 ) ws = line.indexOf( '\t' ); 433 if( ws != -1 ) line = line.substring( 0, ws ); 434 435 try { 436 compiledpatterns.add( m_compiler.compile( line ) ); 437 } catch( MalformedPatternException e ) { 438 log.debug( "Malformed spam filter pattern " + line ); 439 } 440 } 441 } catch( IOException e ) { 442 log.info( "Could not read patterns; returning what I got" , e ); 443 } 444 } 445 446 return compiledpatterns; 447 } 448 449 /** 450 * Takes a single page change and performs a load of tests on the content change. 451 * An admin can modify anything. 
452 * 453 * @param context 454 * @param content 455 * @throws RedirectException 456 */ 457 private synchronized void checkSinglePageChange( WikiContext context, String content, Change change ) 458 throws RedirectException { 459 HttpServletRequest req = context.getHttpRequest(); 460 461 if( req != null ) { 462 String addr = HttpUtil.getRemoteAddress( req ); 463 int hostCounter = 0; 464 int changeCounter = 0; 465 466 log.debug( "Change is " + change.m_change ); 467 468 long time = System.currentTimeMillis() - 60*1000L; // 1 minute 469 470 for( Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 471 Host host = i.next(); 472 473 // 474 // Check if this item is invalid 475 // 476 if( host.getAddedTime() < time ) { 477 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 478 i.remove(); 479 continue; 480 } 481 482 // 483 // Check if this IP address has been seen before 484 // 485 486 if( host.getAddress().equals( addr ) ) { 487 hostCounter++; 488 } 489 490 // 491 // Check, if this change has been seen before 492 // 493 494 if( host.getChange() != null && host.getChange().equals( change ) ) { 495 changeCounter++; 496 } 497 } 498 499 // 500 // Now, let's check against the limits. 501 // 502 if( hostCounter >= m_limitSinglePageChanges ) { 503 Host host = new Host( addr, null ); 504 m_temporaryBanList.add( host ); 505 506 String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 507 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 508 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! 
(Incident code " + uid + ")" ); 509 } 510 511 if( changeCounter >= m_limitSimilarChanges ) { 512 Host host = new Host( addr, null ); 513 m_temporaryBanList.add( host ); 514 515 String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 516 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 517 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 518 } 519 520 // 521 // Calculate the number of links in the addition. 522 // 523 String tstChange = change.toString(); 524 int urlCounter = 0; 525 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 526 MatchResult m = m_matcher.getMatch(); 527 tstChange = tstChange.substring( m.endOffset(0) ); 528 urlCounter++; 529 } 530 531 if( urlCounter > m_maxUrls ) { 532 Host host = new Host( addr, null ); 533 m_temporaryBanList.add( host ); 534 535 String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 536 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 537 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 538 } 539 540 // 541 // Check bot trap 542 // 543 checkBotTrap( context, change ); 544 545 // 546 // Check UTF-8 mangling 547 // 548 checkUTF8( context, change ); 549 550 // 551 // Do Akismet check. This is good to be the last, because this is the most 552 // expensive operation. 553 // 554 checkAkismet( context, change ); 555 556 m_lastModifications.add( new Host( addr, change ) ); 557 } 558 } 559 560 561 /** 562 * Checks against the akismet system. 
563 * 564 * @param context 565 * @param change 566 * @throws RedirectException 567 */ 568 private void checkAkismet( WikiContext context, Change change ) throws RedirectException { 569 if( m_akismetAPIKey != null ) { 570 if( m_akismet == null ) { 571 log.info( "Initializing Akismet spam protection." ); 572 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 573 574 if( !m_akismet.verifyAPIKey() ) { 575 log.error( "Akismet API key cannot be verified. Please check your config." ); 576 m_akismetAPIKey = null; 577 m_akismet = null; 578 } 579 } 580 581 HttpServletRequest req = context.getHttpRequest(); 582 583 // 584 // Akismet will mark all empty statements as spam, so we'll just 585 // ignore them. 586 // 587 if( change.m_adds == 0 && change.m_removals > 0 ) { 588 return; 589 } 590 591 if( req != null && m_akismet != null ) { 592 log.debug( "Calling Akismet to check for spam..." ); 593 594 StopWatch sw = new StopWatch(); 595 sw.start(); 596 597 String ipAddress = HttpUtil.getRemoteAddress( req ); 598 String userAgent = req.getHeader( "User-Agent" ); 599 String referrer = req.getHeader( "Referer"); 600 String permalink = context.getViewURL( context.getPage().getName() ); 601 String commentType = context.getRequestContext().equals( WikiContext.COMMENT ) ? "comment" : "edit"; 602 String commentAuthor = context.getCurrentUser().getName(); 603 String commentAuthorEmail = null; 604 String commentAuthorURL = null; 605 606 boolean isSpam = m_akismet.commentCheck( ipAddress, 607 userAgent, 608 referrer, 609 permalink, 610 commentType, 611 commentAuthor, 612 commentAuthorEmail, 613 commentAuthorURL, 614 change.toString(), 615 null ); 616 617 sw.stop(); 618 log.debug( "Akismet request done in: " + sw ); 619 620 if( isSpam ) { 621 // Host host = new Host( ipAddress, null ); 622 // m_temporaryBanList.add( host ); 623 624 String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 625 log.info( "SPAM:Akismet (" + uid + "). 
Akismet thinks this change is spam; added host to temporary ban list." ); 626 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 627 } 628 } 629 } 630 } 631 632 /** 633 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 634 * 635 * @return A string 636 */ 637 public static String getBotFieldName() { 638 return "submit_auth"; 639 } 640 641 /** 642 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 643 * 644 * @param context 645 * @param change 646 * @throws RedirectException 647 */ 648 private void checkBotTrap( WikiContext context, Change change ) throws RedirectException { 649 HttpServletRequest request = context.getHttpRequest(); 650 651 if( request != null ) { 652 String unspam = request.getParameter( getBotFieldName() ); 653 if( unspam != null && unspam.length() > 0 ) { 654 String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 655 656 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 657 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 658 } 659 } 660 } 661 662 private void checkUTF8( WikiContext context, Change change ) throws RedirectException { 663 HttpServletRequest request = context.getHttpRequest(); 664 665 if( request != null ) { 666 String utf8field = request.getParameter( "encodingcheck" ); 667 668 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 669 String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 670 671 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 672 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 673 } 674 } 675 } 676 677 /** Goes through the ban list and cleans away any host which has expired from it. 
*/ 678 private synchronized void cleanBanList() { 679 long now = System.currentTimeMillis(); 680 681 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 682 Host host = i.next(); 683 684 if( host.getReleaseTime() < now ) { 685 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 686 i.remove(); 687 } 688 } 689 } 690 691 /** 692 * Checks the ban list if the IP address of the changer is already on it. 693 * 694 * @param context 695 * @throws RedirectException 696 */ 697 private void checkBanList( WikiContext context, Change change ) throws RedirectException { 698 HttpServletRequest req = context.getHttpRequest(); 699 700 if( req != null ) { 701 String remote = HttpUtil.getRemoteAddress(req); 702 long now = System.currentTimeMillis(); 703 704 for( Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 705 Host host = i.next(); 706 707 if( host.getAddress().equals( remote ) ) { 708 long timeleft = ( host.getReleaseTime() - now ) / 1000L; 709 710 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 711 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 712 } 713 } 714 } 715 } 716 717 /** 718 * If the spam filter notices changes in the black list page, it will refresh them automatically. 719 * 720 * @param context 721 */ 722 private void refreshBlacklists( WikiContext context ) { 723 try { 724 725 boolean rebuild = false; 726 727 // 728 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 
729 // 730 WikiPage sourceSpam = context.getEngine().getPage( m_forbiddenWordsPage ); 731 if( sourceSpam != null ) { 732 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 733 rebuild = true; 734 } 735 } 736 737 Attachment att = context.getEngine().getAttachmentManager().getAttachmentInfo( context, m_blacklist ); 738 if( att != null ) { 739 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 740 rebuild = true; 741 } 742 } 743 744 WikiPage sourceIPs = context.getEngine().getPage( m_forbiddenIPsPage ); 745 if( sourceIPs != null ) { 746 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 747 rebuild = true; 748 } 749 } 750 751 // 752 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete 753 // filter list regardless of what changed. 754 // 755 if( rebuild ) { 756 m_lastRebuild = new Date(); 757 m_spamPatterns = parseWordList( sourceSpam, 758 ( sourceSpam != null ) ? ( String )sourceSpam.getAttribute( LISTVAR ) : null ); 759 760 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 761 762 m_IPPatterns = parseWordList( sourceIPs, 763 ( sourceIPs != null ) ? 
( String )sourceIPs.getAttribute( LISTIPVAR ) : null ); 764 log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 765 766 if( att != null ) { 767 InputStream in = context.getEngine().getAttachmentManager().getAttachmentStream(att); 768 StringWriter out = new StringWriter(); 769 FileUtil.copyContents( new InputStreamReader( in,"UTF-8" ), out ); 770 Collection< Pattern > blackList = parseBlacklist( out.toString() ); 771 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 772 m_spamPatterns.addAll( blackList ); 773 } 774 } 775 } catch( IOException ex ) { 776 log.info( "Unable to read attachment data, continuing...", ex ); 777 } catch( ProviderException ex ) { 778 log.info( "Failed to read spam filter attachment, continuing...", ex ); 779 } 780 } 781 782 /** 783 * Does a check against a known pattern list. 784 * 785 * @param context 786 * @param content 787 * @param change 788 * @throws RedirectException 789 */ 790 private void checkPatternList( WikiContext context, String content, Change change ) throws RedirectException { 791 // 792 // If we have no spam patterns defined, or we're trying to save 793 // the page containing the patterns, just return. 794 // 795 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 796 return; 797 } 798 799 String ch = change.toString(); 800 if( context.getHttpRequest() != null ) { 801 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 802 } 803 804 for( Pattern p : m_spamPatterns ) { 805 // log.debug("Attempting to match page contents with "+p.getPattern()); 806 807 if( m_matcher.contains( ch, p ) ) { 808 // 809 // Spam filter has a match. 810 // 811 String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 812 813 log.info( "SPAM:Regexp (" + uid + "). 
Content matches the spam filter '" + p.getPattern() + "'" ); 814 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 815 } 816 } 817 } 818 819 820 /** 821 * Does a check against a pattern list of IPs. 822 * 823 * @param context 824 * @throws RedirectException 825 */ 826 private void checkIPList( WikiContext context ) throws RedirectException { 827 // 828 // If we have no IP patterns defined, or we're trying to save 829 // the page containing the IP patterns, just return. 830 // 831 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 832 return; 833 } 834 835 String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 836 log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 837 838 for( Pattern p : m_IPPatterns ) { 839 log.debug("Attempting to match remoteIP with " + p.getPattern()); 840 841 if( m_matcher.contains( remoteIP, p ) ) { 842 843 // IP filter has a match. 844 // 845 String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 846 847 log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 848 checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 849 } 850 } 851 } 852 853 private void checkPatternList( WikiContext context, String content, String change ) throws RedirectException { 854 Change c = new Change(); 855 c.m_change = change; 856 checkPatternList( context, content, c ); 857 } 858 859 /** 860 * Creates a simple text string describing the added content. 861 * 862 * @param context 863 * @param newText 864 * @return Empty string, if there is no change. 
865 */ 866 private static Change getChange( WikiContext context, String newText ) { 867 WikiPage page = context.getPage(); 868 StringBuffer change = new StringBuffer(); 869 WikiEngine engine = context.getEngine(); 870 // Get current page version 871 872 Change ch = new Change(); 873 874 try { 875 String oldText = engine.getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 876 877 String[] first = Diff.stringToArray( oldText ); 878 String[] second = Diff.stringToArray( newText ); 879 Revision rev = Diff.diff( first, second, new MyersDiff() ); 880 881 if( rev == null || rev.size() == 0 ) { 882 return ch; 883 } 884 885 for( int i = 0; i < rev.size(); i++ ) { 886 Delta d = rev.getDelta( i ); 887 888 if( d instanceof AddDelta ) { 889 d.getRevised().toString( change, "", "\r\n" ); 890 ch.m_adds++; 891 892 } else if( d instanceof ChangeDelta ) { 893 d.getRevised().toString( change, "", "\r\n" ); 894 ch.m_adds++; 895 896 } else if( d instanceof DeleteDelta ) { 897 ch.m_removals++; 898 } 899 } 900 } catch( DifferentiationFailedException e ) { 901 log.error( "Diff failed", e ); 902 } 903 904 // 905 // Don't forget to include the change note, too 906 // 907 String changeNote = ( String )page.getAttribute( WikiPage.CHANGENOTE ); 908 909 if( changeNote != null ) { 910 change.append( "\r\n" ); 911 change.append( changeNote ); 912 } 913 914 // 915 // And author as well 916 // 917 if( page.getAuthor() != null ) { 918 change.append( "\r\n" + page.getAuthor() ); 919 } 920 921 ch.m_change = change.toString(); 922 return ch; 923 } 924 925 /** 926 * Returns true, if this user should be ignored. For example, admin users. 927 * 928 * @param context 929 * @return True, if this users should be ignored. 
930 */ 931 private boolean ignoreThisUser( WikiContext context ) { 932 if( context.hasAdminPermissions() ) { 933 return true; 934 } 935 936 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 937 return true; 938 } 939 940 if( context.getVariable( "captcha" ) != null ) { 941 return true; 942 } 943 944 return false; 945 } 946 947 /** 948 * Returns a random string of six uppercase characters. 949 * 950 * @return A random string 951 */ 952 private static String getUniqueID() { 953 StringBuilder sb = new StringBuilder(); 954 Random rand = new Random(); 955 956 for( int i = 0; i < 6; i++ ) { 957 char x = ( char )( 'A' + rand.nextInt( 26 ) ); 958 sb.append( x ); 959 } 960 961 return sb.toString(); 962 } 963 964 /** 965 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 966 * 967 * @param ctx WikiContext 968 * @return An URL to redirect to 969 */ 970 private String getRedirectPage( WikiContext ctx ) { 971 if( m_useCaptcha ) { 972 return ctx.getURL( WikiContext.NONE, "Captcha.jsp", "page="+ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 973 } 974 975 return ctx.getURL( WikiContext.VIEW, m_errorPage ); 976 } 977 978 /** 979 * Checks whether the UserProfile matches certain checks. 980 * 981 * @param profile The profile to check 982 * @param context The WikiContext 983 * @return False, if this userprofile is suspect and should not be allowed to be added. 
984 * @since 2.6.1 985 */ 986 public boolean isValidUserProfile( WikiContext context, UserProfile profile ) { 987 try { 988 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 989 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 990 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 991 } catch( RedirectException e ) { 992 log.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 993 return false; 994 } 995 996 return true; 997 } 998 999 /** 1000 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 1001 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 1002 * 1003 * @param page The WikiPage under edit 1004 * @param request The HTTP Request 1005 * @since 2.6 1006 * @return A hash value for this page and session 1007 */ 1008 public static final String getSpamHash( WikiPage page, HttpServletRequest request ) { 1009 long lastModified = 0; 1010 1011 if( page.getLastModified() != null ) { 1012 lastModified = page.getLastModified().getTime(); 1013 } 1014 long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 1015 1016 return Long.toString( lastModified ^ remote ); 1017 } 1018 1019 /** 1020 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 1021 * the session has expired, you cannot edit anymore. 
1022 * 1023 * @param request The page request 1024 * @return The name to be used in the hash field 1025 * @since 2.6 1026 */ 1027 public static final String getHashFieldName( HttpServletRequest request ) { 1028 String hash = null; 1029 1030 if( request.getSession() != null ) { 1031 hash = ( String )request.getSession().getAttribute( "_hash" ); 1032 1033 if( hash == null ) { 1034 hash = c_hashName; 1035 request.getSession().setAttribute( "_hash", hash ); 1036 } 1037 } 1038 1039 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 1040 c_hashName = getUniqueID().toLowerCase(); 1041 c_lastUpdate = System.currentTimeMillis(); 1042 } 1043 1044 return hash != null ? hash : c_hashName; 1045 } 1046 1047 1048 /** 1049 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 1050 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 1051 * and their session has expired. 1052 * <p> 1053 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 1054 * the spam log (it may or may not be spam, but it's rather likely that it is). 1055 * 1056 * @param context The WikiContext 1057 * @param pageContext The JSP PageContext. 1058 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
1059 * @throws IOException If redirection fails 1060 * @since 2.6 1061 */ 1062 public static final boolean checkHash( WikiContext context, PageContext pageContext ) throws IOException { 1063 String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 1064 1065 if( pageContext.getRequest().getParameter(hashName) == null ) { 1066 if( pageContext.getAttribute( hashName ) == null ) { 1067 Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 1068 log( context, REJECT, "MissingHash", change.m_change ); 1069 1070 String redirect = context.getURL( WikiContext.VIEW,"SessionExpired" ); 1071 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 1072 return false; 1073 } 1074 } 1075 1076 return true; 1077 } 1078 1079 /** 1080 * This helper method adds all the input fields to your editor that the SpamFilter requires 1081 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1082 * 1083 * @param pageContext The PageContext 1084 * @return A HTML string which contains input fields for the SpamFilter. 1085 */ 1086 public static final String insertInputFields( final PageContext pageContext ) { 1087 final WikiContext ctx = WikiContext.findContext( pageContext ); 1088 final WikiEngine engine = ctx.getEngine(); 1089 final StringBuilder sb = new StringBuilder(); 1090 if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) { 1091 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1092 } 1093 1094 return sb.toString(); 1095 } 1096 1097 /** 1098 * A local class for storing host information. 
*/
    private class Host {

        /** Creation time of this entry, in milliseconds since the epoch. */
        private long m_addedTime = System.currentTimeMillis();

        /** Creation time plus <code>m_banTime</code> minutes, in milliseconds since the epoch. */
        private long m_releaseTime;

        private String m_address;
        private Change m_change;

        public String getAddress() {
            return m_address;
        }

        public long getReleaseTime() {
            return m_releaseTime;
        }

        public long getAddedTime() {
            return m_addedTime;
        }

        public Change getChange() {
            return m_change;
        }

        /**
         *  Creates an entry for the given address and change.
         *
         *  @param ipaddress The address of the host
         *  @param change The change associated with this host
         */
        public Host( String ipaddress, Change change ) {
            m_address = ipaddress;
            m_change = change;
            //  m_banTime is read from the enclosing filter and is converted from minutes to milliseconds here.
            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
        }

    }

    /**
     *  Describes one change: its text plus counters for added and removed chunks.
     *  Two changes are considered equal when their texts are equal; the counters are not compared.
     *  A change whose text has not been set (null) is handled like any other value by
     *  {@link #equals(Object)} and {@link #hashCode()}, and prints as an empty string.
     */
    private static class Change {

        public String m_change;
        public int m_adds;
        public int m_removals;

        /**
         *  @return The change text, or an empty string if no text has been set.
         */
        @Override
        public String toString() {
            return m_change != null ? m_change : "";
        }

        @Override
        public boolean equals( Object o ) {
            if( o instanceof Change ) {
                //  Null-safe comparison: a change without text must not cause a NullPointerException.
                return java.util.Objects.equals( m_change, ( ( Change )o ).m_change );
            }
            return false;
        }

        @Override
        public int hashCode() {
            //  Same value as before for non-null text; 17 when the text is null.
            return java.util.Objects.hashCode( m_change ) + 17;
        }

    }

}