001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import net.sf.akismet.Akismet; 022import org.apache.commons.lang3.time.StopWatch; 023import org.apache.log4j.Logger; 024import org.apache.oro.text.regex.MalformedPatternException; 025import org.apache.oro.text.regex.MatchResult; 026import org.apache.oro.text.regex.Pattern; 027import org.apache.oro.text.regex.PatternCompiler; 028import org.apache.oro.text.regex.PatternMatcher; 029import org.apache.oro.text.regex.Perl5Compiler; 030import org.apache.oro.text.regex.Perl5Matcher; 031import org.apache.wiki.InternalWikiException; 032import org.apache.wiki.api.core.Attachment; 033import org.apache.wiki.api.core.Context; 034import org.apache.wiki.api.core.ContextEnum; 035import org.apache.wiki.api.core.Engine; 036import org.apache.wiki.api.core.Page; 037import org.apache.wiki.api.exceptions.ProviderException; 038import org.apache.wiki.api.exceptions.RedirectException; 039import org.apache.wiki.api.filters.BasePageFilter; 040import org.apache.wiki.api.providers.WikiProvider; 041import org.apache.wiki.attachment.AttachmentManager; 042import org.apache.wiki.auth.user.UserProfile; 043import org.apache.wiki.pages.PageManager; 
044import org.apache.wiki.ui.EditorManager; 045import org.apache.wiki.util.FileUtil; 046import org.apache.wiki.util.HttpUtil; 047import org.apache.wiki.util.TextUtil; 048import org.suigeneris.jrcs.diff.Diff; 049import org.suigeneris.jrcs.diff.DifferentiationFailedException; 050import org.suigeneris.jrcs.diff.Revision; 051import org.suigeneris.jrcs.diff.delta.AddDelta; 052import org.suigeneris.jrcs.diff.delta.ChangeDelta; 053import org.suigeneris.jrcs.diff.delta.DeleteDelta; 054import org.suigeneris.jrcs.diff.delta.Delta; 055import org.suigeneris.jrcs.diff.myers.MyersDiff; 056 057import javax.servlet.http.HttpServletRequest; 058import javax.servlet.http.HttpServletResponse; 059import javax.servlet.jsp.PageContext; 060import java.io.BufferedReader; 061import java.io.IOException; 062import java.io.InputStream; 063import java.io.InputStreamReader; 064import java.io.StringReader; 065import java.io.StringWriter; 066import java.nio.charset.StandardCharsets; 067import java.util.ArrayList; 068import java.util.Collection; 069import java.util.Date; 070import java.util.Iterator; 071import java.util.Properties; 072import java.util.Random; 073import java.util.StringTokenizer; 074import java.util.Vector; 075import java.util.concurrent.ThreadLocalRandom; 076 077 078/** 079 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 080 * 081 * Parameters: 082 * <ul> 083 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 084 * that page. Default is "SpamFilterWordList". 085 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 086 * that page. Default is "SpamFilterIPList". 087 * <li>maxpagenamelength - Maximum page name length. Default is 100. 088 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. 
Default is "SpamFilterWordList/blacklist.txt".</li>
 *    <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage".</li>
 *    <li>pagechangesinminute - How many page changes are allowed per minute. Default is 5.</li>
 *    <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 *    <li>bantime - How long, in minutes, an IP address stays on the temporary ban list. Default is 60.</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam. Default is 10.</li>
 *    <li>akismet-apikey - The Akismet API key (see akismet.org).</li>
 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter.</li>
 *    <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 *        match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *        and calculate a score for the spam, which is then compared to a filter level value.</li>
101 * </ul> 102 * 103 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 104 * with the editor system.</p> 105 * 106 * <p>Changes by admin users are ignored in any case.</p> 107 * 108 * @since 2.1.112 109 */ 110public class SpamFilter extends BasePageFilter { 111 112 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 113 private static final String REASON_REGEXP = "Regexp"; 114 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 115 private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently"; 116 private static final String REASON_BOT_TRAP = "BotTrap"; 117 private static final String REASON_AKISMET = "Akismet"; 118 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 119 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 120 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 121 private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong"; 122 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 123 124 private static final String LISTVAR = "spamwords"; 125 private static final String LISTIPVAR = "ips"; 126 127 private static final Random RANDOM = ThreadLocalRandom.current(); 128 129 /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */ 130 public static final String PROP_WORDLIST = "wordlist"; 131 132 /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */ 133 public static final String PROP_IPLIST = "IPlist"; 134 135 /** The filter property name for specifying the maximum page name length. Value is <tt>{@value}</tt>. */ 136 public static final String PROP_MAX_PAGENAME_LENGTH = "maxpagenamelength"; 137 138 /** The filter property name for the page to which you are directed if Herb rejects your edit. Value is <tt>{@value}</tt>. 
*/ 139 public static final String PROP_ERRORPAGE = "errorpage"; 140 141 /** The filter property name for specifying how many changes is any given IP address 142 * allowed to do per minute. Value is <tt>{@value}</tt>. 143 */ 144 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 145 146 /** The filter property name for specifying how many similar changes are allowed before a host is banned. Value is <tt>{@value}</tt>. */ 147 public static final String PROP_SIMILARCHANGES = "similarchanges"; 148 149 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 150 public static final String PROP_BANTIME = "bantime"; 151 152 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 153 public static final String PROP_BLACKLIST = "blacklist"; 154 155 /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */ 156 public static final String PROP_MAXURLS = "maxurls"; 157 158 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 159 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 160 161 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 162 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 163 164 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 165 public static final String PROP_CAPTCHA = "captcha"; 166 167 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */ 168 public static final String PROP_FILTERSTRATEGY = "strategy"; 169 170 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */ 171 public static final String STRATEGY_EAGER = "eager"; 172 173 /** The string specifying the "score" strategy. 
Value is <tt>{@value}</tt>. */ 174 public static final String STRATEGY_SCORE = "score"; 175 176 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 177 178 private String m_forbiddenWordsPage = "SpamFilterWordList"; 179 private String m_forbiddenIPsPage = "SpamFilterIPList"; 180 private String m_pageNameMaxLength = "100"; 181 private String m_errorPage = "RejectedMessage"; 182 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 183 184 private final PatternMatcher m_matcher = new Perl5Matcher(); 185 private final PatternCompiler m_compiler = new Perl5Compiler(); 186 187 private Collection<Pattern> m_spamPatterns = null; 188 private Collection<Pattern> m_IPPatterns = null; 189 190 private Date m_lastRebuild = new Date( 0L ); 191 192 private static final Logger c_spamlog = Logger.getLogger( "SpamLog" ); 193 private static final Logger log = Logger.getLogger( SpamFilter.class ); 194 195 private Vector<Host> m_temporaryBanList = new Vector<>(); 196 197 private int m_banTime = 60; // minutes 198 199 private Vector<Host> m_lastModifications = new Vector<>(); 200 201 /** How many times a single IP address can change a page per minute? */ 202 private int m_limitSinglePageChanges = 5; 203 204 /** How many times can you add the exact same string to a page? */ 205 private int m_limitSimilarChanges = 2; 206 207 /** How many URLs can be added at maximum. */ 208 private int m_maxUrls = 10; 209 210 private Pattern m_urlPattern; 211 private Akismet m_akismet; 212 213 private String m_akismetAPIKey = null; 214 215 private boolean m_useCaptcha = false; 216 217 /** The limit at which we consider something to be spam. */ 218 private int m_scoreLimit = 1; 219 220 /** If set to true, will ignore anyone who is in Authenticated role. 
*/ 221 private boolean m_ignoreAuthenticated = false; 222 223 private boolean m_stopAtFirstMatch = true; 224 225 private static String c_hashName; 226 private static long c_lastUpdate; 227 228 /** The HASH_DELAY value is a maximum amount of time that an user can keep 229 * a session open, because after the value has expired, we will invent a new 230 * hash field name. By default this is {@value} hours, which should be ample 231 * time for someone. 232 */ 233 private static final long HASH_DELAY = 24; 234 235 236 /** 237 * {@inheritDoc} 238 */ 239 @Override 240 public void initialize( final Engine engine, final Properties properties ) { 241 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 242 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 243 m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength); 244 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 245 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges ); 246 247 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges ); 248 249 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 250 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 251 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 252 253 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated ); 254 255 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra"); 256 257 try { 258 m_urlPattern = m_compiler.compile( URL_REGEXP ); 259 } catch( final MalformedPatternException e ) { 260 log.fatal( "Internal error: Someone put in a faulty pattern.", e ); 261 throw new InternalWikiException( "Faulty pattern." 
, e); 262 } 263 264 m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey ); 265 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 266 267 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 268 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 269 } 270 271 private static final int REJECT = 0; 272 private static final int ACCEPT = 1; 273 private static final int NOTE = 2; 274 275 private static String log( final Context ctx, final int type, final String source, String message ) { 276 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 277 message = TextUtil.replaceString( message, "\"", "\\\"" ); 278 279 final String uid = getUniqueID(); 280 final String page = ctx.getPage().getName(); 281 final String addr = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 282 final String reason; 283 switch( type ) { 284 case REJECT: reason = "REJECTED"; 285 break; 286 case ACCEPT: reason = "ACCEPTED"; 287 break; 288 case NOTE: reason = "NOTE"; 289 break; 290 default: throw new InternalWikiException( "Illegal type " + type ); 291 } 292 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 293 294 return uid; 295 } 296 297 /** {@inheritDoc} */ 298 @Override 299 public String preSave( final Context context, final String content ) throws RedirectException { 300 cleanBanList(); 301 refreshBlacklists( context ); 302 final Change change = getChange( context, content ); 303 304 if( !ignoreThisUser( context ) ) { 305 checkBanList( context, change ); 306 checkSinglePageChange( context, content, change ); 307 checkIPList( context ); 308 checkPatternList( context, content, change ); 309 checkPageName( context, content, change); 310 } 311 312 if( !m_stopAtFirstMatch ) { 313 final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 314 
315 if( score != null && score >= m_scoreLimit ) { 316 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 317 } 318 } 319 320 log( context, ACCEPT, "-", change.toString() ); 321 return content; 322 } 323 324 private void checkPageName( final Context context, final String content, final Change change ) throws RedirectException { 325 final Page page = context.getPage(); 326 final String pageName = page.getName(); 327 final int maxlength = Integer.parseInt(m_pageNameMaxLength); 328 if ( pageName.length() > maxlength) { 329 // 330 // Spam filter has a match. 331 // 332 333 final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName); 334 335 log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")"); 336 checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" ); 337 338 } 339 } 340 341 private void checkStrategy( final Context context, final String error, final String message ) throws RedirectException { 342 if( m_stopAtFirstMatch ) { 343 throw new RedirectException( message, getRedirectPage( context ) ); 344 } 345 346 Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 347 if( score != null ) { 348 score = score + 1; 349 } else { 350 score = 1; 351 } 352 353 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 354 } 355 356 /** 357 * Parses a list of patterns and returns a Collection of compiled Pattern objects. 358 * 359 * @param source page containing the list of patterns. 360 * @param list list of patterns. 361 * @return A Collection of the Patterns that were found from the lists. 
362 */ 363 private Collection< Pattern > parseWordList( final Page source, final String list ) { 364 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 365 366 if( list != null ) { 367 final StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 368 369 while( tok.hasMoreTokens() ) { 370 final String pattern = tok.nextToken(); 371 372 try { 373 compiledpatterns.add( m_compiler.compile( pattern ) ); 374 } catch( final MalformedPatternException e ) { 375 log.debug( "Malformed spam filter pattern " + pattern ); 376 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 377 } 378 } 379 } 380 381 return compiledpatterns; 382 } 383 384 /** 385 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 386 * 387 * @param list list of patterns. 388 * @return The parsed blacklist patterns. 389 */ 390 private Collection< Pattern > parseBlacklist( final String list ) { 391 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 392 393 if( list != null ) { 394 try { 395 final BufferedReader in = new BufferedReader( new StringReader(list) ); 396 String line; 397 while( (line = in.readLine() ) != null ) { 398 line = line.trim(); 399 if( line.length() == 0 ) continue; // Empty line 400 if( line.startsWith("#") ) continue; // It's a comment 401 402 int ws = line.indexOf( ' ' ); 403 if( ws == -1 ) ws = line.indexOf( '\t' ); 404 if( ws != -1 ) line = line.substring( 0, ws ); 405 406 try { 407 compiledpatterns.add( m_compiler.compile( line ) ); 408 } catch( final MalformedPatternException e ) { 409 log.debug( "Malformed spam filter pattern " + line ); 410 } 411 } 412 } catch( final IOException e ) { 413 log.info( "Could not read patterns; returning what I got" , e ); 414 } 415 } 416 417 return compiledpatterns; 418 } 419 420 /** 421 * Takes a single page change and performs a load of tests on the content change. An admin can modify anything. 
422 * 423 * @param context page Context 424 * @param content page content 425 * @param change page change 426 * @throws RedirectException spam filter rejects the page change. 427 */ 428 private synchronized void checkSinglePageChange( final Context context, final String content, final Change change ) 429 throws RedirectException { 430 final HttpServletRequest req = context.getHttpRequest(); 431 432 if( req != null ) { 433 final String addr = HttpUtil.getRemoteAddress( req ); 434 int hostCounter = 0; 435 int changeCounter = 0; 436 437 log.debug( "Change is " + change.m_change ); 438 439 final long time = System.currentTimeMillis() - 60*1000L; // 1 minute 440 441 for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 442 final Host host = i.next(); 443 444 // Check if this item is invalid 445 if( host.getAddedTime() < time ) { 446 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 447 i.remove(); 448 continue; 449 } 450 451 // Check if this IP address has been seen before 452 if( host.getAddress().equals( addr ) ) { 453 hostCounter++; 454 } 455 456 // Check, if this change has been seen before 457 if( host.getChange() != null && host.getChange().equals( change ) ) { 458 changeCounter++; 459 } 460 } 461 462 // Now, let's check against the limits. 463 if( hostCounter >= m_limitSinglePageChanges ) { 464 final Host host = new Host( addr, null ); 465 m_temporaryBanList.add( host ); 466 467 final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 468 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 469 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! 
(Incident code " + uid + ")" ); 470 } 471 472 if( changeCounter >= m_limitSimilarChanges ) { 473 final Host host = new Host( addr, null ); 474 m_temporaryBanList.add( host ); 475 476 final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 477 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 478 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 479 } 480 481 // Calculate the number of links in the addition. 482 String tstChange = change.toString(); 483 int urlCounter = 0; 484 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 485 final MatchResult m = m_matcher.getMatch(); 486 tstChange = tstChange.substring( m.endOffset(0) ); 487 urlCounter++; 488 } 489 490 if( urlCounter > m_maxUrls ) { 491 final Host host = new Host( addr, null ); 492 m_temporaryBanList.add( host ); 493 494 final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 495 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 496 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 497 } 498 499 // Check bot trap 500 checkBotTrap( context, change ); 501 502 // Check UTF-8 mangling 503 checkUTF8( context, change ); 504 505 // Do Akismet check. This is good to be the last, because this is the most expensive operation. 506 checkAkismet( context, change ); 507 508 m_lastModifications.add( new Host( addr, change ) ); 509 } 510 } 511 512 513 /** 514 * Checks against the akismet system. 515 * 516 * @param context page Context 517 * @throws RedirectException spam filter rejects the page change. 
518 */ 519 private void checkAkismet( final Context context, final Change change ) throws RedirectException { 520 if( m_akismetAPIKey != null ) { 521 if( m_akismet == null ) { 522 log.info( "Initializing Akismet spam protection." ); 523 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 524 525 if( !m_akismet.verifyAPIKey() ) { 526 log.error( "Akismet API key cannot be verified. Please check your config." ); 527 m_akismetAPIKey = null; 528 m_akismet = null; 529 } 530 } 531 532 final HttpServletRequest req = context.getHttpRequest(); 533 534 // Akismet will mark all empty statements as spam, so we'll just ignore them. 535 if( change.m_adds == 0 && change.m_removals > 0 ) { 536 return; 537 } 538 539 if( req != null && m_akismet != null ) { 540 log.debug( "Calling Akismet to check for spam..." ); 541 542 final StopWatch sw = new StopWatch(); 543 sw.start(); 544 545 final String ipAddress = HttpUtil.getRemoteAddress( req ); 546 final String userAgent = req.getHeader( "User-Agent" ); 547 final String referrer = req.getHeader( "Referer"); 548 final String permalink = context.getViewURL( context.getPage().getName() ); 549 final String commentType = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit"; 550 final String commentAuthor = context.getCurrentUser().getName(); 551 final String commentAuthorEmail = null; 552 final String commentAuthorURL = null; 553 554 final boolean isSpam = m_akismet.commentCheck( ipAddress, 555 userAgent, 556 referrer, 557 permalink, 558 commentType, 559 commentAuthor, 560 commentAuthorEmail, 561 commentAuthorURL, 562 change.toString(), 563 null ); 564 565 sw.stop(); 566 log.debug( "Akismet request done in: " + sw ); 567 568 if( isSpam ) { 569 // Host host = new Host( ipAddress, null ); 570 // m_temporaryBanList.add( host ); 571 572 final String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 573 log.info( "SPAM:Akismet (" + uid + "). 
Akismet thinks this change is spam; added host to temporary ban list." ); 574 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 575 } 576 } 577 } 578 } 579 580 /** 581 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 582 * 583 * @return A string 584 */ 585 public static String getBotFieldName() { 586 return "submit_auth"; 587 } 588 589 /** 590 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 591 * 592 * @param context page Context 593 * @param change page change 594 * @throws RedirectException spam filter rejects the page change. 595 */ 596 private void checkBotTrap( final Context context, final Change change ) throws RedirectException { 597 final HttpServletRequest request = context.getHttpRequest(); 598 if( request != null ) { 599 final String unspam = request.getParameter( getBotFieldName() ); 600 if( unspam != null && unspam.length() > 0 ) { 601 final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 602 603 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 604 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 605 } 606 } 607 } 608 609 private void checkUTF8( final Context context, final Change change ) throws RedirectException { 610 final HttpServletRequest request = context.getHttpRequest(); 611 if( request != null ) { 612 final String utf8field = request.getParameter( "encodingcheck" ); 613 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 614 final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 615 616 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 617 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. 
(Incident code " + uid + ")" ); 618 } 619 } 620 } 621 622 /** Goes through the ban list and cleans away any host which has expired from it. */ 623 private synchronized void cleanBanList() { 624 final long now = System.currentTimeMillis(); 625 for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 626 final Host host = i.next(); 627 628 if( host.getReleaseTime() < now ) { 629 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 630 i.remove(); 631 } 632 } 633 } 634 635 /** 636 * Checks the ban list if the IP address of the changer is already on it. 637 * 638 * @param context page context 639 * @throws RedirectException spam filter rejects the page change. 640 */ 641 private void checkBanList( final Context context, final Change change ) throws RedirectException { 642 final HttpServletRequest req = context.getHttpRequest(); 643 644 if( req != null ) { 645 final String remote = HttpUtil.getRemoteAddress(req); 646 final long now = System.currentTimeMillis(); 647 648 for( final Host host : m_temporaryBanList ) { 649 if( host.getAddress().equals( remote ) ) { 650 final long timeleft = ( host.getReleaseTime() - now ) / 1000L; 651 652 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 653 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, 654 "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 655 } 656 } 657 } 658 } 659 660 /** 661 * If the spam filter notices changes in the black list page, it will refresh them automatically. 662 * 663 * @param context associated WikiContext 664 */ 665 private void refreshBlacklists( final Context context ) { 666 try { 667 boolean rebuild = false; 668 669 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 
670 final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage ); 671 if( sourceSpam != null ) { 672 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 673 rebuild = true; 674 } 675 } 676 677 final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist ); 678 if( att != null ) { 679 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 680 rebuild = true; 681 } 682 } 683 684 final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage ); 685 if( sourceIPs != null ) { 686 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 687 rebuild = true; 688 } 689 } 690 691 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete filter list regardless of what changed. 692 if( rebuild ) { 693 m_lastRebuild = new Date(); 694 m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null ); 695 696 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 697 698 m_IPPatterns = parseWordList( sourceIPs, ( sourceIPs != null ) ? 
sourceIPs.getAttribute( LISTIPVAR ) : null ); 699 log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 700 701 if( att != null ) { 702 final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att); 703 final StringWriter out = new StringWriter(); 704 FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out ); 705 final Collection< Pattern > blackList = parseBlacklist( out.toString() ); 706 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 707 m_spamPatterns.addAll( blackList ); 708 } 709 } 710 } catch( final IOException ex ) { 711 log.info( "Unable to read attachment data, continuing...", ex ); 712 } catch( final ProviderException ex ) { 713 log.info( "Failed to read spam filter attachment, continuing...", ex ); 714 } 715 } 716 717 /** 718 * Does a check against a known pattern list. 719 * 720 * @param context page Context 721 * @param content page content 722 * @param change page change 723 * @throws RedirectException spam filter rejects the page change. 724 */ 725 private void checkPatternList( final Context context, final String content, final Change change ) throws RedirectException { 726 // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return. 727 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 728 return; 729 } 730 731 String ch = change.toString(); 732 if( context.getHttpRequest() != null ) { 733 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 734 } 735 736 for( final Pattern p : m_spamPatterns ) { 737 // log.debug("Attempting to match page contents with "+p.getPattern()); 738 739 if( m_matcher.contains( ch, p ) ) { 740 // Spam filter has a match. 
741 final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 742 743 log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" ); 744 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 745 } 746 } 747 } 748 749 750 /** 751 * Does a check against a pattern list of IPs. 752 * 753 * @param context page context 754 * @throws RedirectException spam filter rejects the page change. 755 */ 756 private void checkIPList( final Context context ) throws RedirectException { 757 // If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return. 758 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 759 return; 760 } 761 762 final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 763 log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 764 765 for( final Pattern p : m_IPPatterns ) { 766 log.debug("Attempting to match remoteIP with " + p.getPattern()); 767 768 if( m_matcher.contains( remoteIP, p ) ) { 769 770 // IP filter has a match. 771 // 772 final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 773 774 log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 775 checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 776 } 777 } 778 } 779 780 private void checkPatternList( final Context context, final String content, final String change ) throws RedirectException { 781 final Change c = new Change(); 782 c.m_change = change; 783 checkPatternList( context, content, c ); 784 } 785 786 /** 787 * Creates a simple text string describing the added content. 
788 * 789 * @param context page context 790 * @param newText added content 791 * @return Empty string, if there is no change. 792 */ 793 private static Change getChange( final Context context, final String newText ) { 794 final Page page = context.getPage(); 795 final StringBuffer change = new StringBuffer(); 796 final Engine engine = context.getEngine(); 797 // Get current page version 798 799 final Change ch = new Change(); 800 801 try { 802 final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 803 final String[] first = Diff.stringToArray( oldText ); 804 final String[] second = Diff.stringToArray( newText ); 805 final Revision rev = Diff.diff( first, second, new MyersDiff() ); 806 807 if( rev == null || rev.size() == 0 ) { 808 return ch; 809 } 810 811 for( int i = 0; i < rev.size(); i++ ) { 812 final Delta d = rev.getDelta( i ); 813 814 if( d instanceof AddDelta ) { 815 d.getRevised().toString( change, "", "\r\n" ); 816 ch.m_adds++; 817 818 } else if( d instanceof ChangeDelta ) { 819 d.getRevised().toString( change, "", "\r\n" ); 820 ch.m_adds++; 821 822 } else if( d instanceof DeleteDelta ) { 823 ch.m_removals++; 824 } 825 } 826 } catch( final DifferentiationFailedException e ) { 827 log.error( "Diff failed", e ); 828 } 829 830 // Don't forget to include the change note, too 831 final String changeNote = page.getAttribute( Page.CHANGENOTE ); 832 if( changeNote != null ) { 833 change.append( "\r\n" ); 834 change.append( changeNote ); 835 } 836 837 // And author as well 838 if( page.getAuthor() != null ) { 839 change.append( "\r\n" + page.getAuthor() ); 840 } 841 842 ch.m_change = change.toString(); 843 return ch; 844 } 845 846 /** 847 * Returns true, if this user should be ignored. For example, admin users. 848 * 849 * @param context page context 850 * @return True, if this users should be ignored. 
851 */ 852 private boolean ignoreThisUser( final Context context ) { 853 if( context.hasAdminPermissions() ) { 854 return true; 855 } 856 857 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 858 return true; 859 } 860 861 return context.getVariable("captcha") != null; 862 } 863 864 /** 865 * Returns a random string of six uppercase characters. 866 * 867 * @return A random string 868 */ 869 private static String getUniqueID() { 870 final StringBuilder sb = new StringBuilder(); 871 for( int i = 0; i < 6; i++ ) { 872 final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) ); 873 sb.append( x ); 874 } 875 876 return sb.toString(); 877 } 878 879 /** 880 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 881 * 882 * @param ctx WikiContext 883 * @return An URL to redirect to 884 */ 885 private String getRedirectPage( final Context ctx ) { 886 if( m_useCaptcha ) { 887 return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 888 } 889 890 return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage ); 891 } 892 893 /** 894 * Checks whether the UserProfile matches certain checks. 895 * 896 * @param profile The profile to check 897 * @param context The WikiContext 898 * @return False, if this userprofile is suspect and should not be allowed to be added. 
899 * @since 2.6.1 900 */ 901 public boolean isValidUserProfile( final Context context, final UserProfile profile ) { 902 try { 903 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 904 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 905 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 906 } catch( final RedirectException e ) { 907 log.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 908 return false; 909 } 910 911 return true; 912 } 913 914 /** 915 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 916 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 917 * 918 * @param page The WikiPage under edit 919 * @param request The HTTP Request 920 * @since 2.6 921 * @return A hash value for this page and session 922 */ 923 public static String getSpamHash( final Page page, final HttpServletRequest request ) { 924 long lastModified = 0; 925 926 if( page.getLastModified() != null ) { 927 lastModified = page.getLastModified().getTime(); 928 } 929 final long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 930 931 return Long.toString( lastModified ^ remote ); 932 } 933 934 /** 935 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 936 * the session has expired, you cannot edit anymore. 
937 * 938 * @param request The page request 939 * @return The name to be used in the hash field 940 * @since 2.6 941 */ 942 public static String getHashFieldName( final HttpServletRequest request ) { 943 String hash = null; 944 945 if( request.getSession() != null ) { 946 hash = ( String )request.getSession().getAttribute( "_hash" ); 947 948 if( hash == null ) { 949 hash = c_hashName; 950 request.getSession().setAttribute( "_hash", hash ); 951 } 952 } 953 954 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 955 c_hashName = getUniqueID().toLowerCase(); 956 c_lastUpdate = System.currentTimeMillis(); 957 } 958 959 return hash != null ? hash : c_hashName; 960 } 961 962 963 /** 964 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 965 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 966 * and their session has expired. 967 * <p> 968 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 969 * the spam log (it may or may not be spam, but it's rather likely that it is). 970 * 971 * @param context The WikiContext 972 * @param pageContext The JSP PageContext. 973 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
974 * @throws IOException If redirection fails 975 * @since 2.6 976 */ 977 public static boolean checkHash( final Context context, final PageContext pageContext ) throws IOException { 978 final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 979 if( pageContext.getRequest().getParameter(hashName) == null ) { 980 if( pageContext.getAttribute( hashName ) == null ) { 981 final Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 982 log( context, REJECT, "MissingHash", change.m_change ); 983 984 final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" ); 985 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 986 return false; 987 } 988 } 989 990 return true; 991 } 992 993 /** 994 * This helper method adds all the input fields to your editor that the SpamFilter requires 995 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 996 * 997 * @param pageContext The PageContext 998 * @return A HTML string which contains input fields for the SpamFilter. 999 */ 1000 public static String insertInputFields( final PageContext pageContext ) { 1001 final Context ctx = Context.findContext( pageContext ); 1002 final Engine engine = ctx.getEngine(); 1003 final StringBuilder sb = new StringBuilder(); 1004 if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) { 1005 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1006 } 1007 1008 return sb.toString(); 1009 } 1010 1011 /** 1012 * A local class for storing host information. 
*/
    private class Host {

        /** Creation time of this entry, in milliseconds since the epoch. */
        private final long m_addedTime = System.currentTimeMillis();
        /** Creation time plus the enclosing filter's ban time (given in minutes). */
        private final long m_releaseTime;
        private final String m_address;
        private final Change m_change;

        /**
         * Creates an entry for the given address; the release time is computed from the current time
         * and the enclosing filter's ban time.
         *
         * @param ipaddress address of the host
         * @param change the change associated with this host
         */
        public Host( final String ipaddress, final Change change ) {
            m_address = ipaddress;
            m_change = change;
            // Multiply in long arithmetic throughout, so the minutes-to-milliseconds conversion cannot overflow an int.
            m_releaseTime = System.currentTimeMillis() + m_banTime * 60L * 1000L;
        }

        public String getAddress() {
            return m_address;
        }

        public long getReleaseTime() {
            return m_releaseTime;
        }

        public long getAddedTime() {
            return m_addedTime;
        }

        public Change getChange() {
            return m_change;
        }

    }

    /**
     * Describes a page change: the changed text plus counters for additions and removals. Two changes are
     * considered equal when their texts are equal; the counters do not take part in the comparison.
     */
    private static class Change {

        /** Text of the change; stays {@code null} until it is assigned. */
        public String m_change;
        public int m_adds;
        public int m_removals;

        /**
         * @return the change text as stored, which is {@code null} if it was never assigned.
         */
        @Override
        public String toString() {
            return m_change;
        }

        /**
         * Compares the change texts; two changes without any text are equal to each other.
         */
        @Override
        public boolean equals( final Object o ) {
            if( o instanceof Change ) {
                final String other = ( ( Change )o ).m_change;
                // Null-safe: a freshly created Change has no text yet.
                return m_change == null ? other == null : m_change.equals( other );
            }
            return false;
        }

        /**
         * Hash code consistent with {@link #equals(Object)}; a missing text hashes like zero.
         */
        @Override
        public int hashCode() {
            return ( m_change == null ? 0 : m_change.hashCode() ) + 17;
        }

    }

}