001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import net.sf.akismet.Akismet; 022import org.apache.commons.lang3.StringUtils; 023import org.apache.commons.lang3.time.StopWatch; 024import org.apache.logging.log4j.LogManager; 025import org.apache.logging.log4j.Logger; 026import org.apache.oro.text.regex.MalformedPatternException; 027import org.apache.oro.text.regex.MatchResult; 028import org.apache.oro.text.regex.Pattern; 029import org.apache.oro.text.regex.PatternCompiler; 030import org.apache.oro.text.regex.PatternMatcher; 031import org.apache.oro.text.regex.Perl5Compiler; 032import org.apache.oro.text.regex.Perl5Matcher; 033import org.apache.wiki.InternalWikiException; 034import org.apache.wiki.api.core.Attachment; 035import org.apache.wiki.api.core.Context; 036import org.apache.wiki.api.core.ContextEnum; 037import org.apache.wiki.api.core.Engine; 038import org.apache.wiki.api.core.Page; 039import org.apache.wiki.api.exceptions.ProviderException; 040import org.apache.wiki.api.exceptions.RedirectException; 041import org.apache.wiki.api.filters.BasePageFilter; 042import org.apache.wiki.api.providers.WikiProvider; 043import 
org.apache.wiki.attachment.AttachmentManager; 044import org.apache.wiki.auth.user.UserProfile; 045import org.apache.wiki.pages.PageManager; 046import org.apache.wiki.ui.EditorManager; 047import org.apache.wiki.util.FileUtil; 048import org.apache.wiki.util.HttpUtil; 049import org.apache.wiki.util.TextUtil; 050import org.suigeneris.jrcs.diff.Diff; 051import org.suigeneris.jrcs.diff.DifferentiationFailedException; 052import org.suigeneris.jrcs.diff.Revision; 053import org.suigeneris.jrcs.diff.delta.AddDelta; 054import org.suigeneris.jrcs.diff.delta.ChangeDelta; 055import org.suigeneris.jrcs.diff.delta.DeleteDelta; 056import org.suigeneris.jrcs.diff.delta.Delta; 057import org.suigeneris.jrcs.diff.myers.MyersDiff; 058 059import javax.servlet.http.HttpServletRequest; 060import javax.servlet.http.HttpServletResponse; 061import javax.servlet.jsp.PageContext; 062import java.io.BufferedReader; 063import java.io.IOException; 064import java.io.InputStream; 065import java.io.InputStreamReader; 066import java.io.StringReader; 067import java.io.StringWriter; 068import java.nio.charset.StandardCharsets; 069import java.util.ArrayList; 070import java.util.Arrays; 071import java.util.Collection; 072import java.util.Date; 073import java.util.Iterator; 074import java.util.List; 075import java.util.Properties; 076import java.util.Random; 077import java.util.StringTokenizer; 078import java.util.Vector; 079import java.util.concurrent.ThreadLocalRandom; 080 081 082/** 083 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 084 * 085 * Parameters: 086 * <ul> 087 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 088 * that page. Default is "SpamFilterWordList". 089 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 090 * that page. Default is "SpamFilterIPList". 091 * <li>maxpagenamelength - Maximum page name length. Default is 100. 
 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
 *        "SpamFilterWordList/blacklist.txt"</li>
 *    <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage".</li>
 *    <li>pagechangesinminute - How many page changes are allowed/minute. Default is 5.</li>
 *    <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 *    <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 *        match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *        and calculate a score for the spam, which is then compared to a filter level value.</li>
105 * </ul> 106 * 107 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 108 * with the editor system.</p> 109 * 110 * <p>Changes by admin users are ignored in any case.</p> 111 * 112 * @since 2.1.112 113 */ 114public class SpamFilter extends BasePageFilter { 115 116 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 117 private static final String REASON_REGEXP = "Regexp"; 118 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 119 private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently"; 120 private static final String REASON_BOT_TRAP = "BotTrap"; 121 private static final String REASON_AKISMET = "Akismet"; 122 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 123 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 124 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 125 private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong"; 126 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 127 128 private static final String LISTVAR = "spamwords"; 129 private static final String LISTIPVAR = "ips"; 130 131 private static final Random RANDOM = ThreadLocalRandom.current(); 132 133 /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */ 134 public static final String PROP_WORDLIST = "wordlist"; 135 136 /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */ 137 public static final String PROP_IPLIST = "IPlist"; 138 139 /** The filter property name for specifying the maximum page name length. Value is <tt>{@value}</tt>. */ 140 public static final String PROP_MAX_PAGENAME_LENGTH = "maxpagenamelength"; 141 142 /** The filter property name for the page to which you are directed if Herb rejects your edit. Value is <tt>{@value}</tt>. 
*/ 143 public static final String PROP_ERRORPAGE = "errorpage"; 144 145 /** The filter property name for specifying how many changes is any given IP address 146 * allowed to do per minute. Value is <tt>{@value}</tt>. 147 */ 148 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 149 150 /** The filter property name for specifying how many similar changes are allowed before a host is banned. Value is <tt>{@value}</tt>. */ 151 public static final String PROP_SIMILARCHANGES = "similarchanges"; 152 153 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 154 public static final String PROP_BANTIME = "bantime"; 155 156 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 157 public static final String PROP_BLACKLIST = "blacklist"; 158 159 /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */ 160 public static final String PROP_MAXURLS = "maxurls"; 161 162 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 163 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 164 165 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 166 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 167 168 /** The filter property name for specifying groups allowed to bypass the spam filter. Value is <tt>{@value}</tt>. */ 169 public static final String PROP_ALLOWED_GROUPS = "jspwiki.filters.spamfilter.allowedgroups"; 170 171 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 172 public static final String PROP_CAPTCHA = "captcha"; 173 174 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. 
*/ 175 public static final String PROP_FILTERSTRATEGY = "strategy"; 176 177 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */ 178 public static final String STRATEGY_EAGER = "eager"; 179 180 /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */ 181 public static final String STRATEGY_SCORE = "score"; 182 183 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 184 185 private String m_forbiddenWordsPage = "SpamFilterWordList"; 186 private String m_forbiddenIPsPage = "SpamFilterIPList"; 187 private String m_pageNameMaxLength = "100"; 188 private String m_errorPage = "RejectedMessage"; 189 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 190 191 private final PatternMatcher m_matcher = new Perl5Matcher(); 192 private final PatternCompiler m_compiler = new Perl5Compiler(); 193 194 private Collection<Pattern> m_spamPatterns; 195 private Collection<Pattern> m_IPPatterns; 196 197 private Date m_lastRebuild = new Date( 0L ); 198 199 private static final Logger C_SPAMLOG = LogManager.getLogger( "SpamLog" ); 200 private static final Logger LOG = LogManager.getLogger( SpamFilter.class ); 201 202 private final Vector<Host> m_temporaryBanList = new Vector<>(); 203 204 private int m_banTime = 60; // minutes 205 206 private final Vector<Host> m_lastModifications = new Vector<>(); 207 208 /** How many times a single IP address can change a page per minute? */ 209 private int m_limitSinglePageChanges = 5; 210 211 /** How many times can you add the exact same string to a page? */ 212 private int m_limitSimilarChanges = 2; 213 214 /** How many URLs can be added at maximum. */ 215 private int m_maxUrls = 10; 216 217 private Pattern m_urlPattern; 218 private Akismet m_akismet; 219 220 private String m_akismetAPIKey; 221 222 private boolean m_useCaptcha; 223 224 /** The limit at which we consider something to be spam. 
*/ 225 private final int m_scoreLimit = 1; 226 227 /** If set to true, will ignore anyone who is in Authenticated role. */ 228 private boolean m_ignoreAuthenticated; 229 230 /** Groups allowed to bypass the filter */ 231 private String[] m_allowedGroups; 232 233 private boolean m_stopAtFirstMatch = true; 234 235 private static String c_hashName; 236 private static long c_lastUpdate; 237 238 /** The HASH_DELAY value is a maximum amount of time that an user can keep 239 * a session open, because after the value has expired, we will invent a new 240 * hash field name. By default this is {@value} hours, which should be ample 241 * time for someone. 242 */ 243 private static final long HASH_DELAY = 24; 244 245 246 /** 247 * {@inheritDoc} 248 */ 249 @Override 250 public void initialize( final Engine engine, final Properties properties ) { 251 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 252 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 253 m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength); 254 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 255 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges ); 256 257 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges ); 258 259 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 260 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 261 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 262 263 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated ); 264 m_allowedGroups = StringUtils.split( StringUtils.defaultString( properties.getProperty( PROP_ALLOWED_GROUPS, m_blacklist ) ), ',' ); 265 266 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" 
).equals("asirra"); 267 268 try { 269 m_urlPattern = m_compiler.compile( URL_REGEXP ); 270 } catch( final MalformedPatternException e ) { 271 LOG.fatal( "Internal error: Someone put in a faulty pattern.", e ); 272 throw new InternalWikiException( "Faulty pattern." , e); 273 } 274 275 m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey ); 276 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 277 278 LOG.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 279 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 280 } 281 282 private static final int REJECT = 0; 283 private static final int ACCEPT = 1; 284 private static final int NOTE = 2; 285 286 private static String log( final Context ctx, final int type, final String source, String message ) { 287 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 288 message = TextUtil.replaceString( message, "\"", "\\\"" ); 289 290 final String uid = getUniqueID(); 291 final String page = ctx.getPage().getName(); 292 final String addr = ctx.getHttpRequest() != null ? 
HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 293 final String reason; 294 switch( type ) { 295 case REJECT: reason = "REJECTED"; 296 break; 297 case ACCEPT: reason = "ACCEPTED"; 298 break; 299 case NOTE: reason = "NOTE"; 300 break; 301 default: throw new InternalWikiException( "Illegal type " + type ); 302 } 303 C_SPAMLOG.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 304 305 return uid; 306 } 307 308 /** {@inheritDoc} */ 309 @Override 310 public String preSave( final Context context, final String content ) throws RedirectException { 311 cleanBanList(); 312 refreshBlacklists( context ); 313 final Change change = getChange( context, content ); 314 315 if( !ignoreThisUser( context ) ) { 316 checkBanList( context, change ); 317 checkSinglePageChange( context, content, change ); 318 checkIPList( context ); 319 checkPatternList( context, content, change ); 320 checkPageName( context, content, change); 321 } 322 323 if( !m_stopAtFirstMatch ) { 324 final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 325 326 if( score != null && score >= m_scoreLimit ) { 327 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 328 } 329 } 330 331 log( context, ACCEPT, "-", change.toString() ); 332 return content; 333 } 334 335 private void checkPageName( final Context context, final String content, final Change change ) throws RedirectException { 336 final Page page = context.getPage(); 337 final String pageName = page.getName(); 338 final int maxlength = Integer.parseInt(m_pageNameMaxLength); 339 if ( pageName.length() > maxlength) { 340 // 341 // Spam filter has a match. 342 // 343 344 final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName); 345 346 LOG.info("SPAM:PageNameTooLong (" + uid + "). 
The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")"); 347 checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" ); 348 349 } 350 } 351 352 private void checkStrategy( final Context context, final String error, final String message ) throws RedirectException { 353 if( m_stopAtFirstMatch ) { 354 throw new RedirectException( message, getRedirectPage( context ) ); 355 } 356 357 Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 358 if( score != null ) { 359 score = score + 1; 360 } else { 361 score = 1; 362 } 363 364 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 365 } 366 367 /** 368 * Parses a list of patterns and returns a Collection of compiled Pattern objects. 369 * 370 * @param source page containing the list of patterns. 371 * @param list list of patterns. 372 * @return A Collection of the Patterns that were found from the lists. 373 */ 374 private Collection< Pattern > parseWordList( final Page source, final String list ) { 375 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 376 377 if( list != null ) { 378 final StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 379 380 while( tok.hasMoreTokens() ) { 381 final String pattern = tok.nextToken(); 382 383 try { 384 compiledpatterns.add( m_compiler.compile( pattern ) ); 385 } catch( final MalformedPatternException e ) { 386 LOG.debug( "Malformed spam filter pattern " + pattern ); 387 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 388 } 389 } 390 } 391 392 return compiledpatterns; 393 } 394 395 /** 396 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 397 * 398 * @param list list of patterns. 399 * @return The parsed blacklist patterns. 
400 */ 401 private Collection< Pattern > parseBlacklist( final String list ) { 402 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 403 404 if( list != null ) { 405 try { 406 final BufferedReader in = new BufferedReader( new StringReader(list) ); 407 String line; 408 while( (line = in.readLine() ) != null ) { 409 line = line.trim(); 410 if( line.isEmpty() ) continue; // Empty line 411 if( line.startsWith("#") ) continue; // It's a comment 412 413 int ws = line.indexOf( ' ' ); 414 if( ws == -1 ) ws = line.indexOf( '\t' ); 415 if( ws != -1 ) line = line.substring( 0, ws ); 416 417 try { 418 compiledpatterns.add( m_compiler.compile( line ) ); 419 } catch( final MalformedPatternException e ) { 420 LOG.debug( "Malformed spam filter pattern " + line ); 421 } 422 } 423 } catch( final IOException e ) { 424 LOG.info( "Could not read patterns; returning what I got" , e ); 425 } 426 } 427 428 return compiledpatterns; 429 } 430 431 /** 432 * Takes a single page change and performs a load of tests on the content change. An admin can modify anything. 433 * 434 * @param context page Context 435 * @param content page content 436 * @param change page change 437 * @throws RedirectException spam filter rejects the page change. 
438 */ 439 private synchronized void checkSinglePageChange( final Context context, final String content, final Change change ) 440 throws RedirectException { 441 final HttpServletRequest req = context.getHttpRequest(); 442 443 if( req != null ) { 444 final String addr = HttpUtil.getRemoteAddress( req ); 445 int hostCounter = 0; 446 int changeCounter = 0; 447 448 LOG.debug( "Change is " + change.m_change ); 449 450 final long time = System.currentTimeMillis() - 60*1000L; // 1 minute 451 452 for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 453 final Host host = i.next(); 454 455 // Check if this item is invalid 456 if( host.getAddedTime() < time ) { 457 LOG.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 458 i.remove(); 459 continue; 460 } 461 462 // Check if this IP address has been seen before 463 if( host.getAddress().equals( addr ) ) { 464 hostCounter++; 465 } 466 467 // Check, if this change has been seen before 468 if( host.getChange() != null && host.getChange().equals( change ) ) { 469 changeCounter++; 470 } 471 } 472 473 // Now, let's check against the limits. 474 if( hostCounter >= m_limitSinglePageChanges ) { 475 final Host host = new Host( addr, null ); 476 m_temporaryBanList.add( host ); 477 478 final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 479 LOG.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 480 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 481 } 482 483 if( changeCounter >= m_limitSimilarChanges ) { 484 final Host host = new Host( addr, null ); 485 m_temporaryBanList.add( host ); 486 487 final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 488 LOG.info( "SPAM:SimilarModifications (" + uid + "). 
Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 489 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 490 } 491 492 // Calculate the number of links in the addition. 493 String tstChange = change.toString(); 494 int urlCounter = 0; 495 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 496 final MatchResult m = m_matcher.getMatch(); 497 tstChange = tstChange.substring( m.endOffset(0) ); 498 urlCounter++; 499 } 500 501 if( urlCounter > m_maxUrls ) { 502 final Host host = new Host( addr, null ); 503 m_temporaryBanList.add( host ); 504 505 final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 506 LOG.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 507 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 508 } 509 510 // Check bot trap 511 checkBotTrap( context, change ); 512 513 // Check UTF-8 mangling 514 checkUTF8( context, change ); 515 516 // Do Akismet check. This is good to be the last, because this is the most expensive operation. 517 checkAkismet( context, change ); 518 519 m_lastModifications.add( new Host( addr, change ) ); 520 } 521 } 522 523 524 /** 525 * Checks against the akismet system. 526 * 527 * @param context page Context 528 * @throws RedirectException spam filter rejects the page change. 529 */ 530 private void checkAkismet( final Context context, final Change change ) throws RedirectException { 531 if( m_akismetAPIKey != null ) { 532 if( m_akismet == null ) { 533 LOG.info( "Initializing Akismet spam protection." ); 534 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 535 536 if( !m_akismet.verifyAPIKey() ) { 537 LOG.error( "Akismet API key cannot be verified. Please check your config." 
); 538 m_akismetAPIKey = null; 539 m_akismet = null; 540 } 541 } 542 543 final HttpServletRequest req = context.getHttpRequest(); 544 545 // Akismet will mark all empty statements as spam, so we'll just ignore them. 546 if( change.m_adds == 0 && change.m_removals > 0 ) { 547 return; 548 } 549 550 if( req != null && m_akismet != null ) { 551 LOG.debug( "Calling Akismet to check for spam..." ); 552 553 final StopWatch sw = new StopWatch(); 554 sw.start(); 555 556 final String ipAddress = HttpUtil.getRemoteAddress( req ); 557 final String userAgent = req.getHeader( "User-Agent" ); 558 final String referrer = req.getHeader( "Referer"); 559 final String permalink = context.getViewURL( context.getPage().getName() ); 560 final String commentType = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit"; 561 final String commentAuthor = context.getCurrentUser().getName(); 562 final String commentAuthorEmail = null; 563 final String commentAuthorURL = null; 564 565 final boolean isSpam = m_akismet.commentCheck( ipAddress, 566 userAgent, 567 referrer, 568 permalink, 569 commentType, 570 commentAuthor, 571 commentAuthorEmail, 572 commentAuthorURL, 573 change.toString(), 574 null ); 575 576 sw.stop(); 577 LOG.debug( "Akismet request done in: " + sw ); 578 579 if( isSpam ) { 580 // Host host = new Host( ipAddress, null ); 581 // m_temporaryBanList.add( host ); 582 583 final String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 584 LOG.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." ); 585 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 586 } 587 } 588 } 589 } 590 591 /** 592 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 
593 * 594 * @return A string 595 */ 596 public static String getBotFieldName() { 597 return "submit_auth"; 598 } 599 600 /** 601 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 602 * 603 * @param context page Context 604 * @param change page change 605 * @throws RedirectException spam filter rejects the page change. 606 */ 607 private void checkBotTrap( final Context context, final Change change ) throws RedirectException { 608 final HttpServletRequest request = context.getHttpRequest(); 609 if( request != null ) { 610 final String unspam = request.getParameter( getBotFieldName() ); 611 if( unspam != null && !unspam.isEmpty() ) { 612 final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 613 614 LOG.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 615 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 616 } 617 } 618 } 619 620 private void checkUTF8( final Context context, final Change change ) throws RedirectException { 621 final HttpServletRequest request = context.getHttpRequest(); 622 if( request != null ) { 623 final String utf8field = request.getParameter( "encodingcheck" ); 624 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 625 final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 626 627 LOG.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 628 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 629 } 630 } 631 } 632 633 /** Goes through the ban list and cleans away any host which has expired from it. 
*/ 634 private synchronized void cleanBanList() { 635 final long now = System.currentTimeMillis(); 636 for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 637 final Host host = i.next(); 638 639 if( host.getReleaseTime() < now ) { 640 LOG.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 641 i.remove(); 642 } 643 } 644 } 645 646 /** 647 * Checks the ban list if the IP address of the changer is already on it. 648 * 649 * @param context page context 650 * @throws RedirectException spam filter rejects the page change. 651 */ 652 private void checkBanList( final Context context, final Change change ) throws RedirectException { 653 final HttpServletRequest req = context.getHttpRequest(); 654 655 if( req != null ) { 656 final String remote = HttpUtil.getRemoteAddress(req); 657 final long now = System.currentTimeMillis(); 658 659 for( final Host host : m_temporaryBanList ) { 660 if( host.getAddress().equals( remote ) ) { 661 final long timeleft = ( host.getReleaseTime() - now ) / 1000L; 662 663 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 664 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, 665 "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 666 } 667 } 668 } 669 } 670 671 /** 672 * If the spam filter notices changes in the black list page, it will refresh them automatically. 673 * 674 * @param context associated WikiContext 675 */ 676 private void refreshBlacklists( final Context context ) { 677 try { 678 boolean rebuild = false; 679 680 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 
681 final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage ); 682 if( sourceSpam != null ) { 683 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 684 rebuild = true; 685 } 686 } 687 688 final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist ); 689 if( att != null ) { 690 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 691 rebuild = true; 692 } 693 } 694 695 final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage ); 696 if( sourceIPs != null ) { 697 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 698 rebuild = true; 699 } 700 } 701 702 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete filter list regardless of what changed. 703 if( rebuild ) { 704 m_lastRebuild = new Date(); 705 m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null ); 706 707 LOG.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 708 709 m_IPPatterns = parseWordList( sourceIPs, ( sourceIPs != null ) ? 
sourceIPs.getAttribute( LISTIPVAR ) : null ); 710 LOG.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 711 712 if( att != null ) { 713 final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att); 714 final StringWriter out = new StringWriter(); 715 FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out ); 716 final Collection< Pattern > blackList = parseBlacklist( out.toString() ); 717 LOG.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 718 m_spamPatterns.addAll( blackList ); 719 } 720 } 721 } catch( final IOException ex ) { 722 LOG.info( "Unable to read attachment data, continuing...", ex ); 723 } catch( final ProviderException ex ) { 724 LOG.info( "Failed to read spam filter attachment, continuing...", ex ); 725 } 726 } 727 728 /** 729 * Does a check against a known pattern list. 730 * 731 * @param context page Context 732 * @param content page content 733 * @param change page change 734 * @throws RedirectException spam filter rejects the page change. 735 */ 736 private void checkPatternList( final Context context, final String content, final Change change ) throws RedirectException { 737 // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return. 738 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 739 return; 740 } 741 742 String ch = change.toString(); 743 if( context.getHttpRequest() != null ) { 744 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 745 } 746 747 for( final Pattern p : m_spamPatterns ) { 748 // LOG.debug("Attempting to match page contents with "+p.getPattern()); 749 750 if( m_matcher.contains( ch, p ) ) { 751 // Spam filter has a match. 
752 final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 753 754 LOG.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" ); 755 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 756 } 757 } 758 } 759 760 761 /** 762 * Does a check against a pattern list of IPs. 763 * 764 * @param context page context 765 * @throws RedirectException spam filter rejects the page change. 766 */ 767 private void checkIPList( final Context context ) throws RedirectException { 768 // If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return. 769 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 770 return; 771 } 772 773 final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 774 LOG.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 775 776 for( final Pattern p : m_IPPatterns ) { 777 LOG.debug("Attempting to match remoteIP with " + p.getPattern()); 778 779 if( m_matcher.contains( remoteIP, p ) ) { 780 781 // IP filter has a match. 782 // 783 final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 784 785 LOG.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 786 checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 787 } 788 } 789 } 790 791 private void checkPatternList( final Context context, final String content, final String change ) throws RedirectException { 792 final Change c = new Change(); 793 c.m_change = change; 794 checkPatternList( context, content, c ); 795 } 796 797 /** 798 * Creates a simple text string describing the added content. 
799 * 800 * @param context page context 801 * @param newText added content 802 * @return Empty string, if there is no change. 803 */ 804 private static Change getChange( final Context context, final String newText ) { 805 final Page page = context.getPage(); 806 final StringBuffer change = new StringBuffer(); 807 final Engine engine = context.getEngine(); 808 // Get current page version 809 810 final Change ch = new Change(); 811 812 try { 813 final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 814 final String[] first = Diff.stringToArray( oldText ); 815 final String[] second = Diff.stringToArray( newText ); 816 final Revision rev = Diff.diff( first, second, new MyersDiff() ); 817 818 if( rev == null || rev.size() == 0 ) { 819 return ch; 820 } 821 822 for( int i = 0; i < rev.size(); i++ ) { 823 final Delta d = rev.getDelta( i ); 824 825 if( d instanceof AddDelta ) { 826 d.getRevised().toString( change, "", "\r\n" ); 827 ch.m_adds++; 828 829 } else if( d instanceof ChangeDelta ) { 830 d.getRevised().toString( change, "", "\r\n" ); 831 ch.m_adds++; 832 833 } else if( d instanceof DeleteDelta ) { 834 ch.m_removals++; 835 } 836 } 837 } catch( final DifferentiationFailedException e ) { 838 LOG.error( "Diff failed", e ); 839 } 840 841 // Don't forget to include the change note, too 842 final String changeNote = page.getAttribute( Page.CHANGENOTE ); 843 if( changeNote != null ) { 844 change.append( "\r\n" ); 845 change.append( changeNote ); 846 } 847 848 // And author as well 849 if( page.getAuthor() != null ) { 850 change.append( "\r\n" ).append( page.getAuthor() ); 851 } 852 853 ch.m_change = change.toString(); 854 return ch; 855 } 856 857 /** 858 * Returns true, if this user should be ignored. For example, admin users. 859 * 860 * @param context page context 861 * @return True, if this user should be ignored. 
862 */ 863 private boolean ignoreThisUser( final Context context ) { 864 if( context.hasAdminPermissions() ) { 865 return true; 866 } 867 868 final List< String > groups = Arrays.asList( m_allowedGroups ); 869 if( Arrays.stream( context.getWikiSession().getRoles() ).anyMatch( role -> groups.contains( role.getName() ) ) ) { 870 return true; 871 } 872 873 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 874 return true; 875 } 876 877 return context.getVariable("captcha") != null; 878 } 879 880 /** 881 * Returns a random string of six uppercase characters. 882 * 883 * @return A random string 884 */ 885 private static String getUniqueID() { 886 final StringBuilder sb = new StringBuilder(); 887 for( int i = 0; i < 6; i++ ) { 888 final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) ); 889 sb.append( x ); 890 } 891 892 return sb.toString(); 893 } 894 895 /** 896 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 897 * 898 * @param ctx WikiContext 899 * @return An URL to redirect to 900 */ 901 private String getRedirectPage( final Context ctx ) { 902 if( m_useCaptcha ) { 903 return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 904 } 905 906 return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage ); 907 } 908 909 /** 910 * Checks whether the UserProfile matches certain checks. 911 * 912 * @param profile The profile to check 913 * @param context The WikiContext 914 * @return False, if this userprofile is suspect and should not be allowed to be added. 
915 * @since 2.6.1 916 */ 917 public boolean isValidUserProfile( final Context context, final UserProfile profile ) { 918 try { 919 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 920 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 921 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 922 } catch( final RedirectException e ) { 923 LOG.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 924 return false; 925 } 926 927 return true; 928 } 929 930 /** 931 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 932 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 933 * 934 * @param page The WikiPage under edit 935 * @param request The HTTP Request 936 * @since 2.6 937 * @return A hash value for this page and session 938 */ 939 public static String getSpamHash( final Page page, final HttpServletRequest request ) { 940 long lastModified = 0; 941 942 if( page.getLastModified() != null ) { 943 lastModified = page.getLastModified().getTime(); 944 } 945 final long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 946 947 return Long.toString( lastModified ^ remote ); 948 } 949 950 /** 951 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 952 * the session has expired, you cannot edit anymore. 
953 * 954 * @param request The page request 955 * @return The name to be used in the hash field 956 * @since 2.6 957 */ 958 public static String getHashFieldName( final HttpServletRequest request ) { 959 String hash = null; 960 961 if( request.getSession() != null ) { 962 hash = ( String )request.getSession().getAttribute( "_hash" ); 963 964 if( hash == null ) { 965 hash = c_hashName; 966 request.getSession().setAttribute( "_hash", hash ); 967 } 968 } 969 970 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 971 c_hashName = getUniqueID().toLowerCase(); 972 c_lastUpdate = System.currentTimeMillis(); 973 } 974 975 return hash != null ? hash : c_hashName; 976 } 977 978 979 /** 980 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 981 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 982 * and their session has expired. 983 * <p> 984 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 985 * the spam log (it may or may not be spam, but it's rather likely that it is). 986 * 987 * @param context The WikiContext 988 * @param pageContext The JSP PageContext. 989 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
990 * @throws IOException If redirection fails 991 * @since 2.6 992 */ 993 public static boolean checkHash( final Context context, final PageContext pageContext ) throws IOException { 994 final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 995 if( pageContext.getRequest().getParameter(hashName) == null ) { 996 if( pageContext.getAttribute( hashName ) == null ) { 997 final Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 998 log( context, REJECT, "MissingHash", change.m_change ); 999 1000 final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" ); 1001 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 1002 return false; 1003 } 1004 } 1005 1006 return true; 1007 } 1008 1009 /** 1010 * This helper method adds all the input fields to your editor that the SpamFilter requires 1011 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1012 * 1013 * @param pageContext The PageContext 1014 * @return A HTML string which contains input fields for the SpamFilter. 1015 */ 1016 public static String insertInputFields( final PageContext pageContext ) { 1017 final Context ctx = Context.findContext( pageContext ); 1018 final Engine engine = ctx.getEngine(); 1019 final StringBuilder sb = new StringBuilder(); 1020 if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) { 1021 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1022 } 1023 1024 return sb.toString(); 1025 } 1026 1027 /** 1028 * A local class for storing host information. 
*/
    private class Host {

        /** Time at which this entry was created, in milliseconds since the epoch. */
        private final long m_addedTime = System.currentTimeMillis();
        /** Creation time plus the enclosing filter's ban time, which is taken to be in minutes. */
        private final long m_releaseTime;
        private final String m_address;
        private final Change m_change;

        public String getAddress() {
            return m_address;
        }

        public long getReleaseTime() {
            return m_releaseTime;
        }

        public long getAddedTime() {
            return m_addedTime;
        }

        public Change getChange() {
            return m_change;
        }

        /**
         * Creates an entry for the given address.
         *
         * @param ipaddress address of the host
         * @param change the change that is associated with this host
         */
        public Host( final String ipaddress, final Change change ) {
            m_address = ipaddress;
            m_change = change;
            // This class is non-static because it reads m_banTime from the enclosing filter instance.
            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
        }

    }

    /**
     * Describes a single modification: its text plus the number of additions and removals.
     * Two changes are equal when their texts are equal; a missing text only equals another missing text.
     */
    private static class Change {

        /** Text of the change; may be left unset (null). */
        public String m_change;
        /** Number of add or change deltas. */
        public int m_adds;
        /** Number of delete deltas. */
        public int m_removals;

        /**
         * @return the change text, or the empty string when no text has been set.
         */
        @Override
        public String toString() {
            return m_change != null ? m_change : "";
        }

        @Override
        public boolean equals( final Object o ) {
            if( o instanceof Change ) {
                final String other = ( ( Change )o ).m_change;
                // Null-safe comparison: an unset text must not blow up with a NullPointerException.
                return m_change == null ? other == null : m_change.equals( other );
            }
            return false;
        }

        @Override
        public int hashCode() {
            // Same value as before for a set text; an unset text hashes like an empty one.
            return ( m_change != null ? m_change.hashCode() : 0 ) + 17;
        }

    }

}