Restore the CrawlerSessionManagerValve
author markt <markt@13f79535-47bb-0310-9956-ffa450edef68>
Thu, 24 Feb 2011 13:11:17 +0000 (13:11 +0000)
committer markt <markt@13f79535-47bb-0310-9956-ffa450edef68>
Thu, 24 Feb 2011 13:11:17 +0000 (13:11 +0000)
Local testing with Jira 4.2.4 shows it works.

git-svn-id: https://svn.apache.org/repos/asf/tomcat/trunk@1074132 13f79535-47bb-0310-9956-ffa450edef68

java/org/apache/catalina/valves/CrawlerSessionManagerValve.java [new file with mode: 0644]

diff --git a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
new file mode 100644 (file)
index 0000000..b2c5880
--- /dev/null
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.catalina.valves;
+
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpSession;
+
+import org.apache.catalina.LifecycleException;
+import org.apache.catalina.connector.Request;
+import org.apache.catalina.connector.Response;
+import org.apache.juli.logging.Log;
+import org.apache.juli.logging.LogFactory;
+
+/**
+ * Web crawlers can trigger the creation of many thousands of sessions as they
+ * crawl a site which may result in significant memory consumption. This Valve
+ * ensures that crawlers are associated with a single session - just like normal
+ * users - regardless of whether or not they provide a session token with their
+ * requests.
+ */
+public class CrawlerSessionManagerValve extends ValveBase {
+
+    private static final Log log =
+        LogFactory.getLog(CrawlerSessionManagerValve.class);
+
+    private Map<String,SessionInfo> uaIpSessionInfo =
+        new ConcurrentHashMap<String, SessionInfo>();
+
+    private String crawlerUserAgents =
+        ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*";
+    private Pattern uaPattern = null;
+    private int sessionInactiveInterval = 60;
+
+
+    /**
+     * Specify the regular expression (using {@link Pattern}) that will be used
+     * to identify crawlers based in the User-Agent header provided. The default
+     * is ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*"
+     *  
+     * @param crawlerUserAgents The regular expression using {@link Pattern}
+     */
+    public void setCrawlerUserAgents(String crawlerUserAgents) {
+        this.crawlerUserAgents = crawlerUserAgents;
+        if (crawlerUserAgents == null || crawlerUserAgents.length() == 0) {
+            uaPattern = null;
+        } else {
+            uaPattern = Pattern.compile(crawlerUserAgents);
+        }
+    }
+
+    /**
+     * @see #setCrawlerUserAgents(String)
+     * @return  The current regular expression being used to match user agents. 
+     */
+    public String getCrawlerUserAgents() {
+        return crawlerUserAgents;
+    }
+
+
+    /**
+     * Specify the session timeout (in seconds) for a crawler's session. This is
+     * typically lower than that for a user session. The default is 60 seconds.
+     *  
+     * @param sessionInactiveInterval   The new timeout for crawler sessions
+     */
+    public void setSessionInactiveInterval(int sessionInactiveInterval) {
+        this.sessionInactiveInterval = sessionInactiveInterval;
+    }
+
+    /**
+     * @see #setSessionInactiveInterval(int)
+     * @return  The current timeout in seconds
+     */
+    public int getSessionInactiveInterval() {
+        return sessionInactiveInterval;
+    }
+
+
+    @Override
+    protected void initInternal() throws LifecycleException {
+        super.initInternal();
+        
+        uaPattern = Pattern.compile(crawlerUserAgents);
+    }
+
+
+    @Override
+    public void invoke(Request request, Response response) throws IOException,
+            ServletException {
+
+        boolean isBot = false;
+        SessionInfo sessionInfo = null;
+        String clientIp = null;
+
+        if (log.isDebugEnabled()) {
+            log.debug(request.hashCode() + ": ClientIp=" +
+                    request.getRemoteAddr() + ", RequestedSessionId=" +
+                    request.getRequestedSessionId());
+        }
+
+        // If the incoming request has a session ID, no action is required
+        if (request.getRequestedSessionId() == null) {
+
+            // Is this a crawler - cheack the UA headers
+            Enumeration<String> uaHeaders = request.getHeaders("user-agent");
+            String uaHeader = uaHeaders.nextElement();
+            
+            // If more than one UA header - assume not a bot
+            if (!uaHeaders.hasMoreElements()) {
+
+                if (log.isDebugEnabled()) {
+                    log.debug(request.hashCode() + ": UserAgent=" + uaHeader);
+                }
+                
+                if (uaPattern.matcher(uaHeader).matches()) {
+                    isBot = true;
+                    
+                    if (log.isDebugEnabled()) {
+                        log.debug(request.hashCode() +
+                                ": Bot found. UserAgent=" + uaHeader);
+                    }
+                }
+            }
+            
+            // If this is a bot, is the session ID known?
+            if (isBot) {
+                clientIp = request.getRemoteAddr();
+                sessionInfo = uaIpSessionInfo.get(clientIp);
+                if (sessionInfo != null) {
+                    request.setRequestedSessionId(sessionInfo.getSessionId());
+                    if (log.isDebugEnabled()) {
+                        log.debug(request.hashCode() +
+                                ": SessionID=" + sessionInfo.getSessionId());
+                    }
+                }
+            }
+        }
+
+        getNext().invoke(request, response);
+        
+        if (isBot) {
+            if (sessionInfo == null) {
+                // Has bot just created a session, if so make a note of it
+                HttpSession s = request.getSession(false);
+                if (s != null) {
+                    uaIpSessionInfo.put(clientIp, new SessionInfo(s.getId()));
+                    s.setMaxInactiveInterval(sessionInactiveInterval);
+
+                    if (log.isDebugEnabled()) {
+                        log.debug(request.hashCode() +
+                                ": New bot session. SessionID=" + s.getId());
+                    }
+                }
+            } else {
+                sessionInfo.access();
+
+                if (log.isDebugEnabled()) {
+                    log.debug(request.hashCode() +
+                            ": Bot session accessed. SessionID=" +
+                            sessionInfo.getSessionId());
+                }
+            }
+        }
+    }
+
+
+    @Override
+    public void backgroundProcess() {
+        super.backgroundProcess();
+        
+        long expireTime = System.currentTimeMillis() -
+                (sessionInactiveInterval + 60) * 1000;
+
+        Iterator<Entry<String,SessionInfo>> iter =
+            uaIpSessionInfo.entrySet().iterator();
+
+        // Remove any sessions in the cache that have expired. 
+        while (iter.hasNext()) {
+            Entry<String,SessionInfo> entry = iter.next();
+            if (entry.getValue().getLastAccessed() < expireTime) {
+                iter.remove();
+            }
+        }
+    }
+
+
+    private static final class SessionInfo {
+        private final String sessionId;
+        private volatile long lastAccessed;
+        
+        public SessionInfo(String sessionId) {
+            this.sessionId = sessionId;
+            this.lastAccessed = System.currentTimeMillis();
+        }
+
+        public String getSessionId() {
+            return sessionId;
+        }
+
+        public long getLastAccessed() {
+            return lastAccessed;
+        }
+
+        public void access() {
+            lastAccessed = System.currentTimeMillis();
+        }
+    }
+}