Fix https://bz.apache.org/bugzilla/show_bug.cgi?id=62297
Enable the CrawlerSessionManagerValve to correctly handle bots that crawl multiple hosts and/or web applications when the Valve is configured on a Host or an Engine.

git-svn-id: https://svn.apache.org/repos/asf/tomcat/tc8.0.x/trunk@1829936 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
index 4a698e9..aab1caf 100644
--- a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
+++ b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
@@ -27,6 +27,8 @@
 import javax.servlet.http.HttpSessionBindingEvent;
 import javax.servlet.http.HttpSessionBindingListener;
 
+import org.apache.catalina.Context;
+import org.apache.catalina.Host;
 import org.apache.catalina.LifecycleException;
 import org.apache.catalina.connector.Request;
 import org.apache.catalina.connector.Response;
@@ -44,8 +46,8 @@
 
     private static final Log log = LogFactory.getLog(CrawlerSessionManagerValve.class);
 
-    private final Map<String, String> clientIpSessionId = new ConcurrentHashMap<>();
-    private final Map<String, String> sessionIdClientIp = new ConcurrentHashMap<>();
+    private final Map<String, String> clientIdSessionId = new ConcurrentHashMap<>();
+    private final Map<String, String> sessionIdClientId = new ConcurrentHashMap<>();
 
     private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*";
     private Pattern uaPattern = null;
@@ -55,6 +57,10 @@
 
     private int sessionInactiveInterval = 60;
 
+    private boolean isHostAware = true;
+
+    private boolean isContextAware = true;
+
 
     /**
      * Specifies a default constructor so async support can be configured.
@@ -134,7 +140,27 @@
 
 
     public Map<String, String> getClientIpSessionId() {
-        return clientIpSessionId;
+        return clientIdSessionId;
+    }
+
+
+    public boolean isHostAware() {
+        return isHostAware;
+    }
+
+
+    public void setHostAware(boolean isHostAware) {
+        this.isHostAware = isHostAware;
+    }
+
+
+    public boolean isContextAware() {
+        return isContextAware;
+    }
+
+
+    public void setContextAware(boolean isContextAware) {
+        this.isContextAware = isContextAware;
     }
 
 
@@ -152,9 +178,10 @@
         boolean isBot = false;
         String sessionId = null;
         String clientIp = request.getRemoteAddr();
+        String clientIdentifier = getClientIdentifier(request.getHost(), request.getContext(), clientIp);
 
         if (log.isDebugEnabled()) {
-            log.debug(request.hashCode() + ": ClientIp=" + clientIp + ", RequestedSessionId="
+            log.debug(request.hashCode() + ": ClientIdentifier=" + clientIdentifier + ", RequestedSessionId="
                     + request.getRequestedSessionId());
         }
 
@@ -194,7 +221,7 @@
 
             // If this is a bot, is the session ID known?
             if (isBot) {
-                sessionId = clientIpSessionId.get(clientIp);
+                sessionId = clientIdSessionId.get(clientIdentifier);
                 if (sessionId != null) {
                     request.setRequestedSessionId(sessionId);
                     if (log.isDebugEnabled()) {
@@ -211,8 +238,8 @@
                 // Has bot just created a session, if so make a note of it
                 HttpSession s = request.getSession(false);
                 if (s != null) {
-                    clientIpSessionId.put(clientIp, s.getId());
-                    sessionIdClientIp.put(s.getId(), clientIp);
+                    clientIdSessionId.put(clientIdentifier, s.getId());
+                    sessionIdClientId.put(s.getId(), clientIdentifier);
                     // #valueUnbound() will be called on session expiration
                     s.setAttribute(this.getClass().getName(), this);
                     s.setMaxInactiveInterval(sessionInactiveInterval);
@@ -231,6 +258,32 @@
     }
 
 
+    /**
+     * Builds the key under which a crawler session is tracked. The key always
+     * starts with the client IP; <code>-</code> plus the host name is appended
+     * when the Valve is host aware and the context name is appended when it is
+     * context aware.
+     *
+     * @param host     Host of the request, its name is read when host aware
+     * @param context  Context of the request, may be <code>null</code>
+     * @param clientIp Remote address of the client
+     *
+     * @return the identifier used as key in the session maps
+     */
+    private String getClientIdentifier(Host host, Context context, String clientIp) {
+        StringBuilder result = new StringBuilder(clientIp);
+        if (isHostAware) {
+            result.append('-').append(host.getName());
+        }
+        // NOTE(review): a request that maps to no Context presumably arrives here
+        // with a null Context; skip the name rather than fail with an NPE.
+        if (isContextAware && context != null) {
+            result.append(context.getName());
+        }
+        return result.toString();
+    }
+
+
     @Override
     public void valueBound(HttpSessionBindingEvent event) {
         // NOOP
@@ -239,9 +278,9 @@
 
     @Override
     public void valueUnbound(HttpSessionBindingEvent event) {
-        String clientIp = sessionIdClientIp.remove(event.getSession().getId());
-        if (clientIp != null) {
-            clientIpSessionId.remove(clientIp);
+        String clientIdentifier = sessionIdClientId.remove(event.getSession().getId());
+        if (clientIdentifier != null) {
+            clientIdSessionId.remove(clientIdentifier);
         }
     }
 }
diff --git a/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java b/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
index bfdbaba..759a248 100644
--- a/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
+++ b/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
@@ -16,12 +16,17 @@
  */
 package org.apache.catalina.valves;
 
+import java.io.IOException;
+import java.util.Arrays;
 import java.util.Collections;
 
+import javax.servlet.ServletException;
 import javax.servlet.http.HttpSession;
 
 import org.junit.Test;
 
+import org.apache.catalina.Context;
+import org.apache.catalina.Host;
 import org.apache.catalina.Valve;
 import org.apache.catalina.connector.Request;
 import org.apache.catalina.connector.Response;
@@ -34,6 +39,7 @@
     public void testCrawlerIpsPositive() throws Exception {
         CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
         valve.setCrawlerIps("216\\.58\\.206\\.174");
+        valve.setCrawlerUserAgents(valve.getCrawlerUserAgents());
         valve.setNext(EasyMock.createMock(Valve.class));
         HttpSession session = createSessionExpectations(valve, true);
         Request request = createRequestExpectations("216.58.206.174", session, true);
@@ -49,6 +55,7 @@
     public void testCrawlerIpsNegative() throws Exception {
         CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
         valve.setCrawlerIps("216\\.58\\.206\\.174");
+        valve.setCrawlerUserAgents(valve.getCrawlerUserAgents());
         valve.setNext(EasyMock.createMock(Valve.class));
         HttpSession session = createSessionExpectations(valve, false);
         Request request = createRequestExpectations("127.0.0.1", session, false);
@@ -60,6 +67,32 @@
         EasyMock.verify(request, session);
     }
 
+    @Test
+    public void testCrawlerMultipleHostsHostAware() throws Exception {
+        CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
+        valve.setCrawlerUserAgents(valve.getCrawlerUserAgents());
+        valve.setHostAware(true);
+        valve.setContextAware(true);
+        valve.setNext(EasyMock.createMock(Valve.class));
+
+        verifyCrawlingLocalhost(valve, "localhost");
+        verifyCrawlingLocalhost(valve, "example.invalid");
+    }
+
+
+    private void verifyCrawlingLocalhost(CrawlerSessionManagerValve valve, String hostname)
+            throws IOException, ServletException {
+        HttpSession session = createSessionExpectations(valve, true);
+        Request request = createRequestExpectations("127.0.0.1", session, true, hostname, "tomcatBot 1.0");
+
+        EasyMock.replay(request, session);
+
+        valve.invoke(request, EasyMock.createMock(Response.class));
+
+        EasyMock.verify(request, session);
+    }
+
+
     private HttpSession createSessionExpectations(CrawlerSessionManagerValve valve, boolean isBot) {
         HttpSession session = EasyMock.createMock(HttpSession.class);
         if (isBot) {
@@ -72,15 +105,36 @@
         return session;
     }
 
+
     private Request createRequestExpectations(String ip, HttpSession session, boolean isBot) {
+        return createRequestExpectations(ip, session, isBot, "localhost", "something 1.0");
+    }
+
+    private Request createRequestExpectations(String ip, HttpSession session, boolean isBot, String hostname, String userAgent) {
         Request request = EasyMock.createMock(Request.class);
         EasyMock.expect(request.getRemoteAddr()).andReturn(ip);
+        EasyMock.expect(request.getHost()).andReturn(simpleHostWithName(hostname));
+        EasyMock.expect(request.getContext()).andReturn(simpleContextWithName());
         IExpectationSetters<HttpSession> setter = EasyMock.expect(request.getSession(false))
                 .andReturn(null);
         if (isBot) {
             setter.andReturn(session);
         }
-        EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.<String>emptyEnumeration());
+        EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.enumeration(Arrays.asList(userAgent)));
         return request;
     }
+
+    private Host simpleHostWithName(String hostname) {
+        Host host = EasyMock.createMock(Host.class);
+        EasyMock.expect(host.getName()).andReturn(hostname);
+        EasyMock.replay(host);
+        return host;
+    }
+
+    private Context simpleContextWithName() {
+        Context context = EasyMock.createMock(Context.class);
+        EasyMock.expect(context.getName()).andReturn("/examples");
+        EasyMock.replay(context);
+        return context;
+    }
 }
diff --git a/webapps/docs/changelog.xml b/webapps/docs/changelog.xml
index 32fe2a7..6cc9ac6 100644
--- a/webapps/docs/changelog.xml
+++ b/webapps/docs/changelog.xml
@@ -58,6 +58,11 @@
         type="javax.sql.XADataSource"</code>. Patch provided by Masafumi Miura.
         (csutherl)
       </fix>
+      <fix>
+        <bug>62297</bug>: Enable the <code>CrawlerSessionManagerValve</code> to
+        correctly handle bots that crawl multiple hosts and/or web applications
+        when the Valve is configured on a Host or an Engine. (fschumacher)
+      </fix>
     </changelog>
   </subsection>  
   <subsection name="Jasper">
diff --git a/webapps/docs/config/valve.xml b/webapps/docs/config/valve.xml
index 56db80f..7307a85 100644
--- a/webapps/docs/config/valve.xml
+++ b/webapps/docs/config/valve.xml
@@ -1664,6 +1664,13 @@
         </p>
       </attribute>
 
+      <attribute name="contextAware" required="false">
+        <p>Flag to use the context name together with the client IP to
+        identify the session to re-use. Can be combined with <code>hostAware</code>.
+        Default value: <code>true</code>.
+        </p>
+      </attribute>
+
       <attribute name="crawlerIps" required="false">
         <p>Regular expression (using <code>java.util.regex</code>) that client
         IP is matched against to determine if a request is from a web crawler.
@@ -1677,6 +1684,13 @@
         <code>.*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*</code> is used.</p>
       </attribute>
 
+      <attribute name="hostAware" required="false">
+        <p>Flag to use the name of the configured host together with the client IP
+        to identify the session to re-use. Can be combined with <code>contextAware</code>.
+        Default value: <code>true</code>.
+        </p>
+      </attribute>
+
       <attribute name="sessionInactiveInterval" required="false">
         <p>The minimum time in seconds that the Crawler Session Manager Valve
         should keep the mapping of client IP to session ID in memory without any