Fix https://bz.apache.org/bugzilla/show_bug.cgi?id=62297
Enable the CrawlerSessionManagerValve to correctly handle bots that crawl multiple hosts and/or web applications when the Valve is configured on a Host or an Engine.
git-svn-id: https://svn.apache.org/repos/asf/tomcat/tc8.0.x/trunk@1829936 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
index 4a698e9..aab1caf 100644
--- a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
+++ b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
@@ -27,6 +27,8 @@
import javax.servlet.http.HttpSessionBindingEvent;
import javax.servlet.http.HttpSessionBindingListener;
+import org.apache.catalina.Context;
+import org.apache.catalina.Host;
import org.apache.catalina.LifecycleException;
import org.apache.catalina.connector.Request;
import org.apache.catalina.connector.Response;
@@ -44,8 +46,8 @@
private static final Log log = LogFactory.getLog(CrawlerSessionManagerValve.class);
- private final Map<String, String> clientIpSessionId = new ConcurrentHashMap<>();
- private final Map<String, String> sessionIdClientIp = new ConcurrentHashMap<>();
+ private final Map<String, String> clientIdSessionId = new ConcurrentHashMap<>();
+ private final Map<String, String> sessionIdClientId = new ConcurrentHashMap<>();
private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*";
private Pattern uaPattern = null;
@@ -55,6 +57,10 @@
private int sessionInactiveInterval = 60;
+ private boolean isHostAware = true;
+
+ private boolean isContextAware = true;
+
/**
* Specifies a default constructor so async support can be configured.
@@ -134,7 +140,27 @@
public Map<String, String> getClientIpSessionId() {
- return clientIpSessionId;
+ return clientIdSessionId;
+ }
+
+
+ public boolean isHostAware() { // true: the host name is part of the key used to find a bot's session
+ return isHostAware;
+ }
+
+
+ public void setHostAware(boolean isHostAware) { // backs the "hostAware" valve attribute; field defaults to true
+ this.isHostAware = isHostAware;
+ }
+
+
+ public boolean isContextAware() { // true: the context name is part of the key used to find a bot's session
+ return isContextAware;
+ }
+
+
+ public void setContextAware(boolean isContextAware) { // backs the "contextAware" valve attribute; field defaults to true
+ this.isContextAware = isContextAware;
}
@@ -152,9 +178,10 @@
boolean isBot = false;
String sessionId = null;
String clientIp = request.getRemoteAddr();
+ String clientIdentifier = getClientIdentifier(request.getHost(), request.getContext(), clientIp);
if (log.isDebugEnabled()) {
- log.debug(request.hashCode() + ": ClientIp=" + clientIp + ", RequestedSessionId="
+ log.debug(request.hashCode() + ": ClientIdentifier=" + clientIdentifier + ", RequestedSessionId="
+ request.getRequestedSessionId());
}
@@ -194,7 +221,7 @@
// If this is a bot, is the session ID known?
if (isBot) {
- sessionId = clientIpSessionId.get(clientIp);
+ sessionId = clientIdSessionId.get(clientIdentifier);
if (sessionId != null) {
request.setRequestedSessionId(sessionId);
if (log.isDebugEnabled()) {
@@ -211,8 +238,8 @@
// Has bot just created a session, if so make a note of it
HttpSession s = request.getSession(false);
if (s != null) {
- clientIpSessionId.put(clientIp, s.getId());
- sessionIdClientIp.put(s.getId(), clientIp);
+ clientIdSessionId.put(clientIdentifier, s.getId());
+ sessionIdClientId.put(s.getId(), clientIdentifier);
// #valueUnbound() will be called on session expiration
s.setAttribute(this.getClass().getName(), this);
s.setMaxInactiveInterval(sessionInactiveInterval);
@@ -231,6 +258,18 @@
}
+ private String getClientIdentifier(Host host, Context context, String clientIp) {
+ StringBuilder result = new StringBuilder(clientIp); // key layout: clientIp[-hostName][contextName]
+ if (isHostAware) {
+ result.append('-').append(host.getName());
+ }
+ if (isContextAware && context != null) { // a request may not be mapped to any Context; avoid an NPE
+ result.append(context.getName());
+ }
+ return result.toString();
+ }
+
+
@Override
public void valueBound(HttpSessionBindingEvent event) {
// NOOP
@@ -239,9 +278,9 @@
@Override
public void valueUnbound(HttpSessionBindingEvent event) {
- String clientIp = sessionIdClientIp.remove(event.getSession().getId());
- if (clientIp != null) {
- clientIpSessionId.remove(clientIp);
+ String clientIdentifier = sessionIdClientId.remove(event.getSession().getId()); // reverse lookup: session ID -> client identifier
+ if (clientIdentifier != null) {
+ clientIdSessionId.remove(clientIdentifier); // drop the forward mapping too so the bot gets a fresh session next time
}
}
}
diff --git a/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java b/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
index bfdbaba..759a248 100644
--- a/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
+++ b/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
@@ -16,12 +16,17 @@
*/
package org.apache.catalina.valves;
+import java.io.IOException;
+import java.util.Arrays;
import java.util.Collections;
+import javax.servlet.ServletException;
import javax.servlet.http.HttpSession;
import org.junit.Test;
+import org.apache.catalina.Context;
+import org.apache.catalina.Host;
import org.apache.catalina.Valve;
import org.apache.catalina.connector.Request;
import org.apache.catalina.connector.Response;
@@ -34,6 +39,7 @@
public void testCrawlerIpsPositive() throws Exception {
CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
valve.setCrawlerIps("216\\.58\\.206\\.174");
+ valve.setCrawlerUserAgents(valve.getCrawlerUserAgents());
valve.setNext(EasyMock.createMock(Valve.class));
HttpSession session = createSessionExpectations(valve, true);
Request request = createRequestExpectations("216.58.206.174", session, true);
@@ -49,6 +55,7 @@
public void testCrawlerIpsNegative() throws Exception {
CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
valve.setCrawlerIps("216\\.58\\.206\\.174");
+ valve.setCrawlerUserAgents(valve.getCrawlerUserAgents());
valve.setNext(EasyMock.createMock(Valve.class));
HttpSession session = createSessionExpectations(valve, false);
Request request = createRequestExpectations("127.0.0.1", session, false);
@@ -60,6 +67,32 @@
EasyMock.verify(request, session);
}
+ @Test
+ public void testCrawlerMultipleHostsHostAware() throws Exception {
+ CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
+ valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); // presumably compiles the user-agent pattern, which starts out null
+ valve.setHostAware(true);
+ valve.setContextAware(true);
+ valve.setNext(EasyMock.createMock(Valve.class));
+
+ verifyCrawlingLocalhost(valve, "localhost"); // bot's first visit: a session is recorded for this host
+ verifyCrawlingLocalhost(valve, "example.invalid"); // same bot IP, other host: must not be handed the first session
+ }
+
+
+ private void verifyCrawlingLocalhost(CrawlerSessionManagerValve valve, String hostname)
+ throws IOException, ServletException { // sends one bot request from 127.0.0.1 to the given host
+ HttpSession session = createSessionExpectations(valve, true);
+ Request request = createRequestExpectations("127.0.0.1", session, true, hostname, "tomcatBot 1.0");
+
+ EasyMock.replay(request, session);
+
+ valve.invoke(request, EasyMock.createMock(Response.class));
+
+ EasyMock.verify(request, session); // checks the recorded request/session expectations were met
+ }
+
+
private HttpSession createSessionExpectations(CrawlerSessionManagerValve valve, boolean isBot) {
HttpSession session = EasyMock.createMock(HttpSession.class);
if (isBot) {
@@ -72,15 +105,36 @@
return session;
}
+
private Request createRequestExpectations(String ip, HttpSession session, boolean isBot) {
+ return createRequestExpectations(ip, session, isBot, "localhost", "something 1.0"); // defaults: host "localhost" and a user agent without "bot" in it
+ }
+
+ private Request createRequestExpectations(String ip, HttpSession session, boolean isBot, String hostname, String userAgent) {
Request request = EasyMock.createMock(Request.class);
EasyMock.expect(request.getRemoteAddr()).andReturn(ip);
+ EasyMock.expect(request.getHost()).andReturn(simpleHostWithName(hostname)); // the valve reads the host to build the client identifier
+ EasyMock.expect(request.getContext()).andReturn(simpleContextWithName()); // likewise for the context
IExpectationSetters<HttpSession> setter = EasyMock.expect(request.getSession(false))
.andReturn(null);
if (isBot) {
setter.andReturn(session);
}
- EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.<String>emptyEnumeration());
+ EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.enumeration(Arrays.asList(userAgent))); // single user-agent header
return request;
}
+
+ private Host simpleHostWithName(String hostname) { // returns a Host mock that is already in replay state
+ Host host = EasyMock.createMock(Host.class);
+ EasyMock.expect(host.getName()).andReturn(hostname); // only getName() is stubbed
+ EasyMock.replay(host);
+ return host;
+ }
+
+ private Context simpleContextWithName() { // returns a Context mock that is already in replay state
+ Context context = EasyMock.createMock(Context.class);
+ EasyMock.expect(context.getName()).andReturn("/examples"); // every test request uses this fixed context name
+ EasyMock.replay(context);
+ return context;
+ }
}
diff --git a/webapps/docs/changelog.xml b/webapps/docs/changelog.xml
index 32fe2a7..6cc9ac6 100644
--- a/webapps/docs/changelog.xml
+++ b/webapps/docs/changelog.xml
@@ -58,6 +58,11 @@
type="javax.sql.XADataSource"</code>. Patch provided by Masafumi Miura.
(csutherl)
</fix>
+ <fix>
+ <bug>62297</bug>: Enable the <code>CrawlerSessionManagerValve</code> to
+ correctly handle bots that crawl multiple hosts and/or web applications
+ when the Valve is configured on a Host or an Engine. (fschumacher)
+ </fix>
</changelog>
</subsection>
<subsection name="Jasper">
diff --git a/webapps/docs/config/valve.xml b/webapps/docs/config/valve.xml
index 56db80f..7307a85 100644
--- a/webapps/docs/config/valve.xml
+++ b/webapps/docs/config/valve.xml
@@ -1664,6 +1664,13 @@
</p>
</attribute>
+ <attribute name="contextAware" required="false">
+ <p>Flag to use the context name together with the client IP to
+ identify the session to re-use. Can be combined with <code>hostAware</code>.
+ Default value: <code>true</code>
+ </p>
+ </attribute>
+
<attribute name="crawlerIps" required="false">
<p>Regular expression (using <code>java.util.regex</code>) that client
IP is matched against to determine if a request is from a web crawler.
@@ -1677,6 +1684,13 @@
<code>.*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*</code> is used.</p>
</attribute>
+ <attribute name="hostAware" required="false">
+ <p>Flag to use the name of the host that handles the request together with
+ the client IP to identify the session to re-use. Can be combined with
+ <code>contextAware</code>. Default value: <code>true</code>
+ </p>
+ </attribute>
+
<attribute name="sessionInactiveInterval" required="false">
<p>The minimum time in seconds that the Crawler Session Manager Valve
should keep the mapping of client IP to session ID in memory without any