| /* | 
 |  * Licensed to the Apache Software Foundation (ASF) under one or more | 
 |  * contributor license agreements.  See the NOTICE file distributed with | 
 |  * this work for additional information regarding copyright ownership. | 
 |  * The ASF licenses this file to You under the Apache License, Version 2.0 | 
 |  * (the "License"); you may not use this file except in compliance with | 
 |  * the License.  You may obtain a copy of the License at | 
 |  * | 
 |  *      http://www.apache.org/licenses/LICENSE-2.0 | 
 |  * | 
 |  * Unless required by applicable law or agreed to in writing, software | 
 |  * distributed under the License is distributed on an "AS IS" BASIS, | 
 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
 |  * See the License for the specific language governing permissions and | 
 |  * limitations under the License. | 
 |  */ | 
 | package org.apache.catalina.valves; | 
 |  | 
 | import java.io.IOException; | 
 | import java.io.Serializable; | 
 | import java.util.Enumeration; | 
 | import java.util.Map; | 
 | import java.util.concurrent.ConcurrentHashMap; | 
 | import java.util.regex.Pattern; | 
 |  | 
 | import jakarta.servlet.ServletException; | 
 | import jakarta.servlet.http.HttpSession; | 
 | import jakarta.servlet.http.HttpSessionBindingEvent; | 
 | import jakarta.servlet.http.HttpSessionBindingListener; | 
 |  | 
 | import org.apache.catalina.Context; | 
 | import org.apache.catalina.Host; | 
 | import org.apache.catalina.LifecycleException; | 
 | import org.apache.catalina.connector.Request; | 
 | import org.apache.catalina.connector.Response; | 
 | import org.apache.juli.logging.Log; | 
 | import org.apache.juli.logging.LogFactory; | 
 |  | 
 | /** | 
 |  * Web crawlers can trigger the creation of many thousands of sessions as they | 
 |  * crawl a site which may result in significant memory consumption. This Valve | 
 |  * ensures that crawlers are associated with a single session - just like normal | 
 |  * users - regardless of whether or not they provide a session token with their | 
 |  * requests. | 
 |  */ | 
 | public class CrawlerSessionManagerValve extends ValveBase { | 
 |  | 
 |     private static final Log log = LogFactory.getLog(CrawlerSessionManagerValve.class); | 
 |  | 
 |     private final Map<String, String> clientIdSessionId = new ConcurrentHashMap<>(); | 
 |     private final Map<String, String> sessionIdClientId = new ConcurrentHashMap<>(); | 
 |  | 
 |     private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*"; | 
 |     private Pattern uaPattern = null; | 
 |  | 
 |     private String crawlerIps = null; | 
 |     private Pattern ipPattern = null; | 
 |  | 
 |     private int sessionInactiveInterval = 60; | 
 |  | 
 |     private boolean isHostAware = true; | 
 |  | 
 |     private boolean isContextAware = true; | 
 |  | 
 |  | 
 |     /** | 
 |      * Specifies a default constructor so async support can be configured. | 
 |      */ | 
 |     public CrawlerSessionManagerValve() { | 
 |         super(true); | 
 |     } | 
 |  | 
 |  | 
 |     /** | 
 |      * Specify the regular expression (using {@link Pattern}) that will be used | 
 |      * to identify crawlers based in the User-Agent header provided. The default | 
 |      * is ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*" | 
 |      * | 
 |      * @param crawlerUserAgents The regular expression using {@link Pattern} | 
 |      */ | 
 |     public void setCrawlerUserAgents(String crawlerUserAgents) { | 
 |         this.crawlerUserAgents = crawlerUserAgents; | 
 |         if (crawlerUserAgents == null || crawlerUserAgents.length() == 0) { | 
 |             uaPattern = null; | 
 |         } else { | 
 |             uaPattern = Pattern.compile(crawlerUserAgents); | 
 |         } | 
 |     } | 
 |  | 
 |     /** | 
 |      * @see #setCrawlerUserAgents(String) | 
 |      * @return  The current regular expression being used to match user agents. | 
 |      */ | 
 |     public String getCrawlerUserAgents() { | 
 |         return crawlerUserAgents; | 
 |     } | 
 |  | 
 |  | 
 |     /** | 
 |      * Specify the regular expression (using {@link Pattern}) that will be used | 
 |      * to identify crawlers based on their IP address. The default is no crawler | 
 |      * IPs. | 
 |      * | 
 |      * @param crawlerIps The regular expression using {@link Pattern} | 
 |      */ | 
 |     public void setCrawlerIps(String crawlerIps) { | 
 |         this.crawlerIps = crawlerIps; | 
 |         if (crawlerIps == null || crawlerIps.length() == 0) { | 
 |             ipPattern = null; | 
 |         } else { | 
 |             ipPattern = Pattern.compile(crawlerIps); | 
 |         } | 
 |     } | 
 |  | 
 |     /** | 
 |      * @see #setCrawlerIps(String) | 
 |      * @return The current regular expression being used to match IP addresses. | 
 |      */ | 
 |     public String getCrawlerIps() { | 
 |         return crawlerIps; | 
 |     } | 
 |  | 
 |  | 
 |     /** | 
 |      * Specify the session timeout (in seconds) for a crawler's session. This is | 
 |      * typically lower than that for a user session. The default is 60 seconds. | 
 |      * | 
 |      * @param sessionInactiveInterval   The new timeout for crawler sessions | 
 |      */ | 
 |     public void setSessionInactiveInterval(int sessionInactiveInterval) { | 
 |         this.sessionInactiveInterval = sessionInactiveInterval; | 
 |     } | 
 |  | 
 |     /** | 
 |      * @see #setSessionInactiveInterval(int) | 
 |      * @return  The current timeout in seconds | 
 |      */ | 
 |     public int getSessionInactiveInterval() { | 
 |         return sessionInactiveInterval; | 
 |     } | 
 |  | 
 |  | 
 |     public Map<String, String> getClientIpSessionId() { | 
 |         return clientIdSessionId; | 
 |     } | 
 |  | 
 |  | 
 |     public boolean isHostAware() { | 
 |         return isHostAware; | 
 |     } | 
 |  | 
 |  | 
 |     public void setHostAware(boolean isHostAware) { | 
 |         this.isHostAware = isHostAware; | 
 |     } | 
 |  | 
 |  | 
 |     public boolean isContextAware() { | 
 |         return isContextAware; | 
 |     } | 
 |  | 
 |  | 
 |     public void setContextAware(boolean isContextAware) { | 
 |         this.isContextAware = isContextAware; | 
 |     } | 
 |  | 
 |  | 
 |     @Override | 
 |     protected void initInternal() throws LifecycleException { | 
 |         super.initInternal(); | 
 |  | 
 |         uaPattern = Pattern.compile(crawlerUserAgents); | 
 |     } | 
 |  | 
 |  | 
 |     @Override | 
 |     public void invoke(Request request, Response response) throws IOException, ServletException { | 
 |  | 
 |         boolean isBot = false; | 
 |         String sessionId = null; | 
 |         String clientIp = request.getRemoteAddr(); | 
 |         String clientIdentifier = getClientIdentifier(request.getHost(), request.getContext(), clientIp); | 
 |  | 
 |         if (log.isDebugEnabled()) { | 
 |             log.debug(request.hashCode() + ": ClientIdentifier=" + clientIdentifier + ", RequestedSessionId=" | 
 |                     + request.getRequestedSessionId()); | 
 |         } | 
 |  | 
 |         // If the incoming request has a valid session ID, no action is required | 
 |         if (request.getSession(false) == null) { | 
 |  | 
 |             // Is this a crawler - check the UA headers | 
 |             Enumeration<String> uaHeaders = request.getHeaders("user-agent"); | 
 |             String uaHeader = null; | 
 |             if (uaHeaders.hasMoreElements()) { | 
 |                 uaHeader = uaHeaders.nextElement(); | 
 |             } | 
 |  | 
 |             // If more than one UA header - assume not a bot | 
 |             if (uaHeader != null && !uaHeaders.hasMoreElements()) { | 
 |  | 
 |                 if (log.isDebugEnabled()) { | 
 |                     log.debug(request.hashCode() + ": UserAgent=" + uaHeader); | 
 |                 } | 
 |  | 
 |                 if (uaPattern.matcher(uaHeader).matches()) { | 
 |                     isBot = true; | 
 |  | 
 |                     if (log.isDebugEnabled()) { | 
 |                         log.debug(request.hashCode() + ": Bot found. UserAgent=" + uaHeader); | 
 |                     } | 
 |                 } | 
 |             } | 
 |  | 
 |             if (ipPattern != null && ipPattern.matcher(clientIp).matches()) { | 
 |                 isBot = true; | 
 |  | 
 |                 if (log.isDebugEnabled()) { | 
 |                     log.debug(request.hashCode() + ": Bot found. IP=" + clientIp); | 
 |                 } | 
 |             } | 
 |  | 
 |             // If this is a bot, is the session ID known? | 
 |             if (isBot) { | 
 |                 sessionId = clientIdSessionId.get(clientIdentifier); | 
 |                 if (sessionId != null) { | 
 |                     request.setRequestedSessionId(sessionId); | 
 |                     if (log.isDebugEnabled()) { | 
 |                         log.debug(request.hashCode() + ": SessionID=" + sessionId); | 
 |                     } | 
 |                 } | 
 |             } | 
 |         } | 
 |  | 
 |         getNext().invoke(request, response); | 
 |  | 
 |         if (isBot) { | 
 |             if (sessionId == null) { | 
 |                 // Has bot just created a session, if so make a note of it | 
 |                 HttpSession s = request.getSession(false); | 
 |                 if (s != null) { | 
 |                     clientIdSessionId.put(clientIdentifier, s.getId()); | 
 |                     sessionIdClientId.put(s.getId(), clientIdentifier); | 
 |                     // #valueUnbound() will be called on session expiration | 
 |                     s.setAttribute(this.getClass().getName(), | 
 |                             new CrawlerHttpSessionBindingListener(clientIdSessionId, clientIdentifier)); | 
 |                     s.setMaxInactiveInterval(sessionInactiveInterval); | 
 |  | 
 |                     if (log.isDebugEnabled()) { | 
 |                         log.debug(request.hashCode() + ": New bot session. SessionID=" + s.getId()); | 
 |                     } | 
 |                 } | 
 |             } else { | 
 |                 if (log.isDebugEnabled()) { | 
 |                     log.debug( | 
 |                             request.hashCode() + ": Bot session accessed. SessionID=" + sessionId); | 
 |                 } | 
 |             } | 
 |         } | 
 |     } | 
 |  | 
 |  | 
 |     private String getClientIdentifier(Host host, Context context, String clientIp) { | 
 |         StringBuilder result = new StringBuilder(clientIp); | 
 |         if (isHostAware) { | 
 |             result.append('-').append(host.getName()); | 
 |         } | 
 |         if (isContextAware && context != null) { | 
 |             result.append(context.getName()); | 
 |         } | 
 |         return result.toString(); | 
 |     } | 
 |  | 
 |     private static class CrawlerHttpSessionBindingListener implements HttpSessionBindingListener, Serializable { | 
 |         private static final long serialVersionUID = 1L; | 
 |  | 
 |         private final transient Map<String, String> clientIdSessionId; | 
 |         private final transient String clientIdentifier; | 
 |  | 
 |         private CrawlerHttpSessionBindingListener(Map<String, String> clientIdSessionId, String clientIdentifier) { | 
 |             this.clientIdSessionId = clientIdSessionId; | 
 |             this.clientIdentifier = clientIdentifier; | 
 |         } | 
 |  | 
 |         @Override | 
 |         public void valueUnbound(HttpSessionBindingEvent event) { | 
 |             if (clientIdentifier != null && clientIdSessionId != null) { | 
 |                 clientIdSessionId.remove(clientIdentifier, event.getSession().getId()); | 
 |             } | 
 |         } | 
 |     } | 
 | } |