| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.update.processor; |
| |
| import java.io.IOException; |
| import java.lang.invoke.MethodHandles; |
| import java.net.MalformedURLException; |
| import java.net.URI; |
| import java.net.URISyntaxException; |
| import java.net.URL; |
| import java.util.Locale; |
| |
| import org.apache.solr.common.SolrInputDocument; |
| import org.apache.solr.common.params.SolrParams; |
| import org.apache.solr.request.SolrQueryRequest; |
| import org.apache.solr.response.SolrQueryResponse; |
| import org.apache.solr.update.AddUpdateCommand; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * <p> |
| * Update processor which examines a URL and outputs to various other fields |
| * characteristics of that URL, including length, number of path levels, whether |
| * it is a top level URL (levels==0), whether it looks like a landing/index page, |
| * a canonical representation of the URL (e.g. stripping index.html), the domain |
| * and path parts of the URL etc. |
| * </p> |
| * |
| * <p> |
| * This processor is intended used in connection with processing web resources, |
| * and helping to produce values which may be used for boosting or filtering later. |
| * </p> |
| * |
| * <p> |
| * In the example configuration below, we construct a custom |
| * <code>updateRequestProcessorChain</code> and then instruct the |
| * <code>/update</code> requesthandler to use it for every incoming document. |
| * </p> |
| * <pre class="prettyprint"> |
| * <updateRequestProcessorChain name="urlProcessor"> |
| * <processor class="org.apache.solr.update.processor.URLClassifyProcessorFactory"> |
| * <bool name="enabled">true</bool> |
| * <str name="inputField">id</str> |
| * <str name="domainOutputField">hostname</str> |
| * </processor> |
| * <processor class="solr.RunUpdateProcessorFactory" /> |
| * </updateRequestProcessorChain> |
| * |
| * <requestHandler name="/update" class="solr.UpdateRequestHandler"> |
| * <lst name="defaults"> |
| * <str name="update.chain">urlProcessor</str> |
| * </lst> |
| * </requestHandler> |
| * </pre> |
| * <p> |
| * Then, at index time, Solr will look at the <code>id</code> field value and extract |
| * it's domain portion into a new <code>hostname</code> field. By default, the |
| * following fields will also be added: |
| * </p> |
| * <ul> |
| * <li>url_length</li> |
| * <li>url_levels</li> |
| * <li>url_toplevel</li> |
| * <li>url_landingpage</li> |
| * </ul> |
| * <p> |
| * For example, adding the following document |
| * <pre class="prettyprint"> |
| * { "id":"http://wwww.mydomain.com/subpath/document.html" } |
| * </pre> |
| * <p> |
| * will result in this document in Solr: |
| * </p> |
| * <pre class="prettyprint"> |
| * { |
| * "id":"http://wwww.mydomain.com/subpath/document.html", |
| * "url_length":46, |
| * "url_levels":2, |
| * "url_toplevel":0, |
| * "url_landingpage":0, |
| * "hostname":"wwww.mydomain.com", |
| * "_version_":1603193062117343232}] |
| * } |
| * </pre> |
| */ |
| public class URLClassifyProcessor extends UpdateRequestProcessor { |
| |
| private static final String INPUT_FIELD_PARAM = "inputField"; |
| private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField"; |
| private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField"; |
| private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField"; |
| private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField"; |
| private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField"; |
| private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField"; |
| private static final String DEFAULT_URL_FIELDNAME = "url"; |
| private static final String DEFAULT_LENGTH_FIELDNAME = "url_length"; |
| private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels"; |
| private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel"; |
| private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage"; |
| private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| private boolean enabled = true; |
| private String urlFieldname = DEFAULT_URL_FIELDNAME; |
| private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME; |
| private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME; |
| private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME; |
| private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME; |
| private String domainFieldname = null; |
| private String canonicalUrlFieldname = null; |
| private static final String[] landingPageSuffixes = { |
| "/", |
| "index.html", |
| "index.htm", |
| "index.phtml", |
| "index.shtml", |
| "index.xml", |
| "index.php", |
| "index.asp", |
| "index.aspx", |
| "welcome.html", |
| "welcome.htm", |
| "welcome.phtml", |
| "welcome.shtml", |
| "welcome.xml", |
| "welcome.php", |
| "welcome.asp", |
| "welcome.aspx" |
| }; |
| |
| public URLClassifyProcessor(SolrParams parameters, |
| SolrQueryRequest request, |
| SolrQueryResponse response, |
| UpdateRequestProcessor nextProcessor) { |
| super(nextProcessor); |
| |
| this.initParameters(parameters); |
| } |
| |
| private void initParameters(SolrParams parameters) { |
| if (parameters != null) { |
| this.setEnabled(parameters.getBool("enabled", true)); |
| this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME); |
| this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME); |
| this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME); |
| this.toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME); |
| this.landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME); |
| this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM); |
| this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM); |
| } |
| } |
| |
| @Override |
| public void processAdd(AddUpdateCommand command) throws IOException { |
| if (isEnabled()) { |
| SolrInputDocument document = command.getSolrInputDocument(); |
| if (document.containsKey(urlFieldname)) { |
| String url = (String) document.getFieldValue(urlFieldname); |
| try { |
| URL normalizedURL = getNormalizedURL(url); |
| document.setField(lengthFieldname, length(normalizedURL)); |
| document.setField(levelsFieldname, levels(normalizedURL)); |
| document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0); |
| document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0); |
| if (domainFieldname != null) { |
| document.setField(domainFieldname, normalizedURL.getHost()); |
| } |
| if (canonicalUrlFieldname != null) { |
| document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL)); |
| } |
| log.debug("{}", document); |
| } catch (MalformedURLException | URISyntaxException e) { |
| log.warn("cannot get the normalized url for '{}' due to ", url, e); |
| } |
| } |
| } |
| super.processAdd(command); |
| } |
| |
| /** |
| * Gets a canonical form of the URL for use as main URL |
| * @param url The input url |
| * @return The URL object representing the canonical URL |
| */ |
| public URL getCanonicalUrl(URL url) { |
| // NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should) |
| String urlString = url.toString(); |
| try { |
| String lps = landingPageSuffix(url); |
| return new URL(urlString.replaceFirst("/"+lps+"$", "/")); |
| } catch (MalformedURLException e) { |
| e.printStackTrace(); |
| } |
| return url; |
| } |
| |
| /** |
| * Calculates the length of the URL in characters |
| * @param url The input URL |
| * @return the length of the URL |
| */ |
| public int length(URL url) { |
| return url.toString().length(); |
| } |
| |
| /** |
| * Calculates the number of path levels in the given URL |
| * @param url The input URL |
| * @return the number of levels, where a top-level URL is 0 |
| */ |
| public int levels(URL url) { |
| // Remove any trailing slashes for the purpose of level counting |
| String path = getPathWithoutSuffix(url).replaceAll("/+$", ""); |
| int levels = 0; |
| for (int i = 0; i < path.length(); i++) { |
| if (path.charAt(i) == '/') { |
| levels++; |
| } |
| } |
| return levels; |
| } |
| |
| /** |
| * Calculates whether a URL is a top level page |
| * @param url The input URL |
| * @return true if page is a top level page |
| */ |
| public boolean isTopLevelPage(URL url) { |
| // Remove any trailing slashes for the purpose of level counting |
| String path = getPathWithoutSuffix(url).replaceAll("/+$", ""); |
| return path.length() == 0 && url.getQuery() == null; |
| } |
| |
| /** |
| * Calculates whether the URL is a landing page or not |
| * @param url The input URL |
| * @return true if URL represents a landing page (index page) |
| */ |
| public boolean isLandingPage(URL url) { |
| if (url.getQuery() != null) { |
| return false; |
| } else { |
| return landingPageSuffix(url) != ""; |
| } |
| } |
| |
| public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException { |
| return new URI(url).normalize().toURL(); |
| } |
| |
| public boolean isEnabled() { |
| return enabled; |
| } |
| |
| public void setEnabled(boolean enabled) { |
| this.enabled = enabled; |
| } |
| |
| private String landingPageSuffix(URL url) { |
| String path = url.getPath().toLowerCase(Locale.ROOT); |
| for(String suffix : landingPageSuffixes) { |
| if(path.endsWith(suffix)) { |
| return suffix; |
| } |
| } |
| return ""; |
| } |
| |
| private String getPathWithoutSuffix(URL url) { |
| return url.getPath().toLowerCase(Locale.ROOT).replaceFirst(landingPageSuffix(url)+"$", ""); |
| } |
| } |