| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.util; |
| |
| import javax.xml.parsers.DocumentBuilderFactory; |
| import javax.xml.parsers.ParserConfigurationException; |
| import javax.xml.xpath.XPath; |
| import javax.xml.xpath.XPathConstants; |
| import javax.xml.xpath.XPathExpression; |
| import javax.xml.xpath.XPathExpressionException; |
| import javax.xml.xpath.XPathFactory; |
| import java.io.BufferedReader; |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.File; |
| import java.io.FileFilter; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.OutputStream; |
| import java.net.HttpURLConnection; |
| import java.net.MalformedURLException; |
| import java.net.ProtocolException; |
| import java.net.URL; |
| import java.net.URLEncoder; |
| import java.nio.BufferOverflowException; |
| import java.nio.ByteBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.InvalidPathException; |
| import java.security.GeneralSecurityException; |
| import java.text.SimpleDateFormat; |
| import java.util.ArrayList; |
| import java.util.Base64; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TimeZone; |
| import java.util.regex.Pattern; |
| import java.util.regex.PatternSyntaxException; |
| import java.util.zip.GZIPInputStream; |
| import java.util.zip.Inflater; |
| import java.util.zip.InflaterInputStream; |
| |
| import org.w3c.dom.Document; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| import org.xml.sax.SAXException; |
| |
| import static java.nio.charset.StandardCharsets.US_ASCII; |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| |
| /** |
| * A simple utility class for posting raw updates to a Solr server, |
| * has a main method so it can be run on the command line. |
| * View this not as a best-practice code example, but as a standalone |
| * example built with an explicit purpose of not having external |
| * jar dependencies. |
| */ |
| public class SimplePostTool { |
  // Defaults for the target Solr instance
  private static final String DEFAULT_POST_HOST = "localhost";
  private static final String DEFAULT_POST_PORT = "8983";
  private static final String VERSION_OF_THIS_TOOL = "5.0.0"; // TODO: hardcoded for now, but eventually to sync with actual Solr version

  // Defaults for boolean-ish system properties (parsed with isOn())
  private static final String DEFAULT_COMMIT = "yes";
  private static final String DEFAULT_OPTIMIZE = "no";
  private static final String DEFAULT_OUT = "no";
  private static final String DEFAULT_AUTO = "no";
  private static final String DEFAULT_RECURSIVE = "0";
  private static final int DEFAULT_WEB_DELAY = 10; // seconds between fetches in web mode
  private static final int MAX_WEB_DEPTH = 10; // hard cap on -Drecursive for web mode
  private static final String DEFAULT_CONTENT_TYPE = "application/xml";
  private static final String DEFAULT_FILE_TYPES = "xml,json,jsonl,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";
  private static final String BASIC_AUTH = "basicauth"; // name of the user:pass system property

  // The four supported values for the -Ddata system property
  static final String DATA_MODE_FILES = "files";
  static final String DATA_MODE_ARGS = "args";
  static final String DATA_MODE_STDIN = "stdin";
  static final String DATA_MODE_WEB = "web";
  static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;

  // Input args
  boolean auto = false; // if true, guess content-type and add resource.name/literal.id
  int recursive = 0; // recursion depth for file/web mode; 0 = no recursion
  int delay = 0; // seconds to sleep between posts
  String fileTypes; // comma-separated file endings accepted in auto/web mode
  URL solrUrl; // base update URL to post to
  OutputStream out = null; // where Solr's response is echoed, or null to suppress
  String type; // explicit content-type, or null (guessed/defaulted)
  String format; // "solr" keeps JSON going to /update instead of /update/json/docs
  String mode; // one of the DATA_MODE_* constants
  boolean commit; // commit after posting?
  boolean optimize; // optimize after posting?
  String[] args; // remaining command-line arguments; meaning varies by mode

  private int currentDepth; // current directory recursion depth in files mode

  static HashMap<String,String> mimeMap; // file ending -> MIME type; built in the static initializer
  FileFilter fileFilter; // accepts files whose endings are in fileTypes
  // Backlog for crawling
  List<LinkedHashSet<URL>> backlog = new ArrayList<>(); // one URL set per crawl depth
  Set<URL> visited = new HashSet<>(); // URLs already fetched, to avoid re-crawling

  static final Set<String> DATA_MODES = new HashSet<>();
  static final String USAGE_STRING_SHORT =
      "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|url|arg> [<file|folder|url|arg>...]]";

  // Used in tests to avoid doing actual network traffic
  boolean mockMode = false;
  PageFetcher pageFetcher; // fetches pages and extracts links in web mode
| |
  static {
    // Register the legal values of the "data" system property
    DATA_MODES.add(DATA_MODE_FILES);
    DATA_MODES.add(DATA_MODE_ARGS);
    DATA_MODES.add(DATA_MODE_STDIN);
    DATA_MODES.add(DATA_MODE_WEB);

    // File ending -> MIME type map consulted by guessType() in auto mode
    mimeMap = new HashMap<>();
    mimeMap.put("xml", "application/xml");
    mimeMap.put("csv", "text/csv");
    mimeMap.put("json", "application/json");
    mimeMap.put("jsonl", "application/json");
    mimeMap.put("pdf", "application/pdf");
    mimeMap.put("rtf", "text/rtf");
    mimeMap.put("html", "text/html");
    mimeMap.put("htm", "text/html");
    mimeMap.put("doc", "application/msword");
    mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    mimeMap.put("ppt", "application/vnd.ms-powerpoint");
    mimeMap.put("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
    mimeMap.put("xls", "application/vnd.ms-excel");
    mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    mimeMap.put("odt", "application/vnd.oasis.opendocument.text");
    mimeMap.put("ott", "application/vnd.oasis.opendocument.text");
    mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation");
    mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation");
    mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet");
    mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet");
    mimeMap.put("txt", "text/plain");
    mimeMap.put("log", "text/plain");
  }
| |
| /** |
| * See usage() for valid command line usage |
| * @param args the params on the command line |
| */ |
| public static void main(String[] args) { |
| info("SimplePostTool version " + VERSION_OF_THIS_TOOL); |
| if (0 < args.length && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]))) { |
| usage(); |
| } else { |
| final SimplePostTool t = parseArgsAndInit(args); |
| t.execute(); |
| } |
| } |
| |
| /** |
| * After initialization, call execute to start the post job. |
| * This method delegates to the correct mode method. |
| */ |
| public void execute() { |
| final RTimer timer = new RTimer(); |
| if (DATA_MODE_FILES.equals(mode) && args.length > 0) { |
| doFilesMode(); |
| } else if(DATA_MODE_ARGS.equals(mode) && args.length > 0) { |
| doArgsMode(); |
| } else if(DATA_MODE_WEB.equals(mode) && args.length > 0) { |
| doWebMode(); |
| } else if(DATA_MODE_STDIN.equals(mode)) { |
| doStdinMode(); |
| } else { |
| usageShort(); |
| return; |
| } |
| |
| if (commit) commit(); |
| if (optimize) optimize(); |
| displayTiming((long) timer.getTime()); |
| } |
| |
| /** |
| * Pretty prints the number of milliseconds taken to post the content to Solr |
| * @param millis the time in milliseconds |
| */ |
| private void displayTiming(long millis) { |
| SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault()); |
| df.setTimeZone(TimeZone.getTimeZone("UTC")); |
| System.out.println("Time spent: "+df.format(new Date(millis))); |
| } |
| |
  /**
   * Parses incoming arguments and system params and initializes the tool.
   * Invalid combinations are reported via fatal(), which exits the JVM.
   * @param args the incoming cmd line args
   * @return an instance of SimplePostTool
   */
  protected static SimplePostTool parseArgsAndInit(String[] args) {
    String urlStr = null;
    try {
      // Parse args
      final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
      if (! DATA_MODES.contains(mode)) {
        fatal("System Property 'data' is not valid for this tool: " + mode);
      }

      String params = System.getProperty("params", "");

      String host = System.getProperty("host", DEFAULT_POST_HOST);
      String port = System.getProperty("port", DEFAULT_POST_PORT);
      String core = System.getProperty("c");

      urlStr = System.getProperty("url");

      if (urlStr == null && core == null) {
        fatal("Specifying either url or core/collection is mandatory.\n" + USAGE_STRING_SHORT);
      }

      if(urlStr == null) {
        // No explicit URL: build the default update URL from host/port/core
        urlStr = String.format(Locale.ROOT, "http://%s:%s/solr/%s/update", host, port, core);
      }
      urlStr = SimplePostTool.appendParam(urlStr, params);
      URL url = new URL(urlStr);
      // Report the Basic Auth user name, taken either from user:pass@host in the
      // URL or from -Dbasicauth=user:pass (credentials are applied elsewhere)
      String user = null;
      if (url.getUserInfo() != null && url.getUserInfo().trim().length() > 0) {
        user = url.getUserInfo().split(":")[0];
      } else if (System.getProperty(BASIC_AUTH) != null) {
        user = System.getProperty(BASIC_AUTH).trim().split(":")[0];
      }
      if (user != null)
        info("Basic Authentication enabled, user=" + user);

      boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
      String type = System.getProperty("type");
      String format = System.getProperty("format");
      // Recursive: a numeric depth, or a truthy flag meaning "default depth"
      // (1 for web mode, effectively unlimited for files mode)
      int recursive = 0;
      String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
      try {
        recursive = Integer.parseInt(r);
      } catch(Exception e) {
        if (isOn(r))
          recursive = DATA_MODE_WEB.equals(mode)?1:999;
      }
      // Delay between posts; defaults to DEFAULT_WEB_DELAY seconds when crawling
      int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
      try {
        delay = Integer.parseInt(System.getProperty("delay", ""+delay));
      } catch(Exception e) { } // non-numeric -Ddelay: silently keep the default
      OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
      String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
      boolean commit = isOn(System.getProperty("commit",DEFAULT_COMMIT));
      boolean optimize = isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE));

      return new SimplePostTool(mode, url, auto, type, format, recursive, delay, fileTypes, out, commit, optimize, args);
    } catch (MalformedURLException e) {
      fatal("System Property 'url' is not a valid URL: " + urlStr);
      return null; // unreachable in practice: fatal() calls System.exit
    }
  }
| |
  /**
   * Constructor which takes in all mandatory input for the tool to work.
   * Also see usage() for further explanation of the params.
   * @param mode whether to post files, web pages, params or stdin
   * @param url the Solr base Url to post to, should end with /update
   * @param auto if true, we'll guess type and add resourcename/url
   * @param type content-type of the data you are posting
   * @param format if "solr", JSON files are posted to /update as-is instead of /update/json/docs
   * @param recursive number of levels for file/web mode, or 0 if one file only
   * @param delay if recursive then delay will be the wait time between posts
   * @param fileTypes a comma separated list of file-name endings to accept for file/web
   * @param out an OutputStream to write output to, e.g. stdout to print to console
   * @param commit if true, will commit at end of posting
   * @param optimize if true, will optimize at end of posting
   * @param args a String[] of arguments, varies between modes
   */
  public SimplePostTool(String mode, URL url, boolean auto, String type, String format,
                        int recursive, int delay, String fileTypes, OutputStream out,
                        boolean commit, boolean optimize, String[] args) {
    this.mode = mode;
    this.solrUrl = url;
    this.auto = auto;
    this.type = type;
    this.format = format;
    this.recursive = recursive;
    this.delay = delay;
    this.fileTypes = fileTypes;
    // Derive the FileFilter once, so directory walking can reuse it
    this.fileFilter = getFileFilterFromFileTypes(fileTypes);
    this.out = out;
    this.commit = commit;
    this.optimize = optimize;
    this.args = args;
    pageFetcher = new PageFetcher();
  }
| |
  /** No-arg constructor; leaves all fields at their declared defaults, to be set manually. */
  public SimplePostTool() {}
| |
| // |
| // Do some action depending on which mode we have |
| // |
| private void doFilesMode() { |
| currentDepth = 0; |
| // Skip posting files if special param "-" given |
| if (!args[0].equals("-")) { |
| info("Posting files to [base] url " + solrUrl + (!auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..."); |
| if(auto) |
| info("Entering auto mode. File endings considered are "+fileTypes); |
| if(recursive > 0) |
| info("Entering recursive mode, max depth="+recursive+", delay="+delay+"s"); |
| int numFilesPosted = postFiles(args, 0, out, type); |
| info(numFilesPosted + " files indexed."); |
| } |
| } |
| |
| private void doArgsMode() { |
| info("POSTing args to " + solrUrl + "..."); |
| for (String a : args) { |
| postData(stringToStream(a), null, out, type, solrUrl); |
| } |
| } |
| |
  /**
   * Crawls and posts web pages. Note: mutates instance state — rewrites solrUrl
   * to the /extract handler, forces auto mode on, and clamps recursive to
   * MAX_WEB_DEPTH.
   * @return the number of web pages posted
   */
  private int doWebMode() {
    reset();
    int numPagesPosted = 0;
    try {
      if(type != null) {
        fatal("Specifying content-type with \"-Ddata=web\" is not supported");
      }
      if (args[0].equals("-")) {
        // Skip posting url if special param "-" given
        return 0;
      }
      // Set Extracting handler as default
      solrUrl = appendUrlPath(solrUrl, "/extract");

      info("Posting web pages to Solr url "+solrUrl);
      auto=true; // web mode always derives content types from the fetched pages
      info("Entering auto mode. Indexing pages with content-types corresponding to file endings "+fileTypes);
      if(recursive > 0) {
        if(recursive > MAX_WEB_DEPTH) {
          recursive = MAX_WEB_DEPTH;
          warn("Too large recursion depth for web mode, limiting to "+MAX_WEB_DEPTH+"...");
        }
        // Warn (but don't block) crawls faster than the default courtesy delay
        if(delay < DEFAULT_WEB_DELAY)
          warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
        info("Entering recursive mode, depth="+recursive+", delay="+delay+"s");
      }
      numPagesPosted = postWebPages(args, 0, out);
      info(numPagesPosted + " web pages indexed.");
    } catch(MalformedURLException e) {
      fatal("Wrong URL trying to append /extract to "+solrUrl);
    }
    return numPagesPosted;
  }
| |
  /** Posts whatever arrives on stdin to the configured Solr URL (length unknown). */
  private void doStdinMode() {
    info("POSTing stdin to " + solrUrl + "...");
    postData(System.in, null, out, type, solrUrl);
  }
| |
  /** Resets crawl state: fresh backlog and visited set (reassigned, not cleared,
   *  so previously handed-out references remain untouched). */
  private void reset() {
    backlog = new ArrayList<>();
    visited = new HashSet<>();
  }
| |
| |
  //
  // USAGE
  //
  /** Prints the one-line usage summary plus a pointer to -h for full help. */
  private static void usageShort() {
    System.out.println(USAGE_STRING_SHORT+"\n"+
        " Please invoke with -h option for extended usage help.");
  }
| |
  /** Prints the extended usage help: all supported system properties, their defaults, and examples. */
  private static void usage() {
    System.out.println
    (USAGE_STRING_SHORT+"\n\n" +
     "Supported System Properties and their defaults:\n"+
     "  -Dc=<core/collection>\n"+
     "  -Durl=<base Solr update URL> (overrides -Dc option if specified)\n"+
     "  -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
     "  -Dtype=<content-type> (default=" + DEFAULT_CONTENT_TYPE + ")\n"+
     "  -Dhost=<host> (default: " + DEFAULT_POST_HOST+ ")\n"+
     "  -Dport=<port> (default: " + DEFAULT_POST_PORT+ ")\n"+
     "  -Dbasicauth=<user:pass> (sets Basic Authentication credentials)\n"+
     "  -Dauto=yes|no (default=" + DEFAULT_AUTO + ")\n"+
     "  -Drecursive=yes|no|<depth> (default=" + DEFAULT_RECURSIVE + ")\n"+
     "  -Ddelay=<seconds> (default=0 for files, 10 for web)\n"+
     "  -Dfiletypes=<type>[,<type>,...] (default=" + DEFAULT_FILE_TYPES + ")\n"+
     "  -Dparams=\"<key>=<value>[&<key>=<value>...]\" (values must be URL-encoded)\n"+
     "  -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n"+
     "  -Doptimize=yes|no (default=" + DEFAULT_OPTIMIZE + ")\n"+
     "  -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"+
     "This is a simple command line tool for POSTing raw data to a Solr port.\n"+
     "NOTE: Specifying the url/core/collection name is mandatory.\n" +
     "Data can be read from files specified as commandline args,\n"+
     "URLs specified as args, as raw commandline arg strings or via STDIN.\n"+
     "Examples:\n"+
     "  java -Dc=gettingstarted -jar post.jar *.xml\n"+
     "  java -Ddata=args -Dc=gettingstarted -jar post.jar '<delete><id>42</id></delete>'\n"+
     "  java -Ddata=stdin -Dc=gettingstarted -jar post.jar < hd.xml\n"+
     "  java -Ddata=web -Dc=gettingstarted -jar post.jar http://example.com/\n"+
     "  java -Dtype=text/csv -Dc=gettingstarted -jar post.jar *.csv\n"+
     "  java -Dtype=application/json -Dc=gettingstarted -jar post.jar *.json\n"+
     "  java -Durl=http://localhost:8983/solr/techproducts/update/extract -Dparams=literal.id=pdf1 -jar post.jar solr-word.pdf\n"+
     "  java -Dauto -Dc=gettingstarted -jar post.jar *\n"+
     "  java -Dauto -Dc=gettingstarted -Drecursive -jar post.jar afolder\n"+
     "  java -Dauto -Dc=gettingstarted -Dfiletypes=ppt,html -jar post.jar afolder\n"+
     "The options controlled by System Properties include the Solr\n"+
     "URL to POST to, the Content-Type of the data, whether a commit\n"+
     "or optimize should be executed, and whether the response should\n"+
     "be written to STDOUT. If auto=yes the tool will try to set type\n"+
     "automatically from file name. When posting rich documents the\n"+
     "file name will be propagated as \"resource.name\" and also used\n"+
     "as \"literal.id\". You may override these or any other request parameter\n"+
     "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"+
     "The web mode is a simple crawler following links within domain, default delay=10s.");
  }
| |
| private boolean checkIsValidPath(File srcFile) { |
| try { |
| srcFile.toPath(); |
| return true; |
| } catch (InvalidPathException e) { |
| return false; |
| } |
| } |
| |
| /** Post all filenames provided in args |
| * @param args array of file names |
| * @param startIndexInArgs offset to start |
| * @param out output stream to post data to |
| * @param type default content-type to use when posting (may be overridden in auto mode) |
| * @return number of files posted |
| * */ |
| public int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) { |
| reset(); |
| int filesPosted = 0; |
| for (int j = startIndexInArgs; j < args.length; j++) { |
| File srcFile = new File(args[j]); |
| boolean isValidPath = checkIsValidPath(srcFile); |
| if(isValidPath && srcFile.isDirectory() && srcFile.canRead()) { |
| filesPosted += postDirectory(srcFile, out, type); |
| } else if (isValidPath && srcFile.isFile() && srcFile.canRead()) { |
| filesPosted += postFiles(new File[] {srcFile}, out, type); |
| } else { |
| filesPosted += handleGlob(srcFile, out, type); |
| } |
| } |
| return filesPosted; |
| } |
| |
  /** Post all filenames provided in args
   * @param files array of Files
   * @param startIndexInArgs offset to start
   * @param out output stream to post data to
   * @param type default content-type to use when posting (may be overridden in auto mode)
   * @return number of files posted
   * */
  // NOTE(review): unlike the String[] overload, startIndexInArgs is accepted but
  // never used — every entry of files is processed. Kept as-is for interface
  // compatibility; confirm before relying on the offset here.
  public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {
    reset();
    int filesPosted = 0;
    for (File srcFile : files) {
      boolean isValidPath = checkIsValidPath(srcFile);
      if(isValidPath && srcFile.isDirectory() && srcFile.canRead()) {
        filesPosted += postDirectory(srcFile, out, type);
      } else if (isValidPath && srcFile.isFile() && srcFile.canRead()) {
        // Concrete readable file: post via the File[] helper
        filesPosted += postFiles(new File[] {srcFile}, out, type);
      } else {
        // Fall back to interpreting the name as a glob pattern
        filesPosted += handleGlob(srcFile, out, type);
      }
    }
    return filesPosted;
  }
| |
| /** |
| * Posts a whole directory |
| * @return number of files posted total |
| */ |
| private int postDirectory(File dir, OutputStream out, String type) { |
| if(dir.isHidden() && !dir.getName().equals(".")) |
| return(0); |
| info("Indexing directory "+dir.getPath()+" ("+dir.listFiles(fileFilter).length+" files, depth="+currentDepth+")"); |
| int posted = 0; |
| posted += postFiles(dir.listFiles(fileFilter), out, type); |
| if(recursive > currentDepth) { |
| for(File d : dir.listFiles()) { |
| if(d.isDirectory()) { |
| currentDepth++; |
| posted += postDirectory(d, out, type); |
| currentDepth--; |
| } |
| } |
| } |
| return posted; |
| } |
| |
| /** |
| * Posts a list of file names |
| * @return number of files posted |
| */ |
| int postFiles(File[] files, OutputStream out, String type) { |
| int filesPosted = 0; |
| for(File srcFile : files) { |
| try { |
| if(!srcFile.isFile() || srcFile.isHidden()) |
| continue; |
| postFile(srcFile, out, type); |
| Thread.sleep(delay * 1000); |
| filesPosted++; |
| } catch (InterruptedException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| return filesPosted; |
| } |
| |
| /** |
| * This only handles file globs not full path globbing. |
| * @param globFile file holding glob path |
| * @param out outputStream to write results to |
| * @param type default content-type to use when posting (may be overridden in auto mode) |
| * @return number of files posted |
| */ |
| int handleGlob(File globFile, OutputStream out, String type) { |
| int filesPosted = 0; |
| File parent = globFile.getParentFile(); |
| if (parent == null) parent = new File("."); |
| String fileGlob = globFile.getName(); |
| GlobFileFilter ff = new GlobFileFilter(fileGlob, false); |
| File[] fileList = parent.listFiles(ff); |
| if (fileList == null || fileList.length == 0) { |
| warn("No files or directories matching " + globFile); |
| } else { |
| filesPosted = postFiles(fileList, out, type); |
| } |
| return filesPosted; |
| } |
| |
| /** |
| * This method takes as input a list of start URL strings for crawling, |
| * adds each one to a backlog and then starts crawling |
| * @param args the raw input args from main() |
| * @param startIndexInArgs offset for where to start |
| * @param out outputStream to write results to |
| * @return the number of web pages posted |
| */ |
| public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) { |
| reset(); |
| LinkedHashSet<URL> s = new LinkedHashSet<>(); |
| for (int j = startIndexInArgs; j < args.length; j++) { |
| try { |
| URL u = new URL(normalizeUrlEnding(args[j])); |
| s.add(u); |
| } catch(MalformedURLException e) { |
| warn("Skipping malformed input URL: "+args[j]); |
| } |
| } |
| // Add URLs to level 0 of the backlog and start recursive crawling |
| backlog.add(s); |
| return webCrawl(0, out); |
| } |
| |
| /** |
| * Normalizes a URL string by removing anchor part and trailing slash |
| * @return the normalized URL string |
| */ |
| protected static String normalizeUrlEnding(String link) { |
| if(link.indexOf("#") > -1) |
| link = link.substring(0,link.indexOf("#")); |
| if(link.endsWith("?")) |
| link = link.substring(0,link.length()-1); |
| if(link.endsWith("/")) |
| link = link.substring(0,link.length()-1); |
| return link; |
| } |
| |
  /**
   * A very simple crawler, pulling URLs to fetch from a backlog and then
   * recurses N levels deep if recursive&gt;0. Links are parsed from HTML
   * through first getting an XHTML version using SolrCell with extractOnly,
   * and followed if they are local. The crawler pauses for a default delay
   * of 10 seconds between each fetch, this can be configured in the delay
   * variable. This is only meant for test purposes, as it does not respect
   * robots or anything else fancy :)
   * @param level which level to crawl
   * @param out output stream to write to
   * @return number of pages crawled on this level and below
   */
  protected int webCrawl(int level, OutputStream out) {
    int numPages = 0;
    LinkedHashSet<URL> stack = backlog.get(level);
    int rawStackSize = stack.size();
    stack.removeAll(visited); // skip URLs already fetched at earlier levels
    int stackSize = stack.size();
    LinkedHashSet<URL> subStack = new LinkedHashSet<>(); // links found here; crawled at level+1
    info("Entering crawl at level "+level+" ("+rawStackSize+" links total, "+stackSize+" new)");
    for(URL u : stack) {
      try {
        visited.add(u);
        PageFetcherResult result = pageFetcher.readPageFromUrl(u);
        if(result.httpStatus == 200) {
          // If we were redirected, index under the final URL
          u = (result.redirectUrl != null) ? result.redirectUrl : u;
          // id and url literals are set to the page URL for the extract handler
          URL postUrl = new URL(appendParam(solrUrl.toString(),
              "literal.id="+URLEncoder.encode(u.toString(),"UTF-8") +
              "&literal.url="+URLEncoder.encode(u.toString(),"UTF-8")));
          ByteBuffer content = result.content;
          boolean success = postData(new ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()), null, out, result.contentType, postUrl);
          if (success) {
            info("POSTed web resource "+u+" (depth: "+level+")");
            // NOTE(review): delay * 1000 is int arithmetic; a huge -Ddelay would overflow — confirm acceptable
            Thread.sleep(delay * 1000);
            numPages++;
            // Pull links from HTML pages only
            if(recursive > level && result.contentType.equals("text/html")) {
              Set<URL> children = pageFetcher.getLinksFromWebPage(u, new ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()), result.contentType, postUrl);
              subStack.addAll(children);
            }
          } else {
            warn("An error occurred while posting "+u);
          }
        } else {
          warn("The URL "+u+" returned a HTTP result status of "+result.httpStatus);
        }
      } catch (IOException e) {
        warn("Caught exception when trying to open connection to "+u+": "+e.getMessage());
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    // Descend one level if any new links were discovered on this one
    if(!subStack.isEmpty()) {
      backlog.add(subStack);
      numPages += webCrawl(level+1, out);
    }
    return numPages;
  }
| public static class BAOS extends ByteArrayOutputStream { |
| public ByteBuffer getByteBuffer() { |
| return ByteBuffer.wrap(super.buf,0,super.count); |
| } |
| } |
| public static ByteBuffer inputStreamToByteArray(InputStream is) throws IOException { |
| return inputStreamToByteArray(is,Integer.MAX_VALUE); |
| |
| } |
| |
| /** |
| * Reads an input stream into a byte array |
| * |
| * @param is the input stream |
| * @return the byte array |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| public static ByteBuffer inputStreamToByteArray(InputStream is, long maxSize) throws IOException { |
| try (BAOS bos = new BAOS()) { |
| long sz = 0; |
| int next = is.read(); |
| while (next > -1) { |
| if (++sz > maxSize) throw new BufferOverflowException(); |
| bos.write(next); |
| next = is.read(); |
| } |
| bos.flush(); |
| return bos.getByteBuffer(); |
| } |
| } |
| |
  /**
   * Computes the full URL based on a base url and a possibly relative link found
   * in the href param of an HTML anchor.
   * @param baseUrl the base url from where the link was found
   * @param link the absolute or relative link
   * @return the string version of the full URL, or null if the link should be skipped
   */
  protected String computeFullUrl(URL baseUrl, String link) {
    if(link == null || link.length() == 0) {
      return null;
    }
    if(!link.startsWith("http")) {
      if(link.startsWith("/")) {
        // Host-relative link: resolve against protocol + authority only
        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
      } else {
        if(link.contains(":")) {
          return null; // Skip non-relative URLs (mailto:, javascript:, other schemes)
        }
        String path = baseUrl.getPath();
        if(!path.endsWith("/")) {
          // Drop the last path segment when it looks like a file, not a directory
          int sep = path.lastIndexOf("/");
          String file = path.substring(sep+1);
          if(file.contains(".") || file.contains("?"))
            path = path.substring(0,sep);
        }
        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
      }
    }
    link = normalizeUrlEnding(link);
    String l = link.toLowerCase(Locale.ROOT);
    // Simple brute force skip images
    if(l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
      return null; // Skip images
    }
    return link;
  }
| |
| /** |
| * Uses the mime-type map to reverse lookup whether the file ending for our type |
| * is supported by the fileTypes option |
| * @param type what content-type to lookup |
| * @return true if this is a supported content type |
| */ |
| protected boolean typeSupported(String type) { |
| for(Map.Entry<String, String> entry : mimeMap.entrySet()) { |
| if(entry.getValue().equals(type)) { |
| if(fileTypes.contains(entry.getKey())) |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Tests if a string is either "true", "on", "yes" or "1" |
| * @param property the string to test |
| * @return true if "on" |
| */ |
| protected static boolean isOn(String property) { |
| return("true,on,yes,1".indexOf(property) > -1); |
| } |
| |
  /** Logs a warning to stderr, prefixed so it stands out from normal output. */
  static void warn(String msg) {
    System.err.println("SimplePostTool: WARNING: " + msg);
  }
| |
  /** Logs an informational message to stdout. */
  static void info(String msg) {
    System.out.println(msg);
  }
| |
  /** Logs a fatal error to stderr and terminates the JVM with exit code 2. */
  static void fatal(String msg) {
    System.err.println("SimplePostTool: FATAL: " + msg);
    System.exit(2);
  }
| |
| /** |
| * Does a simple commit operation |
| */ |
| public void commit() { |
| info("COMMITting Solr index changes to " + solrUrl + "..."); |
| doGet(appendParam(solrUrl.toString(), "commit=true")); |
| } |
| |
| /** |
| * Does a simple optimize operation |
| */ |
| public void optimize() { |
| info("Performing an OPTIMIZE to " + solrUrl + "..."); |
| doGet(appendParam(solrUrl.toString(), "optimize=true")); |
| } |
| |
| /** |
| * Appends a URL query parameter to a URL |
| * @param url the original URL |
| * @param param the parameter(s) to append, separated by "&" |
| * @return the string version of the resulting URL |
| */ |
| public static String appendParam(String url, String param) { |
| String[] pa = param.split("&"); |
| for(String p : pa) { |
| if(p.trim().length() == 0) continue; |
| String[] kv = p.split("="); |
| if(kv.length == 2) { |
| url = url + (url.indexOf('?')>0 ? "&" : "?") + kv[0] +"="+ kv[1]; |
| } else { |
| warn("Skipping param "+p+" which is not on form key=value"); |
| } |
| } |
| return url; |
| } |
| |
  /**
   * Opens the file and posts its contents to the solrUrl,
   * writes to response to output.
   * In auto mode the content-type is guessed from the file ending and the target
   * handler is chosen accordingly: /update/json/docs for JSON (unless format=solr),
   * the base /update handler for XML/CSV/JSON, and SolrCell's /update/extract
   * (with resource.name and literal.id set to the file's absolute path) for
   * everything else.
   */
  public void postFile(File file, OutputStream output, String type) {
    InputStream is = null;
    try {
      URL url = solrUrl;
      String suffix = "";
      if(auto) {
        if(type == null) {
          type = guessType(file);
        }
        // TODO: Add a flag that disables /update and sends all to /update/extract, to avoid CSV, JSON, and XML files
        // TODO: from being interpreted as Solr documents internally
        if (type.equals("application/json") && !"solr".equals(format)) {
          suffix = "/json/docs";
          String urlStr = appendUrlPath(solrUrl, suffix).toString();
          url = new URL(urlStr);
        } else if (type.equals("application/xml") || type.equals("text/csv") || type.equals("application/json")) {
          // Default handler
        } else {
          // SolrCell
          suffix = "/extract";
          String urlStr = appendUrlPath(solrUrl, suffix).toString();
          // Only add resource.name/literal.id when not already set via -Dparams
          if(urlStr.indexOf("resource.name")==-1)
            urlStr = appendParam(urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
          if(urlStr.indexOf("literal.id")==-1)
            urlStr = appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
          url = new URL(urlStr);
        }
      } else {
        if(type == null) type = DEFAULT_CONTENT_TYPE;
      }
      info("POSTing file " + file.getName() + (auto?" ("+type+")":"") + " to [base]" + suffix + (mockMode ? " MOCK!":""));
      is = new FileInputStream(file);
      postData(is, file.length(), output, type, url);
    } catch (IOException e) {
      e.printStackTrace();
      warn("Can't open/read file: " + file);
    } finally {
      try {
        if(is!=null) is.close();
      } catch (IOException e) {
        // NOTE(review): a failed close() terminates the whole JVM via fatal() — confirm intended
        fatal("IOException while closing file: "+ e);
      }
    }
  }
| |
| /** |
| * Appends to the path of the URL |
| * @param url the URL |
| * @param append the path to append |
| * @return the final URL version |
| */ |
| protected static URL appendUrlPath(URL url, String append) throws MalformedURLException { |
| return new URL(url.getProtocol() + "://" + url.getAuthority() + url.getPath() + append + (url.getQuery() != null ? "?"+url.getQuery() : "")); |
| } |
| |
| /** |
| * Guesses the type of a file, based on file name suffix |
| * Returns "application/octet-stream" if no corresponding mimeMap type. |
| * @param file the file |
| * @return the content-type guessed |
| */ |
| protected static String guessType(File file) { |
| String name = file.getName(); |
| String suffix = name.substring(name.lastIndexOf(".")+1); |
| String type = mimeMap.get(suffix.toLowerCase(Locale.ROOT)); |
| return (type != null) ? type : "application/octet-stream"; |
| } |
| |
| /** |
| * Performs a simple get on the given URL |
| */ |
| public void doGet(String url) { |
| try { |
| doGet(new URL(url)); |
| } catch (MalformedURLException e) { |
| warn("The specified URL "+url+" is not a valid URL. Please check"); |
| } |
| } |
| |
| /** |
| * Performs a simple get on the given URL |
| */ |
| public void doGet(URL url) { |
| try { |
| if(mockMode) return; |
| HttpURLConnection urlc = (HttpURLConnection) url.openConnection(); |
| basicAuth(urlc); |
| urlc.connect(); |
| checkResponseCode(urlc); |
| } catch (IOException e) { |
| warn("An error occurred getting data from "+url+". Please check that Solr is running."); |
| } catch (Exception e) { |
| warn("An error occurred getting data from "+url+". Message: " + e.getMessage()); |
| } |
| } |
| |
| /** |
| * Reads data from the data stream and posts it to solr, |
| * writes to the response to output |
| * @return true if success |
| */ |
| public boolean postData(InputStream data, Long length, OutputStream output, String type, URL url) { |
| if(mockMode) return true; |
| boolean success = true; |
| if(type == null) |
| type = DEFAULT_CONTENT_TYPE; |
| HttpURLConnection urlc = null; |
| try { |
| try { |
| urlc = (HttpURLConnection) url.openConnection(); |
| try { |
| urlc.setRequestMethod("POST"); |
| } catch (ProtocolException e) { |
| fatal("Shouldn't happen: HttpURLConnection doesn't support POST??"+e); |
| } |
| urlc.setDoOutput(true); |
| urlc.setDoInput(true); |
| urlc.setUseCaches(false); |
| urlc.setAllowUserInteraction(false); |
| urlc.setRequestProperty("Content-type", type); |
| basicAuth(urlc); |
| if (null != length) { |
| urlc.setFixedLengthStreamingMode(length); |
| } else { |
| urlc.setChunkedStreamingMode(-1);//use JDK default chunkLen, 4k in Java 8. |
| } |
| urlc.connect(); |
| } catch (IOException e) { |
| fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e); |
| success = false; |
| } catch (Exception e) { |
| fatal("POST failed with error " + e.getMessage()); |
| } |
| |
| try (final OutputStream out = urlc.getOutputStream()) { |
| pipe(data, out); |
| } catch (IOException e) { |
| fatal("IOException while posting data: " + e); |
| } |
| |
| try { |
| success &= checkResponseCode(urlc); |
| try (final InputStream in = urlc.getInputStream()) { |
| pipe(in, output); |
| } |
| } catch (IOException e) { |
| warn("IOException while reading response: " + e); |
| success = false; |
| } catch (GeneralSecurityException e) { |
| fatal("Looks like Solr is secured and would not let us in. Try with another user in '-u' parameter"); |
| } |
| } finally { |
| if (urlc!=null) urlc.disconnect(); |
| } |
| return success; |
| } |
| |
| private static void basicAuth(HttpURLConnection urlc) throws Exception { |
| if (urlc.getURL().getUserInfo() != null) { |
| String encoding = Base64.getEncoder().encodeToString(urlc.getURL().getUserInfo().getBytes(US_ASCII)); |
| urlc.setRequestProperty("Authorization", "Basic " + encoding); |
| } else if (System.getProperty(BASIC_AUTH) != null) { |
| String basicauth = System.getProperty(BASIC_AUTH).trim(); |
| if (!basicauth.contains(":")) { |
| throw new Exception("System property '"+BASIC_AUTH+"' must be of format user:pass"); |
| } |
| urlc.setRequestProperty("Authorization", "Basic " + Base64.getEncoder().encodeToString(basicauth.getBytes(UTF_8))); |
| } |
| } |
| |
| private static boolean checkResponseCode(HttpURLConnection urlc) throws IOException, GeneralSecurityException { |
| if (urlc.getResponseCode() >= 400) { |
| warn("Solr returned an error #" + urlc.getResponseCode() + |
| " (" + urlc.getResponseMessage() + ") for url: " + urlc.getURL()); |
| Charset charset = StandardCharsets.ISO_8859_1; |
| final String contentType = urlc.getContentType(); |
| // code cloned from ContentStreamBase, but post.jar should be standalone! |
| if (contentType != null) { |
| int idx = contentType.toLowerCase(Locale.ROOT).indexOf("charset="); |
| if (idx > 0) { |
| charset = Charset.forName(contentType.substring(idx + "charset=".length()).trim()); |
| } |
| } |
| // Print the response returned by Solr |
| try (InputStream errStream = urlc.getErrorStream()) { |
| if (errStream != null) { |
| BufferedReader br = new BufferedReader(new InputStreamReader(errStream, charset)); |
| final StringBuilder response = new StringBuilder("Response: "); |
| int ch; |
| while ((ch = br.read()) != -1) { |
| response.append((char) ch); |
| } |
| warn(response.toString().trim()); |
| } |
| } |
| if (urlc.getResponseCode() == 401) { |
| throw new GeneralSecurityException("Solr requires authentication (response 401). Please try again with '-u' option"); |
| } |
| if (urlc.getResponseCode() == 403) { |
| throw new GeneralSecurityException("You are not authorized to perform this action against Solr. (response 403)"); |
| } |
| return false; |
| } |
| return true; |
| } |
| |
| /** |
| * Converts a string to an input stream |
| * @param s the string |
| * @return the input stream |
| */ |
| public static InputStream stringToStream(String s) { |
| return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); |
| } |
| |
| /** |
| * Pipes everything from the source to the dest. If dest is null, |
| * then everything is read from source and thrown away. |
| */ |
| private static void pipe(InputStream source, OutputStream dest) throws IOException { |
| byte[] buf = new byte[1024]; |
| int read = 0; |
| while ( (read = source.read(buf) ) >= 0) { |
| if (null != dest) dest.write(buf, 0, read); |
| } |
| if (null != dest) dest.flush(); |
| } |
| |
| public FileFilter getFileFilterFromFileTypes(String fileTypes) { |
| String glob; |
| if(fileTypes.equals("*")) |
| glob = ".*"; |
| else |
| glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$"; |
| return new GlobFileFilter(glob, true); |
| } |
| |
| // |
| // Utility methods for XPath handing |
| // |
| |
| /** |
| * Gets all nodes matching an XPath |
| */ |
| public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException { |
| XPathFactory factory = XPathFactory.newInstance(); |
| XPath xp = factory.newXPath(); |
| XPathExpression expr = xp.compile(xpath); |
| return (NodeList) expr.evaluate(n, XPathConstants.NODESET); |
| } |
| |
| /** |
| * Gets the string content of the matching an XPath |
| * @param n the node (or doc) |
| * @param xpath the xpath string |
| * @param concatAll if true, text from all matching nodes will be concatenated, else only the first returned |
| */ |
| public static String getXP(Node n, String xpath, boolean concatAll) |
| throws XPathExpressionException { |
| NodeList nodes = getNodesFromXP(n, xpath); |
| StringBuilder sb = new StringBuilder(); |
| if (nodes.getLength() > 0) { |
| for(int i = 0; i < nodes.getLength() ; i++) { |
| sb.append(nodes.item(i).getNodeValue()).append(' '); |
| if(!concatAll) break; |
| } |
| return sb.toString().trim(); |
| } else |
| return ""; |
| } |
| |
| /** |
| * Takes a string as input and returns a DOM |
| */ |
| public static Document makeDom(byte[] in) throws SAXException, IOException, |
| ParserConfigurationException { |
| InputStream is = new ByteArrayInputStream(in); |
| Document dom = DocumentBuilderFactory.newInstance() |
| .newDocumentBuilder().parse(is); |
| return dom; |
| } |
| |
| /** |
| * Inner class to filter files based on glob wildcards |
| */ |
| static class GlobFileFilter implements FileFilter |
| { |
| private String _pattern; |
| private Pattern p; |
| |
| public GlobFileFilter(String pattern, boolean isRegex) |
| { |
| _pattern = pattern; |
| if(!isRegex) { |
| _pattern = _pattern |
| .replace("^", "\\^") |
| .replace("$", "\\$") |
| .replace(".", "\\.") |
| .replace("(", "\\(") |
| .replace(")", "\\)") |
| .replace("+", "\\+") |
| .replace("*", ".*") |
| .replace("?", "."); |
| _pattern = "^" + _pattern + "$"; |
| } |
| |
| try { |
| p = Pattern.compile(_pattern,Pattern.CASE_INSENSITIVE); |
| } catch(PatternSyntaxException e) { |
| fatal("Invalid type list "+pattern+". "+e.getDescription()); |
| } |
| } |
| |
| @Override |
| public boolean accept(File file) |
| { |
| return p.matcher(file.getName()).find(); |
| } |
| } |
| |
| // |
| // Simple crawler class which can fetch a page and check for robots.txt |
| // |
| class PageFetcher { |
| Map<String, List<String>> robotsCache; |
| static final String DISALLOW = "Disallow:"; |
| |
| public PageFetcher() { |
| robotsCache = new HashMap<>(); |
| } |
| |
| public PageFetcherResult readPageFromUrl(URL u) { |
| PageFetcherResult res = new PageFetcherResult(); |
| try { |
| if (isDisallowedByRobots(u)) { |
| warn("The URL "+u+" is disallowed by robots.txt and will not be crawled."); |
| res.httpStatus = 403; |
| visited.add(u); |
| return res; |
| } |
| res.httpStatus = 404; |
| HttpURLConnection conn = (HttpURLConnection) u.openConnection(); |
| conn.setRequestProperty("User-Agent", "SimplePostTool-crawler/"+VERSION_OF_THIS_TOOL+" (http://lucene.apache.org/solr/)"); |
| conn.setRequestProperty("Accept-Encoding", "gzip, deflate"); |
| conn.connect(); |
| res.httpStatus = conn.getResponseCode(); |
| if(!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) { |
| info("The URL "+u+" caused a redirect to "+conn.getURL()); |
| u = conn.getURL(); |
| res.redirectUrl = u; |
| visited.add(u); |
| } |
| if(res.httpStatus == 200) { |
| // Raw content type of form "text/html; encoding=utf-8" |
| String rawContentType = conn.getContentType(); |
| String type = rawContentType.split(";")[0]; |
| if(typeSupported(type) || "*".equals(fileTypes)) { |
| String encoding = conn.getContentEncoding(); |
| InputStream is; |
| if (encoding != null && encoding.equalsIgnoreCase("gzip")) { |
| is = new GZIPInputStream(conn.getInputStream()); |
| } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) { |
| is = new InflaterInputStream(conn.getInputStream(), new Inflater(true)); |
| } else { |
| is = conn.getInputStream(); |
| } |
| |
| // Read into memory, so that we later can pull links from the page without re-fetching |
| res.content = inputStreamToByteArray(is); |
| is.close(); |
| } else { |
| warn("Skipping URL with unsupported type "+type); |
| res.httpStatus = 415; |
| } |
| } |
| } catch(IOException e) { |
| warn("IOException when reading page from url "+u+": "+e.getMessage()); |
| } |
| return res; |
| } |
| |
| public boolean isDisallowedByRobots(URL url) { |
| String host = url.getHost(); |
| String strRobot = url.getProtocol() + "://" + host + "/robots.txt"; |
| List<String> disallows = robotsCache.get(host); |
| if(disallows == null) { |
| disallows = new ArrayList<>(); |
| URL urlRobot; |
| try { |
| urlRobot = new URL(strRobot); |
| disallows = parseRobotsTxt(urlRobot.openStream()); |
| } catch (MalformedURLException e) { |
| return true; // We cannot trust this robots URL, should not happen |
| } catch (IOException e) { |
| // There is no robots.txt, will cache an empty disallow list |
| } |
| } |
| |
| robotsCache.put(host, disallows); |
| |
| String strURL = url.getFile(); |
| for (String path : disallows) { |
| if (path.equals("/") || strURL.indexOf(path) == 0) |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Very simple robots.txt parser which obeys all Disallow lines regardless |
| * of user agent or whether there are valid Allow: lines. |
| * @param is Input stream of the robots.txt file |
| * @return a list of disallow paths |
| * @throws IOException if problems reading the stream |
| */ |
| protected List<String> parseRobotsTxt(InputStream is) throws IOException { |
| List<String> disallows = new ArrayList<>(); |
| BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); |
| String l; |
| while((l = r.readLine()) != null) { |
| String[] arr = l.split("#"); |
| if(arr.length == 0) continue; |
| l = arr[0].trim(); |
| if(l.startsWith(DISALLOW)) { |
| l = l.substring(DISALLOW.length()).trim(); |
| if(l.length() == 0) continue; |
| disallows.add(l); |
| } |
| } |
| is.close(); |
| return disallows; |
| } |
| |
| /** |
| * Finds links on a web page, using /extract?extractOnly=true |
| * @param u the URL of the web page |
| * @param is the input stream of the page |
| * @param type the content-type |
| * @param postUrl the URL (typically /solr/extract) in order to pull out links |
| * @return a set of URLs parsed from the page |
| */ |
| protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) { |
| Set<URL> l = new HashSet<>(); |
| URL url = null; |
| try { |
| ByteArrayOutputStream os = new ByteArrayOutputStream(); |
| URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true")); |
| boolean success = postData(is, null, os, type, extractUrl); |
| if(success) { |
| Document d = makeDom(os.toByteArray()); |
| String innerXml = getXP(d, "/response/str/text()[1]", false); |
| d = makeDom(innerXml.getBytes(StandardCharsets.UTF_8)); |
| NodeList links = getNodesFromXP(d, "/html/body//a/@href"); |
| for(int i = 0; i < links.getLength(); i++) { |
| String link = links.item(i).getTextContent(); |
| link = computeFullUrl(u, link); |
| if(link == null) |
| continue; |
| url = new URL(link); |
| if(url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority())) |
| continue; |
| l.add(url); |
| } |
| } |
| } catch (MalformedURLException e) { |
| warn("Malformed URL "+url); |
| } catch (IOException e) { |
| warn("IOException opening URL "+url+": "+e.getMessage()); |
| } catch (Exception e) { |
| throw new RuntimeException(e); |
| } |
| return l; |
| } |
| } |
| |
| /** |
| * Utility class to hold the result form a page fetch |
| */ |
| public static class PageFetcherResult { |
| int httpStatus = 200; |
| String contentType = "text/html"; |
| URL redirectUrl = null; |
| ByteBuffer content; |
| } |
| } |