webloader/service/src/main/java/org/apache/sling/samples/webloader/internal/WebloaderJob.java - sling-samples - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.sling.samples.webloader.internal;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.net.URL;
 import java.net.URLConnection;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;

 import javax.jcr.Item;
 import javax.jcr.Node;
 import javax.jcr.RepositoryException;
 import javax.jcr.Session;
 import javax.swing.text.BadLocationException;
 import javax.swing.text.html.HTML;
 import javax.swing.text.html.HTMLDocument;
 import javax.swing.text.html.HTMLEditorKit;

 import org.apache.sling.commons.mime.MimeTypeService;
 import org.apache.sling.samples.webloader.WebloaderException;
 import org.apache.sling.samples.webloader.WebloaderJobStatus;
 import org.apache.sling.jcr.api.SlingRepository;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /** A Webloader job, manages retrieval of documents from the web and storage
  *  in the Sling repository. This code is based on the "populate.jsp" example
  *  of the jackrabbit-webapp module.
  */
 class WebloaderJob extends Thread implements WebloaderJobStatus {
     private Throwable error;
     private int numDocsLoaded;
     private final int maxDocsToRetrieve;
     private final int maxDocSize;
     private String statusInfo = "initialized";
     private String statusDetails = "";
     private boolean running = true;
     private final String jobId;
     private final String webQuery;
     private String storagePath;
     private final SlingRepository repository;
     private final MimeTypeService mimeTypeService;
     private Session session;
     private Node storageRoot;
     private static int idCounter;
     private final String [] filetypes;

     public static final String [] DEFAULT_FILETYPES = { "pdf", "rtf", "ppt", "doc", "xls" };
     public static final int URL_RETRIEVE_TIMEOUT_SECONDS = 10;

     private static final Logger log = LoggerFactory.getLogger(WebloaderJob.class);

     @SuppressWarnings("serial")
     static class DocTooBigException extends IOException {
         DocTooBigException(URL url, int size) {
             super("Document at URL " + url + " too big (" + size + " bytes), will be ignored");
         }
     }

     WebloaderJob(SlingRepository repository, MimeTypeService mimeTypeService,
             String webQuery, String storagePath, String fileExtensions, int maxDocsToRetrieve, int maxDocSize) {
         synchronized (WebloaderJob.class) {
             jobId = String.valueOf(idCounter++);
         }

         this.repository = repository;
         this.mimeTypeService = mimeTypeService;
         this.webQuery = webQuery;
         this.storagePath = storagePath;
         this.maxDocsToRetrieve = maxDocsToRetrieve;
         this.maxDocSize = maxDocSize;

         final String [] ft = fileExtensions == null ? null : fileExtensions.split(",");
         if(ft!=null && ft.length > 0) {
             filetypes = new String[ft.length];
             for(int i=0; i < ft.length; i++) {
                 filetypes[i] = ft[i].trim().toLowerCase();
             }
         } else {
             filetypes = DEFAULT_FILETYPES;
         }

         if(mimeTypeService == null) {
             throw new WebloaderException("Missing MimeTypeService");
         }
         if(repository == null) {
             throw new WebloaderException("Missing Repository");
         }

         setDaemon(true);
         start();
     }

     @Override
     public String toString() {
         final StringBuffer sb = new StringBuffer();
         for(String str : filetypes) {
             if(sb.length() > 0) {
                 sb.append(",");
             }
             sb.append(str);
         }

         return getClass().getSimpleName() + ", webQuery=" + webQuery
             + ", storagePath=" + storagePath
             + ", fileTypes=" + sb.toString()
             + ", maxDocsToRetrieve=" + maxDocsToRetrieve
             + ", maxDocSize=" + maxDocSize
         ;
     }

     @Override
     public void run() {
         log.debug("Job thread starting: {}", this);

         // TODO should use a session provided by client, but can we use it
         // safely for our async job?
         session = null;

         if(storagePath.charAt(0) == '/') {
             storagePath = storagePath.substring(1);
         }
         final String absStoragePath = "/" + storagePath;

         try {
             session = repository.loginAdministrative(null);
             if(session.itemExists(absStoragePath)) {
                 final Item i = session.getItem(absStoragePath);
                 if(i.isNode()) {
                     storageRoot = (Node)i;
                 } else {
                     throw new WebloaderException("Item at " + storagePath + " is not a Node");
                 }
             } else {
                 // TODO deep-create hierarchy if needed
                 storageRoot = session.getRootNode().addNode(storagePath);
                 session.save();
             }

             int offset = 0;
             for(String type : filetypes) {
                 final URL[] urls = getDocumentUrlsFromGoogle(type, offset);
                 for(URL url : urls) {
                     try {
                         getAndStoreDocument(url);
                         session.save();
                         numDocsLoaded++;
                         if(numDocsLoaded >= maxDocsToRetrieve) {
                             break;
                         }
                     } catch(DocTooBigException dtb) {
                         log.info(dtb.getMessage());
                     } catch(Exception e) {
                         log.warn("Exception while retrieving url " + url, e);
                     } finally {
                         session.refresh(false);
                     }
                 }
                 offset += 10;

                 if(numDocsLoaded >= maxDocsToRetrieve) {
                     break;
                 }
             }

             statusInfo = "All done.";

         } catch(Exception e) {
             error = e;
             log.warn("Exception in WebloaderJob.run()", e);
             statusInfo = "Exception while running job: " + e;

         } finally {
             if(session != null) {
                 session.logout();
             }
             statusDetails = "";
             running = false;
         }

         if(numDocsLoaded >= maxDocsToRetrieve) {
             log.info("Stopped after retrieving maximum number of documents ({})", maxDocsToRetrieve);
         }

         log.info("Job thread ends: {}, {} documents loaded", this, numDocsLoaded);
     }

     private URL [] getDocumentUrlsFromGoogle(String currentFiletype, int start) throws IOException, BadLocationException {
         final List urls = new ArrayList();
         String query = webQuery + " filetype:" + currentFiletype;
         final URL google = new URL("http://www.google.com/search?q=" +
                 URLEncoder.encode(query, "UTF-8") + "&start=" + start);
         log.debug("Querying {}", google.toString());
         statusInfo = "Querying " + google.toString();
         statusDetails = "";
         URLConnection con = google.openConnection();
         con.setRequestProperty("User-Agent", "");
         InputStream in = con.getInputStream();
         try {
             HTMLEditorKit kit = new HTMLEditorKit();
             HTMLDocument doc = new HTMLDocument();
             doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
             kit.read(new InputStreamReader(in, "UTF-8"), doc, 0);
             HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
             while (it.isValid()) {
                 if(it.getAttributes() != null) {
                     String href = (String) it.getAttributes().getAttribute(HTML.Attribute.HREF);
                     if (href != null && href.endsWith("." + currentFiletype)) {
                         URL url = new URL(new URL("http", "www.google.com", "dummy"), href);
                         if (url.getHost().indexOf("google") == -1) {
                             log.debug("Got document URL from google: {}", url);
                             statusDetails = "Got URL " + url;
                             urls.add(url);
                         }
                     }
                 }
                 it.next();
             }
         } finally {
             in.close();
         }
         return (URL[]) urls.toArray(new URL[urls.size()]);

     }

     private void getAndStoreDocument(URL currentURL) throws RepositoryException, IOException {

         statusInfo = "Retrieving document " + currentURL;
         statusDetails = "";

         // build JCR path for storing document, based on its URL
         String path = currentURL.getPath();
         if (path.startsWith("/")) {
             path = path.substring(1);
         }
         final String host = currentURL.getHost();
         final List folderNames = new ArrayList();
         folderNames.addAll(Arrays.asList(host.split("\\.")));
         Collections.reverse(folderNames);
         folderNames.addAll(Arrays.asList(path.split("/", 0)));
         final String filename = URLDecoder.decode((String) folderNames.remove(folderNames.size() - 1), "UTF-8").replaceAll(":", "_");
         Node node = storageRoot;
         for (Iterator fn = folderNames.iterator(); fn.hasNext(); ) {
             String name = URLDecoder.decode((String) fn.next(), "UTF-8");
             name = name.replaceAll(":", "_");
             if (name.length() == 0) {
                 continue;
             }
             if (!node.hasNode(name)) {
                 node.addNode(name, "nt:folder");
             }
             node = node.getNode(name);
         }

         log.debug("Retrieving document {}, will be stored at {}", currentURL, node.getPath() + "/" + filename);

         if (!node.hasNode(filename)) {
             Node file = node.addNode(filename, "nt:file");
             final Node resource = file.addNode("jcr:content", "nt:resource");
             getAndStoreContent(currentURL, resource, filename);
         }

     }

     private void getAndStoreContent(URL currentURL, Node resource, String filename)
     throws RepositoryException, IOException {
         statusInfo = "Retrieving content from " + currentURL;
         statusDetails = "";

         final URLConnection con = currentURL.openConnection();
         con.setReadTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);
         InputStream in = con.getInputStream();
         try {
             // Read with a ProgressInputStream, so that our status is updated while
             // downloading
             int length = con.getContentLength();
             if (length != -1) {
                 if(length > maxDocSize * 1024) {
                     throw new DocTooBigException(currentURL, length);
                 }
                 in = new ProgressInputStream(in, length) {
                     int nextReport = 0;
                     protected void reportProgress(int bytesRead, int totalBytesToRead) {
                         if(bytesRead > nextReport) {
                             nextReport += 1024;
                             statusDetails = "Downloaded " + bytesRead + " bytes out of " + totalBytesToRead;
                         }
                     }
                 };
             }

             resource.setProperty("jcr:data", in);
             final String mimeType = mimeTypeService.getMimeType(filename);
             resource.setProperty("jcr:mimeType", mimeType);
             final Calendar lastModified = Calendar.getInstance();
             lastModified.setTimeInMillis(con.getLastModified());
             resource.setProperty("jcr:lastModified", lastModified);
         } finally {
             if(in != null) {
                 in.close();
             }
         }
     }

     String getJobId() {
         return jobId;
     }

     /** {@inheritDoc} */
     public Throwable getError() {
         return error;
     }

     /** {@inheritDoc} */
     public int getNumberOfDocumentsLoaded() {
         return numDocsLoaded;
     }

     /** {@inheritDoc} */
     public String getStatusInfo() {
         return statusInfo;
     }

     /** {@inheritDoc} */
     public String getStatusDetails() {
         return statusDetails;
     }

     /** {@inheritDoc} */
     public boolean isRunning() {
         return running;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.sling.samples.webloader.internal;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.net.URL;
	import java.net.URLConnection;
	import java.net.URLDecoder;
	import java.net.URLEncoder;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Calendar;
	import java.util.Collections;
	import java.util.Iterator;
	import java.util.List;

	import javax.jcr.Item;
	import javax.jcr.Node;
	import javax.jcr.RepositoryException;
	import javax.jcr.Session;
	import javax.swing.text.BadLocationException;
	import javax.swing.text.html.HTML;
	import javax.swing.text.html.HTMLDocument;
	import javax.swing.text.html.HTMLEditorKit;

	import org.apache.sling.commons.mime.MimeTypeService;
	import org.apache.sling.samples.webloader.WebloaderException;
	import org.apache.sling.samples.webloader.WebloaderJobStatus;
	import org.apache.sling.jcr.api.SlingRepository;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/** A Webloader job, manages retrieval of documents from the web and storage
	* in the Sling repository. This code is based on the "populate.jsp" example
	* of the jackrabbit-webapp module.
	*/
	class WebloaderJob extends Thread implements WebloaderJobStatus {
	private Throwable error;
	private int numDocsLoaded;
	private final int maxDocsToRetrieve;
	private final int maxDocSize;
	private String statusInfo = "initialized";
	private String statusDetails = "";
	private boolean running = true;
	private final String jobId;
	private final String webQuery;
	private String storagePath;
	private final SlingRepository repository;
	private final MimeTypeService mimeTypeService;
	private Session session;
	private Node storageRoot;
	private static int idCounter;
	private final String [] filetypes;

	public static final String [] DEFAULT_FILETYPES = { "pdf", "rtf", "ppt", "doc", "xls" };
	public static final int URL_RETRIEVE_TIMEOUT_SECONDS = 10;

	private static final Logger log = LoggerFactory.getLogger(WebloaderJob.class);

	@SuppressWarnings("serial")
	static class DocTooBigException extends IOException {
	DocTooBigException(URL url, int size) {
	super("Document at URL " + url + " too big (" + size + " bytes), will be ignored");
	}
	}

	WebloaderJob(SlingRepository repository, MimeTypeService mimeTypeService,
	String webQuery, String storagePath, String fileExtensions, int maxDocsToRetrieve, int maxDocSize) {
	synchronized (WebloaderJob.class) {
	jobId = String.valueOf(idCounter++);
	}

	this.repository = repository;
	this.mimeTypeService = mimeTypeService;
	this.webQuery = webQuery;
	this.storagePath = storagePath;
	this.maxDocsToRetrieve = maxDocsToRetrieve;
	this.maxDocSize = maxDocSize;

	final String [] ft = fileExtensions == null ? null : fileExtensions.split(",");
	if(ft!=null && ft.length > 0) {
	filetypes = new String[ft.length];
	for(int i=0; i < ft.length; i++) {
	filetypes[i] = ft[i].trim().toLowerCase();
	}
	} else {
	filetypes = DEFAULT_FILETYPES;
	}

	if(mimeTypeService == null) {
	throw new WebloaderException("Missing MimeTypeService");
	}
	if(repository == null) {
	throw new WebloaderException("Missing Repository");
	}

	setDaemon(true);
	start();
	}

	@Override
	public String toString() {
	final StringBuffer sb = new StringBuffer();
	for(String str : filetypes) {
	if(sb.length() > 0) {
	sb.append(",");
	}
	sb.append(str);
	}

	return getClass().getSimpleName() + ", webQuery=" + webQuery
	+ ", storagePath=" + storagePath
	+ ", fileTypes=" + sb.toString()
	+ ", maxDocsToRetrieve=" + maxDocsToRetrieve
	+ ", maxDocSize=" + maxDocSize
	;
	}

	@Override
	public void run() {
	log.debug("Job thread starting: {}", this);

	// TODO should use a session provided by client, but can we use it
	// safely for our async job?
	session = null;

	if(storagePath.charAt(0) == '/') {
	storagePath = storagePath.substring(1);
	}
	final String absStoragePath = "/" + storagePath;

	try {
	session = repository.loginAdministrative(null);
	if(session.itemExists(absStoragePath)) {
	final Item i = session.getItem(absStoragePath);
	if(i.isNode()) {
	storageRoot = (Node)i;
	} else {
	throw new WebloaderException("Item at " + storagePath + " is not a Node");
	}
	} else {
	// TODO deep-create hierarchy if needed
	storageRoot = session.getRootNode().addNode(storagePath);
	session.save();
	}

	int offset = 0;
	for(String type : filetypes) {
	final URL[] urls = getDocumentUrlsFromGoogle(type, offset);
	for(URL url : urls) {
	try {
	getAndStoreDocument(url);
	session.save();
	numDocsLoaded++;
	if(numDocsLoaded >= maxDocsToRetrieve) {
	break;
	}
	} catch(DocTooBigException dtb) {
	log.info(dtb.getMessage());
	} catch(Exception e) {
	log.warn("Exception while retrieving url " + url, e);
	} finally {
	session.refresh(false);
	}
	}
	offset += 10;

	if(numDocsLoaded >= maxDocsToRetrieve) {
	break;
	}
	}

	statusInfo = "All done.";

	} catch(Exception e) {
	error = e;
	log.warn("Exception in WebloaderJob.run()", e);
	statusInfo = "Exception while running job: " + e;

	} finally {
	if(session != null) {
	session.logout();
	}
	statusDetails = "";
	running = false;
	}

	if(numDocsLoaded >= maxDocsToRetrieve) {
	log.info("Stopped after retrieving maximum number of documents ({})", maxDocsToRetrieve);
	}

	log.info("Job thread ends: {}, {} documents loaded", this, numDocsLoaded);
	}

	private URL [] getDocumentUrlsFromGoogle(String currentFiletype, int start) throws IOException, BadLocationException {
	final List urls = new ArrayList();
	String query = webQuery + " filetype:" + currentFiletype;
	final URL google = new URL("http://www.google.com/search?q=" +
	URLEncoder.encode(query, "UTF-8") + "&start=" + start);
	log.debug("Querying {}", google.toString());
	statusInfo = "Querying " + google.toString();
	statusDetails = "";
	URLConnection con = google.openConnection();
	con.setRequestProperty("User-Agent", "");
	InputStream in = con.getInputStream();
	try {
	HTMLEditorKit kit = new HTMLEditorKit();
	HTMLDocument doc = new HTMLDocument();
	doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
	kit.read(new InputStreamReader(in, "UTF-8"), doc, 0);
	HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
	while (it.isValid()) {
	if(it.getAttributes() != null) {
	String href = (String) it.getAttributes().getAttribute(HTML.Attribute.HREF);
	if (href != null && href.endsWith("." + currentFiletype)) {
	URL url = new URL(new URL("http", "www.google.com", "dummy"), href);
	if (url.getHost().indexOf("google") == -1) {
	log.debug("Got document URL from google: {}", url);
	statusDetails = "Got URL " + url;
	urls.add(url);
	}
	}
	}
	it.next();
	}
	} finally {
	in.close();
	}
	return (URL[]) urls.toArray(new URL[urls.size()]);

	}

	private void getAndStoreDocument(URL currentURL) throws RepositoryException, IOException {

	statusInfo = "Retrieving document " + currentURL;
	statusDetails = "";

	// build JCR path for storing document, based on its URL
	String path = currentURL.getPath();
	if (path.startsWith("/")) {
	path = path.substring(1);
	}
	final String host = currentURL.getHost();
	final List folderNames = new ArrayList();
	folderNames.addAll(Arrays.asList(host.split("\\.")));
	Collections.reverse(folderNames);
	folderNames.addAll(Arrays.asList(path.split("/", 0)));
	final String filename = URLDecoder.decode((String) folderNames.remove(folderNames.size() - 1), "UTF-8").replaceAll(":", "_");
	Node node = storageRoot;
	for (Iterator fn = folderNames.iterator(); fn.hasNext(); ) {
	String name = URLDecoder.decode((String) fn.next(), "UTF-8");
	name = name.replaceAll(":", "_");
	if (name.length() == 0) {
	continue;
	}
	if (!node.hasNode(name)) {
	node.addNode(name, "nt:folder");
	}
	node = node.getNode(name);
	}

	log.debug("Retrieving document {}, will be stored at {}", currentURL, node.getPath() + "/" + filename);

	if (!node.hasNode(filename)) {
	Node file = node.addNode(filename, "nt:file");
	final Node resource = file.addNode("jcr:content", "nt:resource");
	getAndStoreContent(currentURL, resource, filename);
	}

	}

	private void getAndStoreContent(URL currentURL, Node resource, String filename)
	throws RepositoryException, IOException {
	statusInfo = "Retrieving content from " + currentURL;
	statusDetails = "";

	final URLConnection con = currentURL.openConnection();
	con.setReadTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);
	InputStream in = con.getInputStream();
	try {
	// Read with a ProgressInputStream, so that our status is updated while
	// downloading
	int length = con.getContentLength();
	if (length != -1) {
	if(length > maxDocSize * 1024) {
	throw new DocTooBigException(currentURL, length);
	}
	in = new ProgressInputStream(in, length) {
	int nextReport = 0;
	protected void reportProgress(int bytesRead, int totalBytesToRead) {
	if(bytesRead > nextReport) {
	nextReport += 1024;
	statusDetails = "Downloaded " + bytesRead + " bytes out of " + totalBytesToRead;
	}
	}
	};
	}

	resource.setProperty("jcr:data", in);
	final String mimeType = mimeTypeService.getMimeType(filename);
	resource.setProperty("jcr:mimeType", mimeType);
	final Calendar lastModified = Calendar.getInstance();
	lastModified.setTimeInMillis(con.getLastModified());
	resource.setProperty("jcr:lastModified", lastModified);
	} finally {
	if(in != null) {
	in.close();
	}
	}
	}

	String getJobId() {
	return jobId;
	}

	/** {@inheritDoc} */
	public Throwable getError() {
	return error;
	}

	/** {@inheritDoc} */
	public int getNumberOfDocumentsLoaded() {
	return numDocsLoaded;
	}

	/** {@inheritDoc} */
	public String getStatusInfo() {
	return statusInfo;
	}

	/** {@inheritDoc} */
	public String getStatusDetails() {
	return statusDetails;
	}

	/** {@inheritDoc} */
	public boolean isRunning() {
	return running;
	}

	}