| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.sling.samples.webloader.internal; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.net.URL; |
| import java.net.URLConnection; |
| import java.net.URLDecoder; |
| import java.net.URLEncoder; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Calendar; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import javax.jcr.Item; |
| import javax.jcr.Node; |
| import javax.jcr.RepositoryException; |
| import javax.jcr.Session; |
| import javax.swing.text.BadLocationException; |
| import javax.swing.text.html.HTML; |
| import javax.swing.text.html.HTMLDocument; |
| import javax.swing.text.html.HTMLEditorKit; |
| |
| import org.apache.sling.commons.mime.MimeTypeService; |
| import org.apache.sling.samples.webloader.WebloaderException; |
| import org.apache.sling.samples.webloader.WebloaderJobStatus; |
| import org.apache.sling.jcr.api.SlingRepository; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** A Webloader job, manages retrieval of documents from the web and storage |
| * in the Sling repository. This code is based on the "populate.jsp" example |
| * of the jackrabbit-webapp module. |
| */ |
| class WebloaderJob extends Thread implements WebloaderJobStatus { |
| private Throwable error; |
| private int numDocsLoaded; |
| private final int maxDocsToRetrieve; |
| private final int maxDocSize; |
| private String statusInfo = "initialized"; |
| private String statusDetails = ""; |
| private boolean running = true; |
| private final String jobId; |
| private final String webQuery; |
| private String storagePath; |
| private final SlingRepository repository; |
| private final MimeTypeService mimeTypeService; |
| private Session session; |
| private Node storageRoot; |
| private static int idCounter; |
| private final String [] filetypes; |
| |
| public static final String [] DEFAULT_FILETYPES = { "pdf", "rtf", "ppt", "doc", "xls" }; |
| public static final int URL_RETRIEVE_TIMEOUT_SECONDS = 10; |
| |
| private static final Logger log = LoggerFactory.getLogger(WebloaderJob.class); |
| |
| @SuppressWarnings("serial") |
| static class DocTooBigException extends IOException { |
| DocTooBigException(URL url, int size) { |
| super("Document at URL " + url + " too big (" + size + " bytes), will be ignored"); |
| } |
| } |
| |
| WebloaderJob(SlingRepository repository, MimeTypeService mimeTypeService, |
| String webQuery, String storagePath, String fileExtensions, int maxDocsToRetrieve, int maxDocSize) { |
| synchronized (WebloaderJob.class) { |
| jobId = String.valueOf(idCounter++); |
| } |
| |
| this.repository = repository; |
| this.mimeTypeService = mimeTypeService; |
| this.webQuery = webQuery; |
| this.storagePath = storagePath; |
| this.maxDocsToRetrieve = maxDocsToRetrieve; |
| this.maxDocSize = maxDocSize; |
| |
| final String [] ft = fileExtensions == null ? null : fileExtensions.split(","); |
| if(ft!=null && ft.length > 0) { |
| filetypes = new String[ft.length]; |
| for(int i=0; i < ft.length; i++) { |
| filetypes[i] = ft[i].trim().toLowerCase(); |
| } |
| } else { |
| filetypes = DEFAULT_FILETYPES; |
| } |
| |
| if(mimeTypeService == null) { |
| throw new WebloaderException("Missing MimeTypeService"); |
| } |
| if(repository == null) { |
| throw new WebloaderException("Missing Repository"); |
| } |
| |
| setDaemon(true); |
| start(); |
| } |
| |
| @Override |
| public String toString() { |
| final StringBuffer sb = new StringBuffer(); |
| for(String str : filetypes) { |
| if(sb.length() > 0) { |
| sb.append(","); |
| } |
| sb.append(str); |
| } |
| |
| return getClass().getSimpleName() + ", webQuery=" + webQuery |
| + ", storagePath=" + storagePath |
| + ", fileTypes=" + sb.toString() |
| + ", maxDocsToRetrieve=" + maxDocsToRetrieve |
| + ", maxDocSize=" + maxDocSize |
| ; |
| } |
| |
| @Override |
| public void run() { |
| log.debug("Job thread starting: {}", this); |
| |
| // TODO should use a session provided by client, but can we use it |
| // safely for our async job? |
| session = null; |
| |
| if(storagePath.charAt(0) == '/') { |
| storagePath = storagePath.substring(1); |
| } |
| final String absStoragePath = "/" + storagePath; |
| |
| try { |
| session = repository.loginAdministrative(null); |
| if(session.itemExists(absStoragePath)) { |
| final Item i = session.getItem(absStoragePath); |
| if(i.isNode()) { |
| storageRoot = (Node)i; |
| } else { |
| throw new WebloaderException("Item at " + storagePath + " is not a Node"); |
| } |
| } else { |
| // TODO deep-create hierarchy if needed |
| storageRoot = session.getRootNode().addNode(storagePath); |
| session.save(); |
| } |
| |
| int offset = 0; |
| for(String type : filetypes) { |
| final URL[] urls = getDocumentUrlsFromGoogle(type, offset); |
| for(URL url : urls) { |
| try { |
| getAndStoreDocument(url); |
| session.save(); |
| numDocsLoaded++; |
| if(numDocsLoaded >= maxDocsToRetrieve) { |
| break; |
| } |
| } catch(DocTooBigException dtb) { |
| log.info(dtb.getMessage()); |
| } catch(Exception e) { |
| log.warn("Exception while retrieving url " + url, e); |
| } finally { |
| session.refresh(false); |
| } |
| } |
| offset += 10; |
| |
| if(numDocsLoaded >= maxDocsToRetrieve) { |
| break; |
| } |
| } |
| |
| statusInfo = "All done."; |
| |
| } catch(Exception e) { |
| error = e; |
| log.warn("Exception in WebloaderJob.run()", e); |
| statusInfo = "Exception while running job: " + e; |
| |
| } finally { |
| if(session != null) { |
| session.logout(); |
| } |
| statusDetails = ""; |
| running = false; |
| } |
| |
| if(numDocsLoaded >= maxDocsToRetrieve) { |
| log.info("Stopped after retrieving maximum number of documents ({})", maxDocsToRetrieve); |
| } |
| |
| log.info("Job thread ends: {}, {} documents loaded", this, numDocsLoaded); |
| } |
| |
| private URL [] getDocumentUrlsFromGoogle(String currentFiletype, int start) throws IOException, BadLocationException { |
| final List urls = new ArrayList(); |
| String query = webQuery + " filetype:" + currentFiletype; |
| final URL google = new URL("http://www.google.com/search?q=" + |
| URLEncoder.encode(query, "UTF-8") + "&start=" + start); |
| log.debug("Querying {}", google.toString()); |
| statusInfo = "Querying " + google.toString(); |
| statusDetails = ""; |
| URLConnection con = google.openConnection(); |
| con.setRequestProperty("User-Agent", ""); |
| InputStream in = con.getInputStream(); |
| try { |
| HTMLEditorKit kit = new HTMLEditorKit(); |
| HTMLDocument doc = new HTMLDocument(); |
| doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE); |
| kit.read(new InputStreamReader(in, "UTF-8"), doc, 0); |
| HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); |
| while (it.isValid()) { |
| if(it.getAttributes() != null) { |
| String href = (String) it.getAttributes().getAttribute(HTML.Attribute.HREF); |
| if (href != null && href.endsWith("." + currentFiletype)) { |
| URL url = new URL(new URL("http", "www.google.com", "dummy"), href); |
| if (url.getHost().indexOf("google") == -1) { |
| log.debug("Got document URL from google: {}", url); |
| statusDetails = "Got URL " + url; |
| urls.add(url); |
| } |
| } |
| } |
| it.next(); |
| } |
| } finally { |
| in.close(); |
| } |
| return (URL[]) urls.toArray(new URL[urls.size()]); |
| |
| } |
| |
| private void getAndStoreDocument(URL currentURL) throws RepositoryException, IOException { |
| |
| statusInfo = "Retrieving document " + currentURL; |
| statusDetails = ""; |
| |
| // build JCR path for storing document, based on its URL |
| String path = currentURL.getPath(); |
| if (path.startsWith("/")) { |
| path = path.substring(1); |
| } |
| final String host = currentURL.getHost(); |
| final List folderNames = new ArrayList(); |
| folderNames.addAll(Arrays.asList(host.split("\\."))); |
| Collections.reverse(folderNames); |
| folderNames.addAll(Arrays.asList(path.split("/", 0))); |
| final String filename = URLDecoder.decode((String) folderNames.remove(folderNames.size() - 1), "UTF-8").replaceAll(":", "_"); |
| Node node = storageRoot; |
| for (Iterator fn = folderNames.iterator(); fn.hasNext(); ) { |
| String name = URLDecoder.decode((String) fn.next(), "UTF-8"); |
| name = name.replaceAll(":", "_"); |
| if (name.length() == 0) { |
| continue; |
| } |
| if (!node.hasNode(name)) { |
| node.addNode(name, "nt:folder"); |
| } |
| node = node.getNode(name); |
| } |
| |
| log.debug("Retrieving document {}, will be stored at {}", currentURL, node.getPath() + "/" + filename); |
| |
| if (!node.hasNode(filename)) { |
| Node file = node.addNode(filename, "nt:file"); |
| final Node resource = file.addNode("jcr:content", "nt:resource"); |
| getAndStoreContent(currentURL, resource, filename); |
| } |
| |
| } |
| |
| private void getAndStoreContent(URL currentURL, Node resource, String filename) |
| throws RepositoryException, IOException { |
| statusInfo = "Retrieving content from " + currentURL; |
| statusDetails = ""; |
| |
| final URLConnection con = currentURL.openConnection(); |
| con.setReadTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000); |
| InputStream in = con.getInputStream(); |
| try { |
| // Read with a ProgressInputStream, so that our status is updated while |
| // downloading |
| int length = con.getContentLength(); |
| if (length != -1) { |
| if(length > maxDocSize * 1024) { |
| throw new DocTooBigException(currentURL, length); |
| } |
| in = new ProgressInputStream(in, length) { |
| int nextReport = 0; |
| protected void reportProgress(int bytesRead, int totalBytesToRead) { |
| if(bytesRead > nextReport) { |
| nextReport += 1024; |
| statusDetails = "Downloaded " + bytesRead + " bytes out of " + totalBytesToRead; |
| } |
| } |
| }; |
| } |
| |
| resource.setProperty("jcr:data", in); |
| final String mimeType = mimeTypeService.getMimeType(filename); |
| resource.setProperty("jcr:mimeType", mimeType); |
| final Calendar lastModified = Calendar.getInstance(); |
| lastModified.setTimeInMillis(con.getLastModified()); |
| resource.setProperty("jcr:lastModified", lastModified); |
| } finally { |
| if(in != null) { |
| in.close(); |
| } |
| } |
| } |
| |
| String getJobId() { |
| return jobId; |
| } |
| |
| /** {@inheritDoc} */ |
| public Throwable getError() { |
| return error; |
| } |
| |
| /** {@inheritDoc} */ |
| public int getNumberOfDocumentsLoaded() { |
| return numDocsLoaded; |
| } |
| |
| /** {@inheritDoc} */ |
| public String getStatusInfo() { |
| return statusInfo; |
| } |
| |
| /** {@inheritDoc} */ |
| public String getStatusDetails() { |
| return statusDetails; |
| } |
| |
| /** {@inheritDoc} */ |
| public boolean isRunning() { |
| return running; |
| } |
| |
| } |