src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.zip;

 import java.lang.invoke.MethodHandles;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.List;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 import java.net.URL;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
 import org.apache.tika.Tika;

 /**
  * Extracts text and outlinks from the entries of a ZIP archive. Each
  * non-directory entry whose name contains a '.' is wrapped in a
  * {@link Content} object, with a MIME type detected by Tika from the entry
  * name, and handed to {@link ParseUtil}.
  *
  * @author Rohit Kulkarni and Ashish Vaidya
  */
 public class ZipTextExtractor {

   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /** Chunk size used when draining the bytes of one ZIP entry. */
   private static final int BUFFER_SIZE = 8192;

   private Configuration conf;

   /**
    * Creates a new instance of ZipTextExtractor.
    *
    * @param conf
    *          configuration passed on to {@link Content} and
    *          {@link ParseUtil}
    */
   public ZipTextExtractor(Configuration conf) {
     this.conf = conf;
   }

   /**
    * Parses every eligible entry of the ZIP data in {@code input} and
    * collects the extracted text and outlinks.
    *
    * @param input
    *          stream holding the ZIP archive; it is read but not closed by
    *          this method
    * @param url
    *          URL of the archive; each entry is addressed as
    *          {@code url + "/" + entryName}
    * @param outLinksList
    *          list that receives a copy of every outlink found in the parsed
    *          entries
    * @return for every successfully parsed entry, the entry name, a space,
    *         the parsed text and a trailing space, concatenated in archive
    *         order; the empty string if nothing was parsed
    * @throws IOException
    *           if the archive cannot be read or an entry URL is malformed
    */
   public String extractText(InputStream input, String url,
       List<Outlink> outLinksList) throws IOException {
     StringBuilder resultText = new StringBuilder();
     // Not closed on purpose: closing the wrapper would also close the
     // caller-owned input stream.
     ZipInputStream zin = new ZipInputStream(input);
     // Created on first use and then shared by all entries of this archive.
     Tika tika = null;
     ParseUtil parseUtil = null;
     ZipEntry entry;

     while ((entry = zin.getNextEntry()) != null) {
       if (entry.isDirectory()) {
         continue;
       }
       String fname = entry.getName();
       if (fname.lastIndexOf('.') == -1) {
         // Entries without an extension are never parsed, so their data is
         // not read; getNextEntry() skips whatever is left of the entry.
         continue;
       }

       // Read the real data instead of trusting entry.getSize(), which is -1
       // when the size is not known up front and may disagree with the data.
       byte[] b = readEntry(zin);

       String newurl = url + "/" + fname;
       // Round trip through URL so that a malformed entry URL is rejected.
       String base = new URL(newurl).toString();

       // Trying to resolve the Mime-Type
       if (tika == null) {
         tika = new Tika();
       }
       String contentType = tika.detect(fname);
       try {
         Metadata metadata = new Metadata();
         metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
         metadata.set(Response.CONTENT_TYPE, contentType);
         Content content = new Content(newurl, base, b, contentType,
             metadata, this.conf);
         if (parseUtil == null) {
           parseUtil = new ParseUtil(this.conf);
         }
         Parse parse = parseUtil.parse(content).get(content.getUrl());
         if (parse == null) {
           // The parse result holds nothing under this entry's URL.
           LOG.info("fetch okay, but no parse result for {}", fname);
           continue;
         }

         for (Outlink outlink : parse.getData().getOutlinks()) {
           outLinksList.add(new Outlink(outlink.getToUrl(),
               outlink.getAnchor()));
         }

         resultText.append(fname).append(' ').append(parse.getText())
             .append(' ');
       } catch (ParseException e) {
         // Best effort: one unparsable entry must not abort the archive.
         LOG.info("fetch okay, but can't parse {}, reason: {}", fname,
             e.getMessage());
       }
     }

     return resultText.toString();
   }

   /**
    * Reads the remaining bytes of the current ZIP entry.
    *
    * NOTE(review): the amount read is not capped here; confirm whether
    * callers already limit the archive size.
    */
   private static byte[] readEntry(ZipInputStream zin) throws IOException {
     java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
     byte[] buffer = new byte[BUFFER_SIZE];
     int n;
     while ((n = zin.read(buffer)) != -1) {
       out.write(buffer, 0, n);
     }
     return out.toByteArray();
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.zip;

	import java.lang.invoke.MethodHandles;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.List;
	import java.util.zip.ZipEntry;
	import java.util.zip.ZipInputStream;
	import java.net.URL;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import org.apache.hadoop.conf.Configuration;

	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.net.protocols.Response;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseData;
	import org.apache.nutch.parse.ParseUtil;
	import org.apache.nutch.parse.ParseException;
	import org.apache.nutch.parse.Outlink;
	import org.apache.nutch.protocol.Content;
	import org.apache.tika.Tika;

	/**
	 * Extracts text and outlinks from the entries of a ZIP archive. Each
	 * non-directory entry whose name contains a '.' is wrapped in a
	 * {@link Content} object, with a MIME type detected by Tika from the entry
	 * name, and handed to {@link ParseUtil}.
	 *
	 * @author Rohit Kulkarni and Ashish Vaidya
	 */
	public class ZipTextExtractor {

	  private static final Logger LOG = LoggerFactory
	      .getLogger(MethodHandles.lookup().lookupClass());

	  /** Chunk size used when draining the bytes of one ZIP entry. */
	  private static final int BUFFER_SIZE = 8192;

	  private Configuration conf;

	  /**
	   * Creates a new instance of ZipTextExtractor.
	   *
	   * @param conf
	   *          configuration passed on to {@link Content} and
	   *          {@link ParseUtil}
	   */
	  public ZipTextExtractor(Configuration conf) {
	    this.conf = conf;
	  }

	  /**
	   * Parses every eligible entry of the ZIP data in {@code input} and
	   * collects the extracted text and outlinks.
	   *
	   * @param input
	   *          stream holding the ZIP archive; it is read but not closed by
	   *          this method
	   * @param url
	   *          URL of the archive; each entry is addressed as
	   *          {@code url + "/" + entryName}
	   * @param outLinksList
	   *          list that receives a copy of every outlink found in the parsed
	   *          entries
	   * @return for every successfully parsed entry, the entry name, a space,
	   *         the parsed text and a trailing space, concatenated in archive
	   *         order; the empty string if nothing was parsed
	   * @throws IOException
	   *           if the archive cannot be read or an entry URL is malformed
	   */
	  public String extractText(InputStream input, String url,
	      List<Outlink> outLinksList) throws IOException {
	    StringBuilder resultText = new StringBuilder();
	    // Not closed on purpose: closing the wrapper would also close the
	    // caller-owned input stream.
	    ZipInputStream zin = new ZipInputStream(input);
	    // Created on first use and then shared by all entries of this archive.
	    Tika tika = null;
	    ParseUtil parseUtil = null;
	    ZipEntry entry;

	    while ((entry = zin.getNextEntry()) != null) {
	      if (entry.isDirectory()) {
	        continue;
	      }
	      String fname = entry.getName();
	      if (fname.lastIndexOf('.') == -1) {
	        // Entries without an extension are never parsed, so their data is
	        // not read; getNextEntry() skips whatever is left of the entry.
	        continue;
	      }

	      // Read the real data instead of trusting entry.getSize(), which is -1
	      // when the size is not known up front and may disagree with the data.
	      byte[] b = readEntry(zin);

	      String newurl = url + "/" + fname;
	      // Round trip through URL so that a malformed entry URL is rejected.
	      String base = new URL(newurl).toString();

	      // Trying to resolve the Mime-Type
	      if (tika == null) {
	        tika = new Tika();
	      }
	      String contentType = tika.detect(fname);
	      try {
	        Metadata metadata = new Metadata();
	        metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
	        metadata.set(Response.CONTENT_TYPE, contentType);
	        Content content = new Content(newurl, base, b, contentType,
	            metadata, this.conf);
	        if (parseUtil == null) {
	          parseUtil = new ParseUtil(this.conf);
	        }
	        Parse parse = parseUtil.parse(content).get(content.getUrl());
	        if (parse == null) {
	          // The parse result holds nothing under this entry's URL.
	          LOG.info("fetch okay, but no parse result for {}", fname);
	          continue;
	        }

	        for (Outlink outlink : parse.getData().getOutlinks()) {
	          outLinksList.add(new Outlink(outlink.getToUrl(),
	              outlink.getAnchor()));
	        }

	        resultText.append(fname).append(' ').append(parse.getText())
	            .append(' ');
	      } catch (ParseException e) {
	        // Best effort: one unparsable entry must not abort the archive.
	        LOG.info("fetch okay, but can't parse {}, reason: {}", fname,
	            e.getMessage());
	      }
	    }

	    return resultText.toString();
	  }

	  /**
	   * Reads the remaining bytes of the current ZIP entry.
	   *
	   * NOTE(review): the amount read is not capped here; confirm whether
	   * callers already limit the archive size.
	   */
	  private static byte[] readEntry(ZipInputStream zin) throws IOException {
	    java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
	    byte[] buffer = new byte[BUFFER_SIZE];
	    int n;
	    while ((n = zin.read(buffer)) != -1) {
	      out.write(buffer, 0, n);
	    }
	    return out.toByteArray();
	  }

	}