blob: a605f3b0bf8f130dad3da4a1545051316d114421 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.zip;
import java.lang.invoke.MethodHandles;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
/**
 * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
 * Nutch parse plugin for zip files - Content Type : application/zip
 *
 * <p>
 * The raw bytes of the fetched content are handed to a
 * {@link ZipTextExtractor}; the text it returns becomes the parse text and the
 * outlink list passed to it becomes the outlinks of the parse data. The parse
 * title is always the empty string.
 * </p>
 */
public class ZipParser implements Parser {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /** Configuration injected through {@link #setConf(Configuration)}. */
  private Configuration conf;

  /** Creates a new instance of ZipParser */
  public ZipParser() {
  }

  /**
   * Parses zip content into a {@link ParseResult}.
   *
   * <p>
   * If the content metadata carries a {@code Content-Length} value, it is
   * compared against the number of bytes actually held by {@code content}; a
   * mismatch is reported as {@link ParseStatus#FAILED_TRUNCATED} because an
   * incomplete archive cannot be read. When no {@code Content-Length} is
   * present the check is skipped and extraction is attempted anyway.
   * </p>
   *
   * @param content
   *          the fetched content to parse
   * @return a successful parse result holding the extracted text and outlinks,
   *         or an empty parse result with a failed status when the content is
   *         truncated or any exception is raised while handling it (including
   *         a non-numeric {@code Content-Length} value)
   */
  @Override
  public ParseResult getParse(final Content content) {
    final List<Outlink> outLinksList = new ArrayList<>();
    String resultText;

    try {
      final byte[] contentInBytes = content.getContent();
      final String contentLen = content.getMetadata().get(
          Response.CONTENT_LENGTH);

      // Only validate the length when the header is actually present:
      // Integer.parseInt(null) throws NumberFormatException, which previously
      // turned every response without Content-Length into a failed parse.
      if (contentLen != null) {
        final int len = Integer.parseInt(contentLen);
        LOG.debug("ziplen: {}", len);
        if (contentInBytes.length != len) {
          return new ParseStatus(ParseStatus.FAILED,
              ParseStatus.FAILED_TRUNCATED, "Content truncated at "
                  + contentInBytes.length
                  + " bytes. Parser can't handle incomplete zip file.")
              .getEmptyParseResult(content.getUrl(), getConf());
        }
      }

      // extract text; the extractor receives the list that is later used as
      // the outlinks of this parse
      final ZipTextExtractor extractor = new ZipTextExtractor(getConf());
      resultText = extractor.extractText(new ByteArrayInputStream(
          contentInBytes), content.getUrl(), outLinksList);
    } catch (Exception e) {
      // Parser boundary: any failure is reported as a failed parse status
      // instead of propagating to the caller.
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as Zip document. " + e).getEmptyParseResult(
          content.getUrl(), getConf());
    }

    if (resultText == null) {
      resultText = "";
    }

    final Outlink[] outlinks = outLinksList.toArray(new Outlink[0]);
    // No title is extracted from a zip archive, so the title is left empty.
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
        outlinks, content.getMetadata());

    if (LOG.isTraceEnabled()) {
      LOG.trace("Zip file parsed sucessfully !!");
    }
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
        resultText, parseData));
  }

  /**
   * Stores the configuration used by this parser.
   *
   * @param conf
   *          the configuration to use
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Returns the configuration previously set with
   * {@link #setConf(Configuration)}.
   *
   * @return the current configuration, or {@code null} if none was set
   */
  @Override
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Command-line entry point: parses a local zip file and prints the number
   * of parses, the parse text and the parse data to standard output.
   *
   * @param args
   *          {@code args[0]} is the path of the zip file to parse
   * @throws IOException
   *           if the file cannot be resolved or read
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.out.println("ZipParser <zip_file>");
      System.exit(1);
    }

    File file = new File(args[0]);
    String url = "file:" + file.getCanonicalPath();

    // Read the complete file. InputStream.available() is only an estimate and
    // a single read() call may return fewer bytes than requested, so the
    // previous available()/read() pair could hand a partial archive to the
    // parser (and leaked the stream if read() threw).
    byte[] bytes = java.nio.file.Files.readAllBytes(file.toPath());

    Configuration conf = NutchConfiguration.create();
    ZipParser parser = new ZipParser();
    parser.setConf(conf);

    Metadata meta = new Metadata();
    meta.add(Response.CONTENT_LENGTH, "" + file.length());

    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/zip", meta, conf));
    Parse p = parseResult.get(url);

    System.out.println(parseResult.size());
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
  }
}