enhancement-engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple;

 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
 import java.util.List;

 import javax.activation.DataHandler;
 import javax.mail.Address;
 import javax.mail.BodyPart;
 import javax.mail.Message.RecipientType;
 import javax.mail.MessagingException;
 import javax.mail.Multipart;
 import javax.mail.Part;
 import javax.mail.internet.AddressException;
 import javax.mail.internet.ContentType;
 import javax.mail.internet.InternetAddress;
 import javax.mail.internet.MimeMessage;
 import javax.mail.internet.MimeUtility;

 import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil;
 import org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException;
 import org.ontoware.rdf2go.exception.ModelException;
 import org.ontoware.rdf2go.model.Model;
 import org.ontoware.rdf2go.model.Syntax;
 import org.ontoware.rdf2go.model.impl.URIGenerator;
 import org.ontoware.rdf2go.model.node.URI;
 import org.ontoware.rdf2go.model.node.impl.URIImpl;
 import org.ontoware.rdf2go.vocabulary.RDF;
 import org.semanticdesktop.aperture.extractor.Extractor;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.semanticdesktop.aperture.extractor.mime.MailUtil;
 import org.semanticdesktop.aperture.rdf.RDFContainer;
 import org.semanticdesktop.aperture.rdf.RDFContainerFactory;
 import org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl;
 import org.semanticdesktop.aperture.vocabulary.NFO;
 import org.semanticdesktop.aperture.vocabulary.NIE;
 import org.semanticdesktop.aperture.vocabulary.NMO;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * An Extractor implementation for message/rfc822-style messages.
  *
  * <p>
  * Only typical body parts are processed during full-text extraction. Attachments are only listed but not
  * further handled. In case of mails in HTML format, the full HTML is included in the extracted data as value
  * of the <code>nmo:htmlMessageContent</code> property. The plain text (extract) is represented by the
  * <code>nmo:plainTextMessageContent</code> property and as value of the <code>nie:plainTextContent</code>
  * property for compliance with the representation from other extractors.
  *
  *
  */
 public class SimpleMailExtractor implements Extractor {
     private static final Logger logger = LoggerFactory.getLogger(SimpleMailExtractor.class);

     /**
      * Parses the stream as a MIME message and adds the extracted data to <code>result</code>.
      *
      * <p>
      * The container is typed as <code>nmo:Email</code>. The text collected by
      * {@link #processMessage(MimeMessage, StringBuilder, RDFContainer)} is trimmed and, if not empty,
      * added as both <code>nmo:plainTextMessageContent</code> and <code>nie:plainTextContent</code>.
      * Subject, sender, recipients (to/cc/bcc) and the dates handled by <code>MailUtil.getDates</code>
      * are added afterwards.
      *
      * @param id not read by this implementation
      * @param stream the raw message to parse
      * @param charset not read by this implementation
      * @param mimeType not read by this implementation
      * @param result container that receives all extracted statements
      * @throws ExtractorException wrapping any MessagingException or IOException raised during extraction
      */
     public void extract(URI id, InputStream stream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
         try {
             // parse the stream
             MimeMessage message = new MimeMessage(null, stream);

             result.add(RDF.type, NMO.Email);

             // extract the full-text
             StringBuilder buffer = new StringBuilder(10000);
             processMessage(message, buffer, result);
             String text = buffer.toString().trim();
             if (text.length() > 0) {
                 result.add(NMO.plainTextMessageContent, text);
                 result.add(NIE.plainTextContent, text);
             }

             // extract other metadata
             String title = message.getSubject();
             if (title != null) {
                 title = title.trim();
                 if (title.length() > 0) {
                     result.add(NMO.messageSubject, title);
                 }
             }

             try {
                 copyAddress(message.getFrom(), NMO.from, result);
             } catch (AddressException e) {
                 // an unparsable sender address must not abort the extraction; leave a trace and go on
                 logger.debug("Ignoring unparsable sender address", e);
             }

             copyAddress(getRecipients(message, RecipientType.TO), NMO.to, result);
             copyAddress(getRecipients(message, RecipientType.CC), NMO.cc, result);
             copyAddress(getRecipients(message, RecipientType.BCC), NMO.bcc, result);

             MailUtil.getDates(message, result);

         } catch (MessagingException e) {
             throw new ExtractorException(e);
         } catch (IOException e) {
             throw new ExtractorException(e);
         }
     }

     /**
      * Handles the top level message.
      *
      * <p>
      * For a <code>text/html</code> message whose content is a String, the raw HTML is stored as
      * <code>nmo:htmlMessageContent</code> and the text extracted from it is passed on to
      * {@link #processContent(Object, StringBuilder, RDFContainer)}. Every other content object is passed
      * on unchanged.
      *
      * @param msg the parsed message
      * @param buffer collects the plain text
      * @param rdf container receiving the statements
      */
     protected void processMessage(MimeMessage msg, StringBuilder buffer, RDFContainer rdf)
             throws MessagingException, IOException, ExtractorException {
         // fetch the content once instead of once per use
         Object content = msg.getContent();

         if (msg.isMimeType("text/html")) {
             String encoding = getContentEncoding(new ContentType(msg.getContentType()));
             logger.debug("HTML encoding: {}", encoding);
             if (content instanceof String) {
                 String html = (String) content;
                 String text = extractTextFromHtml(html.trim(), encoding, rdf);
                 rdf.add(NMO.htmlMessageContent, html);
                 processContent(text, buffer, rdf);
                 return;
             }
         }

         // text/plain, multiparts and everything else
         processContent(content, buffer, rdf);
     }

     /**
      * The recursive part: collects the text of a content object.
      *
      * <ul>
      * <li>A String is appended to <code>buffer</code>, followed by a blank.</li>
      * <li>A BodyPart with a file name is recorded as <code>nfo:Attachment</code> (file name, transfer
      * encoding, base MIME type). The content of every BodyPart is then processed recursively; an HTML
      * String is reduced to text first.</li>
      * <li>A Multipart is dispatched on its subtype (alternative, signed, encrypted, anything else).</li>
      * <li>Any other content object is ignored.</li>
      * </ul>
      *
      * @param content the content object to inspect
      * @param buffer collects the plain text
      * @param rdf container receiving the statements
      */
     protected void processContent(Object content, StringBuilder buffer, RDFContainer rdf)
             throws MessagingException, IOException, ExtractorException {
         if (content instanceof String) {
             buffer.append(content);
             buffer.append(' ');
         } else if (content instanceof BodyPart) {
             BodyPart bodyPart = (BodyPart) content;
             DataHandler handler = bodyPart.getDataHandler();
             // this is the content transfer encoding of the part, not a character set
             String transferEncoding = handler != null ? MimeUtility.getEncoding(handler) : null;
             String fileName = bodyPart.getFileName();
             String rawContentType = bodyPart.getContentType();
             String contentType = rawContentType;
             if (fileName != null) {
                 try {
                     fileName = MimeUtility.decodeWord(fileName);
                 } catch (MessagingException e) {
                     // happens on unencoded file names! so just ignore it and leave the file name as it is
                 }
                 URI attachURI = URIGenerator.createNewRandomUniqueURI();
                 rdf.add(NMO.hasAttachment, attachURI);
                 Model m = rdf.getModel();
                 m.addStatement(attachURI, RDF.type, NFO.Attachment);
                 m.addStatement(attachURI, NFO.fileName, fileName);
                 if (transferEncoding != null) {
                     m.addStatement(attachURI, NFO.encoding, transferEncoding);
                 }
                 if (contentType != null) {
                     contentType = (new ContentType(contentType)).getBaseType();
                     m.addStatement(attachURI, NIE.mimeType, contentType.trim());
                 }
             }

             // append the content, if any
             content = bodyPart.getContent();

             // remove any html markup if necessary
             if (contentType != null && content instanceof String
                     && contentType.toLowerCase().indexOf("text/html") >= 0) {
                 // the character set is the "charset" parameter of the Content-Type header
                 String charset = null;
                 try {
                     charset = getContentEncoding(new ContentType(rawContentType));
                 } catch (MessagingException e) {
                     // unparsable header: extract without a charset instead of failing
                     logger.debug("Cannot parse content type " + rawContentType, e);
                 }
                 if (charset != null) {
                     charset = MimeUtility.javaCharset(charset);
                 }
                 content = extractTextFromHtml((String) content, charset, rdf);
             }

             processContent(content, buffer, rdf);
         } else if (content instanceof Multipart) {
             Multipart multipart = (Multipart) content;
             String subType = null;

             String contentType = multipart.getContentType();
             if (contentType != null) {
                 subType = new ContentType(contentType).getSubType();
                 if (subType != null) {
                     subType = subType.trim();
                 }
             }

             // equalsIgnoreCase does not depend on the default locale, unlike toLowerCase()
             if ("alternative".equalsIgnoreCase(subType)) {
                 handleAlternativePart(multipart, buffer, rdf);
             } else if ("signed".equalsIgnoreCase(subType)) {
                 handleProtectedPart(multipart, 0, buffer, rdf);
             } else if ("encrypted".equalsIgnoreCase(subType)) {
                 handleProtectedPart(multipart, 1, buffer, rdf);
             } else {
                 // handles multipart/mixed, /digest, /related, /parallel, /report and unknown subtypes
                 handleMixedPart(multipart, buffer, rdf);
             }
         }
     }

     /**
      * Handles a <code>multipart/alternative</code>: the first text/plain part supplies the text, or else
      * the first text/html part (reduced to text). If a text/html part with String content exists, its raw
      * HTML is additionally stored as <code>nmo:htmlMessageContent</code>.
      *
      * @param multipart the alternative multipart
      * @param buffer collects the plain text
      * @param rdf container receiving the statements
      */
     protected void handleAlternativePart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
             throws MessagingException, IOException, ExtractorException {
         int plainIdx = getPartWithMimeType(multipart, "text/plain");
         int htmlIdx = getPartWithMimeType(multipart, "text/html");

         // add nmo:htmlMessageContent property
         BodyPart htmlPart = null;
         Object html = null;
         if (htmlIdx >= 0) {
             htmlPart = multipart.getBodyPart(htmlIdx);
             html = htmlPart.getContent();
             if (html instanceof String) {
                 rdf.add(NMO.htmlMessageContent, (String) html);
             }
         }

         if (plainIdx >= 0) {
             Object content = multipart.getBodyPart(plainIdx).getContent();
             if (content != null) {
                 processContent(content, buffer, rdf);
             }
         } else if (html != null) {
             // no plain text alternative: fall back to the text of the HTML part
             Object content = html;
             if (html instanceof String) {
                 // the character set is the "charset" parameter of the part's Content-Type header
                 String charset = getContentEncoding(new ContentType(htmlPart.getContentType()));
                 if (charset != null) {
                     charset = MimeUtility.javaCharset(charset);
                 }
                 content = extractTextFromHtml((String) html, charset, rdf);
             }
             processContent(content, buffer, rdf);
         }
     }

     /**
      * Passes every body part of the multipart, in order, to
      * {@link #processContent(Object, StringBuilder, RDFContainer)}.
      */
     protected void handleMixedPart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
             throws MessagingException, IOException, ExtractorException {
         final int total = multipart.getCount();
         int position = 0;
         while (position < total) {
             BodyPart part = multipart.getBodyPart(position);
             processContent(part, buffer, rdf);
             position++;
         }
     }

     /**
      * Processes the body part at <code>index</code>; does nothing when the multipart has no part at
      * that position.
      */
     protected void handleProtectedPart(Multipart multipart, int index, StringBuilder buffer, RDFContainer rdf)
             throws MessagingException, IOException, ExtractorException {
         if (index >= multipart.getCount()) {
             return;
         }
         BodyPart selected = multipart.getBodyPart(index);
         processContent(selected, buffer, rdf);
     }

     /**
      * Returns the index of the first body part whose base MIME type equals <code>mimeType</code>
      * (ignoring case), or -1 if there is none.
      */
     protected int getPartWithMimeType(Multipart multipart, String mimeType) throws MessagingException {
         int position = 0;
         while (position < multipart.getCount()) {
             String partType = getMimeType(multipart.getBodyPart(position));
             if (mimeType.equalsIgnoreCase(partType)) {
                 return position;
             }
             position++;
         }
         return -1;
     }

     /**
      * Returns the <code>charset</code> parameter of the content type, or <code>null</code> when the
      * argument is <code>null</code>.
      */
     protected String getContentEncoding(ContentType contentType) {
         return contentType == null ? null : contentType.getParameter("charset");
     }

     /**
      * Returns the result of <code>MimeUtility.getEncoding</code> for the part's data handler, or
      * <code>null</code> when the part has no data handler.
      */
     protected String getEncoding(Part mailPart) throws MessagingException {
         DataHandler dataHandler = mailPart.getDataHandler();
         if (dataHandler == null) {
             return null;
         }
         return MimeUtility.getEncoding(dataHandler);
     }

     /**
      * Returns the base type of the part's content type, or <code>null</code> when the part reports no
      * content type.
      */
     protected String getMimeType(Part mailPart) throws MessagingException {
         String header = mailPart.getContentType();
         if (header == null) {
             return null;
         }
         return new ContentType(header).getBaseType();
     }

     /**
      * Runs the HTML extractor on the given markup and returns title, author, description, keywords and
      * text as one string.
      *
      * <p>
      * The markup is encoded with <code>charset</code> when that names a usable character set and with
      * UTF-8 otherwise; the name of the character set actually used is handed to the extractor, so that
      * the bytes and the announced charset always agree.
      *
      * @param string the HTML markup
      * @param charset name of the preferred character set, may be <code>null</code>
      * @param rdf container whose described URI identifies the temporary extraction container
      * @return the extracted metadata and text
      * @throws ExtractorException if the HTML extractor cannot be initialized or fails
      */
     protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
         // parse the HTML and extract full-text and metadata
         HtmlTextExtractUtil extractor;
         try {
             extractor = new HtmlTextExtractUtil();
         } catch (InitializationException e) {
             // keep the cause so that the root of the failure stays visible
             throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage(), e);
         }

         Charset htmlCharset = Charset.forName("UTF-8");
         if (charset != null) {
             try {
                 Charset requested = Charset.forName(charset);
                 if (requested.canEncode()) {
                     htmlCharset = requested;
                 }
             } catch (IllegalArgumentException e) {
                 // illegal or unsupported charset name
                 logger.debug("Unusable charset " + charset + ", falling back to UTF-8", e);
             }
         }

         InputStream stream = new ByteArrayInputStream(string.getBytes(htmlCharset));
         URI id = rdf.getDescribedUri();
         RDFContainer result = new RDFContainerFactoryImpl().getRDFContainer(id);
         extractor.extract(id, htmlCharset.name(), stream, result);
         Model meta = result.getModel();
         try {
             // append metadata and full-text to a string buffer
             StringBuilder buffer = new StringBuilder(32 * 1024);
             append(buffer, extractor.getTitle(meta), "\n");
             append(buffer, extractor.getAuthor(meta), "\n");
             append(buffer, extractor.getDescription(meta), "\n");
             List<String> keywords = extractor.getKeywords(meta);
             for (String kw : keywords) {
                 append(buffer, kw, " ");
             }
             buffer.append("\n");
             append(buffer, extractor.getText(meta), " ");
             logger.debug("text extracted:\n{}", buffer);
             return buffer.toString();
         } finally {
             // release the temporary model even if reading from it fails
             meta.close();
         }
     }

     /**
      * Appends <code>text</code> followed by <code>sep</code> to the buffer; a <code>null</code> text
      * leaves the buffer untouched.
      */
     protected void append(StringBuilder buffer, String text, String sep) {
         if (text == null) {
             return;
         }
         buffer.append(text).append(sep);
     }

     /**
      * Returns the recipients of the given type, or <code>null</code> when the message has none or when
      * the addresses cannot be parsed.
      *
      * @param message the message to query
      * @param type the recipient type
      * @return the recipients, possibly <code>null</code>
      * @throws MessagingException for failures other than an AddressException
      */
     protected Address[] getRecipients(MimeMessage message, RecipientType type) throws MessagingException {
         try {
             return message.getRecipients(type);
         } catch (AddressException e) {
             // unparsable addresses must not abort the extraction; leave a trace and go on
             logger.debug("Ignoring unparsable " + type + " recipients", e);
             return null;
         }
     }

     /**
      * Adds address metadata under <code>predicate</code> for a single InternetAddress or for every
      * element of an InternetAddress array. Any other argument, including <code>null</code>, is ignored.
      * A ModelException is logged, not propagated.
      */
     protected void copyAddress(Object address, URI predicate, RDFContainer result) {
         InternetAddress[] addresses;
         if (address instanceof InternetAddress) {
             addresses = new InternetAddress[] {(InternetAddress) address};
         } else if (address instanceof InternetAddress[]) {
             addresses = (InternetAddress[]) address;
         } else {
             return;
         }

         try {
             for (InternetAddress current : addresses) {
                 MailUtil.addAddressMetadata(current, predicate, result);
             }
         } catch (ModelException e) {
             logger.error("ModelException while adding address metadata", e);
         }
     }

     /**
      * Command line driver: extracts every file named on the command line and writes the resulting model
      * to standard output as RDF/XML.
      *
      * @param args paths of the message files to process
      * @throws Exception if a file cannot be read or extracted
      */
     public static void main(String[] args) throws Exception {
         SimpleMailExtractor extractor = new SimpleMailExtractor();
         RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();

         for (String path : args) {
             File file = new File(path);
             URI uri = new URIImpl(file.toURI().toString());
             InputStream in = new FileInputStream(file);
             try {
                 RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
                 extractor.extract(uri, in, null, null, rdfContainer);
                 Model model = rdfContainer.getModel();
                 try {
                     model.writeTo(System.out, Syntax.RdfXml);
                 } finally {
                     model.close();
                 }
             } finally {
                 // the stream was never closed before
                 in.close();
             }
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple;

	import java.io.ByteArrayInputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.nio.charset.Charset;
	import java.util.List;

	import javax.activation.DataHandler;
	import javax.mail.Address;
	import javax.mail.BodyPart;
	import javax.mail.Message.RecipientType;
	import javax.mail.MessagingException;
	import javax.mail.Multipart;
	import javax.mail.Part;
	import javax.mail.internet.AddressException;
	import javax.mail.internet.ContentType;
	import javax.mail.internet.InternetAddress;
	import javax.mail.internet.MimeMessage;
	import javax.mail.internet.MimeUtility;

	import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil;
	import org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException;
	import org.ontoware.rdf2go.exception.ModelException;
	import org.ontoware.rdf2go.model.Model;
	import org.ontoware.rdf2go.model.Syntax;
	import org.ontoware.rdf2go.model.impl.URIGenerator;
	import org.ontoware.rdf2go.model.node.URI;
	import org.ontoware.rdf2go.model.node.impl.URIImpl;
	import org.ontoware.rdf2go.vocabulary.RDF;
	import org.semanticdesktop.aperture.extractor.Extractor;
	import org.semanticdesktop.aperture.extractor.ExtractorException;
	import org.semanticdesktop.aperture.extractor.mime.MailUtil;
	import org.semanticdesktop.aperture.rdf.RDFContainer;
	import org.semanticdesktop.aperture.rdf.RDFContainerFactory;
	import org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl;
	import org.semanticdesktop.aperture.vocabulary.NFO;
	import org.semanticdesktop.aperture.vocabulary.NIE;
	import org.semanticdesktop.aperture.vocabulary.NMO;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* An Extractor implementation for message/rfc822-style messages.
	*
	* <p>
	* Only typical body parts are processed during full-text extraction. Attachments are only listed but not
	* further handled. In case of mails in HTML format, the full HTML is included in the extracted data as value
	* of the <code>nmo:htmlMessageContent</code> property. The plain text (extract) is represented by the
	* <code>nmo:plainTextMessageContent</code> property and as value of the <code>nie:plainTextContent</code>
	* property for compliance with the representation from other extractors.
	*
	*
	*/
	public class SimpleMailExtractor implements Extractor {
	private static final Logger logger = LoggerFactory.getLogger(SimpleMailExtractor.class);

	/**
	 * Parses the stream as a MIME message and adds the extracted data to <code>result</code>.
	 *
	 * <p>
	 * The container is typed as <code>nmo:Email</code>. The text collected by
	 * {@link #processMessage(MimeMessage, StringBuilder, RDFContainer)} is trimmed and, if not empty,
	 * added as both <code>nmo:plainTextMessageContent</code> and <code>nie:plainTextContent</code>.
	 * Subject, sender, recipients (to/cc/bcc) and the dates handled by <code>MailUtil.getDates</code>
	 * are added afterwards.
	 *
	 * @param id not read by this implementation
	 * @param stream the raw message to parse
	 * @param charset not read by this implementation
	 * @param mimeType not read by this implementation
	 * @param result container that receives all extracted statements
	 * @throws ExtractorException wrapping any MessagingException or IOException raised during extraction
	 */
	public void extract(URI id, InputStream stream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
		try {
			// parse the stream
			MimeMessage message = new MimeMessage(null, stream);

			result.add(RDF.type, NMO.Email);

			// extract the full-text
			StringBuilder buffer = new StringBuilder(10000);
			processMessage(message, buffer, result);
			String text = buffer.toString().trim();
			if (text.length() > 0) {
				result.add(NMO.plainTextMessageContent, text);
				result.add(NIE.plainTextContent, text);
			}

			// extract other metadata
			String title = message.getSubject();
			if (title != null) {
				title = title.trim();
				if (title.length() > 0) {
					result.add(NMO.messageSubject, title);
				}
			}

			try {
				copyAddress(message.getFrom(), NMO.from, result);
			} catch (AddressException e) {
				// an unparsable sender address must not abort the extraction; leave a trace and go on
				logger.debug("Ignoring unparsable sender address", e);
			}

			copyAddress(getRecipients(message, RecipientType.TO), NMO.to, result);
			copyAddress(getRecipients(message, RecipientType.CC), NMO.cc, result);
			copyAddress(getRecipients(message, RecipientType.BCC), NMO.bcc, result);

			MailUtil.getDates(message, result);

		} catch (MessagingException e) {
			throw new ExtractorException(e);
		} catch (IOException e) {
			throw new ExtractorException(e);
		}
	}

	/**
	 * Handles the top level message.
	 *
	 * <p>
	 * For a <code>text/html</code> message whose content is a String, the raw HTML is stored as
	 * <code>nmo:htmlMessageContent</code> and the text extracted from it is passed on to
	 * {@link #processContent(Object, StringBuilder, RDFContainer)}. Every other content object is passed
	 * on unchanged.
	 *
	 * @param msg the parsed message
	 * @param buffer collects the plain text
	 * @param rdf container receiving the statements
	 */
	protected void processMessage(MimeMessage msg, StringBuilder buffer, RDFContainer rdf)
			throws MessagingException, IOException, ExtractorException {
		// fetch the content once instead of once per use
		Object content = msg.getContent();

		if (msg.isMimeType("text/html")) {
			String encoding = getContentEncoding(new ContentType(msg.getContentType()));
			logger.debug("HTML encoding: {}", encoding);
			if (content instanceof String) {
				String html = (String) content;
				String text = extractTextFromHtml(html.trim(), encoding, rdf);
				rdf.add(NMO.htmlMessageContent, html);
				processContent(text, buffer, rdf);
				return;
			}
		}

		// text/plain, multiparts and everything else
		processContent(content, buffer, rdf);
	}

	/**
	 * The recursive part: collects the text of a content object.
	 *
	 * <ul>
	 * <li>A String is appended to <code>buffer</code>, followed by a blank.</li>
	 * <li>A BodyPart with a file name is recorded as <code>nfo:Attachment</code> (file name, transfer
	 * encoding, base MIME type). The content of every BodyPart is then processed recursively; an HTML
	 * String is reduced to text first.</li>
	 * <li>A Multipart is dispatched on its subtype (alternative, signed, encrypted, anything else).</li>
	 * <li>Any other content object is ignored.</li>
	 * </ul>
	 *
	 * @param content the content object to inspect
	 * @param buffer collects the plain text
	 * @param rdf container receiving the statements
	 */
	protected void processContent(Object content, StringBuilder buffer, RDFContainer rdf)
			throws MessagingException, IOException, ExtractorException {
		if (content instanceof String) {
			buffer.append(content);
			buffer.append(' ');
		} else if (content instanceof BodyPart) {
			BodyPart bodyPart = (BodyPart) content;
			DataHandler handler = bodyPart.getDataHandler();
			// this is the content transfer encoding of the part, not a character set
			String transferEncoding = handler != null ? MimeUtility.getEncoding(handler) : null;
			String fileName = bodyPart.getFileName();
			String rawContentType = bodyPart.getContentType();
			String contentType = rawContentType;
			if (fileName != null) {
				try {
					fileName = MimeUtility.decodeWord(fileName);
				} catch (MessagingException e) {
					// happens on unencoded file names! so just ignore it and leave the file name as it is
				}
				URI attachURI = URIGenerator.createNewRandomUniqueURI();
				rdf.add(NMO.hasAttachment, attachURI);
				Model m = rdf.getModel();
				m.addStatement(attachURI, RDF.type, NFO.Attachment);
				m.addStatement(attachURI, NFO.fileName, fileName);
				if (transferEncoding != null) {
					m.addStatement(attachURI, NFO.encoding, transferEncoding);
				}
				if (contentType != null) {
					contentType = (new ContentType(contentType)).getBaseType();
					m.addStatement(attachURI, NIE.mimeType, contentType.trim());
				}
			}

			// append the content, if any
			content = bodyPart.getContent();

			// remove any html markup if necessary
			if (contentType != null && content instanceof String
					&& contentType.toLowerCase().indexOf("text/html") >= 0) {
				// the character set is the "charset" parameter of the Content-Type header
				String charset = null;
				try {
					charset = getContentEncoding(new ContentType(rawContentType));
				} catch (MessagingException e) {
					// unparsable header: extract without a charset instead of failing
					logger.debug("Cannot parse content type " + rawContentType, e);
				}
				if (charset != null) {
					charset = MimeUtility.javaCharset(charset);
				}
				content = extractTextFromHtml((String) content, charset, rdf);
			}

			processContent(content, buffer, rdf);
		} else if (content instanceof Multipart) {
			Multipart multipart = (Multipart) content;
			String subType = null;

			String contentType = multipart.getContentType();
			if (contentType != null) {
				subType = new ContentType(contentType).getSubType();
				if (subType != null) {
					subType = subType.trim();
				}
			}

			// equalsIgnoreCase does not depend on the default locale, unlike toLowerCase()
			if ("alternative".equalsIgnoreCase(subType)) {
				handleAlternativePart(multipart, buffer, rdf);
			} else if ("signed".equalsIgnoreCase(subType)) {
				handleProtectedPart(multipart, 0, buffer, rdf);
			} else if ("encrypted".equalsIgnoreCase(subType)) {
				handleProtectedPart(multipart, 1, buffer, rdf);
			} else {
				// handles multipart/mixed, /digest, /related, /parallel, /report and unknown subtypes
				handleMixedPart(multipart, buffer, rdf);
			}
		}
	}

	/**
	 * Handles a <code>multipart/alternative</code>: the first text/plain part supplies the text, or else
	 * the first text/html part (reduced to text). If a text/html part with String content exists, its raw
	 * HTML is additionally stored as <code>nmo:htmlMessageContent</code>.
	 *
	 * @param multipart the alternative multipart
	 * @param buffer collects the plain text
	 * @param rdf container receiving the statements
	 */
	protected void handleAlternativePart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
			throws MessagingException, IOException, ExtractorException {
		int plainIdx = getPartWithMimeType(multipart, "text/plain");
		int htmlIdx = getPartWithMimeType(multipart, "text/html");

		// add nmo:htmlMessageContent property
		BodyPart htmlPart = null;
		Object html = null;
		if (htmlIdx >= 0) {
			htmlPart = multipart.getBodyPart(htmlIdx);
			html = htmlPart.getContent();
			if (html instanceof String) {
				rdf.add(NMO.htmlMessageContent, (String) html);
			}
		}

		if (plainIdx >= 0) {
			Object content = multipart.getBodyPart(plainIdx).getContent();
			if (content != null) {
				processContent(content, buffer, rdf);
			}
		} else if (html != null) {
			// no plain text alternative: fall back to the text of the HTML part
			Object content = html;
			if (html instanceof String) {
				// the character set is the "charset" parameter of the part's Content-Type header
				String charset = getContentEncoding(new ContentType(htmlPart.getContentType()));
				if (charset != null) {
					charset = MimeUtility.javaCharset(charset);
				}
				content = extractTextFromHtml((String) html, charset, rdf);
			}
			processContent(content, buffer, rdf);
		}
	}

	/**
	 * Passes every body part of the multipart, in order, to
	 * {@link #processContent(Object, StringBuilder, RDFContainer)}.
	 */
	protected void handleMixedPart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
			throws MessagingException, IOException, ExtractorException {
		final int total = multipart.getCount();
		int position = 0;
		while (position < total) {
			BodyPart part = multipart.getBodyPart(position);
			processContent(part, buffer, rdf);
			position++;
		}
	}

	/**
	 * Processes the body part at <code>index</code>; does nothing when the multipart has no part at
	 * that position.
	 */
	protected void handleProtectedPart(Multipart multipart, int index, StringBuilder buffer, RDFContainer rdf)
			throws MessagingException, IOException, ExtractorException {
		if (index >= multipart.getCount()) {
			return;
		}
		BodyPart selected = multipart.getBodyPart(index);
		processContent(selected, buffer, rdf);
	}

	/**
	 * Returns the index of the first body part whose base MIME type equals <code>mimeType</code>
	 * (ignoring case), or -1 if there is none.
	 */
	protected int getPartWithMimeType(Multipart multipart, String mimeType) throws MessagingException {
		int position = 0;
		while (position < multipart.getCount()) {
			String partType = getMimeType(multipart.getBodyPart(position));
			if (mimeType.equalsIgnoreCase(partType)) {
				return position;
			}
			position++;
		}
		return -1;
	}

	/**
	 * Returns the <code>charset</code> parameter of the content type, or <code>null</code> when the
	 * argument is <code>null</code>.
	 */
	protected String getContentEncoding(ContentType contentType) {
		return contentType == null ? null : contentType.getParameter("charset");
	}

	/**
	 * Returns the result of <code>MimeUtility.getEncoding</code> for the part's data handler, or
	 * <code>null</code> when the part has no data handler.
	 */
	protected String getEncoding(Part mailPart) throws MessagingException {
		DataHandler dataHandler = mailPart.getDataHandler();
		if (dataHandler == null) {
			return null;
		}
		return MimeUtility.getEncoding(dataHandler);
	}

	/**
	 * Returns the base type of the part's content type, or <code>null</code> when the part reports no
	 * content type.
	 */
	protected String getMimeType(Part mailPart) throws MessagingException {
		String header = mailPart.getContentType();
		if (header == null) {
			return null;
		}
		return new ContentType(header).getBaseType();
	}

	/**
	 * Runs the HTML extractor on the given markup and returns title, author, description, keywords and
	 * text as one string.
	 *
	 * <p>
	 * The markup is encoded with <code>charset</code> when that names a usable character set and with
	 * UTF-8 otherwise; the name of the character set actually used is handed to the extractor, so that
	 * the bytes and the announced charset always agree.
	 *
	 * @param string the HTML markup
	 * @param charset name of the preferred character set, may be <code>null</code>
	 * @param rdf container whose described URI identifies the temporary extraction container
	 * @return the extracted metadata and text
	 * @throws ExtractorException if the HTML extractor cannot be initialized or fails
	 */
	protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
		// parse the HTML and extract full-text and metadata
		HtmlTextExtractUtil extractor;
		try {
			extractor = new HtmlTextExtractUtil();
		} catch (InitializationException e) {
			// keep the cause so that the root of the failure stays visible
			throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage(), e);
		}

		Charset htmlCharset = Charset.forName("UTF-8");
		if (charset != null) {
			try {
				Charset requested = Charset.forName(charset);
				if (requested.canEncode()) {
					htmlCharset = requested;
				}
			} catch (IllegalArgumentException e) {
				// illegal or unsupported charset name
				logger.debug("Unusable charset " + charset + ", falling back to UTF-8", e);
			}
		}

		InputStream stream = new ByteArrayInputStream(string.getBytes(htmlCharset));
		URI id = rdf.getDescribedUri();
		RDFContainer result = new RDFContainerFactoryImpl().getRDFContainer(id);
		extractor.extract(id, htmlCharset.name(), stream, result);
		Model meta = result.getModel();
		try {
			// append metadata and full-text to a string buffer
			StringBuilder buffer = new StringBuilder(32 * 1024);
			append(buffer, extractor.getTitle(meta), "\n");
			append(buffer, extractor.getAuthor(meta), "\n");
			append(buffer, extractor.getDescription(meta), "\n");
			List<String> keywords = extractor.getKeywords(meta);
			for (String kw : keywords) {
				append(buffer, kw, " ");
			}
			buffer.append("\n");
			append(buffer, extractor.getText(meta), " ");
			logger.debug("text extracted:\n{}", buffer);
			return buffer.toString();
		} finally {
			// release the temporary model even if reading from it fails
			meta.close();
		}
	}

	/**
	 * Appends <code>text</code> followed by <code>sep</code> to the buffer; a <code>null</code> text
	 * leaves the buffer untouched.
	 */
	protected void append(StringBuilder buffer, String text, String sep) {
		if (text == null) {
			return;
		}
		buffer.append(text).append(sep);
	}

	/**
	 * Returns the recipients of the given type, or <code>null</code> when the message has none or when
	 * the addresses cannot be parsed.
	 *
	 * @param message the message to query
	 * @param type the recipient type
	 * @return the recipients, possibly <code>null</code>
	 * @throws MessagingException for failures other than an AddressException
	 */
	protected Address[] getRecipients(MimeMessage message, RecipientType type) throws MessagingException {
		try {
			return message.getRecipients(type);
		} catch (AddressException e) {
			// unparsable addresses must not abort the extraction; leave a trace and go on
			logger.debug("Ignoring unparsable " + type + " recipients", e);
			return null;
		}
	}

	/**
	 * Adds address metadata under <code>predicate</code> for a single InternetAddress or for every
	 * element of an InternetAddress array. Any other argument, including <code>null</code>, is ignored.
	 * A ModelException is logged, not propagated.
	 */
	protected void copyAddress(Object address, URI predicate, RDFContainer result) {
		InternetAddress[] addresses;
		if (address instanceof InternetAddress) {
			addresses = new InternetAddress[] {(InternetAddress) address};
		} else if (address instanceof InternetAddress[]) {
			addresses = (InternetAddress[]) address;
		} else {
			return;
		}

		try {
			for (InternetAddress current : addresses) {
				MailUtil.addAddressMetadata(current, predicate, result);
			}
		} catch (ModelException e) {
			logger.error("ModelException while adding address metadata", e);
		}
	}

	/**
	 * Command line driver: extracts every file named on the command line and writes the resulting model
	 * to standard output as RDF/XML.
	 *
	 * @param args paths of the message files to process
	 * @throws Exception if a file cannot be read or extracted
	 */
	public static void main(String[] args) throws Exception {
		SimpleMailExtractor extractor = new SimpleMailExtractor();
		RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();

		for (String path : args) {
			File file = new File(path);
			URI uri = new URIImpl(file.toURI().toString());
			InputStream in = new FileInputStream(file);
			try {
				RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
				extractor.extract(uri, in, null, null, rdfContainer);
				Model model = rdfContainer.getModel();
				try {
					model.writeTo(System.out, Syntax.RdfXml);
				} finally {
					model.close();
				}
			} finally {
				// the stream was never closed before
				in.close();
			}
		}
	}
	}