trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.microsoft;

 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;

 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFOldDocument;
 import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.model.PicturesTable;
 import org.apache.poi.hwpf.model.StyleDescription;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;

 public class WordExtractor extends AbstractPOIFSExtractor {

     /**
      * Creates a Word extractor for the given parse context.
      *
      * @param context the parse context, passed unchanged to the
      *        {@link AbstractPOIFSExtractor} constructor
      */
     public WordExtractor(ParseContext context) {
         super(context);
     }

     /**
      * Turns the Word document stored in the given POIFS filesystem into
      * XHTML events. The output order is: header text, body paragraphs and
      * tables, footnotes, comments, endnotes, footer text, any pictures not
      * emitted so far, and finally the embedded office documents found under
      * the "ObjectPool" entry.
      * When POI reports the file as an old Word format, the whole job is
      * handed to {@link #parseWord6(POIFSFileSystem, XHTMLContentHandler)}.
      *
      * @param filesystem the POIFS filesystem holding the document
      * @param xhtml handler that receives the generated XHTML events
      * @throws IOException propagated from the POI and embedded-resource calls
      * @throws SAXException propagated from the content handler
      * @throws TikaException propagated from the embedded-resource handling
      */
     protected void parse(
             POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         final HWPFDocument document;
         try {
             document = new HWPFDocument(filesystem);
         } catch (OldWordFileFormatException oldFormat) {
             // POI rejected the file as an old Word format: use the Word 6 path
             parseWord6(filesystem, xhtml);
             return;
         }

         org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
             new org.apache.poi.hwpf.extractor.WordExtractor(document);

         addTextIfAny(xhtml, "header", poiExtractor.getHeaderText());

         // Grab the list of pictures. As far as we can tell,
         //  the pictures should be in order, and may be directly
         //  placed or referenced from an anchor
         PicturesTable pictureTable = document.getPicturesTable();
         PicturesSource pictures = new PicturesSource(document);

         // Main body: handleParagraph reports how many extra paragraphs
         //  (those of a table) it consumed on top of the one handed to it
         Range body = document.getRange();
         int paragraphIndex = 0;
         while (paragraphIndex < body.numParagraphs()) {
             Paragraph paragraph = body.getParagraph(paragraphIndex);
             int extra = handleParagraph(
                     paragraph, 0, body, document, pictures, pictureTable, xhtml);
             paragraphIndex += extra + 1;
         }

         // Notes and comments, each entry as a paragraph of its own
         for (String footnote : poiExtractor.getFootnoteText()) {
             xhtml.element("p", footnote);
         }
         for (String comment : poiExtractor.getCommentsText()) {
             xhtml.element("p", comment);
         }
         for (String endnote : poiExtractor.getEndnoteText()) {
             xhtml.element("p", endnote);
         }

         addTextIfAny(xhtml, "footer", poiExtractor.getFooterText());

         // Whatever pictures nobody claimed while walking the text
         Picture leftover = pictures.nextUnclaimed();
         while (leftover != null) {
             handlePictureCharacterRun(null, leftover, pictures, xhtml);
             leftover = pictures.nextUnclaimed();
         }

         // Embedded office documents: the directories named "_..." that
         //  sit below the "ObjectPool" entry
         try {
             DirectoryEntry objectPool =
                 (DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");
             for (Entry candidate : objectPool) {
                 if (!candidate.getName().startsWith("_")
                         || !(candidate instanceof DirectoryEntry)) {
                     continue;
                 }
                 handleEmbededOfficeDoc((DirectoryEntry) candidate, xhtml);
             }
         } catch (FileNotFoundException ignored) {
             // Deliberately ignored - presumably the document simply has no
             //  "ObjectPool" entry, so there is nothing embedded to handle
         }
     }

     /**
      * Outputs one paragraph. If the paragraph is the start of a top-level
      * table, the whole table is output instead (as table/tbody/tr/td, with
      * every cell paragraph handled recursively).
      * <p>
      * A paragraph whose style has no description, or whose style has no
      * name, is output as a plain &lt;p&gt; without a class rather than
      * failing with an exception.
      *
      * @param p the paragraph to output
      * @param parentTableLevel table level of the caller, 0 outside tables
      * @param r the range that p was taken from, used to look up its table
      * @param document the document, used for the style sheet
      * @param pictures source of the pictures referenced by the runs
      * @param pictureTable used to detect runs that hold an inline picture
      * @param xhtml handler receiving the output
      * @return the number of paragraphs consumed in addition to p: the
      *         table's paragraph count minus one for a table, otherwise 0
      */
     private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
             PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
             throws SAXException, IOException, TikaException {
         // Note - a poi bug means we can't currently properly recurse
         //  into nested tables, so currently we don't
         if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
             Table t = r.getTable(p);
             xhtml.startElement("table");
             xhtml.startElement("tbody");
             for (int rn = 0; rn < t.numRows(); rn++) {
                 TableRow row = t.getRow(rn);
                 xhtml.startElement("tr");
                 for (int cn = 0; cn < row.numCells(); cn++) {
                     TableCell cell = row.getCell(cn);
                     xhtml.startElement("td");

                     for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                         Paragraph cellP = cell.getParagraph(pn);
                         handleParagraph(cellP, p.getTableLevel(), cell, document, pictures, pictureTable, xhtml);
                     }
                     xhtml.endElement("td");
                 }
                 xhtml.endElement("tr");
             }
             xhtml.endElement("tbody");
             xhtml.endElement("table");
             // p itself is counted by the caller, hence the -1
             return (t.numParagraphs() - 1);
         }

         // The style lookup may give us nothing usable (no description, or a
         //  description without a name); treat that as an unstyled paragraph
         StyleDescription style =
             document.getStyleSheet().getStyleDescription(p.getStyleIndex());
         String styleName = (style == null) ? null : style.getName();
         TagAndStyle tas;
         if (styleName == null || styleName.length() == 0) {
             tas = new TagAndStyle("p", null);
         } else {
             tas = buildParagraphTagAndStyle(styleName, (parentTableLevel > 0));
         }

         if (tas.getStyleClass() != null) {
             xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
         } else {
             xhtml.startElement(tas.getTag());
         }

         // Headings are output without bold / italic / strike-through markup
         boolean heading = tas.isHeading();
         for (int j = 0; j < p.numCharacterRuns(); j++) {
             CharacterRun cr = p.getCharacterRun(j);
             String runText = cr.text();

             if (runText.equals("\u0013")) {
                 // Start of a field; skip over the runs it consumed
                 j += handleSpecialCharacterRuns(p, j, heading, pictures, xhtml);
             } else if (runText.startsWith("\u0008")) {
                 // Floating Picture(s), one per character of the run
                 for (int pn = 0; pn < runText.length(); pn++) {
                     // Assume they're in the order from the unclaimed list...
                     Picture picture = pictures.nextUnclaimed();

                     // Output
                     handlePictureCharacterRun(cr, picture, pictures, xhtml);
                 }
             } else if (pictureTable.hasPicture(cr)) {
                 // Inline Picture
                 Picture picture = pictures.getFor(cr);
                 handlePictureCharacterRun(cr, picture, pictures, xhtml);
             } else {
                 handleCharacterRun(cr, heading, xhtml);
             }
         }

         xhtml.endElement(tas.getTag());

         return 0;
     }

     /**
      * Writes the text of a single character run, wrapped in b / i / s
      * elements for bold, italic and strike-through unless styling is
      * to be skipped. A run consisting only of a carriage return is dropped.
      *
      * @param cr the run to output
      * @param skipStyling true to output the bare text without b / i / s
      * @param xhtml handler receiving the output
      * @throws SAXException propagated from the content handler
      */
     private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
             throws SAXException {
         String raw = cr.text();

         // Skip trailing newlines
         if ("\r".equals(raw)) {
             return;
         }

         // Work out which styling elements wrap the text
         List<String> openTags = new ArrayList<String>();
         if (!skipStyling) {
             if (cr.isBold()) {
                 openTags.add("b");
             }
             if (cr.isItalic()) {
                 openTags.add("i");
             }
             if (cr.isStrikeThrough()) {
                 openTags.add("s");
             }
         }
         for (String opening : openTags) {
             xhtml.startElement(opening);
         }

         // Clean up the text: carriage returns become newlines, and a
         //  trailing table cell end marker is removed
         String cleaned = raw.replace('\r', '\n');
         int lastIndex = cleaned.length() - 1;
         if (lastIndex >= 0 && cleaned.charAt(lastIndex) == '\u0007') {
             cleaned = cleaned.substring(0, lastIndex);
         }
         xhtml.characters(cleaned);

         // Close in the reverse order of opening
         int remaining = openTags.size();
         while (remaining > 0) {
             remaining--;
             xhtml.endElement(openTags.get(remaining));
         }
     }
     /**
      * Handles a field, i.e. a sequence of character runs of the form
      * 0x13..text..0x15 or 0x13..control..0x14..text..0x15 .
      * Nesting is allowed; a nested field is output by a recursive call as
      * soon as its opening 0x13 is met.
      * <p>
      * A control string that starts with HYPERLINK and holds a quoted target
      * is output as an &lt;a href&gt; element around the text runs. For any
      * other field only the text runs (and their pictures) are output.
      *
      * @param p the paragraph that owns the runs
      * @param index index within p of the run holding the opening 0x13
      * @param skipStyling passed on to handleCharacterRun for each text run
      * @param pictures picture lookup for text runs that hold a picture
      * @param xhtml handler receiving the output
      * @return the distance from index to the closing 0x15 run (or to the
      *         end of the paragraph if there is none); adding it to index
      *         leaves the caller on the last run that was consumed
      */
     private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling,
             PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
         List<CharacterRun> controls = new ArrayList<CharacterRun>();
         List<CharacterRun> texts = new ArrayList<CharacterRun>();
         boolean has14 = false;

         // Split it into before and after the 14
         int i;
         for (i = index + 1; i < p.numCharacterRuns(); i++) {
             CharacterRun cr = p.getCharacterRun(i);
             String runText = cr.text();
             if (runText.equals("\u0013")) {
                 // Nested, oh joy...
                 // Hand over the position of the nested 0x13 itself, just
                 //  like the top-level caller does. Handing over i+1 made
                 //  the nested call skip its first run, and left this loop
                 //  on the nested 0x15, which it then took for its own end.
                 i += handleSpecialCharacterRuns(p, i, skipStyling, pictures, xhtml);
             } else if (runText.equals("\u0014")) {
                 has14 = true;
             } else if (runText.equals("\u0015")) {
                 if (!has14) {
                     // No separator, so everything collected was text
                     texts = controls;
                     controls = new ArrayList<CharacterRun>();
                 }
                 break;
             } else if (has14) {
                 texts.add(cr);
             } else {
                 controls.add(cr);
             }
         }

         // Do we need to do something special with this?
         boolean linkWritten = false;
         if (controls.size() > 0) {
             StringBuilder controlText = new StringBuilder();
             for (CharacterRun control : controls) {
                 controlText.append(control.text());
             }
             String text = controlText.toString();

             // Only a properly quoted target can be cut out safely; with a
             //  single quote the substring bounds would be the wrong way round
             int firstQuote = text.indexOf('"');
             int lastQuote = text.lastIndexOf('"');
             if (text.startsWith("HYPERLINK") && lastQuote > firstQuote) {
                 String url = text.substring(firstQuote + 1, lastQuote);
                 xhtml.startElement("a", "href", url);
                 for (CharacterRun cr : texts) {
                     handleCharacterRun(cr, skipStyling, xhtml);
                 }
                 xhtml.endElement("a");
                 linkWritten = true;
             }
         }

         if (!linkWritten) {
             if (controls.size() > 0) {
                 // Just output the text ones
                 for (CharacterRun cr : texts) {
                     if (pictures.hasPicture(cr)) {
                         Picture picture = pictures.getFor(cr);
                         handlePictureCharacterRun(cr, picture, pictures, xhtml);
                     } else {
                         handleCharacterRun(cr, skipStyling, xhtml);
                     }
                 }
             } else {
                 // We only had text
                 // Output as-is
                 for (CharacterRun cr : texts) {
                     handleCharacterRun(cr, skipStyling, xhtml);
                 }
             }
         }

         // Tell them how many to skip over
         return i - index;
     }

     /**
      * Outputs an img element for the given picture and, the first time a
      * picture is seen, passes its content on as an embedded resource.
      * Does nothing when the picture is null.
      *
      * @param cr the run the picture was found in; not used by this method
      *        (callers pass null for pictures found outside the text)
      * @param picture the picture to output, may be null
      * @param pictures supplies the picture number and tracks which pictures
      *        have already been passed on
      * @param xhtml handler receiving the output
      * @throws IOException if the embedded resource handling or closing the
      *         picture stream fails
      */
     private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml)
             throws SAXException, IOException, TikaException {
         if (picture == null) {
             // Oh dear, we've run out...
             // Probably caused by multiple \u0008 images referencing
             //  the same real image
             return;
         }

         // Which one is it?
         String extension = picture.suggestFileExtension();
         int pictureNumber = pictures.pictureNumber(picture);

         // Make up a name for the picture
         // There isn't one in the file, but we need to be able to reference
         //  the picture from the img tag and the embedded resource
         String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : "");

         // Grab the mime type for the picture
         String mimeType = picture.getMimeType();

         // Output the img tag
         xhtml.startElement("img", "src", "embedded:" + filename);
         xhtml.endElement("img");

         // Have we already output this one?
         // (Only expose each individual image once)
         if (!pictures.hasOutput(picture)) {
             TikaInputStream stream = TikaInputStream.get(picture.getContent());
             try {
                 handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
             } finally {
                 // Always release the stream, and whatever resources it
                 //  picked up while the embedded resource was being handled
                 stream.close();
             }
             pictures.recordOutput(picture);
         }
     }

     /**
      * Emits the given text as a paragraph inside a &lt;div/&gt; of the
      * given class. Null and empty text produce no output at all.
      *
      * @param xhtml XHTML content handler
      * @param section the class of the &lt;div/&gt; section emitted
      * @param text text to be emitted, if any
      * @throws SAXException if an error occurs
      */
     private void addTextIfAny(
             XHTMLContentHandler xhtml, String section, String text)
             throws SAXException {
         boolean nothingToEmit = (text == null) || (text.length() == 0);
         if (nothingToEmit) {
             return;
         }

         xhtml.startElement("div", "class", section);
         xhtml.element("p", text);
         xhtml.endElement("div");
     }

     /**
      * Fallback used by parse when POI reports an old Word file format:
      * every paragraph text returned by POI's Word6Extractor is output as
      * a plain &lt;p&gt; element.
      *
      * @param filesystem the POIFS filesystem holding the document
      * @param xhtml handler receiving the output
      */
     protected void parseWord6(
             POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         HWPFOldDocument oldDocument = new HWPFOldDocument(filesystem);
         Word6Extractor word6 = new Word6Extractor(oldDocument);

         for (String paragraphText : word6.getParagraphText()) {
             xhtml.element("p", paragraphText);
         }
     }

     /**
      * Given a style name, return what tag should be used, and
      *  what style should be applied to it.
      * <ul>
      *  <li>null, empty, "Default", "Normal", and "Table Contents" inside a
      *      table: plain p, no class</li>
      *  <li>"heading" / "Heading": h1</li>
      *  <li>names starting with "heading" / "Heading": h1 to h6, picked by
      *      the last character of the name (h1 if that is not a digit)</li>
      *  <li>"Title": h1 with class title, "Subtitle": h2 with class
      *      subtitle, "HTML Preformatted": pre</li>
      *  <li>anything else: p, with the class being the style name with
      *      spaces turned into underscores and the first letter in lower
      *      case</li>
      * </ul>
      *
      * @param styleName name of the paragraph style, may be null or empty
      * @param isTable whether the paragraph sits inside a table
      * @return the tag and the style class (possibly null) to use
      */
     public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
         String tag = "p";
         String styleClass = null;

         if (styleName == null || styleName.length() == 0) {
             // No name to go on, so a plain paragraph it is
         } else if (styleName.equals("Default") || styleName.equals("Normal")) {
             // Already setup
         } else if (styleName.equals("Table Contents") && isTable) {
             // Already setup
         } else if (styleName.equals("heading") || styleName.equals("Heading")) {
             tag = "h1";
         } else if (styleName.startsWith("heading") || styleName.startsWith("Heading")) {
             // "Heading 3" or "Heading2" or "heading 4"
             int num = 1;
             try {
                 num = Integer.parseInt(
                         styleName.substring(styleName.length() - 1)
                 );
             } catch (NumberFormatException ignored) {
                 // The name doesn't end in a digit, stay with level 1
             }
             // Only h1 to h6 exist, so "Heading 10" (last digit 0) or
             //  "Heading 9" must not turn into h0 or h9
             tag = "h" + Math.max(1, Math.min(num, 6));
         } else if (styleName.equals("Title")) {
             tag = "h1";
             styleClass = "title";
         } else if (styleName.equals("Subtitle")) {
             tag = "h2";
             styleClass = "subtitle";
         } else if (styleName.equals("HTML Preformatted")) {
             tag = "pre";
         } else {
             styleClass = styleName.replace(' ', '_');
             // Fixed locale, so the class doesn't depend on where we run
             //  (the Turkish locale lower-cases "I" to a dotless i)
             styleClass = styleClass.substring(0, 1).toLowerCase(java.util.Locale.ENGLISH) +
                             styleClass.substring(1);
         }

         return new TagAndStyle(tag, styleClass);
     }

     /**
      * Holds the XHTML tag to use for a paragraph, together with the
      *  value of its class attribute (null when there is none).
      */
     public static class TagAndStyle {
        private final String tag;
        private final String styleClass;

        /**
         * @param tag the element name, e.g. p or h1
         * @param styleClass the class attribute value, or null for none
         */
        public TagAndStyle(String tag, String styleClass) {
          this.styleClass = styleClass;
          this.tag = tag;
        }

        /** @return the element name */
        public String getTag() {
          return this.tag;
        }

        /** @return the class attribute value, may be null */
        public String getStyleClass() {
          return this.styleClass;
        }

        /**
         * @return true for a two character tag starting with h
         */
        public boolean isHeading() {
           if (tag.length() != 2) {
              return false;
           }
           return tag.charAt(0) == 'h';
        }
     }

     /**
      * Provides access to the pictures of a document, both by the picture
      *  offset of a character run, and by iteration over the ones that
      *  no character run of the main text claims.
      */
     private static class PicturesSource {
        private final PicturesTable picturesTable;
        // Pictures whose content has already been passed on
        private final Set<Picture> output = new HashSet<Picture>();
        // Picture offset -> picture
        private final Map<Integer,Picture> lookup;
        // Same order as "all", with the claimed pictures replaced by null
        private final List<Picture> nonU1based;
        private final List<Picture> all;
        // Position of the next candidate for nextUnclaimed()
        private int pn = 0;

        private PicturesSource(HWPFDocument doc) {
           picturesTable = doc.getPicturesTable();
           all = picturesTable.getAllPictures();

           // Compute the Offset-Picture lookup
           // The suggested file name, minus any extension, is read as the
           //  offset in hex
           lookup = new HashMap<Integer, Picture>();
           for (Picture p : all) {
              // TODO Make this nicer when POI 3.7 is out
              String name = p.suggestFullFileName();
              int dot = name.indexOf('.');
              if (dot > -1) {
                 name = name.substring(0, dot);
              }
              int offset = Integer.parseInt(name, 16);
              lookup.put(offset, p);
           }

           // Work out which Pictures aren't referenced by
           //  a picture character run in the main text
           // These are escher floating ones, ones
           //  found outside the normal text, and who
           //  knows what else...
           nonU1based = new ArrayList<Picture>();
           nonU1based.addAll(all);
           Range r = doc.getRange();
           for (int i = 0; i < r.numCharacterRuns(); i++) {
              CharacterRun cr = r.getCharacterRun(i);
              if (picturesTable.hasPicture(cr)) {
                 Picture p = getFor(cr);
                 // Nothing to claim if the offset is unknown, or if an
                 //  earlier run already claimed the same picture. In both
                 //  cases there is no valid index to clear.
                 int at = (p == null) ? -1 : nonU1based.indexOf(p);
                 if (at >= 0) {
                    nonU1based.set(at, null);
                 }
              }
           }
        }

        /** Does the picture table report a picture for this run? */
        private boolean hasPicture(CharacterRun cr) {
           return picturesTable.hasPicture(cr);
        }

        /** Remembers that the content of the picture has been output */
        private void recordOutput(Picture picture) {
           output.add(picture);
        }
        /** Has the content of the picture already been output? */
        private boolean hasOutput(Picture picture) {
           return output.contains(picture);
        }

        /** The 1-based position of the picture in the list of all pictures */
        private int pictureNumber(Picture picture) {
           return all.indexOf(picture) + 1;
        }

        /** The picture at the picture offset of the run, null if unknown */
        private Picture getFor(CharacterRun cr) {
           return lookup.get(cr.getPicOffset());
        }

        /**
         * Return the next unclaimed one, used towards
         *  the end. Returns null once there are none left.
         */
        private Picture nextUnclaimed() {
           while (pn < nonU1based.size()) {
              Picture p = nonU1based.get(pn);
              pn++;
              if (p != null) {
                 return p;
              }
           }
           return null;
        }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.microsoft;

	import java.io.FileNotFoundException;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;

	import org.apache.poi.hwpf.HWPFDocument;
	import org.apache.poi.hwpf.HWPFOldDocument;
	import org.apache.poi.hwpf.OldWordFileFormatException;
	import org.apache.poi.hwpf.extractor.Word6Extractor;
	import org.apache.poi.hwpf.model.PicturesTable;
	import org.apache.poi.hwpf.model.StyleDescription;
	import org.apache.poi.hwpf.usermodel.CharacterRun;
	import org.apache.poi.hwpf.usermodel.Paragraph;
	import org.apache.poi.hwpf.usermodel.Picture;
	import org.apache.poi.hwpf.usermodel.Range;
	import org.apache.poi.hwpf.usermodel.Table;
	import org.apache.poi.hwpf.usermodel.TableCell;
	import org.apache.poi.hwpf.usermodel.TableRow;
	import org.apache.poi.poifs.filesystem.DirectoryEntry;
	import org.apache.poi.poifs.filesystem.Entry;
	import org.apache.poi.poifs.filesystem.POIFSFileSystem;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.io.TikaInputStream;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.sax.XHTMLContentHandler;
	import org.xml.sax.SAXException;

	public class WordExtractor extends AbstractPOIFSExtractor {

	/**
	 * Creates a Word extractor for the given parse context.
	 *
	 * @param context the parse context, passed unchanged to the
	 *        {@link AbstractPOIFSExtractor} constructor
	 */
	public WordExtractor(ParseContext context) {
	super(context);
	}

	/**
	 * Turns the Word document stored in the given POIFS filesystem into
	 * XHTML events. The output order is: header text, body paragraphs and
	 * tables, footnotes, comments, endnotes, footer text, any pictures not
	 * emitted so far, and finally the embedded office documents found under
	 * the "ObjectPool" entry.
	 * When POI reports the file as an old Word format, the whole job is
	 * handed to {@link #parseWord6(POIFSFileSystem, XHTMLContentHandler)}.
	 *
	 * @param filesystem the POIFS filesystem holding the document
	 * @param xhtml handler that receives the generated XHTML events
	 * @throws IOException propagated from the POI and embedded-resource calls
	 * @throws SAXException propagated from the content handler
	 * @throws TikaException propagated from the embedded-resource handling
	 */
	protected void parse(
			POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
			throws IOException, SAXException, TikaException {
		final HWPFDocument document;
		try {
			document = new HWPFDocument(filesystem);
		} catch (OldWordFileFormatException oldFormat) {
			// POI rejected the file as an old Word format: use the Word 6 path
			parseWord6(filesystem, xhtml);
			return;
		}

		org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
			new org.apache.poi.hwpf.extractor.WordExtractor(document);

		addTextIfAny(xhtml, "header", poiExtractor.getHeaderText());

		// Grab the list of pictures. As far as we can tell,
		// the pictures should be in order, and may be directly
		// placed or referenced from an anchor
		PicturesTable pictureTable = document.getPicturesTable();
		PicturesSource pictures = new PicturesSource(document);

		// Main body: handleParagraph reports how many extra paragraphs
		// (those of a table) it consumed on top of the one handed to it
		Range body = document.getRange();
		int paragraphIndex = 0;
		while (paragraphIndex < body.numParagraphs()) {
			Paragraph paragraph = body.getParagraph(paragraphIndex);
			int extra = handleParagraph(
					paragraph, 0, body, document, pictures, pictureTable, xhtml);
			paragraphIndex += extra + 1;
		}

		// Notes and comments, each entry as a paragraph of its own
		for (String footnote : poiExtractor.getFootnoteText()) {
			xhtml.element("p", footnote);
		}
		for (String comment : poiExtractor.getCommentsText()) {
			xhtml.element("p", comment);
		}
		for (String endnote : poiExtractor.getEndnoteText()) {
			xhtml.element("p", endnote);
		}

		addTextIfAny(xhtml, "footer", poiExtractor.getFooterText());

		// Whatever pictures nobody claimed while walking the text
		Picture leftover = pictures.nextUnclaimed();
		while (leftover != null) {
			handlePictureCharacterRun(null, leftover, pictures, xhtml);
			leftover = pictures.nextUnclaimed();
		}

		// Embedded office documents: the directories named "_..." that
		// sit below the "ObjectPool" entry
		try {
			DirectoryEntry objectPool =
				(DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");
			for (Entry candidate : objectPool) {
				if (!candidate.getName().startsWith("_")
						|| !(candidate instanceof DirectoryEntry)) {
					continue;
				}
				handleEmbededOfficeDoc((DirectoryEntry) candidate, xhtml);
			}
		} catch (FileNotFoundException ignored) {
			// Deliberately ignored - presumably the document simply has no
			// "ObjectPool" entry, so there is nothing embedded to handle
		}
	}

	/**
	 * Outputs one paragraph. If the paragraph is the start of a top-level
	 * table, the whole table is output instead (as table/tbody/tr/td, with
	 * every cell paragraph handled recursively).
	 * <p>
	 * A paragraph whose style has no description, or whose style has no
	 * name, is output as a plain &lt;p&gt; without a class rather than
	 * failing with an exception.
	 *
	 * @param p the paragraph to output
	 * @param parentTableLevel table level of the caller, 0 outside tables
	 * @param r the range that p was taken from, used to look up its table
	 * @param document the document, used for the style sheet
	 * @param pictures source of the pictures referenced by the runs
	 * @param pictureTable used to detect runs that hold an inline picture
	 * @param xhtml handler receiving the output
	 * @return the number of paragraphs consumed in addition to p: the
	 *         table's paragraph count minus one for a table, otherwise 0
	 */
	private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
			PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
			throws SAXException, IOException, TikaException {
		// Note - a poi bug means we can't currently properly recurse
		// into nested tables, so currently we don't
		if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
			Table t = r.getTable(p);
			xhtml.startElement("table");
			xhtml.startElement("tbody");
			for (int rn = 0; rn < t.numRows(); rn++) {
				TableRow row = t.getRow(rn);
				xhtml.startElement("tr");
				for (int cn = 0; cn < row.numCells(); cn++) {
					TableCell cell = row.getCell(cn);
					xhtml.startElement("td");

					for (int pn = 0; pn < cell.numParagraphs(); pn++) {
						Paragraph cellP = cell.getParagraph(pn);
						handleParagraph(cellP, p.getTableLevel(), cell, document, pictures, pictureTable, xhtml);
					}
					xhtml.endElement("td");
				}
				xhtml.endElement("tr");
			}
			xhtml.endElement("tbody");
			xhtml.endElement("table");
			// p itself is counted by the caller, hence the -1
			return (t.numParagraphs() - 1);
		}

		// The style lookup may give us nothing usable (no description, or a
		// description without a name); treat that as an unstyled paragraph
		StyleDescription style =
			document.getStyleSheet().getStyleDescription(p.getStyleIndex());
		String styleName = (style == null) ? null : style.getName();
		TagAndStyle tas;
		if (styleName == null || styleName.length() == 0) {
			tas = new TagAndStyle("p", null);
		} else {
			tas = buildParagraphTagAndStyle(styleName, (parentTableLevel > 0));
		}

		if (tas.getStyleClass() != null) {
			xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
		} else {
			xhtml.startElement(tas.getTag());
		}

		// Headings are output without bold / italic / strike-through markup
		boolean heading = tas.isHeading();
		for (int j = 0; j < p.numCharacterRuns(); j++) {
			CharacterRun cr = p.getCharacterRun(j);
			String runText = cr.text();

			if (runText.equals("\u0013")) {
				// Start of a field; skip over the runs it consumed
				j += handleSpecialCharacterRuns(p, j, heading, pictures, xhtml);
			} else if (runText.startsWith("\u0008")) {
				// Floating Picture(s), one per character of the run
				for (int pn = 0; pn < runText.length(); pn++) {
					// Assume they're in the order from the unclaimed list...
					Picture picture = pictures.nextUnclaimed();

					// Output
					handlePictureCharacterRun(cr, picture, pictures, xhtml);
				}
			} else if (pictureTable.hasPicture(cr)) {
				// Inline Picture
				Picture picture = pictures.getFor(cr);
				handlePictureCharacterRun(cr, picture, pictures, xhtml);
			} else {
				handleCharacterRun(cr, heading, xhtml);
			}
		}

		xhtml.endElement(tas.getTag());

		return 0;
	}

	/**
	 * Writes the text of a single character run, wrapped in b / i / s
	 * elements for bold, italic and strike-through unless styling is
	 * to be skipped. A run consisting only of a carriage return is dropped.
	 *
	 * @param cr the run to output
	 * @param skipStyling true to output the bare text without b / i / s
	 * @param xhtml handler receiving the output
	 * @throws SAXException propagated from the content handler
	 */
	private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
			throws SAXException {
		String raw = cr.text();

		// Skip trailing newlines
		if ("\r".equals(raw)) {
			return;
		}

		// Work out which styling elements wrap the text
		List<String> openTags = new ArrayList<String>();
		if (!skipStyling) {
			if (cr.isBold()) {
				openTags.add("b");
			}
			if (cr.isItalic()) {
				openTags.add("i");
			}
			if (cr.isStrikeThrough()) {
				openTags.add("s");
			}
		}
		for (String opening : openTags) {
			xhtml.startElement(opening);
		}

		// Clean up the text: carriage returns become newlines, and a
		// trailing table cell end marker is removed
		String cleaned = raw.replace('\r', '\n');
		int lastIndex = cleaned.length() - 1;
		if (lastIndex >= 0 && cleaned.charAt(lastIndex) == '\u0007') {
			cleaned = cleaned.substring(0, lastIndex);
		}
		xhtml.characters(cleaned);

		// Close in the reverse order of opening
		int remaining = openTags.size();
		while (remaining > 0) {
			remaining--;
			xhtml.endElement(openTags.get(remaining));
		}
	}
	/**
	 * Handles a field, i.e. a sequence of character runs of the form
	 * 0x13..text..0x15 or 0x13..control..0x14..text..0x15 .
	 * Nesting is allowed; a nested field is output by a recursive call as
	 * soon as its opening 0x13 is met.
	 * <p>
	 * A control string that starts with HYPERLINK and holds a quoted target
	 * is output as an &lt;a href&gt; element around the text runs. For any
	 * other field only the text runs (and their pictures) are output.
	 *
	 * @param p the paragraph that owns the runs
	 * @param index index within p of the run holding the opening 0x13
	 * @param skipStyling passed on to handleCharacterRun for each text run
	 * @param pictures picture lookup for text runs that hold a picture
	 * @param xhtml handler receiving the output
	 * @return the distance from index to the closing 0x15 run (or to the
	 *         end of the paragraph if there is none); adding it to index
	 *         leaves the caller on the last run that was consumed
	 */
	private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling,
			PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
		List<CharacterRun> controls = new ArrayList<CharacterRun>();
		List<CharacterRun> texts = new ArrayList<CharacterRun>();
		boolean has14 = false;

		// Split it into before and after the 14
		int i;
		for (i = index + 1; i < p.numCharacterRuns(); i++) {
			CharacterRun cr = p.getCharacterRun(i);
			String runText = cr.text();
			if (runText.equals("\u0013")) {
				// Nested, oh joy...
				// Hand over the position of the nested 0x13 itself, just
				// like the top-level caller does. Handing over i+1 made
				// the nested call skip its first run, and left this loop
				// on the nested 0x15, which it then took for its own end.
				i += handleSpecialCharacterRuns(p, i, skipStyling, pictures, xhtml);
			} else if (runText.equals("\u0014")) {
				has14 = true;
			} else if (runText.equals("\u0015")) {
				if (!has14) {
					// No separator, so everything collected was text
					texts = controls;
					controls = new ArrayList<CharacterRun>();
				}
				break;
			} else if (has14) {
				texts.add(cr);
			} else {
				controls.add(cr);
			}
		}

		// Do we need to do something special with this?
		boolean linkWritten = false;
		if (controls.size() > 0) {
			StringBuilder controlText = new StringBuilder();
			for (CharacterRun control : controls) {
				controlText.append(control.text());
			}
			String text = controlText.toString();

			// Only a properly quoted target can be cut out safely; with a
			// single quote the substring bounds would be the wrong way round
			int firstQuote = text.indexOf('"');
			int lastQuote = text.lastIndexOf('"');
			if (text.startsWith("HYPERLINK") && lastQuote > firstQuote) {
				String url = text.substring(firstQuote + 1, lastQuote);
				xhtml.startElement("a", "href", url);
				for (CharacterRun cr : texts) {
					handleCharacterRun(cr, skipStyling, xhtml);
				}
				xhtml.endElement("a");
				linkWritten = true;
			}
		}

		if (!linkWritten) {
			if (controls.size() > 0) {
				// Just output the text ones
				for (CharacterRun cr : texts) {
					if (pictures.hasPicture(cr)) {
						Picture picture = pictures.getFor(cr);
						handlePictureCharacterRun(cr, picture, pictures, xhtml);
					} else {
						handleCharacterRun(cr, skipStyling, xhtml);
					}
				}
			} else {
				// We only had text
				// Output as-is
				for (CharacterRun cr : texts) {
					handleCharacterRun(cr, skipStyling, xhtml);
				}
			}
		}

		// Tell them how many to skip over
		return i - index;
	}

	/**
	 * Outputs an img element for the given picture and, the first time a
	 * picture is seen, passes its content on as an embedded resource.
	 * Does nothing when the picture is null.
	 *
	 * @param cr the run the picture was found in; not used by this method
	 *        (callers pass null for pictures found outside the text)
	 * @param picture the picture to output, may be null
	 * @param pictures supplies the picture number and tracks which pictures
	 *        have already been passed on
	 * @param xhtml handler receiving the output
	 * @throws IOException if the embedded resource handling or closing the
	 *         picture stream fails
	 */
	private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml)
			throws SAXException, IOException, TikaException {
		if (picture == null) {
			// Oh dear, we've run out...
			// Probably caused by multiple \u0008 images referencing
			// the same real image
			return;
		}

		// Which one is it?
		String extension = picture.suggestFileExtension();
		int pictureNumber = pictures.pictureNumber(picture);

		// Make up a name for the picture
		// There isn't one in the file, but we need to be able to reference
		// the picture from the img tag and the embedded resource
		String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : "");

		// Grab the mime type for the picture
		String mimeType = picture.getMimeType();

		// Output the img tag
		xhtml.startElement("img", "src", "embedded:" + filename);
		xhtml.endElement("img");

		// Have we already output this one?
		// (Only expose each individual image once)
		if (!pictures.hasOutput(picture)) {
			TikaInputStream stream = TikaInputStream.get(picture.getContent());
			try {
				handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
			} finally {
				// Always release the stream, and whatever resources it
				// picked up while the embedded resource was being handled
				stream.close();
			}
			pictures.recordOutput(picture);
		}
	}

	/**
	 * Emits the given text as a paragraph inside a &lt;div/&gt; of the
	 * given class. Null and empty text produce no output at all.
	 *
	 * @param xhtml XHTML content handler
	 * @param section the class of the &lt;div/&gt; section emitted
	 * @param text text to be emitted, if any
	 * @throws SAXException if an error occurs
	 */
	private void addTextIfAny(
			XHTMLContentHandler xhtml, String section, String text)
			throws SAXException {
		boolean nothingToEmit = (text == null) || (text.length() == 0);
		if (nothingToEmit) {
			return;
		}

		xhtml.startElement("div", "class", section);
		xhtml.element("p", text);
		xhtml.endElement("div");
	}

	/**
	 * Fallback used by parse when POI reports an old Word file format:
	 * every paragraph text returned by POI's Word6Extractor is output as
	 * a plain &lt;p&gt; element.
	 *
	 * @param filesystem the POIFS filesystem holding the document
	 * @param xhtml handler receiving the output
	 */
	protected void parseWord6(
			POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
			throws IOException, SAXException, TikaException {
		HWPFOldDocument oldDocument = new HWPFOldDocument(filesystem);
		Word6Extractor word6 = new Word6Extractor(oldDocument);

		for (String paragraphText : word6.getParagraphText()) {
			xhtml.element("p", paragraphText);
		}
	}

	/**
	* Given a style name, return what tag should be used, and
	* what style should be applied to it.
	*/
	public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
	String tag = "p";
	String styleClass = null;

	if(styleName.equals("Default") \|\| styleName.equals("Normal")) {
	// Already setup
	} else if(styleName.equals("Table Contents") && isTable) {
	// Already setup
	} else if(styleName.equals("heading") \|\| styleName.equals("Heading")) {
	tag = "h1";
	} else if(styleName.startsWith("heading") \|\| styleName.startsWith("Heading")) {
	// "Heading 3" or "Heading2" or "heading 4"
	int num = 1;
	try {
	num = Integer.parseInt(
	styleName.substring(styleName.length()-1)
	);
	} catch(NumberFormatException e) {}
	tag = "h"+num;
	} else if(styleName.equals("Title")) {
	tag = "h1";
	styleClass = "title";
	} else if(styleName.equals("Subtitle")) {
	tag = "h2";
	styleClass = "subtitle";
	} else if(styleName.equals("HTML Preformatted")) {
	tag = "pre";
	} else {
	styleClass = styleName.replace(' ', '_');
	styleClass = styleClass.substring(0,1).toLowerCase() +
	styleClass.substring(1);
	}

	return new TagAndStyle(tag,styleClass);
	}

	/**
	 * Holds the XHTML tag to use for a paragraph, together with the
	 * value of its class attribute (null when there is none).
	 */
	public static class TagAndStyle {
		private final String tag;
		private final String styleClass;

		/**
		 * @param tag the element name, e.g. p or h1
		 * @param styleClass the class attribute value, or null for none
		 */
		public TagAndStyle(String tag, String styleClass) {
			this.styleClass = styleClass;
			this.tag = tag;
		}

		/** @return the element name */
		public String getTag() {
			return this.tag;
		}

		/** @return the class attribute value, may be null */
		public String getStyleClass() {
			return this.styleClass;
		}

		/**
		 * @return true for a two character tag starting with h
		 */
		public boolean isHeading() {
			if (tag.length() != 2) {
				return false;
			}
			return tag.charAt(0) == 'h';
		}
	}

	/**
	 * Provides access to the pictures of a document, both by the picture
	 * offset of a character run, and by iteration over the ones that
	 * no character run of the main text claims.
	 */
	private static class PicturesSource {
		private final PicturesTable picturesTable;
		// Pictures whose content has already been passed on
		private final Set<Picture> output = new HashSet<Picture>();
		// Picture offset -> picture
		private final Map<Integer,Picture> lookup;
		// Same order as "all", with the claimed pictures replaced by null
		private final List<Picture> nonU1based;
		private final List<Picture> all;
		// Position of the next candidate for nextUnclaimed()
		private int pn = 0;

		private PicturesSource(HWPFDocument doc) {
			picturesTable = doc.getPicturesTable();
			all = picturesTable.getAllPictures();

			// Compute the Offset-Picture lookup
			// The suggested file name, minus any extension, is read as the
			// offset in hex
			lookup = new HashMap<Integer, Picture>();
			for (Picture p : all) {
				// TODO Make this nicer when POI 3.7 is out
				String name = p.suggestFullFileName();
				int dot = name.indexOf('.');
				if (dot > -1) {
					name = name.substring(0, dot);
				}
				int offset = Integer.parseInt(name, 16);
				lookup.put(offset, p);
			}

			// Work out which Pictures aren't referenced by
			// a picture character run in the main text
			// These are escher floating ones, ones
			// found outside the normal text, and who
			// knows what else...
			nonU1based = new ArrayList<Picture>();
			nonU1based.addAll(all);
			Range r = doc.getRange();
			for (int i = 0; i < r.numCharacterRuns(); i++) {
				CharacterRun cr = r.getCharacterRun(i);
				if (picturesTable.hasPicture(cr)) {
					Picture p = getFor(cr);
					// Nothing to claim if the offset is unknown, or if an
					// earlier run already claimed the same picture. In both
					// cases there is no valid index to clear.
					int at = (p == null) ? -1 : nonU1based.indexOf(p);
					if (at >= 0) {
						nonU1based.set(at, null);
					}
				}
			}
		}

		/** Does the picture table report a picture for this run? */
		private boolean hasPicture(CharacterRun cr) {
			return picturesTable.hasPicture(cr);
		}

		/** Remembers that the content of the picture has been output */
		private void recordOutput(Picture picture) {
			output.add(picture);
		}
		/** Has the content of the picture already been output? */
		private boolean hasOutput(Picture picture) {
			return output.contains(picture);
		}

		/** The 1-based position of the picture in the list of all pictures */
		private int pictureNumber(Picture picture) {
			return all.indexOf(picture) + 1;
		}

		/** The picture at the picture offset of the run, null if unknown */
		private Picture getFor(CharacterRun cr) {
			return lookup.get(cr.getPicOffset());
		}

		/**
		 * Return the next unclaimed one, used towards
		 * the end. Returns null once there are none left.
		 */
		private Picture nextUnclaimed() {
			while (pn < nonU1based.size()) {
				Picture p = nonU1based.get(pn);
				pn++;
				if (p != null) {
					return p;
				}
			}
			return null;
		}
	}
	}