| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft; |
| |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| |
| import java.io.FileNotFoundException; |
| import java.io.IOException; |
| import java.util.ArrayDeque; |
| import java.util.ArrayList; |
| import java.util.Deque; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.poi.hwpf.HWPFDocument; |
| import org.apache.poi.hwpf.HWPFOldDocument; |
| import org.apache.poi.hwpf.OldWordFileFormatException; |
| import org.apache.poi.hwpf.extractor.Word6Extractor; |
| import org.apache.poi.hwpf.model.FieldsDocumentPart; |
| import org.apache.poi.hwpf.model.PicturesTable; |
| import org.apache.poi.hwpf.model.SavedByEntry; |
| import org.apache.poi.hwpf.model.SavedByTable; |
| import org.apache.poi.hwpf.model.StyleDescription; |
| import org.apache.poi.hwpf.usermodel.CharacterRun; |
| import org.apache.poi.hwpf.usermodel.Field; |
| import org.apache.poi.hwpf.usermodel.HeaderStories; |
| import org.apache.poi.hwpf.usermodel.Paragraph; |
| import org.apache.poi.hwpf.usermodel.Picture; |
| import org.apache.poi.hwpf.usermodel.Range; |
| import org.apache.poi.hwpf.usermodel.Table; |
| import org.apache.poi.hwpf.usermodel.TableCell; |
| import org.apache.poi.hwpf.usermodel.TableRow; |
| import org.apache.poi.poifs.filesystem.DirectoryEntry; |
| import org.apache.poi.poifs.filesystem.DirectoryNode; |
| import org.apache.poi.poifs.filesystem.Entry; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| import org.apache.tika.exception.EncryptedDocumentException; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| public class WordExtractor extends AbstractPOIFSExtractor { |
| |
// Replacement for char 30, which the character-run clean-up maps to a non-breaking hyphen
private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
// Replacement for char 31, which the character-run clean-up maps to a zero-width space
private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
// could be improved by using the real delimiter in xchFollow
// [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
private static final String LIST_DELIMITER = " ";
// Tag/style used whenever a paragraph has no more specific mapping
private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
// Style names that map to a fixed tag/style, looked up by exact name
private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<>();

static {
    // Plain paragraphs
    for (String plainStyle : new String[]{"Default", "Normal"}) {
        fixedParagraphStyles.put(plainStyle, defaultParagraphStyle);
    }
    // Un-numbered heading styles become top-level headings
    fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
    fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
    // Titles keep a style class so they can be told apart from ordinary headings
    fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
    fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
    // Preformatted text
    fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
}
| |
/** Metadata object of the document being parsed; receives the "saved by" locations. */
private final Metadata metadata;

/** Formatting-tag state handed to {@code FormattingUtils} while character runs are emitted. */
private final Deque<FormattingUtils.Tag> formattingState = new ArrayDeque<>();

/**
 * Creates an extractor bound to the given parse context.
 *
 * @param context  parse context, passed on to the superclass
 * @param metadata metadata object that this extractor adds entries to
 */
public WordExtractor(ParseContext context, Metadata metadata) {
    super(context);
    this.metadata = metadata;
}
| |
| private static int countParagraphs(Range... ranges) { |
| int count = 0; |
| for (Range r : ranges) { |
| if (r != null) { |
| count += r.numParagraphs(); |
| } |
| } |
| return count; |
| } |
| |
| /** |
| * Given a style name, return what tag should be used, and |
| * what style should be applied to it. |
| */ |
| public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) { |
| |
| if (styleName == null || styleName.length() < 2) { |
| return defaultParagraphStyle; |
| } |
| |
| TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName); |
| if (tagAndStyle != null) { |
| return tagAndStyle; |
| } |
| |
| if (styleName.equals("Table Contents") && isTable) { |
| return defaultParagraphStyle; |
| } |
| |
| String tag = "p"; |
| String styleClass = null; |
| |
| if (styleName.startsWith("heading") || styleName.startsWith("Heading")) { |
| // "Heading 3" or "Heading2" or "heading 4" |
| int num = 1; |
| try { |
| num = Integer.parseInt(styleName.substring(styleName.length() - 1)); |
| } catch (NumberFormatException e) { |
| //swallow |
| } |
| // Turn it into a H1 - H6 (H7+ isn't valid!) |
| tag = "h" + Math.min(num, 6); |
| } else { |
| styleClass = styleName.replace(' ', '_'); |
| styleClass = |
| styleClass.substring(0, 1).toLowerCase(Locale.ROOT) + styleClass.substring(1); |
| } |
| |
| return new TagAndStyle(tag, styleClass); |
| } |
| |
| protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml) |
| throws IOException, SAXException, TikaException { |
| parse(filesystem.getRoot(), xhtml); |
| } |
| |
| protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) |
| throws IOException, SAXException, TikaException { |
| HWPFDocument document; |
| try { |
| document = new HWPFDocument(root); |
| } catch (org.apache.poi.EncryptedDocumentException e) { |
| throw new EncryptedDocumentException(e); |
| } catch (OldWordFileFormatException e) { |
| parseWord6(root, xhtml); |
| return; |
| } |
| |
| extractSavedByMetadata(document); |
| |
| org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = |
| new org.apache.poi.hwpf.extractor.WordExtractor(document); |
| |
| // Grab the list of pictures. As far as we can tell, |
| // the pictures should be in order, and may be directly |
| // placed or referenced from an anchor |
| PicturesTable pictureTable = document.getPicturesTable(); |
| PicturesSource pictures = new PicturesSource(document); |
| HeaderStories headerFooter = null; |
| // Do any headers, if present |
| if (officeParserConfig.isIncludeHeadersAndFooters()) { |
| headerFooter = new HeaderStories(document); |
| Range[] headers = new Range[]{headerFooter.getFirstHeaderSubrange(), |
| headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange()}; |
| handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); |
| } |
| // Do the main paragraph text |
| Range r = document.getRange(); |
| ListManager listManager = new ListManager(document); |
| for (int i = 0; i < r.numParagraphs(); i++) { |
| Paragraph p = r.getParagraph(i); |
| i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, |
| listManager, xhtml); |
| } |
| |
| if (officeParserConfig.isIncludeShapeBasedContent()) { |
| // Do everything else |
| for (String paragraph : wordExtractor.getMainTextboxText()) { |
| xhtml.element("p", paragraph); |
| } |
| } |
| |
| for (String paragraph : wordExtractor.getFootnoteText()) { |
| xhtml.element("p", paragraph); |
| } |
| |
| for (String paragraph : wordExtractor.getCommentsText()) { |
| xhtml.element("p", paragraph); |
| } |
| |
| for (String paragraph : wordExtractor.getEndnoteText()) { |
| xhtml.element("p", paragraph); |
| } |
| |
| if (officeParserConfig.isIncludeHeadersAndFooters()) { |
| // Do any footers, if present |
| Range[] footers = new Range[]{headerFooter.getFirstFooterSubrange(), |
| headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange()}; |
| handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); |
| } |
| // Handle any pictures that we haven't output yet |
| for (Picture p = pictures.nextUnclaimed(); p != null; ) { |
| handlePictureCharacterRun(null, p, pictures, xhtml); |
| p = pictures.nextUnclaimed(); |
| } |
| |
| // Handle any embeded office documents |
| try { |
| DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); |
| for (Entry entry : op) { |
| if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { |
| handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); |
| } |
| } |
| } catch (FileNotFoundException e) { |
| //swallow |
| } |
| } |
| |
| private void extractSavedByMetadata(HWPFDocument document) { |
| SavedByTable savedByTable = document.getSavedByTable(); |
| if (savedByTable == null) { |
| return; |
| } |
| for (SavedByEntry sbe : savedByTable.getEntries()) { |
| metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation()); |
| } |
| } |
| |
| private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, |
| PicturesSource pictures, PicturesTable pictureTable, |
| XHTMLContentHandler xhtml) |
| throws SAXException, IOException, TikaException { |
| if (countParagraphs(ranges) > 0) { |
| xhtml.startElement("div", "class", type); |
| ListManager listManager = new ListManager(document); |
| for (Range r : ranges) { |
| if (r != null) { |
| for (int i = 0; i < r.numParagraphs(); i++) { |
| Paragraph p = r.getParagraph(i); |
| |
| i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures, |
| pictureTable, listManager, xhtml); |
| } |
| } |
| } |
| xhtml.endElement("div"); |
| } |
| } |
| |
| private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, |
| FieldsDocumentPart docPart, PicturesSource pictures, |
| PicturesTable pictureTable, ListManager listManager, |
| XHTMLContentHandler xhtml) |
| throws SAXException, IOException, TikaException { |
| // Note - a poi bug means we can't currently properly recurse |
| // into nested tables, so currently we don't |
| if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { |
| Table t = r.getTable(p); |
| xhtml.startElement("table"); |
| xhtml.startElement("tbody"); |
| for (int rn = 0; rn < t.numRows(); rn++) { |
| TableRow row = t.getRow(rn); |
| xhtml.startElement("tr"); |
| for (int cn = 0; cn < row.numCells(); cn++) { |
| TableCell cell = row.getCell(cn); |
| xhtml.startElement("td"); |
| |
| for (int pn = 0; pn < cell.numParagraphs(); pn++) { |
| Paragraph cellP = cell.getParagraph(pn); |
| handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, |
| pictureTable, listManager, xhtml); |
| } |
| xhtml.endElement("td"); |
| } |
| xhtml.endElement("tr"); |
| } |
| xhtml.endElement("tbody"); |
| xhtml.endElement("table"); |
| return (t.numParagraphs() - 1); |
| } |
| |
| String text = p.text(); |
| if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { |
| // Skip empty paragraphs |
| return 0; |
| } |
| |
| TagAndStyle tas; |
| String numbering = null; |
| |
| if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { |
| StyleDescription style = |
| document.getStyleSheet().getStyleDescription(p.getStyleIndex()); |
| if (style != null && style.getName() != null && style.getName().length() > 0) { |
| if (p.isInList()) { |
| numbering = listManager.getFormattedNumber(p); |
| } |
| tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); |
| } else { |
| tas = new TagAndStyle("p", null); |
| } |
| } else { |
| tas = new TagAndStyle("p", null); |
| } |
| |
| if (tas.getStyleClass() != null) { |
| xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); |
| } else { |
| xhtml.startElement(tas.getTag()); |
| } |
| |
| if (numbering != null) { |
| xhtml.characters(numbering); |
| } |
| |
| for (int j = 0; j < p.numCharacterRuns(); j++) { |
| CharacterRun cr = p.getCharacterRun(j); |
| |
| // FIELD_BEGIN_MARK: |
| if (cr.text().getBytes(UTF_8)[0] == 0x13) { |
| Field field = |
| document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); |
| // 58 is an embedded document |
| // 56 is a document link |
| if (field != null && (field.getType() == 58 || field.getType() == 56)) { |
| // Embedded Object: add a <div |
| // class="embedded" id="_X"/> so consumer can see where |
| // in the main text each embedded document |
| // occurred: |
| String id = "_unknown_id"; |
| //this can return null (TIKA-1956) |
| CharacterRun mscr = field.getMarkSeparatorCharacterRun(r); |
| if (mscr != null) { |
| id = "_" + mscr.getPicOffset(); |
| } |
| AttributesImpl attributes = new AttributesImpl(); |
| attributes.addAttribute("", "class", "class", "CDATA", "embedded"); |
| attributes.addAttribute("", "id", "id", "CDATA", id); |
| xhtml.startElement("div", attributes); |
| xhtml.endElement("div"); |
| } |
| } |
| |
| if (cr.text().equals("\u0013")) { |
| j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); |
| } else if (cr.text().startsWith("\b")) { //\u0008" |
| // Floating Picture(s) |
| for (int pn = 0; pn < cr.text().length(); pn++) { |
| // Assume they're in the order from the unclaimed list... |
| Picture picture = pictures.nextUnclaimed(); |
| |
| // Output |
| handlePictureCharacterRun(cr, picture, pictures, xhtml); |
| } |
| } else if (pictureTable.hasPicture(cr)) { |
| // Inline Picture |
| Picture picture = pictures.getFor(cr); |
| handlePictureCharacterRun(cr, picture, pictures, xhtml); |
| } else { |
| handleCharacterRun(cr, tas.isHeading(), xhtml); |
| } |
| } |
| |
| closeStyleElements(false, xhtml); |
| |
| xhtml.endElement(tas.getTag()); |
| |
| return 0; |
| } |
| |
| private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) |
| throws SAXException { |
| // Skip trailing newlines |
| if (!isRendered(cr) || cr.text().equals("\r")) { |
| return; |
| } |
| |
| if (!skipStyling) { |
| FormattingUtils |
| .ensureFormattingState(xhtml, FormattingUtils.toTags(cr), formattingState); |
| } |
| |
| // Clean up the text |
| String text = cr.text(); |
| text = text.replace('\r', '\n'); |
| if (text.endsWith("\u0007")) { |
| // Strip the table cell end marker |
| text = text.substring(0, text.length() - 1); |
| } |
| |
| // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: |
| |
| // Non-breaking hyphens are returned as char 30 |
| text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); |
| |
| // Non-required hyphens to zero-width space |
| text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); |
| |
| // Control characters as line break |
| text = text.replaceAll("[\u0000-\u001f]", "\n"); |
| xhtml.characters(text); |
| } |
| |
| /** |
| * Can be \13..text..\15 or \13..control..\14..text..\15 . |
| * Nesting is allowed |
| */ |
| private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, |
| PicturesSource pictures, XHTMLContentHandler xhtml) |
| throws SAXException, TikaException, IOException { |
| List<CharacterRun> controls = new ArrayList<>(); |
| List<CharacterRun> texts = new ArrayList<>(); |
| boolean has14 = false; |
| |
| // Split it into before and after the 14 |
| int i; |
| for (i = index + 1; i < p.numCharacterRuns(); i++) { |
| CharacterRun cr = p.getCharacterRun(i); |
| if (cr.text().equals("\u0013")) { |
| // Nested, oh joy... |
| int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml); |
| i += increment; |
| } else if (cr.text().equals("\u0014")) { |
| has14 = true; |
| } else if (cr.text().equals("\u0015")) { |
| if (!has14) { |
| texts = controls; |
| controls = new ArrayList<>(); |
| } |
| break; |
| } else if (cr.text().equals("\u0014\u0015")) { |
| has14 = true; |
| } else { |
| if (has14) { |
| texts.add(cr); |
| } else { |
| controls.add(cr); |
| } |
| } |
| } |
| |
| // Do we need to do something special with this? |
| if (controls.size() > 0) { |
| StringBuilder text = new StringBuilder(controls.get(0).text()); |
| for (int j = 1; j < controls.size(); j++) { |
| text.append(controls.get(j).text()); |
| } |
| |
| if ((text.toString().startsWith("HYPERLINK") || text.toString().startsWith(" HYPERLINK")) |
| && text.toString().indexOf('"') > -1) { |
| int start = text.toString().indexOf('"') + 1; |
| int end = findHyperlinkEnd(text.toString(), start); |
| String url = ""; |
| if (start >= 0 && start < end && end <= text.length()) { |
| url = text.substring(start, end); |
| } |
| |
| closeStyleElements(skipStyling, xhtml); |
| xhtml.startElement("a", "href", url); |
| for (CharacterRun cr : texts) { |
| handleCharacterRun(cr, skipStyling, xhtml); |
| } |
| closeStyleElements(skipStyling, xhtml); |
| xhtml.endElement("a"); |
| } else { |
| // Just output the text ones |
| for (CharacterRun cr : texts) { |
| if (pictures.hasPicture(cr)) { |
| Picture picture = pictures.getFor(cr); |
| handlePictureCharacterRun(cr, picture, pictures, xhtml); |
| } else { |
| handleCharacterRun(cr, skipStyling, xhtml); |
| } |
| } |
| } |
| } else { |
| // We only had text |
| // Output as-is |
| for (CharacterRun cr : texts) { |
| handleCharacterRun(cr, skipStyling, xhtml); |
| } |
| } |
| |
| // Tell them how many to skip over |
| return i - index; |
| } |
| |
| private void closeStyleElements(boolean skipStyling, XHTMLContentHandler xhtml) |
| throws SAXException { |
| if (skipStyling) { |
| return; |
| } |
| FormattingUtils.closeStyleTags(xhtml, formattingState); |
| } |
| |
| //temporary work around for TIKA-1512 |
| private int findHyperlinkEnd(String text, int start) { |
| int end = text.lastIndexOf('"'); |
| if (end > start) { |
| return end; |
| } |
| end = text.lastIndexOf('\u201D');//smart right double quote |
| if (end > start) { |
| return end; |
| } |
| end = text.lastIndexOf('\r'); |
| if (end > start) { |
| return end; |
| } |
| //if nothing so far, take the full length of the string |
| //If the full string is > 256 characters, it appears |
| //that the url is truncated in the .doc file. This |
| //will return the value as it is in the file, which |
| //may be incorrect; but it is the same behavior as opening |
| //the link in MSWord. |
| //This code does not currently check that length is actually >= 256. |
| //we might want to add that? |
| return text.length(); |
| } |
| |
| private void handlePictureCharacterRun(CharacterRun cr, Picture picture, |
| PicturesSource pictures, XHTMLContentHandler xhtml) |
| throws SAXException, IOException, TikaException { |
| if (!isRendered(cr) || picture == null) { |
| // Oh dear, we've run out... |
| // Probably caused by multiple \u0008 images referencing |
| // the same real image |
| return; |
| } |
| |
| // Which one is it? |
| String extension = picture.suggestFileExtension(); |
| int pictureNumber = pictures.pictureNumber(picture); |
| |
| // Make up a name for the picture |
| // There isn't one in the file, but we need to be able to reference |
| // the picture from the img tag and the embedded resource |
| String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : ""); |
| |
| // Grab the mime type for the picture |
| String mimeType = picture.getMimeType(); |
| |
| // Output the img tag |
| AttributesImpl attr = new AttributesImpl(); |
| attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); |
| attr.addAttribute("", "alt", "alt", "CDATA", filename); |
| xhtml.startElement("img", attr); |
| xhtml.endElement("img"); |
| |
| // Have we already output this one? |
| // (Only expose each individual image once) |
| if (!pictures.hasOutput(picture)) { |
| TikaInputStream stream = TikaInputStream.get(picture.getContent()); |
| handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); |
| pictures.recordOutput(picture); |
| } |
| } |
| |
| /** |
| * Outputs a section of text if the given text is non-empty. |
| * |
| * @param xhtml XHTML content handler |
| * @param section the class of the <div/> section emitted |
| * @param text text to be emitted, if any |
| * @throws SAXException if an error occurs |
| */ |
| private void addTextIfAny(XHTMLContentHandler xhtml, String section, String text) |
| throws SAXException { |
| if (text != null && text.length() > 0) { |
| xhtml.startElement("div", "class", section); |
| xhtml.element("p", text); |
| xhtml.endElement("div"); |
| } |
| } |
| |
| protected void parseWord6(POIFSFileSystem filesystem, XHTMLContentHandler xhtml) |
| throws IOException, SAXException, TikaException { |
| parseWord6(filesystem.getRoot(), xhtml); |
| } |
| |
| protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml) |
| throws IOException, SAXException { |
| Word6Extractor extractor; |
| try (HWPFOldDocument doc = new HWPFOldDocument(root)) { |
| extractor = new Word6Extractor(doc); |
| } |
| |
| for (String p : extractor.getParagraphText()) { |
| xhtml.element("p", p); |
| } |
| } |
| |
| /** |
| * Determines if character run should be included in the extraction. |
| * |
| * @param cr character run. |
| * @return true if character run should be included in extraction. |
| */ |
| private boolean isRendered(final CharacterRun cr) { |
| if (cr == null) { |
| return true; |
| } |
| return !cr.isMarkedDeleted() || |
| (cr.isMarkedDeleted() && officeParserConfig.isIncludeDeletedContent()); |
| } |
| |
| public static class TagAndStyle { |
| private String tag; |
| private String styleClass; |
| |
| public TagAndStyle(String tag, String styleClass) { |
| this.tag = tag; |
| this.styleClass = styleClass; |
| } |
| |
| public String getTag() { |
| return tag; |
| } |
| |
| public String getStyleClass() { |
| return styleClass; |
| } |
| |
| public boolean isHeading() { |
| return tag.length() == 2 && tag.startsWith("h"); |
| } |
| } |
| |
| /** |
| * Provides access to the pictures both by offset, iteration |
| * over the un-claimed, and peeking forward |
| * <p> |
| * TODO When POI 3.18 is out, replace this with PictureRunMapper, |
| * which is this class ported over into POI core |
| */ |
| private static class PicturesSource { |
| private PicturesTable picturesTable; |
| private Set<Picture> output = new HashSet<>(); |
| private Map<Integer, Picture> lookup; |
| private List<Picture> nonU1based; |
| private List<Picture> all; |
| private int pn = 0; |
| |
| private PicturesSource(HWPFDocument doc) { |
| picturesTable = doc.getPicturesTable(); |
| all = picturesTable.getAllPictures(); |
| |
| // Build the Offset-Picture lookup map |
| lookup = new HashMap<>(); |
| for (Picture p : all) { |
| lookup.put(p.getStartOffset(), p); |
| } |
| |
| // Work out which Pictures aren't referenced by |
| // a \u0001 in the main text |
| // These are \u0008 escher floating ones, ones |
| // found outside the normal text, and who |
| // knows what else... |
| nonU1based = new ArrayList<>(); |
| nonU1based.addAll(all); |
| Range r = doc.getRange(); |
| for (int i = 0; i < r.numCharacterRuns(); i++) { |
| CharacterRun cr = r.getCharacterRun(i); |
| if (picturesTable.hasPicture(cr)) { |
| Picture p = getFor(cr); |
| int at = nonU1based.indexOf(p); |
| nonU1based.set(at, null); |
| } |
| } |
| } |
| |
| private boolean hasPicture(CharacterRun cr) { |
| return picturesTable.hasPicture(cr); |
| } |
| |
| private void recordOutput(Picture picture) { |
| output.add(picture); |
| } |
| |
| private boolean hasOutput(Picture picture) { |
| return output.contains(picture); |
| } |
| |
| private int pictureNumber(Picture picture) { |
| return all.indexOf(picture) + 1; |
| } |
| |
| private Picture getFor(CharacterRun cr) { |
| return lookup.get(cr.getPicOffset()); |
| } |
| |
| /** |
| * Return the next unclaimed one, used towards |
| * the end |
| */ |
| private Picture nextUnclaimed() { |
| Picture p = null; |
| while (pn < nonU1based.size()) { |
| p = nonU1based.get(pn); |
| pn++; |
| if (p != null) { |
| return p; |
| } |
| } |
| return null; |
| } |
| } |
| } |