| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft; |
| |
| import java.awt.Point; |
| import java.io.IOException; |
| import java.text.NumberFormat; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.SortedMap; |
| import java.util.TreeMap; |
| |
| import org.apache.poi.ddf.EscherBSERecord; |
| import org.apache.poi.ddf.EscherBlipRecord; |
| import org.apache.poi.ddf.EscherRecord; |
| import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; |
| import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; |
| import org.apache.poi.hssf.eventusermodel.HSSFListener; |
| import org.apache.poi.hssf.eventusermodel.HSSFRequest; |
| import org.apache.poi.hssf.extractor.OldExcelExtractor; |
| import org.apache.poi.hssf.record.BOFRecord; |
| import org.apache.poi.hssf.record.BoundSheetRecord; |
| import org.apache.poi.hssf.record.CellValueRecordInterface; |
| import org.apache.poi.hssf.record.CountryRecord; |
| import org.apache.poi.hssf.record.DateWindow1904Record; |
| import org.apache.poi.hssf.record.DrawingGroupRecord; |
| import org.apache.poi.hssf.record.EOFRecord; |
| import org.apache.poi.hssf.record.ExtendedFormatRecord; |
| import org.apache.poi.hssf.record.FooterRecord; |
| import org.apache.poi.hssf.record.FormatRecord; |
| import org.apache.poi.hssf.record.FormulaRecord; |
| import org.apache.poi.hssf.record.HeaderRecord; |
| import org.apache.poi.hssf.record.HyperlinkRecord; |
| import org.apache.poi.hssf.record.LabelRecord; |
| import org.apache.poi.hssf.record.LabelSSTRecord; |
| import org.apache.poi.hssf.record.NumberRecord; |
| import org.apache.poi.hssf.record.RKRecord; |
| import org.apache.poi.hssf.record.Record; |
| import org.apache.poi.hssf.record.SSTRecord; |
| import org.apache.poi.hssf.record.StringRecord; |
| import org.apache.poi.hssf.record.TextObjectRecord; |
| import org.apache.poi.hssf.record.chart.SeriesTextRecord; |
| import org.apache.poi.hssf.record.common.UnicodeString; |
| import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; |
| import org.apache.poi.hssf.usermodel.HSSFPictureData; |
| import org.apache.poi.poifs.filesystem.DirectoryEntry; |
| import org.apache.poi.poifs.filesystem.DirectoryNode; |
| import org.apache.poi.poifs.filesystem.DocumentInputStream; |
| import org.apache.poi.poifs.filesystem.Entry; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.apache.tika.exception.EncryptedDocumentException; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| import org.xml.sax.SAXException; |
| |
| /** |
| * Excel parser implementation which uses POI's Event API |
| * to handle the contents of a Workbook. |
| * <p/> |
| * The Event API uses a much smaller memory footprint than |
| * <code>HSSFWorkbook</code> when processing excel files |
| * but at the cost of more complexity. |
| * <p/> |
| * With the Event API a <i>listener</i> is registered for |
| * specific record types and those records are created, |
| * fired off to the listener and then discarded as the stream |
| * is being processed. |
| * |
| * @see org.apache.poi.hssf.eventusermodel.HSSFListener |
| * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api"> |
| * POI Event API How To</a> |
| */ |
| public class ExcelExtractor extends AbstractPOIFSExtractor { |
| |
| private static final String WORKBOOK_ENTRY = "Workbook"; |
| private static final String BOOK_ENTRY = "Book"; |
| /** |
| * <code>true</code> if the HSSFListener should be registered |
| * to listen for all records or <code>false</code> (the default) |
| * if the listener should be configured to only receive specified |
| * records. |
| */ |
| private boolean listenForAllRecords = false; |
| |
| public ExcelExtractor(ParseContext context, Metadata metadata) { |
| super(context, metadata); |
| } |
| |
| /** |
| * Returns <code>true</code> if this parser is configured to listen |
| * for all records instead of just the specified few. |
| */ |
| public boolean isListenForAllRecords() { |
| return listenForAllRecords; |
| } |
| |
| /** |
| * Specifies whether this parser should to listen for all |
| * records or just for the specified few. |
| * <p/> |
| * <strong>Note:</strong> Under normal operation this setting should |
| * be <code>false</code> (the default), but you can experiment with |
| * this setting for testing and debugging purposes. |
| * |
| * @param listenForAllRecords <code>true</code> if the HSSFListener |
| * should be registered to listen for all records or <code>false</code> |
| * if the listener should be configured to only receive specified records. |
| */ |
| public void setListenForAllRecords(boolean listenForAllRecords) { |
| this.listenForAllRecords = listenForAllRecords; |
| } |
| |
| /** |
| * Extracts text from an Excel Workbook writing the extracted content |
| * to the specified {@link Appendable}. |
| * |
| * @param filesystem POI file system |
| * @throws IOException if an error occurs processing the workbook |
| * or writing the extracted content |
| */ |
| protected void parse( |
| POIFSFileSystem filesystem, XHTMLContentHandler xhtml, |
| Locale locale) throws IOException, SAXException, TikaException { |
| parse(filesystem.getRoot(), xhtml, locale); |
| } |
| |
| protected void parse( |
| DirectoryNode root, XHTMLContentHandler xhtml, |
| Locale locale) throws IOException, SAXException, TikaException { |
| if (!root.hasEntry(WORKBOOK_ENTRY)) { |
| if (root.hasEntry(BOOK_ENTRY)) { |
| // Excel 5 / Excel 95 file |
| // Records are in a different structure so needs a |
| // different parser to process them |
| OldExcelExtractor extractor = new OldExcelExtractor(root); |
| OldExcelParser.parse(extractor, xhtml); |
| return; |
| } else { |
| // Corrupt file / very old file, just skip text extraction |
| return; |
| } |
| } |
| |
| // If a password was supplied, use it, otherwise the default |
| Biff8EncryptionKey.setCurrentUserPassword(getPassword()); |
| |
| // Have the file processed in event mode |
| TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this, officeParserConfig); |
| listener.processFile(root, isListenForAllRecords()); |
| listener.throwStoredException(); |
| |
| for (Entry entry : root) { |
| if (entry.getName().startsWith("MBD") |
| && entry instanceof DirectoryEntry) { |
| try { |
| handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); |
| } catch (TikaException e) { |
| // ignore parse errors from embedded documents |
| } |
| } |
| } |
| } |
| |
| // ====================================================================== |
| |
| /** |
| * HSSF Listener implementation which processes the HSSF records. |
| */ |
| private static class TikaHSSFListener implements HSSFListener { |
| |
| /** |
| * XHTML content handler to which the document content is rendered. |
| */ |
| private final XHTMLContentHandler handler; |
| |
| /** |
| * The POIFS Extractor, used for embeded resources. |
| */ |
| private final AbstractPOIFSExtractor extractor; |
| /** |
| * Format for rendering numbers in the worksheet. Currently we just |
| * use the platform default formatting. |
| * |
| * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a> |
| */ |
| private final NumberFormat format; |
| |
| private final OfficeParserConfig officeParserConfig; |
| /** |
| * Potential exception thrown by the content handler. When set to |
| * non-<code>null</code>, causes all subsequent HSSF records to be |
| * ignored and the stored exception to be thrown when |
| * {@link #throwStoredException()} is invoked. |
| */ |
| private Exception exception = null; |
| private SSTRecord sstRecord; |
| private FormulaRecord stringFormulaRecord; |
| private short previousSid; |
| /** |
| * Internal <code>FormatTrackingHSSFListener</code> to handle cell |
| * formatting within the extraction. |
| */ |
| private FormatTrackingHSSFListener formatListener; |
| private final TikaExcelDataFormatter tikaExcelDataFormatter; |
| |
| /** |
| * List of worksheet names. |
| */ |
| private List<String> sheetNames = new ArrayList<String>(); |
| /** |
| * Index of the current worksheet within the workbook. |
| * Used to find the worksheet name in the {@link #sheetNames} list. |
| */ |
| private short currentSheetIndex; |
| /** |
| * Content of the current worksheet, or <code>null</code> if no |
| * worksheet is currently active. |
| */ |
| private SortedMap<Point, Cell> currentSheet = null; |
| /** |
| * Extra text or cells that crops up, typically as part of a |
| * worksheet but not always. |
| */ |
| private List<Cell> extraTextCells = new ArrayList<Cell>(); |
| /** |
| * These aren't complete when we first see them, as the |
| * depend on continue records that aren't always |
| * contiguous. Collect them for later processing. |
| */ |
| private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>(); |
| |
| /** |
| * Construct a new listener instance outputting parsed data to |
| * the specified XHTML content handler. |
| * |
| * @param handler Destination to write the parsed output to |
| */ |
| private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor, OfficeParserConfig officeParserConfig) { |
| this.handler = handler; |
| this.extractor = extractor; |
| this.format = NumberFormat.getInstance(locale); |
| this.formatListener = new TikaFormatTrackingHSSFListener(this, locale); |
| this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale); |
| this.officeParserConfig = officeParserConfig; |
| } |
| |
| /** |
| * Entry point to listener to start the processing of a file. |
| * |
| * @param filesystem POI file system. |
| * @param listenForAllRecords sets whether the listener is configured to listen |
| * for all records types or not. |
| * @throws IOException on any IO errors. |
| * @throws SAXException on any SAX parsing errors. |
| */ |
| public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords) |
| throws IOException, SAXException, TikaException { |
| processFile(filesystem.getRoot(), listenForAllRecords); |
| } |
| |
| public void processFile(DirectoryNode root, boolean listenForAllRecords) |
| throws IOException, SAXException, TikaException { |
| // Set up listener and register the records we want to process |
| HSSFRequest hssfRequest = new HSSFRequest(); |
| if (listenForAllRecords) { |
| hssfRequest.addListenerForAllRecords(formatListener); |
| } else { |
| hssfRequest.addListener(formatListener, BOFRecord.sid); |
| hssfRequest.addListener(formatListener, EOFRecord.sid); |
| hssfRequest.addListener(formatListener, DateWindow1904Record.sid); |
| hssfRequest.addListener(formatListener, CountryRecord.sid); |
| hssfRequest.addListener(formatListener, BoundSheetRecord.sid); |
| hssfRequest.addListener(formatListener, SSTRecord.sid); |
| hssfRequest.addListener(formatListener, FormulaRecord.sid); |
| hssfRequest.addListener(formatListener, LabelRecord.sid); |
| hssfRequest.addListener(formatListener, LabelSSTRecord.sid); |
| hssfRequest.addListener(formatListener, NumberRecord.sid); |
| hssfRequest.addListener(formatListener, RKRecord.sid); |
| hssfRequest.addListener(formatListener, StringRecord.sid); |
| hssfRequest.addListener(formatListener, HyperlinkRecord.sid); |
| hssfRequest.addListener(formatListener, TextObjectRecord.sid); |
| hssfRequest.addListener(formatListener, SeriesTextRecord.sid); |
| hssfRequest.addListener(formatListener, FormatRecord.sid); |
| hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid); |
| hssfRequest.addListener(formatListener, DrawingGroupRecord.sid); |
| if (extractor.officeParserConfig.getIncludeHeadersAndFooters()) { |
| hssfRequest.addListener(formatListener, HeaderRecord.sid); |
| hssfRequest.addListener(formatListener, FooterRecord.sid); |
| } |
| } |
| |
| // Create event factory and process Workbook (fire events) |
| DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY); |
| HSSFEventFactory eventFactory = new HSSFEventFactory(); |
| try { |
| eventFactory.processEvents(hssfRequest, documentInputStream); |
| } catch (org.apache.poi.EncryptedDocumentException e) { |
| throw new EncryptedDocumentException(e); |
| } |
| |
| // Output any extra text that came after all the sheets |
| processExtraText(); |
| |
| // Look for embeded images, now that the drawing records |
| // have been fully matched with their continue data |
| for (DrawingGroupRecord dgr : drawingGroups) { |
| dgr.decode(); |
| findPictures(dgr.getEscherRecords()); |
| } |
| } |
| |
| /** |
| * Process a HSSF record. |
| * |
| * @param record HSSF Record |
| */ |
| public void processRecord(Record record) { |
| if (exception == null) { |
| try { |
| internalProcessRecord(record); |
| } catch (TikaException te) { |
| exception = te; |
| } catch (IOException ie) { |
| exception = ie; |
| } catch (SAXException se) { |
| exception = se; |
| } |
| } |
| } |
| |
| public void throwStoredException() throws TikaException, SAXException, IOException { |
| if (exception != null) { |
| if (exception instanceof IOException) |
| throw (IOException) exception; |
| if (exception instanceof SAXException) |
| throw (SAXException) exception; |
| if (exception instanceof TikaException) |
| throw (TikaException) exception; |
| throw new TikaException(exception.getMessage()); |
| } |
| } |
| |
| private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException { |
| switch (record.getSid()) { |
| case BOFRecord.sid: // start of workbook, worksheet etc. records |
| BOFRecord bof = (BOFRecord) record; |
| if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { |
| currentSheetIndex = -1; |
| } else if (bof.getType() == BOFRecord.TYPE_CHART) { |
| if (previousSid == EOFRecord.sid) { |
| // This is a sheet which contains only a chart |
| newSheet(); |
| } else { |
| // This is a chart within a normal sheet |
| // Handling of this is a bit hacky... |
| if (currentSheet != null) { |
| processSheet(); |
| currentSheetIndex--; |
| newSheet(); |
| } |
| } |
| } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { |
| newSheet(); |
| } |
| break; |
| |
| case EOFRecord.sid: // end of workbook, worksheet etc. records |
| if (currentSheet != null) { |
| processSheet(); |
| } |
| currentSheet = null; |
| break; |
| |
| case BoundSheetRecord.sid: // Worksheet index record |
| BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; |
| sheetNames.add(boundSheetRecord.getSheetname()); |
| break; |
| |
| case SSTRecord.sid: // holds all the strings for LabelSSTRecords |
| sstRecord = (SSTRecord) record; |
| break; |
| |
| case FormulaRecord.sid: // Cell value from a formula |
| FormulaRecord formula = (FormulaRecord) record; |
| if (formula.hasCachedResultString()) { |
| // The String itself should be the next record |
| stringFormulaRecord = formula; |
| } else { |
| addTextCell(record, formatListener.formatNumberDateCell(formula)); |
| } |
| break; |
| |
| case StringRecord.sid: |
| if (previousSid == FormulaRecord.sid) { |
| // Cached string value of a string formula |
| StringRecord sr = (StringRecord) record; |
| addTextCell(stringFormulaRecord, sr.getString()); |
| } else { |
| // Some other string not associated with a cell, skip |
| } |
| break; |
| |
| case LabelRecord.sid: // strings stored directly in the cell |
| LabelRecord label = (LabelRecord) record; |
| addTextCell(record, label.getValue()); |
| break; |
| |
| case LabelSSTRecord.sid: // Ref. a string in the shared string table |
| LabelSSTRecord sst = (LabelSSTRecord) record; |
| UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); |
| String cellString = null; |
| if (officeParserConfig.getConcatenatePhoneticRuns()) { |
| String phonetic = (unicode != null |
| && unicode.getExtendedRst() != null |
| && unicode.getExtendedRst().getPhoneticText() != null |
| && unicode.getExtendedRst().getPhoneticText().trim().length() > 0) ? |
| unicode.getExtendedRst().getPhoneticText() : ""; |
| cellString = unicode.getString()+" "+phonetic; |
| } else { |
| cellString = unicode.getString(); |
| } |
| addTextCell(record, cellString); |
| break; |
| |
| case NumberRecord.sid: // Contains a numeric cell value |
| NumberRecord number = (NumberRecord) record; |
| addTextCell(record, formatListener.formatNumberDateCell(number)); |
| break; |
| |
| case RKRecord.sid: // Excel internal number record |
| RKRecord rk = (RKRecord) record; |
| addCell(record, new NumberCell(rk.getRKNumber(), format)); |
| break; |
| |
| case HyperlinkRecord.sid: // holds a URL associated with a cell |
| if (currentSheet != null) { |
| HyperlinkRecord link = (HyperlinkRecord) record; |
| Point point = |
| new Point(link.getFirstColumn(), link.getFirstRow()); |
| Cell cell = currentSheet.get(point); |
| if (cell != null) { |
| String address = link.getAddress(); |
| if (address != null) { |
| addCell(record, new LinkedCell(cell, address)); |
| } else { |
| addCell(record, cell); |
| } |
| } |
| } |
| break; |
| |
| case TextObjectRecord.sid: |
| if (extractor.officeParserConfig.getIncludeShapeBasedContent()) { |
| TextObjectRecord tor = (TextObjectRecord) record; |
| addTextCell(record, tor.getStr().getString()); |
| } |
| break; |
| |
| case SeriesTextRecord.sid: // Chart label or title |
| SeriesTextRecord str = (SeriesTextRecord) record; |
| addTextCell(record, str.getText()); |
| break; |
| |
| case DrawingGroupRecord.sid: |
| // Collect this now, we'll process later when all |
| // the continue records are in |
| drawingGroups.add((DrawingGroupRecord) record); |
| break; |
| |
| case HeaderRecord.sid: |
| if (extractor.officeParserConfig.getIncludeHeadersAndFooters()) { |
| HeaderRecord headerRecord = (HeaderRecord) record; |
| addTextCell(record, headerRecord.getText()); |
| } |
| break; |
| |
| case FooterRecord.sid: |
| if (extractor.officeParserConfig.getIncludeHeadersAndFooters()) { |
| FooterRecord footerRecord = (FooterRecord) record; |
| addTextCell(record, footerRecord.getText()); |
| } |
| break; |
| } |
| |
| previousSid = record.getSid(); |
| |
| if (stringFormulaRecord != record) { |
| stringFormulaRecord = null; |
| } |
| } |
| |
| private void processExtraText() throws SAXException { |
| if (extraTextCells.size() > 0) { |
| for (Cell cell : extraTextCells) { |
| handler.startElement("div", "class", "outside"); |
| cell.render(handler); |
| handler.endElement("div"); |
| } |
| |
| // Reset |
| extraTextCells.clear(); |
| } |
| } |
| |
| /** |
| * Adds the given cell (unless <code>null</code>) to the current |
| * worksheet (if any) at the position (if any) of the given record. |
| * |
| * @param record record that holds the cell value |
| * @param cell cell value (or <code>null</code>) |
| */ |
| private void addCell(Record record, Cell cell) throws SAXException { |
| if (cell == null) { |
| // Ignore empty cells |
| } else if (currentSheet != null |
| && record instanceof CellValueRecordInterface) { |
| // Normal cell inside a worksheet |
| CellValueRecordInterface value = |
| (CellValueRecordInterface) record; |
| Point point = new Point(value.getColumn(), value.getRow()); |
| if (currentSheet.containsKey(point)) { |
| //avoid overwriting content |
| //for now, add to extraTextCells |
| //TODO: consider allowing multiple text pieces |
| //per x,y to keep the text together |
| extraTextCells.add(cell); |
| } else { |
| currentSheet.put(point, cell); |
| } |
| |
| } else { |
| // Cell outside the worksheets |
| extraTextCells.add(cell); |
| } |
| } |
| |
| /** |
| * Adds a text cell with the given text comment. The given text |
| * is trimmed, and ignored if <code>null</code> or empty. |
| * |
| * @param record record that holds the text value |
| * @param text text content, may be <code>null</code> |
| * @throws SAXException |
| */ |
| private void addTextCell(Record record, String text) throws SAXException { |
| if (text != null) { |
| text = text.trim(); |
| if (text.length() > 0) { |
| addCell(record, new TextCell(text)); |
| } |
| } |
| } |
| |
| private void newSheet() { |
| currentSheetIndex++; |
| currentSheet = new TreeMap<Point, Cell>(new PointComparator()); |
| } |
| |
| /** |
| * Process an excel sheet. |
| * |
| * @throws SAXException if an error occurs |
| */ |
| private void processSheet() throws SAXException { |
| // Sheet Start |
| handler.startElement("div", "class", "page"); |
| if (currentSheetIndex < sheetNames.size()) { |
| handler.element("h1", sheetNames.get(currentSheetIndex)); |
| } |
| handler.startElement("table"); |
| handler.startElement("tbody"); |
| |
| // Process Rows |
| int currentRow = 0; |
| int currentColumn = 0; |
| handler.startElement("tr"); |
| handler.startElement("td"); |
| for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) { |
| if (currentRow != entry.getKey().y) { |
| // We've moved onto a new row, possibly skipping some |
| do { |
| handler.endElement("td"); |
| handler.endElement("tr"); |
| handler.startElement("tr"); |
| handler.startElement("td"); |
| currentRow++; |
| } while (officeParserConfig.getIncludeMissingRows() && |
| currentRow < entry.getKey().y); |
| currentRow = entry.getKey().y; |
| currentColumn = 0; |
| } |
| |
| while (currentColumn < entry.getKey().x) { |
| handler.endElement("td"); |
| handler.startElement("td"); |
| currentColumn++; |
| } |
| |
| entry.getValue().render(handler); |
| } |
| handler.endElement("td"); |
| handler.endElement("tr"); |
| |
| // Sheet End |
| handler.endElement("tbody"); |
| handler.endElement("table"); |
| |
| // Finish up |
| processExtraText(); |
| handler.endElement("div"); |
| } |
| |
| private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException { |
| for (EscherRecord escherRecord : records) { |
| if (escherRecord instanceof EscherBSERecord) { |
| EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); |
| if (blip != null) { |
| HSSFPictureData picture = new HSSFPictureData(blip); |
| String mimeType = picture.getMimeType(); |
| TikaInputStream stream = TikaInputStream.get(picture.getData()); |
| |
| // Handle the embeded resource |
| extractor.handleEmbeddedResource( |
| stream, null, null, mimeType, |
| handler, true |
| ); |
| } |
| } |
| |
| // Recursive call. |
| findPictures(escherRecord.getChildRecords()); |
| } |
| } |
| private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener { |
| //TIKA-2025 -- use this to preserve large numbers in "General" format |
| //against the MS spec. |
| final TikaExcelGeneralFormat generalFormat; |
| public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) { |
| super(childListener, locale); |
| generalFormat = new TikaExcelGeneralFormat(locale); |
| } |
| |
| @Override |
| public void processRecord(Record record) { |
| // System.out.println(record.getClass() + " : "+record.toString()); |
| super.processRecord(record); |
| } |
| |
| @Override |
| public String formatNumberDateCell(CellValueRecordInterface cell) { |
| String formatString = this.getFormatString(cell); |
| if (formatString != null && ! formatString.equals("General")) { |
| return super.formatNumberDateCell(cell); |
| } |
| |
| double value; |
| if(cell instanceof NumberRecord) { |
| value = ((NumberRecord)cell).getValue(); |
| } else { |
| if(!(cell instanceof FormulaRecord)) { |
| throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell); |
| } |
| |
| value = ((FormulaRecord)cell).getValue(); |
| } |
| return generalFormat.format(value); |
| } |
| } |
| } |
| |
| /** |
| * Utility comparator for points. |
| */ |
| private static class PointComparator implements Comparator<Point> { |
| |
| public int compare(Point a, Point b) { |
| int diff = a.y - b.y; |
| if (diff == 0) { |
| diff = a.x - b.x; |
| } |
| return diff; |
| } |
| |
| } |
| } |