| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft; |
| |
| import java.util.NoSuchElementException; |
| |
| import org.apache.poi.hwpf.HWPFDocument; |
| import org.apache.poi.hwpf.model.ListData; |
| import org.apache.poi.hwpf.model.ListFormatOverrideLevel; |
| import org.apache.poi.hwpf.model.ListLevel; |
| import org.apache.poi.hwpf.model.ListTables; |
| import org.apache.poi.hwpf.usermodel.Paragraph; |
| |
| /** |
| * Computes the number text which goes at the beginning of each list paragraph |
| * <p/> |
| * <p><em>Note:</em> This class only handles the raw number text and does not apply any further |
| * formatting as described in [MS-DOC], v20140721, 2.4.6.3, Part 3 to it.<p> |
| * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance of list levels, as |
| * defined in [MS-DOC], v20140721, 2.9.328 is not taken care of in this class.</p> |
| * <p>Further, this class does not yet handle overrides</p> |
| */ |
| public class ListManager extends AbstractListManager { |
| private final ListTables listTables; |
| |
| /** |
| * Ordinary constructor for a new list reader |
| * |
| * @param document Document to process |
| */ |
| public ListManager(final HWPFDocument document) { |
| this.listTables = document.getListTables(); |
| } |
| |
| /** |
| * Get the formatted number for a given paragraph |
| * <p/> |
| * <p><em>Note:</em> This only works correctly if called subsequently for <em>all</em> |
| * paragraphs in a valid selection (main document, text field, ...) which are part of a list |
| * .</p> |
| * |
| * @param paragraph list paragraph to process |
| * @return String which represents the numbering of this list paragraph; never {@code null}, |
| * can be empty string, though, |
| * if something goes wrong in getList() |
| * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of |
| * a list |
| */ |
| public String getFormattedNumber(final Paragraph paragraph) { |
| if (paragraph == null) { |
| throw new IllegalArgumentException("Given paragraph cannot be null."); |
| } |
| if (!paragraph.isInList()) { |
| throw new IllegalArgumentException("Can only process list paragraphs."); |
| } |
| //lsid is equivalent to docx's abnum |
| //ilfo is equivalent to docx's num |
| int currAbNumId = -1; |
| try { |
| currAbNumId = paragraph.getList().getLsid(); |
| } catch (NoSuchElementException e) { |
| //somewhat frequent exception when initializing HWPFList |
| return ""; |
| } catch (IllegalArgumentException | NullPointerException e) { |
| return ""; |
| } |
| |
| int currNumId = paragraph.getIlfo(); |
| ParagraphLevelCounter lc = listLevelMap.get(currAbNumId); |
| LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId); |
| |
| if (lc == null) { |
| ListData listData = listTables.getListData(paragraph.getList().getLsid()); |
| if (listData == null) { |
| //silently skip |
| return ""; |
| } |
| LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length]; |
| for (int i = 0; i < listData.getLevels().length; i++) { |
| levelTuples[i] = buildTuple(i, listData.getLevels()[i]); |
| } |
| lc = new ParagraphLevelCounter(levelTuples); |
| } |
| if (overrideTuples == null) { |
| overrideTuples = buildOverrideTuples(paragraph, lc.getNumberOfLevels()); |
| } |
| String formattedString = lc.incrementLevel(paragraph.getIlvl(), overrideTuples); |
| |
| listLevelMap.put(currAbNumId, lc); |
| overrideTupleMap.put(currNumId, overrideTuples); |
| return formattedString; |
| } |
| |
| private LevelTuple buildTuple(int i, ListLevel listLevel) { |
| boolean isLegal = false; |
| int start = 1; |
| int restart = -1; |
| String lvlText = "%" + i + "."; |
| String numFmt = "decimal"; |
| |
| start = listLevel.getStartAt(); |
| restart = listLevel.getRestart(); |
| isLegal = listLevel.isLegalNumbering(); |
| numFmt = convertToNewNumFormat(listLevel.getNumberFormat()); |
| lvlText = convertToNewNumberText(listLevel.getNumberText(), |
| listLevel.getLevelNumberingPlaceholderOffsets()); |
| return new LevelTuple(start, restart, lvlText, numFmt, isLegal); |
| } |
| |
| private LevelTuple[] buildOverrideTuples(Paragraph par, int length) { |
| ListFormatOverrideLevel overrideLevel; |
| // find the override for this level |
| if (listTables.getLfoData(par.getIlfo()).getRgLfoLvl().length == 0) { |
| return null; |
| } |
| overrideLevel = listTables.getLfoData(par.getIlfo()).getRgLfoLvl()[0]; |
| if (overrideLevel == null) { |
| return null; |
| } |
| LevelTuple[] levelTuples = new LevelTuple[length]; |
| ListLevel listLevel = overrideLevel.getLevel(); |
| if (listLevel == null) { |
| return null; |
| } |
| for (int i = 0; i < length; i++) { |
| levelTuples[i] = buildTuple(i, listLevel); |
| } |
| |
| return levelTuples; |
| |
| } |
| |
| private String convertToNewNumberText(String numberText, byte[] numberOffsets) { |
| |
| StringBuilder sb = new StringBuilder(); |
| int last = 0; |
| for (byte numberOffset : numberOffsets) { |
| int offset = (int) numberOffset; |
| |
| if (offset == 0) { |
| break; |
| } |
| if (offset - 1 < last || offset > numberText.length()) { |
| //something went wrong. |
| //silently stop |
| break; |
| } |
| sb.append(numberText, last, offset - 1); |
| //need to add one because newer format |
| //adds one. In .doc, this was the array index; |
| //but in .docx, this is the level number |
| int lvlNum = (int) numberText.charAt(offset - 1) + 1; |
| sb.append("%").append(lvlNum); |
| last = offset; |
| } |
| if (last < numberText.length()) { |
| sb.append(numberText.substring(last)); |
| } |
| return sb.toString(); |
| } |
| |
| private String convertToNewNumFormat(int numberFormat) { |
| switch (numberFormat) { |
| case -1: |
| return "none"; |
| case 0: |
| return "decimal"; |
| case 1: |
| return "upperRoman"; |
| case 2: |
| return "lowerRoman"; |
| case 3: |
| return "upperLetter"; |
| case 4: |
| return "lowerLetter"; |
| case 5: |
| return "ordinal"; |
| case 22: |
| return "decimalZero"; |
| case 23: |
| return "bullet"; |
| case 47: |
| return "none"; |
| default: |
| //do we really want to silently swallow these uncovered cases? |
| //throw new RuntimeException("NOT COVERED: " + numberFormat); |
| return "decimal"; |
| } |
| } |
| } |