| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser; |
| |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertTrue; |
| |
| import java.text.DateFormatSymbols; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.regex.Pattern; |
| |
| import org.junit.Test; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.tika.TikaTest; |
| |
| /** |
| * Ensure that our various Table-based formats produce consistent, |
| * broadly similar output. |
| * This is mostly focused on the XHTML output |
| */ |
| public class TabularFormatsTest extends TikaTest { |
| protected static final String[] columnNames = |
| new String[]{"recnum", "square", "desc", "pctdone", "pctincr", "date", "datetime", |
| "time"}; |
| protected static final String[] columnLabels = |
| new String[]{"Record Number", "Square of the Record Number", "Description of the Row", |
| "Percent Done", "Percent Increment", "date", "datetime", "time"}; |
| // Which columns hold percentages? Not all parsers |
| // correctly format these... |
| protected static final List<Integer> percentageColumns = Arrays.asList(3, 4); |
| |
private static final Logger LOG = LoggerFactory.getLogger(TabularFormatsTest.class);

/**
 * One regex alternation per month (index 0 is the first month returned by
 * {@link DateFormatSymbols#getShortMonths()}), of the form
 * {@code ENGLISH_UPPER|English[|LOCAL_UPPER|Local]}.
 * <p>
 * To prevent this build test from failing outside the English speaking world, we need
 * to have both local and English month names (testCSV uses English names, the other
 * tests local names).
 */
private static final String[] SHORT_MONTHS_EXPR;

static {
    String[] shortMonthsEnglish = new DateFormatSymbols(Locale.ENGLISH).getShortMonths();
    String[] shortMonthsLocal = new DateFormatSymbols(Locale.getDefault()).getShortMonths();
    // Only the twelve regular months are needed
    String[] shortMonthsExpr = new String[12];
    for (int i = 0; i < shortMonthsExpr.length; ++i) {
        String expr =
                shortMonthsEnglish[i].toUpperCase(Locale.ENGLISH) + "|" + shortMonthsEnglish[i];
        // Only add the local spelling when it actually differs from the English one
        if (!shortMonthsEnglish[i].equals(shortMonthsLocal[i])) {
            expr += "|" + shortMonthsLocal[i].toUpperCase(Locale.getDefault()) + "|" +
                    shortMonthsLocal[i];
        }
        LOG.info(expr);
        shortMonthsExpr[i] = expr;
    }
    SHORT_MONTHS_EXPR = shortMonthsExpr;
}
| |
| /** |
| * Expected values, by <em>column</em> |
| */ |
| protected static final Object[][] table = |
| new Object[][]{new String[]{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"}, |
| new String[]{"0", "1", "4", "9", "16", "25", "36", "49", "64", "81", "100"}, |
| new String[]{}, // Generated later |
| new Pattern[]{Pattern.compile("0%|0.00%"), Pattern.compile("10%|10.00%"), |
| Pattern.compile("20%|20.00%"), Pattern.compile("30%|30.00%"), |
| Pattern.compile("40%|40.00%"), Pattern.compile("50%|50.00%"), |
| Pattern.compile("60%|60.00%"), Pattern.compile("70%|70.00%"), |
| Pattern.compile("80%|80.00%"), Pattern.compile("90%|90.00%"), |
| Pattern.compile("100%|100.00%"),}, |
| new Pattern[]{Pattern.compile(""), Pattern.compile("0.0%|0.00%"), |
| Pattern.compile("50.0%|50.00%"), Pattern.compile("66.7%|66.67%"), |
| Pattern.compile("75.0%|75.00%"), Pattern.compile("80.0%|80.00%"), |
| Pattern.compile("83.3%|83.33%"), Pattern.compile("85.7%|85.71%"), |
| Pattern.compile("87.5%|87.50%"), Pattern.compile("88.9%|88.89%"), |
| Pattern.compile("90.0%|90.00%"),}, |
| new Pattern[]{Pattern.compile("0?1-01-1960"), Pattern.compile("0?2-01-1960"), |
| Pattern.compile("17-01-1960"), Pattern.compile("22-03-1960"), |
| Pattern.compile("13-09-1960"), Pattern.compile("17-09-1961"), |
| Pattern.compile("20-07-1963"), Pattern.compile("29-07-1966"), |
| Pattern.compile("20-03-1971"), Pattern.compile("18-12-1977"), |
| Pattern.compile("19-05-1987"),}, new Pattern[]{Pattern.compile( |
| "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:00:01(.00)?"), |
| Pattern.compile( |
| "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:00:10(.00)?"), |
| Pattern.compile( |
| "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:01:40(.00)?"), |
| Pattern.compile( |
| "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:16:40(.00)?"), |
| Pattern.compile( |
| "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]02:46:40(.00)?"), |
| Pattern.compile( |
| "02(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]03:46:40(.00)?"), |
| Pattern.compile( |
| "12(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]13:46:40(.00)?"), |
| Pattern.compile( |
| "25(" + SHORT_MONTHS_EXPR[3] + ")(60|1960)[:\\s]17:46:40(.00)?"), |
| Pattern.compile( |
| "03(" + SHORT_MONTHS_EXPR[2] + ")(63|1963)[:\\s]09:46:40(.00)?"), |
| Pattern.compile( |
| "09(" + SHORT_MONTHS_EXPR[8] + ")(91|1991)[:\\s]01:46:40(.00)?"), |
| Pattern.compile( |
| "19(" + SHORT_MONTHS_EXPR[10] + ")(76|2276)[:\\s]17:46:40(.00)?")}, |
| new Pattern[]{Pattern.compile("0?0:00:01(.\\d\\d)?"), |
| Pattern.compile("0?0:00:03(.\\d\\d)?"), |
| Pattern.compile("0?0:00:09(.\\d\\d)?"), |
| Pattern.compile("0?0:00:27(.\\d\\d)?"), |
| Pattern.compile("0?0:01:21(.\\d\\d)?"), |
| Pattern.compile("0?0:04:03(.\\d\\d)?"), |
| Pattern.compile("0?0:12:09(.\\d\\d)?"), |
| Pattern.compile("0?0:36:27(.\\d\\d)?"), |
| Pattern.compile("0?1:49:21(.\\d\\d)?"), |
| Pattern.compile("0?5:28:03(.\\d\\d)?"), |
| Pattern.compile("16:24:09(.\\d\\d)?")}}; |
| |
| |
| static { |
| // Row text in 3rd column |
| table[2] = new String[table[0].length]; |
| for (int i = 0; i < table[0].length; i++) { |
| table[2][i] = "This is row " + i + " of 10"; |
| } |
| } |
| |
| protected static String[] toCells(String row, boolean isTH) { |
| // Split into cells, ignoring stuff before first cell |
| String[] cells; |
| if (isTH) { |
| cells = row.split("<th"); |
| } else { |
| cells = row.split("<td"); |
| } |
| cells = Arrays.copyOfRange(cells, 1, cells.length); |
| |
| // Ignore the closing tag onwards, and normalise whitespace |
| for (int i = 0; i < cells.length; i++) { |
| cells[i] = cells[i].trim(); |
| if (cells[i].equals("/>")) { |
| cells[i] = ""; |
| continue; |
| } |
| |
| int splitAt = cells[i].lastIndexOf("</"); |
| cells[i] = cells[i].substring(0, splitAt).trim(); |
| cells[i] = cells[i].replaceAll("\\s+", " "); |
| } |
| return cells; |
| } |
| |
| protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) { |
| // Find the first row |
| int splitAt = xml.indexOf("</tr>"); |
| String hRow = xml.substring(0, splitAt); |
| splitAt = xml.indexOf("<tr>"); |
| hRow = hRow.substring(splitAt + 4); |
| |
| // Split into cells, ignoring stuff before first cell |
| String[] cells = toCells(hRow, isTH); |
| |
| // Check we got the right number |
| assertEquals("Wrong number of cells in header row " + hRow, columnLabels.length, |
| cells.length); |
| |
| // Check we got the right stuff |
| for (int i = 0; i < cells.length; i++) { |
| if (hasLabel && hasName) { |
| assertContains("title=\"" + columnNames[i] + "\"", cells[i]); |
| assertContains(">" + columnLabels[i], cells[i]); |
| } else if (hasName) { |
| assertContains(">" + columnNames[i], cells[i]); |
| } else { |
| assertContains(">" + columnLabels[i], cells[i]); |
| } |
| } |
| } |
| |
| protected void assertContents(String xml, boolean hasHeader, boolean doesPercents) { |
| // Ignore anything before the first <tr> |
| // Ignore the header row if there is one |
| int ignores = 1; |
| if (hasHeader) { |
| ignores++; |
| } |
| |
| // Split into rows, and discard the row closing (and anything after) |
| String[] rows = xml.split("<tr>"); |
| rows = Arrays.copyOfRange(rows, ignores, rows.length); |
| for (int i = 0; i < rows.length; i++) { |
| rows[i] = rows[i].split("</tr>")[0].trim(); |
| } |
| |
| // Check we got the right number of rows |
| for (int cn = 0; cn < table.length; cn++) { |
| assertEquals("Wrong number of rows found compared to column " + (cn + 1), |
| table[cn].length, rows.length); |
| } |
| |
| // Check each row's values |
| for (int rn = 0; rn < rows.length; rn++) { |
| String[] cells = toCells(rows[rn], false); |
| assertEquals("Wrong number of values in row " + (rn + 1), table.length, cells.length); |
| |
| for (int cn = 0; cn < table.length; cn++) { |
| String val = cells[cn]; |
| |
| // If the parser doesn't know about % formats, |
| // skip the cell if the column in a % one |
| if (!doesPercents && percentageColumns.contains(cn)) { |
| continue; |
| } |
| |
| // Ignore cell attributes |
| if (!val.isEmpty()) { |
| val = val.split(">")[1]; |
| } |
| // Check |
| String error = "Wrong text in row " + (rn + 1) + " and column " + (cn + 1) + " - " + |
| table[cn][rn] + " vs " + val; |
| if (table[cn][rn] instanceof String) { |
| assertEquals(error, table[cn][rn], val); |
| } else { |
| assertTrue(error, ((Pattern) table[cn][rn]).matcher(val).matches()); |
| } |
| } |
| } |
| } |
| |
| @Test |
| public void testSAS7BDAT() throws Exception { |
| XMLResult result = getXML("test-columnar.sas7bdat"); |
| String xml = result.xml; |
| assertHeaders(xml, true, true, true); |
| assertContents(xml, true, true); |
| } |
| |
| @Test |
| public void testXLS() throws Exception { |
| XMLResult result = getXML("test-columnar.xls"); |
| String xml = result.xml; |
| assertHeaders(xml, false, true, false); |
| assertContents(xml, true, true); |
| } |
| |
| @Test |
| public void testXLSX() throws Exception { |
| XMLResult result = getXML("test-columnar.xlsx"); |
| String xml = result.xml; |
| assertHeaders(xml, false, true, false); |
| assertContents(xml, true, true); |
| } |
| |
| @Test |
| public void testXLSB() throws Exception { |
| XMLResult result = getXML("test-columnar.xlsb"); |
| String xml = result.xml; |
| assertHeaders(xml, false, true, false); |
| assertContents(xml, true, true); |
| } |
| |
| // TODO Fix the ODS test - currently failing with |
| // org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared |
| // @Test |
| // public void testODS() throws Exception { |
| // XMLResult result = getXML("test-columnar.ods"); |
| // String xml = result.xml; |
| // assertHeaders(xml, false, true, false); |
| // assertContents(xml, true, true); |
| // } |
| |
// TODO Test other formats, e.g. database formats
| |
| /** |
| * Note - we don't have a dedicated CSV parser |
| * <p> |
| * This means we don't get proper HTML out... |
| */ |
| @Test |
| public void testCSV() throws Exception { |
| XMLResult result = getXML("test-columnar.csv"); |
| String xml = result.xml; |
| // Normalise whitespace before testing |
| xml = xml.replaceAll("\\s+", " "); |
| |
| for (String label : columnLabels) { |
| assertContains(label, xml); |
| } |
| for (Object[] vals : table) { |
| for (Object val : vals) { |
| if (val instanceof String) { |
| assertContains((String) val, xml); |
| } else if (val instanceof Pattern) { |
| assertTrue("Not matched: " + val, ((Pattern) val).matcher(xml).find()); |
| } |
| } |
| } |
| } |
| } |