blob: 21fc981f720bf89353348b739549f39c7c8fde48 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dbf;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.TimeZone;
import org.apache.commons.io.IOUtils;
import org.apache.tika.io.EndianUtils;
/**
 * A single cell of a DBF record: a fixed-length byte buffer plus the column
 * type that determines how those bytes are rendered as a string.
 * <p>
 * The same instance can be refilled via {@link #read(InputStream)}; only the
 * bytes obtained by the most recent read are exposed through
 * {@link #getBytes()} and the string accessors.
 */
class DBFCell {

    private static final TimeZone UTC = TimeZone.getTimeZone("UTC");

    private final DBFColumnHeader.ColType colType;
    //fixed-size buffer, allocated once with the column's field length
    private final byte[] bytes;
    //carried along for toString()/deepCopy(); not used when rendering values
    private final int decimalCount;
    //number of bytes placed into the buffer by the most recent read()
    int bytesReadLast = 0;

    /**
     * @param colType      column type used to pick the string rendering
     * @param fieldLength  number of bytes the cell occupies in a record
     * @param decimalCount decimal count declared for the column
     */
    DBFCell(DBFColumnHeader.ColType colType, int fieldLength, int decimalCount) {
        this.colType = colType;
        this.decimalCount = decimalCount;
        this.bytes = new byte[fieldLength];
    }

    /**
     * Renders the bytes of the last read as a string, according to the column type.
     *
     * @param charset charset used to decode character (C) columns; all other
     *                textual types are decoded as US-ASCII
     * @return trimmed text for textual columns, a formatted date for D and T
     * columns (empty string if the date bytes cannot be interpreted)
     */
    String getString(Charset charset) {
        switch (colType) {
            case C:
                return new String(getBytes(), charset).trim();
            case D:
                return getFormattedDate();
            case T:
                return getFormattedDateTime();
            case N:
            case L:
            default:
                //TODO: find examples of other cell types for testing
                return new String(getBytes(), StandardCharsets.US_ASCII).trim();
        }
    }

    /**
     * Fills this cell's buffer from the stream.
     *
     * @param is stream positioned at the start of the cell
     * @return whether any content was read
     * @throws IOException if the underlying read fails, or if
     *                     {@code DBFReader.STRICT} is set and fewer bytes than
     *                     the field length were available
     */
    boolean read(InputStream is) throws IOException {
        bytesReadLast = IOUtils.read(is, bytes);
        if (DBFReader.STRICT && bytesReadLast != bytes.length) {
            throw new IOException("Truncated record, only read " + bytesReadLast +
                    " bytes, but should have read: " + bytes.length);
        }
        return bytesReadLast > 0;
    }

    /**
     * @return copy of bytes that were read on the last read
     */
    byte[] getBytes() {
        byte[] ret = new byte[bytesReadLast];
        System.arraycopy(bytes, 0, ret, 0, bytesReadLast);
        return ret;
    }

    DBFColumnHeader.ColType getColType() {
        return colType;
    }

    @Override
    public String toString() {
        return "DBFCell{" + "colType=" + colType + ", bytes=" + Arrays.toString(bytes) +
                ", decimalCount=" + decimalCount + '}';
    }

    /**
     * @return an independent cell with the same type, field length and decimal
     * count, holding a copy of the bytes from the last read
     */
    DBFCell deepCopy() {
        DBFCell cell = new DBFCell(colType, bytes.length, decimalCount);
        cell.bytesReadLast = this.bytesReadLast;
        System.arraycopy(this.bytes, 0, cell.bytes, 0, bytesReadLast);
        return cell;
    }

    /**
     * Interprets the first eight bytes as ASCII {@code yyyyMMdd}.
     *
     * @return the date as {@code MM/dd/yyyy}, or the empty string if fewer
     * than eight bytes were read or any component is not an integer
     */
    private String getFormattedDate() {
        byte[] dateBytes = getBytes();
        if (dateBytes.length < 8) {
            return "";
        }
        String year = new String(dateBytes, 0, 4, StandardCharsets.US_ASCII);
        String month = new String(dateBytes, 4, 2, StandardCharsets.US_ASCII);
        String day = new String(dateBytes, 6, 2, StandardCharsets.US_ASCII);
        //test to see that these values make any sense
        for (String s : new String[]{year, month, day}) {
            try {
                Integer.parseInt(s);
            } catch (NumberFormatException e) {
                return "";
            }
        }
        return String.format(Locale.ROOT, "%s/%s/%s", month, day, year);
    }

    /**
     * Interprets the cell as two little-endian ints: a day count relative to
     * January 1, 4713 BC, followed by a time-of-day value that is applied as
     * milliseconds.
     *
     * @return the timestamp formatted in UTC as {@code yyyy-MM-dd'T'HH:mm:ss'Z'},
     * or the empty string if the two ints cannot be read
     */
    public String getFormattedDateTime() {
        //sometimes 12/31/1899 instead of 01/01/4713 BC.
        //http://stackoverflow.com/questions/20026154/convert-dbase-timestamp
        //TODO: add heuristic for deciding;
        //TODO: find example of file with time != 0
        Calendar baseCalendar = GregorianCalendar.getInstance(UTC, Locale.ROOT);
        //getInstance() starts at "now" and set(y, M, d, H, m, s) leaves
        //MILLISECOND alone; clear first so no wall-clock millis leak in
        baseCalendar.clear();
        // baseCalendar.set(1899, 11, 31, 0, 0, 0);
        baseCalendar.set(-4712, 0, 1, 0, 0, 0);
        try (InputStream is = new ByteArrayInputStream(getBytes())) {
            int date = EndianUtils.readIntLE(is);
            int time = EndianUtils.readIntLE(is);
            baseCalendar.add(Calendar.DATE, date);
            baseCalendar.add(Calendar.MILLISECOND, time);
            DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT);
            //the pattern emits a literal 'Z', so the formatter must be in UTC
            //rather than the JVM default zone
            df.setTimeZone(UTC);
            return df.format(baseCalendar.getTime());
        } catch (IOException | EndianUtils.BufferUnderrunException ignored) {
            //best effort: a short or unreadable cell yields an empty value
        }
        return "";
    }
}