| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.dbf; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.charset.Charset; |
| import java.nio.charset.StandardCharsets; |
| import java.util.Map; |
| import java.util.concurrent.ConcurrentHashMap; |
| |
| import org.apache.tika.exception.TikaException; |
| |
| /** |
| * This reads many dbase3 file variants (not DBASE 7, yet!). |
| * This parses the header on open. The client |
| * should get a row and then iterate until next() returns null. |
| * Be careful to deepCopy the row (if caching) because the row |
| * is mutable and will change as the reader iterates over new rows. |
| * <p> |
| * This is based on: <a href="http://web.archive.org/web/20150323061445/http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm"> |
| * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a> |
| * <p> |
| * This is designed to separate out Tika-specific code so that it can |
| * be copied/pasted as a standalone if desired. |
| */ |
| |
| class DBFReader { |
| |
| public static final int MAX_FIELD_LENGTH = 66000; |
| private static final Map<Integer, Version> VERSION_MAP = new ConcurrentHashMap<>(); |
| public static boolean STRICT = false; |
| |
| static { |
| for (Version version : Version.values()) { |
| VERSION_MAP.put(version.id, version); |
| } |
| } |
| |
| private final DBFFileHeader header; |
| private final InputStream is; |
| private DBFRow currRow = null; |
| private Charset charset = StandardCharsets.US_ASCII; |
| private DBFReader(InputStream is) throws IOException, TikaException { |
| header = DBFFileHeader.parse(is); |
| this.is = is; |
| currRow = new DBFRow(header); |
| } |
| |
| static DBFReader open(InputStream is) throws IOException, TikaException { |
| return new DBFReader(is); |
| } |
| |
| //can return null! |
| static Version getVersion(int b) { |
| return VERSION_MAP.get(b); |
| } |
| |
| /** |
| * removes trailing 0 from byte array |
| * |
| * @param bytes |
| * @return |
| */ |
| public static byte[] trim(byte[] bytes) { |
| int end = bytes.length - 1; |
| for (int i = end; i > -1; i--) { |
| if (bytes[i] != 0) { |
| end = i; |
| break; |
| } |
| } |
| if (end == bytes.length - 1) { |
| return bytes; |
| } |
| byte[] ret = new byte[end + 1]; |
| System.arraycopy(bytes, 0, ret, 0, end + 1); |
| return ret; |
| } |
| |
| /** |
| * Iterate through the rows with this. |
| * <p> |
| * Be careful: the reader reuses the row! Make sure to call deep copy |
| * if you are buffering rows. |
| * |
| * @return |
| * @throws IOException |
| * @throws TikaException |
| */ |
| DBFRow next() throws IOException, TikaException { |
| if (fillRow(currRow)) { |
| return currRow; |
| } |
| return null; |
| } |
| |
| //returns whether or not some content was read. |
| //it might not be complete! |
| private boolean fillRow(DBFRow row) throws IOException, TikaException { |
| if (row == null) { |
| return false; |
| } |
| DBFCell[] cells = row.cells; |
| int isDeletedByte = is.read(); |
| boolean isDeleted = false; |
| if (isDeletedByte == 32) { |
| //all ok |
| } else if (isDeletedByte == 42) { //asterisk |
| isDeleted = true; |
| } else if (isDeletedByte == 26) { //marker for end of dbf file |
| return false; |
| } else if (isDeletedByte == -1) { //truncated file |
| if (DBFReader.STRICT) { |
| throw new IOException("EOF reached too early"); |
| } |
| return false; |
| } else { |
| throw new TikaException( |
| "Expecting space or asterisk at beginning of record, not:" + isDeletedByte); |
| } |
| row.setDeleted(isDeleted); |
| |
| boolean readSomeContent = false; |
| for (DBFCell cell : cells) { |
| if (cell.read(is)) { |
| readSomeContent = true; |
| } |
| } |
| return readSomeContent; |
| } |
| |
| public DBFFileHeader getHeader() { |
| return header; |
| } |
| |
| public Charset getCharset() { |
| return charset; |
| } |
| |
| enum Version { |
| |
| FOXBASE(0x02, "FoxBASE", ""), FOXBASE_PLUS(0x03, "FoxBASE_plus", ""), |
| VISUAL_FOXPRO(0x30, "Visual_FoxPro", ""), |
| VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro", "autoincrement"), |
| VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro", "Varchar_or_Varbinary"), |
| DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL", "table"), |
| DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL", "system"), |
| FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus", "memo"), |
| DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV", "memo"), |
| DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL", "table_with_memo"), |
| FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x", "memo"), |
| HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz", "SMT_memo"), FOXBASE2(0xFB, "FoxBASE", ""); |
| |
| private final int id; |
| private final String format; |
| private final String type; |
| |
| Version(int id, String format, String type) { |
| this.id = id; |
| this.format = format; |
| this.type = type; |
| } |
| |
| int getId() { |
| return id; |
| } |
| |
| String getFormat() { |
| return format; |
| } |
| |
| String getType() { |
| return type; |
| } |
| |
| String getFullMimeString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("application/x-dbf; ").append("format=").append(getFormat()); |
| if (!"".equals(type)) { |
| sb.append("; type=").append(getType()); |
| } |
| return sb.toString(); |
| } |
| } |
| } |