blob: bb54ea86c52c7989a96513ddd18ea0a01414930b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.tika;
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Objects;
import javax.annotation.Nullable;
import org.apache.beam.sdk.util.SerializableThrowable;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Throwables;
import org.apache.tika.metadata.Metadata;
/**
* The result of parsing a single file with Tika: contains the file's location, metadata, extracted
* text, and optionally an error. If there is an error, the metadata and extracted text may be
* partial (i.e. not represent the entire file).
*/
public class ParseResult implements Serializable {
private final String fileLocation;
private final String content;
private final Metadata metadata;
private final String[] metadataNames;
@Nullable private final SerializableThrowable error;
public static ParseResult success(String fileLocation, String content, Metadata metadata) {
return new ParseResult(fileLocation, content, metadata, null);
}
public static ParseResult success(String fileLocation, String content) {
return new ParseResult(fileLocation, content, new Metadata(), null);
}
public static ParseResult failure(
String fileLocation, String partialContent, Metadata partialMetadata, Throwable error) {
return new ParseResult(fileLocation, partialContent, partialMetadata, error);
}
private ParseResult(String fileLocation, String content, Metadata metadata, Throwable error) {
checkArgument(fileLocation != null, "fileLocation can not be null");
checkArgument(content != null, "content can not be null");
checkArgument(metadata != null, "metadata can not be null");
this.fileLocation = fileLocation;
this.content = content;
this.metadata = metadata;
this.metadataNames = metadata.names();
this.error = (error == null) ? null : new SerializableThrowable(error);
}
/** Returns the absolute path to the input file. */
public String getFileLocation() {
return fileLocation;
}
/** Returns whether this file was parsed successfully. */
public boolean isSuccess() {
return error == null;
}
/** Returns the parse error, if the file was parsed unsuccessfully. */
public Throwable getError() {
checkState(error != null, "This is a successful ParseResult");
return error.getThrowable();
}
/**
* Same as {@link #getError}, but returns the complete stack trace of the error as a {@link
* String}.
*/
public String getErrorAsString() {
return Throwables.getStackTraceAsString(getError());
}
/** Returns the extracted text. May be partial, if this parse result contains a failure. */
public String getContent() {
return content;
}
/** Returns the extracted metadata. May be partial, if this parse result contains a failure. */
public Metadata getMetadata() {
return metadata;
}
@Override
public int hashCode() {
return Objects.hash(
getFileLocation(),
getContent(),
getMetadataHashCode(),
isSuccess() ? "" : Throwables.getStackTraceAsString(getError()));
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof ParseResult)) {
return false;
}
ParseResult other = (ParseResult) obj;
return Objects.equals(getFileLocation(), other.getFileLocation())
&& Objects.equals(getContent(), other.getContent())
&& Objects.equals(getMetadata(), other.getMetadata())
&& (isSuccess()
? other.isSuccess()
: (!other.isSuccess() && Objects.equals(getErrorAsString(), other.getErrorAsString())));
}
// TODO: Remove this function and use metadata.hashCode() once Apache Tika 1.17 gets released.
private int getMetadataHashCode() {
int hashCode = 0;
for (String name : metadataNames) {
hashCode += name.hashCode() ^ Arrays.hashCode(metadata.getValues(name));
}
return hashCode;
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("fileLocation", fileLocation)
.add("content", "<" + content.length() + " chars>")
.add("metadata", metadata)
.add("error", getError() == null ? null : Throwables.getStackTraceAsString(getError()))
.toString();
}
}