| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.uima.cas.serdes; |
| |
| import static java.util.Arrays.asList; |
| import static java.util.Collections.unmodifiableSet; |
| import static java.util.Comparator.comparing; |
| import static java.util.Comparator.comparingInt; |
| import static java.util.Comparator.reverseOrder; |
| import static java.util.stream.Collectors.joining; |
| import static java.util.stream.Collectors.toList; |
| import static org.apache.commons.csv.CSVFormat.DEFAULT; |
| |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.io.PrintWriter; |
| import java.io.StringWriter; |
| import java.io.Writer; |
| import java.lang.reflect.Array; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.regex.Pattern; |
| import java.util.stream.Collectors; |
| |
| import org.apache.commons.csv.CSVPrinter; |
| import org.apache.commons.lang3.StringUtils; |
| import org.apache.commons.lang3.tuple.Pair; |
| import org.apache.uima.cas.ArrayFS; |
| import org.apache.uima.cas.BooleanArrayFS; |
| import org.apache.uima.cas.ByteArrayFS; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.CommonArrayFS; |
| import org.apache.uima.cas.DoubleArrayFS; |
| import org.apache.uima.cas.Feature; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.FloatArrayFS; |
| import org.apache.uima.cas.IntArrayFS; |
| import org.apache.uima.cas.LongArrayFS; |
| import org.apache.uima.cas.ShortArrayFS; |
| import org.apache.uima.cas.StringArrayFS; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.impl.CASImpl; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.jcas.JCas; |
| import org.apache.uima.jcas.cas.AnnotationBase; |
| |
| /** |
| * @author entwicklerteam |
| */ |
| public class CasToComparableText { |
| |
| // Parameters |
| private boolean markIndexed = true; |
| private boolean markView = true; |
| private boolean coveredTextColumnEnabled = true; |
| private boolean indexedColumnEnabled = false; |
| private boolean treatEmptyStringsAsNull = false; |
| private int maxLengthCoveredText = 30; |
| private boolean sortAnnotationsInMultiValuedFeatures = true; |
| private boolean uniqueAnchors = true; |
| private Set<String> excludeFeaturePatterns = new HashSet<>(); |
| private Set<String> excludeTypePatterns = new HashSet<>(); |
| private String nullValue = "<NULL>"; |
| |
| // State |
| private final CAS cas; |
| private Set<FeatureStructure> _indexedFses; |
| private Map<String, Pattern> regexCache = new HashMap<>(); |
| private Map<String, Boolean> exclusionCache; |
| |
| public CasToComparableText(CAS aCas) { |
| |
| cas = aCas; |
| } |
| |
| public CasToComparableText(JCas jCas) { |
| |
| this(jCas.getCas()); |
| } |
| |
| public void addExcludeTypePatterns(String... aPatterns) { |
| |
| resetExclusionCache(); |
| excludeTypePatterns.addAll(asList(aPatterns)); |
| } |
| |
| public Set<String> getExcludeTypePatterns() { |
| |
| return unmodifiableSet(excludeTypePatterns); |
| } |
| |
| public void setExcludeTypePatterns(Collection<String> aExcludeFeaturePatterns) { |
| |
| resetExclusionCache(); |
| excludeTypePatterns.clear(); |
| |
| if (aExcludeFeaturePatterns != null) { |
| excludeTypePatterns.addAll(aExcludeFeaturePatterns); |
| } |
| } |
| |
| public void addExcludeFeaturePatterns(String... aPatterns) { |
| |
| excludeFeaturePatterns.addAll(asList(aPatterns)); |
| } |
| |
| public Set<String> getExcludeFeaturePatterns() { |
| |
| return unmodifiableSet(excludeFeaturePatterns); |
| } |
| |
| public void setExcludeFeaturePatterns(Collection<String> aExcludeFeaturePatterns) { |
| |
| resetExclusionCache(); |
| excludeFeaturePatterns.clear(); |
| |
| if (aExcludeFeaturePatterns != null) { |
| excludeFeaturePatterns.addAll(aExcludeFeaturePatterns); |
| } |
| } |
| |
| public void setUniqueAnchors(boolean aUniqueAnchors) { |
| |
| uniqueAnchors = aUniqueAnchors; |
| } |
| |
| public boolean isUniqueAnchors() { |
| |
| return uniqueAnchors; |
| } |
| |
| public void setSortAnnotationsInMultiValuedFeatures( |
| boolean aSortAnnotationsInMultiValuedFeatures) { |
| |
| sortAnnotationsInMultiValuedFeatures = aSortAnnotationsInMultiValuedFeatures; |
| } |
| |
| public boolean isSortAnnotationsInMultiValuedFeatures() { |
| |
| return sortAnnotationsInMultiValuedFeatures; |
| } |
| |
| public boolean isMarkIndexed() { |
| |
| return markIndexed; |
| } |
| |
| /** |
| * @param aMarkIndexed |
| * whether to mark indexed feature structures with an asterisk in the anchor and to add a |
| * column indicating the indexing status. |
| */ |
| public void setMarkIndexed(boolean aMarkIndexed) { |
| |
| markIndexed = aMarkIndexed; |
| } |
| |
| public boolean isCoveredTextColumnEnabled() { |
| |
| return coveredTextColumnEnabled; |
| } |
| |
| public void setCoveredTextColumnEnabled(boolean aCoveredTextColumnEnabled) { |
| |
| coveredTextColumnEnabled = aCoveredTextColumnEnabled; |
| } |
| |
| public boolean isIndexedColumnEnabled() { |
| |
| return indexedColumnEnabled; |
| } |
| |
| public void setIndexedColumnEnabled(boolean aIndexedColumnEnabled) { |
| |
| indexedColumnEnabled = aIndexedColumnEnabled; |
| } |
| |
| /** |
| * @return to add the view name to the anchor. Should be disabled when this class is used to |
| * compare feature structures across views. |
| */ |
| public boolean isMarkView() { |
| |
| return markView; |
| } |
| |
| public void setMarkView(boolean aMarkView) { |
| |
| markView = aMarkView; |
| } |
| |
| public int getMaxLengthCoveredText() { |
| |
| return maxLengthCoveredText; |
| } |
| |
| public void setMaxLengthCoveredText(int aMaxLengthCoveredText) { |
| |
| maxLengthCoveredText = aMaxLengthCoveredText; |
| } |
| |
| public String getNullValue() { |
| |
| return nullValue; |
| } |
| |
| public void setNullValue(String aNullValue) { |
| |
| nullValue = aNullValue; |
| } |
| |
| public void setTreatEmptyStringsAsNull(boolean aTreatEmptyStringsAsNull) { |
| treatEmptyStringsAsNull = aTreatEmptyStringsAsNull; |
| } |
| |
| public boolean isTreatEmptyStringsAsNull() { |
| return treatEmptyStringsAsNull; |
| } |
| |
| private Pattern pattern(String aRegex) { |
| |
| return regexCache.computeIfAbsent(aRegex, ex -> Pattern.compile(ex)); |
| } |
| |
| public String toString(FeatureStructure aFS) { |
| |
| if (aFS.getCAS() != cas) { |
| throw new IllegalArgumentException("FeatureStructure does not belong to CAS"); |
| } |
| |
| return toString(asList(aFS)); |
| } |
| |
| public String toString(Collection<? extends FeatureStructure> aSeeds) { |
| |
| try (StringWriter out = new StringWriter()) { |
| write(out, aSeeds); |
| return out.toString(); |
| } catch (IOException e) { |
| // The StringWriter shouldn't be throwing any IOExceptions, so if something goes wrong, |
| // it must be the fault of the rendering code / feature structure *caugh* |
| throw new IllegalArgumentException(e); |
| } |
| } |
| |
| public void write(Writer out) throws IOException { |
| |
| write(out, cas.getIndexedFSs()); |
| } |
| |
| private Set<FeatureStructure> getIndexedFses() { |
| |
| if (!markIndexed && !indexedColumnEnabled) { |
| return null; |
| } |
| |
| if (_indexedFses == null) { |
| _indexedFses = new HashSet<>(cas.getIndexedFSs()); |
| } |
| |
| return _indexedFses; |
| } |
| |
| public void write(Writer out, Collection<? extends FeatureStructure> aSeeds) throws IOException { |
| |
| if (aSeeds.isEmpty()) { |
| return; |
| } |
| |
| for (FeatureStructure fs : aSeeds) { |
| if (fs.getCAS() != cas && fs.getCAS() != ((CASImpl) cas).getBaseCAS()) { |
| throw new IllegalArgumentException("FeatureStructure does not belong to CAS"); |
| } |
| } |
| |
| Set<FeatureStructure> reachableFses = findReachableFeatureStructures(aSeeds); |
| |
| // First group by type so we have per-type sections in the output |
| Map<Type, List<FeatureStructure>> indexByType = reachableFses.stream() |
| .collect(Collectors.groupingBy(fs -> fs.getType())); |
| |
| // Ensure that the type sections have a stable order |
| List<Type> typesSorted = indexByType.keySet().stream() |
| .filter(type -> excludeTypePatterns.stream() |
| .noneMatch(p -> pattern(p).matcher(type.getName()).matches())) |
| .sorted(comparing(Type::getName)).collect(Collectors.toList()); |
| |
| // Build an anchor for every feature structure |
| Map<FeatureStructure, Anchor> fsToAnchor = generateAnchors(typesSorted, indexByType); |
| |
| // Process the feature structures in each type section |
| PrintWriter pout = new PrintWriter(out); |
| for (Type type : typesSorted) { |
| try (CSVPrinter csv = new CSVPrinter(new CloseShieldAppendable(pout), DEFAULT)) { |
| renderHeader(csv, type); |
| |
| // Generate all the rows for this type and then we sort them - this is necessary |
| // because there can be multiple annotations of the same type at the same location |
| // and we need to have a semantically stable ordering - so we take the actual |
| // row content into consideration |
| // @formatter:off |
| List<Pair<FeatureStructure, List<String>>> rows = indexByType.get(type).stream() |
| .map(fs -> renderFS(fsToAnchor, fs)) |
| .sorted(comparing( |
| // Compare by type name and offsets |
| Pair<FeatureStructure, List<String>>::getKey, new FSComparator()) |
| // ... then (if necessary) compare by the actual data |
| .thenComparing(p -> p.getValue().stream().collect(joining("\0")))) |
| .collect(toList()); |
| // @formatter:on |
| |
| for (Pair<FeatureStructure, List<String>> row : rows) { |
| csv.printRecord(row.getValue()); |
| } |
| } |
| |
| pout.print("\n"); |
| } |
| } |
| |
| private String escape(String aString) { |
| |
| return StringUtils.replaceEach(aString, new String[] { "\t", "\n", "\r", "[", "]", ",", "\\" }, |
| new String[] { "\\t", "\\n", "\\r", "\\[", "\\]", "\\,", "\\\\" }); |
| } |
| |
| private void renderHeader(CSVPrinter aCSV, Type aType) throws IOException { |
| |
| TypeSystem ts = cas.getTypeSystem(); |
| Type annotationType = ts.getType(CAS.TYPE_NAME_ANNOTATION); |
| |
| // Type as comment |
| aCSV.printRecord(aType.getName()); |
| |
| List<String> sectionHeader = new ArrayList<>(); |
| sectionHeader.add("<ANCHOR>"); |
| |
| if (indexedColumnEnabled) { |
| sectionHeader.add("<INDEXED>"); |
| } |
| |
| if (coveredTextColumnEnabled && ts.subsumes(annotationType, aType)) { |
| sectionHeader.add("<COVERED_TEXT>"); |
| } |
| |
| listFeatures(aType).stream() // |
| .filter(f -> !isExcluded(f)) // |
| .map(f -> f.getShortName()) // |
| .forEachOrdered(sectionHeader::add); |
| aCSV.printRecord(sectionHeader); |
| } |
| |
| /** |
| * Build an anchor for every feature structure. |
| */ |
| private Map<FeatureStructure, Anchor> generateAnchors(List<Type> aTypesSorted, |
| Map<Type, List<FeatureStructure>> aIndexByType) { |
| |
| Set<FeatureStructure> indexedFses = getIndexedFses(); |
| Map<FeatureStructure, Anchor> fsToAnchor = new HashMap<>(); |
| Map<String, Integer> disambiguationByPrefix = new HashMap<>(); |
| for (Type type : aTypesSorted) { |
| List<FeatureStructure> fses = new ArrayList<>(aIndexByType.get(type)); |
| fses.sort(new FSComparator()); |
| |
| for (FeatureStructure fs : fses) { |
| Anchor anchor = new Anchor(fs, markIndexed && indexedFses.contains(fs), |
| disambiguationByPrefix); |
| fsToAnchor.put(fs, anchor); |
| } |
| } |
| |
| return fsToAnchor; |
| } |
| |
| private List<Feature> listFeatures(Type aType) { |
| |
| // Determine which feature to show in which column |
| List<Feature> features = new ArrayList<>(aType.getFeatures()); |
| features.sort(comparing(Feature::getShortName)); |
| |
| // Features going into the anchor column are suppressed |
| features.removeIf(f -> CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())); |
| features.removeIf(f -> CAS.FEATURE_BASE_NAME_BEGIN.equals(f.getShortName())); |
| features.removeIf(f -> CAS.FEATURE_BASE_NAME_END.equals(f.getShortName())); |
| |
| return features; |
| } |
| |
| private Pair<FeatureStructure, List<String>> renderFS(Map<FeatureStructure, Anchor> aFsToAnchor, |
| FeatureStructure aFS) { |
| |
| List<String> data = new ArrayList<>(); |
| |
| // First column is always the anchor |
| data.add(aFsToAnchor.get(aFS).toString()); |
| |
| // Then add if the FS was in the index |
| if (indexedColumnEnabled) { |
| data.add(String.valueOf(getIndexedFses().contains(aFS))); |
| } |
| |
| if (coveredTextColumnEnabled && aFS instanceof AnnotationFS) { |
| String coveredText = ((AnnotationFS) aFS).getCoveredText(); |
| if (maxLengthCoveredText > 0) { |
| coveredText = StringUtils.abbreviateMiddle(coveredText, "...", maxLengthCoveredText); |
| } |
| data.add(escape(coveredText)); |
| } |
| |
| // Process the rest of the features |
| nextFeature: for (Feature feature : listFeatures(aFS.getType())) { |
| |
| // Check if the feature is excluded |
| if (isExcluded(feature)) { |
| continue; |
| } |
| |
| // Primitive features can be rendered as strings |
| if (feature.getRange().isStringOrStringSubtype()) { |
| data.add(escape(renderStringValue(aFS.getFeatureValueAsString(feature)))); |
| continue; |
| } |
| |
| if (feature.getRange().isPrimitive()) { |
| data.add(escape(aFS.getFeatureValueAsString(feature))); |
| continue; |
| } |
| |
| // For multi-valued features, we need to dive into the list/array |
| if (isMultiValuedFeature(aFS, feature)) { |
| data.add(renderMultiValuedFeatureStructure(aFS.getFeatureValue(feature), aFsToAnchor)); |
| continue nextFeature; |
| } |
| |
| // So once we get here, it must be a feature structure or null |
| FeatureStructure value = aFS.getFeatureValue(feature); |
| if (value == null) { |
| data.add(nullValue); |
| continue nextFeature; |
| } |
| |
| // Ok, so it's a feature structure |
| Anchor anchor = aFsToAnchor.get(value); |
| if (anchor == null) { |
| throw new IllegalStateException("No anchor - bug - should not happen"); |
| } |
| data.add(anchor.toString()); |
| } |
| |
| return Pair.of(aFS, data); |
| } |
| |
| private void resetExclusionCache() { |
| |
| exclusionCache = null; |
| } |
| |
| private boolean isExcluded(Feature aFeature) { |
| |
| if (exclusionCache == null) { |
| exclusionCache = new HashMap<>(); |
| } |
| |
| return exclusionCache.computeIfAbsent(aFeature.getName(), |
| f -> excludeFeaturePatterns.stream().anyMatch(p -> pattern(p).matcher(f).matches())); |
| } |
| |
| private static boolean isMultiValuedFeature(FeatureStructure aFS, Feature aFeature) { |
| if (aFeature == null) { |
| return false; |
| } |
| |
| TypeSystem aTS = aFS.getCAS().getTypeSystem(); |
| |
| return aFeature.getRange().isArray() |
| || aTS.subsumes(aTS.getType(CAS.TYPE_NAME_LIST_BASE), aFeature.getRange()); |
| } |
| |
| private boolean isMultiValued(FeatureStructure fs) { |
| TypeSystem ts = fs.getCAS().getTypeSystem(); |
| return fs.getType().isArray() || ts.subsumes(ts.getType(CAS.TYPE_NAME_LIST_BASE), fs.getType()); |
| } |
| |
| private String renderMultiValuedFeatureStructure(FeatureStructure aFS, |
| Map<FeatureStructure, Anchor> aFsToAnchor) { |
| List<Object> values = multiValuedFeatureStructureToList(aFS); |
| |
| if (values == null) { |
| return nullValue; |
| } |
| |
| // Optionally sort multi-valued feature that consist only of annotations. This essentially |
| // means that annotation-typed multi-valued features are treated as sets. |
| boolean allValuesAreAnnotations = values.stream().allMatch(v -> v instanceof AnnotationFS); |
| if (sortAnnotationsInMultiValuedFeatures && allValuesAreAnnotations) { |
| values = values.stream().map(v -> (AnnotationFS) v) |
| .sorted(comparingInt(AnnotationFS::getBegin) |
| .thenComparing(AnnotationFS::getEnd, reverseOrder()) |
| .thenComparing(r -> r.getType().getName())) |
| .collect(Collectors.toList()); |
| } |
| |
| List<String> items = new ArrayList<>(); |
| nextItem: for (Object item : values) { |
| if (item == null) { |
| items.add(nullValue); |
| continue nextItem; |
| } |
| |
| if (item instanceof String) { |
| items.add(escape(renderStringValue((String) item))); |
| continue nextItem; |
| } |
| |
| if (item instanceof FeatureStructure) { |
| FeatureStructure fsItem = (FeatureStructure) item; |
| if (isMultiValued(fsItem)) { |
| items.add(renderMultiValuedFeatureStructure(fsItem, aFsToAnchor)); |
| } else { |
| Anchor anchor = aFsToAnchor.get(fsItem); |
| if (anchor == null) { |
| throw new IllegalStateException("No anchor - bug - should not happen"); |
| } |
| items.add(anchor.toString()); |
| } |
| continue nextItem; |
| } |
| |
| items.add(escape(String.valueOf(item))); |
| } |
| |
| return items.stream().collect(joining(",", "[", "]")); |
| } |
| |
| private String renderStringValue(String aString) { |
| if (aString == null) { |
| return nullValue; |
| } |
| |
| if (treatEmptyStringsAsNull && aString.isEmpty()) { |
| return nullValue; |
| } |
| |
| return aString; |
| } |
| |
| // This method was derived from uimaFIT FSUtil.getFeature() |
| private List<Object> multiValuedFeatureStructureToList(FeatureStructure aValue) { |
| |
| if (aValue == null) { |
| return null; |
| } |
| |
| // Handle case where feature is an array |
| TypeSystem ts = aValue.getCAS().getTypeSystem(); |
| Object target = null; |
| int length = -1; |
| if (aValue instanceof CommonArrayFS) { |
| CommonArrayFS<?> source = (CommonArrayFS<?>) aValue; |
| length = source.size(); |
| if (aValue instanceof BooleanArrayFS) { |
| target = new boolean[length]; |
| ((BooleanArrayFS) source).copyToArray(0, (boolean[]) target, 0, length); |
| } else if (aValue instanceof ByteArrayFS) { |
| target = new byte[length]; |
| ((ByteArrayFS) source).copyToArray(0, (byte[]) target, 0, length); |
| } else if (aValue instanceof DoubleArrayFS) { |
| target = new double[length]; |
| ((DoubleArrayFS) source).copyToArray(0, (double[]) target, 0, length); |
| } else if (aValue instanceof FloatArrayFS) { |
| target = new float[length]; |
| ((FloatArrayFS) source).copyToArray(0, (float[]) target, 0, length); |
| } else if (aValue instanceof IntArrayFS) { |
| target = new int[length]; |
| ((IntArrayFS) source).copyToArray(0, (int[]) target, 0, length); |
| } else if (aValue instanceof LongArrayFS) { |
| target = new long[length]; |
| ((LongArrayFS) source).copyToArray(0, (long[]) target, 0, length); |
| } else if (aValue instanceof ShortArrayFS) { |
| target = new short[length]; |
| ((ShortArrayFS) source).copyToArray(0, (short[]) target, 0, length); |
| } else if (aValue instanceof StringArrayFS) { |
| target = new String[length]; |
| ((StringArrayFS) source).copyToArray(0, (String[]) target, 0, length); |
| } else { |
| target = new FeatureStructure[length]; |
| ((ArrayFS<?>) source).copyToArray(0, (FeatureStructure[]) target, 0, length); |
| } |
| } |
| // Handle case where feature is a list |
| else if (ts.subsumes(ts.getType(CAS.TYPE_NAME_LIST_BASE), aValue.getType())) { |
| // Get length of list |
| length = 0; |
| { |
| FeatureStructure cur = aValue; |
| // We assume to by facing a non-empty element if it has a "head" feature |
| while (cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD) != null) { |
| length++; |
| cur = cur.getFeatureValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_TAIL)); |
| } |
| } |
| |
| if (ts.subsumes(ts.getType(CAS.TYPE_NAME_FLOAT_LIST), aValue.getType())) { |
| float[] floatTarget = new float[length]; |
| FeatureStructure cur = aValue; |
| // We assume to by facing a non-empty element if it has a "head" feature |
| int i = 0; |
| while (cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD) != null) { |
| floatTarget[i] = cur |
| .getFloatValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD)); |
| cur = cur.getFeatureValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_TAIL)); |
| i++; |
| } |
| target = floatTarget; |
| } else if (ts.subsumes(ts.getType(CAS.TYPE_NAME_INTEGER_LIST), aValue.getType())) { |
| int[] intTarget = new int[length]; |
| FeatureStructure cur = aValue; |
| // We assume to by facing a non-empty element if it has a "head" feature |
| int i = 0; |
| while (cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD) != null) { |
| intTarget[i] = cur |
| .getIntValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD)); |
| cur = cur.getFeatureValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_TAIL)); |
| i++; |
| } |
| target = intTarget; |
| } else if (ts.subsumes(ts.getType(CAS.TYPE_NAME_STRING_LIST), aValue.getType())) { |
| String[] stringTarget = new String[length]; |
| FeatureStructure cur = aValue; |
| // We assume to by facing a non-empty element if it has a "head" feature |
| int i = 0; |
| while (cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD) != null) { |
| stringTarget[i] = cur |
| .getStringValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD)); |
| cur = cur.getFeatureValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_TAIL)); |
| i++; |
| } |
| target = stringTarget; |
| } else if (ts.subsumes(ts.getType(CAS.TYPE_NAME_FS_LIST), aValue.getType())) { |
| target = new FeatureStructure[length]; |
| FeatureStructure cur = aValue; |
| // We assume to by facing a non-empty element if it has a "head" feature |
| int i = 0; |
| while (cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD) != null) { |
| Array.set(target, i, cur |
| .getFeatureValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_HEAD))); |
| cur = cur.getFeatureValue(cur.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_TAIL)); |
| i++; |
| } |
| } else { |
| throw new IllegalStateException( |
| "Unsupported list type [" + aValue.getType().getName() + "]"); |
| } |
| } |
| |
| if (length == -1) { |
| throw new IllegalStateException("Unable to extract values"); |
| } |
| |
| List<Object> targetCollection = new ArrayList<>(); |
| for (int i = 0; i < length; i++) { |
| targetCollection.add(Array.get(target, i)); |
| } |
| |
| return targetCollection; |
| } |
| |
| private Set<FeatureStructure> findReachableFeatureStructures( |
| Collection<? extends FeatureStructure> aSeeds) { |
| |
| // Collect the seed points for the reachability tracking. We use a set here instead of a |
| // Deque because we use the contains() method on this queue and it is slow for typical |
| // Deques (e.g. LinkedList) but fast for sets. |
| Set<FeatureStructure> toProcess = new LinkedHashSet<>(aSeeds); |
| |
| // Collect all feature structures that are reachable via the seed points |
| Set<FeatureStructure> seen = new HashSet<>(); |
| while (!toProcess.isEmpty()) { |
| // Poll the next element from the processing queue |
| FeatureStructure fs = toProcess.iterator().next(); |
| toProcess.remove(fs); |
| |
| if (seen.contains(fs)) { |
| continue; |
| } |
| |
| seen.add(fs); |
| |
| if (isMultiValued(fs)) { |
| List<Object> values = multiValuedFeatureStructureToList(fs); |
| if (values != null) { |
| values.stream().filter(v -> v instanceof FeatureStructure).filter(v -> !seen.contains(v)) |
| .forEach(v -> toProcess.add((FeatureStructure) v)); |
| } |
| |
| for (Feature feature : fs.getType().getFeatures()) { |
| if (feature.getRange().isPrimitive()) { |
| continue; |
| } |
| |
| // Check if the feature is excluded |
| if (isExcluded(feature)) { |
| continue; |
| } |
| |
| if (CAS.FEATURE_BASE_NAME_SOFA.equals(feature.getShortName())) { |
| continue; |
| } |
| |
| if (isMultiValuedFeature(fs, feature)) { |
| List<Object> featureValues = multiValuedFeatureStructureToList( |
| fs.getFeatureValue(feature)); |
| |
| if (values != null) { |
| for (Object value : featureValues) { |
| if (value instanceof FeatureStructure && !seen.contains(value)) { |
| toProcess.add((FeatureStructure) value); |
| } |
| } |
| } |
| } else { |
| FeatureStructure value = fs.getFeatureValue(feature); |
| if (value != null && !seen.contains(value)) { |
| toProcess.add(value); |
| } |
| } |
| } |
| } |
| } |
| |
| return seen; |
| } |
| |
| private int featureHash(FeatureStructure aFS) { |
| int hash = 0; |
| for (Feature f : aFS.getType().getFeatures()) { |
| if (f.getRange().isStringOrStringSubtype() || f.getRange().isPrimitive()) { |
| String value = renderStringValue(aFS.getFeatureValueAsString(f)); |
| hash += value != null ? value.hashCode() : 0; |
| continue; |
| } |
| |
| if (f.getRange().isArray()) { |
| if (f.getRange().getComponentType().isStringOrStringSubtype()) { |
| StringArrayFS array = ((StringArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| String v = renderStringValue(array.get(i)); |
| hash += v != null ? v.hashCode() : 0; |
| } |
| } |
| continue; |
| } |
| |
| switch (f.getRange().getComponentType().getName()) { |
| case CAS.TYPE_NAME_BOOLEAN: { |
| BooleanArrayFS array = ((BooleanArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += array.get(i) ? -(i + 1) : (i + 1); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_BYTE: { |
| ByteArrayFS array = ((ByteArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += array.get(i); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_DOUBLE: { |
| DoubleArrayFS array = ((DoubleArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += Double.hashCode(array.get(i)); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_FLOAT: { |
| FloatArrayFS array = ((FloatArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += Float.hashCode(array.get(i)); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_INTEGER: { |
| IntArrayFS array = ((IntArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += array.get(i); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_LONG: { |
| LongArrayFS array = ((LongArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += Long.hashCode(array.get(i)); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_SHORT: { |
| ShortArrayFS array = ((ShortArrayFS) aFS.getFeatureValue(f)); |
| if (array != null) { |
| for (int i = 0; i < array.size(); i++) { |
| hash += array.get(i); |
| } |
| } |
| break; |
| } |
| case CAS.TYPE_NAME_FS_ARRAY: |
| // We cannot really recursively calculate the hash... let's just use the array length |
| if (aFS.getFeatureValue(f) != null) { |
| hash *= ((CommonArrayFS) aFS.getFeatureValue(f)).size() + 1; |
| } |
| break; |
| } |
| } |
| |
| // If we get here, it is a feature structure reference... we cannot really recursively |
| // go into it to calculate a recursive hash... so we just check if the value is non-null |
| hash *= aFS.getFeatureValue(f) != null ? 1 : -1; |
| } |
| |
| return hash; |
| } |
| |
| private static class CloseShieldAppendable implements Appendable, Closeable { |
| |
| private final Appendable delegate; |
| |
| public CloseShieldAppendable(Appendable aDelegate) { |
| |
| delegate = aDelegate; |
| } |
| |
| @Override |
| public Appendable append(CharSequence aSequence) throws IOException { |
| |
| return delegate.append(aSequence); |
| } |
| |
| @Override |
| public Appendable append(CharSequence aSequence, int aStart, int aEnd) throws IOException { |
| |
| return delegate.append(aSequence, aStart, aEnd); |
| } |
| |
| @Override |
| public Appendable append(char aCharacter) throws IOException { |
| |
| return delegate.append(aCharacter); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| |
| // Do not forward close |
| } |
| } |
| |
| private class FSComparator implements Comparator<FeatureStructure> { |
| |
| @Override |
| public int compare(FeatureStructure aFS1, FeatureStructure aFS2) { |
| |
| if (aFS1 == aFS2) { |
| return 0; |
| } |
| |
| // Same name? |
| int nameCmp = aFS2.getType().getName().compareTo(aFS2.getType().getName()); |
| if (nameCmp != 0) { |
| return nameCmp; |
| } |
| |
| // Annotation? Then sort by offsets |
| boolean fs1IsAnnotation = aFS1 instanceof AnnotationFS; |
| boolean fs2IsAnnotation = aFS2 instanceof AnnotationFS; |
| if (fs1IsAnnotation != fs2IsAnnotation) { |
| return -1; |
| } |
| if (fs1IsAnnotation && fs2IsAnnotation) { |
| AnnotationFS ann1 = (AnnotationFS) aFS1; |
| AnnotationFS ann2 = (AnnotationFS) aFS2; |
| |
| // Ascending by begin |
| int beginCmp = ann1.getBegin() - ann2.getBegin(); |
| if (beginCmp != 0) { |
| return beginCmp; |
| } |
| |
| // Descending by end |
| int endCmp = ann2.getEnd() - ann1.getEnd(); |
| if (endCmp != 0) { |
| return endCmp; |
| } |
| } |
| |
| // Ok, so let's calculate a hash over the features then... |
| int fh1 = featureHash(aFS1); |
| int fh2 = featureHash(aFS2); |
| if (fh1 < fh2) { |
| return -1; |
| } |
| if (fh1 > fh2) { |
| return 1; |
| } |
| return 0; |
| } |
| } |
| |
| private class Anchor { |
| |
| private final String stringValue; |
| private final int disambiguationId; |
| |
| public Anchor(FeatureStructure aFS, boolean aIndexed, |
| Map<String, Integer> aDisambiguationByPrefix) { |
| |
| StringBuilder anchor = new StringBuilder(); |
| |
| anchor.append(aFS.getType().getShortName()); |
| |
| // Special handling for AnnotationFS |
| if (aFS instanceof AnnotationFS) { |
| AnnotationFS ann = (AnnotationFS) aFS; |
| anchor.append("["); |
| anchor.append(ann.getBegin()); |
| anchor.append("-"); |
| anchor.append(ann.getEnd()); |
| anchor.append("]"); |
| } |
| |
| if (markIndexed && aIndexed) { |
| anchor.append("*"); |
| } |
| |
| // Special handling for AnnotationBase |
| if (markView && aFS instanceof AnnotationBase) { |
| AnnotationBase annBase = (AnnotationBase) aFS; |
| anchor.append('@'); |
| anchor.append(String.valueOf(annBase.getSofa().getSofaID())); |
| } |
| |
| // If we have the same anchor multiple times, then we need to disambiguate |
| String prefix = anchor.toString(); |
| disambiguationId = aDisambiguationByPrefix.computeIfAbsent(prefix, key -> 0); |
| |
| if (uniqueAnchors && disambiguationId > 0) { |
| anchor.append("("); |
| anchor.append(disambiguationId); |
| anchor.append(")"); |
| } |
| |
| aDisambiguationByPrefix.put(prefix, disambiguationId + 1); |
| |
| stringValue = anchor.toString(); |
| } |
| |
| @Override |
| public String toString() { |
| |
| return stringValue; |
| } |
| } |
| |
| public static String toComparableString(CAS aCas) { |
| try (StringWriter sourceCasRepresentationBuffer = new StringWriter()) { |
| new CasToComparableText(aCas).write(sourceCasRepresentationBuffer); |
| return sourceCasRepresentationBuffer.toString(); |
| } catch (IOException e) { |
| // This should normally never happen, so it should be ok to not throw a checked exception here |
| throw new IllegalStateException("Unable to serialize CAS", e); |
| } |
| } |
| } |