| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.jackrabbit.oak.plugins.tika; |
| |
| import com.beust.jcommander.internal.Maps; |
| import com.google.common.collect.FluentIterable; |
| import com.google.common.collect.Lists; |
| import com.google.common.collect.Sets; |
| import com.google.common.io.ByteSource; |
| import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter; |
| import org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory; |
| import org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer; |
| import org.apache.jackrabbit.oak.plugins.tika.TextPopulator.PopulatorStats; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.util.Version; |
| import org.jetbrains.annotations.NotNull; |
| import org.junit.Before; |
| import org.junit.Rule; |
| import org.junit.Test; |
| import org.junit.rules.TemporaryFolder; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.Arrays; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import static com.google.common.base.Charsets.UTF_8; |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertTrue; |
| |
| public class TextPopulatorTest { |
| @Rule |
| public TemporaryFolder temporaryFolder = new TemporaryFolder(); |
| |
| private File indexDir = null; |
| private File csv = null; |
| private FakeTextWriter textWriter = new FakeTextWriter(); |
| private PopulatorStats stats = new PopulatorStats(); |
| private TextPopulator textPopulator = new TextPopulator(textWriter); |
| |
| @Before |
| public void setup() throws Exception { |
| indexDir = temporaryFolder.newFolder("index-dump"); |
| csv = temporaryFolder.newFile("blobs.csv"); |
| |
| textPopulator.setStats(stats); |
| |
| setupIndexData(); |
| } |
| |
| private void setupIndexData() throws Exception { |
| Map<String, String> dataMap = Maps.newHashMap(); |
| dataMap.put("/sentence", "some sentence."); |
| dataMap.put("/para", "some sentence.\nAnd more sentence after a new line"); |
| dataMap.put("/error", TextPopulator.ERROR_TEXT); |
| dataMap.put("/null", null); |
| dataMap.put("/empty", ""); |
| dataMap.put("/untrimmed-empty", " "); |
| dataMap.put("/untrimmed", " untrimmed "); |
| |
| FSDirectory directory = FSDirectory.open(indexDir); |
| IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new OakAnalyzer(Version.LUCENE_47)); |
| try (IndexWriter writer = new IndexWriter(directory, config)) { |
| for (Map.Entry<String, String> data : dataMap.entrySet()) { |
| writer.addDocument(createLuceneDocument(data.getKey(), data.getValue())); |
| } |
| |
| // add document with multiple :fulltext |
| writer.addDocument(createLuceneDocument("/multi", "value1", "value2")); |
| } |
| } |
| |
| private void setupCSV(String ... paths) throws IOException { |
| BinaryResourceProvider brp = new FakeBinaryResourceProvider(paths); |
| CSVFileGenerator generator = new CSVFileGenerator(csv); |
| generator.generate(brp.getBinaries("/")); |
| } |
| |
| private List<Field> createLuceneDocument(@NotNull String path, String ... values) { |
| List<Field> fields = Lists.newArrayList(); |
| for (String value : values) { |
| if (value != null) { |
| fields.add(FieldFactory.newFulltextField(value, true)); |
| } |
| } |
| fields.add(FieldFactory.newPathField(path)); |
| return fields; |
| } |
| |
| @Test |
| public void simpleTest() throws Exception { |
| setupCSV("/sentence", "/para"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Incorrect binaries processed", 2, stats.processed); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated call for already processed stuff shouldn't process anything more", |
| 2, stats.ignored); |
| |
| assertConsistentStatsAndWriter(); |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void untrimmedText() throws Exception { |
| setupCSV("/untrimmed"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Store generation didn't trim data", "untrimmed", |
| textWriter.data.get(FakeBinaryResourceProvider.getBlobId("/untrimmed"))); |
| |
| assertConsistentStatsAndWriter(); |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void indexedError() throws Exception { |
| setupCSV("/error"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Indexed data reporting errored extraction not marked as error", |
| 1, stats.errored); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated run for indexed error shouldn't get processed again", 1, stats.ignored); |
| |
| assertConsistentStatsAndWriter(); |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void indexedEmpty() throws Exception { |
| setupCSV("/empty"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Indexed data for empty extraction not marked as empty", |
| 1, stats.empty); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated run for empty extraction shouldn't get processed again", 1, stats.ignored); |
| |
| assertConsistentStatsAndWriter(); |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void indexedUntrimmedEmpty() throws Exception { |
| setupCSV("/untrimmed-empty"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Indexed data for untrimmed empty extraction not marked as empty", |
| 1, stats.empty); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated run for untrimmed empty extraction shouldn't get processed again", |
| 1, stats.ignored); |
| |
| assertConsistentStatsAndWriter(); |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void multiFTField() throws Exception { |
| setupCSV("/multi"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Multi FT field in a doc not marked as error", |
| 1, stats.errored); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated run for multi FT error should get processed again", 0, stats.ignored); |
| |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void indexHasDocumentButNotData() throws Exception { |
| setupCSV("/null"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("No FT field in a doc not marked as error", |
| 1, stats.errored); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated run for no FT error should get processed again", 0, stats.ignored); |
| |
| assertStatsInvariants(); |
| } |
| |
| @Test |
| public void indexDoesNotHaveDocument() throws Exception { |
| setupCSV("/somethingRandom"); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("No indexed doc not marked as error", |
| 1, stats.errored); |
| |
| textPopulator.populate(csv, indexDir); |
| assertEquals("Repeated run for no indexed doc error should get processed again", 0, stats.ignored); |
| |
| assertStatsInvariants(); |
| } |
| |
| private void assertConsistentStatsAndWriter() { |
| assertEquals("Num blobs processed by text writer didn't process same not same as reported in stats", |
| textWriter.processed.size(), stats.processed); |
| |
| } |
| |
| private void assertStatsInvariants() { |
| assertTrue("Read (" + stats.read + ") !=" + |
| " Processed (" + stats.processed + ") + Ignored (" + stats.ignored + ")", |
| stats.read == stats.processed + stats.ignored); |
| |
| assertTrue("Processed (" + stats.processed + ") !=" + |
| " Empty (" + stats.empty + ") + Errored (" + stats.errored + ") + Parsed (" + stats.parsed + ")", |
| stats.processed == stats.empty + stats.errored + stats.parsed); |
| } |
| |
| private static class FakeTextWriter implements TextWriter { |
| final Set<String> processed = Sets.newHashSet(); |
| final Map<String, String> data = Maps.newHashMap(); |
| |
| @Override |
| public void write(@NotNull String blobId, @NotNull String text) { |
| processed.add(blobId); |
| data.put(blobId, text); |
| } |
| |
| @Override |
| public void markEmpty(String blobId) { |
| processed.add(blobId); |
| } |
| |
| @Override |
| public void markError(String blobId) { |
| processed.add(blobId); |
| } |
| |
| @Override |
| public boolean isProcessed(String blobId) { |
| return processed.contains(blobId); |
| } |
| } |
| |
| private static class FakeBinaryResourceProvider implements BinaryResourceProvider { |
| private List<BinaryResource> binaries = Lists.newArrayList(); |
| |
| FakeBinaryResourceProvider(String ... paths) { |
| for (String path : paths) { |
| binaries.add(new BinaryResource(new StringByteSource(""), null, null, path, getBlobId(path))); |
| } |
| } |
| |
| static String getBlobId(String path) { |
| return path + ":" + path; |
| } |
| |
| @Override |
| public FluentIterable<BinaryResource> getBinaries(String path) { |
| return new FluentIterable<BinaryResource>() { |
| @NotNull |
| @Override |
| public Iterator<BinaryResource> iterator() { |
| return binaries.iterator(); |
| } |
| }; |
| } |
| } |
| |
| private static class StringByteSource extends ByteSource { |
| private final String data; |
| |
| StringByteSource(String data) { |
| this.data = data; |
| } |
| |
| @Override |
| public InputStream openStream() { |
| return new ByteArrayInputStream(data.getBytes(UTF_8)); |
| } |
| } |
| } |