blob: 2c257525480044ec5c2bf750126c7bca098df46d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.runtime.functions.index;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hyracks.data.std.api.IPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.vxquery.datamodel.accessors.TaggedValuePointable;
import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder;
import org.apache.vxquery.exceptions.ErrorCode;
import org.apache.vxquery.exceptions.SystemException;
import org.apache.vxquery.index.IndexDocumentBuilder;
import org.apache.vxquery.runtime.functions.index.update.MetaFileUtil;
import org.apache.vxquery.runtime.functions.index.update.XmlMetadata;
import org.apache.vxquery.runtime.functions.util.FunctionHelper;
import org.apache.vxquery.xmlparser.IParser;
import org.apache.vxquery.xmlparser.ITreeNodeIdProvider;
import org.apache.vxquery.xmlparser.XMLParser;
public class IndexConstructorUtil {
private final TaggedValuePointable nodep = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable();
private final SequenceBuilder sb = new SequenceBuilder();
private boolean isMetaFilePresent = false;
private MetaFileUtil metaFileUtil;
private ConcurrentHashMap<String, XmlMetadata> metadataMap = new ConcurrentHashMap<>();
public void evaluate(String collectioFolder, String indexFolder, IPointable result, ArrayBackedValueStorage abvs,
ITreeNodeIdProvider nodeIdProvider, ArrayBackedValueStorage abvsFileNode, boolean isElementPath,
String nodeId) throws IOException {
metaFileUtil = new MetaFileUtil(indexFolder);
isMetaFilePresent = metaFileUtil.isMetaFilePresent();
metaFileUtil.setCollection(collectioFolder);
File collectionDirectory = new File(collectioFolder);
if (!collectionDirectory.exists()) {
throw new IOException("The collection directory (" + collectioFolder + ") does not exist.");
}
try {
abvs.reset();
sb.reset(abvs);
Directory dir = FSDirectory.open(Paths.get(indexFolder));
Analyzer analyzer = new CaseSensitiveAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
// Create will overwrite the index everytime
iwc.setOpenMode(OpenMode.CREATE);
//Create an index writer
IndexWriter writer = new IndexWriter(dir, iwc);
//Add files to index
indexXmlFiles(collectionDirectory, writer, isElementPath, abvsFileNode, nodeIdProvider, sb, nodeId);
if (!isMetaFilePresent) {
// Write metadata map to a file.
metaFileUtil.updateMetadataMap(metadataMap, indexFolder);
metaFileUtil.writeMetadataToFile();
}
//This makes write slower but search faster.
writer.forceMerge(1);
writer.close();
sb.finish();
result.set(abvs);
} catch (IOException e) {
throw new SystemException(ErrorCode.SYSE0001, e);
}
}
/*
* This function goes recursively one file at a time. First it turns the file into an ABVS document node, then
* it indexes that document node.
*/
public void indexXmlFiles(File collectionDirectory, IndexWriter writer, boolean isElementPath,
ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider, SequenceBuilder sb, String nodeId)
throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy, HH:mm:ss");
for (File file : collectionDirectory.listFiles()) {
if (readableXmlFile(file.getPath())) {
abvsFileNode.reset();
IndexDocumentBuilder ibuilder = getIndexBuilder(file, writer, abvsFileNode, nodeIdProvider, nodeId);
ibuilder.printStart();
if (!isMetaFilePresent) {
XmlMetadata xmlMetadata = new XmlMetadata();
xmlMetadata.setPath(file.getCanonicalPath());
xmlMetadata.setFileName(file.getName());
xmlMetadata.setLastModified(sdf.format(file.lastModified()));
xmlMetadata.setMd5(metaFileUtil.generateMD5(file));
metadataMap.put(file.getCanonicalPath(), xmlMetadata);
}
} else if (file.isDirectory()) {
// Consider all XML file in sub directories.
indexXmlFiles(file, writer, isElementPath, abvsFileNode, nodeIdProvider, sb, nodeId);
}
}
}
public boolean readableXmlFile(String path) {
return path.toLowerCase().endsWith(".xml") || path.toLowerCase().endsWith(".xml.gz");
}
public IndexDocumentBuilder getIndexBuilder(File file, IndexWriter writer, ArrayBackedValueStorage abvsFileNode,
ITreeNodeIdProvider nodeIdProvider, String nodeId) throws IOException {
//Get the document node
IParser parser = new XMLParser(false, nodeIdProvider, nodeId);
FunctionHelper.readInDocFromString(file.getPath(), abvsFileNode, parser);
nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength());
//Add the document to the index
//Creates one lucene doc per file
return new IndexDocumentBuilder(nodep, writer, file.getCanonicalPath());
}
}