blob: 2f22ade2eedf0eb0783de0375355e2da9699cbd0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.runtime.functions.index;
import org.apache.hyracks.data.std.api.IPointable;
import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.vxquery.datamodel.accessors.TaggedValuePointable;
import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder;
import org.apache.vxquery.exceptions.ErrorCode;
import org.apache.vxquery.exceptions.SystemException;
import org.apache.vxquery.index.IndexDocumentBuilder;
import org.apache.vxquery.runtime.functions.index.updateIndex.MetaFileUtil;
import org.apache.vxquery.runtime.functions.index.updateIndex.XmlMetadata;
import org.apache.vxquery.runtime.functions.util.FunctionHelper;
import org.apache.vxquery.xmlparser.IParser;
import org.apache.vxquery.xmlparser.ITreeNodeIdProvider;
import org.apache.vxquery.xmlparser.XMLParser;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Builds a Lucene index over the XML files found (recursively) under a collection directory.
 * When no metadata file is present for the index, per-file metadata (canonical path, file name,
 * last-modified timestamp and MD5) is collected while indexing and written out afterwards.
 */
public class IndexConstructorUtil {
    /** Value of {@code metaFileUtil.isMetaFilePresent()} captured at the start of {@link #evaluate}. */
    boolean isMetaFilePresent = false;
    /** Assigned in {@link #evaluate}; used by {@link #indexXmlFiles} to compute MD5 sums. */
    MetaFileUtil metaFileUtil;
    /**
     * Metadata collected during indexing, keyed by canonical file path.
     * NOTE(review): this map is never cleared here, so entries survive across repeated
     * {@link #evaluate} calls on the same instance -- confirm callers use a fresh instance.
     */
    ConcurrentHashMap<String, XmlMetadata> metadataMap = new ConcurrentHashMap<>();

    /**
     * Creates (overwriting any existing one) the index in {@code indexFolder} from the XML files
     * under {@code collectioFolder} and stores the finished sequence held by {@code abvs} in
     * {@code result}.
     *
     * @param collectioFolder path of the collection directory to index
     * @param indexFolder path of the directory that receives the Lucene index
     * @param result pointable that is set to {@code abvs} once the sequence is finished
     * @param stringp not used by this method
     * @param bbis stream passed through to the document reader
     * @param di stream passed through to the document reader
     * @param sb sequence builder; reset onto {@code abvs} before indexing and finished afterwards
     * @param abvs storage backing the result sequence
     * @param nodeIdProvider node id provider handed to the XML parser
     * @param abvsFileNode scratch storage that receives each parsed document node
     * @param nodep pointable set to each parsed document node
     * @param isElementPath passed through to {@link #indexXmlFiles}, which does not read it
     * @param nodeId node id handed to the XML parser
     * @throws RuntimeException if the collection directory does not exist
     * @throws SystemException if an {@link IOException} occurs while building the index
     */
    public void evaluate(String collectioFolder, String indexFolder, IPointable result, UTF8StringPointable
            stringp, ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb, ArrayBackedValueStorage abvs,
            ITreeNodeIdProvider nodeIdProvider, ArrayBackedValueStorage abvsFileNode, TaggedValuePointable nodep,
            boolean isElementPath, String nodeId) throws SystemException {
        metaFileUtil = new MetaFileUtil(indexFolder);
        isMetaFilePresent = metaFileUtil.isMetaFilePresent();
        metaFileUtil.setCollection(collectioFolder);

        File collectionDirectory = new File(collectioFolder);
        if (!collectionDirectory.exists()) {
            throw new RuntimeException("The collection directory (" + collectioFolder + ") does not exist.");
        }

        try {
            abvs.reset();
            sb.reset(abvs);

            Analyzer analyzer = new CaseSensitiveAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            // CREATE overwrites the index every time.
            iwc.setOpenMode(OpenMode.CREATE);

            // try-with-resources guarantees the writer (and with it the index write lock) and the
            // directory are released even when indexing or metadata writing fails.
            try (Directory dir = FSDirectory.open(Paths.get(indexFolder));
                    IndexWriter writer = new IndexWriter(dir, iwc)) {
                // Add files to index.
                indexXmlFiles(collectionDirectory, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb,
                        bbis, di, nodeId);

                if (!isMetaFilePresent) {
                    // Write metadata map to a file.
                    metaFileUtil.updateMetadataMap(metadataMap, indexFolder);
                    metaFileUtil.writeMetadataToFile();
                }

                // This makes write slower but search faster.
                writer.forceMerge(1);
            }

            sb.finish();
            result.set(abvs);
        } catch (IOException e) {
            throw new SystemException(ErrorCode.SYSE0001, e);
        }
    }

    /**
     * Walks {@code collectionDirectory} recursively, one entry at a time. Each readable XML file is
     * first turned into a document node in {@code abvsFileNode} and then indexed through
     * {@code writer}. Sub-directories are descended into; other entries are ignored.
     * When {@link #isMetaFilePresent} is {@code false}, metadata for every indexed file is added to
     * {@link #metadataMap}.
     *
     * @param collectionDirectory directory whose entries are indexed
     * @param writer index writer receiving the documents
     * @param isElementPath only forwarded to recursive calls
     * @param nodep pointable set to each parsed document node
     * @param abvsFileNode scratch storage, reset before each file is parsed
     * @param nodeIdProvider node id provider handed to the XML parser
     * @param sb only forwarded to recursive calls
     * @param bbis stream passed through to the document reader
     * @param di stream passed through to the document reader
     * @param nodeId node id handed to the XML parser
     * @throws SystemException propagated from the helpers invoked while indexing
     * @throws IOException if the directory cannot be listed or a file cannot be read
     */
    public void indexXmlFiles(File collectionDirectory, IndexWriter writer, boolean isElementPath,
            TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider,
            SequenceBuilder sb, ByteBufferInputStream bbis, DataInputStream di, String nodeId)
            throws SystemException, IOException {
        // File.listFiles() returns null when the path is not a directory or an I/O error occurs.
        File[] entries = collectionDirectory.listFiles();
        if (entries == null) {
            throw new IOException("Unable to list the contents of " + collectionDirectory.getPath() + ".");
        }

        SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy, HH:mm:ss");
        for (File file : entries) {
            if (file.isDirectory()) {
                // Consider all XML files in sub directories. Checked before the suffix test so that a
                // directory whose name ends in ".xml" is traversed rather than parsed as a document.
                indexXmlFiles(file, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di,
                        nodeId);
            } else if (readableXmlFile(file.getPath())) {
                abvsFileNode.reset();
                IndexDocumentBuilder ibuilder = getIndexBuilder(file, writer, nodep, abvsFileNode, nodeIdProvider,
                        bbis, di, nodeId);
                ibuilder.printStart();

                if (!isMetaFilePresent) {
                    String canonicalPath = file.getCanonicalPath();
                    XmlMetadata xmlMetadata = new XmlMetadata();
                    xmlMetadata.setPath(canonicalPath);
                    xmlMetadata.setFileName(file.getName());
                    xmlMetadata.setLastModified(sdf.format(file.lastModified()));
                    xmlMetadata.setMd5(metaFileUtil.generateMD5(file));
                    metadataMap.put(canonicalPath, xmlMetadata);
                }
            }
        }
    }

    /**
     * Tells whether a path names an XML file that can be indexed.
     *
     * @param path path to test
     * @return {@code true} if the path ends, ignoring case, in {@code .xml} or {@code .xml.gz}
     */
    public boolean readableXmlFile(String path) {
        String lowerCasePath = path.toLowerCase();
        return lowerCasePath.endsWith(".xml") || lowerCasePath.endsWith(".xml.gz");
    }

    /**
     * Parses {@code file} into a document node and returns a builder that indexes that node.
     * Separated from the create index method so that it could be used as a helper function in
     * IndexUpdater.
     *
     * @param file XML file to parse
     * @param writer index writer handed to the returned builder
     * @param nodep pointable set to the parsed document node
     * @param abvsFileNode storage that receives the parsed document node
     * @param nodeIdProvider node id provider handed to the XML parser
     * @param bbis stream passed through to the document reader
     * @param di stream passed through to the document reader
     * @param nodeId node id handed to the XML parser
     * @return a builder for the parsed document, identified by the file's canonical path
     * @throws IOException if the file cannot be read or its canonical path cannot be resolved
     */
    public IndexDocumentBuilder getIndexBuilder(File file, IndexWriter writer, TaggedValuePointable nodep,
            ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider, ByteBufferInputStream bbis,
            DataInputStream di, String nodeId) throws IOException {
        // Get the document node.
        IParser parser = new XMLParser(false, nodeIdProvider, nodeId);
        FunctionHelper.readInDocFromString(file.getPath(), bbis, di, abvsFileNode, parser);
        nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength());

        // Add the document to the index; one Lucene document is created per file.
        return new IndexDocumentBuilder(nodep, writer, file.getCanonicalPath());
    }
}