blob: ed409f10a116b4d5e95fe0b9d19f900cc00ad687 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.runtime.functions.index;
import org.apache.hyracks.data.std.api.IPointable;
import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.vxquery.datamodel.accessors.TaggedValuePointable;
import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder;
import org.apache.vxquery.datamodel.values.ValueTag;
import org.apache.vxquery.exceptions.ErrorCode;
import org.apache.vxquery.exceptions.SystemException;
import org.apache.vxquery.index.IndexDocumentBuilder;
import org.apache.vxquery.runtime.functions.index.updateIndex.Constants;
import org.apache.vxquery.runtime.functions.index.updateIndex.MetaFileUtil;
import org.apache.vxquery.runtime.functions.index.updateIndex.XmlMetadata;
import org.apache.vxquery.runtime.functions.util.FunctionHelper;
import org.apache.vxquery.xmlparser.ITreeNodeIdProvider;
import org.apache.vxquery.xmlparser.XMLParser;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Paths;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.concurrent.ConcurrentHashMap;
public class IndexConstructorUtil {
static boolean isMetaFilePresent = false;
static MetaFileUtil metaFileUtil;
static ConcurrentHashMap<String, XmlMetadata> metadataMap = new ConcurrentHashMap<>();
public static void evaluate(TaggedValuePointable[] args, IPointable result, UTF8StringPointable stringp,
ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb,
ArrayBackedValueStorage abvs, ITreeNodeIdProvider nodeIdProvider,
ArrayBackedValueStorage abvsFileNode, TaggedValuePointable nodep,
boolean isElementPath, String nodeId) throws SystemException {
String collectionFolder;
String indexFolder;
TaggedValuePointable collectionTVP = args[0];
TaggedValuePointable indexTVP = args[1];
if (collectionTVP.getTag() != ValueTag.XS_STRING_TAG || indexTVP.getTag() != ValueTag.XS_STRING_TAG) {
throw new SystemException(ErrorCode.FORG0006);
}
try {
// Get the list of files.
collectionTVP.getValue(stringp);
bbis.setByteBuffer(ByteBuffer.wrap(Arrays.copyOfRange(stringp.getByteArray(), stringp.getStartOffset(),
stringp.getLength() + stringp.getStartOffset())), 0);
collectionFolder = di.readUTF();
// Get the index folder
indexTVP.getValue(stringp);
bbis.setByteBuffer(ByteBuffer.wrap(Arrays.copyOfRange(stringp.getByteArray(), stringp.getStartOffset(),
stringp.getLength() + stringp.getStartOffset())), 0);
indexFolder = di.readUTF();
metaFileUtil = MetaFileUtil.create(indexFolder);
isMetaFilePresent = metaFileUtil.isMetaFilePresent();
} catch (IOException e) {
throw new SystemException(ErrorCode.SYSE0001, e);
}
File collectionDirectory = new File(collectionFolder);
if (!collectionDirectory.exists()) {
throw new RuntimeException("The collection directory (" + collectionFolder + ") does not exist.");
}
try {
abvs.reset();
sb.reset(abvs);
Directory dir = FSDirectory.open(Paths.get(indexFolder));
Analyzer analyzer = new CaseSensitiveAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
// Create will overwrite the index everytime
iwc.setOpenMode(OpenMode.CREATE);
//Create an index writer
IndexWriter writer = new IndexWriter(dir, iwc);
//Add files to index
indexXmlFiles(collectionDirectory, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di,
nodeId);
if (!isMetaFilePresent) {
// Add collection information to the map.
XmlMetadata data = new XmlMetadata();
data.setPath(collectionFolder);
metadataMap.put(Constants.COLLECTION_ENTRY, data);
// Write metadata map to a file.
metaFileUtil.writeMetaFile(metadataMap);
}
//This makes write slower but search faster.
writer.forceMerge(1);
writer.close();
sb.finish();
result.set(abvs);
} catch (IOException e) {
throw new SystemException(ErrorCode.SYSE0001, e);
}
}
/*This function goes recursively one file at a time. First it turns the file into an ABVS document node, then
* it indexes that document node.
*/
public static void indexXmlFiles(File collectionDirectory, IndexWriter writer, boolean isElementPath,
TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode,
ITreeNodeIdProvider nodeIdProvider, SequenceBuilder sb,
ByteBufferInputStream bbis, DataInputStream di, String nodeId)
throws SystemException, IOException {
for (File file : collectionDirectory.listFiles()) {
if (readableXmlFile(file.getPath())) {
abvsFileNode.reset();
IndexDocumentBuilder ibuilder = getIndexBuilder(file, writer, nodep, abvsFileNode, nodeIdProvider,
bbis, di, nodeId);
ibuilder.printStart();
if (!isMetaFilePresent) {
XmlMetadata xmlMetadata = new XmlMetadata();
xmlMetadata.setPath(file.getCanonicalPath());
xmlMetadata.setFileName(file.getName());
try {
xmlMetadata.setMd5(metaFileUtil.generateMD5(file));
} catch (NoSuchAlgorithmException e) {
throw new SystemException(ErrorCode.SYSE0001, e);
}
metadataMap.put(file.getCanonicalPath(), xmlMetadata);
}
} else if (file.isDirectory()) {
// Consider all XML file in sub directories.
indexXmlFiles(file, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, nodeId);
}
}
}
public static boolean readableXmlFile(String path) {
return (path.toLowerCase().endsWith(".xml") || path.toLowerCase().endsWith(".xml.gz"));
}
/**
* Separated from create index method so that it could be used as a helper function in IndexUpdater
*/
public static IndexDocumentBuilder getIndexBuilder(File file, IndexWriter writer,
TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode,
ITreeNodeIdProvider nodeIdProvider,
ByteBufferInputStream bbis, DataInputStream di, String nodeId)
throws IOException {
//Get the document node
XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId);
FunctionHelper.readInDocFromString(file.getPath(), bbis, di, abvsFileNode, parser);
nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength());
//Add the document to the index
//Creates one lucene doc per file
return new IndexDocumentBuilder(nodep, writer, file.getCanonicalPath());
}
}