| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.vxquery.runtime.functions.index; |
| |
| import org.apache.hyracks.data.std.api.IPointable; |
| import org.apache.hyracks.data.std.primitive.UTF8StringPointable; |
| import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; |
| import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; |
| import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder; |
| import org.apache.vxquery.exceptions.ErrorCode; |
| import org.apache.vxquery.exceptions.SystemException; |
| import org.apache.vxquery.index.IndexDocumentBuilder; |
| import org.apache.vxquery.runtime.functions.index.updateIndex.MetaFileUtil; |
| import org.apache.vxquery.runtime.functions.index.updateIndex.XmlMetadata; |
| import org.apache.vxquery.runtime.functions.util.FunctionHelper; |
| import org.apache.vxquery.xmlparser.IParser; |
| import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; |
| import org.apache.vxquery.xmlparser.XMLParser; |
| |
| import java.io.DataInputStream; |
| import java.io.File; |
| import java.io.IOException; |
| import java.nio.file.Paths; |
| import java.text.SimpleDateFormat; |
| import java.util.concurrent.ConcurrentHashMap; |
| |
| public class IndexConstructorUtil { |
| boolean isMetaFilePresent = false; |
| MetaFileUtil metaFileUtil; |
| ConcurrentHashMap<String, XmlMetadata> metadataMap = new ConcurrentHashMap<>(); |
| |
| public void evaluate(String collectioFolder, String indexFolder, IPointable result, UTF8StringPointable |
| stringp, ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb, ArrayBackedValueStorage abvs, |
| ITreeNodeIdProvider nodeIdProvider, ArrayBackedValueStorage abvsFileNode, TaggedValuePointable nodep, |
| boolean isElementPath, String nodeId) throws SystemException { |
| |
| metaFileUtil = new MetaFileUtil(indexFolder); |
| // metaFileUtil = .create(indexFolder); |
| isMetaFilePresent = metaFileUtil.isMetaFilePresent(); |
| metaFileUtil.setCollection(collectioFolder); |
| |
| File collectionDirectory = new File(collectioFolder); |
| if (!collectionDirectory.exists()) { |
| throw new RuntimeException("The collection directory (" + collectioFolder + ") does not exist."); |
| } |
| |
| try { |
| abvs.reset(); |
| sb.reset(abvs); |
| |
| Directory dir = FSDirectory.open(Paths.get(indexFolder)); |
| Analyzer analyzer = new CaseSensitiveAnalyzer(); |
| IndexWriterConfig iwc = new IndexWriterConfig(analyzer); |
| |
| // Create will overwrite the index everytime |
| iwc.setOpenMode(OpenMode.CREATE); |
| |
| //Create an index writer |
| IndexWriter writer = new IndexWriter(dir, iwc); |
| |
| //Add files to index |
| indexXmlFiles(collectionDirectory, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, |
| nodeId); |
| |
| if (!isMetaFilePresent) { |
| // Write metadata map to a file. |
| metaFileUtil.updateMetadataMap(metadataMap, indexFolder); |
| metaFileUtil.writeMetadataToFile(); |
| } |
| |
| //This makes write slower but search faster. |
| writer.forceMerge(1); |
| |
| writer.close(); |
| |
| sb.finish(); |
| result.set(abvs); |
| } catch (IOException e) { |
| throw new SystemException(ErrorCode.SYSE0001, e); |
| } |
| } |
| |
| /*This function goes recursively one file at a time. First it turns the file into an ABVS document node, then |
| * it indexes that document node. |
| */ |
| public void indexXmlFiles(File collectionDirectory, IndexWriter writer, boolean isElementPath, |
| TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider, |
| SequenceBuilder sb, ByteBufferInputStream bbis, DataInputStream di, String nodeId) |
| throws SystemException, IOException { |
| |
| SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy, HH:mm:ss"); |
| |
| for (File file : collectionDirectory.listFiles()) { |
| |
| if (readableXmlFile(file.getPath())) { |
| abvsFileNode.reset(); |
| |
| IndexDocumentBuilder ibuilder = getIndexBuilder(file, writer, nodep, abvsFileNode, nodeIdProvider, bbis, |
| di, nodeId); |
| |
| ibuilder.printStart(); |
| if (!isMetaFilePresent) { |
| XmlMetadata xmlMetadata = new XmlMetadata(); |
| xmlMetadata.setPath(file.getCanonicalPath()); |
| xmlMetadata.setFileName(file.getName()); |
| xmlMetadata.setLastModified(sdf.format(file.lastModified())); |
| xmlMetadata.setMd5(metaFileUtil.generateMD5(file)); |
| metadataMap.put(file.getCanonicalPath(), xmlMetadata); |
| } |
| |
| } else if (file.isDirectory()) { |
| // Consider all XML file in sub directories. |
| indexXmlFiles(file, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, nodeId); |
| } |
| } |
| } |
| |
| public boolean readableXmlFile(String path) { |
| return (path.toLowerCase().endsWith(".xml") || path.toLowerCase().endsWith(".xml.gz")); |
| } |
| |
| public IndexDocumentBuilder getIndexBuilder(File file, IndexWriter writer, TaggedValuePointable nodep, |
| ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider, ByteBufferInputStream bbis, |
| DataInputStream di, String nodeId) throws IOException { |
| |
| //Get the document node |
| IParser parser = new XMLParser(false, nodeIdProvider, nodeId); |
| FunctionHelper.readInDocFromString(file.getPath(), bbis, di, abvsFileNode, parser); |
| |
| nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength()); |
| |
| //Add the document to the index |
| //Creates one lucene doc per file |
| return new IndexDocumentBuilder(nodep, writer, file.getCanonicalPath()); |
| } |
| } |