| /* |
| * Licensed under the Apache License, Version 2.0 (the "License"); you |
| * may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.sdap.mudrod.ssearch.ranking; |
| |
| import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; |
| import org.apache.sdap.mudrod.driver.ESDriver; |
| import org.apache.sdap.mudrod.driver.SparkDriver; |
| import org.apache.sdap.mudrod.main.MudrodConstants; |
| import org.elasticsearch.action.index.IndexRequest; |
| import org.elasticsearch.common.xcontent.XContentBuilder; |
| import org.elasticsearch.index.query.QueryBuilders; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.nio.charset.StandardCharsets; |
| import java.util.Properties; |
| |
| import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; |
| |
| /** |
| * Supports the ability to importing training set into Elasticsearch |
| */ |
| public class TrainingImporter extends MudrodAbstract { |
| /** |
| * |
| */ |
| private static final long serialVersionUID = 1L; |
| |
| private static final Logger LOG = LoggerFactory.getLogger(TrainingImporter.class); |
| |
| public TrainingImporter(Properties props, ESDriver es, SparkDriver spark) { |
| super(props, es, spark); |
| es.deleteAllByQuery(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking", QueryBuilders.matchAllQuery()); |
| addMapping(); |
| } |
| |
| /** |
| * Method of adding mapping to training set type |
| */ |
| private void addMapping() { |
| XContentBuilder Mapping; |
| try { |
| Mapping = jsonBuilder().startObject().startObject("trainingranking").startObject("properties").startObject("query").field("type", "string").field("index", "not_analyzed").endObject() |
| .startObject("dataID").field("type", "string").field("index", "not_analyzed").endObject().startObject("label").field("type", "string").field("index", "not_analyzed").endObject().endObject() |
| .endObject().endObject(); |
| |
| es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType("trainingranking").setSource(Mapping).execute().actionGet(); |
| } catch (IOException e) { |
| LOG.error("Adding mapping to training set type is failed!", e); |
| } |
| } |
| |
| /** |
| * Method of importing training set in to Elasticsearch |
| * |
| * @param dataFolder the path to the traing set |
| * @throws IOException IOException |
| */ |
| public void importTrainingSet(String dataFolder) throws IOException { |
| es.createBulkProcessor(); |
| |
| File[] files = new File(dataFolder).listFiles(); |
| for (File file : files) { |
| try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file.getAbsolutePath()), StandardCharsets.UTF_8))) { |
| br.readLine(); |
| String line = br.readLine(); |
| while (line != null) { |
| String[] list = line.split(","); |
| String query = file.getName().replace(".csv", ""); |
| if (list.length > 0) { |
| IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking") |
| .source(jsonBuilder().startObject().field("query", query).field("dataID", list[0]).field("label", list[list.length - 1]).endObject()); |
| es.getBulkProcessor().add(ir); |
| } |
| line = br.readLine(); |
| } |
| } |
| } |
| es.destroyBulkProcessor(); |
| } |
| } |