[CARBONDATA-2172][Lucene] Add text_columns property for Lucene DataMap

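Add a mandatory text_columns DMProperty for the Lucene DataMap that lists the String columns to index.
The property is validated when the DataMap factory is initialized: it must be non-blank, must not
contain empty or duplicate entries, and may refer only to existing String columns of the table.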

This closes #2019
diff --git a/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java b/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java
index 0223ae2..e7c72e8 100644
--- a/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java
+++ b/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java
@@ -163,7 +163,8 @@
   }
 
   public TableDataMap registerDataMap(AbsoluteTableIdentifier identifier,
-      DataMapSchema dataMapSchema,  DataMapFactory dataMapFactory) throws IOException {
+      DataMapSchema dataMapSchema,  DataMapFactory dataMapFactory)
+      throws IOException, MalformedDataMapCommandException {
     String table = identifier.getCarbonTableIdentifier().getTableUniqueName();
     // Just update the segmentRefreshMap with the table if not added.
     getTableSegmentRefresher(identifier);
diff --git a/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java b/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java
index 7bf04c9..ef9bb66 100644
--- a/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java
+++ b/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java
@@ -19,6 +19,7 @@
 import java.io.IOException;
 import java.util.List;
 
+import org.apache.carbondata.common.exceptions.sql.MalformedDataMapCommandException;
 import org.apache.carbondata.core.datamap.DataMapDistributable;
 import org.apache.carbondata.core.datamap.DataMapLevel;
 import org.apache.carbondata.core.datamap.DataMapMeta;
@@ -34,7 +35,8 @@
   /**
    * Initialization of Datamap factory with the identifier and datamap name
    */
-  void init(AbsoluteTableIdentifier identifier, DataMapSchema dataMapSchema) throws IOException;
+  void init(AbsoluteTableIdentifier identifier, DataMapSchema dataMapSchema)
+      throws IOException, MalformedDataMapCommandException;
 
   /**
    * Return a new write for this datamap
diff --git a/datamap/lucene/pom.xml b/datamap/lucene/pom.xml
index ee504c6..4019065 100644
--- a/datamap/lucene/pom.xml
+++ b/datamap/lucene/pom.xml
@@ -26,6 +26,11 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+      <version>3.3.2</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-core</artifactId>
       <version>${lucene.version}</version>
diff --git a/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java b/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java
index 5eb7054..3a1adab 100644
--- a/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java
+++ b/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java
@@ -23,6 +23,7 @@
 import java.util.Objects;
 
 import org.apache.carbondata.common.annotations.InterfaceAudience;
+import org.apache.carbondata.common.exceptions.sql.MalformedDataMapCommandException;
 import org.apache.carbondata.common.logging.LogService;
 import org.apache.carbondata.common.logging.LogServiceFactory;
 import org.apache.carbondata.core.datamap.DataMapDistributable;
@@ -32,14 +33,15 @@
 import org.apache.carbondata.core.datamap.dev.DataMapWriter;
 import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
 import org.apache.carbondata.core.metadata.CarbonMetadata;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
 import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
 import org.apache.carbondata.core.metadata.schema.table.DataMapSchema;
-import org.apache.carbondata.core.metadata.schema.table.TableInfo;
-import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
 import org.apache.carbondata.core.scan.filter.intf.ExpressionType;
 import org.apache.carbondata.core.util.path.CarbonTablePath;
 import org.apache.carbondata.events.Event;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 
@@ -49,6 +51,8 @@
 @InterfaceAudience.Internal
 abstract class LuceneDataMapFactoryBase<T extends DataMap> implements DataMapFactory<T> {
 
+  static final String TEXT_COLUMNS = "text_columns";
+
   /**
    * Logger
    */
@@ -76,7 +80,7 @@
 
   @Override
   public void init(AbsoluteTableIdentifier identifier, DataMapSchema dataMapSchema)
-      throws IOException {
+      throws IOException, MalformedDataMapCommandException {
     Objects.requireNonNull(identifier);
     Objects.requireNonNull(dataMapSchema);
 
@@ -97,34 +101,17 @@
       throw new IOException(errorMessage);
     }
 
-    TableInfo tableInfo = carbonTable.getTableInfo();
-    List<ColumnSchema> lstCoumnSchemas = tableInfo.getFactTable().getListOfColumns();
-
-    // currently add all columns into lucene indexer
-    // TODO:only add index columns
-    List<String> indexedColumns = new ArrayList<String>();
-    for (ColumnSchema columnSchema : lstCoumnSchemas) {
-      if (!columnSchema.isInvisible()) {
-        indexedColumns.add(columnSchema.getColumnName());
-      }
-    }
-
-    // get indexed columns
-    //    Map<String, String> properties = dataMapSchema.getProperties();
-    //    String columns = properties.get("text_column");
-    //    if (columns != null) {
-    //      String[] columnArray = columns.split(CarbonCommonConstants.COMMA, -1);
-    //      Collections.addAll(indexedColumns, columnArray);
-    //    }
+    // validate DataMapSchema and get index columns
+    List<String> indexedColumns =  validateAndGetIndexedColumns(dataMapSchema, carbonTable);
 
     // add optimizedOperations
     List<ExpressionType> optimizedOperations = new ArrayList<ExpressionType>();
-    //    optimizedOperations.add(ExpressionType.EQUALS);
-    //    optimizedOperations.add(ExpressionType.GREATERTHAN);
-    //    optimizedOperations.add(ExpressionType.GREATERTHAN_EQUALTO);
-    //    optimizedOperations.add(ExpressionType.LESSTHAN);
-    //    optimizedOperations.add(ExpressionType.LESSTHAN_EQUALTO);
-    //    optimizedOperations.add(ExpressionType.NOT);
+    // optimizedOperations.add(ExpressionType.EQUALS);
+    // optimizedOperations.add(ExpressionType.GREATERTHAN);
+    // optimizedOperations.add(ExpressionType.GREATERTHAN_EQUALTO);
+    // optimizedOperations.add(ExpressionType.LESSTHAN);
+    // optimizedOperations.add(ExpressionType.LESSTHAN_EQUALTO);
+    // optimizedOperations.add(ExpressionType.NOT);
     optimizedOperations.add(ExpressionType.TEXT_MATCH);
     this.dataMapMeta = new DataMapMeta(indexedColumns, optimizedOperations);
 
@@ -134,6 +121,52 @@
   }
 
   /**
+   * Validates the TEXT_COLUMNS property of a Lucene DataMap and resolves its columns.
+   * The property must be present and non-blank, must not contain empty or duplicate
+   * entries, and every entry must name an existing String column of the table.
+   * Entries are trimmed and lower-cased with Locale.ROOT so matching is locale-independent.
+   *
+   * @return names of the columns to index, in the order given in TEXT_COLUMNS
+   * @throws MalformedDataMapCommandException if any of the rules above is violated
+   */
+  private List<String> validateAndGetIndexedColumns(DataMapSchema dataMapSchema,
+      CarbonTable carbonTable) throws MalformedDataMapCommandException {
+    String textColumnsStr = dataMapSchema.getProperties().get(TEXT_COLUMNS);
+    if (StringUtils.isBlank(textColumnsStr)) {
+      throw new MalformedDataMapCommandException(
+          "Lucene DataMap require proper TEXT_COLUMNS property.");
+    }
+    String[] textColumns = textColumnsStr.split(",", -1);
+    for (int i = 0; i < textColumns.length; i++) {
+      textColumns[i] = textColumns[i].trim().toLowerCase(java.util.Locale.ROOT);
+    }
+    for (int i = 0; i < textColumns.length; i++) {
+      if (textColumns[i].isEmpty()) {
+        throw new MalformedDataMapCommandException("TEXT_COLUMNS contains illegal argument.");
+      }
+      for (int j = i + 1; j < textColumns.length; j++) {
+        if (textColumns[i].equals(textColumns[j])) {
+          throw new MalformedDataMapCommandException(
+              "TEXT_COLUMNS has duplicate columns :" + textColumns[i]);
+        }
+      }
+    }
+    List<String> textColumnList = new ArrayList<String>(textColumns.length);
+    for (String textColumn : textColumns) {
+      CarbonColumn column = carbonTable.getColumnByName(carbonTable.getTableName(), textColumn);
+      if (null == column) {
+        throw new MalformedDataMapCommandException("TEXT_COLUMNS: " + textColumn
+            + " does not exist in table. Please check create DataMap statement.");
+      } else if (column.getDataType() != DataTypes.STRING) {
+        throw new MalformedDataMapCommandException("TEXT_COLUMNS only supports String column. "
+            + "Unsupported column: " + textColumn + ", DataType: " + column.getDataType());
+      }
+      textColumnList.add(column.getColName());
+    }
+    return textColumnList;
+  }
+
+  /**
    * Return a new write for this datamap
    */
   public DataMapWriter createWriter(String segmentId, String writeDirectoryPath) {
diff --git a/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala b/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala
index 5e28e8a..bfcfa67 100644
--- a/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala
+++ b/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala
@@ -24,7 +24,7 @@
 import org.apache.spark.sql.test.util.QueryTest
 import org.scalatest.BeforeAndAfterAll
 
-import org.apache.carbondata.core.metadata.CarbonMetadata
+import org.apache.carbondata.common.exceptions.sql.MalformedDataMapCommandException
 
 class LuceneFineGrainDataMapSuite extends QueryTest with BeforeAndAfterAll {
 
@@ -42,9 +42,7 @@
         | TBLPROPERTIES('SORT_COLUMNS'='city,name', 'SORT_SCOPE'='LOCAL_SORT')
       """.stripMargin)
     sql(s"LOAD DATA LOCAL INPATH '$file2' INTO TABLE normal_test OPTIONS('header'='false')")
-  }
 
-  test("test lucene fine grain data map") {
     sql("DROP TABLE IF EXISTS datamap_test")
     sql(
       """
@@ -52,11 +50,65 @@
         | STORED BY 'carbondata'
         | TBLPROPERTIES('SORT_COLUMNS'='city,name', 'SORT_SCOPE'='LOCAL_SORT')
       """.stripMargin)
+  }
 
+  test("validate TEXT_COLUMNS DataMap property") {
+    // TEXT_COLUMNS is mandatory: creating the DataMap without DMProperties must fail
+    var exception = intercept[MalformedDataMapCommandException](sql(
+      s"""
+         | CREATE DATAMAP dm1 ON TABLE datamap_test
+         | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+      """.stripMargin))
+
+    assertResult("Lucene DataMap require proper TEXT_COLUMNS property.")(exception.getMessage)
+
+    // an empty entry (here the blank after the trailing comma) is rejected
+    exception = intercept[MalformedDataMapCommandException](sql(
+      s"""
+         | CREATE DATAMAP dm1 ON TABLE datamap_test
+         | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+         | DMProperties('text_COLUMNS'='name, ')
+      """.stripMargin))
+
+    assertResult("TEXT_COLUMNS contains illegal argument.")(exception.getMessage)
+
+    // every listed column must exist in the table ("school" is expected to be missing)
+    exception = intercept[MalformedDataMapCommandException](sql(
+      s"""
+         | CREATE DATAMAP dm1 ON TABLE datamap_test
+         | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+         | DMProperties('text_COLUMNS'='city,school')
+    """.stripMargin))
+
+    assertResult("TEXT_COLUMNS: school does not exist in table. Please check create DataMap statement.")(exception.getMessage)
+
+    // the same column must not be listed twice
+    exception = intercept[MalformedDataMapCommandException](sql(
+      s"""
+         | CREATE DATAMAP dm1 ON TABLE datamap_test
+         | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+         | DMProperties('text_COLUMNS'='name,city,name')
+      """.stripMargin))
+
+    assertResult("TEXT_COLUMNS has duplicate columns :name")(exception.getMessage)
+
+    // only String columns are accepted; the assertion expects "id" to be reported as INT
+    exception = intercept[MalformedDataMapCommandException](sql(
+    s"""
+         | CREATE DATAMAP dm1 ON TABLE datamap_test
+         | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+         | DMProperties('text_COLUMNS'='city,id')
+      """.stripMargin))
+
+    assertResult("TEXT_COLUMNS only supports String column. Unsupported column: id, DataType: INT")(exception.getMessage)
+  }
+
+  test("test lucene fine grain data map") {
     sql(
       s"""
          | CREATE DATAMAP dm ON TABLE datamap_test
          | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+         | DMProperties('TEXT_COLUMNS'='Name , cIty')
       """.stripMargin)
 
     sql(s"LOAD DATA LOCAL INPATH '$file2' INTO TABLE datamap_test OPTIONS('header'='false')")