[CARBONDATA-2172][Lucene] Add text_columns property for Lucene DataMap
Add text_columns property for Lucene DataMap
This closes #2019
diff --git a/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java b/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java
index 0223ae2..e7c72e8 100644
--- a/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java
+++ b/core/src/main/java/org/apache/carbondata/core/datamap/DataMapStoreManager.java
@@ -163,7 +163,8 @@
}
public TableDataMap registerDataMap(AbsoluteTableIdentifier identifier,
- DataMapSchema dataMapSchema, DataMapFactory dataMapFactory) throws IOException {
+ DataMapSchema dataMapSchema, DataMapFactory dataMapFactory)
+ throws IOException, MalformedDataMapCommandException {
String table = identifier.getCarbonTableIdentifier().getTableUniqueName();
// Just update the segmentRefreshMap with the table if not added.
getTableSegmentRefresher(identifier);
diff --git a/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java b/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java
index 7bf04c9..ef9bb66 100644
--- a/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java
+++ b/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapFactory.java
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.util.List;
+import org.apache.carbondata.common.exceptions.sql.MalformedDataMapCommandException;
import org.apache.carbondata.core.datamap.DataMapDistributable;
import org.apache.carbondata.core.datamap.DataMapLevel;
import org.apache.carbondata.core.datamap.DataMapMeta;
@@ -34,7 +35,8 @@
/**
* Initialization of Datamap factory with the identifier and datamap name
*/
- void init(AbsoluteTableIdentifier identifier, DataMapSchema dataMapSchema) throws IOException;
+ void init(AbsoluteTableIdentifier identifier, DataMapSchema dataMapSchema)
+ throws IOException, MalformedDataMapCommandException;
/**
* Return a new write for this datamap
diff --git a/datamap/lucene/pom.xml b/datamap/lucene/pom.xml
index ee504c6..4019065 100644
--- a/datamap/lucene/pom.xml
+++ b/datamap/lucene/pom.xml
@@ -26,6 +26,11 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <version>3.3.2</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
diff --git a/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java b/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java
index 5eb7054..3a1adab 100644
--- a/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java
+++ b/datamap/lucene/src/main/java/org/apache/carbondata/datamap/lucene/LuceneDataMapFactoryBase.java
@@ -23,6 +23,7 @@
import java.util.Objects;
import org.apache.carbondata.common.annotations.InterfaceAudience;
+import org.apache.carbondata.common.exceptions.sql.MalformedDataMapCommandException;
import org.apache.carbondata.common.logging.LogService;
import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.datamap.DataMapDistributable;
@@ -32,14 +33,15 @@
import org.apache.carbondata.core.datamap.dev.DataMapWriter;
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.core.metadata.CarbonMetadata;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.DataMapSchema;
-import org.apache.carbondata.core.metadata.schema.table.TableInfo;
-import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.scan.filter.intf.ExpressionType;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.events.Event;
+import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -49,6 +51,8 @@
@InterfaceAudience.Internal
abstract class LuceneDataMapFactoryBase<T extends DataMap> implements DataMapFactory<T> {
+ static final String TEXT_COLUMNS = "text_columns";
+
/**
* Logger
*/
@@ -76,7 +80,7 @@
@Override
public void init(AbsoluteTableIdentifier identifier, DataMapSchema dataMapSchema)
- throws IOException {
+ throws IOException, MalformedDataMapCommandException {
Objects.requireNonNull(identifier);
Objects.requireNonNull(dataMapSchema);
@@ -97,34 +101,17 @@
throw new IOException(errorMessage);
}
- TableInfo tableInfo = carbonTable.getTableInfo();
- List<ColumnSchema> lstCoumnSchemas = tableInfo.getFactTable().getListOfColumns();
-
- // currently add all columns into lucene indexer
- // TODO:only add index columns
- List<String> indexedColumns = new ArrayList<String>();
- for (ColumnSchema columnSchema : lstCoumnSchemas) {
- if (!columnSchema.isInvisible()) {
- indexedColumns.add(columnSchema.getColumnName());
- }
- }
-
- // get indexed columns
- // Map<String, String> properties = dataMapSchema.getProperties();
- // String columns = properties.get("text_column");
- // if (columns != null) {
- // String[] columnArray = columns.split(CarbonCommonConstants.COMMA, -1);
- // Collections.addAll(indexedColumns, columnArray);
- // }
+ // validate DataMapSchema and get index columns
+ List<String> indexedColumns = validateAndGetIndexedColumns(dataMapSchema, carbonTable);
// add optimizedOperations
List<ExpressionType> optimizedOperations = new ArrayList<ExpressionType>();
- // optimizedOperations.add(ExpressionType.EQUALS);
- // optimizedOperations.add(ExpressionType.GREATERTHAN);
- // optimizedOperations.add(ExpressionType.GREATERTHAN_EQUALTO);
- // optimizedOperations.add(ExpressionType.LESSTHAN);
- // optimizedOperations.add(ExpressionType.LESSTHAN_EQUALTO);
- // optimizedOperations.add(ExpressionType.NOT);
+ // optimizedOperations.add(ExpressionType.EQUALS);
+ // optimizedOperations.add(ExpressionType.GREATERTHAN);
+ // optimizedOperations.add(ExpressionType.GREATERTHAN_EQUALTO);
+ // optimizedOperations.add(ExpressionType.LESSTHAN);
+ // optimizedOperations.add(ExpressionType.LESSTHAN_EQUALTO);
+ // optimizedOperations.add(ExpressionType.NOT);
optimizedOperations.add(ExpressionType.TEXT_MATCH);
this.dataMapMeta = new DataMapMeta(indexedColumns, optimizedOperations);
@@ -134,6 +121,52 @@
}
/**
+ * Validates the Lucene DataMap definition:
+ * 1. the TEXT_COLUMNS property is required
+ * 2. TEXT_COLUMNS can't contain an illegal (empty or blank) column name
+ * 3. TEXT_COLUMNS can't contain duplicate columns
+ * 4. every column in TEXT_COLUMNS must exist in the table
+ * 5. TEXT_COLUMNS supports only String DataType columns
+ */
+ private List<String> validateAndGetIndexedColumns(DataMapSchema dataMapSchema,
+     CarbonTable carbonTable) throws MalformedDataMapCommandException {
+   // Rule 1: the property must be present and non-blank.
+   // StringUtils.isBlank is null-safe, so a missing property is rejected here as well.
+   String textColumnsStr = dataMapSchema.getProperties().get(TEXT_COLUMNS);
+   if (StringUtils.isBlank(textColumnsStr)) {
+     throw new MalformedDataMapCommandException(
+         "Lucene DataMap require proper TEXT_COLUMNS property.");
+   }
+
+   // Split with limit -1 so trailing empty entries (e.g. "name, ") are kept and can be rejected.
+   String[] textColumns = textColumnsStr.split(",", -1);
+   for (int i = 0; i < textColumns.length; i++) {
+     // Normalize with Locale.ROOT: the default-locale toLowerCase() would map "I" to a
+     // dotless "ı" under e.g. a Turkish locale and make a valid column name unresolvable.
+     textColumns[i] = textColumns[i].trim().toLowerCase(java.util.Locale.ROOT);
+   }
+
+   // Rules 2 and 3: no empty entries and no duplicates. These are checked for the whole
+   // list before any table lookup, so malformed input is reported ahead of unknown columns.
+   for (int i = 0; i < textColumns.length; i++) {
+     if (textColumns[i].isEmpty()) {
+       throw new MalformedDataMapCommandException("TEXT_COLUMNS contains illegal argument.");
+     }
+     for (int j = i + 1; j < textColumns.length; j++) {
+       if (textColumns[i].equals(textColumns[j])) {
+         throw new MalformedDataMapCommandException(
+             "TEXT_COLUMNS has duplicate columns :" + textColumns[i]);
+       }
+     }
+   }
+
+   // Rules 4 and 5: every column must exist in the table and be of String type.
+   List<String> textColumnList = new ArrayList<String>(textColumns.length);
+   for (String textColumn : textColumns) {
+     CarbonColumn column = carbonTable.getColumnByName(carbonTable.getTableName(), textColumn);
+     if (null == column) {
+       throw new MalformedDataMapCommandException("TEXT_COLUMNS: " + textColumn
+           + " does not exist in table. Please check create DataMap statement.");
+     } else if (column.getDataType() != DataTypes.STRING) {
+       throw new MalformedDataMapCommandException(
+           "TEXT_COLUMNS only supports String column. " + "Unsupported column: " + textColumn
+               + ", DataType: " + column.getDataType());
+     }
+     // Return the name reported by the resolved column rather than the user's spelling.
+     textColumnList.add(column.getColName());
+   }
+   return textColumnList;
+ }
+
+ /**
* Return a new write for this datamap
*/
public DataMapWriter createWriter(String segmentId, String writeDirectoryPath) {
diff --git a/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala b/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala
index 5e28e8a..bfcfa67 100644
--- a/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala
+++ b/datamap/lucene/src/test/scala/org/apache/carbondata/datamap/lucene/LuceneFineGrainDataMapSuite.scala
@@ -24,7 +24,7 @@
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll
-import org.apache.carbondata.core.metadata.CarbonMetadata
+import org.apache.carbondata.common.exceptions.sql.MalformedDataMapCommandException
class LuceneFineGrainDataMapSuite extends QueryTest with BeforeAndAfterAll {
@@ -42,9 +42,7 @@
| TBLPROPERTIES('SORT_COLUMNS'='city,name', 'SORT_SCOPE'='LOCAL_SORT')
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$file2' INTO TABLE normal_test OPTIONS('header'='false')")
- }
- test("test lucene fine grain data map") {
sql("DROP TABLE IF EXISTS datamap_test")
sql(
"""
@@ -52,11 +50,65 @@
| STORED BY 'carbondata'
| TBLPROPERTIES('SORT_COLUMNS'='city,name', 'SORT_SCOPE'='LOCAL_SORT')
""".stripMargin)
+ }
+ test("validate TEXT_COLUMNS DataMap property") {
+ // require TEXT_COLUMNS
+ var exception = intercept[MalformedDataMapCommandException](sql(
+ s"""
+ | CREATE DATAMAP dm1 ON TABLE datamap_test
+ | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+ """.stripMargin))
+
+ assertResult("Lucene DataMap require proper TEXT_COLUMNS property.")(exception.getMessage)
+
+ // illegal argument.
+ exception = intercept[MalformedDataMapCommandException](sql(
+ s"""
+ | CREATE DATAMAP dm1 ON TABLE datamap_test
+ | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+ | DMProperties('text_COLUMNS'='name, ')
+ """.stripMargin))
+
+ assertResult("TEXT_COLUMNS contains illegal argument.")(exception.getMessage)
+
+ // not exists
+ exception = intercept[MalformedDataMapCommandException](sql(
+ s"""
+ | CREATE DATAMAP dm1 ON TABLE datamap_test
+ | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+ | DMProperties('text_COLUMNS'='city,school')
+ """.stripMargin))
+
+ assertResult("TEXT_COLUMNS: school does not exist in table. Please check create DataMap statement.")(exception.getMessage)
+
+ // duplicate columns
+ exception = intercept[MalformedDataMapCommandException](sql(
+ s"""
+ | CREATE DATAMAP dm1 ON TABLE datamap_test
+ | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+ | DMProperties('text_COLUMNS'='name,city,name')
+ """.stripMargin))
+
+ assertResult("TEXT_COLUMNS has duplicate columns :name")(exception.getMessage)
+
+ // only support String DataType
+ exception = intercept[MalformedDataMapCommandException](sql(
+ s"""
+ | CREATE DATAMAP dm1 ON TABLE datamap_test
+ | USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+ | DMProperties('text_COLUMNS'='city,id')
+ """.stripMargin))
+
+ assertResult("TEXT_COLUMNS only supports String column. Unsupported column: id, DataType: INT")(exception.getMessage)
+ }
+
+ test("test lucene fine grain data map") {
sql(
s"""
| CREATE DATAMAP dm ON TABLE datamap_test
| USING 'org.apache.carbondata.datamap.lucene.LuceneFineGrainDataMapFactory'
+ | DMProperties('TEXT_COLUMNS'='Name , cIty')
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$file2' INTO TABLE datamap_test OPTIONS('header'='false')")