/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.pinot.queries;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.ControlledRealTimeReopenThread;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.pinot.common.response.broker.ResultTable;
import org.apache.pinot.common.utils.DataSchema;
import org.apache.pinot.common.utils.DataSchema.ColumnDataType;
import org.apache.pinot.core.operator.BaseOperator;
import org.apache.pinot.core.operator.blocks.results.AggregationResultsBlock;
import org.apache.pinot.core.operator.query.SelectionOnlyOperator;
import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndex;
import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader;
import org.apache.pinot.segment.spi.ImmutableSegment;
import org.apache.pinot.segment.spi.IndexSegment;
import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.TableConfig;
import org.apache.pinot.spi.config.table.TableType;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.spi.data.readers.GenericRow;
import org.apache.pinot.spi.data.readers.RecordReader;
import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertNotEquals;
import static org.testng.Assert.assertNotNull;
import static org.testng.Assert.assertTrue;

/**
* Functional tests for text search feature.
 * The tests use two kinds of input data:
 * (1) Skills file
 * (2) Query log file
 * The test table has several SKILLS text columns and a QUERY_LOG text column. A text index is created
* on each of these columns.
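 * A typical query shape exercised by these tests:
 * <pre>
 *   SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable
 *   WHERE TEXT_MATCH(SKILLS_TEXT_COL, '"distributed systems" AND (Java C++)') LIMIT 50000
 * </pre>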
*/
public class TextSearchQueriesTest extends BaseQueriesTest {
private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), "TextSearchQueriesTest");
private static final String TABLE_NAME = "MyTable";
private static final String SEGMENT_NAME = "testSegment";
private static final String QUERY_LOG_TEXT_COL_NAME = "QUERY_LOG_TEXT_COL";
private static final String SKILLS_TEXT_COL_NAME = "SKILLS_TEXT_COL";
private static final String SKILLS_TEXT_COL_DICT_NAME = "SKILLS_TEXT_COL_DICT";
private static final String SKILLS_TEXT_COL_MULTI_TERM_NAME = "SKILLS_TEXT_COL_1";
private static final String SKILLS_TEXT_NO_RAW_NAME = "SKILLS_TEXT_COL_2";
private static final String SKILLS_TEXT_MV_COL_NAME = "SKILLS_TEXT_MV_COL";
private static final String SKILLS_TEXT_MV_COL_DICT_NAME = "SKILLS_TEXT_MV_COL_DICT";
private static final String INT_COL_NAME = "INT_COL";
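  // text index columns stored with raw (no-dictionary) encoding; see setNoDictionaryColumns() in buildSegment()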
private static final List<String> RAW_TEXT_INDEX_COLUMNS =
Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_MULTI_TERM_NAME,
SKILLS_TEXT_NO_RAW_NAME, SKILLS_TEXT_MV_COL_NAME);
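  // text index columns stored with dictionary encoding; these also get a native inverted index (see setUp())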
private static final List<String> DICT_TEXT_INDEX_COLUMNS =
Arrays.asList(SKILLS_TEXT_COL_DICT_NAME, SKILLS_TEXT_MV_COL_DICT_NAME);
private static final int INT_BASE_VALUE = 1000;
private IndexSegment _indexSegment;
  private List<IndexSegment> _indexSegments;

@Override
protected String getFilter() {
return "";
  }

@Override
protected IndexSegment getIndexSegment() {
return _indexSegment;
  }

@Override
protected List<IndexSegment> getIndexSegments() {
return _indexSegments;
  }

@BeforeClass
public void setUp()
throws Exception {
FileUtils.deleteQuietly(INDEX_DIR);
buildSegment();
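    // load the freshly built segment with text indexes on all text columns and
    // native inverted indexes on the dictionary-encoded ones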
IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig();
Set<String> textIndexColumns = new HashSet<>();
textIndexColumns.addAll(RAW_TEXT_INDEX_COLUMNS);
textIndexColumns.addAll(DICT_TEXT_INDEX_COLUMNS);
indexLoadingConfig.setTextIndexColumns(textIndexColumns);
indexLoadingConfig.setInvertedIndexColumns(new HashSet<>(DICT_TEXT_INDEX_COLUMNS));
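    // configure SKILLS_TEXT_COL_1 to use AND (instead of the default OR) as the implicit
    // conjunction between the terms of a multi-term TEXT_MATCH query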
Map<String, Map<String, String>> columnProperties = new HashMap<>();
Map<String, String> props = new HashMap<>();
props.put(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES, "true");
columnProperties.put(SKILLS_TEXT_COL_MULTI_TERM_NAME, props);
indexLoadingConfig.setColumnProperties(columnProperties);
ImmutableSegment immutableSegment =
ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), indexLoadingConfig);
_indexSegment = immutableSegment;
_indexSegments = Arrays.asList(immutableSegment, immutableSegment);
  }

@AfterClass
public void tearDown() {
_indexSegment.destroy();
FileUtils.deleteQuietly(INDEX_DIR);
  }

private void buildSegment()
throws Exception {
List<GenericRow> rows = createTestData();
List<FieldConfig> fieldConfigs = new ArrayList<>(RAW_TEXT_INDEX_COLUMNS.size() + DICT_TEXT_INDEX_COLUMNS.size());
for (String textIndexColumn : RAW_TEXT_INDEX_COLUMNS) {
fieldConfigs.add(
new FieldConfig(textIndexColumn, FieldConfig.EncodingType.RAW, FieldConfig.IndexType.TEXT, null, null));
}
for (String textIndexColumn : DICT_TEXT_INDEX_COLUMNS) {
fieldConfigs.add(
new FieldConfig(textIndexColumn, FieldConfig.EncodingType.DICTIONARY, FieldConfig.IndexType.TEXT, null,
null));
}
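    // the table config mirrors the loading config: raw text columns are no-dictionary,
    // dictionary-encoded text columns also carry an inverted index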
TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(TABLE_NAME)
.setNoDictionaryColumns(RAW_TEXT_INDEX_COLUMNS).setInvertedIndexColumns(DICT_TEXT_INDEX_COLUMNS)
.setFieldConfigList(fieldConfigs).build();
Schema schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME)
.addSingleValueDimension(QUERY_LOG_TEXT_COL_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_DICT_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_MULTI_TERM_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_NO_RAW_NAME, FieldSpec.DataType.STRING)
.addMultiValueDimension(SKILLS_TEXT_MV_COL_NAME, FieldSpec.DataType.STRING)
.addMultiValueDimension(SKILLS_TEXT_MV_COL_DICT_NAME, FieldSpec.DataType.STRING)
.addMetric(INT_COL_NAME, FieldSpec.DataType.INT).build();
SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema);
config.setOutDir(INDEX_DIR.getPath());
config.setTableName(TABLE_NAME);
config.setSegmentName(SEGMENT_NAME);
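    // SKILLS_TEXT_COL_2 does not store the raw document; selections on it return the
    // substitute value "ILoveCoding" instead (verified in testTextSearch())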
Map<String, Map<String, String>> columnProperties = new HashMap<>();
Map<String, String> props = new HashMap<>();
props.put(FieldConfig.TEXT_INDEX_NO_RAW_DATA, "true");
props.put(FieldConfig.TEXT_INDEX_RAW_VALUE, "ILoveCoding");
columnProperties.put(SKILLS_TEXT_NO_RAW_NAME, props);
config.setColumnProperties(columnProperties);
SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
try (RecordReader recordReader = new GenericRowRecordReader(rows)) {
driver.init(config, recordReader);
driver.build();
}
  }

private List<GenericRow> createTestData()
throws Exception {
List<GenericRow> rows = new ArrayList<>();
// read the skills file
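    // the skills file is expected to contain exactly 24 lines (asserted below)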
String[] skills = new String[24];
List<String[]> multiValueStringList = new ArrayList<>();
int skillCount = 0;
try (BufferedReader reader = new BufferedReader(new InputStreamReader(
Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("data/text_search_data/skills.txt"))))) {
String line;
while ((line = reader.readLine()) != null) {
skills[skillCount++] = line;
multiValueStringList.add(StringUtils.splitByWholeSeparator(line, ", "));
}
}
assertEquals(skillCount, 24);
// read the query log file (24k queries) and build dataset
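    // each line of the query log becomes one row; the first 24 rows reuse the skill strings
    // read above, the remaining rows get a fixed filler value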
int counter = 0;
try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(
getClass().getClassLoader().getResourceAsStream("data/text_search_data/queries.txt"))))) {
String line;
while ((line = reader.readLine()) != null) {
GenericRow row = new GenericRow();
row.putValue(INT_COL_NAME, INT_BASE_VALUE + counter);
row.putValue(QUERY_LOG_TEXT_COL_NAME, line);
if (counter >= skillCount) {
row.putValue(SKILLS_TEXT_COL_NAME, "software engineering");
row.putValue(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software engineering");
row.putValue(SKILLS_TEXT_NO_RAW_NAME, "software engineering");
row.putValue(SKILLS_TEXT_MV_COL_NAME, new String[]{"software", "engineering"});
row.putValue(SKILLS_TEXT_MV_COL_DICT_NAME, new String[]{"software", "engineering"});
} else {
row.putValue(SKILLS_TEXT_COL_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_NO_RAW_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_MV_COL_NAME, multiValueStringList.get(counter));
row.putValue(SKILLS_TEXT_MV_COL_DICT_NAME, multiValueStringList.get(counter));
}
rows.add(row);
counter++;
}
}
assertEquals(counter, 24150);
return rows;
  }

/**
* Tests for phrase, term, regex, composite (using AND/OR) text search queries.
* Both selection and aggregation queries are used.
*/
@Test
public void testTextSearch()
throws Exception {
// TEST 1: phrase query
    // Search in QUERY_LOG_TEXT_COL to look for documents where each document MUST contain the phrase
    // "SELECT dimensionCol2" as is. In other words, we are trying to find all "SELECT dimensionCol2" style queries.
    // The expected result size of 11787 comes from manually doing a grep on the test file.
String query =
"SELECT INT_COL, QUERY_LOG_TEXT_COL FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"SELECT "
+ "dimensionCol2\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, 11787, false, null);
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"SELECT dimensionCol2\"')";
testTextSearchAggregationQueryHelper(query, 11787);
// TEST 2: phrase query
    // Search in QUERY_LOG_TEXT_COL to look for documents where each document MUST contain the phrase "SELECT count"
    // as is. In other words, we are trying to find all "SELECT count" style queries from the log. The expected
    // result size of 12363 comes from manually doing a grep on the test file.
query =
"SELECT INT_COL, QUERY_LOG_TEXT_COL FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"SELECT count\"') "
+ "LIMIT 50000";
testTextSearchSelectQueryHelper(query, 12363, false, null);
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"SELECT count\"')";
testTextSearchAggregationQueryHelper(query, 12363);
// TEST 3: phrase query
// Search in QUERY_LOG_TEXT_COL to look for documents where each document MUST contain phrase "GROUP BY" as is.
    // In other words, we are trying to find all GROUP BY queries from the log. The actual result set is also
    // compared to the output of grep since it is small.
query =
"SELECT INT_COL, QUERY_LOG_TEXT_COL FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"GROUP BY\"') LIMIT "
+ "50000";
testTextSearchSelectQueryHelper(query, 26, true, null);
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"GROUP BY\"')";
testTextSearchAggregationQueryHelper(query, 26);
    // TEST 4: phrase query
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase
    // "distributed systems" as is. The expected result table is built by doing
    // grep -n -i 'distributed systems' skills.txt
List<Object[]> expected = new ArrayList<>();
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1009,
"Distributed systems, database development, columnar query engine, database kernel, storage, indexing and "
+ "transaction processing, building large scale systems"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
testSkillsColumn("\"Distributed systems\"", expected);
// TEST 5: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase
// "query processing" as is. The expected result table is built by doing grep -n -i 'query processing' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1014,
"Apache spark, Java, C++, query processing, transaction processing, distributed storage, concurrency, "
+ "multi-threading, apache airflow"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
testSkillsColumn("\"query processing\"", expected);
    // TEST 6: phrase query
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase
    // "machine learning" as is. The expected result table is built by doing grep -n -i 'machine learning' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1003, "Java, C++, worked on open source projects, coursera machine learning"});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1006,
"Java, Python, C++, Machine learning, building and deploying large scale production systems, concurrency, "
+ "multi-threading, CPU processing"
});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1011,
"CUDA, GPU, Python, Machine learning, database kernel, storage, indexing and transaction processing, building"
+ " large scale systems"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
expected.add(new Object[]{
1019,
"C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, "
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
testSkillsColumn("\"Machine learning\"", expected);
// TEST 7: composite phrase query using boolean operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain two independent phrases
// "machine learning" and "tensor flow" as is. The expected result table is built by doing
// grep -n -i -E 'machine learning.*tensor flow|tensor flow.*machine learning' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
testSkillsColumn("\"Machine learning\" AND \"Tensor flow\"", expected);
// TEST 8: term query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain term 'Java'.
// The expected result table is built by doing grep -n -i 'Java' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1000, "Accounts, Banking, Insurance, worked in NGO, Java"});
expected.add(new Object[]{1003, "Java, C++, worked on open source projects, coursera machine learning"});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1006,
"Java, Python, C++, Machine learning, building and deploying large scale production systems, concurrency, "
+ "multi-threading, CPU processing"
});
expected.add(new Object[]{
1008,
"Amazon EC2, AWS, hadoop, big data, spark, building high performance scalable systems, building and deploying"
+ " large scale production systems, concurrency, multi-threading, Java, C++, CPU processing"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1014,
"Apache spark, Java, C++, query processing, transaction processing, distributed storage, concurrency, "
+ "multi-threading, apache airflow"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1018,
"Realtime stream processing, publish subscribe, columnar processing for data warehouses, concurrency, Java, "
+ "multi-threading, C++,"
});
expected.add(new Object[]{
1019,
"C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, "
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
testSkillsColumn("Java", expected);
// TEST 9: composite term query using BOOLEAN operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain two independent
// terms 'Java' and 'C++'. The expected result table is built by doing
// grep -E -n -i 'c\+\+.*java|java.*c\+\+' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1003, "Java, C++, worked on open source projects, coursera machine learning"});
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1006,
"Java, Python, C++, Machine learning, building and deploying large scale production systems, concurrency, "
+ "multi-threading, CPU processing"
});
expected.add(new Object[]{
1008,
"Amazon EC2, AWS, hadoop, big data, spark, building high performance scalable systems, building and deploying"
+ " large scale production systems, concurrency, multi-threading, Java, C++, CPU processing"
});
expected.add(new Object[]{
1014,
"Apache spark, Java, C++, query processing, transaction processing, distributed storage, concurrency, "
+ "multi-threading, apache airflow"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1018,
"Realtime stream processing, publish subscribe, columnar processing for data warehouses, concurrency, Java, "
+ "multi-threading, C++,"
});
expected.add(new Object[]{
1019,
"C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, "
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
testSkillsColumn("Java AND C++", expected);
    // TEST 10: phrase query
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase "Java C++"
    // as is. Notice the difference in results from the previous term query. In cases like this, a phrase query does
    // not help much since we are relying on how the user happened to list the skills in the text document. A user
    // that has both Java and C++, where the latter does not immediately follow the former, will be missed by this
    // phrase query. This is where a term query using a boolean operator is more appropriate, as shown in the
    // previous test, since all we care about is that users have both skills; whether Java is written before C++ or
    // vice versa doesn't matter. Hence term queries are very useful for such cases. The expected result table is
    // built by doing grep -n -i 'Java, C++' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1003, "Java, C++, worked on open source projects, coursera machine learning"});
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1008,
"Amazon EC2, AWS, hadoop, big data, spark, building high performance scalable systems, building and deploying"
+ " large scale production systems, concurrency, multi-threading, Java, C++, CPU processing"
});
expected.add(new Object[]{
1014,
"Apache spark, Java, C++, query processing, transaction processing, distributed storage, concurrency, "
+ "multi-threading, apache airflow"
});
testSkillsColumn("\"Java C++\"", expected);
// TEST 11: composite phrase query using boolean operator AND.
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain two independent phrases
// "machine learning" and "gpu processing" as is. The expected result table is built by doing
// grep -n -i -E 'machine learning.*gpu processing|gpu processing.*machine learning' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
testSkillsColumn("\"Machine learning\" AND \"gpu processing\"", expected);
// TEST 12: composite phrase and term query using boolean operator AND.
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "machine learning"
// as is and term 'gpu'. Note the difference in result with previous query where 'gpu' was used in the context of
// phrase "gpu processing" but that resulted in missing out on one row. The expected result table is built by doing
// grep -n -i -E 'machine learning.*gpu|gpu.*machine learning' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1011,
"CUDA, GPU, Python, Machine learning, database kernel, storage, indexing and transaction processing, building"
+ " large scale systems"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
testSkillsColumn("\"Machine learning\" AND gpu", expected);
// TEST 13: composite phrase and term query using boolean operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "machine learning"
// as is and terms 'gpu' and 'python'. This example shows the usefulness of combining phrases and terms
// in a single WHERE clause to build a strong search query. The expected result table is built by doing
// grep -n -i -E 'machine learning.*gpu.*python|gpu.*machine learning.*python|gpu.*python.*machine learning'
// skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1011,
"CUDA, GPU, Python, Machine learning, database kernel, storage, indexing and transaction processing, building"
+ " large scale systems"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
testSkillsColumn("\"Machine learning\" AND gpu AND python", expected);
// TEST 14: term query
// Search in SKILLS_TEXT_COL column to look for documents that MUST contain term 'apache'. The expected result
// table is built by doing grep -n -i 'apache' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1013,
"Kubernetes, cluster management, operating systems, concurrency, multi-threading, apache airflow, Apache Spark,"
});
expected.add(new Object[]{
1014,
"Apache spark, Java, C++, query processing, transaction processing, distributed storage, concurrency, "
+ "multi-threading, apache airflow"
});
expected.add(new Object[]{
1015,
"Big data stream processing, Apache Flink, Apache Beam, database kernel, distributed query engines for "
+ "analytics and data warehouses"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1019,
"C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, "
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
testSkillsColumn("apache", expected);
    // TEST 15: composite phrase and term query using boolean operator AND.
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase
    // "distributed systems" as is and the term 'apache'. The expected result table was built by doing
    // grep -n -i -E 'distributed systems.*apache|apache.*distributed systems' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
testSkillsColumn("\"distributed systems\" AND apache", expected);
// TEST 16: term query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain term 'database'.
// The expected result table is built by doing grep -n -i 'database' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1009,
"Distributed systems, database development, columnar query engine, database kernel, storage, indexing and "
+ "transaction processing, building large scale systems"
});
expected.add(new Object[]{
1011,
"CUDA, GPU, Python, Machine learning, database kernel, storage, indexing and transaction processing, building"
+ " large scale systems"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1015,
"Big data stream processing, Apache Flink, Apache Beam, database kernel, distributed query engines for "
+ "analytics and data warehouses"
});
expected.add(new Object[]{
1021,
"Database engine, OLAP systems, OLTP transaction processing at large scale, concurrency, multi-threading, GO,"
+ " building large scale systems"
});
testSkillsColumn("database", expected);
// TEST 17: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "database engine"
// as is. The expected result table is built by doing grep -n -i 'database engine' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1021,
"Database engine, OLAP systems, OLTP transaction processing at large scale, concurrency, multi-threading, GO,"
+ " building large scale systems"
});
testSkillsColumn("\"database engine\"", expected);
    // TEST 18: phrase query
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase
    // "publish subscribe" as is. This is where we see the power of Lucene's built-in text parser and tokenizer.
    // Ideally only the second row should be in the output, but the parser is intelligent enough to break
    // "publish-subscribe" in the original text (even though it is not separated by a whitespace delimiter) into two
    // separately indexable terms. The expected result table is built by doing grep -n -i 'publish-subscribe'
    // skills.txt and grep -n -i 'publish subscribe' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1018,
"Realtime stream processing, publish subscribe, columnar processing for data warehouses, concurrency, Java, "
+ "multi-threading, C++,"
});
testSkillsColumn("\"publish subscribe\"", expected);
// TEST 19: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase
// "accounts banking insurance" as is. The expected result table is built by doing
// grep -n -i 'accounts, banking, insurance' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1000, "Accounts, Banking, Insurance, worked in NGO, Java"});
testSkillsColumn("\"accounts banking insurance\"", expected);
    // TEST 20: composite term query with boolean operator AND
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the terms 'accounts',
    // 'banking' and 'insurance'. This is another case where a phrase query will not help since the user may have
    // specified the skills in any arbitrary order, so a boolean term query is the better fit. The expected result
    // table was built by doing grep -n -i 'accounts.*banking.*insurance' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1000, "Accounts, Banking, Insurance, worked in NGO, Java"});
expected.add(new Object[]{1001, "Accounts, Banking, Finance, Insurance"});
expected.add(new Object[]{1002, "Accounts, Finance, Banking, Insurance"});
testSkillsColumn("accounts AND banking AND insurance", expected);
// TEST 21: composite phrase and term query using boolean operator AND.
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain ALL the following skills:
// phrase "distributed systems" as is, term 'Java', term 'C++'. The expected result table was built by doing
    // grep -n -i -E 'distributed systems.*java.*c\+\+|java.*distributed systems.*c\+\+|
    //   distributed systems.*c\+\+.*java' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
testSkillsColumn("\"distributed systems\" AND Java AND C++", expected);
// test for the index configured to use AND as the default
// conjunction operator
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" "
+ "Java C++') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" Java C++') LIMIT "
+ "50000";
testTextSearchAggregationQueryHelper(query, expected.size());
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" "
+ "AND Java AND C++') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND Java AND C++')"
+ " LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
    // test for the text index configured to not store the raw value (SKILLS_TEXT_COL_2)
    // the full text index is still stored, so search works as usual
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_2, '\"distributed systems\" AND Java AND C++')"
+ " LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
    // the configured substitute value is returned instead of the raw text
query =
"SELECT INT_COL, SKILLS_TEXT_COL_2 FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_2, '\"distributed systems\" "
+ "AND Java AND C++') LIMIT 50000";
expected = new ArrayList<>();
expected.add(new Object[]{1005, "ILoveCoding"});
expected.add(new Object[]{1017, "ILoveCoding"});
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
// TEST 22: composite phrase and term query using boolean operator OR
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain ANY of the following
    // skills: the phrase "distributed systems" as is, the term 'Java', or the term 'C++'. Note: the OR operator is
    // implicit when we don't specify any operator. The expected result table was built by doing
    // grep -n -i -E 'distributed systems|java|c\+\+' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{1000, "Accounts, Banking, Insurance, worked in NGO, Java"});
expected.add(new Object[]{1003, "Java, C++, worked on open source projects, coursera machine learning"});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1006,
"Java, Python, C++, Machine learning, building and deploying large scale production systems, concurrency, "
+ "multi-threading, CPU processing"
});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1008,
"Amazon EC2, AWS, hadoop, big data, spark, building high performance scalable systems, building and deploying"
+ " large scale production systems, concurrency, multi-threading, Java, C++, CPU processing"
});
expected.add(new Object[]{
1009,
"Distributed systems, database development, columnar query engine, database kernel, storage, indexing and "
+ "transaction processing, building large scale systems"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1014,
"Apache spark, Java, C++, query processing, transaction processing, distributed storage, concurrency, "
+ "multi-threading, apache airflow"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1018,
"Realtime stream processing, publish subscribe, columnar processing for data warehouses, concurrency, Java, "
+ "multi-threading, C++,"
});
expected.add(new Object[]{
1019,
"C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, "
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" Java"
+ " C++') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" Java C++') LIMIT "
+ "50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// test for the index configured to use AND as the default
// conjunction operator
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" OR"
+ " Java OR C++') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" OR Java OR C++') "
+ "LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
    // TEST 23: composite phrase and term query using both AND and OR
    // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase
    // "distributed systems" as is and any of the terms 'Java' or 'C++'. The expected result table was built by
    // doing grep -n -i -E 'distributed systems.*(java|c\+\+)' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND "
+ "(Java C++)') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND (Java C++)') "
+ "LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// test for the index configured to use AND as the default
// conjunction operator
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" "
+ "AND (Java OR C++)') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND (Java OR C++)"
+ "') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
expected = new ArrayList<>();
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" "
+ "AND (Java C++)') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND (Java C++)') "
+ "LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 24: prefix query
// Search in SKILLS_TEXT_COL column to look for documents that have stream* -- stream, streaming, streams etc.
// The expected result table was built by doing grep -n -i -E 'stream' skills.txt
expected = new ArrayList<>();
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1015,
"Big data stream processing, Apache Flink, Apache Beam, database kernel, distributed query engines for "
+ "analytics and data warehouses"
});
expected.add(new Object[]{
1018,
"Realtime stream processing, publish subscribe, columnar processing for data warehouses, concurrency, Java, "
+ "multi-threading, C++,"
});
expected.add(new Object[]{
1019,
"C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, "
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'stream*') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'stream*') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
    // TEST 25: regex query
    // Search in SKILLS_TEXT_COL column to look for documents that contain 'exception'.
    // Based on how 'exception' appears in the original text, it is not tokenized into an individual indexable
    // token/term, so it is not available in the index on its own. It is present as part of a larger token,
    // depending on where the word boundary falls. So we need to use a Lucene regex query.
expected = new ArrayList<>();
expected.add(new Object[]{
1022,
"GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 - "
+ "NullPointerException"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '/.*exception/') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '/.*exception/') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
  }

/**
* Tests for combining (using AND/OR)
* the execution of text match filters with other filters.
*/
@Test
public void testTextSearchWithAdditionalFilter()
throws Exception {
// TEST 1: combine an index based doc id iterator (text_match) with scan based doc id iterator (range >= ) using AND
List<Object[]> expected = new ArrayList<>();
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
String query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL >= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL >= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL >= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL_DICT, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL >= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 2: combine an index based doc id iterator (text_match) with scan based doc id iterator (range <= ) using AND
expected = new ArrayList<>();
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1009,
"Distributed systems, database development, columnar query engine, database kernel, storage, indexing and "
+ "transaction processing, building large scale systems"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL <= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL <= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL <= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL_DICT, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL <= 1010 AND TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 3: combine an index based doc id iterator (text_match) with scan based doc id iterator (range >= ) using OR
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL >= 1010 OR TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\"')"
+ " LIMIT 50000";
testTextSearchAggregationQueryHelper(query, 24142);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL >= 1010 OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, 24142);
// TEST 4: combine an index based doc id iterator (text_match) with scan based doc id iterator (range <= ) using OR
expected = new ArrayList<>();
expected.add(new Object[]{1000, "Accounts, Banking, Insurance, worked in NGO, Java"});
expected.add(new Object[]{1001, "Accounts, Banking, Finance, Insurance"});
expected.add(new Object[]{1002, "Accounts, Finance, Banking, Insurance"});
expected.add(new Object[]{1003, "Java, C++, worked on open source projects, coursera machine learning"});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1005,
"Distributed systems, Java, C++, Go, distributed query engines for analytics and data warehouses, Machine "
+ "learning, spark, Kubernetes, transaction processing"
});
expected.add(new Object[]{
1006,
"Java, Python, C++, Machine learning, building and deploying large scale production systems, concurrency, "
+ "multi-threading, CPU processing"
});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1008,
"Amazon EC2, AWS, hadoop, big data, spark, building high performance scalable systems, building and deploying"
+ " large scale production systems, concurrency, multi-threading, Java, C++, CPU processing"
});
expected.add(new Object[]{
1009,
"Distributed systems, database development, columnar query engine, database kernel, storage, indexing and "
+ "transaction processing, building large scale systems"
});
expected.add(new Object[]{
1010,
"Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed "
+ "storage, concurrency, multi-threading"
});
expected.add(new Object[]{
1012, "Distributed systems, Java, database engine, cluster management, docker image building and distribution"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL <= 1010 OR TEXT_MATCH(SKILLS_TEXT_COL, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL <= 1010 OR TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\"')"
+ " LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL <= 1010 OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL <= 1010 OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 5: combine an index based doc id iterator (text_match) with sorted inverted index doc id iterator
// (equality) using AND
expected = new ArrayList<>();
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL = 1017 AND TEXT_MATCH(SKILLS_TEXT_COL, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL = 1017 AND TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\"')"
+ " LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
query =
"SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE INT_COL = 1017 AND TEXT_MATCH(SKILLS_TEXT_COL_DICT, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL = 1017 AND TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 6: combine an index based doc id iterator (text_match) with sorted inverted index doc id iterator
// (equality) using OR
expected = new ArrayList<>();
expected.add(new Object[]{1005});
expected.add(new Object[]{1009});
expected.add(new Object[]{1010});
expected.add(new Object[]{1012});
expected.add(new Object[]{1017});
expected.add(new Object[]{1020});
query =
"SELECT INT_COL FROM MyTable WHERE INT_COL = 1017 OR TEXT_MATCH(SKILLS_TEXT_COL, '\"Distributed systems\"') "
+ "LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL = 1017 OR TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\"') "
+ "LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
query =
"SELECT INT_COL FROM MyTable WHERE INT_COL = 1017 OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"Distributed "
+ "systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE INT_COL = 1017 OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed "
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 7: combine an index based doc id iterator (text_match) with another index based doc id iterator
// (text_match) using AND
expected = new ArrayList<>();
expected.add(new Object[]{1005});
expected.add(new Object[]{1009});
expected.add(new Object[]{1010});
expected.add(new Object[]{1012});
expected.add(new Object[]{1017});
expected.add(new Object[]{1020});
query =
"SELECT INT_COL FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"SELECT count\"') AND TEXT_MATCH"
+ "(SKILLS_TEXT_COL, '\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(QUERY_LOG_TEXT_COL, '\"SELECT count\"') AND TEXT_MATCH"
+ "(SKILLS_TEXT_COL, '\"Distributed systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// TEST 8: combine an index based doc id iterator (text_match) with another index based doc id iterator
// (text_match) using OR
expected = new ArrayList<>();
expected.add(new Object[]{1005});
expected.add(new Object[]{1009});
expected.add(new Object[]{1010});
expected.add(new Object[]{1012});
expected.add(new Object[]{1013});
expected.add(new Object[]{1014});
expected.add(new Object[]{1015});
expected.add(new Object[]{1017});
expected.add(new Object[]{1019});
expected.add(new Object[]{1020});
query =
"SELECT INT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'apache') OR TEXT_MATCH(SKILLS_TEXT_COL, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'apache') OR TEXT_MATCH(SKILLS_TEXT_COL, "
+ "'\"Distributed systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
    // since we support text index on dictionary-encoded columns, the column might
    // also have a native Pinot inverted index in addition to the text index.
    // so this query tests an exact match on the text column, which will use the
    // native inverted index.
expected = new ArrayList<>();
expected.add(new Object[]{1004});
query =
"SELECT INT_COL FROM MyTable WHERE SKILLS_TEXT_COL_DICT = 'Machine learning, Tensor flow, Java, Stanford "
+ "university,' LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE SKILLS_TEXT_COL_DICT = 'Machine learning, Tensor flow, Java, Stanford "
+ "university,' LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
// since we support text index on dictionary-encoded columns, the column might
// also have a native Pinot inverted index in addition to the text index.
// this query combines an exact match on the text column (which will use the
// native inverted index) with a text match on the same column (which will
// use the text index)
expected = new ArrayList<>();
expected.add(new Object[]{1003});
expected.add(new Object[]{1004});
expected.add(new Object[]{1005});
expected.add(new Object[]{1006});
expected.add(new Object[]{1007});
expected.add(new Object[]{1010});
expected.add(new Object[]{1011});
expected.add(new Object[]{1016});
expected.add(new Object[]{1019});
expected.add(new Object[]{1020});
query =
"SELECT INT_COL FROM MyTable WHERE SKILLS_TEXT_COL_DICT = 'Machine learning, Tensor flow, Java, Stanford "
+ "university,' OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"machine learning\"') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query =
"SELECT COUNT(*) FROM MyTable WHERE SKILLS_TEXT_COL_DICT = 'Machine learning, Tensor flow, Java, Stanford "
+ "university,' OR TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"machine learning\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
}
/**
* Test the reference counting mechanism of {@link SearcherManager}
* used by {@link RealtimeLuceneTextIndex}
* for near realtime text search.
*/
@Test
public void testLuceneRealtimeWithSearcherManager()
throws Exception {
// create and open an index writer
File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test1.index");
Directory indexDirectory = FSDirectory.open(indexFile.toPath());
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
indexWriterConfig.setRAMBufferSizeMB(500);
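// a large RAM buffer keeps the writer from flushing in-memory segments to disk during the test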
IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
// create a SearcherManager to manage NRT searchers over the index writer
SearcherManager searcherManager = new SearcherManager(indexWriter, false, false, null);
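// applyAllDeletes and writeAllDeletes are both false since this test only appends documents;
// the null SearcherFactory argument falls back to the default factory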
QueryParser queryParser = new QueryParser("skill", standardAnalyzer);
Query query = queryParser.parse("\"machine learning\"");
// acquire a searcher
// the creation of SearcherManager would have created an IndexReader with a
// refcount of 1, and searcherManager.acquire() bumps the refcount by 1,
// so the refcount should be 2 now
// search() should not see any hits since nothing has been added to the index
IndexSearcher searcher1 = searcherManager.acquire();
assertEquals(2, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher1.search(query, 100).scoreDocs.length);
// acquire a searcher
// since a refresh hasn't happened yet, this should be the same searcher as searcher1
// but with the refcount incremented, so the refcount should be 3 now
// search() should not see any hits since nothing has been added to the index
IndexSearcher searcher2 = searcherManager.acquire();
assertEquals(3, searcher2.getIndexReader().getRefCount());
assertEquals(0, searcher2.search(query, 100).scoreDocs.length);
assertEquals(searcher1, searcher2);
assertEquals(3, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher1.search(query, 100).scoreDocs.length);
// add something to the index but don't commit
Document docToIndex = new Document();
docToIndex.add(new TextField("skill", "machine learning", Field.Store.NO));
indexWriter.addDocument(docToIndex);
// refresh the searcher inside SearcherManager
// this opens a new NRT searcher that can see the uncommitted document
// and installs it as the current searcher
searcherManager.maybeRefresh();
// acquire a searcher
// this should be the refreshed one
// the refcount of the reader associated with this searcher
// should be 2 (1 as the initial refcount of the reader and +1
// due to acquire)
IndexSearcher searcher3 = searcherManager.acquire();
assertEquals(2, searcher3.getIndexReader().getRefCount());
// this searcher should see the uncommitted document in the index
assertEquals(1, searcher3.search(query, 100).scoreDocs.length);
assertNotEquals(searcher2, searcher3);
// searcher1 and searcher2 refcount should have gone down by 1 due to the refresh:
// they wrapped the current searcher before the refresh, and once SearcherManager
// installed the new searcher it decremented the refcount of the old one
assertEquals(2, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher1.search(query, 100).scoreDocs.length);
assertEquals(2, searcher2.getIndexReader().getRefCount());
assertEquals(0, searcher2.search(query, 100).scoreDocs.length);
// done searching with searcher1
// release it -- this should decrement the refcount by 1 for both searcher1
// and searcher2 since they are the same searcher
searcherManager.release(searcher1);
assertEquals(1, searcher1.getIndexReader().getRefCount());
assertEquals(1, searcher2.getIndexReader().getRefCount());
// the above release should not have impacted searcher3
assertEquals(2, searcher3.getIndexReader().getRefCount());
assertEquals(1, searcher3.search(query, 100).scoreDocs.length);
// done searching with searcher2
// release it -- this should decrement the refcount by 1 for both searcher1
// and searcher2 since they are the same searcher
// this gets the refcount to 0 and the associated reader is closed
searcherManager.release(searcher2);
assertEquals(0, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher2.getIndexReader().getRefCount());
// the above release should not have impacted searcher3
assertEquals(2, searcher3.getIndexReader().getRefCount());
assertEquals(1, searcher3.search(query, 100).scoreDocs.length);
// add another document to the index but don't commit
docToIndex = new Document();
docToIndex.add(new TextField("skill", "java, machine learning", Field.Store.NO));
indexWriter.addDocument(docToIndex);
// searcher3 should not see the second document
assertEquals(2, searcher3.getIndexReader().getRefCount());
assertEquals(1, searcher3.search(query, 100).scoreDocs.length);
// refresh
searcherManager.maybeRefresh();
// the refresh would have installed a new current searcher inside SearcherManager
// and decremented the refcount of the previous current searcher (searcher3)
assertEquals(1, searcher3.getIndexReader().getRefCount());
assertEquals(1, searcher3.search(query, 100).scoreDocs.length);
// acquire a searcher
// this should be the refreshed one
// the refcount of the reader associated with this searcher
// should be 2 (1 as the initial refcount of the reader and +1
// due to acquire)
IndexSearcher searcher4 = searcherManager.acquire();
assertEquals(2, searcher4.getIndexReader().getRefCount());
// we should see both the uncommitted documents with the refreshed searcher
assertEquals(2, searcher4.search(query, 100).scoreDocs.length);
// searcher3 should not have been affected by the new acquire
assertEquals(1, searcher3.getIndexReader().getRefCount());
assertEquals(1, searcher3.search(query, 100).scoreDocs.length);
// done searching with searcher3
// release it -- this should decrement its refcount to 0
// and close the associated reader
searcherManager.release(searcher3);
assertEquals(0, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher2.getIndexReader().getRefCount());
assertEquals(0, searcher3.getIndexReader().getRefCount());
// searcher4 should not have been impacted by above release
assertEquals(2, searcher4.getIndexReader().getRefCount());
assertEquals(2, searcher4.search(query, 100).scoreDocs.length);
// refresh
searcherManager.maybeRefresh();
// the above refresh is essentially a no-op since nothing has changed
// in the index, so any acquire after it will return the same searcher
// as searcher4 but with the refcount incremented by 1
IndexSearcher searcher5 = searcherManager.acquire();
assertEquals(3, searcher4.getIndexReader().getRefCount());
assertEquals(2, searcher4.search(query, 100).scoreDocs.length);
assertEquals(3, searcher5.getIndexReader().getRefCount());
assertEquals(2, searcher5.search(query, 100).scoreDocs.length);
assertEquals(searcher4, searcher5);
searcherManager.release(searcher4);
assertEquals(0, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher2.getIndexReader().getRefCount());
assertEquals(0, searcher3.getIndexReader().getRefCount());
assertEquals(2, searcher4.getIndexReader().getRefCount());
assertEquals(2, searcher4.search(query, 100).scoreDocs.length);
assertEquals(2, searcher5.getIndexReader().getRefCount());
assertEquals(2, searcher5.search(query, 100).scoreDocs.length);
searcherManager.release(searcher5);
assertEquals(0, searcher1.getIndexReader().getRefCount());
assertEquals(0, searcher2.getIndexReader().getRefCount());
assertEquals(0, searcher3.getIndexReader().getRefCount());
assertEquals(1, searcher4.getIndexReader().getRefCount());
assertEquals(2, searcher4.search(query, 100).scoreDocs.length);
assertEquals(1, searcher5.getIndexReader().getRefCount());
assertEquals(2, searcher5.search(query, 100).scoreDocs.length);
searcherManager.close();
assertEquals(0, searcher4.getIndexReader().getRefCount());
assertEquals(0, searcher5.getIndexReader().getRefCount());
indexWriter.close();
}
/**
* Test realtime search by verifying that the realtime reader is able
* to see a monotonically increasing number of uncommitted documents
* added to the index.
*/
@Test
public void testLuceneRealtimeWithoutSearcherManager()
throws Exception {
// create and open an index writer
File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test2.index");
Directory indexDirectory = FSDirectory.open(indexFile.toPath());
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
indexWriterConfig.setRAMBufferSizeMB(50);
IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
// add a document but don't commit
Document docToIndex = new Document();
docToIndex.add(new TextField("skill", "distributed systems, machine learning, JAVA, C++", Field.Store.NO));
indexWriter.addDocument(docToIndex);
// create an NRT index reader from the writer -- should see one uncommitted document
QueryParser queryParser = new QueryParser("skill", standardAnalyzer);
Query query = queryParser.parse("\"distributed systems\" AND (Java C++)");
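// QueryParser's default operator is OR, so the parenthesized clause matches
// documents containing either Java or C++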
IndexReader indexReader1 = DirectoryReader.open(indexWriter);
IndexSearcher searcher1 = new IndexSearcher(indexReader1);
assertEquals(1, searcher1.search(query, 50).scoreDocs.length);
// add another document but don't commit
docToIndex = new Document();
docToIndex.add(new TextField("skill", "distributed systems, python, JAVA, C++", Field.Store.NO));
indexWriter.addDocument(docToIndex);
// reopen NRT reader and search -- should see two uncommitted documents
IndexReader indexReader2 = DirectoryReader.openIfChanged((DirectoryReader) indexReader1);
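// openIfChanged returns a new reader only if the index has changed since the
// previous reader was opened, and null otherwise -- hence the assertNotNull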
assertNotNull(indexReader2);
IndexSearcher searcher2 = new IndexSearcher(indexReader2);
assertEquals(2, searcher2.search(query, 50).scoreDocs.length);
// add another document
docToIndex = new Document();
docToIndex.add(new TextField("skill", "distributed systems, GPU, JAVA, C++", Field.Store.NO));
indexWriter.addDocument(docToIndex);
// reopen NRT reader and search -- should see three uncommitted documents
IndexReader indexReader3 = DirectoryReader.openIfChanged((DirectoryReader) indexReader2);
assertNotNull(indexReader3);
IndexSearcher searcher3 = new IndexSearcher(indexReader3);
assertEquals(3, searcher3.search(query, 50).scoreDocs.length);
indexWriter.close();
indexReader1.close();
indexReader2.close();
indexReader3.close();
}
@Test
public void testMultiThreadedLuceneRealtime()
throws Exception {
File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test3.index");
Directory indexDirectory = FSDirectory.open(indexFile.toPath());
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
// create and open a writer
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
indexWriterConfig.setRAMBufferSizeMB(500);
IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
// create an NRT index reader
SearcherManager searcherManager = new SearcherManager(indexWriter, false, false, null);
// background thread to refresh NRT reader
ControlledRealTimeReopenThread controlledRealTimeReopenThread =
new ControlledRealTimeReopenThread(indexWriter, searcherManager, 0.01, 0.01);
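// targetMaxStaleSec and targetMinStaleSec of 0.01 make the reopen thread
// refresh the searcher roughly every 10ms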
controlledRealTimeReopenThread.start();
// start writer and reader
Thread writer = new Thread(new RealtimeWriter(indexWriter));
Thread realtimeReader = new Thread(new RealtimeReader(searcherManager, standardAnalyzer));
writer.start();
realtimeReader.start();
writer.join();
realtimeReader.join();
// shut down the reopen thread cleanly: close() signals it to exit and joins it
controlledRealTimeReopenThread.close();
searcherManager.close();
}
private static class RealtimeWriter implements Runnable {
private final IndexWriter _indexWriter;
RealtimeWriter(IndexWriter indexWriter) {
_indexWriter = indexWriter;
}
@Override
public void run() {
URL resourceUrl = getClass().getClassLoader().getResource("data/text_search_data/skills.txt");
File skillFile = new File(resourceUrl.getFile());
String[] skills = new String[100];
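// note: the fixed-size array assumes skills.txt contains at most 100 lines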
int skillCount = 0;
try (InputStream inputStream = new FileInputStream(skillFile);
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
String line;
while ((line = reader.readLine()) != null) {
skills[skillCount++] = line;
}
} catch (Exception e) {
throw new RuntimeException("Caught exception while reading skills file");
}
try {
int counter = 0;
Random random = new Random();
// ingest 500k documents
while (counter < 500000) {
Document docToIndex = new Document();
if (counter >= skillCount) {
int index = random.nextInt(skillCount);
docToIndex.add(new TextField("skill", skills[index], Field.Store.NO));
} else {
docToIndex.add(new TextField("skill", skills[counter], Field.Store.NO));
}
counter++;
_indexWriter.addDocument(docToIndex);
}
} catch (Exception e) {
throw new RuntimeException("Caught exception while adding a document to index");
} finally {
try {
_indexWriter.commit();
_indexWriter.close();
} catch (Exception e) {
throw new RuntimeException("Failed to commit/close the index writer");
}
}
}
}
private static class RealtimeReader implements Runnable {
private final QueryParser _queryParser;
private final SearcherManager _searcherManager;
RealtimeReader(SearcherManager searcherManager, StandardAnalyzer standardAnalyzer) {
_queryParser = new QueryParser("skill", standardAnalyzer);
_searcherManager = searcherManager;
}
@Override
public void run() {
try {
Query query = _queryParser.parse("\"machine learning\" AND spark");
int count = 0;
int prevHits = 0;
// run the same query 1000 times and expect a non-decreasing number of hits
// as the writer keeps adding documents to the index
while (count < 1000) {
IndexSearcher indexSearcher = _searcherManager.acquire();
int hits = indexSearcher.search(query, Integer.MAX_VALUE).scoreDocs.length;
// TODO: see how we can make this more deterministic
if (count > 200) {
// we should see an increasing number of hits
assertTrue(hits > 0);
assertTrue(hits >= prevHits);
}
count++;
prevHits = hits;
_searcherManager.release(indexSearcher);
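// brief pause between queries to give the background reopen thread a chance to refresh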
Thread.sleep(1);
}
} catch (Exception e) {
throw new RuntimeException("Caught exception in realtime reader");
}
}
}
/*
* Helper methods for tests
*/
private void testTextSearchSelectQueryHelper(String query, int expectedResultSize, boolean compareGrepOutput,
List<Object[]> expectedResults)
throws Exception {
SelectionOnlyOperator operator = getOperator(query);
List<Object[]> resultset = (List<Object[]>) operator.nextBlock().getRows();
assertNotNull(resultset);
assertEquals(resultset.size(), expectedResultSize);
if (compareGrepOutput) {
// compare with grep output
verifySearchOutputWithGrepResults(resultset);
} else if (expectedResults != null) {
// compare with expected result table
for (int i = 0; i < expectedResultSize; i++) {
Object[] actualRow = resultset.get(i);
Object[] expectedRow = expectedResults.get(i);
assertEquals(actualRow.length, expectedRow.length);
for (int j = 0; j < actualRow.length; j++) {
Object actualColValue = actualRow[j];
Object expectedColValue = expectedRow[j];
assertEquals(actualColValue, expectedColValue);
}
}
}
}
private void verifySearchOutputWithGrepResults(List<Object[]> actualResultSet)
throws Exception {
URL resourceUrl = getClass().getClassLoader().getResource("data/text_search_data/group_by_grep_results.out");
File file = new File(resourceUrl.getFile());
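// each line of the grep output is "<1-based line number>:<text>", where line 1
// corresponds to INT_COL = INT_BASE_VALUE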
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
String line;
int counter = 0;
while ((line = reader.readLine()) != null) {
String[] expectedRow = line.split(":");
Object[] actualRow = actualResultSet.get(counter);
int expectedIntColValue = Integer.parseInt(expectedRow[0]) + INT_BASE_VALUE - 1;
assertEquals(expectedIntColValue, actualRow[0]);
assertEquals(expectedRow[1], actualRow[1]);
counter++;
}
}
}
private void testTextSearchAggregationQueryHelper(String query, int expectedCount) {
BaseOperator<AggregationResultsBlock> operator = getOperator(query);
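// COUNT(*) is the only aggregation in these queries, so its value is the first result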
long count = (Long) operator.nextBlock().getResults().get(0);
assertEquals(expectedCount, count);
}
private void testSkillsColumn(String searchQuery, List<Object[]> expected)
throws Exception {
for (String skillColumn : Arrays.asList(SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_DICT_NAME,
SKILLS_TEXT_COL_MULTI_TERM_NAME, SKILLS_TEXT_NO_RAW_NAME, SKILLS_TEXT_MV_COL_NAME,
SKILLS_TEXT_MV_COL_DICT_NAME)) {
String query =
String.format("SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(%s, '%s') LIMIT 50000",
skillColumn, searchQuery);
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
query = String.format("SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(%s, '%s') LIMIT 50000", skillColumn,
searchQuery);
testTextSearchAggregationQueryHelper(query, expected.size());
}
}
@Test
public void testInterSegment() {
String query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND \"Tensor flow\"')";
testInterSegmentAggregationQueryHelper(query, 12);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"Machine learning\" AND \"Tensor flow\"')";
testInterSegmentAggregationQueryHelper(query, 12);
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND \"Tensor flow\"') LIMIT 50000";
List<Object[]> expected = new ArrayList<>();
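// the same data is loaded into four segments for inter-segment queries,
// so each matching row is expected four times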
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
expected.add(new Object[]{1004, "Machine learning, Tensor flow, Java, Stanford university,"});
expected.add(new Object[]{
1007,
"C++, Python, Tensor flow, database kernel, storage, indexing and transaction processing, building large "
+ "scale systems, Machine learning"
});
expected.add(new Object[]{
1016,
"CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter notebook, spark, Machine learning, building high "
+ "performance scalable systems"
});
testInterSegmentSelectionQueryHelper(query, expected);
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"Machine learning\" AND \"Tensor flow\"') LIMIT 50000";
testInterSegmentSelectionQueryHelper(query, expected);
// try arbitrary filters in search expressions
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '(\"distributed systems\" AND apache) OR (Java AND C++)')";
testInterSegmentAggregationQueryHelper(query, 36);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '(\"distributed systems\" AND apache) OR (Java AND C++)')";
testInterSegmentAggregationQueryHelper(query, 36);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '(\"distributed systems\" AND apache) AND (Java AND C++)')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '(\"distributed systems\" AND apache) AND (Java AND C++)')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '(\"distributed systems\" AND apache) AND (Java AND C++)') LIMIT 50000";
expected = new ArrayList<>();
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
expected.add(new Object[]{
1017,
"Distributed systems, Apache Kafka, publish-subscribe, building and deploying large scale production systems,"
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
testInterSegmentSelectionQueryHelper(query, expected);
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '(\"distributed systems\" AND apache) AND (Java AND C++)') LIMIT 50000";
testInterSegmentSelectionQueryHelper(query, expected);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '(\"apache spark\" OR \"query processing\") AND \"machine learning\"')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '(\"apache spark\" OR \"query processing\") AND \"machine learning\"')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '(\"apache spark\" OR \"query processing\") AND \"machine learning\"') "
+ "LIMIT 50000";
expected = new ArrayList<>();
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
expected.add(new Object[]{
1020,
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
testInterSegmentSelectionQueryHelper(query, expected);
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '(\"apache spark\" OR \"query processing\") AND \"machine learning\"') "
+ "LIMIT 50000";
testInterSegmentSelectionQueryHelper(query, expected);
// query with only stop words -- they are not indexed, so the query should match nothing
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'a and or in the are')";
testInterSegmentAggregationQueryHelper(query, 0);
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, 'a and or in the are')";
testInterSegmentAggregationQueryHelper(query, 0);
// the analyzer should prune/ignore the stop words in the search expression and consider everything else for a match
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"learned a lot\"')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"learned a lot\"')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"indexing and transaction processing\"')";
testInterSegmentAggregationQueryHelper(query, 12);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"indexing and transaction processing\"')";
testInterSegmentAggregationQueryHelper(query, 12);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '\"docker image building and distribution\"')";
testInterSegmentAggregationQueryHelper(query, 8);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"docker image building and distribution\"')";
testInterSegmentAggregationQueryHelper(query, 8);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed query engines for analytics and data warehouses\"')";
testInterSegmentAggregationQueryHelper(query, 8);
query = "SELECT count(*) FROM MyTable WHERE "
+ "TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"distributed query engines for analytics and data warehouses\"')";
testInterSegmentAggregationQueryHelper(query, 8);
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"worked in NGO\"')";
testInterSegmentAggregationQueryHelper(query, 4);
query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"worked in NGO\"')";
testInterSegmentAggregationQueryHelper(query, 4);
}
private void testInterSegmentAggregationQueryHelper(String query, long expectedCount) {
DataSchema expectedDataSchema = new DataSchema(new String[]{"count(*)"}, new ColumnDataType[]{ColumnDataType.LONG});
List<Object[]> expectedRows = Collections.singletonList(new Object[]{expectedCount});
QueriesTestUtils.testInterSegmentsResult(getBrokerResponse(query),
new ResultTable(expectedDataSchema, expectedRows));
}
private void testInterSegmentSelectionQueryHelper(String query, List<Object[]> expectedRows) {
DataSchema expectedDataSchema = new DataSchema(new String[]{"INT_COL", "SKILLS_TEXT_COL"},
new ColumnDataType[]{ColumnDataType.INT, ColumnDataType.STRING});
QueriesTestUtils.testInterSegmentsResult(getBrokerResponse(query),
new ResultTable(expectedDataSchema, expectedRows));
}
}