Fix empty result from to_dataframe when a table-model query selects only TAG columns (no FIELD column). (#730)
diff --git a/python/tests/test_load_tsfile_from_iotdb.py b/python/tests/test_load_tsfile_from_iotdb.py index 50ca0ba..21347c9 100644 --- a/python/tests/test_load_tsfile_from_iotdb.py +++ b/python/tests/test_load_tsfile_from_iotdb.py
@@ -51,6 +51,7 @@ (1760106080000 + 1760106109000) * 30 // 2 ) assert df["s0"].isna().sum() == 0 + df_s0 = df["s0"] assert df["s1"].isna().sum() == 0 assert df["s2"].isna().sum() == 8 assert df["s3"].isna().sum() == 0 @@ -73,6 +74,12 @@ assert df["s8"].isna().sum() == 0 assert df["s8"].nunique() == 60 assert df["s9"].isna().sum() == 8 + + df = ts.to_dataframe(simple_tabl1_path, table_name="test", column_names=["s0"]) + assert len(df) == 60 + assert len(df.columns) == 2 + assert df["s0"].equals(df_s0) + ## --------- simple_tabl2_path = os.path.join(dir_path, 'simple_table_t2.tsfile') @@ -118,17 +125,23 @@ assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) assert (df["region_id"] == "loc").sum() == 25 + df_id = df["id"] - df = ts.to_dataframe(table_with_time_column_path, table_name="table2", column_names=["region_id", "temperature", "humidity"]) + df = ts.to_dataframe(table_with_time_column_path, table_name="table2", + column_names=["region_id", "temperature", "humidity"]) assert list(df.columns)[0] == "id" assert len(df) == 25 assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) assert (df["region_id"] == "loc").sum() == 25 - df = ts.to_dataframe(table_with_time_column_path, table_name="table2", column_names=["id", "temperature", "humidity"]) + df = ts.to_dataframe(table_with_time_column_path, table_name="table2", + column_names=["id", "temperature", "humidity"]) assert list(df.columns)[0] == "time" assert df["id"].equals(df["time"]) assert len(df) == 25 assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) + df = ts.to_dataframe(table_with_time_column_path, table_name="table2", column_names=["id"]) + assert len(df.columns) == 2 + assert df_id.equals(df["id"])
diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index 6044ddb..2e5fc05 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py
@@ -22,7 +22,7 @@ import pandas as pd from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype -from tsfile import ColumnSchema, TableSchema, ColumnCategory, TSDataType +from tsfile import ColumnSchema, TableSchema, ColumnCategory, TSDataType, TIME_COLUMN from tsfile.exceptions import TableNotExistError, ColumnNotExistError from tsfile.tsfile_reader import TsFileReaderPy from tsfile.tsfile_table_writer import TsFileTableWriter, infer_object_column_type, validate_dataframe_for_tsfile @@ -116,10 +116,16 @@ is_tree_model = len(table_schema) == 0 time_column = None + column_name_to_query = [] + no_field_query = True if is_tree_model: if _column_names is None: print("columns name is None, return all columns") + # When querying tables in the tree, only measurements are allowed currently. + no_field_query = False else: + _table_name = _table_name.lower() if _table_name else None + _column_names = [column.lower() for column in _column_names] if _column_names else None if _table_name is None: _table_name, table_schema = next(iter(table_schema.items())) else: @@ -137,17 +143,26 @@ if _column_names is not None: for column in _column_names: - if column.lower() not in column_names_in_file and column.lower() != time_column : + if column not in column_names_in_file and column != time_column: raise ColumnNotExistError(column) + if table_schema.get_column(column).get_category() == ColumnCategory.FIELD: + no_field_query = False + if no_field_query: + if time_column is not None: + column_name_to_query.append(time_column) + column_name_to_query.extend(column_names_in_file) + else: + column_name_to_query = _column_names else: - _column_names = column_names_in_file + no_field_query = False + column_name_to_query = column_names_in_file if is_tree_model: - if _column_names is None: - _column_names = [] - query_result = reader.query_table_on_tree(_column_names, _start_time, _end_time) + if _column_names is not None: + column_name_to_query = _column_names + query_result = 
reader.query_table_on_tree(column_name_to_query, _start_time, _end_time) else: - query_result = reader.query_table(_table_name, _column_names, _start_time, _end_time) + query_result = reader.query_table(_table_name, column_name_to_query, _start_time, _end_time) with query_result as result: while result.next(): @@ -164,8 +179,11 @@ continue total_rows += len(dataframe) if time_column is not None: - if _column_names is None or time_column.lower() not in [c.lower() for c in _column_names]: + if _column_names is None or time_column not in _column_names: dataframe = dataframe.rename(columns={dataframe.columns[0]: time_column}) + if no_field_query and _column_names is not None: + _column_names.insert(0, TIME_COLUMN) + dataframe = dataframe[_column_names] yield dataframe if (not is_iterator) and max_row_num is not None and total_rows >= max_row_num: break