Update like statements to reflect sql behaviors (#91) * Update like statements to reflect sql behaviors * Codestyle * Codestyle * Handle NotStartsWith * Update pyiceberg/expressions/parser.py Co-authored-by: Fokko Driesprong <fokko@apache.org> * Update tests/expressions/test_parser.py Co-authored-by: Fokko Driesprong <fokko@apache.org> --------- Co-authored-by: Fokko Driesprong <fokko@apache.org>

commit: a09de6914e2ed0b2e3472f1a1e274a546f87ca51 [log] [tgz]
author: Daniel Weeks <daniel.c.weeks@gmail.com> Sat Oct 21 15:38:00 2023 -0700
committer: Fokko Driesprong <fokko@tabular.io> Tue Oct 24 09:01:36 2023 -0400
tree: 6d61fde91f7718f2e2af2a21819b062061c08e94
parent: 1b186d6041f0160c736365709bd091c59dd1795c [diff]
diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py
index 4580533..8873907 100644
--- a/pyiceberg/expressions/parser.py
+++ b/pyiceberg/expressions/parser.py

@@ -14,6 +14,7 @@
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
+import re
 from decimal import Decimal
 
 from pyparsing import (
@@ -51,7 +52,6 @@
     NotIn,
     NotNaN,
     NotNull,
-    NotStartsWith,
     Or,
     Reference,
     StartsWith,
@@ -78,6 +78,8 @@
 identifier = Word(alphas, alphanums + "_$").set_results_name("identifier")
 column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column")
 
+like_regex = r'(?P<valid_wildcard>(?<!\\)%$)|(?P<invalid_wildcard>(?<!\\)%)'
+
 
 @column.set_parse_action
 def _(result: ParseResults) -> Reference:
@@ -217,12 +219,25 @@
 
 @starts_with.set_parse_action
 def _(result: ParseResults) -> BooleanExpression:
-    return StartsWith(result.column, result.raw_quoted_string)
+    return _evaluate_like_statement(result)
 
 
 @not_starts_with.set_parse_action
 def _(result: ParseResults) -> BooleanExpression:
-    return NotStartsWith(result.column, result.raw_quoted_string)
+    return ~_evaluate_like_statement(result)
+
+
+def _evaluate_like_statement(result: ParseResults) -> BooleanExpression:
+    literal_like: StringLiteral = result.raw_quoted_string
+
+    match = re.search(like_regex, literal_like.value)
+
+    if match and match.groupdict()['invalid_wildcard']:
+        raise ValueError("LIKE expressions only supports wildcard, '%', at the end of a string")
+    elif match and match.groupdict()['valid_wildcard']:
+        return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%')))
+    else:
+        return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%')))
 
 
 predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate")

diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py
index 65415f2..8257710 100644
--- a/tests/expressions/test_parser.py
+++ b/tests/expressions/test_parser.py

@@ -168,12 +168,30 @@
     ) == parser.parse("foo is not null and foo < 5 or (foo > 10 and foo < 100 and bar is null)")
 
 
+def test_like_equality() -> None:
+    assert EqualTo("foo", "data") == parser.parse("foo LIKE 'data'")
+    assert EqualTo("foo", "data%") == parser.parse("foo LIKE 'data\\%'")
+
+
 def test_starts_with() -> None:
-    assert StartsWith("foo", "data") == parser.parse("foo LIKE 'data'")
+    assert StartsWith("foo", "data") == parser.parse("foo LIKE 'data%'")
+    assert StartsWith("foo", "some % data") == parser.parse("foo LIKE 'some \\% data%'")
+    assert StartsWith("foo", "some data%") == parser.parse("foo LIKE 'some data\\%%'")
+
+
+def test_invalid_likes() -> None:
+    invalid_statements = ["foo LIKE '%data%'", "foo LIKE 'da%ta'", "foo LIKE '%data'"]
+
+    for statement in invalid_statements:
+        with pytest.raises(ValueError) as exc_info:
+            parser.parse(statement)
+
+        assert "LIKE expressions only supports wildcard, '%', at the end of a string" in str(exc_info)
 
 
 def test_not_starts_with() -> None:
-    assert NotStartsWith("foo", "data") == parser.parse("foo NOT LIKE 'data'")
+    assert NotEqualTo("foo", "data") == parser.parse("foo NOT LIKE 'data'")
+    assert NotStartsWith("foo", "data") == parser.parse("foo NOT LIKE 'data%'")
 
 
 def test_with_function() -> None:
commit	a09de6914e2ed0b2e3472f1a1e274a546f87ca51	[log] [tgz]
author	Daniel Weeks <daniel.c.weeks@gmail.com>	Sat Oct 21 15:38:00 2023 -0700
committer	Fokko Driesprong <fokko@tabular.io>	Tue Oct 24 09:01:36 2023 -0400
tree	6d61fde91f7718f2e2af2a21819b062061c08e94
parent	1b186d6041f0160c736365709bd091c59dd1795c [diff]