From 1d015063fe2ff373da3996c24ab47b317a5b0db5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 Jan 2025 16:37:30 -0500 Subject: [PATCH 1/4] Fully support LIKE/NLIKE with Utf8View --- datafusion/sql/src/expr/mod.rs | 5 +- datafusion/sqllogictest/test_files/scalar.slt | 9 ++ .../sqllogictest/test_files/string/string.slt | 135 ----------------- .../test_files/string/string_query.slt.part | 142 ++++++++++++++++++ 4 files changed, 152 insertions(+), 139 deletions(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 9b40ebdaf6a5..2ef28cbb235e 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -30,6 +30,7 @@ use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, }; + use datafusion_expr::expr::ScalarFunction; use datafusion_expr::expr::{InList, WildcardOptions}; use datafusion_expr::{ @@ -819,10 +820,6 @@ impl SqlToRel<'_, S> { return not_impl_err!("ANY in LIKE expression"); } let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?; - let pattern_type = pattern.get_type(schema)?; - if pattern_type != DataType::Utf8 && pattern_type != DataType::Null { - return plan_err!("Invalid pattern in LIKE expression"); - } let escape_char = if let Some(char) = escape_char { if char.len() != 1 { return plan_err!("Invalid escape character in LIKE expression"); diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 107721c5fe9d..62ad70160371 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1689,6 +1689,15 @@ true true false false true true statement ok drop table t1 +# can't use like with non stirngs +query error +select column1 like 1 from (values('a'), ('b'), (NULL)) as t; +---- +DataFusion error: type_coercion +caused by +Error during planning: There isn't a common type to coerce Utf8 and Int64 in LIKE expression + + # like nlike with null lt query BB rowsort SELECT column1 like NULL as col_null, NULL like column1 as null_col from (values('a'), ('b'), (NULL)) as t diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 55f0c034f5f9..c62a6b0b3c87 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -41,141 +41,6 @@ select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; # include ./string_query.slt.part -# TODO support all String types in sql_like_to_expr and move this test to `string_query.slt.part` -# dynamic LIKE as filter -query TTT rowsort -SELECT ascii_1, 'is LIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 LIKE ascii_2 -UNION ALL -SELECT ascii_1, 'is NOT LIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 NOT LIKE ascii_2 -UNION ALL -SELECT unicode_1, 'is LIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 LIKE ascii_2 -UNION ALL -SELECT unicode_1, 'is NOT LIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 NOT LIKE ascii_2 -UNION ALL -SELECT unicode_2, 'is LIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 LIKE ascii_2 -UNION ALL -SELECT unicode_2, 'is NOT LIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 NOT LIKE ascii_2 ----- -% is LIKE \% -(empty) is LIKE % -(empty) is LIKE % -(empty) is LIKE % -(empty) is LIKE %% -(empty) is LIKE %% -(empty) is LIKE %% -(empty) is NOT LIKE \% -(empty) is NOT LIKE \% -(empty) is NOT LIKE \_ -(empty) is NOT LIKE \_ -Andrew is NOT LIKE X -Pan Tadeusz ma frunąć stąd w kąt is NOT LIKE p%t -Raphael is NOT LIKE R -Xiangpeng is LIKE Xiangpeng -_ is LIKE \_ -chrząszcz na łące w 東京都 is NOT LIKE un_____core -datafusionДатаФусион is NOT LIKE R -datafusion数据融合 is NOT LIKE Xiangpeng -datafusion数据融合 is NOT LIKE Xiangpeng -datafusion📊🔥 is NOT LIKE X -pan Tadeusz ma iść w kąt is LIKE p%t -percent is LIKE p%t -un iść core is LIKE un_____core -under_score is LIKE un_____core -аФус is NOT LIKE R -🔥 is NOT LIKE R -🔥 is NOT LIKE X - -# TODO support all String types in sql_like_to_expr and move this test to `string_query.slt.part` -# dynamic LIKE as projection -query TTTTBBBB rowsort -SELECT - ascii_1, ascii_2, unicode_1, unicode_2, - (ascii_1 LIKE ascii_2) AS ascii_1_like_ascii_2, - (ascii_2 LIKE ascii_1) AS ascii_2_like_ascii_1, - (unicode_1 LIKE ascii_2) AS unicode_1_like_ascii_2, - (unicode_2 LIKE ascii_2) AS unicode_2_like_ascii_2 -FROM test_basic_operator ----- -% \% (empty) (empty) true true false false -(empty) % (empty) (empty) true false true true -(empty) %% (empty) (empty) true false true true -Andrew X datafusion📊🔥 🔥 false false false false -NULL % NULL NULL NULL NULL NULL NULL -NULL R NULL 🔥 NULL NULL NULL false -Raphael R datafusionДатаФусион аФус false false false false -Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 true true false false -_ \_ (empty) (empty) true false false false -percent p%t pan Tadeusz ma iść w kąt Pan Tadeusz ma frunąć stąd w kąt true false true false -under_score un_____core un iść core chrząszcz na łące w 東京都 true false true false - -# TODO support all String types in sql_like_to_expr and move this test to `string_query.slt.part` -# dynamic ILIKE as filter -query TTT rowsort -SELECT ascii_1, 'is ILIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 ILIKE ascii_2 -UNION ALL -SELECT ascii_1, 'is NOT ILIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 NOT ILIKE ascii_2 -UNION ALL -SELECT unicode_1, 'is ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 ILIKE ascii_2 -UNION ALL -SELECT unicode_1, 'is NOT ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 NOT ILIKE ascii_2 -UNION ALL -SELECT unicode_2, 'is ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 ILIKE ascii_2 -UNION ALL -SELECT unicode_2, 'is NOT ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 NOT ILIKE ascii_2 ----- -% is ILIKE \% -(empty) is ILIKE % -(empty) is ILIKE % -(empty) is ILIKE % -(empty) is ILIKE %% -(empty) is ILIKE %% -(empty) is ILIKE %% -(empty) is NOT ILIKE \% -(empty) is NOT ILIKE \% -(empty) is NOT ILIKE \_ -(empty) is NOT ILIKE \_ -Andrew is NOT ILIKE X -Pan Tadeusz ma frunąć stąd w kąt is ILIKE p%t -Raphael is NOT ILIKE R -Xiangpeng is ILIKE Xiangpeng -_ is ILIKE \_ -chrząszcz na łące w 東京都 is NOT ILIKE un_____core -datafusionДатаФусион is NOT ILIKE R -datafusion数据融合 is NOT ILIKE Xiangpeng -datafusion数据融合 is NOT ILIKE Xiangpeng -datafusion📊🔥 is NOT ILIKE X -pan Tadeusz ma iść w kąt is ILIKE p%t -percent is ILIKE p%t -un iść core is ILIKE un_____core -under_score is ILIKE un_____core -аФус is NOT ILIKE R -🔥 is NOT ILIKE R -🔥 is NOT ILIKE X - -# TODO support all String types in sql_like_to_expr and move this test to `string_query.slt.part` -# dynamic ILIKE as projection -query TTTTBBBB rowsort -SELECT - ascii_1, ascii_2, unicode_1, unicode_2, - (ascii_1 ILIKE ascii_2) AS ascii_1_ilike_ascii_2, - (ascii_2 ILIKE ascii_1) AS ascii_2_ilike_ascii_1, - (unicode_1 ILIKE ascii_2) AS unicode_1_ilike_ascii_2, - (unicode_2 ILIKE ascii_2) AS unicode_2_ilike_ascii_2 -FROM test_basic_operator ----- -% \% (empty) (empty) true true false false -(empty) % (empty) (empty) true false true true -(empty) %% (empty) (empty) true false true true -Andrew X datafusion📊🔥 🔥 false false false false -NULL % NULL NULL NULL NULL NULL NULL -NULL R NULL 🔥 NULL NULL NULL false -Raphael R datafusionДатаФусион аФус false false false false -Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 true true false false -_ \_ (empty) (empty) true false false false -percent p%t pan Tadeusz ma iść w kąt Pan Tadeusz ma frunąć stąd w kąt true false true true -under_score un_____core un iść core chrząszcz na łące w 東京都 true false true false - - # # Clean up diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 2414e5864c99..ecb102f05389 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -188,6 +188,148 @@ _ (empty) false true false true NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +# -------------------------------------- +# dynamic LIKE as filter +# -------------------------------------- + +query TTT rowsort +SELECT ascii_1, 'is LIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 LIKE ascii_2 +UNION ALL +SELECT ascii_1, 'is NOT LIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 NOT LIKE ascii_2 +UNION ALL +SELECT unicode_1, 'is LIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 LIKE ascii_2 +UNION ALL +SELECT unicode_1, 'is NOT LIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 NOT LIKE ascii_2 +UNION ALL +SELECT unicode_2, 'is LIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 LIKE ascii_2 +UNION ALL +SELECT unicode_2, 'is NOT LIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 NOT LIKE ascii_2 +---- +% is LIKE \% +(empty) is LIKE % +(empty) is LIKE % +(empty) is LIKE % +(empty) is LIKE %% +(empty) is LIKE %% +(empty) is LIKE %% +(empty) is NOT LIKE \% +(empty) is NOT LIKE \% +(empty) is NOT LIKE \_ +(empty) is NOT LIKE \_ +Andrew is NOT LIKE X +Pan Tadeusz ma frunąć stąd w kąt is NOT LIKE p%t +Raphael is NOT LIKE R +Xiangpeng is LIKE Xiangpeng +_ is LIKE \_ +chrząszcz na łące w 東京都 is NOT LIKE un_____core +datafusionДатаФусион is NOT LIKE R +datafusion数据融合 is NOT LIKE Xiangpeng +datafusion数据融合 is NOT LIKE Xiangpeng +datafusion📊🔥 is NOT LIKE X +pan Tadeusz ma iść w kąt is LIKE p%t +percent is LIKE p%t +un iść core is LIKE un_____core +under_score is LIKE un_____core +аФус is NOT LIKE R +🔥 is NOT LIKE R +🔥 is NOT LIKE X + +# -------------------------------------- +# dynamic LIKE as projection +# -------------------------------------- + +query TTTTBBBB rowsort +SELECT + ascii_1, ascii_2, unicode_1, unicode_2, + (ascii_1 LIKE ascii_2) AS ascii_1_like_ascii_2, + (ascii_2 LIKE ascii_1) AS ascii_2_like_ascii_1, + (unicode_1 LIKE ascii_2) AS unicode_1_like_ascii_2, + (unicode_2 LIKE ascii_2) AS unicode_2_like_ascii_2 +FROM test_basic_operator +---- +% \% (empty) (empty) true true false false +(empty) % (empty) (empty) true false true true +(empty) %% (empty) (empty) true false true true +Andrew X datafusion📊🔥 🔥 false false false false +NULL % NULL NULL NULL NULL NULL NULL +NULL R NULL 🔥 NULL NULL NULL false +Raphael R datafusionДатаФусион аФус false false false false +Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 true true false false +_ \_ (empty) (empty) true false false false +percent p%t pan Tadeusz ma iść w kąt Pan Tadeusz ma frunąć stąd w kąt true false true false +under_score un_____core un iść core chrząszcz na łące w 東京都 true false true false + +# -------------------------------------- +# dynamic ILIKE as filter +# -------------------------------------- + +query TTT rowsort +SELECT ascii_1, 'is ILIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 ILIKE ascii_2 +UNION ALL +SELECT ascii_1, 'is NOT ILIKE', ascii_2 FROM test_basic_operator WHERE ascii_1 NOT ILIKE ascii_2 +UNION ALL +SELECT unicode_1, 'is ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 ILIKE ascii_2 +UNION ALL +SELECT unicode_1, 'is NOT ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_1 NOT ILIKE ascii_2 +UNION ALL +SELECT unicode_2, 'is ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 ILIKE ascii_2 +UNION ALL +SELECT unicode_2, 'is NOT ILIKE', ascii_2 FROM test_basic_operator WHERE unicode_2 NOT ILIKE ascii_2 +---- +% is ILIKE \% +(empty) is ILIKE % +(empty) is ILIKE % +(empty) is ILIKE % +(empty) is ILIKE %% +(empty) is ILIKE %% +(empty) is ILIKE %% +(empty) is NOT ILIKE \% +(empty) is NOT ILIKE \% +(empty) is NOT ILIKE \_ +(empty) is NOT ILIKE \_ +Andrew is NOT ILIKE X +Pan Tadeusz ma frunąć stąd w kąt is ILIKE p%t +Raphael is NOT ILIKE R +Xiangpeng is ILIKE Xiangpeng +_ is ILIKE \_ +chrząszcz na łące w 東京都 is NOT ILIKE un_____core +datafusionДатаФусион is NOT ILIKE R +datafusion数据融合 is NOT ILIKE Xiangpeng +datafusion数据融合 is NOT ILIKE Xiangpeng +datafusion📊🔥 is NOT ILIKE X +pan Tadeusz ma iść w kąt is ILIKE p%t +percent is ILIKE p%t +un iść core is ILIKE un_____core +under_score is ILIKE un_____core +аФус is NOT ILIKE R +🔥 is NOT ILIKE R +🔥 is NOT ILIKE X + +# -------------------------------------- +# dynamic ILIKE as projection +# -------------------------------------- +query TTTTBBBB rowsort +SELECT + ascii_1, ascii_2, unicode_1, unicode_2, + (ascii_1 ILIKE ascii_2) AS ascii_1_ilike_ascii_2, + (ascii_2 ILIKE ascii_1) AS ascii_2_ilike_ascii_1, + (unicode_1 ILIKE ascii_2) AS unicode_1_ilike_ascii_2, + (unicode_2 ILIKE ascii_2) AS unicode_2_ilike_ascii_2 +FROM test_basic_operator +---- +% \% (empty) (empty) true true false false +(empty) % (empty) (empty) true false true true +(empty) %% (empty) (empty) true false true true +Andrew X datafusion📊🔥 🔥 false false false false +NULL % NULL NULL NULL NULL NULL NULL +NULL R NULL 🔥 NULL NULL NULL false +Raphael R datafusionДатаФусион аФус false false false false +Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 true true false false +_ \_ (empty) (empty) true false false false +percent p%t pan Tadeusz ma iść w kąt Pan Tadeusz ma frunąć stąd w kąt true false true true +under_score un_____core un iść core chrząszcz na łące w 東京都 true false true false + + # -------------------------------------- # substr function # -------------------------------------- From 8d8da1a2c951859ca8de30ad8484dbae3b7fcb06 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 Jan 2025 20:04:31 -0500 Subject: [PATCH 2/4] update test --- datafusion/sqllogictest/test_files/scalar.slt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 62ad70160371..7ee9b842f1f4 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1690,12 +1690,8 @@ statement ok drop table t1 # can't use like with non stirngs -query error +query error There isn't a common type to coerce Utf8 and Int64 in LIKE expression select column1 like 1 from (values('a'), ('b'), (NULL)) as t; ----- -DataFusion error: type_coercion -caused by -Error during planning: There isn't a common type to coerce Utf8 and Int64 in LIKE expression # like nlike with null lt From 58ad6323bb827eb1229f8e1673d148f26a75b478 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 3 Feb 2025 07:13:54 -0500 Subject: [PATCH 3/4] fix typo --- datafusion/sqllogictest/test_files/scalar.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 7ee9b842f1f4..0add75c7c152 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1689,7 +1689,7 @@ true true false false true true statement ok drop table t1 -# can't use like with non stirngs +# can't use like with non strings query error There isn't a common type to coerce Utf8 and Int64 in LIKE expression select column1 like 1 from (values('a'), ('b'), (NULL)) as t; From f138f606ed8da70c3452f8b7a034ed76076dbd45 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 3 Feb 2025 07:22:59 -0500 Subject: [PATCH 4/4] Add literal tests for like --- .../test_files/string/string_literal.slt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 738cb7f27054..4bcc57f1b88a 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -839,6 +839,28 @@ SELECT ---- NULL true true +# Literals with different arrow types +query BBBB +select + arrow_cast('foobar', 'Utf8') LIKE arrow_cast('foo%', 'Utf8'), + arrow_cast('foobar', 'LargeUtf8') LIKE arrow_cast('foo%', 'LargeUtf8'), + arrow_cast('foobar', 'Utf8View') LIKE arrow_cast('foo%', 'Utf8View'), + arrow_cast('foobar', 'Dictionary(Int32, Utf8)') LIKE arrow_cast('foo%', 'Dictionary(Int32, Utf8)') +---- +true true true true + +# Literal with UTF8 string and different arrow types for pattern +query BBBB +select + 'foobar' LIKE arrow_cast('foo%', 'Utf8'), + 'foobar' LIKE arrow_cast('foo%', 'LargeUtf8'), + 'foobar' LIKE arrow_cast('foo%', 'Utf8View'), + 'foobar' LIKE arrow_cast('foo%', 'Dictionary(Int32, Utf8)') +---- +true true true true + +# Escapes + # \ is an implicit escape character query BBBB SELECT