From bfa990daaf9f24f434f4e6a41d537eb63df7beaf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 18 Oct 2022 12:32:49 -0700 Subject: [PATCH 01/25] Uncomment tests in test_compatibility.py --- tests/integration/test_compatibility.py | 258 ++++++++++++------------ 1 file changed, 124 insertions(+), 134 deletions(-) diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index d93fbbac0..de29108c2 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -594,72 +594,71 @@ def test_window_row_number_partition_by(): ) -# TODO: Except not implemented so far -# def test_window_ranks(): -# a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT *, -# RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, -# DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, -# PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_ranks_partition_by(): -# a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT *, -# RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, -# DENSE_RANK() OVER -# (PARTITION BY a ORDER BY a ASC, b DESC NULLS LAST, c DESC) -# AS a2, -# PERCENT_RANK() OVER -# (PARTITION BY a ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_lead_lag(): -# a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT -# LEAD(b,1) OVER (ORDER BY a) AS a1, -# LEAD(b,2,10) OVER (ORDER BY a) AS a2, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY a) AS a3, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, - -# LAG(b,1) OVER (ORDER BY a) AS b1, -# LAG(b,2,10) OVER (ORDER BY a) AS b2, -# LAG(b,1) OVER (PARTITION BY c 
ORDER BY a) AS b3, -# LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 -# FROM a -# """, -# a=a, -# ) -# TODO: Except not implemented so far -# def test_window_lead_lag_partition_by(): -# a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT -# LEAD(b,1,10) OVER (PARTITION BY c ORDER BY a) AS a3, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, - -# LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, -# LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 -# FROM a -# """, -# a=a, -# ) +def test_window_ranks(): + a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT *, + RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, + DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, + PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 + FROM a + """, + a=a, + ) + + +def test_window_ranks_partition_by(): + a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT *, + RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, + DENSE_RANK() OVER + (PARTITION BY a ORDER BY a ASC, b DESC NULLS LAST, c DESC) + AS a2, + PERCENT_RANK() OVER + (PARTITION BY a ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 + FROM a + """, + a=a, + ) + + +def test_window_lead_lag(): + a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT + LEAD(b,1) OVER (ORDER BY a) AS a1, + LEAD(b,2,10) OVER (ORDER BY a) AS a2, + LEAD(b,1) OVER (PARTITION BY c ORDER BY a) AS a3, + LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, + + LAG(b,1) OVER (ORDER BY a) AS b1, + LAG(b,2,10) OVER (ORDER BY a) AS b2, + LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, + LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 + FROM a + """, + a=a, + ) + + +def test_window_lead_lag_partition_by(): + a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + 
SELECT + LEAD(b,1,10) OVER (PARTITION BY c ORDER BY a) AS a3, + LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, + + LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, + LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 + FROM a + """, + a=a, + ) def test_window_sum_avg(): @@ -833,19 +832,17 @@ def test_window_count(): ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS a5, {func}(b+a) OVER (PARTITION BY b ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - AS a6 - - -- No support for rolling on string types - -- {func}(c) OVER () AS b1, - -- {func}(c) OVER (PARTITION BY c) AS b2, - -- {func}(c) OVER (PARTITION BY c,b) AS b3, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, - -- {func}(c) OVER (PARTITION BY b ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - -- AS b6 + AS a6, + {func}(c) OVER () AS b1, + {func}(c) OVER (PARTITION BY c) AS b2, + {func}(c) OVER (PARTITION BY c,b) AS b3, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, + {func}(c) OVER (PARTITION BY b ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + AS b6 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -858,13 +855,11 @@ def test_window_count(): {func}(b) OVER (ORDER BY a DESC ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a6, {func}(b) OVER (PARTITION BY c ORDER BY a DESC - ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9 --, - - -- No support for rolling on string types - -- {func}(c) OVER (ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b6, - -- {func}(c) OVER (PARTITION BY c ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 + ROWS 
BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9, + {func}(c) OVER (ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b6, + {func}(c) OVER (PARTITION BY c ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -886,18 +881,16 @@ def test_window_count_partition_by(): ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS a5, {func}(b+a) OVER (PARTITION BY b ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - AS a6 --, - - -- No support for rolling on string types - -- {func}(c) OVER (PARTITION BY c) AS b2, - -- {func}(c) OVER (PARTITION BY c,b) AS b3, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, - -- {func}(c) OVER (PARTITION BY b ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - -- AS b6 + AS a6, + {func}(c) OVER (PARTITION BY c) AS b2, + {func}(c) OVER (PARTITION BY c,b) AS b3, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, + {func}(c) OVER (PARTITION BY b ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + AS b6 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -908,11 +901,9 @@ def test_window_count_partition_by(): f""" SELECT a,b, {func}(b) OVER (PARTITION BY c ORDER BY a DESC - ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9 --, - - -- No support for rolling on string types - -- {func}(c) OVER (PARTITION BY c ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9, + {func}(c) OVER (PARTITION BY c ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST 
""", @@ -963,37 +954,36 @@ def test_union(): ) -# TODO: Except not implemented so far -# def test_except(): -# a = make_rand_df(30, b=(int, 10), c=(str, 10)) -# b = make_rand_df(80, b=(int, 50), c=(str, 50)) -# c = make_rand_df(100, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT * FROM c -# EXCEPT SELECT * FROM b -# EXCEPT SELECT * FROM c -# """, -# a=a, -# b=b, -# c=c, -# ) - -# TODO: Intersect not implemented so far -# def test_intersect(): -# a = make_rand_df(30, b=(int, 10), c=(str, 10)) -# b = make_rand_df(80, b=(int, 50), c=(str, 50)) -# c = make_rand_df(100, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT * FROM c -# INTERSECT SELECT * FROM b -# INTERSECT SELECT * FROM c -# """, -# a=a, -# b=b, -# c=c, -# ) +def test_except(): + a = make_rand_df(30, b=(int, 10), c=(str, 10)) + b = make_rand_df(80, b=(int, 50), c=(str, 50)) + c = make_rand_df(100, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT * FROM c + EXCEPT SELECT * FROM b + EXCEPT SELECT * FROM c + """, + a=a, + b=b, + c=c, + ) + + +def test_intersect(): + a = make_rand_df(30, b=(int, 10), c=(str, 10)) + b = make_rand_df(80, b=(int, 50), c=(str, 50)) + c = make_rand_df(100, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT * FROM c + INTERSECT SELECT * FROM b + INTERSECT SELECT * FROM c + """, + a=a, + b=b, + c=c, + ) def test_with(): From 4c3c9ad7c1aa2962c5c9eec7d94b32d9c1d8c6b1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 18 Oct 2022 13:11:10 -0700 Subject: [PATCH 02/25] Uncomments tests in test_explain.py --- tests/integration/test_explain.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_explain.py b/tests/integration/test_explain.py index 2a1793612..e0a1d85bc 100644 --- a/tests/integration/test_explain.py +++ b/tests/integration/test_explain.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from dask_planner.rust import 
DaskStatistics + @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_sql_query_explain(c, gpu): @@ -12,15 +14,6 @@ def test_sql_query_explain(c, gpu): assert sql_string.startswith("Projection: df.a\n") - # TODO: Need to add statistics to Rust optimizer before this can be uncommented. - # c.create_table("df", data_frame, statistics=Statistics(row_count=1337)) - - # sql_string = c.explain("SELECT * FROM df") - - # assert sql_string.startswith( - # "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " - # ) - sql_string = c.sql( "EXPLAIN SELECT MIN(a) AS a_min FROM other_df GROUP BY a", dataframes={"other_df": df}, @@ -28,3 +21,15 @@ def test_sql_query_explain(c, gpu): ) assert sql_string.startswith("Projection: MIN(other_df.a) AS a_min\n") assert "Aggregate: groupBy=[[other_df.a]], aggr=[[MIN(other_df.a)]]" in sql_string + + +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_statistics_explain(c, gpu): + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) + c.create_table("df", df, statistics=DaskStatistics(row_count=1337)) + + sql_string = c.explain("SELECT * FROM df") + + assert sql_string.startswith( + "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " + ) From 3c6f404828b320107fef150dbfbc91d21a33df85 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:03:52 -0700 Subject: [PATCH 03/25] xfail instead of skipping for failing independent cluster tests --- tests/integration/fixtures.py | 4 ++-- tests/integration/test_complex.py | 4 ++-- tests/integration/test_fugue.py | 6 +++--- tests/integration/test_model.py | 28 ++++++++++++++-------------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py 
index 8dec835c3..e0b478dbc 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -340,7 +340,7 @@ def client(): yield client -skip_if_external_scheduler = pytest.mark.skipif( - os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, +xfail_if_external_scheduler = pytest.mark.xfail( + condition=os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, reason="Can not run with external cluster", ) diff --git a/tests/integration/test_complex.py b/tests/integration/test_complex.py index fc79f0a11..209e383f5 100644 --- a/tests/integration/test_complex.py +++ b/tests/integration/test_complex.py @@ -1,9 +1,9 @@ from dask.datasets import timeseries -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_complex_query(c): df = timeseries(freq="1d").persist() c.create_table("timeseries", df) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index d846b5559..9c5eeab12 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -3,7 +3,7 @@ import pytest from dask_sql import Context -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq fugue_sql = pytest.importorskip("fugue_sql") @@ -13,7 +13,7 @@ from dask_sql.integrations.fugue import DaskSQLExecutionEngine, fsql_dask -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_simple_statement(client): with fugue_sql.FugueSQLWorkflow(DaskSQLExecutionEngine) as dag: df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") @@ -44,7 +44,7 @@ def test_simple_statement(client): # TODO: Revisit fixing this on an independant cluster (without dask-sql) based on the # discussion in https://github.com/dask-contrib/dask-sql/issues/407 -@skip_if_external_scheduler 
+@xfail_if_external_scheduler def test_fsql(client): def assert_fsql(df: pd.DataFrame) -> None: assert_eq(df, pd.DataFrame({"a": [1]})) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 3c1bd1a69..33e71cc54 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -6,7 +6,7 @@ import pytest from dask.datasets import timeseries -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq try: @@ -64,7 +64,7 @@ def gpu_training_df(c): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_training_and_prediction(c, training_df): c.sql( """ @@ -101,7 +101,7 @@ def test_cuml_training_and_prediction(c, gpu_training_df): @pytest.mark.gpu -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_dask_cuml_training_and_prediction(c, gpu_training_df, gpu_client): model_query = """ @@ -117,7 +117,7 @@ def test_dask_cuml_training_and_prediction(c, gpu_training_df, gpu_client): check_trained_model(c) -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.gpu def test_dask_xgboost_training_prediction(c, gpu_training_df, gpu_client): model_query = """ @@ -152,7 +152,7 @@ def test_xgboost_training_prediction(c, gpu_training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -170,7 +170,7 @@ def test_clustering_and_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@skip_if_external_scheduler +@xfail_if_external_scheduler def test_create_model_with_prediction(c, training_df): c.sql( """ @@ -205,7 +205,7 @@ def test_create_model_with_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_iterative_and_prediction(c, training_df): c.sql( """ @@ -226,7 +226,7 @@ def test_iterative_and_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_show_models(c, training_df): c.sql( """ @@ -458,7 +458,7 @@ def test_drop_model(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_describe_model(c, training_df): c.sql( """ @@ -558,7 +558,7 @@ def test_export_model(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -616,7 +616,7 @@ def test_mlflow_export(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_mlflow_export_xgboost(c, client, training_df, tmpdir): # Test only when mlflow & xgboost was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -680,7 +680,7 @@ def test_mlflow_export_lightgbm(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@skip_if_external_scheduler +@xfail_if_external_scheduler def test_ml_experiment(c, client, training_df): with pytest.raises( @@ -881,7 +881,7 @@ def test_ml_experiment(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_experiment_automl_classifier(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") # currently tested with tpot== @@ -906,7 +906,7 @@ def test_experiment_automl_classifier(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_experiment_automl_regressor(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") # test regressor From 32997cc6e40fcd89c241f9b955d663b8727ce647 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:18:38 -0700 Subject: [PATCH 04/25] Switch skips to xfails in test_groupby.py --- tests/integration/test_groupby.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py index b3007b066..7bb7d730f 100644 --- a/tests/integration/test_groupby.py +++ b/tests/integration/test_groupby.py @@ -154,7 +154,7 @@ def test_group_by_filtered(c): assert_eq(return_df, expected_df) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_group_by_case(c): return_df = c.sql( """ @@ -276,7 +276,21 @@ def test_aggregations(c): assert_eq(return_df.reset_index(drop=True), expected_df) -@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +@pytest.mark.parametrize( + "gpu", + [ + False, + pytest.param( + True, + marks=( + pytest.mark.gpu, + pytest.mark.xfail( + reason="stddev_pop is failing on GPU, 
see https://github.com/dask-contrib/dask-sql/issues/681" + ), + ), + ), + ], +) def test_stddev(c, gpu): df = pd.DataFrame( { @@ -312,11 +326,6 @@ def test_stddev(c, gpu): assert_eq(return_df, expected_df.reset_index(drop=True)) - # Can be removed after addressing: https://github.com/dask-contrib/dask-sql/issues/681 - if gpu: - c.drop_table("df") - pytest.skip() - return_df = c.sql( """ SELECT @@ -355,12 +364,9 @@ def test_stddev(c, gpu): @pytest.mark.parametrize( - "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.skip))] + "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.xfail))] ) def test_regr_aggregation(c, timeseries_df, gpu): - if gpu: - pytest.skip() - # test regr_count regr_count = c.sql( """ @@ -421,7 +427,7 @@ def test_regr_aggregation(c, timeseries_df, gpu): ) -@pytest.mark.skip( +@pytest.mark.xfail( reason="WIP DataFusion - https://github.com/dask-contrib/dask-sql/issues/753" ) def test_covar_aggregation(c, timeseries_df): From cebc475a819a3ced023e53a6c0f0b767c409c972 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:24:43 -0700 Subject: [PATCH 05/25] Uncomments tests in test_over.py --- tests/integration/test_over.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/integration/test_over.py b/tests/integration/test_over.py index 34119924b..be53817e9 100644 --- a/tests/integration/test_over.py +++ b/tests/integration/test_over.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from tests.utils import assert_eq @@ -114,6 +115,30 @@ def test_over_calls(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) +@pytest.mark.xfail( + reason="Need to add single_value window function, see https://github.com/dask-contrib/dask-sql/issues/651" +) +def test_over_single_value(c, user_table_1): + return_df = c.sql( + """ + SELECT + user_id, + b, + SINGLE_VALUE(user_id*10 - b) OVER 
(PARTITION BY user_id ORDER BY b) AS "O3", + FROM user_table_1 + """ + ) + expected_df = pd.DataFrame( + { + "user_id": user_table_1.user_id, + "b": user_table_1.b, + "O3": [19, 7, 19, 27], + } + ) + + assert_eq(return_df, expected_df, check_dtype=False, check_index=False) + + def test_over_with_windows(c): tmp_df = pd.DataFrame({"a": range(5)}) c.create_table("tmp", tmp_df) From 3395a022ef869c3e51a9f5fcf631c0d13e07ea0c Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:27:21 -0700 Subject: [PATCH 06/25] Switch skips to xfails in test_postgres.py --- tests/integration/test_postgres.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_postgres.py b/tests/integration/test_postgres.py index 05a249c77..afbb59a64 100644 --- a/tests/integration/test_postgres.py +++ b/tests/integration/test_postgres.py @@ -53,7 +53,7 @@ def engine(): network.remove() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_select(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -174,7 +174,7 @@ def test_limit(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_groupby(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -221,7 +221,7 @@ def test_filter(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_string_operations(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -261,7 +261,7 @@ def test_string_operations(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_statistical_functions(assert_query_gives_same_result): # test regr_count From 74bedf7bcd42087a32160cc5f5c1667e4413eb74 Mon Sep 17 00:00:00 2001 From: Charles 
Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:57:21 -0700 Subject: [PATCH 07/25] Uncomments tests in test_rex.py --- tests/integration/test_rex.py | 92 +++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 655ff69de..9cb807bb4 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -18,7 +18,7 @@ def test_year(c, datetime_table): assert result_df.compute().iloc[0][0] == 2014 -@pytest.mark.skip( +@pytest.mark.xfail( reason="WIP DataFusion - Enabling CBO generates yet to be implemented edge case" ) def test_case(c, df): @@ -455,23 +455,25 @@ def test_integer_div(c, df_simple): df = c.sql( """ SELECT - -- 1 / a AS a, + 1 / a AS a, a / 2 AS b, 1.0 / a AS c FROM df_simple """ ) - expected_df = pd.DataFrame(index=df_simple.index) - # expected_df["a"] = [1, 0, 0] # dtype returned by df for 1/a is float instead of int - # expected_df["a"] = expected_df["a"].astype("Int64") - expected_df["b"] = [0, 1, 1] - expected_df["b"] = expected_df["b"].astype("Int64") - expected_df["c"] = [1.0, 0.5, 0.333333] + expected_df = pd.DataFrame( + { + "a": (1 // df_simple.a).astype("Int64"), + "b": (df_simple.a // 2).astype("Int64"), + "c": 1 / df_simple.a, + } + ) + assert_eq(df, expected_df) -@pytest.mark.skip(reason="Subquery expressions not yet enabled") +@pytest.mark.xfail(reason="Subquery expressions not yet enabled") def test_subqueries(c, user_table_1, user_table_2): df = c.sql( """ @@ -564,7 +566,77 @@ def test_string_functions(c, gpu): ) -@pytest.mark.skip( +@pytest.mark.xfail(reason="POSITION syntax not supported by parser") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_string_position(c, gpu): + if gpu: + input_table = "gpu_string_table" + else: + input_table = "string_table" + + df = c.sql( + f""" + SELECT + POSITION('a' IN a FROM 4) AS f, + 
POSITION('ZL' IN a) AS g, + FROM + {input_table} + """ + ) + + if gpu: + df = df.astype({"f": "int64", "g": "int64"}) + + expected_df = pd.DataFrame( + { + "f": [7], + "g": [0], + } + ) + + assert_eq( + df.head(1), + expected_df, + ) + + +@pytest.mark.xfail(reason="OVERLAY syntax not supported by parser") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_string_overlay(c, gpu): + if gpu: + input_table = "gpu_string_table" + else: + input_table = "string_table" + + df = c.sql( + f""" + SELECT + OVERLAY(a PLACING 'XXX' FROM -1) AS l, + OVERLAY(a PLACING 'XXX' FROM 2 FOR 4) AS m, + OVERLAY(a PLACING 'XXX' FROM 2 FOR 1) AS n, + FROM + {input_table} + """ + ) + + if gpu: + df = df.astype({"c": "int64"}) # , "f": "int64", "g": "int64"}) + + expected_df = pd.DataFrame( + { + "l": ["XXXormal string"], + "m": ["aXXXmal string"], + "n": ["aXXXnormal string"], + } + ) + + assert_eq( + df.head(1), + expected_df, + ) + + +@pytest.mark.xfail( reason="TIMESTAMP add, ceil, floor for dt ops not supported by parser" ) def test_date_functions(c): From d57e94962a5317d6bc75cbdc76dcbd859b49529e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:58:15 -0700 Subject: [PATCH 08/25] Switch skips to xfails in test_schema.py --- tests/integration/test_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 8189e23a0..ecba00b48 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -6,7 +6,7 @@ from tests.utils import assert_eq -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_table_schema(c, df): original_df = c.sql("SELECT * FROM df") @@ -36,7 +36,7 @@ def test_table_schema(c, df): c.sql("SELECT * FROM foo.bar") -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP 
DataFusion") def test_function(c): c.sql("CREATE SCHEMA other") c.sql("USE SCHEMA root") From 9fb367be2084872b95bd8a29f0e7c24f6bd92df2 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:00:22 -0700 Subject: [PATCH 09/25] Switch skips to xfails in test_jdbc.py --- tests/integration/test_jdbc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_jdbc.py b/tests/integration/test_jdbc.py index 7997302f1..03dd6c515 100644 --- a/tests/integration/test_jdbc.py +++ b/tests/integration/test_jdbc.py @@ -43,7 +43,7 @@ def app_client(c): app.client.close() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_schema(app_client, c): create_meta_data(c) @@ -75,7 +75,7 @@ def test_jdbc_has_schema(app_client, c): ] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_table(app_client, c): create_meta_data(c) check_data(app_client) @@ -93,7 +93,7 @@ def test_jdbc_has_table(app_client, c): ] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_columns(app_client, c): create_meta_data(c) check_data(app_client) From fd50a462c91656d174d67d718aed8dd86bba62bf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:04:42 -0700 Subject: [PATCH 10/25] Switch skips to xfails in test_sample.py --- tests/integration/test_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_sample.py b/tests/integration/test_sample.py index 44e15b67c..1542ba941 100644 --- a/tests/integration/test_sample.py +++ b/tests/integration/test_sample.py @@ -21,7 +21,7 @@ def get_system_sample(df, fraction, seed): return df -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def 
test_sample(c, df): ddf = c.sql("SELECT * FROM df") From 2f5fb4efa7cdc6e75a915891f7311bc21261cd25 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:11:55 -0700 Subject: [PATCH 11/25] Switch skips to xfails in test_server.py --- tests/integration/test_server.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_server.py b/tests/integration/test_server.py index 29f635bee..003ff866b 100644 --- a/tests/integration/test_server.py +++ b/tests/integration/test_server.py @@ -26,7 +26,7 @@ def app_client(): app.client.close() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_routes(app_client): assert app_client.post("/v1/statement", data="SELECT 1 + 1").status_code == 200 assert app_client.get("/v1/statement", data="SELECT 1 + 1").status_code == 405 @@ -36,7 +36,7 @@ def test_routes(app_client): assert app_client.get("/v1/cancel/some-wrong-uuid").status_code == 405 -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query_cancel(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 @@ -50,7 +50,7 @@ def test_sql_query_cancel(app_client): assert response.status_code == 404 -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 @@ -72,7 +72,7 @@ def test_sql_query(app_client): assert result["data"] == [[2]] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_wrong_sql_query(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + ") assert response.status_code == 200 @@ -90,7 +90,7 @@ def test_wrong_sql_query(app_client): } -@pytest.mark.skip(reason="WIP 
DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_add_and_query(app_client, df, temporary_data_file): df.to_csv(temporary_data_file, index=False) @@ -133,7 +133,7 @@ def test_add_and_query(app_client, df, temporary_data_file): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_register_and_query(app_client, df): df["a"] = df["a"].astype("UInt8") app_client.app.c.create_table("new_table", df) @@ -162,7 +162,7 @@ def test_register_and_query(app_client, df): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_inf_table(app_client, user_table_inf): app_client.app.c.create_table("new_table", user_table_inf) @@ -186,7 +186,7 @@ def test_inf_table(app_client, user_table_inf): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def get_result_or_error(app_client, response): result = response.json() From 2177378d4b296aa773657bfbc4497dad5606593f Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:14:53 -0700 Subject: [PATCH 12/25] Uncomments tests in test_context.py --- tests/unit/test_context.py | 66 +++++++++----------------------------- 1 file changed, 16 insertions(+), 50 deletions(-) diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py index d10f7012b..a1dc97b81 100644 --- a/tests/unit/test_context.py +++ b/tests/unit/test_context.py @@ -54,37 +54,6 @@ def test_deprecation_warning(gpu): assert "table" not in c.schema[c.schema_name].tables -@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) -def test_explain(gpu): - c = Context() - - data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - c.create_table("df", data_frame, gpu=gpu) - - sql_string = c.explain("SELECT * FROM df") - - assert 
sql_string.startswith("Projection: df.a\n") - - # TODO: Need to add statistics to Rust optimizer before this can be uncommented. - # c.create_table("df", data_frame, statistics=Statistics(row_count=1337)) - - # sql_string = c.explain("SELECT * FROM df") - - # assert sql_string.startswith( - # "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " - # ) - - c = Context() - - data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - - sql_string = c.explain( - "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu - ) - - assert sql_string.startswith("Projection: other_df.a\n") - - @pytest.mark.parametrize( "gpu", [ @@ -116,7 +85,6 @@ def test_sql(gpu): assert_eq(result, data_frame) -@pytest.mark.skip(reason="WIP DataFusion - missing create statement logic") @pytest.mark.parametrize( "gpu", [ @@ -391,28 +359,26 @@ def test_aggregation_adding(): assert c.schema[c.schema_name].function_lists[1].aggregation -# TODO: Alter schema is not yet implemented -# def test_alter_schema(c): -# c.create_schema("test_schema") -# c.sql("ALTER SCHEMA test_schema RENAME TO prod_schema") -# assert "prod_schema" in c.schema +def test_alter_schema(c): + c.create_schema("test_schema") + c.sql("ALTER SCHEMA test_schema RENAME TO prod_schema") + assert "prod_schema" in c.schema -# with pytest.raises(KeyError): -# c.sql("ALTER SCHEMA MARVEL RENAME TO DC") + with pytest.raises(KeyError): + c.sql("ALTER SCHEMA MARVEL RENAME TO DC") -# del c.schema["prod_schema"] + del c.schema["prod_schema"] -# TODO: Alter table is not yet implemented -# def test_alter_table(c, df_simple): -# c.create_table("maths", df_simple) -# c.sql("ALTER TABLE maths RENAME TO physics") -# assert "physics" in c.schema[c.schema_name].tables +def test_alter_table(c, df_simple): + c.create_table("maths", df_simple) + c.sql("ALTER TABLE maths RENAME TO physics") + assert "physics" in c.schema[c.schema_name].tables -# with 
pytest.raises(KeyError): -# c.sql("ALTER TABLE four_legs RENAME TO two_legs") + with pytest.raises(KeyError): + c.sql("ALTER TABLE four_legs RENAME TO two_legs") -# c.sql("ALTER TABLE IF EXISTS alien RENAME TO humans") + c.sql("ALTER TABLE IF EXISTS alien RENAME TO humans") -# print(c.schema[c.schema_name].tables) -# del c.schema[c.schema_name].tables["physics"] + print(c.schema[c.schema_name].tables) + del c.schema[c.schema_name].tables["physics"] From f0bf4d8aadae0fb84edc4ac95ffcda420e156b11 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 20 Oct 2022 08:09:41 -0700 Subject: [PATCH 13/25] Un-xfail passing tests --- tests/integration/test_complex.py | 3 --- tests/integration/test_fugue.py | 1 - tests/integration/test_model.py | 2 -- tests/integration/test_rex.py | 3 --- tests/integration/test_server.py | 2 -- 5 files changed, 11 deletions(-) diff --git a/tests/integration/test_complex.py b/tests/integration/test_complex.py index 209e383f5..06196519a 100644 --- a/tests/integration/test_complex.py +++ b/tests/integration/test_complex.py @@ -1,9 +1,6 @@ from dask.datasets import timeseries -from tests.integration.fixtures import xfail_if_external_scheduler - -@xfail_if_external_scheduler def test_complex_query(c): df = timeseries(freq="1d").persist() c.create_table("timeseries", df) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index 9c5eeab12..ce5b4bdf5 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -13,7 +13,6 @@ from dask_sql.integrations.fugue import DaskSQLExecutionEngine, fsql_dask -@xfail_if_external_scheduler def test_simple_statement(client): with fugue_sql.FugueSQLWorkflow(DaskSQLExecutionEngine) as dag: df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 33e71cc54..a73291603 100644 --- a/tests/integration/test_model.py +++ 
b/tests/integration/test_model.py @@ -557,8 +557,6 @@ def test_export_model(c, training_df, tmpdir): ) -# TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 9cb807bb4..27836e202 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -18,9 +18,6 @@ def test_year(c, datetime_table): assert result_df.compute().iloc[0][0] == 2014 -@pytest.mark.xfail( - reason="WIP DataFusion - Enabling CBO generates yet to be implemented edge case" -) def test_case(c, df): result_df = c.sql( """ diff --git a/tests/integration/test_server.py b/tests/integration/test_server.py index 003ff866b..c28e6e456 100644 --- a/tests/integration/test_server.py +++ b/tests/integration/test_server.py @@ -26,7 +26,6 @@ def app_client(): app.client.close() -@pytest.mark.xfail(reason="WIP DataFusion") def test_routes(app_client): assert app_client.post("/v1/statement", data="SELECT 1 + 1").status_code == 200 assert app_client.get("/v1/statement", data="SELECT 1 + 1").status_code == 405 @@ -36,7 +35,6 @@ def test_routes(app_client): assert app_client.get("/v1/cancel/some-wrong-uuid").status_code == 405 -@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query_cancel(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 From 2cfdbf7976f149364b234644188e8b904291a822 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 24 Oct 2022 06:50:48 -0700 Subject: [PATCH 14/25] Add xfails to some broken tests --- tests/integration/test_compatibility.py | 20 +++++++++++++++++++- tests/integration/test_explain.py | 5 +++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git 
a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index de29108c2..a19206fd0 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -594,13 +594,16 @@ def test_window_row_number_partition_by(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_ranks(): a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) eq_sqlite( """ SELECT *, RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, - DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2 + DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 FROM a """, @@ -608,6 +611,9 @@ def test_window_ranks(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_ranks_partition_by(): a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) eq_sqlite( @@ -625,6 +631,9 @@ def test_window_ranks_partition_by(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_lead_lag(): a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) eq_sqlite( @@ -645,6 +654,9 @@ def test_window_lead_lag(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_lead_lag_partition_by(): a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) eq_sqlite( @@ -954,6 +966,9 @@ def test_union(): ) +@pytest.mark.xfail( + reason="'ANTI' joins not supported yet, see https://github.com/dask-contrib/dask-sql/issues/879" +) def test_except(): a = make_rand_df(30, b=(int, 10), c=(str, 10)) b = make_rand_df(80, b=(int, 50), c=(str, 50)) @@ -970,6 +985,9 @@ def 
test_except(): ) +@pytest.mark.xfail( + reason="INTERSECT is not compliant with SQLite, see https://github.com/dask-contrib/dask-sql/issues/880" +) def test_intersect(): a = make_rand_df(30, b=(int, 10), c=(str, 10)) b = make_rand_df(80, b=(int, 50), c=(str, 50)) diff --git a/tests/integration/test_explain.py b/tests/integration/test_explain.py index e0a1d85bc..8fb539ac9 100644 --- a/tests/integration/test_explain.py +++ b/tests/integration/test_explain.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from dask_planner.rust import DaskStatistics +from dask_sql import Statistics @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) @@ -23,10 +23,11 @@ def test_sql_query_explain(c, gpu): assert "Aggregate: groupBy=[[other_df.a]], aggr=[[MIN(other_df.a)]]" in sql_string +@pytest.mark.xfail(reason="Need to add statistics to Rust optimizer") @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_statistics_explain(c, gpu): df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - c.create_table("df", df, statistics=DaskStatistics(row_count=1337)) + c.create_table("df", df, statistics=Statistics(row_count=1337), gpu=gpu) sql_string = c.explain("SELECT * FROM df") From 86a72a5b1d848fa9eda6354c37076c6dc63a142b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 24 Oct 2022 08:18:16 -0700 Subject: [PATCH 15/25] xfail alter table/schema tests --- tests/unit/test_context.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py index a1dc97b81..8665ac9ef 100644 --- a/tests/unit/test_context.py +++ b/tests/unit/test_context.py @@ -359,6 +359,9 @@ def test_aggregation_adding(): assert c.schema[c.schema_name].function_lists[1].aggregation +@pytest.mark.xfail( + reason="Need to add back support for alter table/schema, see https://github.com/dask-contrib/dask-sql/issues/884" +) def 
test_alter_schema(c): c.create_schema("test_schema") c.sql("ALTER SCHEMA test_schema RENAME TO prod_schema") @@ -370,6 +373,9 @@ def test_alter_schema(c): del c.schema["prod_schema"] +@pytest.mark.xfail( + reason="Need to add back support for alter table/schema, see https://github.com/dask-contrib/dask-sql/issues/884" +) def test_alter_table(c, df_simple): c.create_table("maths", df_simple) c.sql("ALTER TABLE maths RENAME TO physics") From 1f74895b1e5b0b5c4f4b335e11b0ba203ea42425 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 24 Oct 2022 11:47:44 -0700 Subject: [PATCH 16/25] Un-xfail test_regr_aggregation on GPU --- tests/integration/test_groupby.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py index 7bb7d730f..ed0ac40ba 100644 --- a/tests/integration/test_groupby.py +++ b/tests/integration/test_groupby.py @@ -363,9 +363,7 @@ def test_stddev(c, gpu): c.drop_table("df") -@pytest.mark.parametrize( - "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.xfail))] -) +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_regr_aggregation(c, timeseries_df, gpu): # test regr_count regr_count = c.sql( From 890a4170869268a52f21d8c26895693d7a7b1240 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 26 Oct 2022 08:49:00 -0700 Subject: [PATCH 17/25] Fix external scheduler xfail --- tests/integration/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 292bc82f9..a3e0d74b3 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -935,7 +935,7 @@ def test_experiment_automl_regressor(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@skip_if_external_scheduler +@xfail_if_external_scheduler def test_predict_with_nullable_types(c): df = pd.DataFrame( { From d9ecc75cd50772db6638831e53ac0f7098ee2d7f Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 13 Dec 2022 12:36:31 -0800 Subject: [PATCH 18/25] Replace skip_if_external_scheduler in ML wrapper tests --- tests/unit/test_ml_wrappers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_ml_wrappers.py b/tests/unit/test_ml_wrappers.py index 0f02707b0..0673aaae4 100644 --- a/tests/unit/test_ml_wrappers.py +++ b/tests/unit/test_ml_wrappers.py @@ -17,7 +17,7 @@ from dask_sql.physical.rel.custom.wrappers import Incremental, ParallelPostFit -from ..integration.fixtures import skip_if_external_scheduler +from ..integration.fixtures import xfail_if_external_scheduler def _check_axis_partitioning(chunks, n_features): @@ -125,7 +125,7 @@ def assert_estimator_equal(left, right, exclude=None, **kwargs): _assert_eq(l, r, name=attr, **kwargs) -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_parallelpostfit_basic(): clf = ParallelPostFit(GradientBoostingClassifier()) @@ -197,7 +197,7 @@ def test_transform(kind): assert_eq_ar(result, expected) -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.parametrize("dataframes", [False, True]) def test_incremental_basic(dataframes): # Create observations that we know linear models can recover From 4ff69ae518d75bf357e779de721541cf33413db9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 13 Dec 2022 13:36:30 -0800 Subject: [PATCH 19/25] unxfail test_date_functions --- tests/integration/test_rex.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index f5cf5876a..8b3fe2855 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -696,9 
+696,6 @@ def test_string_overlay(c, gpu): ) -@pytest.mark.xfail( - reason="TIMESTAMP add, ceil, floor for dt ops not supported by parser" -) def test_date_functions(c): date = datetime(2021, 10, 3, 15, 53, 42, 47) From 00353836811a4d7d2abb1e8aa4bc2a60fd8421f0 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 14 Dec 2022 06:55:04 -0800 Subject: [PATCH 20/25] Set max constraint for fastapi --- continuous_integration/environment-3.10-dev.yaml | 3 ++- continuous_integration/environment-3.9-dev.yaml | 3 ++- continuous_integration/gpuci/environment.yaml | 3 ++- continuous_integration/recipe/meta.yaml | 3 ++- docker/conda.txt | 3 ++- docker/main.dockerfile | 3 ++- docs/environment.yml | 3 ++- docs/requirements-docs.txt | 3 ++- setup.py | 3 ++- 9 files changed, 18 insertions(+), 9 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index d5ffa777a..8ff56d413 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -4,7 +4,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.0 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 4c5c23511..d6e1179c1 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -4,7 +4,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.0 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/gpuci/environment.yaml b/continuous_integration/gpuci/environment.yaml index 7997ec7c4..866f41642 100644 --- 
a/continuous_integration/gpuci/environment.yaml +++ b/continuous_integration/gpuci/environment.yaml @@ -7,7 +7,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.0 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index ab5274076..efc59f3a0 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,8 @@ requirements: - python - dask >=2022.3.0 - pandas >=1.4.0 - - fastapi >=0.69.0 + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + - fastapi >=0.69.0,<0.87.0 - uvicorn >=0.13.4 - tzlocal >=2.1 - prompt-toolkit >=3.0.8 diff --git a/docker/conda.txt b/docker/conda.txt index 3d57e18dc..99d16d6fd 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -10,7 +10,8 @@ pytest-xdist mock>=4.0.3 sphinx>=3.2.1 tzlocal>=2.1 -fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +fastapi>=0.69.0,<0.87.0 nest-asyncio>=1.4.3 uvicorn>=0.13.4 pyarrow>=6.0.1 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index 8c908ce19..19e7c324c 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -18,7 +18,8 @@ RUN mamba install -y \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ - "fastapi>=0.69.0" \ + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0" \ "uvicorn>=0.13.4" \ "tzlocal>=2.1" \ "prompt_toolkit>=3.0.8" \ diff --git a/docs/environment.yml b/docs/environment.yml index 5d562c532..ef7c92be0 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -11,7 +11,8 @@ dependencies: - pandas>=1.4.0 - fugue>=0.7.0 - jpype1>=1.0.2 - - fastapi>=0.69.0 + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + - fastapi>=0.69.0,<0.87.0 - uvicorn>=0.13.4 - tzlocal>=2.1 - prompt_toolkit>=3.0.8 diff --git 
a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 439516478..e21fd5846 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -4,7 +4,8 @@ dask-sphinx-theme>=3.0.0 dask>=2022.3.0 pandas>=1.4.0 fugue>=0.7.0 -fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +fastapi>=0.69.0,<0.87.0 uvicorn>=0.13.4 tzlocal>=2.1 prompt_toolkit>=3.0.8 diff --git a/setup.py b/setup.py index 0f8520de9..a370549e8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,8 @@ install_requires=[ "dask[dataframe,distributed]>=2022.3.0", "pandas>=1.4.0", - "fastapi>=0.69.0", + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0", "uvicorn>=0.13.4", "tzlocal>=2.1", "prompt_toolkit>=3.0.8", From 37dc53b29d8b4adb5e06be063e23394ea6d4f4a1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 14 Dec 2022 12:04:02 -0800 Subject: [PATCH 21/25] Skip test_iterative_and_prediction on cluster --- tests/integration/test_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 0853e36ba..9feb7e5f1 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -149,8 +149,6 @@ def test_xgboost_training_prediction(c, gpu_training_df): check_trained_model(c) -# TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -220,7 +218,11 @@ def test_create_model_with_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@xfail_if_external_scheduler +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) def test_iterative_and_prediction(c, training_df): c.sql( """ From cc105003647a8f0dfdba06a19116ff81fc1044fe Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 5 Jan 2023 07:02:22 -0800 Subject: [PATCH 22/25] Fix style errors --- tests/integration/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 42f581f6d..c3c7ee874 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -512,7 +512,7 @@ def test_describe_model(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_export_model(c, training_df, tmpdir): with pytest.raises(RuntimeError): c.sql( From af7830d0ec4b04e9bad3990baa18d4717f81d811 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 5 Jan 2023 12:14:14 -0800 Subject: [PATCH 23/25] Remove uses of skip_if_external_scheduler --- tests/integration/test_rex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 53e4e47be..5f006b76f 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq @@ -139,7 +139,7 @@ def test_literal_null(c): # TODO - https://github.com/dask-contrib/dask-sql/issues/978 -@skip_if_external_scheduler +@xfail_if_external_scheduler 
def test_random(c): query_with_seed = """ SELECT From 8783b8451db5ec4eeea01ae72537ad3eb8bd3c86 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 9 Jan 2023 08:52:28 -0800 Subject: [PATCH 24/25] Resolve remaining cluster tests --- tests/integration/fixtures.py | 2 +- tests/integration/test_model.py | 4 ++++ tests/integration/test_rex.py | 3 --- tests/unit/test_ml_wrappers.py | 17 +++++++++++++---- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index e0b478dbc..84869cc9c 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -332,7 +332,7 @@ def gpu_client(gpu_cluster): # client for all computations. otherwise, only connect to a client # when specified. @pytest.fixture( - scope="function" if SCHEDULER_ADDR is None else "session", + scope="function", autouse=False if SCHEDULER_ADDR is None else True, ) def client(): diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index c3c7ee874..1252666e5 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -149,6 +149,8 @@ def test_xgboost_training_prediction(c, gpu_training_df): check_trained_model(c) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -576,6 +578,8 @@ def test_export_model(c, training_df, tmpdir): ) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
+@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 5f006b76f..8b3fe2855 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,7 +5,6 @@ import pandas as pd import pytest -from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq @@ -138,8 +137,6 @@ def test_literal_null(c): assert_eq(df, expected_df) -# TODO - https://github.com/dask-contrib/dask-sql/issues/978 -@xfail_if_external_scheduler def test_random(c): query_with_seed = """ SELECT diff --git a/tests/unit/test_ml_wrappers.py b/tests/unit/test_ml_wrappers.py index 0673aaae4..4c8b65b2f 100644 --- a/tests/unit/test_ml_wrappers.py +++ b/tests/unit/test_ml_wrappers.py @@ -1,5 +1,6 @@ # Copyright 2017, Dask developers # Dask-ML project - https://github.com/dask/dask-ml +import os from collections.abc import Sequence import dask @@ -17,8 +18,6 @@ from dask_sql.physical.rel.custom.wrappers import Incremental, ParallelPostFit -from ..integration.fixtures import xfail_if_external_scheduler - def _check_axis_partitioning(chunks, n_features): c = chunks[1][0] @@ -125,7 +124,12 @@ def assert_estimator_equal(left, right, exclude=None, **kwargs): _assert_eq(l, r, name=attr, **kwargs) -@xfail_if_external_scheduler +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
+# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) def test_parallelpostfit_basic(): clf = ParallelPostFit(GradientBoostingClassifier()) @@ -197,7 +201,12 @@ def test_transform(kind): assert_eq_ar(result, expected) -@xfail_if_external_scheduler +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) @pytest.mark.parametrize("dataframes", [False, True]) def test_incremental_basic(dataframes): # Create observations that we know linear models can recover From 96434b0fbd48a8e13374fa92060c9a2dcba4a387 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 9 Jan 2023 18:02:49 -0800 Subject: [PATCH 25/25] un-xfail fugue tests --- tests/integration/test_fugue.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index 48506f42b..d99aa950b 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -11,7 +11,6 @@ from dask_sql.integrations.fugue import fsql_dask # noqa: E402 -@xfail_if_external_scheduler def test_fugue_workflow(client): dag = fugue_sql.FugueSQLWorkflow() df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") @@ -26,7 +25,6 @@ def test_fugue_workflow(client): assert_eq(return_df, pd.DataFrame({"a": [1], "b": ["world"]})) -@xfail_if_external_scheduler def test_fugue_fsql(client): pdf = pd.DataFrame([[0, "hello"], [1, "world"]], columns=["a", "b"]) dag = fugue_sql.fsql(