diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index 33e794664..ab5b80fa3 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -4,7 +4,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.3 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 607dd0acf..d4391e782 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -4,7 +4,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.3 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/gpuci/environment.yaml b/continuous_integration/gpuci/environment.yaml index b2fd5fdb8..fba4d68a6 100644 --- a/continuous_integration/gpuci/environment.yaml +++ b/continuous_integration/gpuci/environment.yaml @@ -7,7 +7,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.3 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index ab5274076..efc59f3a0 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,8 @@ requirements: - python - dask >=2022.3.0 - pandas >=1.4.0 - - fastapi >=0.69.0 + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + - fastapi >=0.69.0,<0.87.0 - uvicorn >=0.13.4 - tzlocal >=2.1 - prompt-toolkit >=3.0.8 diff --git a/docker/conda.txt b/docker/conda.txt index 3d57e18dc..99d16d6fd 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -10,7 +10,8 @@ pytest-xdist mock>=4.0.3 sphinx>=3.2.1 tzlocal>=2.1 -fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +fastapi>=0.69.0,<0.87.0 nest-asyncio>=1.4.3 uvicorn>=0.13.4 pyarrow>=6.0.1 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index 8c908ce19..19e7c324c 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -18,7 +18,8 @@ RUN mamba install -y \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ - "fastapi>=0.69.0" \ + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0" \ "uvicorn>=0.13.4" \ "tzlocal>=2.1" \ "prompt_toolkit>=3.0.8" \ diff --git a/docs/environment.yml b/docs/environment.yml index f43b8a26c..6d3ff95cc 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -11,7 +11,8 @@ dependencies: - pandas>=1.4.0 - fugue>=0.7.3 - jpype1>=1.0.2 - - fastapi>=0.69.0 + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + - fastapi>=0.69.0,<0.87.0 - uvicorn>=0.13.4 - tzlocal>=2.1 - prompt_toolkit>=3.0.8 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 7ca5c1fe9..94e79de59 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -4,7 +4,8 @@ dask-sphinx-theme>=3.0.0 dask>=2022.3.0 pandas>=1.4.0 fugue>=0.7.3 -fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +fastapi>=0.69.0,<0.87.0 uvicorn>=0.13.4 tzlocal>=2.1 prompt_toolkit>=3.0.8 diff --git a/setup.py b/setup.py index 9227245ec..286b570d3 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,8 @@ install_requires=[ "dask[dataframe,distributed]>=2022.3.0", "pandas>=1.4.0", - "fastapi>=0.69.0", + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0", "uvicorn>=0.13.4", "tzlocal>=2.1", "prompt_toolkit>=3.0.8", diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index 8dec835c3..84869cc9c 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -332,7 +332,7 @@ def gpu_client(gpu_cluster): # client for all computations. otherwise, only connect to a client # when specified. @pytest.fixture( - scope="function" if SCHEDULER_ADDR is None else "session", + scope="function", autouse=False if SCHEDULER_ADDR is None else True, ) def client(): @@ -340,7 +340,7 @@ def client(): yield client -skip_if_external_scheduler = pytest.mark.skipif( - os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, +xfail_if_external_scheduler = pytest.mark.xfail( + condition=os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, reason="Can not run with external cluster", ) diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index 5b574d51c..7ac4424ae 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -594,72 +594,83 @@ def test_window_row_number_partition_by(): ) -# TODO: Except not implemented so far -# def test_window_ranks(): -# a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT *, -# RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, -# DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, -# PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_ranks_partition_by(): -# a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT *, -# RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, -# DENSE_RANK() OVER -# (PARTITION BY a ORDER BY a ASC, b DESC NULLS LAST, c DESC) -# AS a2, -# PERCENT_RANK() OVER -# (PARTITION BY a ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_lead_lag(): -# a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT -# LEAD(b,1) OVER (ORDER BY a) AS a1, -# LEAD(b,2,10) OVER (ORDER BY a) AS a2, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY a) AS a3, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, - -# LAG(b,1) OVER (ORDER BY a) AS b1, -# LAG(b,2,10) OVER (ORDER BY a) AS b2, -# LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, -# LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_lead_lag_partition_by(): -# a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT -# LEAD(b,1,10) OVER (PARTITION BY c ORDER BY a) AS a3, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, - -# LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, -# LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 -# FROM a -# """, -# a=a, -# ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) +def test_window_ranks(): + a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT *, + RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, + DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, + PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 + FROM a + """, + a=a, + ) + + +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) +def test_window_ranks_partition_by(): + a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT *, + RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, + DENSE_RANK() OVER + (PARTITION BY a ORDER BY a ASC, b DESC NULLS LAST, c DESC) + AS a2, + PERCENT_RANK() OVER + (PARTITION BY a ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 + FROM a + """, + a=a, + ) + + +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) +def test_window_lead_lag(): + a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT + LEAD(b,1) OVER (ORDER BY a) AS a1, + LEAD(b,2,10) OVER (ORDER BY a) AS a2, + LEAD(b,1) OVER (PARTITION BY c ORDER BY a) AS a3, + LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, + + LAG(b,1) OVER (ORDER BY a) AS b1, + LAG(b,2,10) OVER (ORDER BY a) AS b2, + LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, + LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 + FROM a + """, + a=a, + ) + + +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) +def test_window_lead_lag_partition_by(): + a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT + LEAD(b,1,10) OVER (PARTITION BY c ORDER BY a) AS a3, + LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, + + LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, + LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 + FROM a + """, + a=a, + ) def test_window_sum_avg(): @@ -833,19 +844,17 @@ def test_window_count(): ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS a5, {func}(b+a) OVER (PARTITION BY b ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - AS a6 - - -- No support for rolling on string types - -- {func}(c) OVER () AS b1, - -- {func}(c) OVER (PARTITION BY c) AS b2, - -- {func}(c) OVER (PARTITION BY c,b) AS b3, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, - -- {func}(c) OVER (PARTITION BY b ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - -- AS b6 + AS a6, + {func}(c) OVER () AS b1, + {func}(c) OVER (PARTITION BY c) AS b2, + {func}(c) OVER (PARTITION BY c,b) AS b3, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, + {func}(c) OVER (PARTITION BY b ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + AS b6 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -858,13 +867,11 @@ def test_window_count(): {func}(b) OVER (ORDER BY a DESC ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a6, {func}(b) OVER (PARTITION BY c ORDER BY a DESC - ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9 --, - - -- No support for rolling on string types - -- {func}(c) OVER (ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b6, - -- {func}(c) OVER (PARTITION BY c ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9, + {func}(c) OVER (ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b6, + {func}(c) OVER (PARTITION BY c ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -886,18 +893,16 @@ def test_window_count_partition_by(): ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS a5, {func}(b+a) OVER (PARTITION BY b ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - AS a6 --, - - -- No support for rolling on string types - -- {func}(c) OVER (PARTITION BY c) AS b2, - -- {func}(c) OVER (PARTITION BY c,b) AS b3, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, - -- {func}(c) OVER (PARTITION BY b ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - -- AS b6 + AS a6, + {func}(c) OVER (PARTITION BY c) AS b2, + {func}(c) OVER (PARTITION BY c,b) AS b3, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, + {func}(c) OVER (PARTITION BY b ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + AS b6 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -908,11 +913,9 @@ def test_window_count_partition_by(): f""" SELECT a,b, {func}(b) OVER (PARTITION BY c ORDER BY a DESC - ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9 --, - - -- No support for rolling on string types - -- {func}(c) OVER (PARTITION BY c ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9, + {func}(c) OVER (PARTITION BY c ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -964,37 +967,42 @@ def test_union(): ) -# TODO: Except not implemented so far -# def test_except(): -# a = make_rand_df(30, b=(int, 10), c=(str, 10)) -# b = make_rand_df(80, b=(int, 50), c=(str, 50)) -# c = make_rand_df(100, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT * FROM c -# EXCEPT SELECT * FROM b -# EXCEPT SELECT * FROM c -# """, -# a=a, -# b=b, -# c=c, -# ) - -# TODO: Intersect not implemented so far -# def test_intersect(): -# a = make_rand_df(30, b=(int, 10), c=(str, 10)) -# b = make_rand_df(80, b=(int, 50), c=(str, 50)) -# c = make_rand_df(100, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT * FROM c -# INTERSECT SELECT * FROM b -# INTERSECT SELECT * FROM c -# """, -# a=a, -# b=b, -# c=c, -# ) +@pytest.mark.xfail( + reason="'ANTI' joins not supported yet, see https://github.com/dask-contrib/dask-sql/issues/879" +) +def test_except(): + a = make_rand_df(30, b=(int, 10), c=(str, 10)) + b = make_rand_df(80, b=(int, 50), c=(str, 50)) + c = make_rand_df(100, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT * FROM c + EXCEPT SELECT * FROM b + EXCEPT SELECT * FROM c + """, + a=a, + b=b, + c=c, + ) + + +@pytest.mark.xfail( + reason="INTERSECT is not compliant with SQLite, see https://github.com/dask-contrib/dask-sql/issues/880" +) +def test_intersect(): + a = make_rand_df(30, b=(int, 10), c=(str, 10)) + b = make_rand_df(80, b=(int, 50), c=(str, 50)) + c = make_rand_df(100, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT * FROM c + INTERSECT SELECT * FROM b + INTERSECT SELECT * FROM c + """, + a=a, + b=b, + c=c, + ) def test_with(): diff --git a/tests/integration/test_complex.py b/tests/integration/test_complex.py index fc79f0a11..06196519a 100644 --- a/tests/integration/test_complex.py +++ b/tests/integration/test_complex.py @@ -1,9 +1,6 @@ from dask.datasets import timeseries -from tests.integration.fixtures import skip_if_external_scheduler - -@skip_if_external_scheduler def test_complex_query(c): df = timeseries(freq="1d").persist() c.create_table("timeseries", df) diff --git a/tests/integration/test_explain.py b/tests/integration/test_explain.py index 2a1793612..8fb539ac9 100644 --- a/tests/integration/test_explain.py +++ b/tests/integration/test_explain.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from dask_sql import Statistics + @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_sql_query_explain(c, gpu): @@ -12,15 +14,6 @@ def test_sql_query_explain(c, gpu): assert sql_string.startswith("Projection: df.a\n") - # TODO: Need to add statistics to Rust optimizer before this can be uncommented. - # c.create_table("df", data_frame, statistics=Statistics(row_count=1337)) - - # sql_string = c.explain("SELECT * FROM df") - - # assert sql_string.startswith( - # "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " - # ) - sql_string = c.sql( "EXPLAIN SELECT MIN(a) AS a_min FROM other_df GROUP BY a", dataframes={"other_df": df}, @@ -28,3 +21,16 @@ def test_sql_query_explain(c, gpu): ) assert sql_string.startswith("Projection: MIN(other_df.a) AS a_min\n") assert "Aggregate: groupBy=[[other_df.a]], aggr=[[MIN(other_df.a)]]" in sql_string + + +@pytest.mark.xfail(reason="Need to add statistics to Rust optimizer") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_statistics_explain(c, gpu): + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) + c.create_table("df", df, statistics=Statistics(row_count=1337), gpu=gpu) + + sql_string = c.explain("SELECT * FROM df") + + assert sql_string.startswith( + "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " + ) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index 09a29ef49..d99aa950b 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -3,7 +3,7 @@ import pytest from dask_sql import Context -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq fugue_sql = pytest.importorskip("fugue_sql") @@ -11,7 +11,6 @@ from dask_sql.integrations.fugue import fsql_dask # noqa: E402 -@skip_if_external_scheduler def test_fugue_workflow(client): dag = fugue_sql.FugueSQLWorkflow() df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") @@ -26,7 +25,6 @@ def test_fugue_workflow(client): assert_eq(return_df, pd.DataFrame({"a": [1], "b": ["world"]})) -@skip_if_external_scheduler def test_fugue_fsql(client): pdf = pd.DataFrame([[0, "hello"], [1, "world"]], columns=["a", "b"]) dag = fugue_sql.fsql( @@ -45,7 +43,7 @@ def test_fugue_fsql(client): # TODO: Revisit fixing this on an independant cluster (without dask-sql) based on the # discussion in https://github.com/dask-contrib/dask-sql/issues/407 -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_dask_fsql(client): def assert_fsql(df: pd.DataFrame) -> None: assert_eq(df, pd.DataFrame({"a": [1]})) diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py index adc7a710f..f99f13952 100644 --- a/tests/integration/test_groupby.py +++ b/tests/integration/test_groupby.py @@ -151,7 +151,7 @@ def test_group_by_filtered(c): assert_eq(return_df, expected_df) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_group_by_case(c): return_df = c.sql( """ @@ -273,7 +273,21 @@ def test_aggregations(c): assert_eq(return_df.reset_index(drop=True), expected_df) -@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +@pytest.mark.parametrize( + "gpu", + [ + False, + pytest.param( + True, + marks=( + pytest.mark.gpu, + pytest.mark.xfail( + reason="stddev_pop is failing on GPU, see https://github.com/dask-contrib/dask-sql/issues/681" + ), + ), + ), + ], +) def test_stddev(c, gpu): df = pd.DataFrame( { @@ -309,11 +323,6 @@ def test_stddev(c, gpu): assert_eq(return_df, expected_df.reset_index(drop=True)) - # Can be removed after addressing: https://github.com/dask-contrib/dask-sql/issues/681 - if gpu: - c.drop_table("df") - pytest.skip() - return_df = c.sql( """ SELECT @@ -351,13 +360,8 @@ def test_stddev(c, gpu): c.drop_table("df") -@pytest.mark.parametrize( - "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.skip))] -) +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_regr_aggregation(c, timeseries_df, gpu): - if gpu: - pytest.skip() - # test regr_count regr_count = c.sql( """ @@ -418,7 +422,7 @@ def test_regr_aggregation(c, timeseries_df, gpu): ) -@pytest.mark.skip( +@pytest.mark.xfail( reason="WIP DataFusion - https://github.com/dask-contrib/dask-sql/issues/753" ) def test_covar_aggregation(c, timeseries_df): diff --git a/tests/integration/test_jdbc.py b/tests/integration/test_jdbc.py index 7997302f1..03dd6c515 100644 --- a/tests/integration/test_jdbc.py +++ b/tests/integration/test_jdbc.py @@ -43,7 +43,7 @@ def app_client(c): app.client.close() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_schema(app_client, c): create_meta_data(c) @@ -75,7 +75,7 @@ def test_jdbc_has_schema(app_client, c): ] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_table(app_client, c): create_meta_data(c) check_data(app_client) @@ -93,7 +93,7 @@ def test_jdbc_has_table(app_client, c): ] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_columns(app_client, c): create_meta_data(c) check_data(app_client) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index aba0ad95e..1252666e5 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -6,7 +6,7 @@ import pytest from dask.datasets import timeseries -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq try: @@ -62,7 +62,7 @@ def gpu_training_df(c): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_training_and_prediction(c, training_df): c.sql( """ @@ -99,7 +99,7 @@ def test_cuml_training_and_prediction(c, gpu_training_df): @pytest.mark.gpu -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_dask_cuml_training_and_prediction(c, gpu_training_df, gpu_client): model_query = """ @@ -115,7 +115,7 @@ def test_dask_cuml_training_and_prediction(c, gpu_training_df, gpu_client): check_trained_model(c) -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.gpu def test_dask_xgboost_training_prediction(c, gpu_training_df, gpu_client): model_query = """ @@ -150,7 +150,7 @@ def test_xgboost_training_prediction(c, gpu_training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -185,7 +185,7 @@ def test_gpu_clustering_and_prediction(c, gpu_training_df, gpu_client): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_create_model_with_prediction(c, training_df): c.sql( """ @@ -220,7 +220,11 @@ def test_create_model_with_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) def test_iterative_and_prediction(c, training_df): c.sql( """ @@ -241,7 +245,7 @@ def test_iterative_and_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_show_models(c, training_df): c.sql( """ @@ -473,7 +477,7 @@ def test_drop_model(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_describe_model(c, training_df): c.sql( """ @@ -510,7 +514,7 @@ def test_describe_model(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_export_model(c, training_df, tmpdir): with pytest.raises(RuntimeError): c.sql( @@ -575,7 +579,7 @@ def test_export_model(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -633,7 +637,7 @@ def test_mlflow_export(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_mlflow_export_xgboost(c, client, training_df, tmpdir): # Test only when mlflow & xgboost was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -697,7 +701,7 @@ def test_mlflow_export_lightgbm(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_ml_experiment(c, client, training_df): with pytest.raises( @@ -898,7 +902,7 @@ def test_ml_experiment(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.skip(reason="Waiting on https://github.com/EpistasisLab/tpot/pull/1280") def test_experiment_automl_classifier(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") @@ -924,7 +928,7 @@ def test_experiment_automl_classifier(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.skip(reason="Waiting on https://github.com/EpistasisLab/tpot/pull/1280") def test_experiment_automl_regressor(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") @@ -955,7 +959,7 @@ def test_experiment_automl_regressor(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_predict_with_nullable_types(c): df = pd.DataFrame( { diff --git a/tests/integration/test_over.py b/tests/integration/test_over.py index 34119924b..be53817e9 100644 --- a/tests/integration/test_over.py +++ b/tests/integration/test_over.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from tests.utils import assert_eq @@ -114,6 +115,30 @@ def test_over_calls(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) +@pytest.mark.xfail( + reason="Need to add single_value window function, see https://github.com/dask-contrib/dask-sql/issues/651" +) +def test_over_single_value(c, user_table_1): + return_df = c.sql( + """ + SELECT + user_id, + b, + SINGLE_VALUE(user_id*10 - b) OVER (PARTITION BY user_id ORDER BY b) AS "O3", + FROM user_table_1 + """ + ) + expected_df = pd.DataFrame( + { + "user_id": user_table_1.user_id, + "b": user_table_1.b, + "O3": [19, 7, 19, 27], + } + ) + + assert_eq(return_df, expected_df, check_dtype=False, check_index=False) + + def test_over_with_windows(c): tmp_df = pd.DataFrame({"a": range(5)}) c.create_table("tmp", tmp_df) diff --git a/tests/integration/test_postgres.py b/tests/integration/test_postgres.py index 05a249c77..afbb59a64 100644 --- a/tests/integration/test_postgres.py +++ b/tests/integration/test_postgres.py @@ -53,7 +53,7 @@ def engine(): network.remove() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_select(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -174,7 +174,7 @@ def test_limit(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_groupby(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -221,7 +221,7 @@ def test_filter(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_string_operations(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -261,7 +261,7 @@ def test_string_operations(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_statistical_functions(assert_query_gives_same_result): # test regr_count diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index ca1fc41b2..8b3fe2855 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,7 +5,6 @@ import pandas as pd import pytest -from tests.integration.fixtures import skip_if_external_scheduler from tests.utils import assert_eq @@ -19,9 +18,6 @@ def test_year(c, datetime_table): assert result_df.compute().iloc[0][0] == 2014 -@pytest.mark.skip( - reason="WIP DataFusion - Enabling CBO generates yet to be implemented edge case" -) def test_case(c, df): result_df = c.sql( """ @@ -141,8 +137,6 @@ def test_literal_null(c): assert_eq(df, expected_df) -# TODO - https://github.com/dask-contrib/dask-sql/issues/978 -@skip_if_external_scheduler def test_random(c): query_with_seed = """ SELECT @@ -517,23 +511,25 @@ def test_integer_div(c, df_simple): df = c.sql( """ SELECT - -- 1 / a AS a, + 1 / a AS a, a / 2 AS b, 1.0 / a AS c FROM df_simple """ ) - expected_df = pd.DataFrame(index=df_simple.index) - # expected_df["a"] = [1, 0, 0] # dtype returned by df for 1/a is float instead of int - # expected_df["a"] = expected_df["a"].astype("Int64") - expected_df["b"] = [0, 1, 1] - expected_df["b"] = expected_df["b"].astype("Int64") - expected_df["c"] = [1.0, 0.5, 0.333333] + expected_df = pd.DataFrame( + { + "a": (1 // df_simple.a).astype("Int64"), + "b": (df_simple.a // 2).astype("Int64"), + "c": 1 / df_simple.a, + } + ) + assert_eq(df, expected_df) -@pytest.mark.skip(reason="Subquery expressions not yet enabled") +@pytest.mark.xfail(reason="Subquery expressions not yet enabled") def test_subqueries(c, user_table_1, user_table_2): df = c.sql( """ @@ -630,6 +626,76 @@ def test_string_functions(c, gpu): ) +@pytest.mark.xfail(reason="POSITION syntax not supported by parser") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_string_position(c, gpu): + if gpu: + input_table = "gpu_string_table" + else: + input_table = "string_table" + + df = c.sql( + f""" + SELECT + POSITION('a' IN a FROM 4) AS f, + POSITION('ZL' IN a) AS g, + FROM + {input_table} + """ + ) + + if gpu: + df = df.astype({"f": "int64", "g": "int64"}) + + expected_df = pd.DataFrame( + { + "f": [7], + "g": [0], + } + ) + + assert_eq( + df.head(1), + expected_df, + ) + + +@pytest.mark.xfail(reason="OVERLAY syntax not supported by parser") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_string_overlay(c, gpu): + if gpu: + input_table = "gpu_string_table" + else: + input_table = "string_table" + + df = c.sql( + f""" + SELECT + OVERLAY(a PLACING 'XXX' FROM -1) AS l, + OVERLAY(a PLACING 'XXX' FROM 2 FOR 4) AS m, + OVERLAY(a PLACING 'XXX' FROM 2 FOR 1) AS n, + FROM + {input_table} + """ + ) + + if gpu: + df = df.astype({"c": "int64"}) # , "f": "int64", "g": "int64"}) + + expected_df = pd.DataFrame( + { + "l": ["XXXormal string"], + "m": ["aXXXmal string"], + "n": ["aXXXnormal string"], + } + ) + + assert_eq( + df.head(1), + expected_df, + ) + + def test_date_functions(c): date = datetime(2021, 10, 3, 15, 53, 42, 47) diff --git a/tests/integration/test_sample.py b/tests/integration/test_sample.py index 44e15b67c..1542ba941 100644 --- a/tests/integration/test_sample.py +++ b/tests/integration/test_sample.py @@ -21,7 +21,7 @@ def get_system_sample(df, fraction, seed): return df -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_sample(c, df): ddf = c.sql("SELECT * FROM df") diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 8189e23a0..ecba00b48 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -6,7 +6,7 @@ from tests.utils import assert_eq -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_table_schema(c, df): original_df = c.sql("SELECT * FROM df") @@ -36,7 +36,7 @@ def test_table_schema(c, df): c.sql("SELECT * FROM foo.bar") -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_function(c): c.sql("CREATE SCHEMA other") c.sql("USE SCHEMA root") diff --git a/tests/integration/test_server.py b/tests/integration/test_server.py index 29f635bee..c28e6e456 100644 --- a/tests/integration/test_server.py +++ b/tests/integration/test_server.py @@ -26,7 +26,6 @@ def app_client(): app.client.close() -@pytest.mark.skip(reason="WIP DataFusion") def test_routes(app_client): assert app_client.post("/v1/statement", data="SELECT 1 + 1").status_code == 200 assert app_client.get("/v1/statement", data="SELECT 1 + 1").status_code == 405 @@ -36,7 +35,6 @@ def test_routes(app_client): assert app_client.get("/v1/cancel/some-wrong-uuid").status_code == 405 -@pytest.mark.skip(reason="WIP DataFusion") def test_sql_query_cancel(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 @@ -50,7 +48,7 @@ def test_sql_query_cancel(app_client): assert response.status_code == 404 -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 @@ -72,7 +70,7 @@ def test_sql_query(app_client): assert result["data"] == [[2]] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_wrong_sql_query(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + ") assert response.status_code == 200 @@ -90,7 +88,7 @@ def test_wrong_sql_query(app_client): } -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_add_and_query(app_client, df, temporary_data_file): df.to_csv(temporary_data_file, index=False) @@ -133,7 +131,7 @@ def test_add_and_query(app_client, df, temporary_data_file): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_register_and_query(app_client, df): df["a"] = df["a"].astype("UInt8") app_client.app.c.create_table("new_table", df) @@ -162,7 +160,7 @@ def test_register_and_query(app_client, df): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_inf_table(app_client, user_table_inf): app_client.app.c.create_table("new_table", user_table_inf) @@ -186,7 +184,7 @@ def test_inf_table(app_client, user_table_inf): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def get_result_or_error(app_client, response): result = response.json() diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py index 77e87ed19..52b66937f 100644 --- a/tests/unit/test_context.py +++ b/tests/unit/test_context.py @@ -54,37 +54,6 @@ def test_deprecation_warning(gpu): assert "table" not in c.schema[c.schema_name].tables -@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) -def test_explain(gpu): - c = Context() - - data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - c.create_table("df", data_frame, gpu=gpu) - - sql_string = c.explain("SELECT * FROM df") - - assert sql_string.startswith("Projection: df.a\n") - - # TODO: Need to add statistics to Rust optimizer before this can be uncommented. - # c.create_table("df", data_frame, statistics=Statistics(row_count=1337)) - - # sql_string = c.explain("SELECT * FROM df") - - # assert sql_string.startswith( - # "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " - # ) - - c = Context() - - data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - - sql_string = c.explain( - "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu - ) - - assert sql_string.startswith("Projection: other_df.a\n") - - @pytest.mark.parametrize( "gpu", [ @@ -116,7 +85,6 @@ def test_sql(gpu): assert_eq(result, data_frame) -@pytest.mark.skip(reason="WIP DataFusion - missing create statement logic") @pytest.mark.parametrize( "gpu", [ diff --git a/tests/unit/test_ml_wrappers.py b/tests/unit/test_ml_wrappers.py index 0f02707b0..4c8b65b2f 100644 --- a/tests/unit/test_ml_wrappers.py +++ b/tests/unit/test_ml_wrappers.py @@ -1,5 +1,6 @@ # Copyright 2017, Dask developers # Dask-ML project - https://github.com/dask/dask-ml +import os from collections.abc import Sequence import dask @@ -17,8 +18,6 @@ from dask_sql.physical.rel.custom.wrappers import Incremental, ParallelPostFit -from ..integration.fixtures import skip_if_external_scheduler - def _check_axis_partitioning(chunks, n_features): c = chunks[1][0] @@ -125,7 +124,12 @@ def assert_estimator_equal(left, right, exclude=None, **kwargs): _assert_eq(l, r, name=attr, **kwargs) -@skip_if_external_scheduler +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) def test_parallelpostfit_basic(): clf = ParallelPostFit(GradientBoostingClassifier()) @@ -197,7 +201,12 @@ def test_transform(kind): assert_eq_ar(result, expected) -@skip_if_external_scheduler +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) @pytest.mark.parametrize("dataframes", [False, True]) def test_incremental_basic(dataframes): # Create observations that we know linear models can recover