From bfa990daaf9f24f434f4e6a41d537eb63df7beaf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 18 Oct 2022 12:32:49 -0700 Subject: [PATCH 01/25] Uncomment tests in test_compatibility.py --- tests/integration/test_compatibility.py | 258 ++++++++++++------------ 1 file changed, 124 insertions(+), 134 deletions(-) diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index d93fbbac0..de29108c2 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -594,72 +594,71 @@ def test_window_row_number_partition_by(): ) -# TODO: Except not implemented so far -# def test_window_ranks(): -# a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT *, -# RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, -# DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, -# PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_ranks_partition_by(): -# a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT *, -# RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, -# DENSE_RANK() OVER -# (PARTITION BY a ORDER BY a ASC, b DESC NULLS LAST, c DESC) -# AS a2, -# PERCENT_RANK() OVER -# (PARTITION BY a ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 -# FROM a -# """, -# a=a, -# ) - -# TODO: Except not implemented so far -# def test_window_lead_lag(): -# a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT -# LEAD(b,1) OVER (ORDER BY a) AS a1, -# LEAD(b,2,10) OVER (ORDER BY a) AS a2, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY a) AS a3, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, - -# LAG(b,1) OVER (ORDER BY a) AS b1, -# LAG(b,2,10) OVER (ORDER BY a) AS b2, -# LAG(b,1) OVER (PARTITION BY c 
ORDER BY a) AS b3, -# LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 -# FROM a -# """, -# a=a, -# ) -# TODO: Except not implemented so far -# def test_window_lead_lag_partition_by(): -# a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT -# LEAD(b,1,10) OVER (PARTITION BY c ORDER BY a) AS a3, -# LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, - -# LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, -# LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 -# FROM a -# """, -# a=a, -# ) +def test_window_ranks(): + a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT *, + RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, + DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, + PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 + FROM a + """, + a=a, + ) + + +def test_window_ranks_partition_by(): + a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT *, + RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, + DENSE_RANK() OVER + (PARTITION BY a ORDER BY a ASC, b DESC NULLS LAST, c DESC) + AS a2, + PERCENT_RANK() OVER + (PARTITION BY a ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 + FROM a + """, + a=a, + ) + + +def test_window_lead_lag(): + a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT + LEAD(b,1) OVER (ORDER BY a) AS a1, + LEAD(b,2,10) OVER (ORDER BY a) AS a2, + LEAD(b,1) OVER (PARTITION BY c ORDER BY a) AS a3, + LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, + + LAG(b,1) OVER (ORDER BY a) AS b1, + LAG(b,2,10) OVER (ORDER BY a) AS b2, + LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, + LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 + FROM a + """, + a=a, + ) + + +def test_window_lead_lag_partition_by(): + a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + 
SELECT + LEAD(b,1,10) OVER (PARTITION BY c ORDER BY a) AS a3, + LEAD(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS a5, + + LAG(b,1) OVER (PARTITION BY c ORDER BY a) AS b3, + LAG(b,1) OVER (PARTITION BY c ORDER BY b, a ASC NULLS LAST) AS b5 + FROM a + """, + a=a, + ) def test_window_sum_avg(): @@ -833,19 +832,17 @@ def test_window_count(): ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS a5, {func}(b+a) OVER (PARTITION BY b ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - AS a6 - - -- No support for rolling on string types - -- {func}(c) OVER () AS b1, - -- {func}(c) OVER (PARTITION BY c) AS b2, - -- {func}(c) OVER (PARTITION BY c,b) AS b3, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, - -- {func}(c) OVER (PARTITION BY b ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - -- AS b6 + AS a6, + {func}(c) OVER () AS b1, + {func}(c) OVER (PARTITION BY c) AS b2, + {func}(c) OVER (PARTITION BY c,b) AS b3, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, + {func}(c) OVER (PARTITION BY b ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + AS b6 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -858,13 +855,11 @@ def test_window_count(): {func}(b) OVER (ORDER BY a DESC ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a6, {func}(b) OVER (PARTITION BY c ORDER BY a DESC - ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9 --, - - -- No support for rolling on string types - -- {func}(c) OVER (ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b6, - -- {func}(c) OVER (PARTITION BY c ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 + ROWS 
BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9, + {func}(c) OVER (ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b6, + {func}(c) OVER (PARTITION BY c ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -886,18 +881,16 @@ def test_window_count_partition_by(): ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS a5, {func}(b+a) OVER (PARTITION BY b ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - AS a6 --, - - -- No support for rolling on string types - -- {func}(c) OVER (PARTITION BY c) AS b2, - -- {func}(c) OVER (PARTITION BY c,b) AS b3, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, - -- {func}(c) OVER (PARTITION BY b ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, - -- {func}(c) OVER (PARTITION BY b ORDER BY a - -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - -- AS b6 + AS a6, + {func}(c) OVER (PARTITION BY c) AS b2, + {func}(c) OVER (PARTITION BY c,b) AS b3, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS b4, + {func}(c) OVER (PARTITION BY b ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS b5, + {func}(c) OVER (PARTITION BY b ORDER BY a + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + AS b6 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST """, @@ -908,11 +901,9 @@ def test_window_count_partition_by(): f""" SELECT a,b, {func}(b) OVER (PARTITION BY c ORDER BY a DESC - ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9 --, - - -- No support for rolling on string types - -- {func}(c) OVER (PARTITION BY c ORDER BY a DESC - -- ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS a9, + {func}(c) OVER (PARTITION BY c ORDER BY a DESC + ROWS BETWEEN 2 PRECEDING AND 0 PRECEDING) AS b9 FROM a ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST 
""", @@ -963,37 +954,36 @@ def test_union(): ) -# TODO: Except not implemented so far -# def test_except(): -# a = make_rand_df(30, b=(int, 10), c=(str, 10)) -# b = make_rand_df(80, b=(int, 50), c=(str, 50)) -# c = make_rand_df(100, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT * FROM c -# EXCEPT SELECT * FROM b -# EXCEPT SELECT * FROM c -# """, -# a=a, -# b=b, -# c=c, -# ) - -# TODO: Intersect not implemented so far -# def test_intersect(): -# a = make_rand_df(30, b=(int, 10), c=(str, 10)) -# b = make_rand_df(80, b=(int, 50), c=(str, 50)) -# c = make_rand_df(100, b=(int, 50), c=(str, 50)) -# eq_sqlite( -# """ -# SELECT * FROM c -# INTERSECT SELECT * FROM b -# INTERSECT SELECT * FROM c -# """, -# a=a, -# b=b, -# c=c, -# ) +def test_except(): + a = make_rand_df(30, b=(int, 10), c=(str, 10)) + b = make_rand_df(80, b=(int, 50), c=(str, 50)) + c = make_rand_df(100, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT * FROM c + EXCEPT SELECT * FROM b + EXCEPT SELECT * FROM c + """, + a=a, + b=b, + c=c, + ) + + +def test_intersect(): + a = make_rand_df(30, b=(int, 10), c=(str, 10)) + b = make_rand_df(80, b=(int, 50), c=(str, 50)) + c = make_rand_df(100, b=(int, 50), c=(str, 50)) + eq_sqlite( + """ + SELECT * FROM c + INTERSECT SELECT * FROM b + INTERSECT SELECT * FROM c + """, + a=a, + b=b, + c=c, + ) def test_with(): From 4c3c9ad7c1aa2962c5c9eec7d94b32d9c1d8c6b1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 18 Oct 2022 13:11:10 -0700 Subject: [PATCH 02/25] Uncomments tests in test_explain.py --- tests/integration/test_explain.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_explain.py b/tests/integration/test_explain.py index 2a1793612..e0a1d85bc 100644 --- a/tests/integration/test_explain.py +++ b/tests/integration/test_explain.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from dask_planner.rust import 
DaskStatistics + @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_sql_query_explain(c, gpu): @@ -12,15 +14,6 @@ def test_sql_query_explain(c, gpu): assert sql_string.startswith("Projection: df.a\n") - # TODO: Need to add statistics to Rust optimizer before this can be uncommented. - # c.create_table("df", data_frame, statistics=Statistics(row_count=1337)) - - # sql_string = c.explain("SELECT * FROM df") - - # assert sql_string.startswith( - # "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " - # ) - sql_string = c.sql( "EXPLAIN SELECT MIN(a) AS a_min FROM other_df GROUP BY a", dataframes={"other_df": df}, @@ -28,3 +21,15 @@ def test_sql_query_explain(c, gpu): ) assert sql_string.startswith("Projection: MIN(other_df.a) AS a_min\n") assert "Aggregate: groupBy=[[other_df.a]], aggr=[[MIN(other_df.a)]]" in sql_string + + +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_statistics_explain(c, gpu): + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) + c.create_table("df", df, statistics=DaskStatistics(row_count=1337)) + + sql_string = c.explain("SELECT * FROM df") + + assert sql_string.startswith( + "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " + ) From 3c6f404828b320107fef150dbfbc91d21a33df85 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:03:52 -0700 Subject: [PATCH 03/25] xfail instead of skipping for failing independent cluster tests --- tests/integration/fixtures.py | 4 ++-- tests/integration/test_complex.py | 4 ++-- tests/integration/test_fugue.py | 6 +++--- tests/integration/test_model.py | 28 ++++++++++++++-------------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py 
index 8dec835c3..e0b478dbc 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -340,7 +340,7 @@ def client(): yield client -skip_if_external_scheduler = pytest.mark.skipif( - os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, +xfail_if_external_scheduler = pytest.mark.xfail( + condition=os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, reason="Can not run with external cluster", ) diff --git a/tests/integration/test_complex.py b/tests/integration/test_complex.py index fc79f0a11..209e383f5 100644 --- a/tests/integration/test_complex.py +++ b/tests/integration/test_complex.py @@ -1,9 +1,9 @@ from dask.datasets import timeseries -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_complex_query(c): df = timeseries(freq="1d").persist() c.create_table("timeseries", df) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index d846b5559..9c5eeab12 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -3,7 +3,7 @@ import pytest from dask_sql import Context -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq fugue_sql = pytest.importorskip("fugue_sql") @@ -13,7 +13,7 @@ from dask_sql.integrations.fugue import DaskSQLExecutionEngine, fsql_dask -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_simple_statement(client): with fugue_sql.FugueSQLWorkflow(DaskSQLExecutionEngine) as dag: df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") @@ -44,7 +44,7 @@ def test_simple_statement(client): # TODO: Revisit fixing this on an independant cluster (without dask-sql) based on the # discussion in https://github.com/dask-contrib/dask-sql/issues/407 -@skip_if_external_scheduler 
+@xfail_if_external_scheduler def test_fsql(client): def assert_fsql(df: pd.DataFrame) -> None: assert_eq(df, pd.DataFrame({"a": [1]})) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 3c1bd1a69..33e71cc54 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -6,7 +6,7 @@ import pytest from dask.datasets import timeseries -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq try: @@ -64,7 +64,7 @@ def gpu_training_df(c): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_training_and_prediction(c, training_df): c.sql( """ @@ -101,7 +101,7 @@ def test_cuml_training_and_prediction(c, gpu_training_df): @pytest.mark.gpu -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_dask_cuml_training_and_prediction(c, gpu_training_df, gpu_client): model_query = """ @@ -117,7 +117,7 @@ def test_dask_cuml_training_and_prediction(c, gpu_training_df, gpu_client): check_trained_model(c) -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.gpu def test_dask_xgboost_training_prediction(c, gpu_training_df, gpu_client): model_query = """ @@ -152,7 +152,7 @@ def test_xgboost_training_prediction(c, gpu_training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -170,7 +170,7 @@ def test_clustering_and_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@skip_if_external_scheduler +@xfail_if_external_scheduler def test_create_model_with_prediction(c, training_df): c.sql( """ @@ -205,7 +205,7 @@ def test_create_model_with_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_iterative_and_prediction(c, training_df): c.sql( """ @@ -226,7 +226,7 @@ def test_iterative_and_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_show_models(c, training_df): c.sql( """ @@ -458,7 +458,7 @@ def test_drop_model(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_describe_model(c, training_df): c.sql( """ @@ -558,7 +558,7 @@ def test_export_model(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -616,7 +616,7 @@ def test_mlflow_export(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_mlflow_export_xgboost(c, client, training_df, tmpdir): # Test only when mlflow & xgboost was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -680,7 +680,7 @@ def test_mlflow_export_lightgbm(c, training_df, tmpdir): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@skip_if_external_scheduler +@xfail_if_external_scheduler def test_ml_experiment(c, client, training_df): with pytest.raises( @@ -881,7 +881,7 @@ def test_ml_experiment(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_experiment_automl_classifier(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") # currently tested with tpot== @@ -906,7 +906,7 @@ def test_experiment_automl_classifier(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_experiment_automl_regressor(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") # test regressor From 32997cc6e40fcd89c241f9b955d663b8727ce647 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:18:38 -0700 Subject: [PATCH 04/25] Switch skips to xfails in test_groupby.py --- tests/integration/test_groupby.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py index b3007b066..7bb7d730f 100644 --- a/tests/integration/test_groupby.py +++ b/tests/integration/test_groupby.py @@ -154,7 +154,7 @@ def test_group_by_filtered(c): assert_eq(return_df, expected_df) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_group_by_case(c): return_df = c.sql( """ @@ -276,7 +276,21 @@ def test_aggregations(c): assert_eq(return_df.reset_index(drop=True), expected_df) -@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +@pytest.mark.parametrize( + "gpu", + [ + False, + pytest.param( + True, + marks=( + pytest.mark.gpu, + pytest.mark.xfail( + reason="stddev_pop is failing on GPU, 
see https://github.com/dask-contrib/dask-sql/issues/681" + ), + ), + ), + ], +) def test_stddev(c, gpu): df = pd.DataFrame( { @@ -312,11 +326,6 @@ def test_stddev(c, gpu): assert_eq(return_df, expected_df.reset_index(drop=True)) - # Can be removed after addressing: https://github.com/dask-contrib/dask-sql/issues/681 - if gpu: - c.drop_table("df") - pytest.skip() - return_df = c.sql( """ SELECT @@ -355,12 +364,9 @@ def test_stddev(c, gpu): @pytest.mark.parametrize( - "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.skip))] + "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.xfail))] ) def test_regr_aggregation(c, timeseries_df, gpu): - if gpu: - pytest.skip() - # test regr_count regr_count = c.sql( """ @@ -421,7 +427,7 @@ def test_regr_aggregation(c, timeseries_df, gpu): ) -@pytest.mark.skip( +@pytest.mark.xfail( reason="WIP DataFusion - https://github.com/dask-contrib/dask-sql/issues/753" ) def test_covar_aggregation(c, timeseries_df): From cebc475a819a3ced023e53a6c0f0b767c409c972 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:24:43 -0700 Subject: [PATCH 05/25] Uncomments tests in test_over.py --- tests/integration/test_over.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/integration/test_over.py b/tests/integration/test_over.py index 34119924b..be53817e9 100644 --- a/tests/integration/test_over.py +++ b/tests/integration/test_over.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from tests.utils import assert_eq @@ -114,6 +115,30 @@ def test_over_calls(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) +@pytest.mark.xfail( + reason="Need to add single_value window function, see https://github.com/dask-contrib/dask-sql/issues/651" +) +def test_over_single_value(c, user_table_1): + return_df = c.sql( + """ + SELECT + user_id, + b, + SINGLE_VALUE(user_id*10 - b) OVER 
(PARTITION BY user_id ORDER BY b) AS "O3", + FROM user_table_1 + """ + ) + expected_df = pd.DataFrame( + { + "user_id": user_table_1.user_id, + "b": user_table_1.b, + "O3": [19, 7, 19, 27], + } + ) + + assert_eq(return_df, expected_df, check_dtype=False, check_index=False) + + def test_over_with_windows(c): tmp_df = pd.DataFrame({"a": range(5)}) c.create_table("tmp", tmp_df) From 3395a022ef869c3e51a9f5fcf631c0d13e07ea0c Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:27:21 -0700 Subject: [PATCH 06/25] Switch skips to xfails in test_postgres.py --- tests/integration/test_postgres.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_postgres.py b/tests/integration/test_postgres.py index 05a249c77..afbb59a64 100644 --- a/tests/integration/test_postgres.py +++ b/tests/integration/test_postgres.py @@ -53,7 +53,7 @@ def engine(): network.remove() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_select(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -174,7 +174,7 @@ def test_limit(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_groupby(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -221,7 +221,7 @@ def test_filter(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_string_operations(assert_query_gives_same_result): assert_query_gives_same_result( """ @@ -261,7 +261,7 @@ def test_string_operations(assert_query_gives_same_result): ) -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_statistical_functions(assert_query_gives_same_result): # test regr_count From 74bedf7bcd42087a32160cc5f5c1667e4413eb74 Mon Sep 17 00:00:00 2001 From: Charles 
Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:57:21 -0700 Subject: [PATCH 07/25] Uncomments tests in test_rex.py --- tests/integration/test_rex.py | 92 +++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 655ff69de..9cb807bb4 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -18,7 +18,7 @@ def test_year(c, datetime_table): assert result_df.compute().iloc[0][0] == 2014 -@pytest.mark.skip( +@pytest.mark.xfail( reason="WIP DataFusion - Enabling CBO generates yet to be implemented edge case" ) def test_case(c, df): @@ -455,23 +455,25 @@ def test_integer_div(c, df_simple): df = c.sql( """ SELECT - -- 1 / a AS a, + 1 / a AS a, a / 2 AS b, 1.0 / a AS c FROM df_simple """ ) - expected_df = pd.DataFrame(index=df_simple.index) - # expected_df["a"] = [1, 0, 0] # dtype returned by df for 1/a is float instead of int - # expected_df["a"] = expected_df["a"].astype("Int64") - expected_df["b"] = [0, 1, 1] - expected_df["b"] = expected_df["b"].astype("Int64") - expected_df["c"] = [1.0, 0.5, 0.333333] + expected_df = pd.DataFrame( + { + "a": (1 // df_simple.a).astype("Int64"), + "b": (df_simple.a // 2).astype("Int64"), + "c": 1 / df_simple.a, + } + ) + assert_eq(df, expected_df) -@pytest.mark.skip(reason="Subquery expressions not yet enabled") +@pytest.mark.xfail(reason="Subquery expressions not yet enabled") def test_subqueries(c, user_table_1, user_table_2): df = c.sql( """ @@ -564,7 +566,77 @@ def test_string_functions(c, gpu): ) -@pytest.mark.skip( +@pytest.mark.xfail(reason="POSITION syntax not supported by parser") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_string_position(c, gpu): + if gpu: + input_table = "gpu_string_table" + else: + input_table = "string_table" + + df = c.sql( + f""" + SELECT + POSITION('a' IN a FROM 4) AS f, + 
POSITION('ZL' IN a) AS g, + FROM + {input_table} + """ + ) + + if gpu: + df = df.astype({"f": "int64", "g": "int64"}) + + expected_df = pd.DataFrame( + { + "f": [7], + "g": [0], + } + ) + + assert_eq( + df.head(1), + expected_df, + ) + + +@pytest.mark.xfail(reason="OVERLAY syntax not supported by parser") +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_string_overlay(c, gpu): + if gpu: + input_table = "gpu_string_table" + else: + input_table = "string_table" + + df = c.sql( + f""" + SELECT + OVERLAY(a PLACING 'XXX' FROM -1) AS l, + OVERLAY(a PLACING 'XXX' FROM 2 FOR 4) AS m, + OVERLAY(a PLACING 'XXX' FROM 2 FOR 1) AS n, + FROM + {input_table} + """ + ) + + if gpu: + df = df.astype({"c": "int64"}) # , "f": "int64", "g": "int64"}) + + expected_df = pd.DataFrame( + { + "l": ["XXXormal string"], + "m": ["aXXXmal string"], + "n": ["aXXXnormal string"], + } + ) + + assert_eq( + df.head(1), + expected_df, + ) + + +@pytest.mark.xfail( reason="TIMESTAMP add, ceil, floor for dt ops not supported by parser" ) def test_date_functions(c): From d57e94962a5317d6bc75cbdc76dcbd859b49529e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:58:15 -0700 Subject: [PATCH 08/25] Switch skips to xfails in test_schema.py --- tests/integration/test_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 8189e23a0..ecba00b48 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -6,7 +6,7 @@ from tests.utils import assert_eq -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_table_schema(c, df): original_df = c.sql("SELECT * FROM df") @@ -36,7 +36,7 @@ def test_table_schema(c, df): c.sql("SELECT * FROM foo.bar") -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP 
DataFusion") def test_function(c): c.sql("CREATE SCHEMA other") c.sql("USE SCHEMA root") From 9fb367be2084872b95bd8a29f0e7c24f6bd92df2 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:00:22 -0700 Subject: [PATCH 09/25] Switch skips to xfails in test_jdbc.py --- tests/integration/test_jdbc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_jdbc.py b/tests/integration/test_jdbc.py index 7997302f1..03dd6c515 100644 --- a/tests/integration/test_jdbc.py +++ b/tests/integration/test_jdbc.py @@ -43,7 +43,7 @@ def app_client(c): app.client.close() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_schema(app_client, c): create_meta_data(c) @@ -75,7 +75,7 @@ def test_jdbc_has_schema(app_client, c): ] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_table(app_client, c): create_meta_data(c) check_data(app_client) @@ -93,7 +93,7 @@ def test_jdbc_has_table(app_client, c): ] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_jdbc_has_columns(app_client, c): create_meta_data(c) check_data(app_client) From fd50a462c91656d174d67d718aed8dd86bba62bf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:04:42 -0700 Subject: [PATCH 10/25] Switch skips to xfails in test_sample.py --- tests/integration/test_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_sample.py b/tests/integration/test_sample.py index 44e15b67c..1542ba941 100644 --- a/tests/integration/test_sample.py +++ b/tests/integration/test_sample.py @@ -21,7 +21,7 @@ def get_system_sample(df, fraction, seed): return df -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def 
test_sample(c, df): ddf = c.sql("SELECT * FROM df") From 2f5fb4efa7cdc6e75a915891f7311bc21261cd25 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:11:55 -0700 Subject: [PATCH 11/25] Switch skips to xfails in test_server.py --- tests/integration/test_server.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_server.py b/tests/integration/test_server.py index 29f635bee..003ff866b 100644 --- a/tests/integration/test_server.py +++ b/tests/integration/test_server.py @@ -26,7 +26,7 @@ def app_client(): app.client.close() -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_routes(app_client): assert app_client.post("/v1/statement", data="SELECT 1 + 1").status_code == 200 assert app_client.get("/v1/statement", data="SELECT 1 + 1").status_code == 405 @@ -36,7 +36,7 @@ def test_routes(app_client): assert app_client.get("/v1/cancel/some-wrong-uuid").status_code == 405 -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query_cancel(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 @@ -50,7 +50,7 @@ def test_sql_query_cancel(app_client): assert response.status_code == 404 -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 @@ -72,7 +72,7 @@ def test_sql_query(app_client): assert result["data"] == [[2]] -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_wrong_sql_query(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + ") assert response.status_code == 200 @@ -90,7 +90,7 @@ def test_wrong_sql_query(app_client): } -@pytest.mark.skip(reason="WIP 
DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_add_and_query(app_client, df, temporary_data_file): df.to_csv(temporary_data_file, index=False) @@ -133,7 +133,7 @@ def test_add_and_query(app_client, df, temporary_data_file): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_register_and_query(app_client, df): df["a"] = df["a"].astype("UInt8") app_client.app.c.create_table("new_table", df) @@ -162,7 +162,7 @@ def test_register_and_query(app_client, df): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def test_inf_table(app_client, user_table_inf): app_client.app.c.create_table("new_table", user_table_inf) @@ -186,7 +186,7 @@ def test_inf_table(app_client, user_table_inf): assert "error" not in result -@pytest.mark.skip(reason="WIP DataFusion") +@pytest.mark.xfail(reason="WIP DataFusion") def get_result_or_error(app_client, response): result = response.json() From 2177378d4b296aa773657bfbc4497dad5606593f Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:14:53 -0700 Subject: [PATCH 12/25] Uncomments tests in test_context.py --- tests/unit/test_context.py | 66 +++++++++----------------------------- 1 file changed, 16 insertions(+), 50 deletions(-) diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py index d10f7012b..a1dc97b81 100644 --- a/tests/unit/test_context.py +++ b/tests/unit/test_context.py @@ -54,37 +54,6 @@ def test_deprecation_warning(gpu): assert "table" not in c.schema[c.schema_name].tables -@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) -def test_explain(gpu): - c = Context() - - data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - c.create_table("df", data_frame, gpu=gpu) - - sql_string = c.explain("SELECT * FROM df") - - assert 
sql_string.startswith("Projection: df.a\n") - - # TODO: Need to add statistics to Rust optimizer before this can be uncommented. - # c.create_table("df", data_frame, statistics=Statistics(row_count=1337)) - - # sql_string = c.explain("SELECT * FROM df") - - # assert sql_string.startswith( - # "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = " - # ) - - c = Context() - - data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - - sql_string = c.explain( - "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu - ) - - assert sql_string.startswith("Projection: other_df.a\n") - - @pytest.mark.parametrize( "gpu", [ @@ -116,7 +85,6 @@ def test_sql(gpu): assert_eq(result, data_frame) -@pytest.mark.skip(reason="WIP DataFusion - missing create statement logic") @pytest.mark.parametrize( "gpu", [ @@ -391,28 +359,26 @@ def test_aggregation_adding(): assert c.schema[c.schema_name].function_lists[1].aggregation -# TODO: Alter schema is not yet implemented -# def test_alter_schema(c): -# c.create_schema("test_schema") -# c.sql("ALTER SCHEMA test_schema RENAME TO prod_schema") -# assert "prod_schema" in c.schema +def test_alter_schema(c): + c.create_schema("test_schema") + c.sql("ALTER SCHEMA test_schema RENAME TO prod_schema") + assert "prod_schema" in c.schema -# with pytest.raises(KeyError): -# c.sql("ALTER SCHEMA MARVEL RENAME TO DC") + with pytest.raises(KeyError): + c.sql("ALTER SCHEMA MARVEL RENAME TO DC") -# del c.schema["prod_schema"] + del c.schema["prod_schema"] -# TODO: Alter table is not yet implemented -# def test_alter_table(c, df_simple): -# c.create_table("maths", df_simple) -# c.sql("ALTER TABLE maths RENAME TO physics") -# assert "physics" in c.schema[c.schema_name].tables +def test_alter_table(c, df_simple): + c.create_table("maths", df_simple) + c.sql("ALTER TABLE maths RENAME TO physics") + assert "physics" in c.schema[c.schema_name].tables -# with 
pytest.raises(KeyError): -# c.sql("ALTER TABLE four_legs RENAME TO two_legs") + with pytest.raises(KeyError): + c.sql("ALTER TABLE four_legs RENAME TO two_legs") -# c.sql("ALTER TABLE IF EXISTS alien RENAME TO humans") + c.sql("ALTER TABLE IF EXISTS alien RENAME TO humans") -# print(c.schema[c.schema_name].tables) -# del c.schema[c.schema_name].tables["physics"] + print(c.schema[c.schema_name].tables) + del c.schema[c.schema_name].tables["physics"] From f0bf4d8aadae0fb84edc4ac95ffcda420e156b11 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 20 Oct 2022 08:09:41 -0700 Subject: [PATCH 13/25] Un-xfail passing tests --- tests/integration/test_complex.py | 3 --- tests/integration/test_fugue.py | 1 - tests/integration/test_model.py | 2 -- tests/integration/test_rex.py | 3 --- tests/integration/test_server.py | 2 -- 5 files changed, 11 deletions(-) diff --git a/tests/integration/test_complex.py b/tests/integration/test_complex.py index 209e383f5..06196519a 100644 --- a/tests/integration/test_complex.py +++ b/tests/integration/test_complex.py @@ -1,9 +1,6 @@ from dask.datasets import timeseries -from tests.integration.fixtures import xfail_if_external_scheduler - -@xfail_if_external_scheduler def test_complex_query(c): df = timeseries(freq="1d").persist() c.create_table("timeseries", df) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index 9c5eeab12..ce5b4bdf5 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -13,7 +13,6 @@ from dask_sql.integrations.fugue import DaskSQLExecutionEngine, fsql_dask -@xfail_if_external_scheduler def test_simple_statement(client): with fugue_sql.FugueSQLWorkflow(DaskSQLExecutionEngine) as dag: df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 33e71cc54..a73291603 100644 --- a/tests/integration/test_model.py +++ 
b/tests/integration/test_model.py @@ -557,8 +557,6 @@ def test_export_model(c, training_df, tmpdir): ) -# TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 9cb807bb4..27836e202 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -18,9 +18,6 @@ def test_year(c, datetime_table): assert result_df.compute().iloc[0][0] == 2014 -@pytest.mark.xfail( - reason="WIP DataFusion - Enabling CBO generates yet to be implemented edge case" -) def test_case(c, df): result_df = c.sql( """ diff --git a/tests/integration/test_server.py b/tests/integration/test_server.py index 003ff866b..c28e6e456 100644 --- a/tests/integration/test_server.py +++ b/tests/integration/test_server.py @@ -26,7 +26,6 @@ def app_client(): app.client.close() -@pytest.mark.xfail(reason="WIP DataFusion") def test_routes(app_client): assert app_client.post("/v1/statement", data="SELECT 1 + 1").status_code == 200 assert app_client.get("/v1/statement", data="SELECT 1 + 1").status_code == 405 @@ -36,7 +35,6 @@ def test_routes(app_client): assert app_client.get("/v1/cancel/some-wrong-uuid").status_code == 405 -@pytest.mark.xfail(reason="WIP DataFusion") def test_sql_query_cancel(app_client): response = app_client.post("/v1/statement", data="SELECT 1 + 1") assert response.status_code == 200 From 2cfdbf7976f149364b234644188e8b904291a822 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 24 Oct 2022 06:50:48 -0700 Subject: [PATCH 14/25] Add xfails to some broken tests --- tests/integration/test_compatibility.py | 20 +++++++++++++++++++- tests/integration/test_explain.py | 5 +++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git 
a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index de29108c2..a19206fd0 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -594,13 +594,16 @@ def test_window_row_number_partition_by(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_ranks(): a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) eq_sqlite( """ SELECT *, RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1, - DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2 + DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2, PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4 FROM a """, @@ -608,6 +611,9 @@ def test_window_ranks(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_ranks_partition_by(): a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50)) eq_sqlite( @@ -625,6 +631,9 @@ def test_window_ranks_partition_by(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_lead_lag(): a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) eq_sqlite( @@ -645,6 +654,9 @@ def test_window_lead_lag(): ) +@pytest.mark.xfail( + reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878" +) def test_window_lead_lag_partition_by(): a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) eq_sqlite( @@ -954,6 +966,9 @@ def test_union(): ) +@pytest.mark.xfail( + reason="'ANTI' joins not supported yet, see https://github.com/dask-contrib/dask-sql/issues/879" +) def test_except(): a = make_rand_df(30, b=(int, 10), c=(str, 10)) b = make_rand_df(80, b=(int, 50), c=(str, 50)) @@ -970,6 +985,9 @@ def 
test_except(): ) +@pytest.mark.xfail( + reason="INTERSECT is not compliant with SQLite, see https://github.com/dask-contrib/dask-sql/issues/880" +) def test_intersect(): a = make_rand_df(30, b=(int, 10), c=(str, 10)) b = make_rand_df(80, b=(int, 50), c=(str, 50)) diff --git a/tests/integration/test_explain.py b/tests/integration/test_explain.py index e0a1d85bc..8fb539ac9 100644 --- a/tests/integration/test_explain.py +++ b/tests/integration/test_explain.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from dask_planner.rust import DaskStatistics +from dask_sql import Statistics @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) @@ -23,10 +23,11 @@ def test_sql_query_explain(c, gpu): assert "Aggregate: groupBy=[[other_df.a]], aggr=[[MIN(other_df.a)]]" in sql_string +@pytest.mark.xfail(reason="Need to add statistics to Rust optimizer") @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_statistics_explain(c, gpu): df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1) - c.create_table("df", df, statistics=DaskStatistics(row_count=1337)) + c.create_table("df", df, statistics=Statistics(row_count=1337), gpu=gpu) sql_string = c.explain("SELECT * FROM df") From 86a72a5b1d848fa9eda6354c37076c6dc63a142b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 24 Oct 2022 08:18:16 -0700 Subject: [PATCH 15/25] xfail alter table/schema tests --- tests/unit/test_context.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py index a1dc97b81..8665ac9ef 100644 --- a/tests/unit/test_context.py +++ b/tests/unit/test_context.py @@ -359,6 +359,9 @@ def test_aggregation_adding(): assert c.schema[c.schema_name].function_lists[1].aggregation +@pytest.mark.xfail( + reason="Need to add back support for alter table/schema, see https://github.com/dask-contrib/dask-sql/issues/884" +) def 
test_alter_schema(c): c.create_schema("test_schema") c.sql("ALTER SCHEMA test_schema RENAME TO prod_schema") @@ -370,6 +373,9 @@ def test_alter_schema(c): del c.schema["prod_schema"] +@pytest.mark.xfail( + reason="Need to add back support for alter table/schema, see https://github.com/dask-contrib/dask-sql/issues/884" +) def test_alter_table(c, df_simple): c.create_table("maths", df_simple) c.sql("ALTER TABLE maths RENAME TO physics") From 1f74895b1e5b0b5c4f4b335e11b0ba203ea42425 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 24 Oct 2022 11:47:44 -0700 Subject: [PATCH 16/25] Un-xfail test_regr_aggregation on GPU --- tests/integration/test_groupby.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py index 7bb7d730f..ed0ac40ba 100644 --- a/tests/integration/test_groupby.py +++ b/tests/integration/test_groupby.py @@ -363,9 +363,7 @@ def test_stddev(c, gpu): c.drop_table("df") -@pytest.mark.parametrize( - "gpu", [False, pytest.param(True, marks=(pytest.mark.gpu, pytest.mark.xfail))] -) +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_regr_aggregation(c, timeseries_df, gpu): # test regr_count regr_count = c.sql( From 890a4170869268a52f21d8c26895693d7a7b1240 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 26 Oct 2022 08:49:00 -0700 Subject: [PATCH 17/25] Fix external scheduler xfail --- tests/integration/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 292bc82f9..a3e0d74b3 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -935,7 +935,7 @@ def test_experiment_automl_regressor(c, client, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@skip_if_external_scheduler +@xfail_if_external_scheduler def test_predict_with_nullable_types(c): df = pd.DataFrame( { From d9ecc75cd50772db6638831e53ac0f7098ee2d7f Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 13 Dec 2022 12:36:31 -0800 Subject: [PATCH 18/25] Replace skip_if_external_scheduler in ML wrapper tests --- tests/unit/test_ml_wrappers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_ml_wrappers.py b/tests/unit/test_ml_wrappers.py index 0f02707b0..0673aaae4 100644 --- a/tests/unit/test_ml_wrappers.py +++ b/tests/unit/test_ml_wrappers.py @@ -17,7 +17,7 @@ from dask_sql.physical.rel.custom.wrappers import Incremental, ParallelPostFit -from ..integration.fixtures import skip_if_external_scheduler +from ..integration.fixtures import xfail_if_external_scheduler def _check_axis_partitioning(chunks, n_features): @@ -125,7 +125,7 @@ def assert_estimator_equal(left, right, exclude=None, **kwargs): _assert_eq(l, r, name=attr, **kwargs) -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_parallelpostfit_basic(): clf = ParallelPostFit(GradientBoostingClassifier()) @@ -197,7 +197,7 @@ def test_transform(kind): assert_eq_ar(result, expected) -@skip_if_external_scheduler +@xfail_if_external_scheduler @pytest.mark.parametrize("dataframes", [False, True]) def test_incremental_basic(dataframes): # Create observations that we know linear models can recover From 4ff69ae518d75bf357e779de721541cf33413db9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 13 Dec 2022 13:36:30 -0800 Subject: [PATCH 19/25] unxfail test_date_functions --- tests/integration/test_rex.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index f5cf5876a..8b3fe2855 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -696,9 
+696,6 @@ def test_string_overlay(c, gpu): ) -@pytest.mark.xfail( - reason="TIMESTAMP add, ceil, floor for dt ops not supported by parser" -) def test_date_functions(c): date = datetime(2021, 10, 3, 15, 53, 42, 47) From 00353836811a4d7d2abb1e8aa4bc2a60fd8421f0 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 14 Dec 2022 06:55:04 -0800 Subject: [PATCH 20/25] Set max constraint for fastapi --- continuous_integration/environment-3.10-dev.yaml | 3 ++- continuous_integration/environment-3.9-dev.yaml | 3 ++- continuous_integration/gpuci/environment.yaml | 3 ++- continuous_integration/recipe/meta.yaml | 3 ++- docker/conda.txt | 3 ++- docker/main.dockerfile | 3 ++- docs/environment.yml | 3 ++- docs/requirements-docs.txt | 3 ++- setup.py | 3 ++- 9 files changed, 18 insertions(+), 9 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index d5ffa777a..8ff56d413 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -4,7 +4,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.0 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 4c5c23511..d6e1179c1 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -4,7 +4,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.0 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/gpuci/environment.yaml b/continuous_integration/gpuci/environment.yaml index 7997ec7c4..866f41642 100644 --- 
a/continuous_integration/gpuci/environment.yaml +++ b/continuous_integration/gpuci/environment.yaml @@ -7,7 +7,8 @@ channels: - nodefaults dependencies: - dask>=2022.3.0 -- fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +- fastapi>=0.69.0,<0.87.0 - fugue>=0.7.0 - intake>=0.6.0 - jsonschema diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index ab5274076..efc59f3a0 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,8 @@ requirements: - python - dask >=2022.3.0 - pandas >=1.4.0 - - fastapi >=0.69.0 + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + - fastapi >=0.69.0,<0.87.0 - uvicorn >=0.13.4 - tzlocal >=2.1 - prompt-toolkit >=3.0.8 diff --git a/docker/conda.txt b/docker/conda.txt index 3d57e18dc..99d16d6fd 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -10,7 +10,8 @@ pytest-xdist mock>=4.0.3 sphinx>=3.2.1 tzlocal>=2.1 -fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +fastapi>=0.69.0,<0.87.0 nest-asyncio>=1.4.3 uvicorn>=0.13.4 pyarrow>=6.0.1 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index 8c908ce19..19e7c324c 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -18,7 +18,8 @@ RUN mamba install -y \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ - "fastapi>=0.69.0" \ + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0" \ "uvicorn>=0.13.4" \ "tzlocal>=2.1" \ "prompt_toolkit>=3.0.8" \ diff --git a/docs/environment.yml b/docs/environment.yml index 5d562c532..ef7c92be0 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -11,7 +11,8 @@ dependencies: - pandas>=1.4.0 - fugue>=0.7.0 - jpype1>=1.0.2 - - fastapi>=0.69.0 + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + - fastapi>=0.69.0,<0.87.0 - uvicorn>=0.13.4 - tzlocal>=2.1 - prompt_toolkit>=3.0.8 diff --git 
a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 439516478..e21fd5846 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -4,7 +4,8 @@ dask-sphinx-theme>=3.0.0 dask>=2022.3.0 pandas>=1.4.0 fugue>=0.7.0 -fastapi>=0.69.0 +# FIXME: handling is needed for httpx-based fastapi>=0.87.0 +fastapi>=0.69.0,<0.87.0 uvicorn>=0.13.4 tzlocal>=2.1 prompt_toolkit>=3.0.8 diff --git a/setup.py b/setup.py index 0f8520de9..a370549e8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,8 @@ install_requires=[ "dask[dataframe,distributed]>=2022.3.0", "pandas>=1.4.0", - "fastapi>=0.69.0", + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0", "uvicorn>=0.13.4", "tzlocal>=2.1", "prompt_toolkit>=3.0.8", From 37dc53b29d8b4adb5e06be063e23394ea6d4f4a1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 14 Dec 2022 12:04:02 -0800 Subject: [PATCH 21/25] Skip test_iterative_and_prediction on cluster --- tests/integration/test_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 0853e36ba..9feb7e5f1 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -149,8 +149,6 @@ def test_xgboost_training_prediction(c, gpu_training_df): check_trained_model(c) -# TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -220,7 +218,11 @@ def test_create_model_with_prediction(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
-@xfail_if_external_scheduler +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) def test_iterative_and_prediction(c, training_df): c.sql( """ From cc105003647a8f0dfdba06a19116ff81fc1044fe Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 5 Jan 2023 07:02:22 -0800 Subject: [PATCH 22/25] Fix style errors --- tests/integration/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 42f581f6d..c3c7ee874 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -512,7 +512,7 @@ def test_describe_model(c, training_df): # TODO - many ML tests fail on clusters without sklearn - can we avoid this? -@skip_if_external_scheduler +@xfail_if_external_scheduler def test_export_model(c, training_df, tmpdir): with pytest.raises(RuntimeError): c.sql( From af7830d0ec4b04e9bad3990baa18d4717f81d811 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 5 Jan 2023 12:14:14 -0800 Subject: [PATCH 23/25] Remove uses of skip_if_external_scheduler --- tests/integration/test_rex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 53e4e47be..5f006b76f 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from tests.integration.fixtures import skip_if_external_scheduler +from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq @@ -139,7 +139,7 @@ def test_literal_null(c): # TODO - https://github.com/dask-contrib/dask-sql/issues/978 -@skip_if_external_scheduler +@xfail_if_external_scheduler 
def test_random(c): query_with_seed = """ SELECT From 8783b8451db5ec4eeea01ae72537ad3eb8bd3c86 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 9 Jan 2023 08:52:28 -0800 Subject: [PATCH 24/25] Resolve remaining cluster tests --- tests/integration/fixtures.py | 2 +- tests/integration/test_model.py | 4 ++++ tests/integration/test_rex.py | 3 --- tests/unit/test_ml_wrappers.py | 17 +++++++++++++---- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index e0b478dbc..84869cc9c 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -332,7 +332,7 @@ def gpu_client(gpu_cluster): # client for all computations. otherwise, only connect to a client # when specified. @pytest.fixture( - scope="function" if SCHEDULER_ADDR is None else "session", + scope="function", autouse=False if SCHEDULER_ADDR is None else True, ) def client(): diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index c3c7ee874..1252666e5 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -149,6 +149,8 @@ def test_xgboost_training_prediction(c, gpu_training_df): check_trained_model(c) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@xfail_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -576,6 +578,8 @@ def test_export_model(c, training_df, tmpdir): ) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
+@xfail_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 5f006b76f..8b3fe2855 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,7 +5,6 @@ import pandas as pd import pytest -from tests.integration.fixtures import xfail_if_external_scheduler from tests.utils import assert_eq @@ -138,8 +137,6 @@ def test_literal_null(c): assert_eq(df, expected_df) -# TODO - https://github.com/dask-contrib/dask-sql/issues/978 -@xfail_if_external_scheduler def test_random(c): query_with_seed = """ SELECT diff --git a/tests/unit/test_ml_wrappers.py b/tests/unit/test_ml_wrappers.py index 0673aaae4..4c8b65b2f 100644 --- a/tests/unit/test_ml_wrappers.py +++ b/tests/unit/test_ml_wrappers.py @@ -1,5 +1,6 @@ # Copyright 2017, Dask developers # Dask-ML project - https://github.com/dask/dask-ml +import os from collections.abc import Sequence import dask @@ -17,8 +18,6 @@ from dask_sql.physical.rel.custom.wrappers import Incremental, ParallelPostFit -from ..integration.fixtures import xfail_if_external_scheduler - def _check_axis_partitioning(chunks, n_features): c = chunks[1][0] @@ -125,7 +124,12 @@ def assert_estimator_equal(left, right, exclude=None, **kwargs): _assert_eq(l, r, name=attr, **kwargs) -@xfail_if_external_scheduler +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? 
+# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) def test_parallelpostfit_basic(): clf = ParallelPostFit(GradientBoostingClassifier()) @@ -197,7 +201,12 @@ def test_transform(kind): assert_eq_ar(result, expected) -@xfail_if_external_scheduler +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +# this test failure shuts down the cluster and must be skipped instead of xfailed +@pytest.mark.skipif( + os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, + reason="Can not run with external cluster", +) @pytest.mark.parametrize("dataframes", [False, True]) def test_incremental_basic(dataframes): # Create observations that we know linear models can recover From 96434b0fbd48a8e13374fa92060c9a2dcba4a387 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 9 Jan 2023 18:02:49 -0800 Subject: [PATCH 25/25] un-xfail fugue tests --- tests/integration/test_fugue.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index 48506f42b..d99aa950b 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -11,7 +11,6 @@ from dask_sql.integrations.fugue import fsql_dask # noqa: E402 -@xfail_if_external_scheduler def test_fugue_workflow(client): dag = fugue_sql.FugueSQLWorkflow() df = dag.df([[0, "hello"], [1, "world"]], "a:int64,b:str") @@ -26,7 +25,6 @@ def test_fugue_workflow(client): assert_eq(return_df, pd.DataFrame({"a": [1], "b": ["world"]})) -@xfail_if_external_scheduler def test_fugue_fsql(client): pdf = pd.DataFrame([[0, "hello"], [1, "world"]], columns=["a", "b"]) dag = fugue_sql.fsql(