-
Notifications
You must be signed in to change notification settings - Fork 7
/
dask.py
23 lines (20 loc) · 888 Bytes
/
dask.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import time
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster
def bake(dataset: str) -> float:
    """Time a grouped mean over a Parquet dataset using Dask.

    Starts a ``LocalCluster`` with a connected ``Client``, then measures how
    long it takes to read ``dataset`` and compute the mean of
    ``num_bikes_available`` per ``station_id``. Cluster and client start-up
    and shutdown are excluded from the measurement.

    Args:
        dataset: Path passed straight to ``dask.dataframe.read_parquet``.

    Returns:
        Elapsed seconds for the read plus the aggregation.
    """
    # Both the cluster and the client are context-managed so they are shut
    # down even when the read or the computation raises.
    with LocalCluster() as cluster, Client(cluster):
        # Scope the progress bar to this run instead of registering it
        # globally, so repeated calls do not pile up global callbacks.
        with ProgressBar():
            # perf_counter is monotonic, so the measured interval cannot be
            # skewed by system clock adjustments.
            start = time.perf_counter()
            # NOTE: Only the two columns the calculation needs are read.
            # According to the original author's note, Dask did not push this
            # column selection down into the read on its own at the time of
            # writing (the note called it "predicate pushdown"; selecting
            # columns is more precisely projection pushdown). This is
            # noticeably a bit of a cheat and not fully in the spirit of the
            # bakeoff, given that it is somewhat advanced knowledge that a
            # user coming from pandas may not have.
            df = dd.read_parquet(
                dataset, index=False, columns=["station_id", "num_bikes_available"]
            )
            # The result is discarded on purpose: only the time matters.
            df.groupby("station_id")["num_bikes_available"].mean().compute()
            stop = time.perf_counter()
    return stop - start