alxmrs · alxmrs · Mar 23, 2024 · Mar 23, 2024 · Mar 23, 2024 · Mar 24, 2024
diff --git a/demo/sst.py b/demo/sst.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""Demo of calculating global average sea surface temperature (SST) with SQL.
+
+Please run the following to set up cloud resources:
+```
+pip install ".[demo]"
+gcloud auth application-default login
+coiled login
+coiled setup gcp --region us-central1
+```
+To run the local demo:
+./demo/sst.py --timeframe day --fake
+
+To run the remote demo:
+```
+./demo/sst.py --timeframe month --cluster small
+```
+"""
+import argparse
+
+import numpy as np
+import xarray as xr
+
+import xarray_sql as xql
+
+# Instead of letting users choose arbitrary time frames, we only allow
+# the following choices. This design prevents users from accidentally
+# processing way more data than they might have meant to. We don't
+# want to bankrupt folks because they were off a few digits.
+TIMEFRAMES = {
+    'day': slice('1940-01-01', '1940-01-02'),
+    'month': slice('1940-01-01', '1940-02-01'),
+    'year': slice('1940-01-01', '1941-01-01'),
+    'all': slice('1940-01-01', '2023-11-01'),
+}
+
+CLUSTERS = ['none', 'local', 'small', 'arm', 'mem-opt']
+
+
+def rand_wx(times) -> xr.Dataset:
+  """Produce a random ARCO-ERA5-like weather dataset."""
+  np.random.seed(42)
+
+  lat = np.linspace(-90, 90, num=720)
+  lon = np.linspace(-180, 180, num=1440)
+  time = xr.date_range(times.start, times.stop, freq='H')
+  level = np.array([1000, 500], dtype=np.int32)
+
+  temperature = 15 + 8 * np.random.randn(720, 1440, len(time))
+  precipitation = 10 * np.random.rand(720, 1440, len(time), len(level))
+
+  return xr.Dataset(
+      data_vars=dict(
+          sea_surface_temperature=(
+              ['lat', 'lon', 'time'],
+              temperature,
+          ),
+          precipitation=(['lat', 'lon', 'time', 'level'], precipitation),
+      ),
+      coords=dict(
+          lat=lat,
+          lon=lon,
+          time=time,
+          level=level,
+      ),
+      attrs=dict(description='Random weather.'),
+  )
+
+
+# TODO(alxmrs): Make spot instances a flag.
+parser = argparse.ArgumentParser()
+parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day')
+parser.add_argument(
+    '--cluster',
+    choices=CLUSTERS,
+    default='none',
+    help='Choose the Dask cluster type, or none at all.'
+    'Either: no cluster, a local cluster, ARM VMs or memory-optimized VMs in GCP via Coiled.',
+)
+parser.add_argument(
+    '--fake',
+    action='store_true',
+    help='use local dummy data, default: ARCO-ERA5 data',
+)
+
+args = parser.parse_args()
+timeframe = TIMEFRAMES[args.timeframe]
+
+if args.cluster == 'small':
+  from coiled import Cluster
+
+  cluster = Cluster(
+      region='us-central1',
+      spot_policy='spot_with_fallback',
+      n_workers=8,
+  )
+
+  client = cluster.get_client()
+elif args.cluster == 'arm':
+  from coiled import Cluster
+
+  cluster = Cluster(
+      region='us-central1',
+      spot_policy='spot_with_fallback',
+      arm=True,
+  )
+
+  client = cluster.get_client()
+  cluster.adapt(minimum=1, maximum=100)
+elif args.cluster == 'mem-opt':
+  from coiled import Cluster
+
+  cluster = Cluster(
+      region='us-central1',
+      spot_policy='spot_with_fallback',
+      worker_vm_types='m3-ultramem-32',  # 30.5 GiBs RAM per CPU, x86.
+  )
+
+  client = cluster.get_client()
+  cluster.adapt(minimum=1, maximum=25)
+
+elif args.cluster == 'local':
+  from dask.distributed import LocalCluster
+
+  cluster = LocalCluster(processes=False)
+  client = cluster.get_client()
+
+if args.fake:
+  era5_ds = rand_wx(timeframe).chunk({'time': 240, 'level': 1})
+else:
+  era5_ds = xr.open_zarr(
+      'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/',
+      chunks={'time': 240, 'level': 1},
+  )
+
+print('dataset opened.')
+
+era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(time=timeframe)
+
+print(f'sst_size={era5_sst_ds.nbytes / 2**30:.5f} GiBs')
+
+c = xql.XarrayContext()
+# `time=48` produces 190 MiB chunks
+# `time=96` produces 380 MiB chunks
+# `time=192` produces 760 MiB chunks
+# `time=240` produces 950 MiB chunks
+# `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU.
+time_chunks = 96  # four day chunks.
+if args.cluster == 'mem-opt':
+  time_chunks = 720  # one month chunks.
+c.from_dataset('era5', era5_sst_ds, chunks=dict(time=time_chunks))
+
+print('beginning query.')
+# TODO(alxmrs): `DATE` function is not supported in Apache Calcite out-of-the-box.
+df = c.sql(
+    """
+SELECT
+  "time",
+  AVG("sea_surface_temperature") as daily_avg_sst
+FROM
+  "era5"
+GROUP BY
+  "time"
+"""
+)
+
+# Store the results for visualization later on.
+now = np.datetime64('now', 's').astype(int)
+results_name = f'global_avg_sst_{args.timeframe}_{now}'
+if args.fake:
+  results_name = 'fake_' + results_name
+if args.cluster == 'local' or args.cluster == 'none':
+  df.to_pandas().to_csv(results_name + '.csv')
+else:
+  df.write_parquet(f'gs://xarray-sql-experiments/{results_name}/')
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,10 +32,18 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+io = [
+    "xarray[io]",
+    "gcsfs",
+]
 test = [
-  "pytest",
-  "xarray[io]",
-  "gcsfs",
+    "xarray_sql[io]",
+    "pytest",
+]
+
+demo = [
+    "xarray_sql[io]",
+    "coiled"
 ]
 
 [project.urls]

diff --git a/xarray_sql/df.py b/xarray_sql/df.py
@@ -5,7 +5,6 @@
 import pandas as pd
 import pyarrow as pa
 import xarray as xr
-from datafusion.context import ArrowStreamExportable
 
 Block = t.Dict[t.Hashable, slice]
 Chunks = t.Optional[t.Dict[str, int]]