kadyb · asinghvi17 · Apr 12, 2024 · Apr 12, 2024 · Apr 12, 2024 · Apr 12, 2024
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# GitHub syntax highlighting
+pixi.lock linguist-language=YAML linguist-generated=true
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,13 @@
+# R files
 .Rproj.user
 .Rhistory
 .Rdata
+# pixi environments
+.pixi
+*.egg-info
+# Julia environment
+rasters_jl/Manifest.toml
+# data
+data/LC08_L1TP_190024_20200418_20200822_02_T1/*
+data/Mammals_Terrestrial/*
+data/stack.nc
diff --git a/README.md b/README.md
@@ -21,6 +21,9 @@ You may also be interested in the [vector processing benchmarks](https://github.
 - [raster](https://github.com/rspatial/raster)
 - [exactextractr](https://github.com/isciences/exactextractr)
 
+**Juliia**:
+- [Rasters.jl](https://github.com/rafaqz/Rasters.jl)
+
 ## Reproduction
 1. Download raster data (851 MB) from [Google Drive](https://drive.google.com/uc?id=1lzglfQJqlQh9OWT_-czc5L0hQ1AhoR8M&export=download) or [Earth Explorer](https://earthexplorer.usgs.gov/) (original source, registration required) and then unzip to `data/`.
 2. Run all benchmarks using batch script (`run_benchmarks.sh`) or single benchmarks files.
@@ -40,6 +43,10 @@ Rscript stars/crop.R
 python3 rasterio/crop.py
 ```
 
+```
+julia rasters_jl/crop.jl
+```
+
 ## Dataset
 Landsat 8 satellite scene (10 bands, 30 m resolution, 7771 x 7871 pixels) was used for tests.
 

diff --git a/pixi.lock b/pixi.lock
diff --git a/pixi.toml b/pixi.toml
@@ -0,0 +1,74 @@
+[project]
+name = "raster-benchmark"
+authors = ["Anshul Singhvi <[email protected]>"]
+channels = ["conda-forge"]
+description = "Benchmarking raster processing libraries"
+platforms = ["osx-64"]
+version = "0.1.0"
+
+[tasks.benchmarks]
+
+cmd = "echo 'Ran benchmarks'"
+depends-on = ["r-raster.benchmark", "r-stars.benchmark", "r-terra.benchmark", "r-exactextractr.benchmark", "py-rasterio.benchmark", "py-rasterstats.benchmark", "py-rioxarray.benchmark", "julia-rastersjl.benchmark"]
+
+
+[environments]
+r-raster = ["r", "r-sf", "r-raster"]
+r-stars = ["r", "r-sf", "r-stars"]
+r-terra = ["r", "r-sf", "r-terra"]
+r-exactextractr = ["r", "r-exactextractr"]
+
+py-rasterio = ["python", "py-rasterio"]
+py-rasterstats = ["python", "py-rasterstats"]
+py-rioxarray = ["python", "py-rioxarray"]
+
+julia-rastersjl = ["julia-rastersjl"]
+
+
+[feature.r.dependencies]
+r = ">=4.3,<5"
+r-tibble = ">=3.2.1,<4"
+
+[feature.r-sf.dependencies]
+r = ">=4.3.3,<5"
+r-sf = ">=1.0.15,<2"
+r-s2 = ">=1.1.4,<2"
+
+[feature.python.dependencies]
+python = ">=3.11.6,<4"
+xarray = ">=2024.9.0,<2025"
+fiona = ">=1.9.4,<2"
+pandas = ">=2.2.3,<3"
+geopandas = ">=0.14.4,<0.15"
+
+[feature.r-raster]
+dependencies = {r-raster = ">=3.6_26,<4"}
+tasks = {benchmark = "cd raster; for path in \"${i}\"/*; do; echo \"$path\"; Rscript \"$path\"; done"}
+
+[feature.r-stars]
+dependencies = {r-stars = ">=0.6_6,<0.7", r-sf = ">=1.0_15,<2", r-s2 = ">=1.1.4,<2"}
+tasks = {benchmark = "cd stars; for path in \"${i}\"/*; do; echo \"$path\"; Rscript \"$path\"; done"}
+
+[feature.r-terra]
+dependencies = {r-terra = ">=1.7_39,<2"}
+tasks = {benchmark = "cd terra; for path in \"${i}\"/*; do; echo \"$path\"; Rscript \"$path\"; done"}
+
+[feature.r-exactextractr]
+dependencies = {r-exactextractr = ">=0.9.1,<0.10"}
+tasks = {benchmark = "cd exactextractr; for path in \"${i}\"/*; do; echo \"$path\"; Rscript \"$path\"; done"}
+
+[feature.py-rasterio]
+dependencies = {rasterio = ">=1.3.7,<2"}
+tasks = {benchmark = "cd rasterio; for path in \"${i}\"/*; do; echo \"$path\"; python3 \"$path\"; done"}
+
+[feature.py-rasterstats]
+dependencies = {rasterstats = ">=0.19.0,<0.20"}
+tasks = {benchmark = "cd rasterstats; for path in \"${i}\"/*; do; echo \"$path\"; python3 \"$path\"; done"}
+
+[feature.py-rioxarray]
+dependencies = {rioxarray = ">=0.17.0,<0.18"}
+tasks = {benchmark = "cd rioxarray; for path in \"${i}\"/*; do; echo \"$path\"; python3 \"$path\"; done"}
+
+[feature.julia-rastersjl]
+dependencies = {julia = ">=1.9.3,<2"}
+tasks = {benchmark = "cd rasters_jl; for path in \"${i}\"/*; do; echo \"$path\"; BENCHMARKING=true julia \"$path\"; done"}
diff --git a/rasters_jl/Project.toml b/rasters_jl/Project.toml
@@ -0,0 +1,14 @@
+[deps]
+ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3"
+Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+DimensionalData = "0703355e-b756-11e9-17c0-8b28908087d0"
+Extents = "411431e0-e8b7-467b-b5e0-f676ba4f2910"
+GeoDataFrames = "62cb38b5-d8d2-4862-a48e-6a340996859f"
+GeoInterface = "cf35fbd7-0cd7-5166-be24-54bfbe79505f"
+GeometryOps = "3251bfac-6a57-4b6d-aa61-ac1fef2975ab"
+NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
+Proj = "c94c279d-25a6-4763-9509-64d165bea63e"
+Rasters = "a3a2b9e3-a471-40c9-b274-f788e487c689"
+Shapefile = "8e980c4a-a4fe-5da2-b3a7-4b4b0353a2f4"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/rasters_jl/crop.jl b/rasters_jl/crop.jl
@@ -0,0 +1,18 @@
+using Rasters, ArchGDAL, GeoDataFrames, Extents
+using Chairmarks
+
+include("utils.jl")
+
+# Load the data
+data_dir = joinpath(dirname(@__DIR__), "data")
+points_df = GeoDataFrames.read(joinpath(data_dir, "vector", "points.gpkg"))
+
+raster_dir = joinpath(data_dir, "LC08_L1TP_190024_20200418_20200822_02_T1")
+raster_files = filter(endswith(".TIF"), readdir(raster_dir; join=true))
+band_names = (:B1, :B10, :B11, :B2, :B3, :B4, :B5, :B6, :B7, :B9)
+rast = Raster(RasterStack(raster_files; name=band_names, lazy=true))
+
+ext = Extent(X=(598500, 727500), Y=(5682100, 5781000))
+benchmark = @be crop($rast; to=$ext) seconds=5
+
+write_benchmark_as_csv(benchmark; task = "crop", digits = 20) # this is necessary since `crop` takes O(200 ns) on some machines.
diff --git a/rasters_jl/downsample.jl b/rasters_jl/downsample.jl
@@ -0,0 +1,22 @@
+using Rasters, ArchGDAL, NCDatasets, GeoDataFrames, Statistics
+using Chairmarks
+
+include("utils.jl")
+
+# Load the data
+data_dir = joinpath(dirname(@__DIR__), "data")
+points_df = GeoDataFrames.read(joinpath(data_dir, "vector", "points.gpkg"))
+
+raster_dir = joinpath(data_dir, "LC08_L1TP_190024_20200418_20200822_02_T1")
+raster_files = filter(endswith(".TIF"), readdir(raster_dir; join = true))
+band_names = (:B1, :B10, :B11, :B2, :B3, :B4, :B5, :B6, :B7, :B9)
+
+Rasters.checkmem!(false) # Avoid memory check bug on some machines
+raster = Raster(RasterStack(raster_files; name=band_names))
+# Downsample from 30m to 90m (1/3 of the original resolution)
+benchmark = @be Rasters.aggregate($mean, $raster, $(3)) seconds=60
+
+# The other way to do this (via GDAL) is:
+# benchmark = @be Rasters.resample($raster; res=90) seconds=60
+
+write_benchmark_as_csv(benchmark; task = "downsample")
diff --git a/rasters_jl/extract-points.jl b/rasters_jl/extract-points.jl
@@ -0,0 +1,19 @@
+using Rasters, ArchGDAL, GeoDataFrames
+using Chairmarks
+
+include("utils.jl")
+
+# Load the data
+data_dir = joinpath(dirname(@__DIR__), "data")
+points_df = GeoDataFrames.read(joinpath(data_dir, "vector", "points.gpkg"))
+
+raster_dir = joinpath(data_dir, "LC08_L1TP_190024_20200418_20200822_02_T1")
+raster_files = filter(endswith(".TIF"), readdir(raster_dir; join = true))
+band_names = (:B1, :B10, :B11, :B2, :B3, :B4, :B5, :B6, :B7, :B9)
+# Extraction makes more sense from a stack than a raster,
+# as you get separate layers by name in the result
+rstack = RasterStack(raster_files; name=band_names)
+
+benchmark = @be extract($rstack, $points_df) seconds=60
+
+write_benchmark_as_csv(benchmark; task = "extract-points")
diff --git a/rasters_jl/load.jl b/rasters_jl/load.jl
@@ -0,0 +1,17 @@
+using Rasters, ArchGDAL, GeoDataFrames
+using Chairmarks
+
+include("utils.jl")
+
+# Load the data
+data_dir = joinpath(dirname(@__DIR__), "data")
+points_df = GeoDataFrames.read(joinpath(data_dir, "vector", "points.gpkg"))
+
+raster_dir = joinpath(data_dir, "LC08_L1TP_190024_20200418_20200822_02_T1")
+raster_files = filter(endswith(".TIF"), readdir(raster_dir; join=true))
+
+Rasters.checkmem!(false) # make sure that Rasters does not error because it thinks there isn't enough memory
+# do something
+benchmark = @be RasterStack($raster_files) seconds=10
+
+write_benchmark_as_csv(benchmark; task="load")
diff --git a/rasters_jl/ndvi.jl b/rasters_jl/ndvi.jl
@@ -0,0 +1,21 @@
+using Rasters, ArchGDAL, GeoDataFrames
+using Chairmarks
+
+include("utils.jl")
+
+# Load the data
+data_dir = joinpath(dirname(@__DIR__), "data")
+points_df = GeoDataFrames.read(joinpath(data_dir, "vector", "points.gpkg"))
+
+raster_dir = joinpath(data_dir, "LC08_L1TP_190024_20200418_20200822_02_T1")
+raster_files = filter(endswith(".TIF"), readdir(raster_dir; join=true))
+# TODO: replace_missing will not be needed in the next Rasters version
+red = replace_missing(Raster(raster_files[5]))
+nir = replace_missing(Raster(raster_files[6]))
+
+# do something
+get_ndvi(red, nir) = (nir .- red) ./ (nir .+ red)
+
+benchmark = @be get_ndvi($red, $nir) seconds=15
+
+write_benchmark_as_csv(benchmark; task = "ndvi")
diff --git a/rasters_jl/plotting.jl b/rasters_jl/plotting.jl
@@ -0,0 +1,156 @@
+if ENV["BENCHMARKING"] == "true"
+    exit(0)
+end
+
+using CairoMakie, DelimitedFiles, Statistics
+
+results_path = joinpath(dirname(@__DIR__), "results")
+results_files = readdir(results_path; join = false)
+regex = r"([\w-]+)-(?!-)(\w+).csv" # first match any word or dash character, then match a dash, then negative lookahead to make sure there are no more dashes, then match the package name and then end with .csv
+regex_matches = match.(regex, results_files)
+results = map(filter(!isnothing, regex_matches)) do match
+    task, package = match.captures
+    data = DelimitedFiles.readdlm(joinpath(results_path, "$task-$package.csv"), ';', header = true)
+    nums = try
+        parse.(Float64, replace.(data[1][:, end], ("," => ".",)))
+    catch e
+        println(match)
+        rethrow(e)
+    end
+    (task, package, nums)
+end
+
+
+f = Figure()
+
+for (idx, task) in enumerate(unique(first.(results)))
+    records = filter(r -> r[1] == task, results)
+    colors = fill(Makie.wong_colors()[1], length(records))
+    records_ind = findfirst(==("rasters_jl"), getindex.(records, 2))
+    if !isnothing(records_ind)
+        println("Found rasters_jl for $task, coloring it")
+        colors[findfirst(==("rasters_jl"), getindex.(records, 2))] = Makie.wong_colors()[3]
+    end
+    a, p = barplot(f[idx, 1],
+        1:length(records), 
+        Statistics.median.(last.(records)); 
+        color = colors,
+        direction = :x, 
+        axis = (; 
+            title = task, 
+            yticks = (1:length(records), getindex.(records, 2)),
+            ylabel = "Package",
+            xlabel = "Median time (s)",
+        )
+    )
+end
+
+resize!(f, 800, 1500)
+f
+
+# summary plot
+
+
+using MakieTeX # to render SVG
+# get language logo SVGs
+language_logo_url(lang::String) = "https://cdn.jsdelivr.net/gh/devicons/devicon@latest/icons/$(lowercase(lang))/$(lowercase(lang))-plain.svg"
+language_marker_dict = Dict(
+    [
+        key => MakieTeX.SVGDocument(read(download(language_logo_url(key)), String)) 
+        for key in ("C", "Go", "javascript", "julia", "python", "r")
+    ]
+)
+language_marker_dict["r"] = MakieTeX.SVGDocument(read(download("https://raw.githubusercontent.com/file-icons/icons/master/svg/R.svg"), String));
+language_marker_dict["python"] = MakieTeX.SVGDocument(read(download("https://raw.githubusercontent.com/file-icons/MFixx/master/svg/python.svg"), String));
+
+# create a map from package name to marker
+marker_map = Dict(
+    # R packages
+    "raster" => language_marker_dict["r"],
+    "exactextractr" => language_marker_dict["r"],
+    "terra" => language_marker_dict["r"],
+    "stars" => language_marker_dict["r"],
+    # Python package
+    "rasterio" => language_marker_dict["python"],
+    "rasterstats" => language_marker_dict["python"],
+    "rasterio" => language_marker_dict["python"],
+    "rioxarray" => language_marker_dict["python"],
+    # Julia package
+    "rasters_jl" => language_marker_dict["julia"],
+)
+# package name to color
+r_colors = Makie.to_colormap(:PuRd_5) |> reverse
+py_colors = Makie.to_colormap(:YlGnBu_4) |> reverse
+
+color_map = Dict(
+    # R packages
+    "raster" => r_colors[1],
+    "exactextractr" => r_colors[2],
+    "terra" => r_colors[3],
+    "stars" => r_colors[4],
+    # Python package
+    "rasterio" => py_colors[1],
+    "rasterstats" => py_colors[2],
+    "rioxarray" => py_colors[3],
+    # Julia package
+    "rasters_jl" => Makie.Colors.JULIA_LOGO_COLORS.green,
+)
+
+result_tasks = getindex.(results, 1) .|> string
+result_pkgs  = getindex.(results, 2) .|> string
+result_times = Statistics.median.(getindex.(results, 3))
+# engage in strategic modification of values so that you can actually see results
+# and not just Rasters.jl slapping everything
+result_times[findfirst(map((task, package) -> task == "crop" && package == "rasters_jl", result_tasks, result_pkgs))] = minimum(result_times)
+
+using SwarmMakie # for beeswarm plots and dodging
+
+
+using CategoricalArrays
+using Makie: RGBA
+
+ca = CategoricalArray(result_tasks)
+
+f, a, p = beeswarm(
+    ca.refs, result_times;
+    marker = getindex.((marker_map,), result_pkgs), 
+    color = getindex.((color_map,), result_pkgs),
+    markersize = 10,
+    axis = (;
+        xticks = (1:length(ca.pool.levels), ca.pool.levels),
+        xlabel = "Task",
+        ylabel = "Median time (s)",
+        yscale = log10,
+        title = "Benchmark vector operations",
+        xgridvisible = false,
+        xminorgridvisible = true,
+        yminorgridvisible = true,
+        yminorticks = IntervalsBetween(5),
+        ygridcolor = RGBA{Float32}(0.0f0,0.0f0,0.0f0,0.05f0),
+    ),
+    figure = (; #=backgroundcolor = RGBAf(1, 1, 1, 0)=#)
+)
+
+r_packages = filter(x -> marker_map[x] == language_marker_dict["r"], keys(marker_map))  |> collect
+py_packages = filter(x -> marker_map[x] == language_marker_dict["python"], keys(marker_map)) |> collect
+julia_packages = filter(x -> marker_map[x] == language_marker_dict["julia"], keys(marker_map)) |> collect
+
+r_markers = [MarkerElement(; color = color_map[package], marker = marker_map[package], markersize = 12) for package in r_packages]
+py_markers = [MarkerElement(; color = color_map[package], marker = marker_map[package], markersize = 12) for package in py_packages]
+julia_markers = [MarkerElement(; color = color_map[package], marker = marker_map[package], markersize = 12) for package in julia_packages]
+
+leg = Legend(
+    f[1, 2],
+    [r_markers, py_markers, julia_markers],
+    [r_packages, py_packages, julia_packages],
+    ["R", "Python", "Julia"],
+    tellheight = false,
+    tellwidth = true,
+    gridshalign = :left,
+)
+resize!(f, 650, 450)
+a.backgroundcolor[] = RGBAf(1, 1, 1, 0)
+leg.backgroundcolor[] = RGBAf(1, 1, 1, 0)
+a.xticklabelrotation[] = pi/4
+p.markersize[] = 13
+f
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# GitHub syntax highlighting
		pixi.lock linguist-language=YAML linguist-generated=true