Reading geospatial
Common examples for reading geospatial data in Fused.
Python Packages
geopandas
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/subway_stations.geojson"):
    """Read a GeoJSON file from S3 into a GeoDataFrame."""
    import geopandas as gpd

    gdf = gpd.read_file(path)
    return gdf
shapely
@fused.udf
def udf():
    """Build a GeoDataFrame from geometries created in code with shapely."""
    import geopandas as gpd
    from shapely.geometry import Point, Polygon

    # Two point geometries plus one rectangular polygon
    geoms = [
        Point(-122.4, 37.8),
        Point(-122.3, 37.7),
        Polygon([(-122.5, 37.7), (-122.3, 37.7), (-122.3, 37.9), (-122.5, 37.9)]),
    ]
    labels = ['point', 'point', 'polygon']
    gdf = gpd.GeoDataFrame({'type': labels}, geometry=geoms, crs=4326)
    return gdf
rioxarray
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/elevation.tif"):
    """Open a raster with rioxarray and return the first 1000 rows as a DataFrame."""
    import rioxarray as rxr

    data_array = rxr.open_rasterio(path)
    # Flatten the raster into tabular form so it can be displayed
    flattened = data_array.to_dataframe().reset_index()
    return flattened.head(1000)
xarray
@fused.udf
def udf():
    """Download an ERA5 NetCDF file and return its contents as a DataFrame."""
    import xarray as xr

    # NetCDF readers need a local file, so download to the mount disk first
    local_path = fused.download(
        's3://fused-sample/demo_data/2025_01_01_ERA5_surface.nc',
        '2025_01_01_ERA5_surface.nc',
    )
    dataset = xr.open_dataset(local_path)
    flattened = dataset.to_dataframe().reset_index()
    return flattened.head(1000)
Vector Formats
GeoJSON (.geojson, .json)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/table/US_states.geojson"):
    """Load a GeoJSON (.geojson/.json) file into a GeoDataFrame."""
    import geopandas as gpd

    states = gpd.read_file(path)
    return states
Shapefile (.shp + .shx, .dbf, .prj)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/table/US_states_shapefile.shp"):
    """Load a Shapefile into a GeoDataFrame.

    The sidecar files (.shx, .dbf, .prj) are expected next to the .shp.
    """
    import geopandas as gpd

    return gpd.read_file(path)
GeoPackage (.gpkg)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/table/US_states_geopackage.gpkg"):
    """Load a GeoPackage (.gpkg) file into a GeoDataFrame."""
    import geopandas as gpd

    gdf = gpd.read_file(path)
    return gdf
KML/KMZ (.kml, .kmz)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/table/US_states.kml"):
    """Load a KML/KMZ file into a GeoDataFrame."""
    import geopandas as gpd

    return gpd.read_file(path)
GeoParquet (.parquet)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/buildings.parquet"):
    """Load a GeoParquet file into a GeoDataFrame."""
    import geopandas as gpd

    buildings = gpd.read_parquet(path)
    return buildings
Large Geoparquet (.parquet with subset)
Why not just use gpd.read_parquet()?
gpd.read_parquet() loads the ENTIRE file into memory before you can call .head() — for large files (100MB+) this causes Out of Memory errors in a realtime UDF run
Solution: Use DuckDB to push LIMIT down to the read level so only n_rows are ever loaded into memory, then convert geometry from WKB to proper Shapely objects via DuckDB's spatial extension (ST_AsText).
@fused.udf
def udf(
    path: str = "s3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-2/CZ.parquet",
    n_rows: int = 1000,
):
    """Read only the first `n_rows` of a large GeoParquet file via DuckDB.

    DuckDB pushes LIMIT down into the parquet scan, so only `n_rows` rows
    are materialized in memory — unlike gpd.read_parquet, which loads the
    entire file before any .head() call.
    """
    import geopandas as gpd
    common = fused.load("https://github.com/fusedio/udfs/tree/9bad664/public/common/")
    @fused.cache
    def load_large_geoparquet(p, limit):
        con = common.duckdb_connect()
        # In this example, bucket is in us-west-2; set region so S3 requests don't use default us-east-1
        con.sql("SET s3_region='us-west-2';")
        # 1. Detect the geometry column (DuckDB reads it as BLOB/GEOMETRY)
        col_info = con.sql(
            f"SELECT column_name FROM (DESCRIBE SELECT * FROM read_parquet('{p}')) "
            f"WHERE column_type IN ('GEOMETRY', 'BLOB', 'WKB_GEOMETRY')"
        ).fetchall()
        # Fall back to the conventional column name when detection finds nothing
        geom_col = col_info[0][0] if col_info else "geometry"
        # 2. Read only `limit` rows and convert geometry BLOB -> WKT text
        # LIMIT is pushed down so only `limit` rows are ever in memory
        df = con.sql(
            f"SELECT *, ST_AsText({geom_col}) AS __wkt "
            f"FROM read_parquet('{p}') LIMIT {limit}"
        ).df()
        # 3. Parse WKT into Shapely geometry objects and build GeoDataFrame
        df[geom_col] = gpd.GeoSeries.from_wkt(df["__wkt"])
        df = df.drop(columns=["__wkt"])
        # NOTE(review): assumes the source data is EPSG:4326 — confirm for other datasets
        return gpd.GeoDataFrame(df, geometry=geom_col, crs=4326)
    gdf = load_large_geoparquet(path, n_rows)
    print(gdf.T)
    return gdf
CSV with coordinates (.csv)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/table/subway_stations.csv"):
    """Read a CSV with longitude/latitude columns into a GeoDataFrame.

    Assumes the CSV has `longitude` and `latitude` columns holding
    EPSG:4326 coordinates.
    """
    import geopandas as gpd
    import pandas as pd

    # Read CSV
    df = pd.read_csv(path)
    # Build point geometries from the coordinate columns
    # (the previous `Point` import was unused: points_from_xy does the work)
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.longitude, df.latitude),
        crs=4326,
    )
    return gdf
Excel with coordinates (.xlsx)
@fused.udf
def udf(path: str = "s3://fused-sample/demo_data/table/subway_stations.xlsx"):
    """Read an Excel file; return a GeoDataFrame when coordinate columns exist.

    Falls back to returning the plain DataFrame when the sheet has no
    `longitude`/`latitude` columns.
    """
    import geopandas as gpd
    import pandas as pd

    # Read Excel file
    df = pd.read_excel(path)
    # Only promote to a GeoDataFrame when both coordinate columns are present
    # (the previous `Point` import was unused: points_from_xy does the work)
    if 'longitude' in df.columns and 'latitude' in df.columns:
        return gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.longitude, df.latitude),
            crs=4326,
        )
    return df
Raster Formats
GeoTIFF (.tif, .tiff)
@fused.udf
def udf(
    path: str = 's3://fused-sample/demo_data/satellite_imagery/wildfires.tiff'
):
    """Read a GeoTIFF with rasterio, returning the pixel array and its bounds."""
    import rasterio

    # Context manager closes the dataset once the array and bounds are read
    with rasterio.open(path) as dataset:
        pixels = dataset.read()
        extent = dataset.bounds
    return pixels, extent
NetCDF (.nc)
@fused.udf
def udf():
    """Download a NetCDF climate file and return its first 1000 rows."""
    import xarray as xr

    # NetCDF readers need a local file, so stage it on the mount disk first
    local_path = fused.download(
        's3://fused-sample/demo_data/climate_data.nc', 'climate_data.nc'
    )
    dataset = xr.open_dataset(local_path)
    return dataset.to_dataframe().reset_index().head(1000)
STAC Catalogs
For working with STAC catalogs (Earth on AWS, Microsoft Planetary Computer), see STAC.