Hier ist ein minimales Arbeitsbeispiel, das meinem realen Setup entspricht:
Code: Select all
import numpy as np
import xarray as xr
import dask.array as da
import zarr
import zipfile
# Simulate a large dataset. The dask array is lazy: nothing is materialized
# until a computation is explicitly requested.
pos_len = 100_000_000  # rows
sample_len = 100  # samples
chunks = (100_000, 100)
data = da.random.random((pos_len, sample_len), chunks=chunks)
xds = xr.Dataset(
    {"some_var": (("pos", "sample_id"), data)},
    coords={"pos": np.arange(pos_len), "sample_id": np.arange(sample_len)},
)
# Build a boolean mask based on mean coverage (one value per "pos" row).
coverage_array = "some_var"
min_coverage = 0.5
mask_1d = xds[coverage_array].mean(dim="sample_id", skipna=True) >= min_coverage
# Drop rows where the mask is False.
# The mask has to be concrete before it can be used to select rows, because
# the resulting length along "pos" is not known while it is still lazy.
keep_rows = mask_1d.compute().values
# Select the rows directly instead of `.where(..., other=np.nan, drop=True)`:
# xarray documents `other` and `drop=True` as mutually exclusive, and `where`
# would additionally broadcast the mask against every variable. Indexing along
# "pos" with the 1-D boolean mask keeps the data lazy and only drops rows.
cds_masked = xds.isel(pos=keep_rows)
Mobile version