Source code for daisy.datasets

from __future__ import absolute_import, division
from .array import Array
from .coordinate import Coordinate
from .ext import zarr, h5py
from .klb_adaptor import KlbAdaptor
from .roi import Roi
import json
import logging
import os
import shutil
import tempfile

logger = logging.getLogger(__name__)


# Monkey-patch tempfile to automatically set temporary file permissions so
# that other users can access our data, until zarr fixes this issue:
# https://github.com/zarr-developers/zarr/issues/325


def UmaskNamedTemporaryFile(*args, **kwargs):
    fdesc = tempfile.NamedTemporaryFile2(*args, **kwargs)
    # os.umask sets the umask and returns the previous one, so setting it to 0
    # and immediately restoring it reads the current value
    umask = os.umask(0)
    os.umask(umask)
    # widen the file permissions to 0666, minus the umask bits
    os.chmod(fdesc.name, 0o666 & ~umask)
    return fdesc


tempfile.NamedTemporaryFile2 = tempfile.NamedTemporaryFile
tempfile.NamedTemporaryFile = UmaskNamedTemporaryFile
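
# Illustrative sketch: with the patch in place, files created through
# tempfile.NamedTemporaryFile carry permissions 0o666 minus the umask bits,
# e.g. 0o644 under the common umask 0o022:
#
#   import stat
#   with tempfile.NamedTemporaryFile() as f:
#       print(oct(stat.S_IMODE(os.stat(f.name).st_mode)))  # '0o644'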


def _read_voxel_size_offset(ds, order='C'):

    voxel_size = None
    offset = None
    dims = None

    if 'resolution' in ds.attrs:
        voxel_size = tuple(ds.attrs['resolution'])
        dims = len(voxel_size)
    elif 'pixelResolution' in ds.attrs:
        voxel_size = tuple(ds.attrs['pixelResolution']['dimensions'])
        dims = len(voxel_size)
    elif 'transform' in ds.attrs:
        voxel_size = tuple(ds.attrs['transform']['scale'])
        dims = len(voxel_size)

    if 'offset' in ds.attrs:

        offset = tuple(ds.attrs['offset'])

        if dims is not None:
            assert dims == len(offset), (
                "resolution and offset attributes differ in length")
        else:
            dims = len(offset)

    if dims is None:
        dims = len(ds.shape)

    if voxel_size is None:
        voxel_size = (1,)*dims

    if offset is None:
        offset = (0,)*dims

    if order == 'F':
        offset = offset[::-1]
        voxel_size = voxel_size[::-1]

    return Coordinate(voxel_size), Coordinate(offset)
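
# Illustrative sketch of the attribute conventions handled above (the dataset
# and values are hypothetical):
#
#   ds.attrs['resolution'] = [40, 4, 4]   # zarr/HDF5 store C-order attributes
#   ds.attrs['offset'] = [0, 120, 120]
#   _read_voxel_size_offset(ds, 'C')      # -> ((40, 4, 4), (0, 120, 120))
#
# N5 stores the same attributes in F order, so open_ds passes order='F' and
# both tuples are reversed before being returned as Coordinates.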


def open_ds(filename, ds_name, mode='r', attr_filename=None):
    '''Open a Zarr, N5, or HDF5 dataset as a :class:`daisy.Array`. If the
    dataset has attributes ``resolution`` and ``offset``, those will be used
    to determine the meta-information of the returned array.

    Args:

        filename (``string``):

            The name of the container "file" (which is a directory for Zarr
            and N5).

        ds_name (``string``):

            The name of the dataset to open.

        mode (``string``, optional):

            The mode to open the dataset in. Defaults to ``'r'``.

        attr_filename (``string``, optional):

            KLB only: the name of the attributes json file. Default is
            "attributes.json".

    Returns:

        A :class:`daisy.Array` pointing to the dataset.
    '''

    if filename.endswith('.zarr'):

        logger.debug("opening zarr dataset %s in %s", ds_name, filename)
        try:
            ds = zarr.open(filename, mode=mode)[ds_name]
        except Exception as e:
            logger.error("failed to open %s/%s" % (filename, ds_name))
            raise e

        voxel_size, offset = _read_voxel_size_offset(ds, ds.order)
        shape = Coordinate(ds.shape[-len(voxel_size):])
        roi = Roi(offset, voxel_size*shape)

        chunk_shape = ds.chunks

        logger.debug("opened zarr dataset %s in %s", ds_name, filename)
        return Array(ds, roi, voxel_size, chunk_shape=chunk_shape)

    elif filename.endswith('.n5'):

        logger.debug("opening N5 dataset %s in %s", ds_name, filename)
        ds = zarr.open(filename, mode=mode)[ds_name]

        voxel_size, offset = _read_voxel_size_offset(ds, 'F')
        shape = Coordinate(ds.shape[-len(voxel_size):])
        roi = Roi(offset, voxel_size*shape)

        chunk_shape = ds.chunks

        logger.debug("opened N5 dataset %s in %s", ds_name, filename)
        return Array(ds, roi, voxel_size, chunk_shape=chunk_shape)

    elif filename.endswith('.h5') or filename.endswith('.hdf'):

        logger.debug("opening H5 dataset %s in %s", ds_name, filename)
        ds = h5py.File(filename, mode=mode)[ds_name]

        voxel_size, offset = _read_voxel_size_offset(ds, 'C')
        shape = Coordinate(ds.shape[-len(voxel_size):])
        roi = Roi(offset, voxel_size*shape)

        chunk_shape = ds.chunks

        logger.debug("opened H5 dataset %s in %s", ds_name, filename)
        return Array(ds, roi, voxel_size, chunk_shape=chunk_shape)

    elif filename.endswith('.json'):

        logger.debug("found JSON container spec")
        with open(filename, 'r') as f:
            spec = json.load(f)

        array = open_ds(spec['container'], ds_name, mode)
        return Array(
            array.data,
            Roi(spec['offset'], spec['size']),
            array.voxel_size,
            array.roi.begin,
            chunk_shape=array.chunk_shape)

    elif filename.endswith('.klb'):

        logger.debug("opening KLB dataset %s", filename)
        adaptor = KlbAdaptor(filename, attr_filename=attr_filename)

        return Array(
            adaptor,
            adaptor.roi,
            adaptor.voxel_size,
            adaptor.roi.begin,
            chunk_shape=adaptor.chunk_shape)

    else:

        logger.error("don't know data format of %s in %s", ds_name, filename)
        raise RuntimeError("Unknown file format for %s" % filename)
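
# Illustrative usage of open_ds (the container and dataset names are
# hypothetical):
#
#   raw = open_ds('sample.zarr', 'volumes/raw')
#   print(raw.roi)         # ROI of the dataset in world units
#   print(raw.voxel_size)  # e.g. (40, 4, 4)
#   data = raw.to_ndarray(raw.roi)  # read the whole dataset into memory
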
def prepare_ds(
        filename,
        ds_name,
        total_roi,
        voxel_size,
        dtype,
        write_roi=None,
        write_size=None,
        num_channels=None,
        compressor='default',
        delete=False,
        force_exact_write_size=False):
    '''Prepare a Zarr or N5 dataset.

    Args:

        filename (``string``):

            The name of the container "file" (which is actually a directory).

        ds_name (``string``):

            The name of the dataset to prepare.

        total_roi (:class:`daisy.Roi`):

            The ROI of the dataset to prepare in world units.

        voxel_size (:class:`daisy.Coordinate`):

            The size of one voxel in the dataset in world units.

        dtype (data-type):

            The dtype of the dataset to create.

        write_size (:class:`daisy.Coordinate`):

            The size of anticipated writes to the dataset, in world units. The
            chunk size of the dataset will be set such that ``write_size`` is
            a multiple of it. This allows concurrent writes to the dataset if
            the writes are aligned with ``write_size``.

        num_channels (``int``, optional):

            The number of channels.

        compressor (``string``, optional):

            The compressor to use. See `zarr.get_codec` for available options.
            Defaults to gzip level 5.

        delete (``bool``, optional):

            Whether to delete an existing dataset if it was found to be
            incompatible with the other requirements. The default is not to
            delete the dataset and raise an exception instead.

        force_exact_write_size (``bool``, optional):

            Whether to use `write_size` as-is, or to first process it with
            `get_chunk_shape`.

    Returns:

        A :class:`daisy.Array` pointing to the newly created dataset.
    '''

    voxel_size = Coordinate(voxel_size)
    if write_size is not None:
        write_size = Coordinate(write_size)

    assert total_roi.shape.is_multiple_of(voxel_size), (
        "The provided ROI shape is not a multiple of voxel_size")
    assert total_roi.begin.is_multiple_of(voxel_size), (
        "The provided ROI offset is not a multiple of voxel_size")

    if write_roi is not None:

        logger.warning(
            "write_roi is deprecated, please use write_size instead")

        if write_size is None:
            write_size = write_roi.shape

    if write_size is not None:
        assert write_size.is_multiple_of(voxel_size), (
            "The provided write ROI shape is not a multiple of voxel_size")

    if compressor == 'default':
        compressor = {'id': 'gzip', 'level': 5}

    ds_name = ds_name.lstrip('/')

    if filename.endswith('.h5') or filename.endswith('.hdf'):
        raise RuntimeError("prepare_ds does not support HDF5 files")
    elif filename.endswith('.zarr'):
        file_format = 'zarr'
    elif filename.endswith('.n5'):
        file_format = 'n5'
    else:
        raise RuntimeError("Unknown file format for %s" % filename)

    if write_size is not None:
        if not force_exact_write_size:
            chunk_shape = get_chunk_shape(write_size/voxel_size)
        else:
            chunk_shape = write_size/voxel_size
    else:
        chunk_shape = None

    shape = total_roi.shape/voxel_size

    if num_channels is not None:
        shape = (num_channels,) + shape
        if chunk_shape is not None:
            chunk_shape = Coordinate((num_channels,) + chunk_shape)
        voxel_size_with_channels = Coordinate((1,) + voxel_size)

    if not os.path.isdir(filename):

        logger.info("Creating new %s", filename)
        os.makedirs(filename)

        zarr.open(filename, mode='w')

    if not os.path.isdir(os.path.join(filename, ds_name)):

        logger.info("Creating new %s in %s", ds_name, filename)

        if compressor is not None:
            compressor = zarr.get_codec(compressor)

        root = zarr.open(filename, mode='a')
        ds = root.create_dataset(
            ds_name,
            shape=shape,
            chunks=chunk_shape,
            dtype=dtype,
            compressor=compressor)

        if file_format == 'zarr':
            ds.attrs['resolution'] = voxel_size
            ds.attrs['offset'] = total_roi.begin
        else:
            ds.attrs['resolution'] = voxel_size[::-1]
            ds.attrs['offset'] = total_roi.begin[::-1]

        if chunk_shape is not None:
            if num_channels is not None:
                chunk_shape = chunk_shape/voxel_size_with_channels
            else:
                chunk_shape = chunk_shape/voxel_size

        return Array(ds, total_roi, voxel_size, chunk_shape=chunk_shape)

    else:

        logger.debug(
            "Trying to reuse existing dataset %s in %s...", ds_name, filename)
        ds = open_ds(filename, ds_name, mode='a')

        compatible = True

        if ds.shape != shape:
            logger.info("Shapes differ: %s vs %s", ds.shape, shape)
            compatible = False

        if ds.roi != total_roi:
            logger.info("ROIs differ: %s vs %s", ds.roi, total_roi)
            compatible = False

        if ds.voxel_size != voxel_size:
            logger.info(
                "Voxel sizes differ: %s vs %s", ds.voxel_size, voxel_size)
            compatible = False

        if write_size is not None and ds.data.chunks != chunk_shape:
            logger.info(
                "Chunk shapes differ: %s vs %s", ds.data.chunks, chunk_shape)
            compatible = False

        if dtype != ds.dtype:
            logger.info(
                "dtypes differ: %s vs %s", ds.dtype, dtype)
            compatible = False

        if not compatible:

            if not delete:
                raise RuntimeError(
                    "Existing dataset is not compatible, please manually "
                    "delete the volume at %s/%s" % (filename, ds_name))

            logger.info("Existing dataset is not compatible, creating new one")
            shutil.rmtree(os.path.join(filename, ds_name))
            return prepare_ds(
                filename=filename,
                ds_name=ds_name,
                total_roi=total_roi,
                voxel_size=voxel_size,
                dtype=dtype,
                write_size=write_size,
                num_channels=num_channels,
                compressor=compressor)

        else:

            logger.info("Reusing existing dataset")
            return ds
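
# Illustrative usage of prepare_ds (filenames and numbers are hypothetical):
#
#   import numpy as np
#   out = prepare_ds(
#       'output.zarr', 'volumes/pred',
#       total_roi=Roi((0, 0, 0), (1000, 1000, 1000)),  # world units
#       voxel_size=(10, 10, 10),
#       dtype=np.float32,
#       write_size=(200, 200, 200))                    # world units
#
# The chunk shape is derived from write_size/voxel_size = (20, 20, 20), so
# concurrent writes aligned to 200-unit blocks never touch the same chunk.
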
def get_chunk_shape(block_shape):
    '''Get a reasonable chunk size that divides the given block size.'''

    chunk_shape = Coordinate(
        get_chunk_size_dim(b, 256)
        for b in block_shape)

    logger.debug("Setting chunk size to %s", chunk_shape)

    return chunk_shape


def get_chunk_size_dim(b, target_chunk_size):

    best_k = None
    best_target_diff = 0

    for k in range(1, b+1):

        # ((b//k)*k) % b == 0 holds exactly when k divides b
        if ((b//k)*k) % b == 0:

            diff = abs(b//k - target_chunk_size)
            if best_k is None or diff < best_target_diff:
                best_target_diff = diff
                best_k = k

    return b//best_k
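
# Worked example: get_chunk_size_dim picks, among all divisors of b, the one
# closest to the target chunk size, so
#
#   get_chunk_size_dim(512, 256)      # -> 256 (k=2 divides 512)
#   get_chunk_size_dim(100, 256)      # -> 100 (no divisor is closer to 256)
#   get_chunk_shape((512, 100, 100))  # -> Coordinate (256, 100, 100)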