I’m trying to read ORAS5 sea surface height data with xarray, and it seems to be struggling at the concatenation step. All the files can be read individually with xr.open_dataset, and the ones I’ve checked look fine, but when I try to read them all at once with xr.open_mfdataset and then call .load(), I get OSError: [Errno 22] Invalid argument.
It seems to come down to the number of files: opening 362 files is fine, but opening 363 is not, and I can’t see why. Has anyone seen anything similar before?
import glob
import xarray as xr
# Minimal reproduction: collect every ORAS5 sea-surface-height NetCDF file
# under the opa0 directory tree (one sub-directory level, then *.nc).
# NOTE(review): glob.glob is documented to return matches in arbitrary order,
# so the slices below may pick different files from run to run -- consider
# sorting the list to make the reproduction deterministic; TODO confirm.
files = glob.glob('/Users/u6955431/Large_Datasets/ORAS5/oras5/ORCA025/sossheig/opa0/*/*.nc')
# Opening the first 363 files as a single combined dataset succeeds ...
ds = xr.open_mfdataset(files[:363], compat='override',coords='minimal') # opens without error
# ... but materialising the data raises the error shown in the traceback below.
ds.load() # fails: OSError: [Errno 22] Invalid argument
# Both 362-file windows load without error.
xr.open_mfdataset(files[:362], compat='override',coords='minimal').load() # OK: first 362 files
xr.open_mfdataset(files[1:363], compat='override',coords='minimal').load() # OK: 362 files, window shifted by one
# A different 363-file window fails too, so the failure tracks the number of
# files (presumably the total data volume -- unconfirmed) rather than one bad file.
xr.open_mfdataset(files[1:364], compat='override',coords='minimal').load() # fails: 363 files again
Technical details
- I’m running this on my laptop so can’t share an example notebook easily.
- I’m using dask, but get the same problem without it
- As a workaround, I’m just loading both halves of the dataset individually and concatenating them together, but it would be nice to know what’s going on and be able to do it properly
Error message:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Cell In[5], line 5
2 files = glob.glob('/Users/u6955431/Large_Datasets/ORAS5/oras5/ORCA025/sossheig/opa0/*/*.nc')
4 ds = xr.open_mfdataset(files[:363], compat='override',coords='minimal') # all good
----> 5 ds.load() # not good (error message below)
File ~/mambaforge/lib/python3.10/site-packages/xarray/core/dataset.py:853, in Dataset.load(self, **kwargs)
850 chunkmanager = get_chunked_array_type(*lazy_data.values())
852 # evaluate all the chunked arrays simultaneously
--> 853 evaluated_data = chunkmanager.compute(*lazy_data.values(), **kwargs)
855 for k, data in zip(lazy_data, evaluated_data):
856 self.variables[k].data = data
File ~/mambaforge/lib/python3.10/site-packages/xarray/core/daskmanager.py:70, in DaskManager.compute(self, *data, **kwargs)
67 def compute(self, *data: DaskArray, **kwargs) -> tuple[np.ndarray, ...]:
68 from dask.array import compute
---> 70 return compute(*data, **kwargs)
File ~/mambaforge/lib/python3.10/site-packages/distributed/utils_comm.py:434, in retry_operation(coro, operation, *args, **kwargs)
428 retry_delay_min = parse_timedelta(
429 dask.config.get("distributed.comm.retry.delay.min"), default="s"
430 )
431 retry_delay_max = parse_timedelta(
432 dask.config.get("distributed.comm.retry.delay.max"), default="s"
433 )
--> 434 return await retry(
435 partial(coro, *args, **kwargs),
436 count=retry_count,
437 delay_min=retry_delay_min,
438 delay_max=retry_delay_max,
439 operation=operation,
440 )
File ~/mambaforge/lib/python3.10/site-packages/distributed/utils_comm.py:413, in retry(coro, count, delay_min, delay_max, jitter_fraction, retry_on_exceptions, operation)
411 delay *= 1 + random.random() * jitter_fraction
412 await asyncio.sleep(delay)
--> 413 return await coro()
File ~/mambaforge/lib/python3.10/site-packages/distributed/core.py:1377, in PooledRPCCall.__getattr__.<locals>.send_recv_from_rpc(**kwargs)
1375 prev_name, comm.name = comm.name, "ConnectionPool." + key
1376 try:
-> 1377 return await send_recv(comm=comm, op=key, **kwargs)
1378 finally:
1379 self.pool.reuse(self.addr, comm)
File ~/mambaforge/lib/python3.10/site-packages/distributed/core.py:1136, in send_recv(comm, reply, serializers, deserializers, **kwargs)
1134 await comm.write(msg, serializers=serializers, on_error="raise")
1135 if reply:
-> 1136 response = await comm.read(deserializers=deserializers)
1137 else:
1138 response = None
File ~/mambaforge/lib/python3.10/site-packages/distributed/comm/tcp.py:235, in TCP.read(self, deserializers)
233 chunk = frames[i:j]
234 chunk_nbytes = chunk.nbytes
--> 235 n = await stream.read_into(chunk)
236 assert n == chunk_nbytes, (n, chunk_nbytes)
237 except StreamClosedError as e:
File ~/mambaforge/lib/python3.10/site-packages/tornado/iostream.py:467, in BaseIOStream.read_into(self, buf, partial)
464 self._read_partial = partial
466 try:
--> 467 self._try_inline_read()
468 except:
469 future.add_done_callback(lambda f: f.exception())
File ~/mambaforge/lib/python3.10/site-packages/tornado/iostream.py:836, in BaseIOStream._try_inline_read(self)
834 return
835 self._check_closed()
--> 836 pos = self._read_to_buffer_loop()
837 if pos is not None:
838 self._read_from_buffer(pos)
File ~/mambaforge/lib/python3.10/site-packages/tornado/iostream.py:750, in BaseIOStream._read_to_buffer_loop(self)
743 next_find_pos = 0
744 while not self.closed():
745 # Read from the socket until we get EWOULDBLOCK or equivalent.
746 # SSL sockets do some internal buffering, and if the data is
747 # sitting in the SSL object's buffer select() and friends
748 # can't see it; the only way to find out if it's there is to
749 # try to read it.
--> 750 if self._read_to_buffer() == 0:
751 break
753 # If we've read all the bytes we can use, break out of
754 # this loop.
755
756 # If we've reached target_bytes, we know we're done.
File ~/mambaforge/lib/python3.10/site-packages/tornado/iostream.py:861, in BaseIOStream._read_to_buffer(self)
859 else:
860 buf = bytearray(self.read_chunk_size)
--> 861 bytes_read = self.read_from_fd(buf)
862 except (socket.error, IOError, OSError) as e:
863 # ssl.SSLError is a subclass of socket.error
864 if self._is_connreset(e):
865 # Treat ECONNRESET as a connection close rather than
866 # an error to minimize log spam (the exception will
867 # be available on self.error for apps that care).
File ~/mambaforge/lib/python3.10/site-packages/tornado/iostream.py:1116, in IOStream.read_from_fd(***failed resolving arguments***)
1114 def read_from_fd(self, buf: Union[bytearray, memoryview]) -> Optional[int]:
1115 try:
-> 1116 return self.socket.recv_into(buf, len(buf))
1117 except BlockingIOError:
1118 return None
OSError: [Errno 22] Invalid argument