"""Intuitive namespace API for AMOCatlas data readers.
This module provides a more user-friendly API for accessing AMOC array data
with discoverable function names and consistent return types. Each array gets
its own function with IDE autocompletion support.
Key improvements over readers.load_dataset():
- Single dataset returned by default (most common use case)
- all_files=True parameter for power users who need multiple files
- Array-specific parameters feel natural (e.g., version for OSNAP)
- IDE autocompletion works for array names
Examples
--------
Basic usage (single dataset):
>>> from amocatlas import read
>>> data = read.rapid() # Single transport dataset
>>> osnap = read.osnap(version="2025") # Latest OSNAP data
>>> arctic = read.arcticgateway() # Arctic gateway transports
Power user access (multiple datasets):
>>> all_rapid = read.rapid(all_files=True) # List of all RAPID files
>>> all_osnap = read.osnap(all_files=True) # List of all OSNAP files
Custom parameters:
>>> rapid_custom = read.rapid(
... source="https://my-mirror.com/rapid/",
... transport_only=False,
... redownload=True
... )
"""
from typing import Union, List, Callable
import xarray as xr
from pathlib import Path
# Import all the individual readers from the data_sources package
from .data_sources import (
read_rapid,
read_move,
read_osnap,
read_samba,
read_fw2015,
read_mocha,
read_41n,
read_dso,
read_calafat2025,
read_zheng2024,
read_47n,
read_fbc,
read_arcticgateway,
)
# Import file constants for list_files() functionality
from .data_sources.rapid26n import RAPID_DEFAULT_FILES
from .data_sources.move16n import MOVE_DEFAULT_FILES
from .data_sources.osnap55n import OSNAP_DEFAULT_FILES
from .data_sources.samba34s import SAMBA_DEFAULT_FILES
from .data_sources.fw2015 import FW2015_DEFAULT_FILES
from .data_sources.mocha26n import MOCHA_DEFAULT_FILES
from .data_sources.wh41n import WH41N_DEFAULT_FILES
from .data_sources.dso import DSO_DEFAULT_FILES
from .data_sources.calafat2025 import CALAFAT2025_DEFAULT_FILES
from .data_sources.zheng2024 import ZHENG2024_DEFAULT_FILES
from .data_sources.noac47n import NOAC47N_DEFAULT_FILES
from .data_sources.fbc import FBC_DEFAULT_FILES
from .data_sources.arcticgateway import ARCTIC_DEFAULT_FILES
# Import standardization functions
from . import standardise
# Supported datasource IDs for standardization
# Datasource IDs that have a standardisation routine wired into
# ``standardise.standardise_data``.  Datasets whose ``processing_datasource``
# attribute is not in this set are returned in their raw form.
SUPPORTED_STANDARDIZATION = {
    "rapid26n",
    "move16n",
    "osnap55n",
    "samba34s",
    "arcticgateway",
    "fw2015",
    "mocha26n",
    "wh41n",
    "dso",
    "noac47n",
    "fbc",
    "calafat2025",
    "zheng2024",
}
def _return_single_or_list(
    datasets: List[xr.Dataset],
    all_files: bool,
    file_list: Union[str, List[str], None] = None,
) -> Union[xr.Dataset, List[xr.Dataset]]:
    """Collapse a list of loaded datasets to one dataset when appropriate.

    Parameters
    ----------
    datasets : list of xr.Dataset
        Datasets produced by the underlying reader.
    all_files : bool
        When True, the caller explicitly asked for every file, so the
        full list is returned unchanged.
    file_list : str, list of str, or None
        The caller-supplied file selection; a list with several entries
        signals that a list return is expected.

    Returns
    -------
    xr.Dataset or list of xr.Dataset
        The whole list when multiple files were requested (via
        ``all_files=True`` or a multi-entry ``file_list``), otherwise
        just the first dataset.

    Raises
    ------
    ValueError
        If ``datasets`` is empty.
    """
    if not datasets:
        raise ValueError("No datasets were loaded")

    # Explicit request for everything: hand the list back untouched.
    if all_files:
        return datasets

    # A caller-supplied selection with more than one entry also implies
    # a list result; a bare string counts as a single-file request.
    if file_list is not None:
        requested = file_list if isinstance(file_list, list) else [file_list]
        if len(requested) > 1:
            return datasets

    # Default behaviour: the common single-dataset case.
    return datasets[0]
def _validate_file_selection_params(
transport_only: bool,
all_files: bool,
file_list: Union[str, List[str], None],
available_files: List[str],
transport_files: List[str] = None, # noqa: ARG001
) -> tuple[bool, Union[str, List[str], None]]:
"""Validate and resolve file selection parameters.
Parameters
----------
transport_only : bool
Whether to use only transport files
all_files : bool
Whether to use all available files
file_list : str, list of str, or None
Custom list of files to use
available_files : list of str
All available files for this array
transport_files : list of str, optional
Transport-only files for this array
Returns
-------
tuple[bool, Union[str, List[str], None]]
(effective_transport_only, effective_file_list)
Raises
------
ValueError
If conflicting parameters are provided
"""
# Count explicitly provided parameters (excluding defaults)
# Note: We need to distinguish between default transport_only=True and explicitly set transport_only=True
# For now, we'll be permissive and let file_list override transport_only default
provided_params = []
if all_files:
provided_params.append("all_files=True")
if file_list is not None:
provided_params.append("file_list")
# Check for conflicting combinations
if len(provided_params) > 1:
if "all_files=True" in provided_params and "file_list" in provided_params:
# Check if file_list matches all available files (order doesn't matter)
file_list_normalized = (
file_list if isinstance(file_list, list) else [file_list]
)
if set(file_list_normalized) != set(available_files):
raise ValueError(
f"all_files=True conflicts with file_list. "
f"Expected all files {available_files}, got {file_list_normalized}"
)
# Resolve effective parameters
if all_files:
return False, None # Use all files, transport_only=False
elif file_list is not None:
return False, file_list # Use custom file list, transport_only=False
else:
return transport_only, None # Use defaults
def _create_array_function(
    reader_func: Callable,
    array_name: str,
    supports_version: bool = False,
    available_files: Union[List[str], None] = None,
) -> Callable:
    """Create a uniform API function for an array reader with optional standardization.

    This factory function eliminates repetition by generating the standard
    interface for each array reader automatically. By default, applies standardization
    for clean, analysis-ready data unless raw=True is specified.

    Parameters
    ----------
    reader_func : callable
        The underlying reader function (e.g., read_rapid)
    array_name : str
        Name of the array (for documentation)
    supports_version : bool, optional
        Whether this reader supports the version parameter
    available_files : list of str, optional
        List of available files for this array; exposed via the generated
        function's ``list_files()`` attribute.

    Returns
    -------
    callable
        A function with uniform signature that wraps the reader
    """

    # The closure below captures reader_func / array_name / supports_version /
    # available_files; each public loader (read.rapid, read.osnap, ...) is one
    # instance of this function.
    def array_function(
        source: Union[str, Path, None] = None,
        file_list: Union[str, List[str], None] = None,
        transport_only: bool = True,
        all_files: bool = False,
        raw: bool = False,
        data_dir: Union[str, Path, None] = None,
        redownload: bool = False,
        version: Union[str, None] = None,
        track_added_attrs: bool = False,
    ) -> Union[xr.Dataset, List[xr.Dataset]]:
        # TODO: Get transport files for validation - this needs to be implemented properly
        transport_files = None  # We'll need to map this based on reader_func

        # Validate and resolve file selection parameters
        effective_transport_only, effective_file_list = _validate_file_selection_params(
            transport_only=transport_only,
            all_files=all_files,
            file_list=file_list,
            available_files=available_files
            or [],  # Available files passed from factory
            transport_files=transport_files,
        )

        kwargs = {
            "source": source,
            "file_list": effective_file_list,
            "transport_only": effective_transport_only,
            "data_dir": data_dir,
            "redownload": redownload,
            "track_added_attrs": track_added_attrs,
        }
        # Only pass version if the reader supports it
        if supports_version and version is not None:
            kwargs["version"] = version

        # Load raw datasets
        reader_result = reader_func(**kwargs)

        # Handle the case where track_added_attrs=True returns a tuple
        # (datasets, per-dataset attribute-change records); otherwise the
        # reader returns the datasets directly.
        if track_added_attrs:
            datasets, added_attrs_per_dataset = reader_result
            # Embed metadata changes into each dataset's attributes
            for i, ds in enumerate(datasets):
                if i < len(added_attrs_per_dataset):
                    ds.attrs["_amocatlas_metadata_changes"] = added_attrs_per_dataset[i]
                else:
                    # Fewer change records than datasets: record an empty change set.
                    ds.attrs["_amocatlas_metadata_changes"] = {
                        "added": [],
                        "modified": [],
                    }
        else:
            datasets = reader_result

        # Apply standardization by default (unless raw=True)
        if not raw:
            try:
                # Apply standardization to each dataset based on its datasource_id
                standardized_datasets = []
                for i, ds in enumerate(datasets):
                    # Get datasource_id from dataset metadata
                    datasource_id = ds.attrs.get("processing_datasource")
                    if datasource_id and datasource_id in SUPPORTED_STANDARDIZATION:
                        # Get file name for standardization (needed by standardize functions)
                        if isinstance(file_list, list) and i < len(file_list):
                            file_name = file_list[i]
                        elif isinstance(file_list, str):
                            file_name = file_list
                        else:
                            # Use source_file from metadata or default
                            file_name = ds.attrs.get(
                                "source_file", f"{array_name.lower()}_data.nc"
                            )
                        standardized_ds = standardise.standardise_data(ds, file_name)
                        standardized_datasets.append(standardized_ds)
                    else:
                        # No standardization available, keep raw data
                        standardized_datasets.append(ds)
                datasets = standardized_datasets
            except (ValueError, KeyError, TypeError, AttributeError) as e:
                # If standardization fails, log warning but continue with raw data
                import warnings

                # Print a visible warning message
                print(f"!! WARNING: Standardization failed for {array_name}: {e}")
                print("!! Returning raw data instead of standardized data.")
                warnings.warn(
                    f"Standardization failed for {array_name}: {e}. Returning raw data.",
                    UserWarning,
                    stacklevel=2,
                )

        # Return datasets (metadata changes are now embedded in dataset attributes)
        return _return_single_or_list(datasets, all_files, file_list)

    # Add proper docstring
    array_function.__doc__ = f"""Load {array_name} array data.

    By default, returns standardized, analysis-ready data with consistent variable names,
    metadata, and units following oceanographic conventions. Use raw=True to get data
    in original format from the source files.

    Parameters
    ----------
    source : str, Path, or None, optional
        URL or local path to the data source.
    file_list : str, list of str, or None, optional
        Specific files to load. Defaults to transport files.
    transport_only : bool, optional
        If True, load only transport data. Default: True.
    all_files : bool, optional
        If True, return list of all datasets. If False, return single dataset. Default: False.
    raw : bool, optional
        If True, return data in original format without standardization.
        If False (default), apply standardization for analysis-ready data.
    data_dir : str, Path, or None, optional
        Local directory for data storage.
    redownload : bool, optional
        Force redownload of data. Default: False.
    version : str, optional
        Dataset version{' (used for version selection)' if supports_version else ' (ignored for this array)'}. Default: None.
    track_added_attrs : bool, optional
        **INTERNAL USE ONLY** - Track which attributes were added during metadata
        enrichment. When True, embeds a temporary '_amocatlas_metadata_changes'
        attribute in each returned dataset containing {{"added": [...], "modified": [...]}}.
        This attribute should be extracted and removed by calling code (e.g., report
        generation). Not intended for end users. Default: False.

    Returns
    -------
    xr.Dataset or list of xr.Dataset
        Standardized dataset (default) or raw dataset if raw=True.
        Single dataset by default, or list of datasets if all_files=True.

    Notes
    -----
    Standardization includes:
    - Consistent variable names across arrays
    - Proper CF-compliant metadata and attributes
    - Standardized units following oceanographic conventions
    - Additional quality control and formatting
    """

    # Add list_files() method to the function
    def list_files() -> List[str]:
        """Return list of available files for this array.

        Returns
        -------
        list of str
            List of available file names that can be specified in file_list parameter.
        """
        return available_files.copy() if available_files else []

    array_function.list_files = list_files
    return array_function
# Create all array functions using the factory pattern with automatic standardization.
# Each assignment below produces one public loader (e.g. ``read.rapid()``) whose
# uniform signature and docstring are generated by ``_create_array_function``.
rapid = _create_array_function(
    read_rapid, "RAPID 26°N", available_files=RAPID_DEFAULT_FILES
)
move = _create_array_function(
    read_move, "MOVE 16°N", available_files=MOVE_DEFAULT_FILES
)
# OSNAP is the only loader created with supports_version=True, so only it
# forwards the ``version`` keyword to its reader.
osnap = _create_array_function(
    read_osnap, "OSNAP", supports_version=True, available_files=OSNAP_DEFAULT_FILES
)
samba = _create_array_function(
    read_samba, "SAMBA 34.5°S", available_files=SAMBA_DEFAULT_FILES
)
arcticgateway = _create_array_function(
    read_arcticgateway, "Arctic Gateway", available_files=ARCTIC_DEFAULT_FILES
)
fw2015 = _create_array_function(
    read_fw2015, "Frajka-Williams 2015", available_files=FW2015_DEFAULT_FILES
)
mocha = _create_array_function(read_mocha, "MOCHA", available_files=MOCHA_DEFAULT_FILES)
wh41n = _create_array_function(read_41n, "41°N", available_files=WH41N_DEFAULT_FILES)
dso = _create_array_function(
    read_dso, "Denmark Strait Overflow", available_files=DSO_DEFAULT_FILES
)
noac47n = _create_array_function(
    read_47n, "47°N", available_files=NOAC47N_DEFAULT_FILES
)
fbc = _create_array_function(
    read_fbc, "Faroe Bank Channel", available_files=FBC_DEFAULT_FILES
)
calafat2025 = _create_array_function(
    read_calafat2025, "Calafat et al. 2025", available_files=CALAFAT2025_DEFAULT_FILES
)
zheng2024 = _create_array_function(
    read_zheng2024, "Zheng et al. 2024", available_files=ZHENG2024_DEFAULT_FILES
)
# Define __all__ to control what's exported: the factory-generated loader
# functions are the module's public API; the helper functions stay private.
__all__ = [
    "rapid",
    "move",
    "osnap",
    "samba",
    "arcticgateway",
    "fw2015",
    "mocha",
    "wh41n",
    "dso",
    "noac47n",
    "fbc",
    "calafat2025",
    "zheng2024",
]