# Source code for amocatlas.data_sources.sf2021

"""Reader for the Sanchez-Franks et al. (2021) satellite proxy of the AMOC at 26N, for AMOCatlas.

This module provides functions to read and process the satellite-proxy transport data for 26N
from Sanchez-Franks et al. (2021). The dataset is a reconstruction of the AMOC transport
at 26N based on satellite altimetry. It also includes the upper-mid-ocean and Gulf Stream
components, which are derived through a dynamically based method.
"""

from pathlib import Path
from typing import Union

import numpy as np
import xarray as xr

# Import the modules used
from amocatlas import logger, utilities
from amocatlas.logger import log_error, log_info, log_warning, log_debug
from amocatlas.utilities import apply_defaults
from amocatlas.reader_utils import ReaderUtils

log = logger.log  # Use the global logger

# Datasource identifier for automatic standardization.
# read_sf2021 passes it to the ReaderUtils helpers (metadata loading,
# loading-info printout, metadata attachment).
DATASOURCE_ID = "sf2021"

# Default list of SF2021 (Sanchez-Franks 2021) data files
# SF2021_DEFAULT_FILES is handed to @apply_defaults and is also the fallback
# when file_list is None; SF2021_TRANSPORT_FILES replaces file_list whenever
# transport_only is true. Both lists currently hold the same single file.
SF2021_DEFAULT_FILES = ["altimetry_moc_transport_1993_2020_18mos_smoothed.nc"]
SF2021_TRANSPORT_FILES = ["altimetry_moc_transport_1993_2020_18mos_smoothed.nc"]
# Base location handed to @apply_defaults; when the source is a valid URL,
# read_sf2021 builds each download URL as "<source>/<file name>".
SF2021_DEFAULT_SOURCE = "https://zenodo.org/records/18941523/files/"

# Inline global metadata, passed as the fallback argument to
# ReaderUtils.load_array_metadata_with_fallback.
SF2021_METADATA = {
    "project": "A satellite reconstruction of the AMOC transport at 26N",
    "weblink": "https://zenodo.org/records/18941523",
    "comment": "Dataset accessed and processed via http://github.com/AMOCcommunity/amocatlas",
}

# Per-file inline metadata, keyed by file name; passed to
# ReaderUtils.print_loading_info and ReaderUtils.attach_metadata_with_tracking.
SF2021_FILE_METADATA = {
    "altimetry_moc_transport_1993_2020_18mos_smoothed.nc": {
        "data_product": "A satellite reconstruction of the AMOC transport at 26N",
    }
}

# Metadata for time coordinate
# Assigned as the attrs of the time coordinate after it has been converted by
# _normalize_sf2021_time_coordinate.
# NOTE(review): the coordinate values are stored as datetime64[ns] while
# "units" advertises seconds since 1970 - confirm this is the intended
# convention for downstream encoding/writing.
_TIME_METADATA = {
    "units": "seconds since 1970-01-01T00:00:00Z",
    "long_name": "Time elapsed since 1970-01-01T00:00:00Z",
    "standard_name": "time",
    "calendar": "gregorian",
    "vocabulary": "http://vocab.nerc.ac.uk/collection/OG1/current/TIME/",
}


def _normalize_sf2021_time_coordinate(
    ds: xr.Dataset, source_file: Union[str, None] = None
) -> xr.Dataset:
    """Convert SF2021 TIME coordinate from days since 0000-01-01 to datetime64[ns].

    The first of ``sat_time`` / ``TIME`` found among the dataset coordinates is
    converted. Non-finite input values (NaN, +/-inf) become ``NaT`` instead of
    being pushed through a float-to-integer cast whose result is undefined and
    platform dependent.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset with sat_time or TIME coordinate as float or signed integer
        (days since 0000-01-01).
    source_file : str, optional
        Source filename (currently unused, kept for API compatibility)

    Returns
    -------
    xr.Dataset
        Dataset with the time coordinate converted to datetime64[ns] and its
        attributes replaced by a copy of ``_TIME_METADATA``. The input dataset
        is returned unchanged when no numeric time coordinate is found, or when
        the conversion fails (a warning is logged in that case).

    """
    # Find time variable (check raw name first, then final name)
    time_var = next((var for var in ["sat_time", "TIME"] if var in ds.coords), None)

    if not time_var or ds[time_var].dtype.kind not in ["f", "i"]:
        log_debug(
            f"Skipping TIME normalization - {time_var or 'TIME'} not found or not numeric"
        )
        return ds

    try:
        # Convert days since 0000-01-01 to datetime64[ns] without using year 0 in ns resolution.
        # Decompose into integer days and fractional nanoseconds relative to 1970-01-01.
        time_values = np.asarray(ds[time_var].values, dtype=np.float64)

        # Days between 0000-01-01 and 1970-01-01 in proleptic Gregorian calendar.
        # NOTE(review): if the file stores MATLAB datenums (where day 1 is
        # 0000-01-01) the offset would be 719529 - confirm against the data.
        epoch_days = 719528.0

        # Casting NaN/inf to int64 is undefined behaviour in NumPy, so mask
        # those entries: compute with a harmless placeholder (0.0) and put NaT
        # back afterwards.
        finite = np.isfinite(time_values)
        relative_days = np.where(finite, time_values - epoch_days, 0.0)

        whole_days = np.floor(relative_days).astype(np.int64)
        fractional_ns = np.rint((relative_days - whole_days) * 86400 * 1e9).astype(
            np.int64
        )

        time_datetime = (
            np.datetime64("1970-01-01", "ns")
            + whole_days.astype("timedelta64[D]")
            + fractional_ns.astype("timedelta64[ns]")
        ).astype("datetime64[ns]")
        time_datetime = np.where(finite, time_datetime, np.datetime64("NaT", "ns"))

        # Use assign_coords to properly set dimension coordinate
        ds = ds.assign_coords({time_var: time_datetime})
        # Hand over a copy so the module-level template is never shared with a dataset.
        ds[time_var].attrs = dict(_TIME_METADATA)
        log_debug(f"Converted SF2021 {time_var} from days to datetime64[ns]")
    except (ValueError, TypeError, OverflowError) as e:
        # Best effort: keep the dataset usable with its original time values.
        log_warning(f"Failed to convert SF2021 TIME coordinate: {e}")

    return ds


@apply_defaults(SF2021_DEFAULT_SOURCE, SF2021_DEFAULT_FILES)
def read_sf2021(
    source: Union[str, Path, None],
    file_list: Union[str, list[str]],
    transport_only: bool = True,
    data_dir: Union[str, Path, None] = None,
    redownload: bool = False,
    track_added_attrs: bool = False,
) -> Union[list[xr.Dataset], tuple[list[xr.Dataset], list]]:
    """Load the SF2021 transport datasets from a URL or local file path into xarray Datasets.

    Parameters
    ----------
    source : str, Path or None
        URL or local path of the data location. When it is a valid URL, each
        file is downloaded from ``<source>/<file>``; otherwise it is passed
        on to ``utilities.resolve_file_path`` as-is. The ``apply_defaults``
        decorator is given ``SF2021_DEFAULT_SOURCE`` (presumably used when the
        argument is omitted - its implementation is not shown here).
    file_list : str or list of str
        Filename or list of filenames to process. ``None`` falls back to
        ``SF2021_DEFAULT_FILES``.
    transport_only : bool, optional
        If True (the default), ``file_list`` is replaced by
        ``SF2021_TRANSPORT_FILES``, regardless of what the caller passed.
    data_dir : str, Path or None, optional
        Local data directory; created if missing. Defaults to
        ``utilities.get_default_data_dir()``.
    redownload : bool, optional
        If True, force redownload of the data.
    track_added_attrs : bool, optional
        If True, track which attributes were added during metadata enrichment.

    Returns
    -------
    list of xr.Dataset
        Loaded datasets with metadata attached and the time coordinate
        normalised, when ``track_added_attrs`` is False.
    tuple of (list of xr.Dataset, list)
        The datasets plus one attribute-change record per dataset, when
        ``track_added_attrs`` is True.

    Raises
    ------
    FileNotFoundError
        If no dataset could be loaded from ``file_list`` (for example because
        every entry was skipped as a non-``.nc`` file).

    Notes
    -----
    Exceptions raised by ``utilities.resolve_file_path`` or the ``ReaderUtils``
    helpers are not caught here and propagate to the caller.

    """
    log.info("Starting to read SF2021 dataset")

    # Load YAML metadata with fallback
    global_metadata, yaml_file_metadata = ReaderUtils.load_array_metadata_with_fallback(
        DATASOURCE_ID, SF2021_METADATA
    )

    # Ensure file_list has a default
    if file_list is None:
        file_list = SF2021_DEFAULT_FILES
    # NOTE(review): with the default transport_only=True a caller-supplied
    # file_list is discarded here; kept as-is because callers may rely on it.
    if transport_only:
        file_list = SF2021_TRANSPORT_FILES
    if isinstance(file_list, str):
        file_list = [file_list]

    # Determine the local storage path
    local_data_dir = Path(data_dir) if data_dir else utilities.get_default_data_dir()
    local_data_dir.mkdir(parents=True, exist_ok=True)

    # Print information about files being loaded
    ReaderUtils.print_loading_info(file_list, DATASOURCE_ID, SF2021_FILE_METADATA)

    datasets = []
    added_attrs_per_dataset = [] if track_added_attrs else None

    for file in file_list:
        # Only netCDF files are supported; anything else is skipped with a
        # warning, so everything below this guard handles a .nc file.
        if not file.lower().endswith(".nc"):
            log_warning("Skipping unsupported file type : %s", file)
            continue

        download_url = (
            f"{source.rstrip('/')}/{file}" if utilities.is_valid_url(source) else None
        )

        file_path = utilities.resolve_file_path(
            file_name=file,
            source=source,
            download_url=download_url,
            local_data_dir=local_data_dir,
            redownload=redownload,
        )

        # Use ReaderUtils for consistent dataset loading
        ds = ReaderUtils.safe_load_dataset(file_path)

        # Attach metadata with optional tracking. With tracking enabled the
        # helper returns (dataset, attribute changes); otherwise just the dataset.
        enriched = ReaderUtils.attach_metadata_with_tracking(
            ds,
            file,
            file_path,
            global_metadata,
            yaml_file_metadata,
            SF2021_FILE_METADATA,
            DATASOURCE_ID,
            track_added_attrs=bool(track_added_attrs),
        )
        if track_added_attrs:
            ds, attr_changes = enriched
            added_attrs_per_dataset.append(attr_changes)
        else:
            ds = enriched

        # Normalize SF2021 TIME coordinate AFTER metadata attachment
        ds = _normalize_sf2021_time_coordinate(ds, source_file=file)

        datasets.append(ds)

    if not datasets:
        log_error("No valid SF2021 files in %s", file_list)
        raise FileNotFoundError(f"No valid data files found in {file_list}")

    log_info("Successfully loaded %d SF2021 dataset(s)", len(datasets))

    if track_added_attrs:
        return datasets, added_attrs_per_dataset
    return datasets