Source code for amocatlas.read_samba

from pathlib import Path
from typing import Union

import pandas as pd
import xarray as xr

from amocatlas import logger, utilities
from amocatlas.logger import log_error, log_info, log_warning
from amocatlas.utilities import apply_defaults

log = logger.log  # Use the global logger

# Default file list
SAMBA_DEFAULT_FILES = [
    "Upper_Abyssal_Transport_Anomalies.txt",
    "MOC_TotalAnomaly_and_constituents.asc",
]
SAMBA_TRANSPORT_FILES = [
    "Upper_Abyssal_Transport_Anomalies.txt",
    "MOC_TotalAnomaly_and_constituents.asc",
]
# Mapping of filenames to remote base URLs (directory paths)
SAMBA_FILE_URLS = {
    "Upper_Abyssal_Transport_Anomalies.txt": "ftp://ftp.aoml.noaa.gov/phod/pub/SAM/2020_Kersale_etal_ScienceAdvances/",
    "MOC_TotalAnomaly_and_constituents.asc": "https://www.aoml.noaa.gov/phod/SAMOC_international/documents/",
}
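
# A minimal sketch of how a full download URL is assumed to be formed: the
# entries above are directory base URLs joined with the filename
# (utilities.resolve_file_path is assumed to handle the actual join and
# download; urljoin is used here only for illustration).
#
#     from urllib.parse import urljoin
#     name = "MOC_TotalAnomaly_and_constituents.asc"
#     url = urljoin(SAMBA_FILE_URLS[name], name)
#     # -> "https://www.aoml.noaa.gov/phod/SAMOC_international/documents/
#     #     MOC_TotalAnomaly_and_constituents.asc"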

# Global metadata for SAMBA
SAMBA_METADATA = {
    "description": "SAMBA 34S transport estimates dataset",
    "project": "South Atlantic MOC Basin-wide Array (SAMBA)",
    "weblink": "https://www.aoml.noaa.gov/phod/SAMOC_international/",
    "comment": "Dataset accessed and processed via http://github.com/AMOCcommunity/amocatlas",
    "acknowledgement": "SAMBA data were collected and made freely available by the SAMOC international project and contributing national programs.",
    # Add DOI here when available
}

# File-specific metadata
SAMBA_FILE_METADATA = {
    "Upper_Abyssal_Transport_Anomalies.txt": {
        "data_product": "Daily volume transport anomaly estimates for the upper and abyssal cells of the MOC",
        "acknowledgement": "M. Kersalé et al., Highly variable upper and abyssal overturning cells in the South Atlantic. Sci. Adv. 6, eaba7573 (2020). DOI: 10.1126/sciadv.aba7573",
    },
    "MOC_TotalAnomaly_and_constituents.asc": {
        "data_product": "Daily travel time values, calibrated to a nominal pressure of 1000 dbar, and bottom pressures from the two PIES/CPIES moorings",
        "acknowledgement": "Meinen, C. S., Speich, S., Piola, A. R., Ansorge, I., Campos, E., Kersalé, M., et al. (2018). Meridional overturning circulation transport variability at 34.5°S during 2009–2017: Baroclinic and barotropic flows and the dueling influence of the boundaries. Geophysical Research Letters, 45, 4180–4188. https://doi.org/10.1029/2018GL077408",
    },
}
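
# Note on merge order (a sketch mirroring the attrs update inside read_samba
# below): per-file entries override global SAMBA_METADATA keys because they
# are unpacked last, so a file-specific "acknowledgement" wins over the
# global one.
#
#     attrs = {
#         **SAMBA_METADATA,
#         **SAMBA_FILE_METADATA["Upper_Abyssal_Transport_Anomalies.txt"],
#     }
#     attrs["acknowledgement"]  # -> the Kersalé et al. (2020) citation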


@apply_defaults(None, SAMBA_DEFAULT_FILES)
def read_samba(
    source: Union[str, Path, None],
    file_list: Union[str, list[str], None],
    transport_only: bool = True,
    data_dir: Union[str, Path, None] = None,
    redownload: bool = False,
) -> list[xr.Dataset]:
    """Load the SAMBA transport datasets from a remote URL or local file path into xarray Datasets.

    Parameters
    ----------
    source : str, optional
        URL or local path to the dataset directory. If None, the predefined
        per-file URLs in SAMBA_FILE_URLS are used.
    file_list : str or list of str, optional
        Filename or list of filenames to process. Defaults to SAMBA_DEFAULT_FILES.
    transport_only : bool, optional
        If True, restrict processing to transport files only.
    data_dir : str, Path or None, optional
        Optional local data directory.
    redownload : bool, optional
        If True, force redownload of the data.

    Returns
    -------
    list of xr.Dataset
        List of loaded xarray datasets with basic global and file-specific metadata.

    Raises
    ------
    ValueError
        If the TIME column cannot be constructed or a DataFrame cannot be
        converted to an xarray Dataset.
    FileNotFoundError
        If no download URL is defined for a file, a file cannot be downloaded
        or parsed, or no valid data files are found.

    """
    log_info("Starting to read SAMBA dataset")

    # Ensure file_list has a default
    if file_list is None:
        file_list = SAMBA_DEFAULT_FILES
    if transport_only:
        file_list = SAMBA_TRANSPORT_FILES
    if isinstance(file_list, str):
        file_list = [file_list]

    local_data_dir = Path(data_dir) if data_dir else utilities.get_default_data_dir()
    local_data_dir.mkdir(parents=True, exist_ok=True)

    datasets = []

    for file in file_list:
        if not (file.lower().endswith(".txt") or file.lower().endswith(".asc")):
            log_warning("Skipping unsupported file type: %s", file)
            continue

        download_url = SAMBA_FILE_URLS.get(file)
        if not download_url:
            log_error("No download URL defined for SAMBA file: %s", file)
            raise FileNotFoundError(f"No download URL defined for SAMBA file {file}")

        file_path = utilities.resolve_file_path(
            file_name=file,
            source=source,
            download_url=download_url,
            local_data_dir=local_data_dir,
            redownload=redownload,
        )

        # Parse ASCII file
        try:
            column_names, _ = utilities.parse_ascii_header(file_path, comment_char="%")
            df = utilities.read_ascii_file(file_path, comment_char="%")
            df.columns = column_names
        except Exception as e:
            log_error("Failed to parse ASCII file: %s: %s", file_path, e)
            raise FileNotFoundError(f"Failed to parse ASCII file: {file_path}: {e}")

        # Time handling
        try:
            if "Upper_Abyssal" in file:
                df["TIME"] = pd.to_datetime(
                    df[["Year", "Month", "Day", "Hour", "Minute"]],
                )
                df = df.drop(columns=["Year", "Month", "Day", "Hour", "Minute"])
            else:
                df["TIME"] = pd.to_datetime(df[["Year", "Month", "Day", "Hour"]])
                df = df.drop(columns=["Year", "Month", "Day", "Hour"])
        except Exception as e:
            log_error("Failed to construct TIME column for %s: %s", file, e)
            raise ValueError(f"Failed to construct TIME column for {file}: {e}")

        # Convert DataFrame to xarray Dataset
        try:
            ds = df.set_index("TIME").to_xarray()
        except Exception as e:
            log_error(
                "Failed to convert DataFrame to xarray Dataset for %s: %s",
                file,
                e,
            )
            raise ValueError(
                f"Failed to convert DataFrame to xarray Dataset for {file}: {e}",
            )

        # Attach metadata
        file_metadata = SAMBA_FILE_METADATA.get(file, {})
        log_info("Attaching metadata to SAMBA dataset from file: %s", file)
        utilities.safe_update_attrs(
            ds,
            {
                "source_file": file,
                "source_path": str(file_path),
                **SAMBA_METADATA,
                **file_metadata,
            },
        )

        datasets.append(ds)

    if not datasets:
        log_error("No valid SAMBA files found in %s", file_list)
        raise FileNotFoundError(f"No valid data files found in {file_list}")

    log_info("Successfully loaded %d SAMBA dataset(s)", len(datasets))
    return datasets