Source code for amocatlas.data_sources.wh41n

"""41°N array data reader for AMOCatlas.

This module provides functions to read and process data from the 41°N
observing array in the North Atlantic. This array monitors the Atlantic
Meridional Overturning Circulation and associated heat transport at the
northern boundary of the subtropical gyre.
"""

import datetime
from pathlib import Path
from typing import Union

import pandas as pd
import xarray as xr

# Import the modules used
from amocatlas import logger, utilities
from amocatlas.logger import log_error, log_info, log_warning
from amocatlas.utilities import apply_defaults
from amocatlas.reader_utils import ReaderUtils

log = logger.log  # Use the global logger

# Datasource identifier for automatic standardization
DATASOURCE_ID = "wh41n"

# Default list of 41N data files
WH41N_DEFAULT_FILES = [
    "hobbs_willis_amoc41N_tseries.txt",
    "trans_ARGO_ERA5.nc",
    "Q_ARGO_obs_dens_2000depth_ERA5.nc",
]
A41N_TRANSPORT_FILES = ["hobbs_willis_amoc41N_tseries.txt"]
A41N_DEFAULT_SOURCE = "https://zenodo.org/records/14681441/files/"

A41N_METADATA = {
    "project": "Atlantic Meridional Overturning Circulation Near 41N from Altimetry and Argo Observations",
    "weblink": "https://zenodo.org/records/14681441",
    "comment": "Dataset accessed and processed via http://github.com/AMOCcommunity/amocatlas",
    "acknowledgement": "This study has been conducted using E.U. Copernicus Marine Service Information; https://doi.org/10.48670/moi-00149  and https://doi.org/10.48670/moi-00148. These data were collected and made freely available by the International Argo Program and the national programs that contribute to it.  (https://argo.ucsd.edu,  https://www.ocean-ops.org). The Argo Program is part of the Global Ocean Observing System.",
    "doi": "10.5281/zenodo.8170365",
    "paper": "Willis, J. K., and Hobbs, W. R., Atlantic Meridional Overturning Circulation Near 41N from Altimetry and Argo Observations. Dataset access [2025-05-27] at 10.5281/zenodo.8170366.",
}

A41N_FILE_METADATA = {
    "hobbs_willis_amoc41N_tseries.txt": {
        "data_product": "Transport time series of Ekman volume, Northward geostrophc, Meridional Overturning volume and Meridional Overturning Heat",
    },
    "trans_Argo_ERA5.nc": {
        "data_product": "Time series of geostrophic transport as a function of latitude, longitude, depth and time, for the upper 2000 m for latitudes near 41 N and time series of Ekman Transport and Overturning Transport",
    },
    "Q_ARGO_obs_dens_2000depth_ERA5.nc": {
        "data_product": "Time series of heat transport based on various assumptions about the temperature of the ocean for depths below 2000m",
    },
}


@apply_defaults(A41N_DEFAULT_SOURCE, WH41N_DEFAULT_FILES)
def read_41n(
    source: Union[str, Path, None],
    file_list: Union[str, list[str], None],
    transport_only: bool = True,
    data_dir: Union[str, Path, None] = None,
    redownload: bool = False,
    track_added_attrs: bool = False,
) -> list[xr.Dataset]:
    """Load the 41N transport datasets from a URL or local file path into xarray Datasets.

    Parameters
    ----------
    source : str, Path or None, optional
        Remote base URL or local path to the data directory; the download URL
        is built per file. Defaults to the Zenodo record for this dataset.
    file_list : str or list of str, optional
        Filename or list of filenames to process. Defaults to WH41N_DEFAULT_FILES.
    transport_only : bool, optional
        If True, restrict to transport files only.
    data_dir : str, Path or None, optional
        Optional local data directory.
    redownload : bool, optional
        If True, force redownload of the data.
    track_added_attrs : bool, optional
        If True, track which attributes were added during metadata enrichment.

    Returns
    -------
    list of xr.Dataset
        List of loaded xarray datasets with basic inline and file-specific metadata.
        If ``track_added_attrs`` is True, a tuple of
        ``(datasets, added_attrs_per_dataset)`` is returned instead.

    Raises
    ------
    ValueError
        If no source is provided for a file and no default URL mapping is found.
    FileNotFoundError
        If the file cannot be downloaded or does not exist locally.
    """
    log.info("Starting to read 41N dataset")

    # Load YAML metadata with fallback
    global_metadata, yaml_file_metadata = ReaderUtils.load_array_metadata_with_fallback(
        DATASOURCE_ID, A41N_METADATA
    )

    # Ensure file_list has a default
    if file_list is None:
        file_list = WH41N_DEFAULT_FILES
    if transport_only:
        file_list = A41N_TRANSPORT_FILES
    if isinstance(file_list, str):
        file_list = [file_list]

    # Determine the local storage path
    local_data_dir = Path(data_dir) if data_dir else utilities.get_default_data_dir()
    local_data_dir.mkdir(parents=True, exist_ok=True)

    # Print information about files being loaded
    ReaderUtils.print_loading_info(file_list, DATASOURCE_ID, A41N_FILE_METADATA)

    datasets = []
    added_attrs_per_dataset = [] if track_added_attrs else None

    for file in file_list:
        if not (file.lower().endswith(".txt") or file.lower().endswith(".nc")):
            log_warning("Skipping unsupported file type: %s", file)
            continue

        download_url = (
            f"{source.rstrip('/')}/{file}" if utilities.is_valid_url(source) else None
        )
        file_path = utilities.resolve_file_path(
            file_name=file,
            source=source,
            download_url=download_url,
            local_data_dir=local_data_dir,
            redownload=redownload,
        )

        # Open dataset
        if file.lower().endswith(".nc"):
            # Use ReaderUtils for consistent dataset loading
            ds = ReaderUtils.safe_load_dataset(file_path)

            # Fix time coordinate for ARGO files: convert YYYYMM to datetime
            if ("trans_ARGO_ERA5" in file or "Q_ARGO" in file) and "time" in ds.coords:
                time_data = ds["time"]
                if hasattr(
                    time_data.values, "dtype"
                ) and time_data.values.dtype.kind in ["i", "u"]:
                    # Check if values look like YYYYMM format
                    first_val = int(time_data.values[0])
                    if 200000 <= first_val <= 250000:  # YYYYMM range check
                        log_info(
                            f"Converting YYYYMM time format to datetime for {file}"
                        )
                        # Convert YYYYMM to datetime
                        yyyymm_values = time_data.values
                        datetime_values = []
                        for yyyymm in yyyymm_values:
                            year = yyyymm // 100
                            month = yyyymm % 100
                            # Use 15th of month as representative date
                            dt = pd.Timestamp(year=year, month=month, day=15)
                            datetime_values.append(dt)

                        # Replace time coordinate with TIME and convert to standard format
                        ds = ds.rename({"time": "TIME"})
                        ds = ds.assign_coords(TIME=datetime_values)

                        # Add proper TIME coordinate attributes
                        ds["TIME"].attrs.update(
                            {
                                "long_name": "Time",
                                "standard_name": "time",
                                "calendar": "gregorian",
                                "units": "datetime64[ns]",
                            }
                        )
        else:
            # file .txt
            try:
                column_names, _ = utilities.parse_ascii_header(
                    file_path, comment_char="%"
                )
                df = utilities.read_ascii_file(file_path, comment_char="%")
                df.columns = column_names
            except (
                OSError,
                IOError,
                ValueError,
                KeyError,
                pd.errors.EmptyDataError,
                pd.errors.ParserError,
            ) as e:
                log_error("Failed to parse ASCII file: %s: %s", file_path, e)
                raise FileNotFoundError(
                    f"Failed to parse ASCII file: {file_path}: {e}"
                ) from e

            # Time handling: strip thousands separators, then convert the
            # "Decimal year" column to datetime
            try:
                df = df.apply(
                    lambda col: col.astype(str)
                    .str.replace(",", "", regex=False)
                    .astype(float)
                )
                df["TIME"] = df["Decimal year"].apply(
                    lambda x: datetime.datetime(int(x), 1, 1)
                    + datetime.timedelta(
                        days=(x - int(x))
                        * (
                            datetime.datetime(int(x) + 1, 1, 1)
                            - datetime.datetime(int(x), 1, 1)
                        ).days
                    )
                )
                df = df.drop(columns=["Decimal year"])
                ds = df.set_index("TIME").to_xarray()
            except (ValueError, KeyError, TypeError, AttributeError) as e:
                log_error(
                    "Failed to convert DataFrame to xarray Dataset for %s: %s",
                    file,
                    e,
                )
                raise ValueError(
                    f"Failed to convert DataFrame to xarray Dataset for {file}: {e}",
                ) from e

        # Use ReaderUtils for consistent metadata attachment (for all file types)
        file_metadata = yaml_file_metadata.get(file, A41N_FILE_METADATA.get(file, {}))

        if track_added_attrs:
            # Attach metadata with tracking
            ds, attr_changes = ReaderUtils.attach_metadata_with_tracking(
                ds,
                file,
                file_path,
                global_metadata,
                yaml_file_metadata,
                file_metadata,
                DATASOURCE_ID,
                track_added_attrs=True,
            )
            added_attrs_per_dataset.append(attr_changes)
        else:
            # Standard metadata attachment without tracking
            ds = ReaderUtils.attach_metadata_with_tracking(
                ds,
                file,
                file_path,
                global_metadata,
                yaml_file_metadata,
                file_metadata,
                DATASOURCE_ID,
                track_added_attrs=False,
            )

        datasets.append(ds)

    if not datasets:
        log_error("No valid 41N files in %s", file_list)
        raise FileNotFoundError(f"No valid data files found in {file_list}")

    log_info("Successfully loaded %d 41N dataset(s)", len(datasets))

    # Handle track_added_attrs parameter
    if track_added_attrs:
        return datasets, added_attrs_per_dataset
    else:
        return datasets