Source code for amocatlas.data_sources.noac47n

"""NOAC 47°N array data reader for AMOCatlas.

This module provides functions to read and process data from the North Atlantic
Ocean Current (NOAC) observing array located at 47°N. The array provides
transport estimates from temperature and salinity profile data in the
North Atlantic.

Key functions:
- read_47n(): Main data loading interface for NOAC 47°N array data

Data source: Ocean current measurements at 47°N latitude
"""

from pathlib import Path
from typing import Union

import pandas as pd
import xarray as xr

# Import the modules used
from amocatlas import logger, utilities
from amocatlas.logger import log_error, log_info, log_warning
from amocatlas.utilities import apply_defaults
from amocatlas.reader_utils import ReaderUtils

log = logger.log  # Use the global logger

# Datasource identifier for automatic standardization
DATASOURCE_ID = "noac47n"

# Default list of 47N data files
NOAC47N_DEFAULT_FILES = [
    "NOAC_AMOC.tab",
]
NOAC47N_TRANSPORT_FILES = ["NOAC_AMOC.tab"]
A47N_DEFAULT_SOURCE = "https://doi.pangaea.de/10.1594/PANGAEA.959558"
A47N_METADATA = {
    "project": "Basin-wide AMOC volume transport from the NOAC array at 47°N in the subpolar North Atlantic (1993-2018) ",
    "weblink": "https://doi.pangaea.de/10.1594/PANGAEA.959558",
    "comment": "Dataset accessed and processed via http://github.com/AMOCcommunity/amocatlas",
}
# Mapping of filenames to download URLs
A47N_FILE_URLS = {
    "NOAC_AMOC.tab": ("https://doi.pangaea.de/10.1594/PANGAEA.959558?format=textfile"),
}

A47N_FILE_METADATA = {
    "NOAC_AMOC.tab": {
        "data_product": "Basin-wide AMOC volume transport from the NOAC array at 47°N in the subpolar North Atlantic (1993-2018)",
    },
}
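
# NOTE: PANGAEA .tab exports begin with a "/* ... */" metadata header, which
# read_47n below skips with a hard-coded skiprows=31 (matching the current
# layout of NOAC_AMOC.tab). The helper here is a hypothetical, more resilient
# alternative that locates the header terminator at parse time; it is a
# sketch, not part of the reader's public interface.
def _pangaea_header_length(path: Union[str, Path]) -> int:
    """Count header lines up to and including the "*/" terminator (sketch)."""
    with open(path, encoding="utf-8") as fh:
        for i, line in enumerate(fh):
            if line.strip() == "*/":
                return i + 1  # skip the terminator line as well
    return 0  # no recognisable header; skip nothing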


@apply_defaults(A47N_DEFAULT_SOURCE, NOAC47N_DEFAULT_FILES)
def read_47n(
    source: Union[str, Path, None],
    file_list: Union[str, list[str]],
    transport_only: bool = True,
    data_dir: Union[str, Path, None] = None,
    redownload: bool = False,
    track_added_attrs: bool = False,
) -> Union[list[xr.Dataset], tuple[list[xr.Dataset], list]]:
    """Load the 47N transport datasets from a URL or local file path into xarray Datasets.

    Parameters
    ----------
    source : str, Path or None, optional
        Local path to the data directory (remote source is handled per-file).
    file_list : str or list of str, optional
        Filename or list of filenames to process. Defaults to
        NOAC47N_DEFAULT_FILES.
    transport_only : bool, optional
        If True, restrict to transport files only.
    data_dir : str, Path or None, optional
        Optional local data directory.
    redownload : bool, optional
        If True, force redownload of the data.
    track_added_attrs : bool, optional
        If True, track which attributes were added during metadata enrichment.

    Returns
    -------
    list of xr.Dataset
        List of loaded xarray datasets with basic inline and file-specific
        metadata. If ``track_added_attrs`` is True, a tuple of
        ``(datasets, added_attrs_per_dataset)`` is returned instead.

    Raises
    ------
    ValueError
        If no source is provided for a file and no default URL mapping is found.
    FileNotFoundError
        If the file cannot be downloaded or does not exist locally.
    """
    log_info("Starting to read 47N dataset")

    # Load YAML metadata, falling back to the inline defaults above
    global_metadata, yaml_file_metadata = ReaderUtils.load_array_metadata_with_fallback(
        DATASOURCE_ID, A47N_METADATA
    )

    # Ensure file_list has a default
    if file_list is None:
        file_list = NOAC47N_DEFAULT_FILES
    if transport_only:
        file_list = NOAC47N_TRANSPORT_FILES
    if isinstance(file_list, str):
        file_list = [file_list]

    # Determine the local storage path
    local_data_dir = Path(data_dir) if data_dir else utilities.get_default_data_dir()
    local_data_dir.mkdir(parents=True, exist_ok=True)

    # Print information about the files being loaded
    ReaderUtils.print_loading_info(file_list, DATASOURCE_ID, A47N_FILE_METADATA)

    datasets = []
    added_attrs_per_dataset = [] if track_added_attrs else None

    for file in file_list:
        if not file.lower().endswith(".tab"):
            log_warning("Skipping unsupported file type: %s", file)
            continue

        download_url = A47N_FILE_URLS.get(file)
        if not download_url:
            log_error("No download URL found for file: %s", file)
            raise ValueError(f"No download URL found for file: {file}")

        file_path = utilities.resolve_file_path(
            file_name=file,
            source=source,
            download_url=download_url,
            local_data_dir=local_data_dir,
            redownload=redownload,
        )

        # Parse the tab-separated file; skiprows=31 skips the PANGAEA metadata header
        try:
            df = pd.read_csv(
                file_path, sep="\t", skiprows=31, engine="python", encoding="utf-8"
            )
        except Exception as e:
            log_error("Failed to parse ASCII file: %s: %s", file_path, e)
            raise FileNotFoundError(
                f"Failed to parse ASCII file: {file_path}: {e}"
            ) from e

        # Time handling: rename the PANGAEA time column and convert to datetime64
        try:
            df.rename(columns={"Date/Time": "TIME"}, inplace=True)
            df["TIME"] = pd.to_datetime(df["TIME"], errors="raise")
            ds = df.set_index("TIME").to_xarray()
        except Exception as e:
            log_error(
                "Failed to convert DataFrame to xarray Dataset for %s: %s",
                file,
                e,
            )
            raise ValueError(
                f"Failed to convert DataFrame to xarray Dataset for {file}: {e}",
            ) from e

        # Attach metadata via ReaderUtils for consistency across readers
        if track_added_attrs:
            # Tracking version also collects the attribute changes
            ds, attr_changes = ReaderUtils.attach_metadata_with_tracking(
                ds,
                file,
                file_path,
                global_metadata,
                yaml_file_metadata,
                A47N_FILE_METADATA,
                DATASOURCE_ID,
                track_added_attrs=True,
            )
            added_attrs_per_dataset.append(attr_changes)
        else:
            # Standard metadata attachment without tracking
            ds = ReaderUtils.attach_metadata_with_tracking(
                ds,
                file,
                file_path,
                global_metadata,
                yaml_file_metadata,
                A47N_FILE_METADATA,
                DATASOURCE_ID,
                track_added_attrs=False,
            )

        datasets.append(ds)

    if not datasets:
        log_error("No valid 47N files in %s", file_list)
        raise FileNotFoundError(f"No valid data files found in {file_list}")

    log_info("Successfully loaded %d 47N dataset(s)", len(datasets))

    # With tracking enabled, also return the per-dataset attribute changes
    if track_added_attrs:
        return datasets, added_attrs_per_dataset
    return datasets
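

# A minimal usage sketch (illustrative; runs only when the module is executed
# directly). With the defaults applied by @apply_defaults, this downloads
# NOAC_AMOC.tab from PANGAEA into the local data directory on first use and
# returns a single-element list of datasets; passing None for source and
# file_list relies on the in-function defaults shown above.
if __name__ == "__main__":
    datasets_47n = read_47n(source=None, file_list=None)
    print(datasets_47n[0])  # basin-wide AMOC volume transport at 47°N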