Source code for amocatlas.data_sources.fbc

"""Faroe Bank Channel (FBC) overflow data reader for AMOCatlas.

This module provides functions to read and process data from the Faroe Bank
Channel overflow monitoring system. The FBC is a critical pathway for dense
water overflow from the Nordic Seas into the North Atlantic, representing
an important component of the Atlantic Meridional Overturning Circulation.

The dataset includes overflow transport estimates and hydrographic measurements
from moored instruments in the channel.

Key functions:
- read_fbc(): Main data loading interface for Faroe Bank Channel overflow data

Data source: Faroe Bank Channel overflow monitoring program
Location: Deep channel between Faroe Islands and Faroe Bank
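
Example (a minimal usage sketch; assumes network access to the default
source URL and that the first returned dataset holds the daily flux):

    from amocatlas.data_sources import fbc

    datasets = fbc.read_fbc(
        source=fbc.FBC_DEFAULT_SOURCE,
        file_list=fbc.FBC_DEFAULT_FILES,
    )
    ds = datasets[0]  # overflow transport ("Flux", in Sv) on a TIME coordinate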
"""

from pathlib import Path
from typing import Union

import xarray as xr
import datetime
import pandas as pd

# Import the modules used
from amocatlas import logger, utilities
from amocatlas.logger import log_error, log_info, log_warning
from amocatlas.utilities import apply_defaults
from amocatlas.reader_utils import ReaderUtils

log = logger.log  # Use the global logger

# Datasource identifier for automatic standardization
DATASOURCE_ID = "fbc"

# Default list of FBC data files
FBC_DEFAULT_FILES = [
    "FBC_overflow_transport.txt",
]
FBC_TRANSPORT_FILES = ["FBC_overflow_transport.txt"]
FBC_DEFAULT_SOURCE = "https://envofar.fo/var/ftp/Timeseries/"

FBC_METADATA = {
    "project": "Faroe Bank Channel overflow 1995-2015",
    "weblink": "https://envofar.fo/var/ftp/Timeseries/FBC_overflow_transport.txt",
    "comment": "Dataset accessed and processed via http://github.com/AMOCcommunity/amocatlas",
}

FBC_FILE_METADATA = {
    "FBC_overflow_transport.txt": {
        "data_product": "Daily averaged kinematic FBC-overflow flux (transport) in Sv",
    },
}
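
# Expected layout of the transport file, inferred from the parsing code in
# read_fbc: a header block (skipped via utilities.find_data_start) followed by
# whitespace-separated rows of decimal year, month, day and daily-mean flux in
# Sv. The data row below is purely illustrative (hypothetical values):
#
#     1995.874   11   16   2.13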


@apply_defaults(FBC_DEFAULT_SOURCE, FBC_DEFAULT_FILES)
def read_fbc(
    source: Union[str, Path, None],
    file_list: Union[str, list[str]],
    transport_only: bool = True,
    data_dir: Union[str, Path, None] = None,
    redownload: bool = False,
    track_added_attrs: bool = False,
) -> list[xr.Dataset]:
    """Load the FBC (Faroe Bank Channel) transport datasets from a URL or local
    file path into xarray Datasets.

    Parameters
    ----------
    source : str, Path or None, optional
        URL or local path to the data directory. Defaults to FBC_DEFAULT_SOURCE;
        remote downloads are resolved per file.
    file_list : str or list of str, optional
        Filename or list of filenames to process. Defaults to FBC_DEFAULT_FILES.
    transport_only : bool, optional
        If True, restrict to transport files only.
    data_dir : str, Path or None, optional
        Optional local data directory.
    redownload : bool, optional
        If True, force redownload of the data.
    track_added_attrs : bool, optional
        If True, track which attributes were added during metadata enrichment.

    Returns
    -------
    list of xr.Dataset
        List of loaded xarray datasets with basic inline and file-specific
        metadata. If ``track_added_attrs`` is True, a tuple of
        ``(datasets, added_attrs_per_dataset)`` is returned instead.

    Raises
    ------
    ValueError
        If a file cannot be converted to an xarray Dataset.
    FileNotFoundError
        If a file cannot be downloaded, parsed, or does not exist locally.
    """
    log.info("Starting to read FBC dataset")

    # Load YAML metadata with fallback
    global_metadata, yaml_file_metadata = ReaderUtils.load_array_metadata_with_fallback(
        DATASOURCE_ID, FBC_METADATA
    )

    # Ensure file_list has a default
    if file_list is None:
        file_list = FBC_DEFAULT_FILES
    if transport_only:
        file_list = FBC_TRANSPORT_FILES
    if isinstance(file_list, str):
        file_list = [file_list]

    # Determine the local storage path
    local_data_dir = Path(data_dir) if data_dir else utilities.get_default_data_dir()
    local_data_dir.mkdir(parents=True, exist_ok=True)

    # Print information about files being loaded
    ReaderUtils.print_loading_info(file_list, DATASOURCE_ID, FBC_FILE_METADATA)

    datasets = []
    added_attrs_per_dataset = [] if track_added_attrs else None

    for file in file_list:
        if not file.lower().endswith(".txt"):
            log_warning("Skipping unsupported file type: %s", file)
            continue

        download_url = (
            f"{source.rstrip('/')}/{file}" if utilities.is_valid_url(source) else None
        )

        file_path = utilities.resolve_file_path(
            file_name=file,
            source=source,
            download_url=download_url,
            local_data_dir=local_data_dir,
            redownload=redownload,
        )

        # Open dataset
        if file.lower().endswith(".txt"):  # file.txt
            try:
                # column_names, _ = utilities.parse_ascii_header(
                #     file_path, comment_char="%"
                # )
                data_start = utilities.find_data_start(file_path)
                df = pd.read_csv(
                    file_path,
                    sep=r"\s+",
                    encoding="latin-1",
                    skiprows=data_start,
                    names=["Decimal year", "Month", "Day", "Flux"],
                )
            except Exception as e:
                log_error("Failed to parse ASCII file: %s: %s", file_path, e)
                raise FileNotFoundError(
                    f"Failed to parse ASCII file: {file_path}: {e}"
                ) from e

            # Time handling: strip thousands separators, then convert the
            # decimal-year column to a TIME coordinate.
            try:
                df = df.apply(
                    lambda col: col.astype(str)
                    .str.replace(",", "", regex=False)
                    .astype(float)
                )
                df["TIME"] = df["Decimal year"].apply(
                    lambda x: datetime.datetime(int(x), 1, 1)
                    + datetime.timedelta(
                        days=(x - int(x))
                        * (
                            datetime.datetime(int(x) + 1, 1, 1)
                            - datetime.datetime(int(x), 1, 1)
                        ).days
                    )
                )
                df = df.drop(columns=["Decimal year"])
                ds = df.set_index("TIME").to_xarray()
            except Exception as e:
                log_error(
                    "Failed to convert DataFrame to xarray Dataset for %s: %s",
                    file,
                    e,
                )
                raise ValueError(
                    f"Failed to convert DataFrame to xarray Dataset for {file}: {e}"
                ) from e

        # Attach metadata with optional tracking
        if track_added_attrs:
            ds, attr_changes = ReaderUtils.attach_metadata_with_tracking(
                ds,
                file,
                file_path,
                global_metadata,
                yaml_file_metadata,
                FBC_FILE_METADATA,
                DATASOURCE_ID,
                track_added_attrs=True,
            )
            added_attrs_per_dataset.append(attr_changes)
        else:
            ds = ReaderUtils.attach_metadata_with_tracking(
                ds,
                file,
                file_path,
                global_metadata,
                yaml_file_metadata,
                FBC_FILE_METADATA,
                DATASOURCE_ID,
                track_added_attrs=False,
            )

        datasets.append(ds)

    if not datasets:
        log_error("No valid FBC files in %s", file_list)
        raise FileNotFoundError(f"No valid data files found in {file_list}")

    log_info("Successfully loaded %d FBC dataset(s)", len(datasets))

    if track_added_attrs:
        return datasets, added_attrs_per_dataset
    else:
        return datasets
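

# ---------------------------------------------------------------------------
# Illustrative sketch only (not called by read_fbc): the decimal-year handling
# above scales the fractional part of the year by that year's length in days.
# The helper name below is hypothetical and exists purely as a worked example.
# ---------------------------------------------------------------------------
def _decimal_year_to_datetime_example(x: float) -> datetime.datetime:
    """Mirror the decimal-year conversion used in read_fbc.

    For example, 2000.5 maps to 2000-07-02: 2000 is a leap year (366 days),
    and 0.5 * 366 = 183 days after 1 January 2000 is 2 July 2000.
    """
    year_start = datetime.datetime(int(x), 1, 1)
    year_length_days = (datetime.datetime(int(x) + 1, 1, 1) - year_start).days
    return year_start + datetime.timedelta(days=(x - int(x)) * year_length_days)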