Source code for ramanlib.calc

"""
Computations over grouped Raman spectra.

This module provides analysis helpers that operate on a
:class:`ramanlib.core.GroupedSpectralContainer` (GSC) and return results designed
to feed directly into plotting functions in :mod:`ramanlib.plot`:

- :func:`outliers_per_group`  →  :func:`ramanlib.plot.outliers_per_group`
- :func:`mean_difference`     →  :func:`ramanlib.plot.mean_difference`
- :func:`mean_correlation_per_group` → :func:`ramanlib.plot.mean_correlation_per_group`

Notes
-----
All grouping semantics mirror :meth:`pandas.DataFrame.groupby`. Unless stated
otherwise, group labels are rendered from the grouping keys (tuples become
comma-separated strings).

Examples
--------
Select outliers and plot them::

    results = outliers_per_group(gsc, metric=rp.metrics.MAE, by="sample", n_spectra=3)
    ramanlib.plot.outliers_per_group(gsc, results)
"""

from __future__ import annotations

import ramanspy as rp
import numpy as np
import pandas as pd



[docs]
def outliers_per_group(gsc, metric, by=None, n_spectra=3, highest=True):
    """
    Select per-group outlier indices according to a pairwise metric vs. the group mean.

    For each group (or the entire container if ``by is None``), compute the mean
    spectrum, score each row's spectrum against that mean using ``metric``, and
    return the index labels of the top/bottom ``n_spectra`` according to the
    scores. Also returns the group's mean spectrum.

    Parameters
    ----------
    gsc : GroupedSpectralContainer
        Input container whose ``df`` has a ``'spectrum'`` column of
        :class:`ramanspy.Spectrum`.
    metric : callable
        Pairwise metric called as ``metric(spectrum, group_mean)`` and returning
        a sortable score. Typical choices are in :mod:`ramanspy.metrics`
        (e.g., ``MAE``, ``MSE``).
    by : str or list[str] or callable or None, optional
        Grouping key(s) passed to :meth:`pandas.DataFrame.groupby`. If ``None``,
        all rows are treated as one group labeled ``"all"``.
    n_spectra : int, optional
        Number of spectra to select per group (clipped to the group size).
        Must be non-negative. Default ``3``.
    highest : bool, optional
        If ``True`` (default), select the largest metric values; if ``False``,
        select the smallest.

    Returns
    -------
    dict[str, tuple[list, rp.Spectrum]]
        Mapping ``{ group_label: (index_labels, mean_spectrum) }``.
        ``index_labels`` are values taken from ``gsc.df.index`` (labels, not
        positions), ordered from most to least extreme according to
        ``highest``. Empty groups are omitted. Tuple group keys are rendered
        as comma-separated strings.

    Raises
    ------
    ValueError
        If ``n_spectra`` is negative.

    See Also
    --------
    ramanlib.plot.outliers_per_group
        Plot the selected spectra per group and overlay the mean.

    Notes
    -----
    The mean spectrum is read from the ``mean`` attribute of the
    :class:`ramanspy.SpectralContainer` built by stacking the group's spectra.

    """
    # A negative count would otherwise turn into a negative slice bound below
    # and silently select "all but the last |n_spectra|" spectra.
    if n_spectra < 0:
        raise ValueError(f"n_spectra must be non-negative, got {n_spectra}.")

    grouped = [("all", gsc.df)] if by is None else gsc.df.groupby(by)

    results = {}
    for key, group_df in grouped:
        if group_df.empty:
            continue

        spectra = group_df["spectrum"].tolist()
        mean_spec = rp.SpectralContainer.from_stack(spectra).mean

        # Score every spectrum against the group mean; argsort is ascending,
        # so reverse it when the largest scores are wanted.
        scores = np.array([metric(spec, mean_spec) for spec in spectra])
        order = np.argsort(scores)
        if highest:
            order = order[::-1]

        # Clip the request to the group size, then translate positions within
        # the group back to the index labels of the full dataframe.
        k = min(n_spectra, len(spectra))
        pick_global = group_df.index.values[order[:k]].tolist()

        label = ", ".join(map(str, key)) if isinstance(key, tuple) else str(key)
        results[label] = (pick_global, mean_spec)

    return results




[docs]
def _validate_stats_container(stats, name):
    """Raise ValueError unless ``stats`` has exactly one row and every required column."""
    required = ("spectrum", "n", "var_vector", "std_vector")
    has_all_columns = all(column in stats.df.columns for column in required)
    if not has_all_columns or len(stats) != 1:
        raise ValueError(
            f"{name} missing statistics columns or includes multiple rows. Use include_stats=True in "
            "GSC.mean() to ensure stats are included."
        )


def mean_difference(group1_stats, group2_stats, ci_z=1.96):
    """
    Compute difference of group mean spectra and a normal-approximation CI band.

    Parameters
    ----------
    group1_stats : GroupedSpectralContainer
        Container with exactly **one row** representing group 1's statistics, as
        produced by :meth:`ramanlib.core.GroupedSpectralContainer.mean` with
        ``include_stats=True``. Must contain columns: ``"spectrum"``, ``"n"``,
        ``"var_vector"``, and ``"std_vector"``.
    group2_stats : GroupedSpectralContainer
        Same format/requirements as ``group1_stats`` for group 2.
    ci_z : float, optional
        Z-score for a two-sided normal CI (e.g., ``1.96`` ≈ 95%). Default ``1.96``.

    Returns
    -------
    rp.Spectrum
        The difference spectrum ``(group1_mean - group2_mean)``, built on the
        spectral axis of group 1's mean spectrum.
    numpy.ndarray
        Half-width of the symmetric CI band at each wavenumber, computed as
        ``ci_z * sqrt(var1/n1 + var2/n2)``.

    Raises
    ------
    ValueError
        If either stats container is missing required columns or contains
        more/less than one row.

    See Also
    --------
    ramanlib.plot.mean_difference
        Plot the difference spectrum with its CI band.
    ramanlib.core.GroupedSpectralContainer.mean
        Produces the required stats columns when ``include_stats=True``.

    Notes
    -----
    This uses the usual normal approximation for a difference of means with
    independent groups:
    ``Var(diff) = Var(mean1) + Var(mean2) = var1/n1 + var2/n2``.

    """
    # Both inputs must satisfy the same contract; check each one before any
    # value is read so a malformed second container fails with a clear error.
    _validate_stats_container(group1_stats, "group1_stats")
    _validate_stats_container(group2_stats, "group2_stats")

    s1 = group1_stats["spectrum"].iloc[0]
    s2 = group2_stats["spectrum"].iloc[0]

    # NOTE(review): assumes both mean spectra share the same spectral axis;
    # only group 1's axis is carried over to the result.
    diff_spectrum = rp.Spectrum(s1.spectral_data - s2.spectral_data, s1.spectral_axis)

    var1 = group1_stats["var_vector"].iloc[0]
    var2 = group2_stats["var_vector"].iloc[0]
    n1 = group1_stats["n"].iloc[0]
    n2 = group2_stats["n"].iloc[0]

    # Standard error of the difference of two independent means, scaled by z.
    ci_band = ci_z * np.sqrt((var1 / n1) + (var2 / n2))

    return diff_spectrum, ci_band




[docs]
def mean_correlation_per_group(gsc, by):
    """
    Correlate the per-group mean spectra with one another (Pearson).

    Parameters
    ----------
    gsc : GroupedSpectralContainer
        Input container; its ``mean(by=by)`` result supplies one mean spectrum
        per row.
    by : str
        Column name to group by when computing the means.

    Returns
    -------
    pandas.DataFrame
        Square correlation matrix whose index and columns are the group
        labels. Each label's data is the ``spectral_data`` vector of one
        group's mean spectrum.

    See Also
    --------
    ramanlib.plot.mean_correlation_per_group
        Heatmap visualization of the returned matrix.
    ramanlib.core.GroupedSpectralContainer.mean
        Computes per-group mean spectra.

    Notes
    -----
    Labels come from the keys of ``groupby(by).groups`` on the means table
    (tuple keys are joined with ``", "``) and are paired with the mean spectra
    by position. The matrix itself is :meth:`pandas.DataFrame.corr` with
    ``method="pearson"`` over a frame holding one column per group.

    """
    means_df = gsc.mean(by=by).df

    # One intensity vector per row of the means table, in row order.
    intensity_vectors = [spectrum.spectral_data for spectrum in means_df["spectrum"]]

    # NOTE(review): labels are matched to the vectors above purely by position,
    # which presumes the means table rows follow the groupby key order —
    # confirm against GroupedSpectralContainer.mean.
    labels = []
    for key in means_df.groupby(by).groups:
        if isinstance(key, tuple):
            labels.append(", ".join(str(part) for part in key))
        else:
            labels.append(str(key))

    per_group_columns = dict(zip(labels, intensity_vectors))
    return pd.DataFrame(per_group_columns).corr(method="pearson")