Source code for ramanlib.calc

"""
Computations over grouped Raman spectra.

This module provides analysis helpers that operate on a
:class:`ramanlib.core.GroupedSpectralContainer` (GSC) and return results designed
to feed directly into plotting functions in :mod:`ramanlib.plot`:

- :func:`outliers_per_group`  →  :func:`ramanlib.plot.outliers_per_group`
- :func:`mean_difference`     →  :func:`ramanlib.plot.mean_difference`
- :func:`mean_correlation_per_group` → :func:`ramanlib.plot.mean_correlation_per_group`

Notes
-----
All grouping semantics mirror :meth:`pandas.DataFrame.groupby`. Unless stated
otherwise, group labels are rendered from the grouping keys (tuples become
comma-separated strings).

Examples
--------
Select outliers and plot them::

    results = outliers_per_group(gsc, metric=rp.metrics.euclidean, by="sample", n_spectra=3)
    ramanlib.plot.outliers_per_group(gsc, results)
"""

from __future__ import annotations

import ramanspy as rp
import numpy as np
import pandas as pd


def outliers_per_group(gsc, metric, by=None, n_spectra=3, highest=True):
    """
    Select per-group outlier indices according to a pairwise metric vs. the group mean.

    For each group (or the entire container if ``by is None``), compute the mean
    spectrum, score each row's spectrum against that mean using ``metric``, and
    return the indices of the top/bottom ``n_spectra`` according to the scores.
    Also returns the group's mean :class:`ramanspy.Spectrum`.

    Parameters
    ----------
    gsc : GroupedSpectralContainer
        Input container with a ``'spectrum'`` column of :class:`ramanspy.Spectrum`.
    metric : callable
        Pairwise metric with signature
        ``metric(spec_a: rp.Spectrum, spec_b: rp.Spectrum) -> float``. It is
        called as ``metric(spectrum, group_mean)`` for every row of the group.
    by : str or list[str] or callable or None, optional
        Grouping key(s) passed to :meth:`pandas.DataFrame.groupby`. If ``None``,
        all rows are treated as one group labeled ``"all"``.
    n_spectra : int, optional
        Number of spectra to select per group (clipped to the group size).
        Must be non-negative. Default ``3``.
    highest : bool, optional
        If ``True`` (default), select the largest metric values; if ``False``,
        select the smallest.

    Returns
    -------
    dict[str, tuple[list, rp.Spectrum]]
        Mapping ``{group_label: (selected_index_labels, mean_spectrum)}``.
        The selected entries are index labels of ``gsc.df`` (taken from
        ``group_df.index``), ordered from most to least extreme. Tuple group
        keys are rendered as comma-separated strings. Empty groups are skipped.

    Raises
    ------
    ValueError
        If ``n_spectra`` is negative.

    See Also
    --------
    ramanlib.plot.outliers_per_group
        Plot the selected spectra per group and overlay the mean.

    Notes
    -----
    The mean spectrum is read from the ``mean`` attribute of the
    :class:`ramanspy.SpectralContainer` built by stacking the group's spectra.
    """
    # A negative count would turn ``order[:k]`` into "everything except the
    # last |k| rows", which is never what the caller meant.
    if n_spectra < 0:
        raise ValueError(f"n_spectra must be non-negative, got {n_spectra}.")

    grouped = [("all", gsc.df)] if by is None else gsc.df.groupby(by)

    results = {}
    for key, group_df in grouped:
        if group_df.empty:
            continue

        spectra = group_df["spectrum"].tolist()
        mean_spec = rp.SpectralContainer.from_stack(spectra).mean

        scores = np.array([metric(spec, mean_spec) for spec in spectra])
        order = np.argsort(scores)  # ascending: smallest score first
        if highest:
            order = order[::-1]

        k = min(n_spectra, len(group_df))
        # Translate positions within the group back to labels of gsc.df.
        selected = group_df.index.values[order[:k]].tolist()

        label = ", ".join(map(str, key)) if isinstance(key, tuple) else str(key)
        results[label] = (selected, mean_spec)

    return results
def mean_difference(group1_stats, group2_stats, ci_z=1.96):
    """
    Compute difference of group mean spectra and a normal-approximation CI band.

    Parameters
    ----------
    group1_stats : GroupedSpectralContainer
        Container with exactly **one row** representing group 1's statistics, as
        produced by :meth:`ramanlib.core.GroupedSpectralContainer.mean` with
        ``include_stats=True``. Must contain columns: ``"spectrum"``, ``"n"``,
        ``"var_vector"``, and ``"std_vector"``.
    group2_stats : GroupedSpectralContainer
        Same format/requirements as ``group1_stats`` for group 2.
    ci_z : float, optional
        Z-score for a two-sided normal CI (e.g., ``1.96`` ≈ 95%). Default ``1.96``.

    Returns
    -------
    rp.Spectrum
        The difference spectrum ``(group1_mean - group2_mean)``, built on the
        spectral axis of group 1's mean spectrum.
    numpy.ndarray
        Half-width of the symmetric CI band at each wavenumber, computed as
        ``ci_z * sqrt(var1/n1 + var2/n2)``.

    Raises
    ------
    ValueError
        If either stats container is missing a required column or does not
        contain exactly one row. The message names the offending argument.

    See Also
    --------
    ramanlib.plot.mean_difference
        Plot the difference spectrum with its CI band.
    ramanlib.core.GroupedSpectralContainer.mean
        Produces the required stats columns when ``include_stats=True``.

    Notes
    -----
    This uses the usual normal approximation for a difference of means with
    independent groups: ``Var(diff) = Var(mean1) + Var(mean2) = var1/n1 + var2/n2``.
    """
    required = ("spectrum", "n", "var_vector", "std_vector")

    # Validate BOTH inputs before doing any arithmetic, so a malformed second
    # group fails with the same clear error as a malformed first group.
    for arg_name, stats in (("group1_stats", group1_stats), ("group2_stats", group2_stats)):
        has_all_columns = all(col in stats.df.columns for col in required)
        if not has_all_columns or len(stats) != 1:
            raise ValueError(
                f"{arg_name} is missing statistics columns or does not contain "
                "exactly one row. Use include_stats=True in GSC.mean() to ensure "
                "stats are included."
            )

    s1 = group1_stats["spectrum"].iloc[0]
    s2 = group2_stats["spectrum"].iloc[0]
    diff_spectrum = rp.Spectrum(s1.spectral_data - s2.spectral_data, s1.spectral_axis)

    var1 = group1_stats["var_vector"].iloc[0]
    var2 = group2_stats["var_vector"].iloc[0]
    n1 = group1_stats["n"].iloc[0]
    n2 = group2_stats["n"].iloc[0]

    # Standard error of a difference of independent means, scaled by the z-score.
    ci_band = ci_z * np.sqrt((var1 / n1) + (var2 / n2))

    return diff_spectrum, ci_band
def mean_correlation_per_group(gsc, by):
    """
    Pearson correlation matrix between per-group mean spectra.

    Parameters
    ----------
    gsc : GroupedSpectralContainer
        Input container.
    by : str or list[str]
        Column name(s) to group by when computing the means. The value is
        forwarded unchanged to ``gsc.mean(by=by)``.

    Returns
    -------
    pandas.DataFrame
        Square correlation matrix (index and columns are group labels) computed
        from the intensity vectors of each group's mean spectrum. Multi-column
        keys are rendered as comma-separated strings.

    See Also
    --------
    ramanlib.plot.mean_correlation_per_group
        Heatmap visualization of the returned matrix.
    ramanlib.core.GroupedSpectralContainer.mean
        Computes per-group mean spectra.

    Notes
    -----
    When ``by`` names column(s) of the per-group means frame, each label is read
    from the same row as its mean spectrum, so labels and spectra cannot drift
    apart and groups appear in the row order of that frame. Otherwise (for
    example when the key lives in the index) labels fall back to the key order
    of ``groupby(by)`` on the means frame.

    The matrix is computed by forming a DataFrame whose columns are the
    intensity vectors of each group's mean spectrum and calling
    :meth:`pandas.DataFrame.corr` with ``method="pearson"``.
    """
    means_df = gsc.mean(by=by).df
    spectral_data = [spectrum.spectral_data for spectrum in means_df["spectrum"]]

    key_columns = [by] if isinstance(by, str) else by
    keys_are_columns = isinstance(key_columns, list) and all(
        isinstance(col, str) and col in means_df.columns for col in key_columns
    )

    if keys_are_columns:
        # Take each label from the row that also holds the spectrum; relying on
        # a second groupby's (sorted) key order could pair a label with another
        # group's spectrum if the means frame is ordered differently.
        group_keys = [
            ", ".join(map(str, values))
            for values in means_df[key_columns].itertuples(index=False, name=None)
        ]
    else:
        # Key is not a plain column: keep the groupby-derived labels.
        group_keys = [
            ", ".join(map(str, key)) if isinstance(key, tuple) else str(key)
            for key in means_df.groupby(by).groups
        ]

    df_group_means = pd.DataFrame(dict(zip(group_keys, spectral_data)))
    return df_group_means.corr(method="pearson")