Source code for cdiutils.analysis.stats

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.typing import ColorType
from scipy.ndimage import binary_erosion
from scipy.stats import gaussian_kde

from cdiutils.plot.formatting import save_fig
from cdiutils.utils import normalise


[docs] def kde_from_histogram( counts: np.ndarray, bin_edges: np.ndarray, ) -> tuple[np.ndarray, np.ndarray]: """ Compute the Kernel Density Estimate (KDE) from histogram counts and bin edges provided by numpy.histogram function. Args: counts (np.ndarray): the number of elements in each bin. bin_edges (np.ndarray): the limits of each bin. Returns: tuple[np.ndarray, np.ndarray]: x values used to compute the KDE estimate, the y value (KDE estimate) """ # Check if the histogram is density or not by checking the sum of # the counts bin_widths = np.diff(bin_edges) is_density = np.isclose(np.sum(counts * bin_widths), 1.0) if is_density: # When density=True, use the bin edges to reconstruct the data # for KDE data = [] for count, left_edge, right_edge in zip( counts, bin_edges[:-1], bin_edges[1:] ): data.extend( np.random.uniform( left_edge, right_edge, int(count * len(counts) * (right_edge - left_edge)), ) ) data = np.array(data) kde = gaussian_kde(data) x = np.linspace(min(bin_edges), max(bin_edges)) y = kde(x) else: # Reconstruct the data from histogram counts and bin edges bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 bin_width = bin_edges[1] - bin_edges[0] reconstructed_data = np.repeat(bin_centers, counts) # Calculate KDE using the reconstructed data kde = gaussian_kde(reconstructed_data) # Evaluate KDE x = np.linspace(bin_edges.min(), bin_edges.max()) y = kde.pdf(x) # Scale the KDE values to match the original counts y *= len(reconstructed_data) * bin_width return x, y
[docs] def find_isosurface( amplitude: np.ndarray, nbins: int = 100, sigma_criterion: float = 3, plot: bool = False, show: bool = False, save: str = None, ) -> tuple[float, plt.Axes]: """ Estimate the isosurface value from the amplitude distribution. This function computes the isosurface value based on the amplitude distribution of a 3D volume. The isosurface is calculated as: `mu - sigma_criterion * sigma`, where `mu` is the mean and `sigma` is the standard deviation of the distribution. Args: amplitude (np.ndarray): The 3D amplitude volume. nbins (int, optional): The number of bins to use for the histogram. Defaults to 100. sigma_criterion (float, optional): The factor used to compute the isosurface. Defaults to 3. plot (bool, optional): Whether to generate a plot of the histogram and density estimate. Defaults to False. show (bool, optional): Whether to display the plot. Defaults to False. save (str, optional): File path to save the plot if generated. Defaults to None. Returns: tuple[float, plt.Axes] | float: The isosurface value. If `plot` or `show` is True, also returns the matplotlib figure object. """ # normalise and flatten the amplitude flattened_amplitude = normalise(amplitude).ravel() counts, bins = np.histogram(flattened_amplitude, bins=nbins) # remove the background background_value = bins[ np.where(counts == counts.max())[0] + 1 + nbins // 20 ] filtered_amplitude = flattened_amplitude[ flattened_amplitude > background_value ] # redo the histogram with the filtered amplitude counts, bins = np.histogram(filtered_amplitude, bins=nbins, density=True) bin_centres = (bins[:-1] + bins[1:]) / 2 bin_size = bin_centres[1] - bin_centres[0] # fit the amplitude distribution kernel = gaussian_kde(filtered_amplitude) x = np.linspace(0, 1, 1000) fitted_counts = kernel(x) max_index = np.argmax(fitted_counts) right_gaussian_part = np.where(x >= x[max_index], fitted_counts, 0) # find the closest indexes right_HM_index = np.argmin( np.abs(right_gaussian_part - fitted_counts.max() / 2) ) left_HM_index = max_index - (right_HM_index - max_index) fwhm = x[right_HM_index] - x[left_HM_index] sigma_estimate = fwhm / (2 * np.sqrt(2 * np.log(2))) isosurface = x[max_index] - sigma_criterion * sigma_estimate # Ensure isosurface is non-negative if isosurface < 0: import warnings warnings.warn( f"Isosurface estimate ({isosurface:.3f}) is negative. " "Returning 0. This may indicate poor signal-to-noise ratio " "or unsuitable data for isosurface detection.", UserWarning, ) isosurface = 0 if plot or show: figsize = (6, 4) # (5.812, 3.592) # golden ratio fig, ax = plt.subplots(1, 1, layout="tight", figsize=figsize) ax.bar( bin_centres, counts, width=bin_size, color="dodgerblue", alpha=0.9, edgecolor=(0, 0, 0, 0.25), label=r"amplitude distribution", ) kde_x, kde_y = kde_from_histogram(counts, bins) ax.fill_between( kde_x, kde_y, alpha=0.3, color="navy", label=r"density estimate", ) ax.axvspan( x[left_HM_index], x[right_HM_index], edgecolor="k", facecolor="green", alpha=0.2, label="FWHM", ) ax.plot( [isosurface, isosurface], [0, fitted_counts[(np.abs(x - isosurface)).argmin()]], solid_capstyle="round", color="lightcoral", lw=5, label=rf"isosurface estimated at {isosurface:0.3f}", ) ax.set_xlabel(r"normalised amplitude") ax.set_ylabel("counts") ax.legend(frameon=False) fig.suptitle(r"Reconstructed amplitude distribution") fig.tight_layout() if save is not None: save_fig(fig, save, transparent=False) if show: plt.show() return float(isosurface), fig return float(isosurface), None
[docs] def get_histogram( data: np.ndarray, support: np.ndarray = None, bins: int = 50, density: bool = False, region: str = "overall", ) -> dict: """ Calculate histogram and optionally kernel density estimate (KDE) of the data. Optionally applies a support mask to the data before and calculates the surface and bulk histograms. Args: data (np.ndarray): the data to be analysed support (np.ndarray): the support mask to be applied to the data before histogram calculation. If None, no mask is applied. Defaults to None. bins (int, optional): number of bins for the histogram. Defaults to 50. density (bool, optional): whether to normalise the histogram to form a probability density function. Defaults to False. region (str, optional): region of the data to be analysed. Can be "overall", "surface", "bulk" or "all". Defaults to "overall". Returns: dict: a dictionary containing the histograms for the specified region(s). If kde is True, also includes the KDEs. """ if support is None and region != "overall": raise ValueError( "Support mask is required for surface or bulk region analysis." ) if region not in ["overall", "surface", "bulk", "all"]: raise ValueError( "Invalid region specified. Choose from 'overall', " "'surface', 'bulk', or 'all'." ) histograms = {} means = {} stds = {} if support is not None: overall_data = data[support > 0] # to handle any remaining NaN values, we need to specify the range histograms["overall"] = np.histogram( overall_data, bins=bins, density=density, range=(np.nanmin(overall_data), np.nanmax(overall_data)), ) means["overall"] = np.nanmean(overall_data) stds["overall"] = np.nanstd(overall_data) if region != "overall": bulk = binary_erosion(support.astype(bool)) bulk_data = data[bulk > 0] if region == "bulk" or region == "all": histograms["bulk"] = np.histogram( bulk_data, bins=bins, density=density, range=(np.nanmin(bulk_data), np.nanmax(bulk_data)), ) means["bulk"] = np.nanmean(bulk_data) stds["bulk"] = np.nanstd(bulk_data) if region == "surface" or region == "all": surface = (support.astype(bool) & ~bulk).astype(int) surface_data = data[surface > 0] histograms["surface"] = np.histogram( surface_data, bins=bins, density=density, range=(np.nanmin(surface_data), np.nanmax(surface_data)), ) means["surface"] = np.nanmean(surface_data) stds["surface"] = np.nanstd(surface_data) kdes = {k: kde_from_histogram(*v) for k, v in histograms.items()} return histograms, kdes, means, stds
[docs] def plot_histogram( ax: plt.Axes, counts: np.ndarray, bin_edges: np.ndarray, kde_x: np.ndarray = None, kde_y: np.ndarray = None, color: ColorType = "lightcoral", fwhm: bool = True, bar_args: dict = None, kde_args: dict = None, ) -> float: """ Plot the bars of a histogram as well as the kernel density estimate. Args: ax (plt.Axes): the matplotlib ax to plot the histogram on. counts (np.ndarray): the count in each bin from np.histogram(). bin_edges (np.ndarray): the bin edge values from np.histogram(). kde_x (np.ndarray, optional): the x values used to calculate the kernel density estimate values. kde_y (np.ndarray, optional): the (y) values of the kernel density estimate. color (ColorType, optional): the colour of the bar and line. Defaults to "lightcoral". fwhm (bool, optional): whether to calculate and plot the full width at half maximum (FWHM) of the kernel density estimate. Defaults to True. bar_args (dict, optional): additional arguments for the matptlotlib bar function. kde_args (dict, optional): additional arguments for the matplotlib fill_between function. Can include boolean "fill" and float "fill_alpha" to control whether to fill the kde area and the alpha value of the fill. Defaults to None. Returns: float: the fwhm if required else None. """ _bar_args = { "color": color, "alpha": 0.4, "edgecolor": color, "linewidth": 0.5, "label": "", } _bar_args.update(bar_args or {}) _kde_args = {"color": color, "label": "Kernel density estimate"} fill_kde, fill_alpha = False, False if kde_args is not None: if "fill" in kde_args: fill_kde = kde_args.pop("fill") if "fill_alpha" in kde_args: fill_alpha = kde_args.pop("fill_alpha") _kde_args.update(kde_args) # Resample the histogram to calculate the kernel density estimate bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2 bin_width = bin_edges[1] - bin_edges[0] # Plot the histogram bars ax.bar(bin_centres, counts, bin_width, **_bar_args) # Find the x-axis limits xmax = np.max(np.abs(bin_centres)) xmin = -xmax ax.set_xlim(xmin, xmax) if kde_x is not None and kde_y is not None: # Plot the kernel density estimate ax.plot(kde_x, kde_y, **_kde_args) if fill_kde: ax.fill_between(kde_x, kde_y, 0, color=color, alpha=fill_alpha) # Calculate the FWHM if fwhm: halfmax = kde_y.max() / 2 maxpos = kde_y.argmax() leftpos = (np.abs(kde_y[:maxpos] - halfmax)).argmin() rightpos = (np.abs(kde_y[maxpos:] - halfmax)).argmin() + maxpos fwhm_value = kde_x[rightpos] - kde_x[leftpos] (fwhm_line,) = ax.plot( [], [], label=f"FWHM = {fwhm_value:.4f}%", color=color, ls="--", linewidth=1, ) def update_fwhm_line(event_ax): xmin, xmax = event_ax.get_xlim() fwhm_line.set_data( [kde_x[leftpos], kde_x[rightpos]], [halfmax, halfmax] ) fwhm_line.set_transform(event_ax.transData) update_fwhm_line(ax) ax.callbacks.connect("xlim_changed", update_fwhm_line) ax.callbacks.connect("ylim_changed", update_fwhm_line) return fwhm_value return None