# jimg_int.intensity

   1import json
   2import os
   3import random
   4import re
   5from itertools import combinations
   6
   7import matplotlib.pyplot as plt
   8import numpy as np
   9import pandas as pd
  10import pingouin as pg
  11from scipy import stats
  12from scipy.stats import chi2_contingency
  13from tqdm import tqdm
  14
  15import jimg_int.config as cfg
  16
  17from .utils import *
  18
# Seed the module-level RNG once at import time so any code in this package
# that draws from ``random`` is reproducible across runs.
# NOTE(review): this mutates the interpreter-global ``random`` state for every
# importer — consider a dedicated ``random.Random(42)`` instance instead.
random.seed(42)
  20
  21
  22class FeatureIntensity(ImageTools):
  23    r"""
  24    Class for quantitative analysis of pixel intensity and size measurements
  25    in 2D/3D biological images. Supports projection of 3D stacks, mask-based
  26    intensity normalization, region size estimation and metadata extraction.
  27
  28    Parameters
  29    ----------
  30    input_image : ndarray, optional
  31        Input image or 3D stack for analysis. If 3D, projection must be applied.
  32
  33    image : ndarray, optional
  34        2D projected image (internal use).
  35
  36    normalized_image_values : dict, optional
  37        Dictionary storing normalized intensity statistics.
  38
  39    mask : ndarray, optional
  40        Binary mask of region of interest (ROI).
  41
  42    background_mask : ndarray, optional
  43        Binary mask used for background estimation. If not provided, `mask` is used.
  44
  45    typ : {"avg", "median", "std", "var", "max", "min"}, optional
  46        Projection type for 3D stacks. Default is `"avg"`.
  47
  48    size_info : dict, optional
  49        Dictionary storing ROI size measurements.
  50
  51    correction_factor : float, optional
  52        Normalization correction factor applied to background intensity.
  53        Must satisfy 0 < factor < 1. Default is 0.1.
  54
  55    img_type : str, optional
  56        Image type metadata.
  57
  58    scale : float, optional
  59        Pixel resolution in physical units (e.g. µm/px). Used in size calculations.
  60
  61    stack_selection : list of int, optional
  62        List of Z-indices to remove when projecting a 3D image.
  63
  64    Attributes
  65    ----------
  66    input_image : ndarray or None
  67        Loaded input image.
  68
  69    image : ndarray or None
  70        Projected 2D image.
  71
  72    mask : ndarray or None
  73        Region of interest mask.
  74
  75    background_mask : ndarray or None
  76        Background normalization mask.
  77
  78    scale : float or None
  79        Scale value for size estimation.
  80
  81    normalized_image_values : dict or None
  82        Dictionary containing intensity metrics.
  83
  84    size_info : dict or None
  85        Dictionary with ROI size information.
  86
  87    typ : str
  88        Selected projection type for 3D images.
  89
  90    stack_selection : list of int
  91        Z-levels excluded from projection.
  92
  93    Notes
  94    -----
  95    The intensity normalization formula applied per pixel is:
  96
  97    .. math::
  98
  99        R_{i,j} = T_{i,j} - \\left( \\mu_B (1 + c) \\right)
 100
 101    where
 102    * ``T_{i,j}`` – pixel intensity in ROI
 103    * ``μ_B`` – mean intensity in background region
 104    * ``c`` – correction factor
 105    * ``R_{i,j}`` – normalized pixel intensity
 106
 107    Examples
 108    --------
 109    Load a 3D image, mask and compute statistics:
 110
 111    >>> fi = FeatureIntensity()
 112    >>> fi.load_image_3D("stack.tiff")
 113    >>> fi.load_mask_("mask.png")
 114    >>> fi.set_projection("median")
 115    >>> fi.run_calculations()
 116    >>> results = fi.get_results()
 117    >>> results["intensity"]["norm_mean"]
 118    """
 119
 120    def __init__(
 121        self,
 122        input_image=None,
 123        image=None,
 124        normalized_image_values=None,
 125        mask=None,
 126        background_mask=None,
 127        typ=None,
 128        size_info=None,
 129        correction_factor=None,
 130        img_type=None,
 131        scale=None,
 132        stack_selection=None,
 133    ):
 134        """
 135        Initialize a FeatureIntensity analysis instance.
 136
 137        Parameters
 138        ----------
 139        input_image : ndarray, optional
 140            Input image or 3D stack used for analysis. If the image is 3D, a
 141            projection will be computed depending on the `typ` parameter.
 142
 143        image : ndarray, optional
 144            2D image buffer used internally after projection of the input image.
 145            Should not be set manually.
 146
 147        normalized_image_values : dict, optional
 148            Dictionary containing normalized intensity statistics. Usually filled
 149            automatically after running `run_calculations()`.
 150
 151        mask : ndarray, optional
 152            Binary mask of the target region of interest (ROI). Required for
 153            intensity and size calculations.
 154
 155        background_mask : ndarray, optional
 156            Binary mask specifying the background region used to compute the
 157            normalization threshold. If not provided, the ROI mask is also used
 158            as the background reference.
 159
 160        typ : {"avg", "median", "std", "var", "max", "min"}, optional
 161            Projection method for 3D images. Determines how the z-stack is
 162            collapsed into a 2D image. Default is `"avg"`.
 163
 164        size_info : dict, optional
 165            Dictionary storing computed size metrics of the ROI. Populated after
 166            invoking `size_calculations()`.
 167
 168        correction_factor : float, optional
 169            Correction term used during intensity normalization. Must satisfy
 170            0 < correction_factor < 1. Default is 0.1.
 171
 172        img_type : str, optional
 173            Optional metadata about the image type (e.g., "tiff", "png").
 174
 175        scale : float, optional
 176            Pixel resolution in physical units (e.g., µm/px). Required for
 177            real-size estimation in `size_calculations()`.
 178
 179        stack_selection : list of int, optional
 180            Indices of z-planes to exclude during projection of a 3D stack.
 181
 182        Notes
 183        -----
 184        Values not provided are initialized to `None`, except for `typ`, which
 185        defaults to `"avg"`, and `correction_factor`, which defaults to 0.1.
 186
 187        The class is designed to be populated by loading functions:
 188        `load_image_()`, `load_image_3D()`, `load_mask_()`,
 189        and optionally `load_normalization_mask_()` and `load_JIMG_project_()`.
 190        """
 191
 192        self.input_image = input_image or None
 193        """ Input image or 3D stack used for analysis. If the image is 3D, a
 194         projection will be computed depending on the `typ` parameter."""
 195
 196        self.image = image or None
 197        """  2D image buffer used internally after projection of the input image.
 198          Should not be set manually."""
 199
 200        self.normalized_image_values = normalized_image_values or None
 201        """Dictionary containing normalized intensity statistics. Usually filled
 202        automatically after running `run_calculations()`."""
 203
 204        self.mask = mask or None
 205        """Binary mask of the target region of interest (ROI). Required for
 206        intensity and size calculations."""
 207
 208        self.background_mask = background_mask or None
 209        """ Binary mask specifying the background region used to compute the
 210         normalization threshold. If not provided, the ROI mask is also used
 211         as the background reference."""
 212
 213        self.typ = typ or "avg"
 214        """Projection method for 3D images. Determines how the z-stack is
 215        collapsed into a 2D image. Default is `"avg"`."""
 216
 217        self.size_info = size_info or None
 218        """Dictionary storing computed size metrics of the ROI. Populated after
 219        invoking `size_calculations()`."""
 220
 221        self.correction_factor = correction_factor or 0.1
 222        """ Correction term used during intensity normalization. Must satisfy
 223         0 < correction_factor < 1. Default is 0.1."""
 224
 225        self.scale = scale or None
 226        """ Pixel resolution in physical units (e.g., µm/px). Required for
 227         real-size estimation in `size_calculations()`."""
 228
 229        self.stack_selection = stack_selection or []
 230        """Indices of z-planes to exclude during projection of a 3D stack."""
 231
 232    @property
 233    def current_metadata(self):
 234        r"""
 235        Return current metadata parameters used in image processing and normalization.
 236
 237        Returns
 238        -------
 239        tuple
 240            A tuple containing:
 241
 242            projection_type : str
 243                Projection method used for 3D image reduction (e.g., "avg", "median").
 244
 245            correction_factor : float
 246                Correction factor used for background subtraction during intensity
 247                normalization. The applied formula is:
 248
 249                .. math::
 250
 251                    R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
 252
 253                where
 254                * ``R_{i,j}`` — normalized pixel intensity
 255                * ``T_{i,j}`` — original pixel intensity
 256                * ``μ_B`` — mean background intensity
 257                * ``c`` — correction factor
 258            scale : float or None
 259                Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually
 260                using `set_scale()`.
 261
 262            stack_selection : list of int
 263                Indices of z-slices excluded from projection of a 3D image.
 264
 265        Notes
 266        -----
 267        This property also prints the metadata values to the console for quick inspection.
 268        """
 269
 270        print(f"Projection type: {self.typ}")
 271        print(f"Correction factor: {self.correction_factor}")
 272        print(f"Scale (unit/px): {self.scale}")
 273        print(f"Selected stac to remove: {self.stack_selection}")
 274
 275        return self.typ, self.correction_factor, self.scale, self.stack_selection
 276
 277    def set_projection(self, projection: str):
 278        """
 279        Set the projection method for 3D image stack reduction.
 280
 281        Parameters
 282        ----------
 283        projection : {"avg", "median", "std", "var", "max", "min"}
 284            Projection method to reduce a 3D image stack to a 2D image. Default is `"avg"`.
 285
 286        Notes
 287        -----
 288        This method updates the `typ` attribute of the class. The selected projection
 289        determines how the z-stack is collapsed:
 290        - `"avg"` : average intensity across slices
 291        - `"median"` : median intensity across slices
 292        - `"std"` : standard deviation across slices
 293        - `"var"` : variance across slices
 294        - `"max"` : maximum intensity across slices
 295        - `"min"` : minimum intensity across slices
 296        """
 297
 298        t = ["avg", "median", "std", "var", "max", "min"]
 299        if projection in t:
 300            self.typ = projection
 301        else:
 302            print(f"\nProvided parameter is incorrect. Avaiable projection types: {t}")
 303
 304    def set_correction_factorn(self, factor: float):
 305        r"""
 306        Set the correction factor for background subtraction during intensity normalization.
 307
 308        Parameters
 309        ----------
 310        factor : float
 311            Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1.
 312            Default is 0.1.
 313
 314        Notes
 315        -----
 316        The correction is applied per pixel in the target mask using the formula:
 317
 318        .. math::
 319
 320            R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
 321
 322        where
 323        * ``R_{i,j}`` — normalized pixel intensity
 324        * ``T_{i,j}`` — original pixel intensity
 325        * ``μ_B`` — mean intensity in the background mask
 326        * ``c`` — correction factor
 327        """
 328
 329        if factor < 1 and factor > 0:
 330            self.correction_factor = factor
 331        else:
 332            print(
 333                "\nProvided parameter is incorrect. The factor should be a floating-point value within the range of 0 to 1."
 334            )
 335
 336    def set_scale(self, scale):
 337        """
 338        Set the scale for converting pixel measurements to physical units.
 339
 340        Parameters
 341        ----------
 342        scale : float
 343            Pixel resolution in physical units (e.g., µm/px). Used to calculate the
 344            actual size of the tissue or organ.
 345
 346        Notes
 347        -----
 348        The scale can also be automatically loaded from a JIMG project using
 349        `load_JIMG_project_()`. This value is required for size calculations in
 350        `size_calculations()`.
 351        """
 352
 353        self.scale = scale
 354
 355    def set_selection_list(self, rm_list: list):
 356        """
 357        Set the list of z-slices to exclude when projecting a 3D image stack.
 358
 359        Parameters
 360        ----------
 361        rm_list : list of int
 362            List of indices corresponding to z-slices that should be removed from
 363            the full 3D image stack before projection.
 364
 365        Notes
 366        -----
 367        This updates the `stack_selection` attribute, which is used by the
 368        `stack_selection_()` method during projection.
 369        """
 370
 371        self.stack_selection = rm_list
 372
 373    def load_JIMG_project_(self, path):
 374        """
 375        Load a JIMG project from a `.pjm` file.
 376
 377        Parameters
 378        ----------
 379        file_path : str
 380            Path to the JIMG project file. The file must have a `.pjm` extension.
 381
 382        Returns
 383        -------
 384        project : object
 385            Loaded project object containing images and metadata.
 386
 387        Raises
 388        ------
 389        ValueError
 390            If the provided file path does not point to a `.pjm` file.
 391
 392        Notes
 393        -----
 394        The method attempts to automatically set the `scale` and `stack_selection`
 395        attributes from the project metadata if available.
 396        """
 397
 398        path = os.path.abspath(path)
 399
 400        if ".pjm" in path:
 401            metadata = self.load_JIMG_project(path)
 402
 403            try:
 404                self.scale = metadata.metadata["X_resolution[um/px]"]
 405            except:
 406
 407                try:
 408                    self.scale = metadata.images_dict["metadata"][0][
 409                        "X_resolution[um/px]"
 410                    ]
 411
 412                except:
 413                    print(
 414                        '\nUnable to set scale on this project! Set scale using "set_scale()"'
 415                    )
 416
 417            self.stack_selection = metadata.removal_list
 418
 419        else:
 420            print(
 421                "\nWrong path. The provided path does not point to a JIMG project (*.pjm)."
 422            )
 423
 424    def stack_selection_(self):
 425        """
 426        Remove selected z-slices from a 3D image stack based on `stack_selection`.
 427
 428        Notes
 429        -----
 430        Only works if `input_image` is a 3D ndarray. The slices with indices listed
 431        in `stack_selection` are excluded from the stack. Updates `input_image`
 432        in-place.
 433
 434        Prints a warning if `stack_selection` is empty.
 435        """
 436
 437        if len(self.input_image.shape) == 3:
 438            if len(self.stack_selection) > 0:
 439                self.input_image = self.input_image[
 440                    [
 441                        x
 442                        for x in range(self.input_image.shape[0])
 443                        if x not in self.stack_selection
 444                    ]
 445                ]
 446            else:
 447                print("\nImages to remove from the stack were not selected!")
 448
 449    def projection(self):
 450        """
 451        Project a 3D image stack into a 2D image using the method defined by `typ`.
 452
 453        Notes
 454        -----
 455        Updates the `image` attribute with the projected 2D result.
 456
 457        Supported projection types (`typ`):
 458        - "avg" : mean intensity across slices
 459        - "median" : median intensity across slices
 460        - "std" : standard deviation across slices
 461        - "var" : variance across slices
 462        - "max" : maximum intensity across slices
 463        - "min" : minimum intensity across slices
 464
 465        Raises
 466        ------
 467        AttributeError
 468            If `input_image` is not defined.
 469        """
 470
 471        if self.typ == "avg":
 472            img = np.mean(self.input_image, axis=0)
 473
 474        elif self.typ == "std":
 475            img = np.std(self.input_image, axis=0)
 476
 477        elif self.typ == "median":
 478            img = np.median(self.input_image, axis=0)
 479
 480        elif self.typ == "var":
 481            img = np.var(self.input_image, axis=0)
 482
 483        elif self.typ == "max":
 484            img = np.max(self.input_image, axis=0)
 485
 486        elif self.typ == "min":
 487            img = np.min(self.input_image, axis=0)
 488
 489        self.image = img
 490
 491    def detect_img(self):
 492        """
 493        Detect whether the input image is 2D or 3D and perform appropriate preprocessing.
 494
 495        Notes
 496        -----
 497        - For 3D images, applies `stack_selection_()` and then `projection()`.
 498        - For 2D images, no projection is applied.
 499        - Prints status messages indicating the type of image and applied operations.
 500
 501        Raises
 502        ------
 503        AttributeError
 504            If `input_image` is not defined.
 505        """
 506        check = len(self.input_image.shape)
 507
 508        if check == 3:
 509            print("\n3D image detected! Starting processing for 3D image...")
 510            print(f"Projection - {self.typ}...")
 511
 512            self.stack_selection_()
 513            self.projection()
 514
 515        elif check == 2:
 516            print("\n2D image detected! Starting processing for 2D image...")
 517
 518        else:
 519            print("\nData does not match any image type!")
 520
 521    def load_image_3D(self, path):
 522        """
 523        Load a 3D image stack from a TIFF file.
 524
 525        Parameters
 526        ----------
 527        path : str
 528            Path to the 3D image file (*.tiff) to be loaded.
 529
 530        Notes
 531        -----
 532        The loaded image is stored in the `input_image` attribute as a 3D ndarray.
 533        """
 534        path = os.path.abspath(path)
 535
 536        self.input_image = self.load_3D_tiff(path)
 537
 538    def load_image_(self, path):
 539        """
 540        Load a 2D image into the class.
 541
 542        Parameters
 543        ----------
 544        path : str
 545            Path to the image file to be loaded.
 546
 547        Notes
 548        -----
 549        The loaded image is stored in the `input_image` attribute as a 2D ndarray.
 550        """
 551        path = os.path.abspath(path)
 552
 553        self.input_image = self.load_image(path)
 554
 555    def load_mask_(self, path):
 556        r"""
 557        Load a binary mask into the class and optionally set it as the normalization mask.
 558
 559        Parameters
 560        ----------
 561        path : str
 562            Path to the mask image file. Supported formats include 8-bit or 16-bit images
 563            with extensions such as `.png` or `.jpeg`. The mask must be binary
 564            (e.g., 0/255, 0/2**16-1, 0/1).
 565
 566        Notes
 567        -----
 568        - If `load_normalization_mask_()` is not called, this mask is also used as the
 569          background mask for intensity normalization.
 570        - Normalization is applied per pixel using the formula:
 571
 572          .. math::
 573
 574              R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
 575
 576          where
 577          * ``R_{i,j}`` — normalized pixel intensity
 578          * ``T_{i,j}`` — pixel intensity in the target mask
 579          * ``μ_B`` — mean intensity of the background (reversed mask)
 580          * ``c`` — correction factor
 581        """
 582
 583        path = os.path.abspath(path)
 584
 585        self.mask = self.load_mask(path)
 586
 587        print(
 588            "\nThis mask was also set as the reverse background mask. If you want a different background mask for normalization, use 'load_normalization_mask()'."
 589        )
 590        self.background_mask = self.load_mask(path)
 591
 592    def load_normalization_mask_(self, path):
 593        r"""
 594        Load a binary mask for normalization into the class.
 595
 596        Parameters
 597        ----------
 598        path : str
 599            Path to the mask image file. Supported formats include 8-bit or 16-bit
 600            images (e.g., `.png`, `.jpeg`). The mask must be binary (0/255, 0/2**16-1, 0/1).
 601
 602        Notes
 603        -----
 604        - The mask defines the area of interest. Normalization is applied to the inverse
 605          of this area (reversed mask).
 606        - Normalization formula applied per pixel:
 607
 608          .. math::
 609
 610              R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
 611
 612          where
 613          * ``R_{i,j}`` — normalized pixel intensity
 614          * ``T_{i,j}`` — pixel intensity in the target mask
 615          * ``μ_B`` — mean intensity of the background (reversed mask)
 616          * ``c`` — correction factor
 617        """
 618
 619        path = os.path.abspath(path)
 620
 621        self.background_mask = self.load_mask(path)
 622
 623    def intensity_calculations(self):
 624        """
 625        Calculate normalized and raw intensity statistics from the image based on masks.
 626
 627        This method performs intensity calculations using the main mask (`self.mask`)
 628        and the background mask (`self.background_mask`). The pixel intensities within
 629        the mask of interest are normalized by subtracting a threshold derived from the
 630        background region and applying a correction factor (`self.correction_factor`).
 631        Negative values after normalization are clipped to zero.
 632
 633        The following statistics are computed for both normalized and raw values:
 634        - Minimum
 635        - Maximum
 636        - Mean
 637        - Median
 638        - Standard deviation
 639        - Variance
 640        - List of all normalized values (only for normalized data)
 641
 642        Notes
 643        -----
 644        - The method updates the instance attribute `self.normalized_image_values`
 645          with a dictionary containing both normalized and raw statistics.
 646        - Normalization formula applied for each pixel in the selected mask:
 647            final_val = selected_value - (threshold + threshold * correction_factor)
 648          where threshold is the mean intensity in the background mask.
 649        - Negative values after normalization are set to zero.
 650        """
 651
 652        tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask)
 653        tmp_bmask = self.ajd_mask_size(image=self.image, mask=self.background_mask)
 654
 655        selected_values = self.image[tmp_mask == np.max(tmp_mask)]
 656
 657        threshold = np.mean(self.image[tmp_bmask == np.min(tmp_bmask)])
 658
 659        # normalization
 660        final_val = selected_values - (threshold + (threshold * self.correction_factor))
 661
 662        final_val[final_val < 0] = 0
 663
 664        tmp_dict = {
 665            "norm_min": np.min(final_val),
 666            "norm_max": np.max(final_val),
 667            "norm_mean": np.mean(final_val),
 668            "norm_median": np.median(final_val),
 669            "norm_std": np.std(final_val),
 670            "norm_var": np.var(final_val),
 671            "norm_values": final_val.tolist(),
 672            "min": np.min(selected_values),
 673            "max": np.max(selected_values),
 674            "mean": np.mean(selected_values),
 675            "median": np.median(selected_values),
 676            "std": np.std(selected_values),
 677            "var": np.var(selected_values),
 678        }
 679
 680        self.normalized_image_values = tmp_dict
 681
 682    def size_calculations(self):
 683        """
 684        Calculates the size and bounding dimensions of the masked region in the image.
 685
 686        This method computes the following metrics based on the current mask:
 687            - Total number of pixels in the mask (`px_size`)
 688            - Real-world size if a scale is provided (`size`)
 689            - Maximum lengths along x and y axes (`max_length_x_axis`, `max_length_y_axis`)
 690
 691        If `self.scale` is defined (unit per pixel), the real-world size is calculated.
 692        If not, `size` will be `None` and a warning message is printed.
 693
 694        Returns:
 695            Updates the following attributes in the class:
 696                - self.size_info (dict) containing:
 697                    - 'size' (float or None): real-world size of the mask
 698                    - 'px_size' (int): number of pixels in the masked region
 699                    - 'max_length_x_axis' (int): length of the bounding box along the x-axis
 700                    - 'max_length_y_axis' (int): length of the bounding box along the y-axis
 701
 702        Example:
 703            analysis.size_calculations()
 704            print(analysis.size_info)
 705        """
 706
 707        tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask)
 708
 709        size_px = int(len(tmp_mask[tmp_mask > np.min(tmp_mask)]))
 710
 711        if self.scale is not None:
 712            size = float(size_px * self.scale)
 713        else:
 714            size = None
 715            print(
 716                '\nUnable to calculate real size, scale (unit/px) not provided, use "set_scale()" or load JIMG project .pjm metadata "load_pjm()" to set scale for calculations!'
 717            )
 718
 719        non_zero_indices = np.where(tmp_mask == np.max(tmp_mask))
 720
 721        min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0])
 722        min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1])
 723
 724        max_length_x_axis = int(max_x - min_x + 1)
 725        max_length_y_axis = int(max_y - min_y + 1)
 726
 727        tmp_val = {
 728            "size": size,
 729            "px_size": size_px,
 730            "max_length_x_axis": max_length_x_axis,
 731            "max_length_y_axis": max_length_y_axis,
 732        }
 733
 734        self.size_info = tmp_val
 735
 736    def run_calculations(self):
 737        """
 738        Run the full analysis pipeline on the loaded image using the provided masks.
 739
 740        Notes
 741        -----
 742        - The input image must be loaded via `load_image_()` or `load_image_3D()`.
 743        - The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization
 744          mask can be loaded via `load_normalization_mask_()`.
 745        - Parameters such as projection type and correction factor can be set with
 746          `set_projection()` and `set_correction_factor()`.
 747        - Scale and stack selection can also influence calculations if defined.
 748        - To view current parameters, use the `current_metadata` property.
 749
 750        Returns
 751        -------
 752        None
 753            The results are stored internally and can be retrieved using
 754            `get_results()`.
 755        """
 756
 757        if self.input_image is not None:
 758
 759            if self.mask is not None:
 760
 761                print("\nStart...")
 762                self.detect_img()
 763                self.intensity_calculations()
 764                self.size_calculations()
 765                print("\nCompleted!")
 766
 767    def get_results(self):
 768        """
 769        Return the results from the analysis performed by `run_calculations()`.
 770
 771        Returns
 772        -------
 773        results_dict : dict or None
 774            Dictionary containing intensity and size results. Structure:
 775            - 'intensity' : dict with normalized and raw intensity statistics
 776            - 'size' : dict with ROI size metrics
 777
 778        Notes
 779        -----
 780        If analysis has not been run yet, prints a message and returns None.
 781        """
 782
 783        if self.normalized_image_values is not None and self.size_info is not None:
 784
 785            results = {
 786                "intensity": self.normalized_image_values,
 787                "size": self.size_info,
 788            }
 789
 790            return results
 791
 792        else:
 793            print('\nAnalysis were not conducted. Run analysis "run_calculations()"')
 794
 795    def save_results(
 796        self,
 797        path="",
 798        mask_region: str = "",
 799        feature_name: str = "",
 800        individual_number: int = 0,
 801        individual_name: str = "",
 802    ):
 803        """
 804        Save the analysis results to a `.int` (JSON) file.
 805
 806        Parameters
 807        ----------
 808        path : str, optional
 809            Directory path where the file will be saved. Defaults to the current working directory.
 810
 811        mask_region : str
 812            Name or identifier of the mask region (e.g., tissue, part of tissue).
 813
 814        feature_name : str
 815            Name of the feature being analyzed. Underscores or spaces are replaced with periods.
 816
 817        individual_number : int
 818            Unique identifier for the individual in the analysis (e.g., 1, 2, 3).
 819
 820        individual_name : str
 821            Name of the individual (e.g., species name, tissue, organoid).
 822
 823        Notes
 824        -----
 825        - The method validates that all required parameters are provided and that
 826          analysis results exist (`normalized_image_values` and `size_info`).
 827        - Creates the directory if it does not exist.
 828        - File name format:
 829          '<individual_name>_<individual_number>_<mask_region>_<feature_name>.int'
 830
 831        Raises
 832        ------
 833        FileNotFoundError
 834            If the specified path cannot be created or accessed.
 835
 836        ValueError
 837            If any of `mask_region`, `feature_name`, `individual_number`, or
 838            `individual_name` are missing or invalid.
 839        """
 840
 841        path = os.path.abspath(path)
 842
 843        if (
 844            len(mask_region) > 1
 845            and len(feature_name) > 1
 846            and individual_number != 0
 847            and len(individual_name) > 1
 848        ):
 849
 850            if self.normalized_image_values is not None and self.size_info is not None:
 851
 852                results = {
 853                    "intensity": self.normalized_image_values,
 854                    "size": self.size_info,
 855                }
 856
 857                mask_region = re.sub(r"[_\s]+", ".", mask_region)
 858                feature_name = re.sub(r"[_\s]+", ".", feature_name)
 859                individual_number = re.sub(r"[_\s]+", ".", str(individual_number))
 860                individual_name = re.sub(r"[_\s]+", ".", individual_name)
 861
 862                full_name = f"{individual_name}_{individual_number}_{mask_region}_{feature_name}"
 863
 864                isExist = os.path.exists(path)
 865                if not isExist:
 866                    os.makedirs(path, exist_ok=True)
 867
 868                full_path = os.path.join(
 869                    path, re.sub("\\.json", "", full_name) + ".int"
 870                )
 871
 872                with open(full_path, "w") as file:
 873                    json.dump(results, file, indent=4)
 874
 875            else:
 876                print(
 877                    '\nAnalysis were not conducted. Run analysis "run_calculations()"'
 878                )
 879
 880        else:
 881            print(
 882                "\nAny of 'mask_region', 'feature_name', 'individual_number', 'individual_name' parameters were provided wrong!"
 883            )
 884
 885    def concatenate_intensity_data(self, directory: str = "", name: str = ""):
 886        """
 887        Concatenate intensity data from multiple `.int` files and save as CSV.
 888
 889        Parameters
 890        ----------
 891        directory : str, optional
 892            Path to the directory containing `.int` files. Defaults to the current working directory.
 893
 894        name : str
 895            Prefix for the output CSV file names. CSV files are saved in the format
 896            '<name>_<gene>_<region>.csv'.
 897
 898        Raises
 899        ------
 900        FileNotFoundError
 901            If the directory cannot be accessed or no `.int` files are found.
 902
 903        ValueError
 904            If an `.int` file is missing expected data or has an incorrect format.
 905
 906        Notes
 907        -----
 908        - The method groups intensity data by gene (feature) and mask region.
 909        - Outputs one CSV file per unique gene-region combination, saved in the specified directory.
 910        """
 911
 912        directory = os.path.abspath(directory)
 913
 914        files_list = [f for f in os.listdir(directory) if f.endswith(".int")]
 915
 916        genes_set = set([re.sub("\\.int", "", x.split("_")[3]) for x in files_list])
 917        regions_set = set([re.sub("\\.int", "", x.split("_")[2]) for x in files_list])
 918
 919        for g in genes_set:
 920            for r in regions_set:
 921                json_to_save = {
 922                    "individual_name": [],
 923                    "individual_number": [],
 924                    "norm_intensity": [],
 925                    "size": [],
 926                }
 927
 928                for f in tqdm(files_list):
 929                    if g in f and r in f:
 930                        with open(os.path.join(directory, f), "r") as file:
 931                            data = json.load(file)
 932
 933                            json_to_save["norm_intensity"] = (
 934                                json_to_save["norm_intensity"]
 935                                + data["intensity"]["norm_values"]
 936                            )
 937                            json_to_save["individual_name"] = json_to_save[
 938                                "individual_name"
 939                            ] + [f.split("_")[0]] * len(
 940                                data["intensity"]["norm_values"]
 941                            )
 942                            json_to_save["individual_number"] = json_to_save[
 943                                "individual_number"
 944                            ] + [f.split("_")[1]] * len(
 945                                data["intensity"]["norm_values"]
 946                            )
 947                            json_to_save["size"] = json_to_save["size"] + [
 948                                data["size"]["px_size"]
 949                            ] * len(data["intensity"]["norm_values"])
 950
 951        pd.DataFrame(json_to_save).to_csv(f"{name}_{g}_{r}.csv", index=False)
 952
 953
 954class IntensityAnalysis:
 955    """
 956    Class for performing percentile-based statistical analysis on grouped data.
 957
 958    This class provides methods to calculate percentiles, remove outliers, aggregate
 959    data into percentile bins, perform Welch's ANOVA and Chi-squared tests, and
 960    visualize results via comparative histograms. It is designed to handle both
 961    single-column and multi-column combinations of values for group-based analysis.
 962
 963    Methods
 964    -------
 965    drop_up_df(data, group_col, values_col)
 966        Removes upper outliers from a DataFrame based on a grouping column.
 967
 968    percentiles_calculation(values, sep_perc=1)
 969        Calculates percentiles and creates loopable percentile ranges.
 970
 971    to_percentil(values, percentiles, percentiles_loop)
 972        Aggregates statistics based on percentile ranges.
 973
 974    df_to_percentiles(data, group_col, values_col, sep_perc=1, drop_outlires=True)
 975        Computes percentile statistics for grouped DataFrame data.
 976
 977    round_to_scientific_notation(num)
 978        Formats a number in scientific notation or standard format.
 979
 980    aov_percentiles(data, testes_col, comb="*")
 981        Performs Welch's ANOVA on percentile-based group data.
 982
 983    post_aov_percentiles(data, testes_col, comb="*")
 984        Performs Welch's ANOVA with pairwise t-tests.
 985
 986    chi2_percentiles(input_hist)
 987        Performs Chi-squared test on percentile histogram data.
 988
 989    post_ch2_percentiles(input_hist)
 990        Performs Chi-squared test with pairwise comparisons.
 991
 992    hist_compare_plot(data, queue, tested_value, p_adj=True, txt_size=20)
 993        Generates comparative histograms with statistical test results.
 994    """
 995
 996    def drop_up_df(self, data: pd.DataFrame, group_col: str, values_col: str):
 997        """
 998        Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column.
 999
1000        Outliers are calculated and removed separately for each group defined by `group_col`.
1001        The upper outliers are defined using the interquartile range (IQR) method:
1002            values greater than Q3 + 1.5 * IQR are considered outliers.
1003
1004        Parameters
1005        ----------
1006        data : pd.DataFrame
1007            The input DataFrame containing the data.
1008
1009        group_col : str
1010            The name of the column used for grouping the data.
1011
1012        values_col : str
1013            The column containing the values from which upper outliers will be removed.
1014
1015        Returns
1016        -------
1017        filtered_data : pd.DataFrame
1018            A filtered DataFrame with the upper outliers removed for each group.
1019
1020        Notes
1021        -----
1022        - Outliers are removed separately within each group.
1023        - The original DataFrame is not modified; a new filtered DataFrame is returned.
1024        """
1025
1026        def iqr_filter(group):
1027            q75 = np.quantile(group[values_col], 0.75)
1028            q25 = np.quantile(group[values_col], 0.25)
1029            itq = q75 - q25
1030            return group[group[values_col] <= (q75 + 1.5 * itq)]
1031
1032        filtered_data = data.groupby(group_col).apply(iqr_filter).reset_index(drop=True)
1033
1034        return filtered_data
1035
1036    def percentiles_calculation(self, values, sep_perc: int = 1):
1037        """
1038        Calculate percentiles for a set of values and generate consecutive percentile ranges.
1039
1040        This function computes percentiles from 0 to 100 at intervals defined by `sep_perc`.
1041        It also generates a list of consecutive percentile ranges that can be used for further analysis or binning.
1042
1043        Parameters
1044        ----------
1045        values : array-like
1046            The input data values for which the percentiles are calculated.
1047
1048        sep_perc : int, optional
1049            Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%).
1050
1051        Returns
1052        -------
1053        percentiles : np.ndarray
1054            Array of calculated percentile values.
1055
1056        percentiles_loop : list of tuple
1057            List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)].
1058
1059        Notes
1060        -----
1061        - The first percentile is set to 0 to avoid issues with zero values.
1062        - `percentiles_loop` is useful for iterating through percentile ranges when aggregating statistics.
1063        """
1064
1065        per_vector = values.copy()
1066
1067        percentiles = np.percentile(per_vector, np.arange(0, 101, sep_perc))
1068        percentiles[0] = 0
1069
1070        percentiles_loop = [(i, i + 1) for i in range(int(100 / sep_perc))]
1071
1072        return percentiles, percentiles_loop
1073
1074    def to_percentil(self, values, percentiles, percentiles_loop):
1075        """
1076        Aggregate statistics for a set of values based on percentile ranges.
1077
1078        This function calculates summary statistics for each percentile range defined in `percentiles_loop`,
1079        using the percentile values calculated by `percentiles_calculation()`. Statistics include count, proportion,
1080        mean, median, standard deviation, and variance.
1081
1082        Parameters
1083        ----------
1084        values : array-like
1085            Input data values for which the statistics are calculated.
1086
1087        percentiles : np.ndarray
1088            Array of percentile values used to define the ranges.
1089
1090        percentiles_loop : list of tuple
1091            List of consecutive percentile ranges, e.g., [(0, 1), (1, 2), ..., (99, 100)].
1092
1093        Returns
1094        -------
1095        data : dict
1096            Dictionary containing the following keys:
1097            - 'n' : list
1098                Number of elements in each percentile range.
1099
1100            - 'n_standarized' : list
1101                Proportion of elements in each percentile range relative to the total number of elements.
1102
1103            - 'avg' : list
1104                Mean value of elements within each percentile range.
1105
1106            - 'median' : list
1107                Median value of elements within each percentile range.
1108
1109            - 'std' : list
1110                Standard deviation of elements within each percentile range.
1111
1112            - 'var' : list
1113                Variance of elements within each percentile range.
1114
1115        Notes
1116        -----
1117        - If a percentile range contains no elements, statistics are set to 0 and count is set to 1 to avoid empty lists.
1118        """
1119
1120        per_vector = values.copy()
1121
1122        data = {
1123            "n": [],
1124            "n_standarized": [],
1125            "avg": [],
1126            "median": [],
1127            "std": [],
1128            "var": [],
1129        }
1130
1131        amount = len(per_vector)
1132
1133        for x in percentiles_loop:
1134            if (
1135                len(
1136                    per_vector[
1137                        (per_vector > percentiles[x[0]])
1138                        & (per_vector <= percentiles[x[1]])
1139                    ]
1140                )
1141                > 0
1142            ):
1143                data["n"].append(
1144                    len(
1145                        per_vector[
1146                            (per_vector > percentiles[x[0]])
1147                            & (per_vector <= percentiles[x[1]])
1148                        ]
1149                    )
1150                )
1151                data["n_standarized"].append(
1152                    len(
1153                        per_vector[
1154                            (per_vector > percentiles[x[0]])
1155                            & (per_vector <= percentiles[x[1]])
1156                        ]
1157                    )
1158                    / amount
1159                )
1160                data["avg"].append(
1161                    np.mean(
1162                        per_vector[
1163                            (per_vector > percentiles[x[0]])
1164                            & (per_vector <= percentiles[x[1]])
1165                        ]
1166                    )
1167                )
1168                data["std"].append(
1169                    np.std(
1170                        per_vector[
1171                            (per_vector > percentiles[x[0]])
1172                            & (per_vector <= percentiles[x[1]])
1173                        ]
1174                    )
1175                )
1176                data["median"].append(
1177                    np.median(
1178                        per_vector[
1179                            (per_vector > percentiles[x[0]])
1180                            & (per_vector <= percentiles[x[1]])
1181                        ]
1182                    )
1183                )
1184                data["var"].append(
1185                    np.var(
1186                        per_vector[
1187                            (per_vector > percentiles[x[0]])
1188                            & (per_vector <= percentiles[x[1]])
1189                        ]
1190                    )
1191                )
1192            else:
1193                data["n"].append(1)
1194                data["n_standarized"].append(0)
1195                data["avg"].append(0)
1196                data["std"].append(0)
1197                data["median"].append(0)
1198                data["var"].append(0)
1199
1200        return data
1201
1202    def df_to_percentiles(
1203        self,
1204        data: pd.DataFrame,
1205        group_col: str,
1206        values_col: str,
1207        sep_perc: int = 1,
1208        drop_outlires: bool = True,
1209    ):
1210        """
1211        Calculate summary statistics based on percentile ranges for each group in a DataFrame.
1212
1213        This method groups the input DataFrame by `group_col`, computes percentile ranges for each group's values
1214        in `values_col`, and aggregates statistics (count, proportion, mean, median, standard deviation, variance)
1215        for each percentile range. Optionally, upper outliers can be removed before calculation.
1216
1217        Parameters
1218        ----------
1219        data : pd.DataFrame
1220            Input DataFrame containing the data.
1221
1222        group_col : str
1223            Column name used to define groups.
1224
1225        values_col : str
1226            Column name containing the values for percentile calculations.
1227
1228        sep_perc : int, optional
1229            Separation interval for percentiles (default is 1, meaning percentiles are calculated at every 1%).
1230
1231        drop_outlires : bool, optional
1232            If True, removes upper outliers from the data before performing calculations (default is True).
1233
1234        Returns
1235        -------
1236        full_data : dict
1237            Dictionary where each key is a group name and each value is a dictionary containing:
1238            - 'n' : list
1239                Number of elements in each percentile range.
1240
1241            - 'n_standarized' : list
1242                Proportion of elements in each percentile range relative to the total number of elements.
1243
1244            - 'avg' : list
1245                Mean value of elements within each percentile range.
1246
1247            - 'median' : list
1248                Median value of elements within each percentile range.
1249
1250            - 'std' : list
1251                Standard deviation of elements within each percentile range.
1252
1253            - 'var' : list
1254                Variance of elements within each percentile range.
1255
1256        Notes
1257        -----
1258        - Outlier removal uses the IQR method within each group if `drop_outlires` is True.
1259        """
1260
1261        full_data = {}
1262
1263        if drop_outlires == True:
1264            data = self.drop_up_df(
1265                data=data, group_col=group_col, values_col=values_col
1266            )
1267
1268        groups = set(data[group_col])
1269
1270        percentiles, percentiles_loop = self.percentiles_calculation(
1271            data[values_col], sep_perc=sep_perc
1272        )
1273
1274        for g in groups:
1275
1276            print(f"Group: {g} ...")
1277
1278            tmp_values = data[values_col][data[group_col] == g]
1279
1280            per_dat = self.to_percentil(tmp_values, percentiles, percentiles_loop)
1281
1282            full_data[g] = per_dat
1283
1284        return full_data
1285
1286    def round_to_scientific_notation(self, num):
1287        """
1288        Round a number to scientific notation if very small, otherwise to one decimal place.
1289
1290        Parameters
1291        ----------
1292        num : float
1293            The number to round.
1294
1295        Returns
1296        -------
1297        str
1298            The rounded number as a string.
1299            - If `num` is 0, returns "0.0".
1300            - If `abs(num) < 1e-4`, returns scientific notation with 1 decimal and 1-digit exponent.
1301            - Otherwise, returns the number rounded to one decimal place.
1302        """
1303
1304        if num == 0:
1305            return "0.0"
1306
1307        if abs(num) < 0.0001:
1308            rounded_num = np.format_float_scientific(num, precision=1, exp_digits=1)
1309            return rounded_num
1310        else:
1311            return f"{num:.1f}"
1312
1313    def aov_percentiles(self, data, testes_col, comb: str = "*"):
1314        """
1315        Perform Welch's ANOVA on percentile-based group data.
1316
1317        This method calculates group values by combining the columns specified in `testes_col`
1318        according to the operation defined in `comb`, then performs Welch's ANOVA to test for
1319        differences in means between the groups. Welch's ANOVA is suitable when the groups
1320        have unequal variances.
1321
1322        Parameters
1323        ----------
1324        data : dict of pd.DataFrame
1325            Dictionary where keys are group names and values are DataFrames containing the data.
1326
1327        testes_col : str or list of str
1328            Column name(s) from which the group values are derived. If a list is provided, columns
1329            will be combined based on the `comb` operation.
1330
1331        comb : str, optional
1332            Operation used to combine multiple columns if `testes_col` is a list. Options include:
1333                '*' : multiplication
1334                '+' : addition
1335                '**': exponentiation
1336                '-' : subtraction
1337                '/' : division
1338            Default is '*'.
1339
1340        Returns
1341        -------
1342        F : float
1343            F-statistic from Welch's ANOVA.
1344
1345        p_val : float
1346            Uncorrected p-value from Welch's ANOVA, testing for significant differences between groups.
1347
1348        Notes
1349        -----
1350        - If `testes_col` is a single string, no combination is performed, and the group values
1351          are taken directly from that column.
1352        - Welch's ANOVA is used as it accounts for unequal variances between groups.
1353        - The `df.melt()` method is used to reshape the data, allowing the ANOVA to be applied to all groups.
1354
1355        Examples
1356        --------
1357        >>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+')
1358        >>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}")
1359        """
1360
1361        groups = []
1362
1363        for d in data.keys():
1364
1365            if isinstance(testes_col, str):
1366                g = data[d][testes_col]
1367            elif isinstance(testes_col, list):
1368                g = [1] * len(data[d][testes_col[0]])
1369                for t in testes_col:
1370                    if comb == "*":
1371                        g = [a * b for a, b in zip(g, data[d][t])]
1372                    elif comb == "+":
1373                        g = [a + b for a, b in zip(g, data[d][t])]
1374                    elif comb == "**":
1375                        g = [a**b for a, b in zip(g, data[d][t])]
1376                    elif comb == "-":
1377                        g = [a - b for a, b in zip(g, data[d][t])]
1378                    elif comb == "/":
1379                        g = [a / b for a, b in zip(g, data[d][t])]
1380
1381            groups.append(g)
1382
1383        df = pd.DataFrame({f"group_{i}": group for i, group in enumerate(groups)})
1384
1385        df_melted = df.melt(var_name="group", value_name="value")
1386
1387        welch_results = pg.welch_anova(data=df_melted, dv="value", between="group")
1388
1389        return welch_results["F"].values[0], welch_results["p-unc"].values[0]
1390
1391    def post_aov_percentiles(self, data, testes_col, comb: str = "*"):
1392        """
1393        Perform Welch's ANOVA on percentile-based group data and pairwise Welch's t-tests.
1394
1395        This method first performs Welch's ANOVA to assess differences in group means, and
1396        then conducts pairwise Welch's t-tests between all group combinations. P-values are
1397        adjusted using the Bonferroni correction for multiple comparisons.
1398
1399        Parameters
1400        ----------
1401        data : dict of pd.DataFrame
1402            Dictionary where keys are group names and values are DataFrames containing the data.
1403
1404        testes_col : str or list of str
1405            Column name(s) from which the group values are derived. If a list is provided,
1406            columns will be combined according to the `comb` operation.
1407
1408        comb : str, optional
1409            Operation used to combine multiple columns if `testes_col` is a list. Options include:
1410                '*' : multiplication
1411                '+' : addition
1412                '**': exponentiation
1413                '-' : subtraction
1414                '/' : division
1415            Default is '*'.
1416
1417        Returns
1418        -------
1419        p_val : float
1420            Uncorrected p-value from the Welch's ANOVA.
1421
1422        final_results : dict
1423            Dictionary containing results of pairwise Welch's t-tests with keys:
1424                'group1' : list of first group names in each comparison
1425                'group2' : list of second group names in each comparison
1426                'stat' : list of t-statistics for each comparison
1427                'p_val' : list of uncorrected p-values for each comparison
1428                'adj_p_val' : list of Bonferroni-adjusted p-values for multiple comparisons
1429        """
1430
1431        p_val = self.aov_percentiles(data=data, testes_col=testes_col, comb=comb)[1]
1432
1433        pairs = list(combinations(data, 2))
1434        final_results = {
1435            "group1": [],
1436            "group2": [],
1437            "stat": [],
1438            "p_val": [],
1439            "adj_p_val": [],
1440        }
1441
1442        for group1, group2 in pairs:
1443            if isinstance(testes_col, str):
1444                g1 = data[group1][testes_col]
1445            elif isinstance(testes_col, list):
1446                g1 = [1] * len(data[group1][testes_col[0]])
1447                for t in testes_col:
1448                    if comb == "*":
1449                        g1 = [a * b for a, b in zip(g1, data[group1][t])]
1450                    elif comb == "+":
1451                        g1 = [a + b for a, b in zip(g1, data[group1][t])]
1452                    elif comb == "**":
1453                        g1 = [a**b for a, b in zip(g1, data[group1][t])]
1454                    elif comb == "-":
1455                        g1 = [a - b for a, b in zip(g1, data[group1][t])]
1456                    elif comb == "/":
1457                        g1 = [a / b for a, b in zip(g1, data[group1][t])]
1458
1459            if isinstance(testes_col, str):
1460                g2 = data[group2][testes_col]
1461            elif isinstance(testes_col, list):
1462                g2 = [1] * len(data[group2][testes_col[0]])
1463                for t in testes_col:
1464                    if comb == "*":
1465                        g2 = [a * b for a, b in zip(g2, data[group2][t])]
1466                    elif comb == "+":
1467                        g2 = [a + b for a, b in zip(g2, data[group2][t])]
1468                    elif comb == "**":
1469                        g2 = [a**b for a, b in zip(g2, data[group2][t])]
1470                    elif comb == "-":
1471                        g2 = [a - b for a, b in zip(g2, data[group2][t])]
1472                    elif comb == "/":
1473                        g2 = [a / b for a, b in zip(g2, data[group2][t])]
1474
1475            stat, p_val = stats.ttest_ind(
1476                g1, g2, alternative="two-sided", equal_var=False
1477            )
1478            g = sorted([group1, group2])
1479            final_results["group1"].append(g[0])
1480            final_results["group2"].append(g[1])
1481            final_results["stat"].append(stat)
1482            final_results["p_val"].append(p_val)
1483            adj = p_val * len(pairs)
1484            if adj > 1:
1485                final_results["adj_p_val"].append(1)
1486            else:
1487                final_results["adj_p_val"].append(adj)
1488
1489        return p_val, final_results
1490
1491    def chi2_percentiles(self, input_hist):
1492        """
1493        Perform a Chi-squared test on percentile-based group data.
1494
1495        This method reformats the input histogram data into a contingency table and performs
1496        a Chi-squared test to evaluate whether there is a significant association between groups.
1497
1498        Parameters
1499        ----------
1500        input_hist : dict of pd.DataFrame
1501            Dictionary where keys are group names and values are DataFrames containing histogram data.
1502            The DataFrame must include a column 'n' representing counts for each percentile/bin.
1503
1504        Returns
1505        -------
1506        chi2_statistic : float
1507            Chi-squared test statistic.
1508
1509        p_value : float
1510            P-value from the Chi-squared test.
1511
1512        dof : int
1513            Degrees of freedom for the test.
1514
1515        expected : np.ndarray
1516            Expected frequencies for each group/bin under the null hypothesis.
1517
1518        chi_data : dict
1519            Formatted data used in the Chi-squared test, with group names as keys and bin counts as values.
1520
1521        Example
1522        -------
1523        chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist)
1524        print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}")
1525        """
1526
1527        chi_data = {}
1528
1529        for d in input_hist.keys():
1530            tmp_dic = {}
1531
1532            for n, c in enumerate(input_hist[d]["n"]):
1533                tmp_dic[f"p{n+1}"] = c
1534
1535            chi_data[d] = tmp_dic
1536
1537        chi2_statistic, p_value, dof, expected = chi2_contingency(
1538            pd.DataFrame(chi_data).T, correction=True
1539        )
1540
1541        return chi2_statistic, p_value, dof, expected, chi_data
1542
1543    def post_ch2_percentiles(self, input_hist):
1544        """
1545        Perform a Chi-squared test on percentile-based group data, including pairwise comparisons.
1546
1547        This method first performs a Chi-squared test across all groups to check for a significant association.
1548        It then performs pairwise Chi-squared tests between groups to identify specific differences.
1549        P-values for multiple comparisons are adjusted using the Bonferroni correction.
1550
1551        Parameters
1552        ----------
1553        input_hist : dict of pd.DataFrame
1554            Dictionary where keys are group names and values are DataFrames containing histogram data.
1555            Each DataFrame must include a column 'n' with counts for each percentile/bin.
1556
1557        Returns
1558        -------
1559        p_val : float
1560            Overall p-value from the initial Chi-squared test across all groups.
1561
1562        final_results : dict
1563            Results of pairwise Chi-squared tests, with keys:
1564                - 'group1' (list): Name of the first group in each comparison
1565                - 'group2' (list): Name of the second group in each comparison
1566                - 'chi2' (list): Chi-squared statistic for each pairwise comparison
1567                - 'p_val' (list): P-value for each pairwise comparison
1568                - 'adj_p_val' (list): Adjusted p-value (Bonferroni correction) for multiple comparisons
1569
1570        Example
1571        -------
1572        p_val, final_results = self.post_ch2_percentiles(input_hist)
1573        print(f"Overall Chi-squared p-value: {p_val}")
1574        for i in range(len(final_results['group1'])):
1575            print(f"Comparison: {final_results['group1'][i]} vs {final_results['group2'][i]}")
1576            print(f"Chi2 stat: {final_results['chi2'][i]}, p-value: {final_results['p_val'][i]}, adj. p-value: {final_results['adj_p_val'][i]}")
1577        """
1578
1579        res = self.chi2_percentiles(input_hist)
1580
1581        pairs = list(combinations(res[4], 2))
1582        results = []
1583
1584        for group1, group2 in pairs:
1585            table_pair = pd.DataFrame(res[4])[[group1, group2]]
1586            chi2_stat, p_val, _, _ = chi2_contingency(table_pair, correction=True)
1587            results.append((group1, group2, chi2_stat, p_val))
1588
1589        final_results = {
1590            "group1": [],
1591            "group2": [],
1592            "chi2": [],
1593            "p_val": [],
1594            "adj_p_val": [],
1595        }
1596
1597        for group1, group2, chi2_stat, p_val in results:
1598            g = sorted([group1, group2])
1599            final_results["group1"].append(g[0])
1600            final_results["group2"].append(g[1])
1601            final_results["chi2"].append(chi2_stat)
1602            final_results["p_val"].append(p_val)
1603            adj = p_val * len(results)
1604            if adj > 1:
1605                final_results["adj_p_val"].append(1)
1606            else:
1607                final_results["adj_p_val"].append(adj)
1608
1609        return res[1], final_results
1610
1611    def hist_compare_plot(
1612        self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20
1613    ):
1614        """
1615        Generate comparative histograms and display results of statistical tests (ANOVA and Chi-squared).
1616
1617        This method performs transformations on the input data, generates comparative histograms
1618        for each group, and annotates statistical test results, including Welch's ANOVA and Chi-squared tests.
1619        Multiple comparison corrections can be applied using the Bonferroni method.
1620
1621        Parameters
1622        ----------
1623        data : dict of pd.DataFrame
1624            Dictionary where keys are group names and values are DataFrames containing histogram data.
1625            Each DataFrame should include the column specified by `tested_value`.
1626
1627        queue : list of str
1628            Defines the order of groups to be plotted.
1629
1630        tested_value : str
1631            The column name in `data` representing the variable to test and visualize.
1632
1633        p_adj : bool, optional
1634            If True, applies Bonferroni correction for multiple comparisons (default is True).
1635
1636        txt_size : int, optional
1637            Font size for text annotations in the plot (default is 20).
1638
1639        Returns
1640        -------
1641        fig : matplotlib.figure.Figure
1642            Matplotlib figure object containing the generated histograms and statistical test results.
1643
1644        Example
1645        -------
1646        fig = self.hist_compare_plot(
1647            data,
1648            queue=['group1', 'group2', 'group3'],
1649            tested_value='n',
1650            p_adj=True,
1651            txt_size=18
1652        )
1653        plt.show()
1654        """
1655
1656        for i in data.keys():
1657            values = np.array(data[i][tested_value])
1658            values += 1
1659            transformed_values, fitted_lambda = stats.boxcox(values)
1660            data[i][tested_value] = transformed_values.tolist()
1661
1662        if sorted(queue) != sorted(data.keys()):
1663            print(
1664                "\n Wrong queue provided! The queue will be sorted with default settings!"
1665            )
1666            queue = sorted(data.keys())
1667
1668        # parametric selected value
1669        pk, dfk = self.post_aov_percentiles(data, testes_col=tested_value)
1670
1671        dfk = pd.DataFrame(dfk)
1672
1673        dfk = dfk.sort_values(
1674            by=["group1", "group2"],
1675            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
1676        ).reset_index(drop=True)
1677
1678        # parametric standarized selected value
1679        pkc, dfkc = self.post_aov_percentiles(
1680            data, testes_col=[tested_value, "n_standarized"], comb="*"
1681        )
1682
1683        dfkc = pd.DataFrame(dfkc)
1684
1685        dfkc = dfkc.sort_values(
1686            by=["group1", "group2"],
1687            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
1688        ).reset_index(drop=True)
1689
1690        # chi2
1691        pchi, dfchi = self.post_ch2_percentiles(data)
1692
1693        dfchi = pd.DataFrame(dfchi)
1694
1695        dfchi = dfchi.sort_values(
1696            by=["group1", "group2"],
1697            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
1698        ).reset_index(drop=True)
1699
1700        ##############################################################################
1701
1702        standarized_max, standarized_min, value_max, value_min = [], [], [], []
1703        for d in queue:
1704            standarized_max.append(max(data[d]["n_standarized"]))
1705            standarized_min.append(min(data[d]["n_standarized"]))
1706            value_max.append(max(data[d][tested_value]))
1707            value_min.append(min(data[d][tested_value]))
1708
1709        num_columns = len(queue) + 1
1710
1711        fig, axs = plt.subplots(
1712            3,
1713            num_columns,
1714            figsize=(8 * num_columns, 10),
1715            gridspec_kw={"width_ratios": [1] * len(queue) + [0.5], "wspace": 0.05},
1716        )
1717
1718        for i, d in enumerate(queue):
1719            tmp_data = data[d]
1720
1721            axs[0, i].bar(
1722                [str(n) for n in range(len(tmp_data["n_standarized"]))],
1723                tmp_data["n_standarized"],
1724                width=0.95,
1725                color="gold",
1726            )
1727            axs[0, i].set_ylim(
1728                min(standarized_min) * 0.9995, max(standarized_max) * 1.0005
1729            )
1730
1731            if i == 0:
1732                axs[0, i].set_ylabel("Standarized\nfrequency", fontsize=txt_size)
1733            else:
1734                axs[0, i].set_yticks([])
1735
1736            axs[0, i].set_xticks([])
1737            axs[0, i].tick_params(axis="y", labelsize=txt_size * 0.7)
1738
1739            axs[1, i].bar(
1740                [str(n) for n in range(len(tmp_data[tested_value]))],
1741                tmp_data[tested_value],
1742                width=0.95,
1743                color="orange",
1744            )
1745
1746            mean_value = np.mean(tmp_data[tested_value])
1747            axs[1, i].axhline(y=mean_value, color="red", linestyle="--")
1748
1749            # axs[1, i].set_ylim(min(value_min) - 1, max(value_max) + 1)
1750            axs[1, i].set_ylim(min(value_min) * 0.9995, max(value_max) * 1.0005)
1751
1752            if i == 0:
1753                axs[1, i].set_ylabel(f"Normalized\n{tested_value}", fontsize=txt_size)
1754            else:
1755                axs[1, i].set_yticks([])
1756
1757            axs[1, i].set_xticks([])
1758            axs[1, i].tick_params(axis="y", labelsize=txt_size * 0.7)
1759
1760            axs[2, i].bar(
1761                [str(n) for n in range(len(tmp_data["n_standarized"]))],
1762                [
1763                    a * b
1764                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
1765                ],
1766                width=0.95,
1767                color="goldenrod",
1768            )
1769
1770            mean_value = np.mean(
1771                [
1772                    a * b
1773                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
1774                ]
1775            )
1776            axs[2, i].axhline(y=mean_value, color="red", linestyle="--")
1777
1778            axs[2, i].set_ylim(
1779                (min(standarized_min) * min(value_min)) * 0.9995,
1780                (max(standarized_max) * max(value_max) * 1.0005),
1781            )
1782            axs[2, i].set_xlabel(d, fontsize=txt_size)
1783
1784            if i == 0:
1785                axs[2, i].set_ylabel(
1786                    f"Standarized\nnorm_{tested_value}", fontsize=txt_size
1787                )
1788            else:
1789                axs[2, i].set_yticks([])
1790
1791            axs[2, i].set_xticks([])
1792            axs[2, i].tick_params(axis="y", labelsize=txt_size * 0.7)
1793
1794        sign = "ns"
1795        if float(self.round_to_scientific_notation(pk)) < 0.001:
1796            sign = "***"
1797        elif float(self.round_to_scientific_notation(pk)) < 0.01:
1798            sign = "**"
1799        elif float(self.round_to_scientific_notation(pk)) < 0.05:
1800            sign = "*"
1801
1802        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pk)} - {sign}\n"
1803
1804        if p_adj == True:
1805            for i in range(len(dfk["group1"])):
1806                sign = "ns"
1807                if dfk["adj_p_val"][i] < 0.001:
1808                    sign = "***"
1809                elif dfk["adj_p_val"][i] < 0.01:
1810                    sign = "**"
1811                elif dfk["adj_p_val"][i] < 0.05:
1812                    sign = "*"
1813
1814                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['adj_p_val'][i])} - {sign}\n"
1815        else:
1816            for i in range(len(dfk["group1"])):
1817                sign = "ns"
1818                if dfk["p_val"][i] < 0.001:
1819                    sign = "***"
1820                elif dfk["p_val"][i] < 0.01:
1821                    sign = "**"
1822                elif dfk["p_val"][i] < 0.05:
1823                    sign = "*"
1824
1825                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['p_val'][i])} - {sign}\n"
1826
1827        axs[1, -1].text(
1828            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
1829        )
1830        axs[1, -1].set_axis_off()
1831
1832        sign = "ns"
1833        if float(self.round_to_scientific_notation(pkc)) < 0.001:
1834            sign = "***"
1835        elif float(self.round_to_scientific_notation(pkc)) < 0.01:
1836            sign = "**"
1837        elif float(self.round_to_scientific_notation(pkc)) < 0.05:
1838            sign = "*"
1839
1840        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pkc)} - {sign}\n"
1841
1842        if p_adj == True:
1843            for i in range(len(dfkc["group1"])):
1844                sign = "ns"
1845                if dfkc["adj_p_val"][i] < 0.001:
1846                    sign = "***"
1847                elif dfkc["adj_p_val"][i] < 0.01:
1848                    sign = "**"
1849                elif dfkc["adj_p_val"][i] < 0.05:
1850                    sign = "*"
1851
1852                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['adj_p_val'][i])} - {sign}\n"
1853        else:
1854            for i in range(len(dfkc["group1"])):
1855                sign = "ns"
1856                if dfkc["p_val"][i] < 0.001:
1857                    sign = "***"
1858                elif dfkc["p_val"][i] < 0.01:
1859                    sign = "**"
1860                elif dfkc["p_val"][i] < 0.05:
1861                    sign = "*"
1862
1863                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['p_val'][i])} - {sign}\n"
1864
1865        axs[2, -1].text(
1866            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
1867        )
1868        axs[2, -1].set_axis_off()
1869
1870        sign = "ns"
1871        if float(self.round_to_scientific_notation(pchi)) < 0.001:
1872            sign = "***"
1873        elif float(self.round_to_scientific_notation(pchi)) < 0.01:
1874            sign = "**"
1875        elif float(self.round_to_scientific_notation(pchi)) < 0.05:
1876            sign = "*"
1877
1878        text = f"Test Chi-squared\np-value: {self.round_to_scientific_notation(pchi)} - {sign}\n"
1879
1880        if p_adj == True:
1881            for i in range(len(dfchi["group1"])):
1882                sign = "ns"
1883                if dfchi["adj_p_val"][i] < 0.001:
1884                    sign = "***"
1885                elif dfchi["adj_p_val"][i] < 0.01:
1886                    sign = "**"
1887                elif dfchi["adj_p_val"][i] < 0.05:
1888                    sign = "*"
1889
1890                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['adj_p_val'][i])} - {sign}\n"
1891        else:
1892            for i in range(len(dfchi["group1"])):
1893                sign = "ns"
1894                if dfchi["p_val"][i] < 0.001:
1895                    sign = "***"
1896                elif dfchi["p_val"][i] < 0.01:
1897                    sign = "**"
1898                elif dfchi["p_val"][i] < 0.05:
1899                    sign = "*"
1900
1901                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['p_val'][i])} - {sign}\n"
1902
1903        axs[0, -1].text(
1904            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
1905        )
1906        axs[0, -1].set_axis_off()
1907
1908        plt.tight_layout()
1909
1910        if cfg._DISPLAY_MODE:
1911            plt.show()
1912
1913        return fig
class FeatureIntensity(jimg_int.utils.ImageTools):
 23class FeatureIntensity(ImageTools):
 24    r"""
 25    Class for quantitative analysis of pixel intensity and size measurements
 26    in 2D/3D biological images. Supports projection of 3D stacks, mask-based
 27    intensity normalization, region size estimation and metadata extraction.
 28
 29    Parameters
 30    ----------
 31    input_image : ndarray, optional
 32        Input image or 3D stack for analysis. If 3D, projection must be applied.
 33
 34    image : ndarray, optional
 35        2D projected image (internal use).
 36
 37    normalized_image_values : dict, optional
 38        Dictionary storing normalized intensity statistics.
 39
 40    mask : ndarray, optional
 41        Binary mask of region of interest (ROI).
 42
 43    background_mask : ndarray, optional
 44        Binary mask used for background estimation. If not provided, `mask` is used.
 45
 46    typ : {"avg", "median", "std", "var", "max", "min"}, optional
 47        Projection type for 3D stacks. Default is `"avg"`.
 48
 49    size_info : dict, optional
 50        Dictionary storing ROI size measurements.
 51
 52    correction_factor : float, optional
 53        Normalization correction factor applied to background intensity.
 54        Must satisfy 0 < factor < 1. Default is 0.1.
 55
 56    img_type : str, optional
 57        Image type metadata.
 58
 59    scale : float, optional
 60        Pixel resolution in physical units (e.g. µm/px). Used in size calculations.
 61
 62    stack_selection : list of int, optional
 63        List of Z-indices to remove when projecting a 3D image.
 64
 65    Attributes
 66    ----------
 67    input_image : ndarray or None
 68        Loaded input image.
 69
 70    image : ndarray or None
 71        Projected 2D image.
 72
 73    mask : ndarray or None
 74        Region of interest mask.
 75
 76    background_mask : ndarray or None
 77        Background normalization mask.
 78
 79    scale : float or None
 80        Scale value for size estimation.
 81
 82    normalized_image_values : dict or None
 83        Dictionary containing intensity metrics.
 84
 85    size_info : dict or None
 86        Dictionary with ROI size information.
 87
 88    typ : str
 89        Selected projection type for 3D images.
 90
 91    stack_selection : list of int
 92        Z-levels excluded from projection.
 93
 94    Notes
 95    -----
 96    The intensity normalization formula applied per pixel is:
 97
 98    .. math::
 99
100        R_{i,j} = T_{i,j} - \\left( \\mu_B (1 + c) \\right)
101
102    where
103    * ``T_{i,j}`` – pixel intensity in ROI
104    * ``μ_B`` – mean intensity in background region
105    * ``c`` – correction factor
106    * ``R_{i,j}`` – normalized pixel intensity
107
108    Examples
109    --------
110    Load a 3D image, mask and compute statistics:
111
112    >>> fi = FeatureIntensity()
113    >>> fi.load_image_3D("stack.tiff")
114    >>> fi.load_mask_("mask.png")
115    >>> fi.set_projection("median")
116    >>> fi.run_calculations()
117    >>> results = fi.get_results()
118    >>> results["intensity"]["norm_mean"]
119    """
120
121    def __init__(
122        self,
123        input_image=None,
124        image=None,
125        normalized_image_values=None,
126        mask=None,
127        background_mask=None,
128        typ=None,
129        size_info=None,
130        correction_factor=None,
131        img_type=None,
132        scale=None,
133        stack_selection=None,
134    ):
135        """
136        Initialize a FeatureIntensity analysis instance.
137
138        Parameters
139        ----------
140        input_image : ndarray, optional
141            Input image or 3D stack used for analysis. If the image is 3D, a
142            projection will be computed depending on the `typ` parameter.
143
144        image : ndarray, optional
145            2D image buffer used internally after projection of the input image.
146            Should not be set manually.
147
148        normalized_image_values : dict, optional
149            Dictionary containing normalized intensity statistics. Usually filled
150            automatically after running `run_calculations()`.
151
152        mask : ndarray, optional
153            Binary mask of the target region of interest (ROI). Required for
154            intensity and size calculations.
155
156        background_mask : ndarray, optional
157            Binary mask specifying the background region used to compute the
158            normalization threshold. If not provided, the ROI mask is also used
159            as the background reference.
160
161        typ : {"avg", "median", "std", "var", "max", "min"}, optional
162            Projection method for 3D images. Determines how the z-stack is
163            collapsed into a 2D image. Default is `"avg"`.
164
165        size_info : dict, optional
166            Dictionary storing computed size metrics of the ROI. Populated after
167            invoking `size_calculations()`.
168
169        correction_factor : float, optional
170            Correction term used during intensity normalization. Must satisfy
171            0 < correction_factor < 1. Default is 0.1.
172
173        img_type : str, optional
174            Optional metadata about the image type (e.g., "tiff", "png").
175
176        scale : float, optional
177            Pixel resolution in physical units (e.g., µm/px). Required for
178            real-size estimation in `size_calculations()`.
179
180        stack_selection : list of int, optional
181            Indices of z-planes to exclude during projection of a 3D stack.
182
183        Notes
184        -----
185        Values not provided are initialized to `None`, except for `typ`, which
186        defaults to `"avg"`, and `correction_factor`, which defaults to 0.1.
187
188        The class is designed to be populated by loading functions:
189        `load_image_()`, `load_image_3D()`, `load_mask_()`,
190        and optionally `load_normalization_mask_()` and `load_JIMG_project_()`.
191        """
192
193        self.input_image = input_image or None
194        """ Input image or 3D stack used for analysis. If the image is 3D, a
195         projection will be computed depending on the `typ` parameter."""
196
197        self.image = image or None
198        """  2D image buffer used internally after projection of the input image.
199          Should not be set manually."""
200
201        self.normalized_image_values = normalized_image_values or None
202        """Dictionary containing normalized intensity statistics. Usually filled
203        automatically after running `run_calculations()`."""
204
205        self.mask = mask or None
206        """Binary mask of the target region of interest (ROI). Required for
207        intensity and size calculations."""
208
209        self.background_mask = background_mask or None
210        """ Binary mask specifying the background region used to compute the
211         normalization threshold. If not provided, the ROI mask is also used
212         as the background reference."""
213
214        self.typ = typ or "avg"
215        """Projection method for 3D images. Determines how the z-stack is
216        collapsed into a 2D image. Default is `"avg"`."""
217
218        self.size_info = size_info or None
219        """Dictionary storing computed size metrics of the ROI. Populated after
220        invoking `size_calculations()`."""
221
222        self.correction_factor = correction_factor or 0.1
223        """ Correction term used during intensity normalization. Must satisfy
224         0 < correction_factor < 1. Default is 0.1."""
225
226        self.scale = scale or None
227        """ Pixel resolution in physical units (e.g., µm/px). Required for
228         real-size estimation in `size_calculations()`."""
229
230        self.stack_selection = stack_selection or []
231        """Indices of z-planes to exclude during projection of a 3D stack."""
232
233    @property
234    def current_metadata(self):
235        r"""
236        Return current metadata parameters used in image processing and normalization.
237
238        Returns
239        -------
240        tuple
241            A tuple containing:
242
243            projection_type : str
244                Projection method used for 3D image reduction (e.g., "avg", "median").
245
246            correction_factor : float
247                Correction factor used for background subtraction during intensity
248                normalization. The applied formula is:
249
250                .. math::
251
252                    R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
253
254                where
255                * ``R_{i,j}`` — normalized pixel intensity
256                * ``T_{i,j}`` — original pixel intensity
257                * ``μ_B`` — mean background intensity
258                * ``c`` — correction factor
259            scale : float or None
260                Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually
261                using `set_scale()`.
262
263            stack_selection : list of int
264                Indices of z-slices excluded from projection of a 3D image.
265
266        Notes
267        -----
268        This property also prints the metadata values to the console for quick inspection.
269        """
270
271        print(f"Projection type: {self.typ}")
272        print(f"Correction factor: {self.correction_factor}")
273        print(f"Scale (unit/px): {self.scale}")
274        print(f"Selected stac to remove: {self.stack_selection}")
275
276        return self.typ, self.correction_factor, self.scale, self.stack_selection
277
278    def set_projection(self, projection: str):
279        """
280        Set the projection method for 3D image stack reduction.
281
282        Parameters
283        ----------
284        projection : {"avg", "median", "std", "var", "max", "min"}
285            Projection method to reduce a 3D image stack to a 2D image. Default is `"avg"`.
286
287        Notes
288        -----
289        This method updates the `typ` attribute of the class. The selected projection
290        determines how the z-stack is collapsed:
291        - `"avg"` : average intensity across slices
292        - `"median"` : median intensity across slices
293        - `"std"` : standard deviation across slices
294        - `"var"` : variance across slices
295        - `"max"` : maximum intensity across slices
296        - `"min"` : minimum intensity across slices
297        """
298
299        t = ["avg", "median", "std", "var", "max", "min"]
300        if projection in t:
301            self.typ = projection
302        else:
303            print(f"\nProvided parameter is incorrect. Avaiable projection types: {t}")
304
305    def set_correction_factorn(self, factor: float):
306        r"""
307        Set the correction factor for background subtraction during intensity normalization.
308
309        Parameters
310        ----------
311        factor : float
312            Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1.
313            Default is 0.1.
314
315        Notes
316        -----
317        The correction is applied per pixel in the target mask using the formula:
318
319        .. math::
320
321            R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
322
323        where
324        * ``R_{i,j}`` — normalized pixel intensity
325        * ``T_{i,j}`` — original pixel intensity
326        * ``μ_B`` — mean intensity in the background mask
327        * ``c`` — correction factor
328        """
329
330        if factor < 1 and factor > 0:
331            self.correction_factor = factor
332        else:
333            print(
334                "\nProvided parameter is incorrect. The factor should be a floating-point value within the range of 0 to 1."
335            )
336
337    def set_scale(self, scale):
338        """
339        Set the scale for converting pixel measurements to physical units.
340
341        Parameters
342        ----------
343        scale : float
344            Pixel resolution in physical units (e.g., µm/px). Used to calculate the
345            actual size of the tissue or organ.
346
347        Notes
348        -----
349        The scale can also be automatically loaded from a JIMG project using
350        `load_JIMG_project_()`. This value is required for size calculations in
351        `size_calculations()`.
352        """
353
354        self.scale = scale
355
356    def set_selection_list(self, rm_list: list):
357        """
358        Set the list of z-slices to exclude when projecting a 3D image stack.
359
360        Parameters
361        ----------
362        rm_list : list of int
363            List of indices corresponding to z-slices that should be removed from
364            the full 3D image stack before projection.
365
366        Notes
367        -----
368        This updates the `stack_selection` attribute, which is used by the
369        `stack_selection_()` method during projection.
370        """
371
372        self.stack_selection = rm_list
373
374    def load_JIMG_project_(self, path):
375        """
376        Load a JIMG project from a `.pjm` file.
377
378        Parameters
379        ----------
380        file_path : str
381            Path to the JIMG project file. The file must have a `.pjm` extension.
382
383        Returns
384        -------
385        project : object
386            Loaded project object containing images and metadata.
387
388        Raises
389        ------
390        ValueError
391            If the provided file path does not point to a `.pjm` file.
392
393        Notes
394        -----
395        The method attempts to automatically set the `scale` and `stack_selection`
396        attributes from the project metadata if available.
397        """
398
399        path = os.path.abspath(path)
400
401        if ".pjm" in path:
402            metadata = self.load_JIMG_project(path)
403
404            try:
405                self.scale = metadata.metadata["X_resolution[um/px]"]
406            except:
407
408                try:
409                    self.scale = metadata.images_dict["metadata"][0][
410                        "X_resolution[um/px]"
411                    ]
412
413                except:
414                    print(
415                        '\nUnable to set scale on this project! Set scale using "set_scale()"'
416                    )
417
418            self.stack_selection = metadata.removal_list
419
420        else:
421            print(
422                "\nWrong path. The provided path does not point to a JIMG project (*.pjm)."
423            )
424
425    def stack_selection_(self):
426        """
427        Remove selected z-slices from a 3D image stack based on `stack_selection`.
428
429        Notes
430        -----
431        Only works if `input_image` is a 3D ndarray. The slices with indices listed
432        in `stack_selection` are excluded from the stack. Updates `input_image`
433        in-place.
434
435        Prints a warning if `stack_selection` is empty.
436        """
437
438        if len(self.input_image.shape) == 3:
439            if len(self.stack_selection) > 0:
440                self.input_image = self.input_image[
441                    [
442                        x
443                        for x in range(self.input_image.shape[0])
444                        if x not in self.stack_selection
445                    ]
446                ]
447            else:
448                print("\nImages to remove from the stack were not selected!")
449
450    def projection(self):
451        """
452        Project a 3D image stack into a 2D image using the method defined by `typ`.
453
454        Notes
455        -----
456        Updates the `image` attribute with the projected 2D result.
457
458        Supported projection types (`typ`):
459        - "avg" : mean intensity across slices
460        - "median" : median intensity across slices
461        - "std" : standard deviation across slices
462        - "var" : variance across slices
463        - "max" : maximum intensity across slices
464        - "min" : minimum intensity across slices
465
466        Raises
467        ------
468        AttributeError
469            If `input_image` is not defined.
470        """
471
472        if self.typ == "avg":
473            img = np.mean(self.input_image, axis=0)
474
475        elif self.typ == "std":
476            img = np.std(self.input_image, axis=0)
477
478        elif self.typ == "median":
479            img = np.median(self.input_image, axis=0)
480
481        elif self.typ == "var":
482            img = np.var(self.input_image, axis=0)
483
484        elif self.typ == "max":
485            img = np.max(self.input_image, axis=0)
486
487        elif self.typ == "min":
488            img = np.min(self.input_image, axis=0)
489
490        self.image = img
491
492    def detect_img(self):
493        """
494        Detect whether the input image is 2D or 3D and perform appropriate preprocessing.
495
496        Notes
497        -----
498        - For 3D images, applies `stack_selection_()` and then `projection()`.
499        - For 2D images, no projection is applied.
500        - Prints status messages indicating the type of image and applied operations.
501
502        Raises
503        ------
504        AttributeError
505            If `input_image` is not defined.
506        """
507        check = len(self.input_image.shape)
508
509        if check == 3:
510            print("\n3D image detected! Starting processing for 3D image...")
511            print(f"Projection - {self.typ}...")
512
513            self.stack_selection_()
514            self.projection()
515
516        elif check == 2:
517            print("\n2D image detected! Starting processing for 2D image...")
518
519        else:
520            print("\nData does not match any image type!")
521
522    def load_image_3D(self, path):
523        """
524        Load a 3D image stack from a TIFF file.
525
526        Parameters
527        ----------
528        path : str
529            Path to the 3D image file (*.tiff) to be loaded.
530
531        Notes
532        -----
533        The loaded image is stored in the `input_image` attribute as a 3D ndarray.
534        """
535        path = os.path.abspath(path)
536
537        self.input_image = self.load_3D_tiff(path)
538
539    def load_image_(self, path):
540        """
541        Load a 2D image into the class.
542
543        Parameters
544        ----------
545        path : str
546            Path to the image file to be loaded.
547
548        Notes
549        -----
550        The loaded image is stored in the `input_image` attribute as a 2D ndarray.
551        """
552        path = os.path.abspath(path)
553
554        self.input_image = self.load_image(path)
555
556    def load_mask_(self, path):
557        r"""
558        Load a binary mask into the class and optionally set it as the normalization mask.
559
560        Parameters
561        ----------
562        path : str
563            Path to the mask image file. Supported formats include 8-bit or 16-bit images
564            with extensions such as `.png` or `.jpeg`. The mask must be binary
565            (e.g., 0/255, 0/2**16-1, 0/1).
566
567        Notes
568        -----
569        - If `load_normalization_mask_()` is not called, this mask is also used as the
570          background mask for intensity normalization.
571        - Normalization is applied per pixel using the formula:
572
573          .. math::
574
575              R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
576
577          where
578          * ``R_{i,j}`` — normalized pixel intensity
579          * ``T_{i,j}`` — pixel intensity in the target mask
580          * ``μ_B`` — mean intensity of the background (reversed mask)
581          * ``c`` — correction factor
582        """
583
584        path = os.path.abspath(path)
585
586        self.mask = self.load_mask(path)
587
588        print(
589            "\nThis mask was also set as the reverse background mask. If you want a different background mask for normalization, use 'load_normalization_mask()'."
590        )
591        self.background_mask = self.load_mask(path)
592
593    def load_normalization_mask_(self, path):
594        r"""
595        Load a binary mask for normalization into the class.
596
597        Parameters
598        ----------
599        path : str
600            Path to the mask image file. Supported formats include 8-bit or 16-bit
601            images (e.g., `.png`, `.jpeg`). The mask must be binary (0/255, 0/2**16-1, 0/1).
602
603        Notes
604        -----
605        - The mask defines the area of interest. Normalization is applied to the inverse
606          of this area (reversed mask).
607        - Normalization formula applied per pixel:
608
609          .. math::
610
611              R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
612
613          where
614          * ``R_{i,j}`` — normalized pixel intensity
615          * ``T_{i,j}`` — pixel intensity in the target mask
616          * ``μ_B`` — mean intensity of the background (reversed mask)
617          * ``c`` — correction factor
618        """
619
620        path = os.path.abspath(path)
621
622        self.background_mask = self.load_mask(path)
623
624    def intensity_calculations(self):
625        """
626        Calculate normalized and raw intensity statistics from the image based on masks.
627
628        This method performs intensity calculations using the main mask (`self.mask`)
629        and the background mask (`self.background_mask`). The pixel intensities within
630        the mask of interest are normalized by subtracting a threshold derived from the
631        background region and applying a correction factor (`self.correction_factor`).
632        Negative values after normalization are clipped to zero.
633
634        The following statistics are computed for both normalized and raw values:
635        - Minimum
636        - Maximum
637        - Mean
638        - Median
639        - Standard deviation
640        - Variance
641        - List of all normalized values (only for normalized data)
642
643        Notes
644        -----
645        - The method updates the instance attribute `self.normalized_image_values`
646          with a dictionary containing both normalized and raw statistics.
647        - Normalization formula applied for each pixel in the selected mask:
648            final_val = selected_value - (threshold + threshold * correction_factor)
649          where threshold is the mean intensity in the background mask.
650        - Negative values after normalization are set to zero.
651        """
652
653        tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask)
654        tmp_bmask = self.ajd_mask_size(image=self.image, mask=self.background_mask)
655
656        selected_values = self.image[tmp_mask == np.max(tmp_mask)]
657
658        threshold = np.mean(self.image[tmp_bmask == np.min(tmp_bmask)])
659
660        # normalization
661        final_val = selected_values - (threshold + (threshold * self.correction_factor))
662
663        final_val[final_val < 0] = 0
664
665        tmp_dict = {
666            "norm_min": np.min(final_val),
667            "norm_max": np.max(final_val),
668            "norm_mean": np.mean(final_val),
669            "norm_median": np.median(final_val),
670            "norm_std": np.std(final_val),
671            "norm_var": np.var(final_val),
672            "norm_values": final_val.tolist(),
673            "min": np.min(selected_values),
674            "max": np.max(selected_values),
675            "mean": np.mean(selected_values),
676            "median": np.median(selected_values),
677            "std": np.std(selected_values),
678            "var": np.var(selected_values),
679        }
680
681        self.normalized_image_values = tmp_dict
682
683    def size_calculations(self):
684        """
685        Calculates the size and bounding dimensions of the masked region in the image.
686
687        This method computes the following metrics based on the current mask:
688            - Total number of pixels in the mask (`px_size`)
689            - Real-world size if a scale is provided (`size`)
690            - Maximum lengths along x and y axes (`max_length_x_axis`, `max_length_y_axis`)
691
692        If `self.scale` is defined (unit per pixel), the real-world size is calculated.
693        If not, `size` will be `None` and a warning message is printed.
694
695        Returns:
696            Updates the following attributes in the class:
697                - self.size_info (dict) containing:
698                    - 'size' (float or None): real-world size of the mask
699                    - 'px_size' (int): number of pixels in the masked region
700                    - 'max_length_x_axis' (int): length of the bounding box along the x-axis
701                    - 'max_length_y_axis' (int): length of the bounding box along the y-axis
702
703        Example:
704            analysis.size_calculations()
705            print(analysis.size_info)
706        """
707
708        tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask)
709
710        size_px = int(len(tmp_mask[tmp_mask > np.min(tmp_mask)]))
711
712        if self.scale is not None:
713            size = float(size_px * self.scale)
714        else:
715            size = None
716            print(
717                '\nUnable to calculate real size, scale (unit/px) not provided, use "set_scale()" or load JIMG project .pjm metadata "load_pjm()" to set scale for calculations!'
718            )
719
720        non_zero_indices = np.where(tmp_mask == np.max(tmp_mask))
721
722        min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0])
723        min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1])
724
725        max_length_x_axis = int(max_x - min_x + 1)
726        max_length_y_axis = int(max_y - min_y + 1)
727
728        tmp_val = {
729            "size": size,
730            "px_size": size_px,
731            "max_length_x_axis": max_length_x_axis,
732            "max_length_y_axis": max_length_y_axis,
733        }
734
735        self.size_info = tmp_val
736
737    def run_calculations(self):
738        """
739        Run the full analysis pipeline on the loaded image using the provided masks.
740
741        Notes
742        -----
743        - The input image must be loaded via `load_image_()` or `load_image_3D()`.
744        - The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization
745          mask can be loaded via `load_normalization_mask_()`.
746        - Parameters such as projection type and correction factor can be set with
747          `set_projection()` and `set_correction_factor()`.
748        - Scale and stack selection can also influence calculations if defined.
749        - To view current parameters, use the `current_metadata` property.
750
751        Returns
752        -------
753        None
754            The results are stored internally and can be retrieved using
755            `get_results()`.
756        """
757
758        if self.input_image is not None:
759
760            if self.mask is not None:
761
762                print("\nStart...")
763                self.detect_img()
764                self.intensity_calculations()
765                self.size_calculations()
766                print("\nCompleted!")
767
768    def get_results(self):
769        """
770        Return the results from the analysis performed by `run_calculations()`.
771
772        Returns
773        -------
774        results_dict : dict or None
775            Dictionary containing intensity and size results. Structure:
776            - 'intensity' : dict with normalized and raw intensity statistics
777            - 'size' : dict with ROI size metrics
778
779        Notes
780        -----
781        If analysis has not been run yet, prints a message and returns None.
782        """
783
784        if self.normalized_image_values is not None and self.size_info is not None:
785
786            results = {
787                "intensity": self.normalized_image_values,
788                "size": self.size_info,
789            }
790
791            return results
792
793        else:
794            print('\nAnalysis were not conducted. Run analysis "run_calculations()"')
795
796    def save_results(
797        self,
798        path="",
799        mask_region: str = "",
800        feature_name: str = "",
801        individual_number: int = 0,
802        individual_name: str = "",
803    ):
804        """
805        Save the analysis results to a `.int` (JSON) file.
806
807        Parameters
808        ----------
809        path : str, optional
810            Directory path where the file will be saved. Defaults to the current working directory.
811
812        mask_region : str
813            Name or identifier of the mask region (e.g., tissue, part of tissue).
814
815        feature_name : str
816            Name of the feature being analyzed. Underscores or spaces are replaced with periods.
817
818        individual_number : int
819            Unique identifier for the individual in the analysis (e.g., 1, 2, 3).
820
821        individual_name : str
822            Name of the individual (e.g., species name, tissue, organoid).
823
824        Notes
825        -----
826        - The method validates that all required parameters are provided and that
827          analysis results exist (`normalized_image_values` and `size_info`).
828        - Creates the directory if it does not exist.
829        - File name format:
830          '<individual_name>_<individual_number>_<mask_region>_<feature_name>.int'
831
832        Raises
833        ------
834        FileNotFoundError
835            If the specified path cannot be created or accessed.
836
837        ValueError
838            If any of `mask_region`, `feature_name`, `individual_number`, or
839            `individual_name` are missing or invalid.
840        """
841
842        path = os.path.abspath(path)
843
844        if (
845            len(mask_region) > 1
846            and len(feature_name) > 1
847            and individual_number != 0
848            and len(individual_name) > 1
849        ):
850
851            if self.normalized_image_values is not None and self.size_info is not None:
852
853                results = {
854                    "intensity": self.normalized_image_values,
855                    "size": self.size_info,
856                }
857
858                mask_region = re.sub(r"[_\s]+", ".", mask_region)
859                feature_name = re.sub(r"[_\s]+", ".", feature_name)
860                individual_number = re.sub(r"[_\s]+", ".", str(individual_number))
861                individual_name = re.sub(r"[_\s]+", ".", individual_name)
862
863                full_name = f"{individual_name}_{individual_number}_{mask_region}_{feature_name}"
864
865                isExist = os.path.exists(path)
866                if not isExist:
867                    os.makedirs(path, exist_ok=True)
868
869                full_path = os.path.join(
870                    path, re.sub("\\.json", "", full_name) + ".int"
871                )
872
873                with open(full_path, "w") as file:
874                    json.dump(results, file, indent=4)
875
876            else:
877                print(
878                    '\nAnalysis were not conducted. Run analysis "run_calculations()"'
879                )
880
881        else:
882            print(
883                "\nAny of 'mask_region', 'feature_name', 'individual_number', 'individual_name' parameters were provided wrong!"
884            )
885
886    def concatenate_intensity_data(self, directory: str = "", name: str = ""):
887        """
888        Concatenate intensity data from multiple `.int` files and save as CSV.
889
890        Parameters
891        ----------
892        directory : str, optional
893            Path to the directory containing `.int` files. Defaults to the current working directory.
894
895        name : str
896            Prefix for the output CSV file names. CSV files are saved in the format
897            '<name>_<gene>_<region>.csv'.
898
899        Raises
900        ------
901        FileNotFoundError
902            If the directory cannot be accessed or no `.int` files are found.
903
904        ValueError
905            If an `.int` file is missing expected data or has an incorrect format.
906
907        Notes
908        -----
909        - The method groups intensity data by gene (feature) and mask region.
910        - Outputs one CSV file per unique gene-region combination, saved in the specified directory.
911        """
912
913        directory = os.path.abspath(directory)
914
915        files_list = [f for f in os.listdir(directory) if f.endswith(".int")]
916
917        genes_set = set([re.sub("\\.int", "", x.split("_")[3]) for x in files_list])
918        regions_set = set([re.sub("\\.int", "", x.split("_")[2]) for x in files_list])
919
920        for g in genes_set:
921            for r in regions_set:
922                json_to_save = {
923                    "individual_name": [],
924                    "individual_number": [],
925                    "norm_intensity": [],
926                    "size": [],
927                }
928
929                for f in tqdm(files_list):
930                    if g in f and r in f:
931                        with open(os.path.join(directory, f), "r") as file:
932                            data = json.load(file)
933
934                            json_to_save["norm_intensity"] = (
935                                json_to_save["norm_intensity"]
936                                + data["intensity"]["norm_values"]
937                            )
938                            json_to_save["individual_name"] = json_to_save[
939                                "individual_name"
940                            ] + [f.split("_")[0]] * len(
941                                data["intensity"]["norm_values"]
942                            )
943                            json_to_save["individual_number"] = json_to_save[
944                                "individual_number"
945                            ] + [f.split("_")[1]] * len(
946                                data["intensity"]["norm_values"]
947                            )
948                            json_to_save["size"] = json_to_save["size"] + [
949                                data["size"]["px_size"]
950                            ] * len(data["intensity"]["norm_values"])
951
952        pd.DataFrame(json_to_save).to_csv(f"{name}_{g}_{r}.csv", index=False)

Class for quantitative analysis of pixel intensity and size measurements in 2D/3D biological images. Supports projection of 3D stacks, mask-based intensity normalization, region size estimation and metadata extraction.

Parameters

input_image : ndarray, optional Input image or 3D stack for analysis. If 3D, projection must be applied.

image : ndarray, optional 2D projected image (internal use).

normalized_image_values : dict, optional Dictionary storing normalized intensity statistics.

mask : ndarray, optional Binary mask of region of interest (ROI).

background_mask : ndarray, optional Binary mask used for background estimation. If not provided, mask is used.

typ : {"avg", "median", "std", "var", "max", "min"}, optional Projection type for 3D stacks. Default is "avg".

size_info : dict, optional Dictionary storing ROI size measurements.

correction_factor : float, optional Normalization correction factor applied to background intensity. Must satisfy 0 < factor < 1. Default is 0.1.

img_type : str, optional Image type metadata.

scale : float, optional Pixel resolution in physical units (e.g. µm/px). Used in size calculations.

stack_selection : list of int, optional List of Z-indices to remove when projecting a 3D image.

Attributes

input_image : ndarray or None Loaded input image.

image : ndarray or None Projected 2D image.

mask : ndarray or None Region of interest mask.

background_mask : ndarray or None Background normalization mask.

scale : float or None Scale value for size estimation.

normalized_image_values : dict or None Dictionary containing intensity metrics.

size_info : dict or None Dictionary with ROI size information.

typ : str Selected projection type for 3D images.

stack_selection : list of int Z-levels excluded from projection.

Notes

The intensity normalization formula applied per pixel is:

$$R_{i,j} = T_{i,j} - \left( \mu_B (1 + c) \right)$$

where

  • T_{i,j} – pixel intensity in ROI
  • μ_B – mean intensity in background region
  • c – correction factor
  • R_{i,j} – normalized pixel intensity

Examples

Load a 3D image, mask and compute statistics:

>>> fi = FeatureIntensity()
>>> fi.load_image_3D("stack.tiff")
>>> fi.load_mask_("mask.png")
>>> fi.set_projection("median")
>>> fi.run_calculations()
>>> results = fi.get_results()
>>> results["intensity"]["norm_mean"]
FeatureIntensity(input_image=None, image=None, normalized_image_values=None, mask=None, background_mask=None, typ=None, size_info=None, correction_factor=None, img_type=None, scale=None, stack_selection=None)
121    def __init__(
122        self,
123        input_image=None,
124        image=None,
125        normalized_image_values=None,
126        mask=None,
127        background_mask=None,
128        typ=None,
129        size_info=None,
130        correction_factor=None,
131        img_type=None,
132        scale=None,
133        stack_selection=None,
134    ):
135        """
136        Initialize a FeatureIntensity analysis instance.
137
138        Parameters
139        ----------
140        input_image : ndarray, optional
141            Input image or 3D stack used for analysis. If the image is 3D, a
142            projection will be computed depending on the `typ` parameter.
143
144        image : ndarray, optional
145            2D image buffer used internally after projection of the input image.
146            Should not be set manually.
147
148        normalized_image_values : dict, optional
149            Dictionary containing normalized intensity statistics. Usually filled
150            automatically after running `run_calculations()`.
151
152        mask : ndarray, optional
153            Binary mask of the target region of interest (ROI). Required for
154            intensity and size calculations.
155
156        background_mask : ndarray, optional
157            Binary mask specifying the background region used to compute the
158            normalization threshold. If not provided, the ROI mask is also used
159            as the background reference.
160
161        typ : {"avg", "median", "std", "var", "max", "min"}, optional
162            Projection method for 3D images. Determines how the z-stack is
163            collapsed into a 2D image. Default is `"avg"`.
164
165        size_info : dict, optional
166            Dictionary storing computed size metrics of the ROI. Populated after
167            invoking `size_calculations()`.
168
169        correction_factor : float, optional
170            Correction term used during intensity normalization. Must satisfy
171            0 < correction_factor < 1. Default is 0.1.
172
173        img_type : str, optional
174            Optional metadata about the image type (e.g., "tiff", "png").
175
176        scale : float, optional
177            Pixel resolution in physical units (e.g., µm/px). Required for
178            real-size estimation in `size_calculations()`.
179
180        stack_selection : list of int, optional
181            Indices of z-planes to exclude during projection of a 3D stack.
182
183        Notes
184        -----
185        Values not provided are initialized to `None`, except for `typ`, which
186        defaults to `"avg"`, and `correction_factor`, which defaults to 0.1.
187
188        The class is designed to be populated by loading functions:
189        `load_image_()`, `load_image_3D()`, `load_mask_()`,
190        and optionally `load_normalization_mask_()` and `load_JIMG_project_()`.
191        """
192
193        self.input_image = input_image or None
194        """ Input image or 3D stack used for analysis. If the image is 3D, a
195         projection will be computed depending on the `typ` parameter."""
196
197        self.image = image or None
198        """  2D image buffer used internally after projection of the input image.
199          Should not be set manually."""
200
201        self.normalized_image_values = normalized_image_values or None
202        """Dictionary containing normalized intensity statistics. Usually filled
203        automatically after running `run_calculations()`."""
204
205        self.mask = mask or None
206        """Binary mask of the target region of interest (ROI). Required for
207        intensity and size calculations."""
208
209        self.background_mask = background_mask or None
210        """ Binary mask specifying the background region used to compute the
211         normalization threshold. If not provided, the ROI mask is also used
212         as the background reference."""
213
214        self.typ = typ or "avg"
215        """Projection method for 3D images. Determines how the z-stack is
216        collapsed into a 2D image. Default is `"avg"`."""
217
218        self.size_info = size_info or None
219        """Dictionary storing computed size metrics of the ROI. Populated after
220        invoking `size_calculations()`."""
221
222        self.correction_factor = correction_factor or 0.1
223        """ Correction term used during intensity normalization. Must satisfy
224         0 < correction_factor < 1. Default is 0.1."""
225
226        self.scale = scale or None
227        """ Pixel resolution in physical units (e.g., µm/px). Required for
228         real-size estimation in `size_calculations()`."""
229
230        self.stack_selection = stack_selection or []
231        """Indices of z-planes to exclude during projection of a 3D stack."""

Initialize a FeatureIntensity analysis instance.

Parameters

input_image : ndarray, optional Input image or 3D stack used for analysis. If the image is 3D, a projection will be computed depending on the typ parameter.

image : ndarray, optional 2D image buffer used internally after projection of the input image. Should not be set manually.

normalized_image_values : dict, optional Dictionary containing normalized intensity statistics. Usually filled automatically after running run_calculations().

mask : ndarray, optional Binary mask of the target region of interest (ROI). Required for intensity and size calculations.

background_mask : ndarray, optional Binary mask specifying the background region used to compute the normalization threshold. If not provided, the ROI mask is also used as the background reference.

typ : {"avg", "median", "std", "var", "max", "min"}, optional Projection method for 3D images. Determines how the z-stack is collapsed into a 2D image. Default is "avg".

size_info : dict, optional Dictionary storing computed size metrics of the ROI. Populated after invoking size_calculations().

correction_factor : float, optional Correction term used during intensity normalization. Must satisfy 0 < correction_factor < 1. Default is 0.1.

img_type : str, optional Optional metadata about the image type (e.g., "tiff", "png").

scale : float, optional Pixel resolution in physical units (e.g., µm/px). Required for real-size estimation in size_calculations().

stack_selection : list of int, optional Indices of z-planes to exclude during projection of a 3D stack.

Notes

Values not provided are initialized to None, except for typ, which defaults to "avg", and correction_factor, which defaults to 0.1.

The class is designed to be populated by loading functions: load_image_(), load_image_3D(), load_mask_(), and optionally load_normalization_mask_() and load_JIMG_project_().

input_image

Input image or 3D stack used for analysis. If the image is 3D, a projection will be computed depending on the typ parameter.

image

2D image buffer used internally after projection of the input image. Should not be set manually.

normalized_image_values

Dictionary containing normalized intensity statistics. Usually filled automatically after running run_calculations().

mask

Binary mask of the target region of interest (ROI). Required for intensity and size calculations.

background_mask

Binary mask specifying the background region used to compute the normalization threshold. If not provided, the ROI mask is also used as the background reference.

typ

Projection method for 3D images. Determines how the z-stack is collapsed into a 2D image. Default is "avg".

size_info

Dictionary storing computed size metrics of the ROI. Populated after invoking size_calculations().

correction_factor

Correction term used during intensity normalization. Must satisfy 0 < correction_factor < 1. Default is 0.1.

scale

Pixel resolution in physical units (e.g., µm/px). Required for real-size estimation in size_calculations().

stack_selection

Indices of z-planes to exclude during projection of a 3D stack.

current_metadata
233    @property
234    def current_metadata(self):
235        r"""
236        Return current metadata parameters used in image processing and normalization.
237
238        Returns
239        -------
240        tuple
241            A tuple containing:
242
243            projection_type : str
244                Projection method used for 3D image reduction (e.g., "avg", "median").
245
246            correction_factor : float
247                Correction factor used for background subtraction during intensity
248                normalization. The applied formula is:
249
250                .. math::
251
252                    R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
253
254                where
255                * ``R_{i,j}`` — normalized pixel intensity
256                * ``T_{i,j}`` — original pixel intensity
257                * ``μ_B`` — mean background intensity
258                * ``c`` — correction factor
259            scale : float or None
260                Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually
261                using `set_scale()`.
262
263            stack_selection : list of int
264                Indices of z-slices excluded from projection of a 3D image.
265
266        Notes
267        -----
268        This property also prints the metadata values to the console for quick inspection.
269        """
270
271        print(f"Projection type: {self.typ}")
272        print(f"Correction factor: {self.correction_factor}")
273        print(f"Scale (unit/px): {self.scale}")
274        print(f"Selected stac to remove: {self.stack_selection}")
275
276        return self.typ, self.correction_factor, self.scale, self.stack_selection

Return current metadata parameters used in image processing and normalization.

Returns

tuple A tuple containing:

projection_type : str
    Projection method used for 3D image reduction (e.g., "avg", "median").

correction_factor : float
    Correction factor used for background subtraction during intensity
    normalization. The applied formula is:

    $$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$

    where
    * ``R_{i,j}`` — normalized pixel intensity
    * ``T_{i,j}`` — original pixel intensity
    * ``μ_B`` — mean background intensity
    * ``c`` — correction factor
scale : float or None
    Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually
    using `set_scale()`.

stack_selection : list of int
    Indices of z-slices excluded from projection of a 3D image.

Notes

This property also prints the metadata values to the console for quick inspection.

def set_projection(self, projection: str):
278    def set_projection(self, projection: str):
279        """
280        Set the projection method for 3D image stack reduction.
281
282        Parameters
283        ----------
284        projection : {"avg", "median", "std", "var", "max", "min"}
285            Projection method to reduce a 3D image stack to a 2D image. Default is `"avg"`.
286
287        Notes
288        -----
289        This method updates the `typ` attribute of the class. The selected projection
290        determines how the z-stack is collapsed:
291        - `"avg"` : average intensity across slices
292        - `"median"` : median intensity across slices
293        - `"std"` : standard deviation across slices
294        - `"var"` : variance across slices
295        - `"max"` : maximum intensity across slices
296        - `"min"` : minimum intensity across slices
297        """
298
299        t = ["avg", "median", "std", "var", "max", "min"]
300        if projection in t:
301            self.typ = projection
302        else:
303            print(f"\nProvided parameter is incorrect. Avaiable projection types: {t}")

Set the projection method for 3D image stack reduction.

Parameters

projection : {"avg", "median", "std", "var", "max", "min"} Projection method to reduce a 3D image stack to a 2D image. Default is "avg".

Notes

This method updates the typ attribute of the class. The selected projection determines how the z-stack is collapsed:

  • "avg" : average intensity across slices
  • "median" : median intensity across slices
  • "std" : standard deviation across slices
  • "var" : variance across slices
  • "max" : maximum intensity across slices
  • "min" : minimum intensity across slices
def set_correction_factorn(self, factor: float):
305    def set_correction_factorn(self, factor: float):
306        r"""
307        Set the correction factor for background subtraction during intensity normalization.
308
309        Parameters
310        ----------
311        factor : float
312            Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1.
313            Default is 0.1.
314
315        Notes
316        -----
317        The correction is applied per pixel in the target mask using the formula:
318
319        .. math::
320
321            R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
322
323        where
324        * ``R_{i,j}`` — normalized pixel intensity
325        * ``T_{i,j}`` — original pixel intensity
326        * ``μ_B`` — mean intensity in the background mask
327        * ``c`` — correction factor
328        """
329
330        if factor < 1 and factor > 0:
331            self.correction_factor = factor
332        else:
333            print(
334                "\nProvided parameter is incorrect. The factor should be a floating-point value within the range of 0 to 1."
335            )

Set the correction factor for background subtraction during intensity normalization.

Parameters

factor : float Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1. Default is 0.1.

Notes

The correction is applied per pixel in the target mask using the formula:

$$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$

where

  • R_{i,j} — normalized pixel intensity
  • T_{i,j} — original pixel intensity
  • μ_B — mean intensity in the background mask
  • c — correction factor
def set_scale(self, scale):
337    def set_scale(self, scale):
338        """
339        Set the scale for converting pixel measurements to physical units.
340
341        Parameters
342        ----------
343        scale : float
344            Pixel resolution in physical units (e.g., µm/px). Used to calculate the
345            actual size of the tissue or organ.
346
347        Notes
348        -----
349        The scale can also be automatically loaded from a JIMG project using
350        `load_JIMG_project_()`. This value is required for size calculations in
351        `size_calculations()`.
352        """
353
354        self.scale = scale

Set the scale for converting pixel measurements to physical units.

Parameters

scale : float Pixel resolution in physical units (e.g., µm/px). Used to calculate the actual size of the tissue or organ.

Notes

The scale can also be automatically loaded from a JIMG project using load_JIMG_project_(). This value is required for size calculations in size_calculations().

def set_selection_list(self, rm_list: list):
356    def set_selection_list(self, rm_list: list):
357        """
358        Set the list of z-slices to exclude when projecting a 3D image stack.
359
360        Parameters
361        ----------
362        rm_list : list of int
363            List of indices corresponding to z-slices that should be removed from
364            the full 3D image stack before projection.
365
366        Notes
367        -----
368        This updates the `stack_selection` attribute, which is used by the
369        `stack_selection_()` method during projection.
370        """
371
372        self.stack_selection = rm_list

Set the list of z-slices to exclude when projecting a 3D image stack.

Parameters

rm_list : list of int List of indices corresponding to z-slices that should be removed from the full 3D image stack before projection.

Notes

This updates the stack_selection attribute, which is used by the stack_selection_() method during projection.

def load_JIMG_project_(self, path):
374    def load_JIMG_project_(self, path):
375        """
376        Load a JIMG project from a `.pjm` file.
377
378        Parameters
379        ----------
380        file_path : str
381            Path to the JIMG project file. The file must have a `.pjm` extension.
382
383        Returns
384        -------
385        project : object
386            Loaded project object containing images and metadata.
387
388        Raises
389        ------
390        ValueError
391            If the provided file path does not point to a `.pjm` file.
392
393        Notes
394        -----
395        The method attempts to automatically set the `scale` and `stack_selection`
396        attributes from the project metadata if available.
397        """
398
399        path = os.path.abspath(path)
400
401        if ".pjm" in path:
402            metadata = self.load_JIMG_project(path)
403
404            try:
405                self.scale = metadata.metadata["X_resolution[um/px]"]
406            except:
407
408                try:
409                    self.scale = metadata.images_dict["metadata"][0][
410                        "X_resolution[um/px]"
411                    ]
412
413                except:
414                    print(
415                        '\nUnable to set scale on this project! Set scale using "set_scale()"'
416                    )
417
418            self.stack_selection = metadata.removal_list
419
420        else:
421            print(
422                "\nWrong path. The provided path does not point to a JIMG project (*.pjm)."
423            )

Load a JIMG project from a .pjm file.

Parameters

path : str Path to the JIMG project file. The file must have a .pjm extension.

Returns

project : object Loaded project object containing images and metadata.

Raises

ValueError Documented for an invalid path; note that the current implementation prints a warning and loads nothing instead of raising.

Notes

The method attempts to automatically set the scale and stack_selection attributes from the project metadata if available.

def stack_selection_(self):
425    def stack_selection_(self):
426        """
427        Remove selected z-slices from a 3D image stack based on `stack_selection`.
428
429        Notes
430        -----
431        Only works if `input_image` is a 3D ndarray. The slices with indices listed
432        in `stack_selection` are excluded from the stack. Updates `input_image`
433        in-place.
434
435        Prints a warning if `stack_selection` is empty.
436        """
437
438        if len(self.input_image.shape) == 3:
439            if len(self.stack_selection) > 0:
440                self.input_image = self.input_image[
441                    [
442                        x
443                        for x in range(self.input_image.shape[0])
444                        if x not in self.stack_selection
445                    ]
446                ]
447            else:
448                print("\nImages to remove from the stack were not selected!")

Remove selected z-slices from a 3D image stack based on stack_selection.

Notes

Only works if input_image is a 3D ndarray. The slices with indices listed in stack_selection are excluded from the stack. Updates input_image in-place.

Prints a warning if stack_selection is empty.

def projection(self):
450    def projection(self):
451        """
452        Project a 3D image stack into a 2D image using the method defined by `typ`.
453
454        Notes
455        -----
456        Updates the `image` attribute with the projected 2D result.
457
458        Supported projection types (`typ`):
459        - "avg" : mean intensity across slices
460        - "median" : median intensity across slices
461        - "std" : standard deviation across slices
462        - "var" : variance across slices
463        - "max" : maximum intensity across slices
464        - "min" : minimum intensity across slices
465
466        Raises
467        ------
468        AttributeError
469            If `input_image` is not defined.
470        """
471
472        if self.typ == "avg":
473            img = np.mean(self.input_image, axis=0)
474
475        elif self.typ == "std":
476            img = np.std(self.input_image, axis=0)
477
478        elif self.typ == "median":
479            img = np.median(self.input_image, axis=0)
480
481        elif self.typ == "var":
482            img = np.var(self.input_image, axis=0)
483
484        elif self.typ == "max":
485            img = np.max(self.input_image, axis=0)
486
487        elif self.typ == "min":
488            img = np.min(self.input_image, axis=0)
489
490        self.image = img

Project a 3D image stack into a 2D image using the method defined by typ.

Notes

Updates the image attribute with the projected 2D result.

Supported projection types (typ):

  • "avg" : mean intensity across slices
  • "median" : median intensity across slices
  • "std" : standard deviation across slices
  • "var" : variance across slices
  • "max" : maximum intensity across slices
  • "min" : minimum intensity across slices

Raises

AttributeError If input_image is not defined.

def detect_img(self):
492    def detect_img(self):
493        """
494        Detect whether the input image is 2D or 3D and perform appropriate preprocessing.
495
496        Notes
497        -----
498        - For 3D images, applies `stack_selection_()` and then `projection()`.
499        - For 2D images, no projection is applied.
500        - Prints status messages indicating the type of image and applied operations.
501
502        Raises
503        ------
504        AttributeError
505            If `input_image` is not defined.
506        """
507        check = len(self.input_image.shape)
508
509        if check == 3:
510            print("\n3D image detected! Starting processing for 3D image...")
511            print(f"Projection - {self.typ}...")
512
513            self.stack_selection_()
514            self.projection()
515
516        elif check == 2:
517            print("\n2D image detected! Starting processing for 2D image...")
518
519        else:
520            print("\nData does not match any image type!")

Detect whether the input image is 2D or 3D and perform appropriate preprocessing.

Notes

  • For 3D images, applies stack_selection_() and then projection().
  • For 2D images, no projection is applied.
  • Prints status messages indicating the type of image and applied operations.

Raises

AttributeError If input_image is not defined.

def load_image_3D(self, path):
522    def load_image_3D(self, path):
523        """
524        Load a 3D image stack from a TIFF file.
525
526        Parameters
527        ----------
528        path : str
529            Path to the 3D image file (*.tiff) to be loaded.
530
531        Notes
532        -----
533        The loaded image is stored in the `input_image` attribute as a 3D ndarray.
534        """
535        path = os.path.abspath(path)
536
537        self.input_image = self.load_3D_tiff(path)

Load a 3D image stack from a TIFF file.

Parameters

path : str Path to the 3D image file (*.tiff) to be loaded.

Notes

The loaded image is stored in the input_image attribute as a 3D ndarray.

def load_image_(self, path):
539    def load_image_(self, path):
540        """
541        Load a 2D image into the class.
542
543        Parameters
544        ----------
545        path : str
546            Path to the image file to be loaded.
547
548        Notes
549        -----
550        The loaded image is stored in the `input_image` attribute as a 2D ndarray.
551        """
552        path = os.path.abspath(path)
553
554        self.input_image = self.load_image(path)

Load a 2D image into the class.

Parameters

path : str Path to the image file to be loaded.

Notes

The loaded image is stored in the input_image attribute as a 2D ndarray.

def load_mask_(self, path):
556    def load_mask_(self, path):
557        r"""
558        Load a binary mask into the class and optionally set it as the normalization mask.
559
560        Parameters
561        ----------
562        path : str
563            Path to the mask image file. Supported formats include 8-bit or 16-bit images
564            with extensions such as `.png` or `.jpeg`. The mask must be binary
565            (e.g., 0/255, 0/2**16-1, 0/1).
566
567        Notes
568        -----
569        - If `load_normalization_mask_()` is not called, this mask is also used as the
570          background mask for intensity normalization.
571        - Normalization is applied per pixel using the formula:
572
573          .. math::
574
575              R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
576
577          where
578          * ``R_{i,j}`` — normalized pixel intensity
579          * ``T_{i,j}`` — pixel intensity in the target mask
580          * ``μ_B`` — mean intensity of the background (reversed mask)
581          * ``c`` — correction factor
582        """
583
584        path = os.path.abspath(path)
585
586        self.mask = self.load_mask(path)
587
588        print(
589            "\nThis mask was also set as the reverse background mask. If you want a different background mask for normalization, use 'load_normalization_mask()'."
590        )
591        self.background_mask = self.load_mask(path)

Load a binary mask into the class and optionally set it as the normalization mask.

Parameters

path : str Path to the mask image file. Supported formats include 8-bit or 16-bit images with extensions such as .png or .jpeg. The mask must be binary (e.g., 0/255, 0/2**16-1, 0/1).

Notes

  • If load_normalization_mask_() is not called, this mask is also used as the background mask for intensity normalization.
  • Normalization is applied per pixel using the formula:

    $$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$

    where

    • R_{i,j} — normalized pixel intensity
    • T_{i,j} — pixel intensity in the target mask
    • μ_B — mean intensity of the background (reversed mask)
    • c — correction factor
def load_normalization_mask_(self, path):
593    def load_normalization_mask_(self, path):
594        r"""
595        Load a binary mask for normalization into the class.
596
597        Parameters
598        ----------
599        path : str
600            Path to the mask image file. Supported formats include 8-bit or 16-bit
601            images (e.g., `.png`, `.jpeg`). The mask must be binary (0/255, 0/2**16-1, 0/1).
602
603        Notes
604        -----
605        - The mask defines the area of interest. Normalization is applied to the inverse
606          of this area (reversed mask).
607        - Normalization formula applied per pixel:
608
609          .. math::
610
611              R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )
612
613          where
614          * ``R_{i,j}`` — normalized pixel intensity
615          * ``T_{i,j}`` — pixel intensity in the target mask
616          * ``μ_B`` — mean intensity of the background (reversed mask)
617          * ``c`` — correction factor
618        """
619
620        path = os.path.abspath(path)
621
622        self.background_mask = self.load_mask(path)

Load a binary mask for normalization into the class.

Parameters

path : str Path to the mask image file. Supported formats include 8-bit or 16-bit images (e.g., .png, .jpeg). The mask must be binary (0/255, 0/2**16-1, 0/1).

Notes

  • The mask defines the area of interest. Normalization is applied to the inverse of this area (reversed mask).
  • Normalization formula applied per pixel:

    $$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$

    where

    • R_{i,j} — normalized pixel intensity
    • T_{i,j} — pixel intensity in the target mask
    • μ_B — mean intensity of the background (reversed mask)
    • c — correction factor
def intensity_calculations(self):
624    def intensity_calculations(self):
625        """
626        Calculate normalized and raw intensity statistics from the image based on masks.
627
628        This method performs intensity calculations using the main mask (`self.mask`)
629        and the background mask (`self.background_mask`). The pixel intensities within
630        the mask of interest are normalized by subtracting a threshold derived from the
631        background region and applying a correction factor (`self.correction_factor`).
632        Negative values after normalization are clipped to zero.
633
634        The following statistics are computed for both normalized and raw values:
635        - Minimum
636        - Maximum
637        - Mean
638        - Median
639        - Standard deviation
640        - Variance
641        - List of all normalized values (only for normalized data)
642
643        Notes
644        -----
645        - The method updates the instance attribute `self.normalized_image_values`
646          with a dictionary containing both normalized and raw statistics.
647        - Normalization formula applied for each pixel in the selected mask:
648            final_val = selected_value - (threshold + threshold * correction_factor)
649          where threshold is the mean intensity in the background mask.
650        - Negative values after normalization are set to zero.
651        """
652
653        tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask)
654        tmp_bmask = self.ajd_mask_size(image=self.image, mask=self.background_mask)
655
656        selected_values = self.image[tmp_mask == np.max(tmp_mask)]
657
658        threshold = np.mean(self.image[tmp_bmask == np.min(tmp_bmask)])
659
660        # normalization
661        final_val = selected_values - (threshold + (threshold * self.correction_factor))
662
663        final_val[final_val < 0] = 0
664
665        tmp_dict = {
666            "norm_min": np.min(final_val),
667            "norm_max": np.max(final_val),
668            "norm_mean": np.mean(final_val),
669            "norm_median": np.median(final_val),
670            "norm_std": np.std(final_val),
671            "norm_var": np.var(final_val),
672            "norm_values": final_val.tolist(),
673            "min": np.min(selected_values),
674            "max": np.max(selected_values),
675            "mean": np.mean(selected_values),
676            "median": np.median(selected_values),
677            "std": np.std(selected_values),
678            "var": np.var(selected_values),
679        }
680
681        self.normalized_image_values = tmp_dict

Calculate normalized and raw intensity statistics from the image based on masks.

This method performs intensity calculations using the main mask (self.mask) and the background mask (self.background_mask). The pixel intensities within the mask of interest are normalized by subtracting a threshold derived from the background region and applying a correction factor (self.correction_factor). Negative values after normalization are clipped to zero.

The following statistics are computed for both normalized and raw values:

  • Minimum
  • Maximum
  • Mean
  • Median
  • Standard deviation
  • Variance
  • List of all normalized values (only for normalized data)

Notes

  • The method updates the instance attribute self.normalized_image_values with a dictionary containing both normalized and raw statistics.
  • Normalization formula applied for each pixel in the selected mask: final_val = selected_value - (threshold + threshold * correction_factor) where threshold is the mean intensity in the background mask.
  • Negative values after normalization are set to zero.
def size_calculations(self):
683    def size_calculations(self):
684        """
685        Calculates the size and bounding dimensions of the masked region in the image.
686
687        This method computes the following metrics based on the current mask:
688            - Total number of pixels in the mask (`px_size`)
689            - Real-world size if a scale is provided (`size`)
690            - Maximum lengths along x and y axes (`max_length_x_axis`, `max_length_y_axis`)
691
692        If `self.scale` is defined (unit per pixel), the real-world size is calculated.
693        If not, `size` will be `None` and a warning message is printed.
694
695        Returns:
696            Updates the following attributes in the class:
697                - self.size_info (dict) containing:
698                    - 'size' (float or None): real-world size of the mask
699                    - 'px_size' (int): number of pixels in the masked region
700                    - 'max_length_x_axis' (int): length of the bounding box along the x-axis
701                    - 'max_length_y_axis' (int): length of the bounding box along the y-axis
702
703        Example:
704            analysis.size_calculations()
705            print(analysis.size_info)
706        """
707
708        tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask)
709
710        size_px = int(len(tmp_mask[tmp_mask > np.min(tmp_mask)]))
711
712        if self.scale is not None:
713            size = float(size_px * self.scale)
714        else:
715            size = None
716            print(
717                '\nUnable to calculate real size, scale (unit/px) not provided, use "set_scale()" or load JIMG project .pjm metadata "load_pjm()" to set scale for calculations!'
718            )
719
720        non_zero_indices = np.where(tmp_mask == np.max(tmp_mask))
721
722        min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0])
723        min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1])
724
725        max_length_x_axis = int(max_x - min_x + 1)
726        max_length_y_axis = int(max_y - min_y + 1)
727
728        tmp_val = {
729            "size": size,
730            "px_size": size_px,
731            "max_length_x_axis": max_length_x_axis,
732            "max_length_y_axis": max_length_y_axis,
733        }
734
735        self.size_info = tmp_val

Calculates the size and bounding dimensions of the masked region in the image.

This method computes the following metrics based on the current mask: - Total number of pixels in the mask (px_size) - Real-world size if a scale is provided (size) - Maximum lengths along x and y axes (max_length_x_axis, max_length_y_axis)

If self.scale is defined (unit per pixel), the real-world size is calculated. If not, size will be None and a warning message is printed.

Returns: Updates the following attributes in the class: - self.size_info (dict) containing: - 'size' (float or None): real-world size of the mask - 'px_size' (int): number of pixels in the masked region - 'max_length_x_axis' (int): length of the bounding box along the x-axis - 'max_length_y_axis' (int): length of the bounding box along the y-axis

Example: analysis.size_calculations() print(analysis.size_info)

def run_calculations(self):
737    def run_calculations(self):
738        """
739        Run the full analysis pipeline on the loaded image using the provided masks.
740
741        Notes
742        -----
743        - The input image must be loaded via `load_image_()` or `load_image_3D()`.
744        - The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization
745          mask can be loaded via `load_normalization_mask_()`.
746        - Parameters such as projection type and correction factor can be set with
747          `set_projection()` and `set_correction_factor()`.
748        - Scale and stack selection can also influence calculations if defined.
749        - To view current parameters, use the `current_metadata` property.
750
751        Returns
752        -------
753        None
754            The results are stored internally and can be retrieved using
755            `get_results()`.
756        """
757
758        if self.input_image is not None:
759
760            if self.mask is not None:
761
762                print("\nStart...")
763                self.detect_img()
764                self.intensity_calculations()
765                self.size_calculations()
766                print("\nCompleted!")

Run the full analysis pipeline on the loaded image using the provided masks.

Notes

  • The input image must be loaded via load_image_() or load_image_3D(); the ROI mask via load_mask_(). Optionally, a normalization mask can be loaded via load_normalization_mask_().
  • Parameters such as projection type and correction factor can be set with set_projection() and set_correction_factor(); scale and stack selection also influence the calculations when defined. To view current parameters, use the current_metadata property.
Returns

None The results are stored internally and can be retrieved using get_results().

def get_results(self):
768    def get_results(self):
769        """
770        Return the results from the analysis performed by `run_calculations()`.
771
772        Returns
773        -------
774        results_dict : dict or None
775            Dictionary containing intensity and size results. Structure:
776            - 'intensity' : dict with normalized and raw intensity statistics
777            - 'size' : dict with ROI size metrics
778
779        Notes
780        -----
781        If analysis has not been run yet, prints a message and returns None.
782        """
783
784        if self.normalized_image_values is not None and self.size_info is not None:
785
786            results = {
787                "intensity": self.normalized_image_values,
788                "size": self.size_info,
789            }
790
791            return results
792
793        else:
794            print('\nAnalysis were not conducted. Run analysis "run_calculations()"')

Return the results from the analysis performed by run_calculations().

Returns

results_dict : dict or None Dictionary containing intensity and size results. Structure: - 'intensity' : dict with normalized and raw intensity statistics - 'size' : dict with ROI size metrics

Notes

If analysis has not been run yet, prints a message and returns None.

def save_results( self, path='', mask_region: str = '', feature_name: str = '', individual_number: int = 0, individual_name: str = ''):
796    def save_results(
797        self,
798        path="",
799        mask_region: str = "",
800        feature_name: str = "",
801        individual_number: int = 0,
802        individual_name: str = "",
803    ):
804        """
805        Save the analysis results to a `.int` (JSON) file.
806
807        Parameters
808        ----------
809        path : str, optional
810            Directory path where the file will be saved. Defaults to the current working directory.
811
812        mask_region : str
813            Name or identifier of the mask region (e.g., tissue, part of tissue).
814
815        feature_name : str
816            Name of the feature being analyzed. Underscores or spaces are replaced with periods.
817
818        individual_number : int
819            Unique identifier for the individual in the analysis (e.g., 1, 2, 3).
820
821        individual_name : str
822            Name of the individual (e.g., species name, tissue, organoid).
823
824        Notes
825        -----
826        - The method validates that all required parameters are provided and that
827          analysis results exist (`normalized_image_values` and `size_info`).
828        - Creates the directory if it does not exist.
829        - File name format:
830          '<individual_name>_<individual_number>_<mask_region>_<feature_name>.int'
831
832        Raises
833        ------
834        FileNotFoundError
835            If the specified path cannot be created or accessed.
836
837        ValueError
838            If any of `mask_region`, `feature_name`, `individual_number`, or
839            `individual_name` are missing or invalid.
840        """
841
842        path = os.path.abspath(path)
843
844        if (
845            len(mask_region) > 1
846            and len(feature_name) > 1
847            and individual_number != 0
848            and len(individual_name) > 1
849        ):
850
851            if self.normalized_image_values is not None and self.size_info is not None:
852
853                results = {
854                    "intensity": self.normalized_image_values,
855                    "size": self.size_info,
856                }
857
858                mask_region = re.sub(r"[_\s]+", ".", mask_region)
859                feature_name = re.sub(r"[_\s]+", ".", feature_name)
860                individual_number = re.sub(r"[_\s]+", ".", str(individual_number))
861                individual_name = re.sub(r"[_\s]+", ".", individual_name)
862
863                full_name = f"{individual_name}_{individual_number}_{mask_region}_{feature_name}"
864
865                isExist = os.path.exists(path)
866                if not isExist:
867                    os.makedirs(path, exist_ok=True)
868
869                full_path = os.path.join(
870                    path, re.sub("\\.json", "", full_name) + ".int"
871                )
872
873                with open(full_path, "w") as file:
874                    json.dump(results, file, indent=4)
875
876            else:
877                print(
878                    '\nAnalysis were not conducted. Run analysis "run_calculations()"'
879                )
880
881        else:
882            print(
883                "\nAny of 'mask_region', 'feature_name', 'individual_number', 'individual_name' parameters were provided wrong!"
884            )

Save the analysis results to a .int (JSON) file.

Parameters

path : str, optional Directory path where the file will be saved. Defaults to the current working directory.

mask_region : str Name or identifier of the mask region (e.g., tissue, part of tissue).

feature_name : str Name of the feature being analyzed. Underscores or spaces are replaced with periods.

individual_number : int Unique identifier for the individual in the analysis (e.g., 1, 2, 3).

individual_name : str Name of the individual (e.g., species name, tissue, organoid).

Notes

  • The method validates that all required parameters are provided and that analysis results exist (normalized_image_values and size_info).
  • Creates the directory if it does not exist.
  • File name format: '&lt;individual_name&gt;_&lt;individual_number&gt;_&lt;mask_region&gt;_&lt;feature_name&gt;.int'

Raises

FileNotFoundError If the specified path cannot be created or accessed.

ValueError If any of mask_region, feature_name, individual_number, or individual_name are missing or invalid.

def concatenate_intensity_data(self, directory: str = '', name: str = ''):
886    def concatenate_intensity_data(self, directory: str = "", name: str = ""):
887        """
888        Concatenate intensity data from multiple `.int` files and save as CSV.
889
890        Parameters
891        ----------
892        directory : str, optional
893            Path to the directory containing `.int` files. Defaults to the current working directory.
894
895        name : str
896            Prefix for the output CSV file names. CSV files are saved in the format
897            '<name>_<gene>_<region>.csv'.
898
899        Raises
900        ------
901        FileNotFoundError
902            If the directory cannot be accessed or no `.int` files are found.
903
904        ValueError
905            If an `.int` file is missing expected data or has an incorrect format.
906
907        Notes
908        -----
909        - The method groups intensity data by gene (feature) and mask region.
910        - Outputs one CSV file per unique gene-region combination, saved in the specified directory.
911        """
912
913        directory = os.path.abspath(directory)
914
915        files_list = [f for f in os.listdir(directory) if f.endswith(".int")]
916
917        genes_set = set([re.sub("\\.int", "", x.split("_")[3]) for x in files_list])
918        regions_set = set([re.sub("\\.int", "", x.split("_")[2]) for x in files_list])
919
920        for g in genes_set:
921            for r in regions_set:
922                json_to_save = {
923                    "individual_name": [],
924                    "individual_number": [],
925                    "norm_intensity": [],
926                    "size": [],
927                }
928
929                for f in tqdm(files_list):
930                    if g in f and r in f:
931                        with open(os.path.join(directory, f), "r") as file:
932                            data = json.load(file)
933
934                            json_to_save["norm_intensity"] = (
935                                json_to_save["norm_intensity"]
936                                + data["intensity"]["norm_values"]
937                            )
938                            json_to_save["individual_name"] = json_to_save[
939                                "individual_name"
940                            ] + [f.split("_")[0]] * len(
941                                data["intensity"]["norm_values"]
942                            )
943                            json_to_save["individual_number"] = json_to_save[
944                                "individual_number"
945                            ] + [f.split("_")[1]] * len(
946                                data["intensity"]["norm_values"]
947                            )
948                            json_to_save["size"] = json_to_save["size"] + [
949                                data["size"]["px_size"]
950                            ] * len(data["intensity"]["norm_values"])
951
952        pd.DataFrame(json_to_save).to_csv(f"{name}_{g}_{r}.csv", index=False)

Concatenate intensity data from multiple .int files and save as CSV.

Parameters

directory : str, optional Path to the directory containing .int files. Defaults to the current working directory.

name : str Prefix for the output CSV file names. CSV files are saved in the format '&lt;name&gt;_&lt;gene&gt;_&lt;region&gt;.csv'.

Raises

FileNotFoundError If the directory cannot be accessed or no .int files are found.

ValueError If an .int file is missing expected data or has an incorrect format.

Notes

  • The method groups intensity data by gene (feature) and mask region.
  • Outputs one CSV file per unique gene-region combination, saved in the specified directory.
class IntensityAnalysis:
class IntensityAnalysis:
    """
    Class for performing percentile-based statistical analysis on grouped data.

    This class provides methods to calculate percentiles, remove upper outliers,
    aggregate data into percentile bins, perform Welch's ANOVA and Chi-squared
    tests, and visualize results via comparative histograms. It is designed to
    handle both single-column and multi-column combinations of values for
    group-based analysis.

    Methods
    -------
    drop_up_df(data, group_col, values_col)
        Removes upper outliers (IQR rule, per group) from a DataFrame.

    percentiles_calculation(values, sep_perc=1)
        Calculates percentile cut points and loopable percentile index ranges.

    to_percentil(values, percentiles, percentiles_loop)
        Aggregates count/mean/median/std/var statistics per percentile range.

    df_to_percentiles(data, group_col, values_col, sep_perc=1, drop_outlires=True)
        Computes percentile statistics for each group of a DataFrame.

    round_to_scientific_notation(num)
        Formats a number in scientific notation (if < 1e-4) or to one decimal.

    aov_percentiles(data, testes_col, comb="*")
        Performs Welch's ANOVA on percentile-based group data.

    post_aov_percentiles(data, testes_col, comb="*")
        Performs Welch's ANOVA plus pairwise Welch's t-tests (Bonferroni).

    chi2_percentiles(input_hist)
        Performs a Chi-squared test on percentile histogram data.

    post_ch2_percentiles(input_hist)
        Performs a Chi-squared test with pairwise comparisons (Bonferroni).

    hist_compare_plot(data, queue, tested_value, p_adj=True, txt_size=20)
        Generates comparative histograms annotated with test results.
    """
 996
 997    def drop_up_df(self, data: pd.DataFrame, group_col: str, values_col: str):
 998        """
 999        Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column.
1000
1001        Outliers are calculated and removed separately for each group defined by `group_col`.
1002        The upper outliers are defined using the interquartile range (IQR) method:
1003            values greater than Q3 + 1.5 * IQR are considered outliers.
1004
1005        Parameters
1006        ----------
1007        data : pd.DataFrame
1008            The input DataFrame containing the data.
1009
1010        group_col : str
1011            The name of the column used for grouping the data.
1012
1013        values_col : str
1014            The column containing the values from which upper outliers will be removed.
1015
1016        Returns
1017        -------
1018        filtered_data : pd.DataFrame
1019            A filtered DataFrame with the upper outliers removed for each group.
1020
1021        Notes
1022        -----
1023        - Outliers are removed separately within each group.
1024        - The original DataFrame is not modified; a new filtered DataFrame is returned.
1025        """
1026
1027        def iqr_filter(group):
1028            q75 = np.quantile(group[values_col], 0.75)
1029            q25 = np.quantile(group[values_col], 0.25)
1030            itq = q75 - q25
1031            return group[group[values_col] <= (q75 + 1.5 * itq)]
1032
1033        filtered_data = data.groupby(group_col).apply(iqr_filter).reset_index(drop=True)
1034
1035        return filtered_data
1036
1037    def percentiles_calculation(self, values, sep_perc: int = 1):
1038        """
1039        Calculate percentiles for a set of values and generate consecutive percentile ranges.
1040
1041        This function computes percentiles from 0 to 100 at intervals defined by `sep_perc`.
1042        It also generates a list of consecutive percentile ranges that can be used for further analysis or binning.
1043
1044        Parameters
1045        ----------
1046        values : array-like
1047            The input data values for which the percentiles are calculated.
1048
1049        sep_perc : int, optional
1050            Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%).
1051
1052        Returns
1053        -------
1054        percentiles : np.ndarray
1055            Array of calculated percentile values.
1056
1057        percentiles_loop : list of tuple
1058            List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)].
1059
1060        Notes
1061        -----
1062        - The first percentile is set to 0 to avoid issues with zero values.
1063        - `percentiles_loop` is useful for iterating through percentile ranges when aggregating statistics.
1064        """
1065
1066        per_vector = values.copy()
1067
1068        percentiles = np.percentile(per_vector, np.arange(0, 101, sep_perc))
1069        percentiles[0] = 0
1070
1071        percentiles_loop = [(i, i + 1) for i in range(int(100 / sep_perc))]
1072
1073        return percentiles, percentiles_loop
1074
1075    def to_percentil(self, values, percentiles, percentiles_loop):
1076        """
1077        Aggregate statistics for a set of values based on percentile ranges.
1078
1079        This function calculates summary statistics for each percentile range defined in `percentiles_loop`,
1080        using the percentile values calculated by `percentiles_calculation()`. Statistics include count, proportion,
1081        mean, median, standard deviation, and variance.
1082
1083        Parameters
1084        ----------
1085        values : array-like
1086            Input data values for which the statistics are calculated.
1087
1088        percentiles : np.ndarray
1089            Array of percentile values used to define the ranges.
1090
1091        percentiles_loop : list of tuple
1092            List of consecutive percentile ranges, e.g., [(0, 1), (1, 2), ..., (99, 100)].
1093
1094        Returns
1095        -------
1096        data : dict
1097            Dictionary containing the following keys:
1098            - 'n' : list
1099                Number of elements in each percentile range.
1100
1101            - 'n_standarized' : list
1102                Proportion of elements in each percentile range relative to the total number of elements.
1103
1104            - 'avg' : list
1105                Mean value of elements within each percentile range.
1106
1107            - 'median' : list
1108                Median value of elements within each percentile range.
1109
1110            - 'std' : list
1111                Standard deviation of elements within each percentile range.
1112
1113            - 'var' : list
1114                Variance of elements within each percentile range.
1115
1116        Notes
1117        -----
1118        - If a percentile range contains no elements, statistics are set to 0 and count is set to 1 to avoid empty lists.
1119        """
1120
1121        per_vector = values.copy()
1122
1123        data = {
1124            "n": [],
1125            "n_standarized": [],
1126            "avg": [],
1127            "median": [],
1128            "std": [],
1129            "var": [],
1130        }
1131
1132        amount = len(per_vector)
1133
1134        for x in percentiles_loop:
1135            if (
1136                len(
1137                    per_vector[
1138                        (per_vector > percentiles[x[0]])
1139                        & (per_vector <= percentiles[x[1]])
1140                    ]
1141                )
1142                > 0
1143            ):
1144                data["n"].append(
1145                    len(
1146                        per_vector[
1147                            (per_vector > percentiles[x[0]])
1148                            & (per_vector <= percentiles[x[1]])
1149                        ]
1150                    )
1151                )
1152                data["n_standarized"].append(
1153                    len(
1154                        per_vector[
1155                            (per_vector > percentiles[x[0]])
1156                            & (per_vector <= percentiles[x[1]])
1157                        ]
1158                    )
1159                    / amount
1160                )
1161                data["avg"].append(
1162                    np.mean(
1163                        per_vector[
1164                            (per_vector > percentiles[x[0]])
1165                            & (per_vector <= percentiles[x[1]])
1166                        ]
1167                    )
1168                )
1169                data["std"].append(
1170                    np.std(
1171                        per_vector[
1172                            (per_vector > percentiles[x[0]])
1173                            & (per_vector <= percentiles[x[1]])
1174                        ]
1175                    )
1176                )
1177                data["median"].append(
1178                    np.median(
1179                        per_vector[
1180                            (per_vector > percentiles[x[0]])
1181                            & (per_vector <= percentiles[x[1]])
1182                        ]
1183                    )
1184                )
1185                data["var"].append(
1186                    np.var(
1187                        per_vector[
1188                            (per_vector > percentiles[x[0]])
1189                            & (per_vector <= percentiles[x[1]])
1190                        ]
1191                    )
1192                )
1193            else:
1194                data["n"].append(1)
1195                data["n_standarized"].append(0)
1196                data["avg"].append(0)
1197                data["std"].append(0)
1198                data["median"].append(0)
1199                data["var"].append(0)
1200
1201        return data
1202
1203    def df_to_percentiles(
1204        self,
1205        data: pd.DataFrame,
1206        group_col: str,
1207        values_col: str,
1208        sep_perc: int = 1,
1209        drop_outlires: bool = True,
1210    ):
1211        """
1212        Calculate summary statistics based on percentile ranges for each group in a DataFrame.
1213
1214        This method groups the input DataFrame by `group_col`, computes percentile ranges for each group's values
1215        in `values_col`, and aggregates statistics (count, proportion, mean, median, standard deviation, variance)
1216        for each percentile range. Optionally, upper outliers can be removed before calculation.
1217
1218        Parameters
1219        ----------
1220        data : pd.DataFrame
1221            Input DataFrame containing the data.
1222
1223        group_col : str
1224            Column name used to define groups.
1225
1226        values_col : str
1227            Column name containing the values for percentile calculations.
1228
1229        sep_perc : int, optional
1230            Separation interval for percentiles (default is 1, meaning percentiles are calculated at every 1%).
1231
1232        drop_outlires : bool, optional
1233            If True, removes upper outliers from the data before performing calculations (default is True).
1234
1235        Returns
1236        -------
1237        full_data : dict
1238            Dictionary where each key is a group name and each value is a dictionary containing:
1239            - 'n' : list
1240                Number of elements in each percentile range.
1241
1242            - 'n_standarized' : list
1243                Proportion of elements in each percentile range relative to the total number of elements.
1244
1245            - 'avg' : list
1246                Mean value of elements within each percentile range.
1247
1248            - 'median' : list
1249                Median value of elements within each percentile range.
1250
1251            - 'std' : list
1252                Standard deviation of elements within each percentile range.
1253
1254            - 'var' : list
1255                Variance of elements within each percentile range.
1256
1257        Notes
1258        -----
1259        - Outlier removal uses the IQR method within each group if `drop_outlires` is True.
1260        """
1261
1262        full_data = {}
1263
1264        if drop_outlires == True:
1265            data = self.drop_up_df(
1266                data=data, group_col=group_col, values_col=values_col
1267            )
1268
1269        groups = set(data[group_col])
1270
1271        percentiles, percentiles_loop = self.percentiles_calculation(
1272            data[values_col], sep_perc=sep_perc
1273        )
1274
1275        for g in groups:
1276
1277            print(f"Group: {g} ...")
1278
1279            tmp_values = data[values_col][data[group_col] == g]
1280
1281            per_dat = self.to_percentil(tmp_values, percentiles, percentiles_loop)
1282
1283            full_data[g] = per_dat
1284
1285        return full_data
1286
1287    def round_to_scientific_notation(self, num):
1288        """
1289        Round a number to scientific notation if very small, otherwise to one decimal place.
1290
1291        Parameters
1292        ----------
1293        num : float
1294            The number to round.
1295
1296        Returns
1297        -------
1298        str
1299            The rounded number as a string.
1300            - If `num` is 0, returns "0.0".
1301            - If `abs(num) < 1e-4`, returns scientific notation with 1 decimal and 1-digit exponent.
1302            - Otherwise, returns the number rounded to one decimal place.
1303        """
1304
1305        if num == 0:
1306            return "0.0"
1307
1308        if abs(num) < 0.0001:
1309            rounded_num = np.format_float_scientific(num, precision=1, exp_digits=1)
1310            return rounded_num
1311        else:
1312            return f"{num:.1f}"
1313
1314    def aov_percentiles(self, data, testes_col, comb: str = "*"):
1315        """
1316        Perform Welch's ANOVA on percentile-based group data.
1317
1318        This method calculates group values by combining the columns specified in `testes_col`
1319        according to the operation defined in `comb`, then performs Welch's ANOVA to test for
1320        differences in means between the groups. Welch's ANOVA is suitable when the groups
1321        have unequal variances.
1322
1323        Parameters
1324        ----------
1325        data : dict of pd.DataFrame
1326            Dictionary where keys are group names and values are DataFrames containing the data.
1327
1328        testes_col : str or list of str
1329            Column name(s) from which the group values are derived. If a list is provided, columns
1330            will be combined based on the `comb` operation.
1331
1332        comb : str, optional
1333            Operation used to combine multiple columns if `testes_col` is a list. Options include:
1334                '*' : multiplication
1335                '+' : addition
1336                '**': exponentiation
1337                '-' : subtraction
1338                '/' : division
1339            Default is '*'.
1340
1341        Returns
1342        -------
1343        F : float
1344            F-statistic from Welch's ANOVA.
1345
1346        p_val : float
1347            Uncorrected p-value from Welch's ANOVA, testing for significant differences between groups.
1348
1349        Notes
1350        -----
1351        - If `testes_col` is a single string, no combination is performed, and the group values
1352          are taken directly from that column.
1353        - Welch's ANOVA is used as it accounts for unequal variances between groups.
1354        - The `df.melt()` method is used to reshape the data, allowing the ANOVA to be applied to all groups.
1355
1356        Examples
1357        --------
1358        >>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+')
1359        >>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}")
1360        """
1361
1362        groups = []
1363
1364        for d in data.keys():
1365
1366            if isinstance(testes_col, str):
1367                g = data[d][testes_col]
1368            elif isinstance(testes_col, list):
1369                g = [1] * len(data[d][testes_col[0]])
1370                for t in testes_col:
1371                    if comb == "*":
1372                        g = [a * b for a, b in zip(g, data[d][t])]
1373                    elif comb == "+":
1374                        g = [a + b for a, b in zip(g, data[d][t])]
1375                    elif comb == "**":
1376                        g = [a**b for a, b in zip(g, data[d][t])]
1377                    elif comb == "-":
1378                        g = [a - b for a, b in zip(g, data[d][t])]
1379                    elif comb == "/":
1380                        g = [a / b for a, b in zip(g, data[d][t])]
1381
1382            groups.append(g)
1383
1384        df = pd.DataFrame({f"group_{i}": group for i, group in enumerate(groups)})
1385
1386        df_melted = df.melt(var_name="group", value_name="value")
1387
1388        welch_results = pg.welch_anova(data=df_melted, dv="value", between="group")
1389
1390        return welch_results["F"].values[0], welch_results["p-unc"].values[0]
1391
1392    def post_aov_percentiles(self, data, testes_col, comb: str = "*"):
1393        """
1394        Perform Welch's ANOVA on percentile-based group data and pairwise Welch's t-tests.
1395
1396        This method first performs Welch's ANOVA to assess differences in group means, and
1397        then conducts pairwise Welch's t-tests between all group combinations. P-values are
1398        adjusted using the Bonferroni correction for multiple comparisons.
1399
1400        Parameters
1401        ----------
1402        data : dict of pd.DataFrame
1403            Dictionary where keys are group names and values are DataFrames containing the data.
1404
1405        testes_col : str or list of str
1406            Column name(s) from which the group values are derived. If a list is provided,
1407            columns will be combined according to the `comb` operation.
1408
1409        comb : str, optional
1410            Operation used to combine multiple columns if `testes_col` is a list. Options include:
1411                '*' : multiplication
1412                '+' : addition
1413                '**': exponentiation
1414                '-' : subtraction
1415                '/' : division
1416            Default is '*'.
1417
1418        Returns
1419        -------
1420        p_val : float
1421            Uncorrected p-value from the Welch's ANOVA.
1422
1423        final_results : dict
1424            Dictionary containing results of pairwise Welch's t-tests with keys:
1425                'group1' : list of first group names in each comparison
1426                'group2' : list of second group names in each comparison
1427                'stat' : list of t-statistics for each comparison
1428                'p_val' : list of uncorrected p-values for each comparison
1429                'adj_p_val' : list of Bonferroni-adjusted p-values for multiple comparisons
1430        """
1431
1432        p_val = self.aov_percentiles(data=data, testes_col=testes_col, comb=comb)[1]
1433
1434        pairs = list(combinations(data, 2))
1435        final_results = {
1436            "group1": [],
1437            "group2": [],
1438            "stat": [],
1439            "p_val": [],
1440            "adj_p_val": [],
1441        }
1442
1443        for group1, group2 in pairs:
1444            if isinstance(testes_col, str):
1445                g1 = data[group1][testes_col]
1446            elif isinstance(testes_col, list):
1447                g1 = [1] * len(data[group1][testes_col[0]])
1448                for t in testes_col:
1449                    if comb == "*":
1450                        g1 = [a * b for a, b in zip(g1, data[group1][t])]
1451                    elif comb == "+":
1452                        g1 = [a + b for a, b in zip(g1, data[group1][t])]
1453                    elif comb == "**":
1454                        g1 = [a**b for a, b in zip(g1, data[group1][t])]
1455                    elif comb == "-":
1456                        g1 = [a - b for a, b in zip(g1, data[group1][t])]
1457                    elif comb == "/":
1458                        g1 = [a / b for a, b in zip(g1, data[group1][t])]
1459
1460            if isinstance(testes_col, str):
1461                g2 = data[group2][testes_col]
1462            elif isinstance(testes_col, list):
1463                g2 = [1] * len(data[group2][testes_col[0]])
1464                for t in testes_col:
1465                    if comb == "*":
1466                        g2 = [a * b for a, b in zip(g2, data[group2][t])]
1467                    elif comb == "+":
1468                        g2 = [a + b for a, b in zip(g2, data[group2][t])]
1469                    elif comb == "**":
1470                        g2 = [a**b for a, b in zip(g2, data[group2][t])]
1471                    elif comb == "-":
1472                        g2 = [a - b for a, b in zip(g2, data[group2][t])]
1473                    elif comb == "/":
1474                        g2 = [a / b for a, b in zip(g2, data[group2][t])]
1475
1476            stat, p_val = stats.ttest_ind(
1477                g1, g2, alternative="two-sided", equal_var=False
1478            )
1479            g = sorted([group1, group2])
1480            final_results["group1"].append(g[0])
1481            final_results["group2"].append(g[1])
1482            final_results["stat"].append(stat)
1483            final_results["p_val"].append(p_val)
1484            adj = p_val * len(pairs)
1485            if adj > 1:
1486                final_results["adj_p_val"].append(1)
1487            else:
1488                final_results["adj_p_val"].append(adj)
1489
1490        return p_val, final_results
1491
1492    def chi2_percentiles(self, input_hist):
1493        """
1494        Perform a Chi-squared test on percentile-based group data.
1495
1496        This method reformats the input histogram data into a contingency table and performs
1497        a Chi-squared test to evaluate whether there is a significant association between groups.
1498
1499        Parameters
1500        ----------
1501        input_hist : dict of pd.DataFrame
1502            Dictionary where keys are group names and values are DataFrames containing histogram data.
1503            The DataFrame must include a column 'n' representing counts for each percentile/bin.
1504
1505        Returns
1506        -------
1507        chi2_statistic : float
1508            Chi-squared test statistic.
1509
1510        p_value : float
1511            P-value from the Chi-squared test.
1512
1513        dof : int
1514            Degrees of freedom for the test.
1515
1516        expected : np.ndarray
1517            Expected frequencies for each group/bin under the null hypothesis.
1518
1519        chi_data : dict
1520            Formatted data used in the Chi-squared test, with group names as keys and bin counts as values.
1521
1522        Example
1523        -------
1524        chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist)
1525        print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}")
1526        """
1527
1528        chi_data = {}
1529
1530        for d in input_hist.keys():
1531            tmp_dic = {}
1532
1533            for n, c in enumerate(input_hist[d]["n"]):
1534                tmp_dic[f"p{n+1}"] = c
1535
1536            chi_data[d] = tmp_dic
1537
1538        chi2_statistic, p_value, dof, expected = chi2_contingency(
1539            pd.DataFrame(chi_data).T, correction=True
1540        )
1541
1542        return chi2_statistic, p_value, dof, expected, chi_data
1543
1544    def post_ch2_percentiles(self, input_hist):
1545        """
1546        Perform a Chi-squared test on percentile-based group data, including pairwise comparisons.
1547
1548        This method first performs a Chi-squared test across all groups to check for a significant association.
1549        It then performs pairwise Chi-squared tests between groups to identify specific differences.
1550        P-values for multiple comparisons are adjusted using the Bonferroni correction.
1551
1552        Parameters
1553        ----------
1554        input_hist : dict of pd.DataFrame
1555            Dictionary where keys are group names and values are DataFrames containing histogram data.
1556            Each DataFrame must include a column 'n' with counts for each percentile/bin.
1557
1558        Returns
1559        -------
1560        p_val : float
1561            Overall p-value from the initial Chi-squared test across all groups.
1562
1563        final_results : dict
1564            Results of pairwise Chi-squared tests, with keys:
1565                - 'group1' (list): Name of the first group in each comparison
1566                - 'group2' (list): Name of the second group in each comparison
1567                - 'chi2' (list): Chi-squared statistic for each pairwise comparison
1568                - 'p_val' (list): P-value for each pairwise comparison
1569                - 'adj_p_val' (list): Adjusted p-value (Bonferroni correction) for multiple comparisons
1570
1571        Example
1572        -------
1573        p_val, final_results = self.post_ch2_percentiles(input_hist)
1574        print(f"Overall Chi-squared p-value: {p_val}")
1575        for i in range(len(final_results['group1'])):
1576            print(f"Comparison: {final_results['group1'][i]} vs {final_results['group2'][i]}")
1577            print(f"Chi2 stat: {final_results['chi2'][i]}, p-value: {final_results['p_val'][i]}, adj. p-value: {final_results['adj_p_val'][i]}")
1578        """
1579
1580        res = self.chi2_percentiles(input_hist)
1581
1582        pairs = list(combinations(res[4], 2))
1583        results = []
1584
1585        for group1, group2 in pairs:
1586            table_pair = pd.DataFrame(res[4])[[group1, group2]]
1587            chi2_stat, p_val, _, _ = chi2_contingency(table_pair, correction=True)
1588            results.append((group1, group2, chi2_stat, p_val))
1589
1590        final_results = {
1591            "group1": [],
1592            "group2": [],
1593            "chi2": [],
1594            "p_val": [],
1595            "adj_p_val": [],
1596        }
1597
1598        for group1, group2, chi2_stat, p_val in results:
1599            g = sorted([group1, group2])
1600            final_results["group1"].append(g[0])
1601            final_results["group2"].append(g[1])
1602            final_results["chi2"].append(chi2_stat)
1603            final_results["p_val"].append(p_val)
1604            adj = p_val * len(results)
1605            if adj > 1:
1606                final_results["adj_p_val"].append(1)
1607            else:
1608                final_results["adj_p_val"].append(adj)
1609
1610        return res[1], final_results
1611
1612    def hist_compare_plot(
1613        self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20
1614    ):
1615        """
1616        Generate comparative histograms and display results of statistical tests (ANOVA and Chi-squared).
1617
1618        This method performs transformations on the input data, generates comparative histograms
1619        for each group, and annotates statistical test results, including Welch's ANOVA and Chi-squared tests.
1620        Multiple comparison corrections can be applied using the Bonferroni method.
1621
1622        Parameters
1623        ----------
1624        data : dict of pd.DataFrame
1625            Dictionary where keys are group names and values are DataFrames containing histogram data.
1626            Each DataFrame should include the column specified by `tested_value`.
1627
1628        queue : list of str
1629            Defines the order of groups to be plotted.
1630
1631        tested_value : str
1632            The column name in `data` representing the variable to test and visualize.
1633
1634        p_adj : bool, optional
1635            If True, applies Bonferroni correction for multiple comparisons (default is True).
1636
1637        txt_size : int, optional
1638            Font size for text annotations in the plot (default is 20).
1639
1640        Returns
1641        -------
1642        fig : matplotlib.figure.Figure
1643            Matplotlib figure object containing the generated histograms and statistical test results.
1644
1645        Example
1646        -------
1647        fig = self.hist_compare_plot(
1648            data,
1649            queue=['group1', 'group2', 'group3'],
1650            tested_value='n',
1651            p_adj=True,
1652            txt_size=18
1653        )
1654        plt.show()
1655        """
1656
1657        for i in data.keys():
1658            values = np.array(data[i][tested_value])
1659            values += 1
1660            transformed_values, fitted_lambda = stats.boxcox(values)
1661            data[i][tested_value] = transformed_values.tolist()
1662
1663        if sorted(queue) != sorted(data.keys()):
1664            print(
1665                "\n Wrong queue provided! The queue will be sorted with default settings!"
1666            )
1667            queue = sorted(data.keys())
1668
1669        # parametric selected value
1670        pk, dfk = self.post_aov_percentiles(data, testes_col=tested_value)
1671
1672        dfk = pd.DataFrame(dfk)
1673
1674        dfk = dfk.sort_values(
1675            by=["group1", "group2"],
1676            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
1677        ).reset_index(drop=True)
1678
1679        # parametric standarized selected value
1680        pkc, dfkc = self.post_aov_percentiles(
1681            data, testes_col=[tested_value, "n_standarized"], comb="*"
1682        )
1683
1684        dfkc = pd.DataFrame(dfkc)
1685
1686        dfkc = dfkc.sort_values(
1687            by=["group1", "group2"],
1688            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
1689        ).reset_index(drop=True)
1690
1691        # chi2
1692        pchi, dfchi = self.post_ch2_percentiles(data)
1693
1694        dfchi = pd.DataFrame(dfchi)
1695
1696        dfchi = dfchi.sort_values(
1697            by=["group1", "group2"],
1698            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
1699        ).reset_index(drop=True)
1700
1701        ##############################################################################
1702
1703        standarized_max, standarized_min, value_max, value_min = [], [], [], []
1704        for d in queue:
1705            standarized_max.append(max(data[d]["n_standarized"]))
1706            standarized_min.append(min(data[d]["n_standarized"]))
1707            value_max.append(max(data[d][tested_value]))
1708            value_min.append(min(data[d][tested_value]))
1709
1710        num_columns = len(queue) + 1
1711
1712        fig, axs = plt.subplots(
1713            3,
1714            num_columns,
1715            figsize=(8 * num_columns, 10),
1716            gridspec_kw={"width_ratios": [1] * len(queue) + [0.5], "wspace": 0.05},
1717        )
1718
1719        for i, d in enumerate(queue):
1720            tmp_data = data[d]
1721
1722            axs[0, i].bar(
1723                [str(n) for n in range(len(tmp_data["n_standarized"]))],
1724                tmp_data["n_standarized"],
1725                width=0.95,
1726                color="gold",
1727            )
1728            axs[0, i].set_ylim(
1729                min(standarized_min) * 0.9995, max(standarized_max) * 1.0005
1730            )
1731
1732            if i == 0:
1733                axs[0, i].set_ylabel("Standarized\nfrequency", fontsize=txt_size)
1734            else:
1735                axs[0, i].set_yticks([])
1736
1737            axs[0, i].set_xticks([])
1738            axs[0, i].tick_params(axis="y", labelsize=txt_size * 0.7)
1739
1740            axs[1, i].bar(
1741                [str(n) for n in range(len(tmp_data[tested_value]))],
1742                tmp_data[tested_value],
1743                width=0.95,
1744                color="orange",
1745            )
1746
1747            mean_value = np.mean(tmp_data[tested_value])
1748            axs[1, i].axhline(y=mean_value, color="red", linestyle="--")
1749
1750            # axs[1, i].set_ylim(min(value_min) - 1, max(value_max) + 1)
1751            axs[1, i].set_ylim(min(value_min) * 0.9995, max(value_max) * 1.0005)
1752
1753            if i == 0:
1754                axs[1, i].set_ylabel(f"Normalized\n{tested_value}", fontsize=txt_size)
1755            else:
1756                axs[1, i].set_yticks([])
1757
1758            axs[1, i].set_xticks([])
1759            axs[1, i].tick_params(axis="y", labelsize=txt_size * 0.7)
1760
1761            axs[2, i].bar(
1762                [str(n) for n in range(len(tmp_data["n_standarized"]))],
1763                [
1764                    a * b
1765                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
1766                ],
1767                width=0.95,
1768                color="goldenrod",
1769            )
1770
1771            mean_value = np.mean(
1772                [
1773                    a * b
1774                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
1775                ]
1776            )
1777            axs[2, i].axhline(y=mean_value, color="red", linestyle="--")
1778
1779            axs[2, i].set_ylim(
1780                (min(standarized_min) * min(value_min)) * 0.9995,
1781                (max(standarized_max) * max(value_max) * 1.0005),
1782            )
1783            axs[2, i].set_xlabel(d, fontsize=txt_size)
1784
1785            if i == 0:
1786                axs[2, i].set_ylabel(
1787                    f"Standarized\nnorm_{tested_value}", fontsize=txt_size
1788                )
1789            else:
1790                axs[2, i].set_yticks([])
1791
1792            axs[2, i].set_xticks([])
1793            axs[2, i].tick_params(axis="y", labelsize=txt_size * 0.7)
1794
1795        sign = "ns"
1796        if float(self.round_to_scientific_notation(pk)) < 0.001:
1797            sign = "***"
1798        elif float(self.round_to_scientific_notation(pk)) < 0.01:
1799            sign = "**"
1800        elif float(self.round_to_scientific_notation(pk)) < 0.05:
1801            sign = "*"
1802
1803        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pk)} - {sign}\n"
1804
1805        if p_adj == True:
1806            for i in range(len(dfk["group1"])):
1807                sign = "ns"
1808                if dfk["adj_p_val"][i] < 0.001:
1809                    sign = "***"
1810                elif dfk["adj_p_val"][i] < 0.01:
1811                    sign = "**"
1812                elif dfk["adj_p_val"][i] < 0.05:
1813                    sign = "*"
1814
1815                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['adj_p_val'][i])} - {sign}\n"
1816        else:
1817            for i in range(len(dfk["group1"])):
1818                sign = "ns"
1819                if dfk["p_val"][i] < 0.001:
1820                    sign = "***"
1821                elif dfk["p_val"][i] < 0.01:
1822                    sign = "**"
1823                elif dfk["p_val"][i] < 0.05:
1824                    sign = "*"
1825
1826                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['p_val'][i])} - {sign}\n"
1827
1828        axs[1, -1].text(
1829            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
1830        )
1831        axs[1, -1].set_axis_off()
1832
1833        sign = "ns"
1834        if float(self.round_to_scientific_notation(pkc)) < 0.001:
1835            sign = "***"
1836        elif float(self.round_to_scientific_notation(pkc)) < 0.01:
1837            sign = "**"
1838        elif float(self.round_to_scientific_notation(pkc)) < 0.05:
1839            sign = "*"
1840
1841        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pkc)} - {sign}\n"
1842
1843        if p_adj == True:
1844            for i in range(len(dfkc["group1"])):
1845                sign = "ns"
1846                if dfkc["adj_p_val"][i] < 0.001:
1847                    sign = "***"
1848                elif dfkc["adj_p_val"][i] < 0.01:
1849                    sign = "**"
1850                elif dfkc["adj_p_val"][i] < 0.05:
1851                    sign = "*"
1852
1853                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['adj_p_val'][i])} - {sign}\n"
1854        else:
1855            for i in range(len(dfkc["group1"])):
1856                sign = "ns"
1857                if dfkc["p_val"][i] < 0.001:
1858                    sign = "***"
1859                elif dfkc["p_val"][i] < 0.01:
1860                    sign = "**"
1861                elif dfkc["p_val"][i] < 0.05:
1862                    sign = "*"
1863
1864                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['p_val'][i])} - {sign}\n"
1865
1866        axs[2, -1].text(
1867            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
1868        )
1869        axs[2, -1].set_axis_off()
1870
1871        sign = "ns"
1872        if float(self.round_to_scientific_notation(pchi)) < 0.001:
1873            sign = "***"
1874        elif float(self.round_to_scientific_notation(pchi)) < 0.01:
1875            sign = "**"
1876        elif float(self.round_to_scientific_notation(pchi)) < 0.05:
1877            sign = "*"
1878
1879        text = f"Test Chi-squared\np-value: {self.round_to_scientific_notation(pchi)} - {sign}\n"
1880
1881        if p_adj == True:
1882            for i in range(len(dfchi["group1"])):
1883                sign = "ns"
1884                if dfchi["adj_p_val"][i] < 0.001:
1885                    sign = "***"
1886                elif dfchi["adj_p_val"][i] < 0.01:
1887                    sign = "**"
1888                elif dfchi["adj_p_val"][i] < 0.05:
1889                    sign = "*"
1890
1891                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['adj_p_val'][i])} - {sign}\n"
1892        else:
1893            for i in range(len(dfchi["group1"])):
1894                sign = "ns"
1895                if dfchi["p_val"][i] < 0.001:
1896                    sign = "***"
1897                elif dfchi["p_val"][i] < 0.01:
1898                    sign = "**"
1899                elif dfchi["p_val"][i] < 0.05:
1900                    sign = "*"
1901
1902                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['p_val'][i])} - {sign}\n"
1903
1904        axs[0, -1].text(
1905            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
1906        )
1907        axs[0, -1].set_axis_off()
1908
1909        plt.tight_layout()
1910
1911        if cfg._DISPLAY_MODE:
1912            plt.show()
1913
1914        return fig

Class for performing percentile-based statistical analysis on grouped data.

This class provides methods to calculate percentiles, remove outliers, aggregate data into percentile bins, perform Welch's ANOVA and Chi-squared tests, and visualize results via comparative histograms. It is designed to handle both single-column and multi-column combinations of values for group-based analysis.

Methods

drop_up_df(data, group_col, values_col) Removes upper outliers from a DataFrame based on a grouping column.

percentiles_calculation(values, sep_perc=1) Calculates percentiles and creates loopable percentile ranges.

to_percentil(values, percentiles, percentiles_loop) Aggregates statistics based on percentile ranges.

df_to_percentiles(data, group_col, values_col, sep_perc=1, drop_outlires=True) Computes percentile statistics for grouped DataFrame data.

round_to_scientific_notation(num) Formats a number in scientific notation or standard format.

aov_percentiles(data, testes_col, comb="*") Performs Welch's ANOVA on percentile-based group data.

post_aov_percentiles(data, testes_col, comb="*") Performs Welch's ANOVA with pairwise t-tests.

chi2_percentiles(input_hist) Performs Chi-squared test on percentile histogram data.

post_ch2_percentiles(input_hist) Performs Chi-squared test with pairwise comparisons.

hist_compare_plot(data, queue, tested_value, p_adj=True, txt_size=20) Generates comparative histograms with statistical test results.

def drop_up_df( self, data: pandas.core.frame.DataFrame, group_col: str, values_col: str):
 997    def drop_up_df(self, data: pd.DataFrame, group_col: str, values_col: str):
 998        """
 999        Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column.
1000
1001        Outliers are calculated and removed separately for each group defined by `group_col`.
1002        The upper outliers are defined using the interquartile range (IQR) method:
1003            values greater than Q3 + 1.5 * IQR are considered outliers.
1004
1005        Parameters
1006        ----------
1007        data : pd.DataFrame
1008            The input DataFrame containing the data.
1009
1010        group_col : str
1011            The name of the column used for grouping the data.
1012
1013        values_col : str
1014            The column containing the values from which upper outliers will be removed.
1015
1016        Returns
1017        -------
1018        filtered_data : pd.DataFrame
1019            A filtered DataFrame with the upper outliers removed for each group.
1020
1021        Notes
1022        -----
1023        - Outliers are removed separately within each group.
1024        - The original DataFrame is not modified; a new filtered DataFrame is returned.
1025        """
1026
1027        def iqr_filter(group):
1028            q75 = np.quantile(group[values_col], 0.75)
1029            q25 = np.quantile(group[values_col], 0.25)
1030            itq = q75 - q25
1031            return group[group[values_col] <= (q75 + 1.5 * itq)]
1032
1033        filtered_data = data.groupby(group_col).apply(iqr_filter).reset_index(drop=True)
1034
1035        return filtered_data

Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column.

Outliers are calculated and removed separately for each group defined by group_col. The upper outliers are defined using the interquartile range (IQR) method: values greater than Q3 + 1.5 * IQR are considered outliers.

Parameters

data : pd.DataFrame The input DataFrame containing the data.

group_col : str The name of the column used for grouping the data.

values_col : str The column containing the values from which upper outliers will be removed.

Returns

filtered_data : pd.DataFrame A filtered DataFrame with the upper outliers removed for each group.

Notes

  • Outliers are removed separately within each group.
  • The original DataFrame is not modified; a new filtered DataFrame is returned.
def percentiles_calculation(self, values, sep_perc: int = 1):
1037    def percentiles_calculation(self, values, sep_perc: int = 1):
1038        """
1039        Calculate percentiles for a set of values and generate consecutive percentile ranges.
1040
1041        This function computes percentiles from 0 to 100 at intervals defined by `sep_perc`.
1042        It also generates a list of consecutive percentile ranges that can be used for further analysis or binning.
1043
1044        Parameters
1045        ----------
1046        values : array-like
1047            The input data values for which the percentiles are calculated.
1048
1049        sep_perc : int, optional
1050            Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%).
1051
1052        Returns
1053        -------
1054        percentiles : np.ndarray
1055            Array of calculated percentile values.
1056
1057        percentiles_loop : list of tuple
1058            List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)].
1059
1060        Notes
1061        -----
1062        - The first percentile is set to 0 to avoid issues with zero values.
1063        - `percentiles_loop` is useful for iterating through percentile ranges when aggregating statistics.
1064        """
1065
1066        per_vector = values.copy()
1067
1068        percentiles = np.percentile(per_vector, np.arange(0, 101, sep_perc))
1069        percentiles[0] = 0
1070
1071        percentiles_loop = [(i, i + 1) for i in range(int(100 / sep_perc))]
1072
1073        return percentiles, percentiles_loop

Calculate percentiles for a set of values and generate consecutive percentile ranges.

This function computes percentiles from 0 to 100 at intervals defined by sep_perc. It also generates a list of consecutive percentile ranges that can be used for further analysis or binning.

Parameters

values : array-like The input data values for which the percentiles are calculated.

sep_perc : int, optional Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%).

Returns

percentiles : np.ndarray Array of calculated percentile values.

percentiles_loop : list of tuple List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)].

Notes

  • The first percentile is set to 0 to avoid issues with zero values.
  • percentiles_loop is useful for iterating through percentile ranges when aggregating statistics.
def to_percentil(self, values, percentiles, percentiles_loop):
1075    def to_percentil(self, values, percentiles, percentiles_loop):
1076        """
1077        Aggregate statistics for a set of values based on percentile ranges.
1078
1079        This function calculates summary statistics for each percentile range defined in `percentiles_loop`,
1080        using the percentile values calculated by `percentiles_calculation()`. Statistics include count, proportion,
1081        mean, median, standard deviation, and variance.
1082
1083        Parameters
1084        ----------
1085        values : array-like
1086            Input data values for which the statistics are calculated.
1087
1088        percentiles : np.ndarray
1089            Array of percentile values used to define the ranges.
1090
1091        percentiles_loop : list of tuple
1092            List of consecutive percentile ranges, e.g., [(0, 1), (1, 2), ..., (99, 100)].
1093
1094        Returns
1095        -------
1096        data : dict
1097            Dictionary containing the following keys:
1098            - 'n' : list
1099                Number of elements in each percentile range.
1100
1101            - 'n_standarized' : list
1102                Proportion of elements in each percentile range relative to the total number of elements.
1103
1104            - 'avg' : list
1105                Mean value of elements within each percentile range.
1106
1107            - 'median' : list
1108                Median value of elements within each percentile range.
1109
1110            - 'std' : list
1111                Standard deviation of elements within each percentile range.
1112
1113            - 'var' : list
1114                Variance of elements within each percentile range.
1115
1116        Notes
1117        -----
1118        - If a percentile range contains no elements, statistics are set to 0 and count is set to 1 to avoid empty lists.
1119        """
1120
1121        per_vector = values.copy()
1122
1123        data = {
1124            "n": [],
1125            "n_standarized": [],
1126            "avg": [],
1127            "median": [],
1128            "std": [],
1129            "var": [],
1130        }
1131
1132        amount = len(per_vector)
1133
1134        for x in percentiles_loop:
1135            if (
1136                len(
1137                    per_vector[
1138                        (per_vector > percentiles[x[0]])
1139                        & (per_vector <= percentiles[x[1]])
1140                    ]
1141                )
1142                > 0
1143            ):
1144                data["n"].append(
1145                    len(
1146                        per_vector[
1147                            (per_vector > percentiles[x[0]])
1148                            & (per_vector <= percentiles[x[1]])
1149                        ]
1150                    )
1151                )
1152                data["n_standarized"].append(
1153                    len(
1154                        per_vector[
1155                            (per_vector > percentiles[x[0]])
1156                            & (per_vector <= percentiles[x[1]])
1157                        ]
1158                    )
1159                    / amount
1160                )
1161                data["avg"].append(
1162                    np.mean(
1163                        per_vector[
1164                            (per_vector > percentiles[x[0]])
1165                            & (per_vector <= percentiles[x[1]])
1166                        ]
1167                    )
1168                )
1169                data["std"].append(
1170                    np.std(
1171                        per_vector[
1172                            (per_vector > percentiles[x[0]])
1173                            & (per_vector <= percentiles[x[1]])
1174                        ]
1175                    )
1176                )
1177                data["median"].append(
1178                    np.median(
1179                        per_vector[
1180                            (per_vector > percentiles[x[0]])
1181                            & (per_vector <= percentiles[x[1]])
1182                        ]
1183                    )
1184                )
1185                data["var"].append(
1186                    np.var(
1187                        per_vector[
1188                            (per_vector > percentiles[x[0]])
1189                            & (per_vector <= percentiles[x[1]])
1190                        ]
1191                    )
1192                )
1193            else:
1194                data["n"].append(1)
1195                data["n_standarized"].append(0)
1196                data["avg"].append(0)
1197                data["std"].append(0)
1198                data["median"].append(0)
1199                data["var"].append(0)
1200
1201        return data

Aggregate statistics for a set of values based on percentile ranges.

This function calculates summary statistics for each percentile range defined in percentiles_loop, using the percentile values calculated by percentiles_calculation(). Statistics include count, proportion, mean, median, standard deviation, and variance.

Parameters

values : array-like Input data values for which the statistics are calculated.

percentiles : np.ndarray Array of percentile values used to define the ranges.

percentiles_loop : list of tuple List of consecutive percentile ranges, e.g., [(0, 1), (1, 2), ..., (99, 100)].

Returns

data : dict Dictionary containing the following keys: - 'n' : list Number of elements in each percentile range.

- 'n_standarized' : list
    Proportion of elements in each percentile range relative to the total number of elements.

- 'avg' : list
    Mean value of elements within each percentile range.

- 'median' : list
    Median value of elements within each percentile range.

- 'std' : list
    Standard deviation of elements within each percentile range.

- 'var' : list
    Variance of elements within each percentile range.

Notes

  • If a percentile range contains no elements, statistics are set to 0 and count is set to 1 to avoid empty lists.
def df_to_percentiles( self, data: pandas.core.frame.DataFrame, group_col: str, values_col: str, sep_perc: int = 1, drop_outlires: bool = True):
1203    def df_to_percentiles(
1204        self,
1205        data: pd.DataFrame,
1206        group_col: str,
1207        values_col: str,
1208        sep_perc: int = 1,
1209        drop_outlires: bool = True,
1210    ):
1211        """
1212        Calculate summary statistics based on percentile ranges for each group in a DataFrame.
1213
1214        This method groups the input DataFrame by `group_col`, computes percentile ranges for each group's values
1215        in `values_col`, and aggregates statistics (count, proportion, mean, median, standard deviation, variance)
1216        for each percentile range. Optionally, upper outliers can be removed before calculation.
1217
1218        Parameters
1219        ----------
1220        data : pd.DataFrame
1221            Input DataFrame containing the data.
1222
1223        group_col : str
1224            Column name used to define groups.
1225
1226        values_col : str
1227            Column name containing the values for percentile calculations.
1228
1229        sep_perc : int, optional
1230            Separation interval for percentiles (default is 1, meaning percentiles are calculated at every 1%).
1231
1232        drop_outlires : bool, optional
1233            If True, removes upper outliers from the data before performing calculations (default is True).
1234
1235        Returns
1236        -------
1237        full_data : dict
1238            Dictionary where each key is a group name and each value is a dictionary containing:
1239            - 'n' : list
1240                Number of elements in each percentile range.
1241
1242            - 'n_standarized' : list
1243                Proportion of elements in each percentile range relative to the total number of elements.
1244
1245            - 'avg' : list
1246                Mean value of elements within each percentile range.
1247
1248            - 'median' : list
1249                Median value of elements within each percentile range.
1250
1251            - 'std' : list
1252                Standard deviation of elements within each percentile range.
1253
1254            - 'var' : list
1255                Variance of elements within each percentile range.
1256
1257        Notes
1258        -----
1259        - Outlier removal uses the IQR method within each group if `drop_outlires` is True.
1260        """
1261
1262        full_data = {}
1263
1264        if drop_outlires == True:
1265            data = self.drop_up_df(
1266                data=data, group_col=group_col, values_col=values_col
1267            )
1268
1269        groups = set(data[group_col])
1270
1271        percentiles, percentiles_loop = self.percentiles_calculation(
1272            data[values_col], sep_perc=sep_perc
1273        )
1274
1275        for g in groups:
1276
1277            print(f"Group: {g} ...")
1278
1279            tmp_values = data[values_col][data[group_col] == g]
1280
1281            per_dat = self.to_percentil(tmp_values, percentiles, percentiles_loop)
1282
1283            full_data[g] = per_dat
1284
1285        return full_data

Calculate summary statistics based on percentile ranges for each group in a DataFrame.

This method groups the input DataFrame by group_col, computes percentile ranges for each group's values in values_col, and aggregates statistics (count, proportion, mean, median, standard deviation, variance) for each percentile range. Optionally, upper outliers can be removed before calculation.

Parameters

data : pd.DataFrame Input DataFrame containing the data.

group_col : str Column name used to define groups.

values_col : str Column name containing the values for percentile calculations.

sep_perc : int, optional Separation interval for percentiles (default is 1, meaning percentiles are calculated at every 1%).

drop_outlires : bool, optional If True, removes upper outliers from the data before performing calculations (default is True).

Returns

full_data : dict Dictionary where each key is a group name and each value is a dictionary containing: - 'n' : list Number of elements in each percentile range.

- 'n_standarized' : list
    Proportion of elements in each percentile range relative to the total number of elements.

- 'avg' : list
    Mean value of elements within each percentile range.

- 'median' : list
    Median value of elements within each percentile range.

- 'std' : list
    Standard deviation of elements within each percentile range.

- 'var' : list
    Variance of elements within each percentile range.

Notes

  • Outlier removal uses the IQR method within each group if drop_outlires is True.
def round_to_scientific_notation(self, num):
1287    def round_to_scientific_notation(self, num):
1288        """
1289        Round a number to scientific notation if very small, otherwise to one decimal place.
1290
1291        Parameters
1292        ----------
1293        num : float
1294            The number to round.
1295
1296        Returns
1297        -------
1298        str
1299            The rounded number as a string.
1300            - If `num` is 0, returns "0.0".
1301            - If `abs(num) < 1e-4`, returns scientific notation with 1 decimal and 1-digit exponent.
1302            - Otherwise, returns the number rounded to one decimal place.
1303        """
1304
1305        if num == 0:
1306            return "0.0"
1307
1308        if abs(num) < 0.0001:
1309            rounded_num = np.format_float_scientific(num, precision=1, exp_digits=1)
1310            return rounded_num
1311        else:
1312            return f"{num:.1f}"

Round a number to scientific notation if very small, otherwise to one decimal place.

Parameters

num : float The number to round.

Returns

str The rounded number as a string. - If num is 0, returns "0.0". - If abs(num) < 1e-4, returns scientific notation with 1 decimal and 1-digit exponent. - Otherwise, returns the number rounded to one decimal place.

def aov_percentiles(self, data, testes_col, comb: str = '*'):
1314    def aov_percentiles(self, data, testes_col, comb: str = "*"):
1315        """
1316        Perform Welch's ANOVA on percentile-based group data.
1317
1318        This method calculates group values by combining the columns specified in `testes_col`
1319        according to the operation defined in `comb`, then performs Welch's ANOVA to test for
1320        differences in means between the groups. Welch's ANOVA is suitable when the groups
1321        have unequal variances.
1322
1323        Parameters
1324        ----------
1325        data : dict of pd.DataFrame
1326            Dictionary where keys are group names and values are DataFrames containing the data.
1327
1328        testes_col : str or list of str
1329            Column name(s) from which the group values are derived. If a list is provided, columns
1330            will be combined based on the `comb` operation.
1331
1332        comb : str, optional
1333            Operation used to combine multiple columns if `testes_col` is a list. Options include:
1334                '*' : multiplication
1335                '+' : addition
1336                '**': exponentiation
1337                '-' : subtraction
1338                '/' : division
1339            Default is '*'.
1340
1341        Returns
1342        -------
1343        F : float
1344            F-statistic from Welch's ANOVA.
1345
1346        p_val : float
1347            Uncorrected p-value from Welch's ANOVA, testing for significant differences between groups.
1348
1349        Notes
1350        -----
1351        - If `testes_col` is a single string, no combination is performed, and the group values
1352          are taken directly from that column.
1353        - Welch's ANOVA is used as it accounts for unequal variances between groups.
1354        - The `df.melt()` method is used to reshape the data, allowing the ANOVA to be applied to all groups.
1355
1356        Examples
1357        --------
1358        >>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+')
1359        >>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}")
1360        """
1361
1362        groups = []
1363
1364        for d in data.keys():
1365
1366            if isinstance(testes_col, str):
1367                g = data[d][testes_col]
1368            elif isinstance(testes_col, list):
1369                g = [1] * len(data[d][testes_col[0]])
1370                for t in testes_col:
1371                    if comb == "*":
1372                        g = [a * b for a, b in zip(g, data[d][t])]
1373                    elif comb == "+":
1374                        g = [a + b for a, b in zip(g, data[d][t])]
1375                    elif comb == "**":
1376                        g = [a**b for a, b in zip(g, data[d][t])]
1377                    elif comb == "-":
1378                        g = [a - b for a, b in zip(g, data[d][t])]
1379                    elif comb == "/":
1380                        g = [a / b for a, b in zip(g, data[d][t])]
1381
1382            groups.append(g)
1383
1384        df = pd.DataFrame({f"group_{i}": group for i, group in enumerate(groups)})
1385
1386        df_melted = df.melt(var_name="group", value_name="value")
1387
1388        welch_results = pg.welch_anova(data=df_melted, dv="value", between="group")
1389
1390        return welch_results["F"].values[0], welch_results["p-unc"].values[0]

Perform Welch's ANOVA on percentile-based group data.

This method calculates group values by combining the columns specified in testes_col according to the operation defined in comb, then performs Welch's ANOVA to test for differences in means between the groups. Welch's ANOVA is suitable when the groups have unequal variances.

Parameters

data : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing the data.

testes_col : str or list of str Column name(s) from which the group values are derived. If a list is provided, columns will be combined based on the comb operation.

comb : str, optional Operation used to combine multiple columns if testes_col is a list. Options include: '*' : multiplication, '+' : addition, '**' : exponentiation, '-' : subtraction, '/' : division. Default is '*'.

Returns

F : float F-statistic from Welch's ANOVA.

p_val : float Uncorrected p-value from Welch's ANOVA, testing for significant differences between groups.

Notes

  • If testes_col is a single string, no combination is performed, and the group values are taken directly from that column.
  • Welch's ANOVA is used as it accounts for unequal variances between groups.
  • The df.melt() method is used to reshape the data, allowing the ANOVA to be applied to all groups.

Examples

>>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+')
>>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}")
def post_aov_percentiles(self, data, testes_col, comb: str = '*'):
1392    def post_aov_percentiles(self, data, testes_col, comb: str = "*"):
1393        """
1394        Perform Welch's ANOVA on percentile-based group data and pairwise Welch's t-tests.
1395
1396        This method first performs Welch's ANOVA to assess differences in group means, and
1397        then conducts pairwise Welch's t-tests between all group combinations. P-values are
1398        adjusted using the Bonferroni correction for multiple comparisons.
1399
1400        Parameters
1401        ----------
1402        data : dict of pd.DataFrame
1403            Dictionary where keys are group names and values are DataFrames containing the data.
1404
1405        testes_col : str or list of str
1406            Column name(s) from which the group values are derived. If a list is provided,
1407            columns will be combined according to the `comb` operation.
1408
1409        comb : str, optional
1410            Operation used to combine multiple columns if `testes_col` is a list. Options include:
1411                '*' : multiplication
1412                '+' : addition
1413                '**': exponentiation
1414                '-' : subtraction
1415                '/' : division
1416            Default is '*'.
1417
1418        Returns
1419        -------
1420        p_val : float
1421            Uncorrected p-value from the Welch's ANOVA.
1422
1423        final_results : dict
1424            Dictionary containing results of pairwise Welch's t-tests with keys:
1425                'group1' : list of first group names in each comparison
1426                'group2' : list of second group names in each comparison
1427                'stat' : list of t-statistics for each comparison
1428                'p_val' : list of uncorrected p-values for each comparison
1429                'adj_p_val' : list of Bonferroni-adjusted p-values for multiple comparisons
1430        """
1431
1432        p_val = self.aov_percentiles(data=data, testes_col=testes_col, comb=comb)[1]
1433
1434        pairs = list(combinations(data, 2))
1435        final_results = {
1436            "group1": [],
1437            "group2": [],
1438            "stat": [],
1439            "p_val": [],
1440            "adj_p_val": [],
1441        }
1442
1443        for group1, group2 in pairs:
1444            if isinstance(testes_col, str):
1445                g1 = data[group1][testes_col]
1446            elif isinstance(testes_col, list):
1447                g1 = [1] * len(data[group1][testes_col[0]])
1448                for t in testes_col:
1449                    if comb == "*":
1450                        g1 = [a * b for a, b in zip(g1, data[group1][t])]
1451                    elif comb == "+":
1452                        g1 = [a + b for a, b in zip(g1, data[group1][t])]
1453                    elif comb == "**":
1454                        g1 = [a**b for a, b in zip(g1, data[group1][t])]
1455                    elif comb == "-":
1456                        g1 = [a - b for a, b in zip(g1, data[group1][t])]
1457                    elif comb == "/":
1458                        g1 = [a / b for a, b in zip(g1, data[group1][t])]
1459
1460            if isinstance(testes_col, str):
1461                g2 = data[group2][testes_col]
1462            elif isinstance(testes_col, list):
1463                g2 = [1] * len(data[group2][testes_col[0]])
1464                for t in testes_col:
1465                    if comb == "*":
1466                        g2 = [a * b for a, b in zip(g2, data[group2][t])]
1467                    elif comb == "+":
1468                        g2 = [a + b for a, b in zip(g2, data[group2][t])]
1469                    elif comb == "**":
1470                        g2 = [a**b for a, b in zip(g2, data[group2][t])]
1471                    elif comb == "-":
1472                        g2 = [a - b for a, b in zip(g2, data[group2][t])]
1473                    elif comb == "/":
1474                        g2 = [a / b for a, b in zip(g2, data[group2][t])]
1475
1476            stat, p_val = stats.ttest_ind(
1477                g1, g2, alternative="two-sided", equal_var=False
1478            )
1479            g = sorted([group1, group2])
1480            final_results["group1"].append(g[0])
1481            final_results["group2"].append(g[1])
1482            final_results["stat"].append(stat)
1483            final_results["p_val"].append(p_val)
1484            adj = p_val * len(pairs)
1485            if adj > 1:
1486                final_results["adj_p_val"].append(1)
1487            else:
1488                final_results["adj_p_val"].append(adj)
1489
1490        return p_val, final_results

Perform Welch's ANOVA on percentile-based group data and pairwise Welch's t-tests.

This method first performs Welch's ANOVA to assess differences in group means, and then conducts pairwise Welch's t-tests between all group combinations. P-values are adjusted using the Bonferroni correction for multiple comparisons.

Parameters

data : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing the data.

testes_col : str or list of str Column name(s) from which the group values are derived. If a list is provided, columns will be combined according to the comb operation.

comb : str, optional Operation used to combine multiple columns if testes_col is a list. Options include: '*' : multiplication, '+' : addition, '**' : exponentiation, '-' : subtraction, '/' : division. Default is '*'.

Returns

p_val : float Uncorrected p-value from the Welch's ANOVA.

final_results : dict Dictionary containing results of pairwise Welch's t-tests with keys: 'group1' : list of first group names in each comparison 'group2' : list of second group names in each comparison 'stat' : list of t-statistics for each comparison 'p_val' : list of uncorrected p-values for each comparison 'adj_p_val' : list of Bonferroni-adjusted p-values for multiple comparisons

def chi2_percentiles(self, input_hist):
1492    def chi2_percentiles(self, input_hist):
1493        """
1494        Perform a Chi-squared test on percentile-based group data.
1495
1496        This method reformats the input histogram data into a contingency table and performs
1497        a Chi-squared test to evaluate whether there is a significant association between groups.
1498
1499        Parameters
1500        ----------
1501        input_hist : dict of pd.DataFrame
1502            Dictionary where keys are group names and values are DataFrames containing histogram data.
1503            The DataFrame must include a column 'n' representing counts for each percentile/bin.
1504
1505        Returns
1506        -------
1507        chi2_statistic : float
1508            Chi-squared test statistic.
1509
1510        p_value : float
1511            P-value from the Chi-squared test.
1512
1513        dof : int
1514            Degrees of freedom for the test.
1515
1516        expected : np.ndarray
1517            Expected frequencies for each group/bin under the null hypothesis.
1518
1519        chi_data : dict
1520            Formatted data used in the Chi-squared test, with group names as keys and bin counts as values.
1521
1522        Example
1523        -------
1524        chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist)
1525        print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}")
1526        """
1527
1528        chi_data = {}
1529
1530        for d in input_hist.keys():
1531            tmp_dic = {}
1532
1533            for n, c in enumerate(input_hist[d]["n"]):
1534                tmp_dic[f"p{n+1}"] = c
1535
1536            chi_data[d] = tmp_dic
1537
1538        chi2_statistic, p_value, dof, expected = chi2_contingency(
1539            pd.DataFrame(chi_data).T, correction=True
1540        )
1541
1542        return chi2_statistic, p_value, dof, expected, chi_data

Perform a Chi-squared test on percentile-based group data.

This method reformats the input histogram data into a contingency table and performs a Chi-squared test to evaluate whether there is a significant association between groups.

Parameters

input_hist : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing histogram data. The DataFrame must include a column 'n' representing counts for each percentile/bin.

Returns

chi2_statistic : float Chi-squared test statistic.

p_value : float P-value from the Chi-squared test.

dof : int Degrees of freedom for the test.

expected : np.ndarray Expected frequencies for each group/bin under the null hypothesis.

chi_data : dict Formatted data used in the Chi-squared test, with group names as keys and bin counts as values.

Example

chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist) print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}")

def post_ch2_percentiles(self, input_hist):
1544    def post_ch2_percentiles(self, input_hist):
1545        """
1546        Perform a Chi-squared test on percentile-based group data, including pairwise comparisons.
1547
1548        This method first performs a Chi-squared test across all groups to check for a significant association.
1549        It then performs pairwise Chi-squared tests between groups to identify specific differences.
1550        P-values for multiple comparisons are adjusted using the Bonferroni correction.
1551
1552        Parameters
1553        ----------
1554        input_hist : dict of pd.DataFrame
1555            Dictionary where keys are group names and values are DataFrames containing histogram data.
1556            Each DataFrame must include a column 'n' with counts for each percentile/bin.
1557
1558        Returns
1559        -------
1560        p_val : float
1561            Overall p-value from the initial Chi-squared test across all groups.
1562
1563        final_results : dict
1564            Results of pairwise Chi-squared tests, with keys:
1565                - 'group1' (list): Name of the first group in each comparison
1566                - 'group2' (list): Name of the second group in each comparison
1567                - 'chi2' (list): Chi-squared statistic for each pairwise comparison
1568                - 'p_val' (list): P-value for each pairwise comparison
1569                - 'adj_p_val' (list): Adjusted p-value (Bonferroni correction) for multiple comparisons
1570
1571        Example
1572        -------
1573        p_val, final_results = self.post_ch2_percentiles(input_hist)
1574        print(f"Overall Chi-squared p-value: {p_val}")
1575        for i in range(len(final_results['group1'])):
1576            print(f"Comparison: {final_results['group1'][i]} vs {final_results['group2'][i]}")
1577            print(f"Chi2 stat: {final_results['chi2'][i]}, p-value: {final_results['p_val'][i]}, adj. p-value: {final_results['adj_p_val'][i]}")
1578        """
1579
1580        res = self.chi2_percentiles(input_hist)
1581
1582        pairs = list(combinations(res[4], 2))
1583        results = []
1584
1585        for group1, group2 in pairs:
1586            table_pair = pd.DataFrame(res[4])[[group1, group2]]
1587            chi2_stat, p_val, _, _ = chi2_contingency(table_pair, correction=True)
1588            results.append((group1, group2, chi2_stat, p_val))
1589
1590        final_results = {
1591            "group1": [],
1592            "group2": [],
1593            "chi2": [],
1594            "p_val": [],
1595            "adj_p_val": [],
1596        }
1597
1598        for group1, group2, chi2_stat, p_val in results:
1599            g = sorted([group1, group2])
1600            final_results["group1"].append(g[0])
1601            final_results["group2"].append(g[1])
1602            final_results["chi2"].append(chi2_stat)
1603            final_results["p_val"].append(p_val)
1604            adj = p_val * len(results)
1605            if adj > 1:
1606                final_results["adj_p_val"].append(1)
1607            else:
1608                final_results["adj_p_val"].append(adj)
1609
1610        return res[1], final_results

Perform a Chi-squared test on percentile-based group data, including pairwise comparisons.

This method first performs a Chi-squared test across all groups to check for a significant association. It then performs pairwise Chi-squared tests between groups to identify specific differences. P-values for multiple comparisons are adjusted using the Bonferroni correction.

Parameters

input_hist : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing histogram data. Each DataFrame must include a column 'n' with counts for each percentile/bin.

Returns

p_val : float Overall p-value from the initial Chi-squared test across all groups.

final_results : dict Results of pairwise Chi-squared tests, with keys: - 'group1' (list): Name of the first group in each comparison - 'group2' (list): Name of the second group in each comparison - 'chi2' (list): Chi-squared statistic for each pairwise comparison - 'p_val' (list): P-value for each pairwise comparison - 'adj_p_val' (list): Adjusted p-value (Bonferroni correction) for multiple comparisons

Example

p_val, final_results = self.post_ch2_percentiles(input_hist) print(f"Overall Chi-squared p-value: {p_val}") for i in range(len(final_results['group1'])): print(f"Comparison: {final_results['group1'][i]} vs {final_results['group2'][i]}") print(f"Chi2 stat: {final_results['chi2'][i]}, p-value: {final_results['p_val'][i]}, adj. p-value: {final_results['adj_p_val'][i]}")

def hist_compare_plot( self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20):
    def hist_compare_plot(
        self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20
    ):
        """
        Generate comparative histograms and display results of statistical tests (ANOVA and Chi-squared).

        This method performs transformations on the input data, generates comparative histograms
        for each group, and annotates statistical test results, including Welch's ANOVA and Chi-squared tests.
        Multiple comparison corrections can be applied using the Bonferroni method.

        Parameters
        ----------
        data : dict of pd.DataFrame
            Dictionary where keys are group names and values are DataFrames containing histogram data.
            Each DataFrame should include the column specified by `tested_value` and a
            'n_standarized' column (read by the plotting code below).
            Note: the `tested_value` column of every DataFrame is modified in place
            (Box-Cox transformed) by this method.

        queue : list of str
            Defines the order of groups to be plotted. Must contain exactly the keys of
            `data`; otherwise a sorted default order is used instead.

        tested_value : str
            The column name in `data` representing the variable to test and visualize.

        p_adj : bool, optional
            If True, applies Bonferroni correction for multiple comparisons (default is True).

        txt_size : int, optional
            Font size for text annotations in the plot (default is 20).

        Returns
        -------
        fig : matplotlib.figure.Figure
            Matplotlib figure object containing the generated histograms and statistical test results.

        Example
        -------
        fig = self.hist_compare_plot(
            data,
            queue=['group1', 'group2', 'group3'],
            tested_value='n',
            p_adj=True,
            txt_size=18
        )
        plt.show()
        """

        # Box-Cox transform the tested column of every group to reduce skew.
        # NOTE(review): this mutates the caller's DataFrames in place, and the
        # +1 shift presumes values >= 0 (Box-Cox requires strictly positive
        # input) — confirm upstream guarantees. `fitted_lambda` is unused.
        for i in data.keys():
            values = np.array(data[i][tested_value])
            values += 1
            transformed_values, fitted_lambda = stats.boxcox(values)
            data[i][tested_value] = transformed_values.tolist()

        # Fall back to a deterministic (sorted) plotting order when the
        # provided queue does not match the group names exactly.
        if sorted(queue) != sorted(data.keys()):
            print(
                "\n Wrong queue provided! The queue will be sorted with default settings!"
            )
            queue = sorted(data.keys())

        # parametric selected value:
        # Welch's ANOVA + pairwise Welch's t-tests on the (transformed) tested column.
        pk, dfk = self.post_aov_percentiles(data, testes_col=tested_value)

        dfk = pd.DataFrame(dfk)

        # Order the pairwise rows to follow the plotting queue.
        dfk = dfk.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

        # parametric standarized selected value:
        # same tests on the product of the tested column and 'n_standarized'.
        pkc, dfkc = self.post_aov_percentiles(
            data, testes_col=[tested_value, "n_standarized"], comb="*"
        )

        dfkc = pd.DataFrame(dfkc)

        dfkc = dfkc.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

        # chi2: overall + pairwise Chi-squared tests on the bin counts.
        pchi, dfchi = self.post_ch2_percentiles(data)

        dfchi = pd.DataFrame(dfchi)

        dfchi = dfchi.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

        ##############################################################################

        # Shared y-limits so every group column is plotted on comparable axes.
        standarized_max, standarized_min, value_max, value_min = [], [], [], []
        for d in queue:
            standarized_max.append(max(data[d]["n_standarized"]))
            standarized_min.append(min(data[d]["n_standarized"]))
            value_max.append(max(data[d][tested_value]))
            value_min.append(min(data[d][tested_value]))

        # One column per group plus a narrow trailing column for the test text.
        num_columns = len(queue) + 1

        fig, axs = plt.subplots(
            3,
            num_columns,
            figsize=(8 * num_columns, 10),
            gridspec_kw={"width_ratios": [1] * len(queue) + [0.5], "wspace": 0.05},
        )

        for i, d in enumerate(queue):
            tmp_data = data[d]

            # Row 0: standardized frequency histogram.
            axs[0, i].bar(
                [str(n) for n in range(len(tmp_data["n_standarized"]))],
                tmp_data["n_standarized"],
                width=0.95,
                color="gold",
            )
            axs[0, i].set_ylim(
                min(standarized_min) * 0.9995, max(standarized_max) * 1.0005
            )

            # Only the first column keeps its y-axis; others share it implicitly.
            if i == 0:
                axs[0, i].set_ylabel("Standarized\nfrequency", fontsize=txt_size)
            else:
                axs[0, i].set_yticks([])

            axs[0, i].set_xticks([])
            axs[0, i].tick_params(axis="y", labelsize=txt_size * 0.7)

            # Row 1: transformed tested value, with its mean as a dashed line.
            axs[1, i].bar(
                [str(n) for n in range(len(tmp_data[tested_value]))],
                tmp_data[tested_value],
                width=0.95,
                color="orange",
            )

            mean_value = np.mean(tmp_data[tested_value])
            axs[1, i].axhline(y=mean_value, color="red", linestyle="--")

            # axs[1, i].set_ylim(min(value_min) - 1, max(value_max) + 1)
            axs[1, i].set_ylim(min(value_min) * 0.9995, max(value_max) * 1.0005)

            if i == 0:
                axs[1, i].set_ylabel(f"Normalized\n{tested_value}", fontsize=txt_size)
            else:
                axs[1, i].set_yticks([])

            axs[1, i].set_xticks([])
            axs[1, i].tick_params(axis="y", labelsize=txt_size * 0.7)

            # Row 2: element-wise product of the tested value and the
            # standardized frequency, again with its mean marked.
            axs[2, i].bar(
                [str(n) for n in range(len(tmp_data["n_standarized"]))],
                [
                    a * b
                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
                ],
                width=0.95,
                color="goldenrod",
            )

            mean_value = np.mean(
                [
                    a * b
                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
                ]
            )
            axs[2, i].axhline(y=mean_value, color="red", linestyle="--")

            axs[2, i].set_ylim(
                (min(standarized_min) * min(value_min)) * 0.9995,
                (max(standarized_max) * max(value_max) * 1.0005),
            )
            axs[2, i].set_xlabel(d, fontsize=txt_size)

            if i == 0:
                axs[2, i].set_ylabel(
                    f"Standarized\nnorm_{tested_value}", fontsize=txt_size
                )
            else:
                axs[2, i].set_yticks([])

            axs[2, i].set_xticks([])
            axs[2, i].tick_params(axis="y", labelsize=txt_size * 0.7)

        # --- Welch's ANOVA annotation for the plain tested value (row 1) ---
        # Significance stars use the usual thresholds:
        # *** < 0.001, ** < 0.01, * < 0.05, otherwise 'ns'.
        sign = "ns"
        if float(self.round_to_scientific_notation(pk)) < 0.001:
            sign = "***"
        elif float(self.round_to_scientific_notation(pk)) < 0.01:
            sign = "**"
        elif float(self.round_to_scientific_notation(pk)) < 0.05:
            sign = "*"

        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pk)} - {sign}\n"

        # Append one line per pairwise comparison, using Bonferroni-adjusted
        # or raw p-values depending on `p_adj`.
        if p_adj == True:
            for i in range(len(dfk["group1"])):
                sign = "ns"
                if dfk["adj_p_val"][i] < 0.001:
                    sign = "***"
                elif dfk["adj_p_val"][i] < 0.01:
                    sign = "**"
                elif dfk["adj_p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['adj_p_val'][i])} - {sign}\n"
        else:
            for i in range(len(dfk["group1"])):
                sign = "ns"
                if dfk["p_val"][i] < 0.001:
                    sign = "***"
                elif dfk["p_val"][i] < 0.01:
                    sign = "**"
                elif dfk["p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['p_val'][i])} - {sign}\n"

        # Write the summary text into the spare last column of row 1.
        axs[1, -1].text(
            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
        )
        axs[1, -1].set_axis_off()

        # --- Welch's ANOVA annotation for the standardized product (row 2) ---
        sign = "ns"
        if float(self.round_to_scientific_notation(pkc)) < 0.001:
            sign = "***"
        elif float(self.round_to_scientific_notation(pkc)) < 0.01:
            sign = "**"
        elif float(self.round_to_scientific_notation(pkc)) < 0.05:
            sign = "*"

        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pkc)} - {sign}\n"

        if p_adj == True:
            for i in range(len(dfkc["group1"])):
                sign = "ns"
                if dfkc["adj_p_val"][i] < 0.001:
                    sign = "***"
                elif dfkc["adj_p_val"][i] < 0.01:
                    sign = "**"
                elif dfkc["adj_p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['adj_p_val'][i])} - {sign}\n"
        else:
            for i in range(len(dfkc["group1"])):
                sign = "ns"
                if dfkc["p_val"][i] < 0.001:
                    sign = "***"
                elif dfkc["p_val"][i] < 0.01:
                    sign = "**"
                elif dfkc["p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['p_val'][i])} - {sign}\n"

        axs[2, -1].text(
            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
        )
        axs[2, -1].set_axis_off()

        # --- Chi-squared annotation for the standardized frequency (row 0) ---
        sign = "ns"
        if float(self.round_to_scientific_notation(pchi)) < 0.001:
            sign = "***"
        elif float(self.round_to_scientific_notation(pchi)) < 0.01:
            sign = "**"
        elif float(self.round_to_scientific_notation(pchi)) < 0.05:
            sign = "*"

        text = f"Test Chi-squared\np-value: {self.round_to_scientific_notation(pchi)} - {sign}\n"

        if p_adj == True:
            for i in range(len(dfchi["group1"])):
                sign = "ns"
                if dfchi["adj_p_val"][i] < 0.001:
                    sign = "***"
                elif dfchi["adj_p_val"][i] < 0.01:
                    sign = "**"
                elif dfchi["adj_p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['adj_p_val'][i])} - {sign}\n"
        else:
            for i in range(len(dfchi["group1"])):
                sign = "ns"
                if dfchi["p_val"][i] < 0.001:
                    sign = "***"
                elif dfchi["p_val"][i] < 0.01:
                    sign = "**"
                elif dfchi["p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['p_val'][i])} - {sign}\n"

        axs[0, -1].text(
            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
        )
        axs[0, -1].set_axis_off()

        plt.tight_layout()

        # Show interactively only when the package-level display flag is set.
        if cfg._DISPLAY_MODE:
            plt.show()

        return fig

Generate comparative histograms and display results of statistical tests (ANOVA and Chi-squared).

This method performs transformations on the input data, generates comparative histograms for each group, and annotates statistical test results, including Welch's ANOVA and Chi-squared tests. Multiple comparison corrections can be applied using the Bonferroni method.

Parameters

data : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing histogram data. Each DataFrame should include the column specified by tested_value.

queue : list of str Defines the order of groups to be plotted.

tested_value : str The column name in data representing the variable to test and visualize.

p_adj : bool, optional If True, applies Bonferroni correction for multiple comparisons (default is True).

txt_size : int, optional Font size for text annotations in the plot (default is 20).

Returns

fig : matplotlib.figure.Figure Matplotlib figure object containing the generated histograms and statistical test results.

Example

fig = self.hist_compare_plot( data, queue=['group1', 'group2', 'group3'], tested_value='n', p_adj=True, txt_size=18 ) plt.show()