# jimg_int.intensity
1import json 2import os 3import random 4import re 5from itertools import combinations 6 7import matplotlib.pyplot as plt 8import numpy as np 9import pandas as pd 10import pingouin as pg 11from scipy import stats 12from scipy.stats import chi2_contingency 13from tqdm import tqdm 14 15import jimg_int.config as cfg 16 17from .utils import * 18 19random.seed(42) 20 21 22class FeatureIntensity(ImageTools): 23 r""" 24 Class for quantitative analysis of pixel intensity and size measurements 25 in 2D/3D biological images. Supports projection of 3D stacks, mask-based 26 intensity normalization, region size estimation and metadata extraction. 27 28 Parameters 29 ---------- 30 input_image : ndarray, optional 31 Input image or 3D stack for analysis. If 3D, projection must be applied. 32 33 image : ndarray, optional 34 2D projected image (internal use). 35 36 normalized_image_values : dict, optional 37 Dictionary storing normalized intensity statistics. 38 39 mask : ndarray, optional 40 Binary mask of region of interest (ROI). 41 42 background_mask : ndarray, optional 43 Binary mask used for background estimation. If not provided, `mask` is used. 44 45 typ : {"avg", "median", "std", "var", "max", "min"}, optional 46 Projection type for 3D stacks. Default is `"avg"`. 47 48 size_info : dict, optional 49 Dictionary storing ROI size measurements. 50 51 correction_factor : float, optional 52 Normalization correction factor applied to background intensity. 53 Must satisfy 0 < factor < 1. Default is 0.1. 54 55 img_type : str, optional 56 Image type metadata. 57 58 scale : float, optional 59 Pixel resolution in physical units (e.g. µm/px). Used in size calculations. 60 61 stack_selection : list of int, optional 62 List of Z-indices to remove when projecting a 3D image. 63 64 Attributes 65 ---------- 66 input_image : ndarray or None 67 Loaded input image. 68 69 image : ndarray or None 70 Projected 2D image. 71 72 mask : ndarray or None 73 Region of interest mask. 
74 75 background_mask : ndarray or None 76 Background normalization mask. 77 78 scale : float or None 79 Scale value for size estimation. 80 81 normalized_image_values : dict or None 82 Dictionary containing intensity metrics. 83 84 size_info : dict or None 85 Dictionary with ROI size information. 86 87 typ : str 88 Selected projection type for 3D images. 89 90 stack_selection : list of int 91 Z-levels excluded from projection. 92 93 Notes 94 ----- 95 The intensity normalization formula applied per pixel is: 96 97 .. math:: 98 99 R_{i,j} = T_{i,j} - \\left( \\mu_B (1 + c) \\right) 100 101 where 102 * ``T_{i,j}`` – pixel intensity in ROI 103 * ``μ_B`` – mean intensity in background region 104 * ``c`` – correction factor 105 * ``R_{i,j}`` – normalized pixel intensity 106 107 Examples 108 -------- 109 Load a 3D image, mask and compute statistics: 110 111 >>> fi = FeatureIntensity() 112 >>> fi.load_image_3D("stack.tiff") 113 >>> fi.load_mask_("mask.png") 114 >>> fi.set_projection("median") 115 >>> fi.run_calculations() 116 >>> results = fi.get_results() 117 >>> results["intensity"]["norm_mean"] 118 """ 119 120 def __init__( 121 self, 122 input_image=None, 123 image=None, 124 normalized_image_values=None, 125 mask=None, 126 background_mask=None, 127 typ=None, 128 size_info=None, 129 correction_factor=None, 130 img_type=None, 131 scale=None, 132 stack_selection=None, 133 ): 134 """ 135 Initialize a FeatureIntensity analysis instance. 136 137 Parameters 138 ---------- 139 input_image : ndarray, optional 140 Input image or 3D stack used for analysis. If the image is 3D, a 141 projection will be computed depending on the `typ` parameter. 142 143 image : ndarray, optional 144 2D image buffer used internally after projection of the input image. 145 Should not be set manually. 146 147 normalized_image_values : dict, optional 148 Dictionary containing normalized intensity statistics. Usually filled 149 automatically after running `run_calculations()`. 
150 151 mask : ndarray, optional 152 Binary mask of the target region of interest (ROI). Required for 153 intensity and size calculations. 154 155 background_mask : ndarray, optional 156 Binary mask specifying the background region used to compute the 157 normalization threshold. If not provided, the ROI mask is also used 158 as the background reference. 159 160 typ : {"avg", "median", "std", "var", "max", "min"}, optional 161 Projection method for 3D images. Determines how the z-stack is 162 collapsed into a 2D image. Default is `"avg"`. 163 164 size_info : dict, optional 165 Dictionary storing computed size metrics of the ROI. Populated after 166 invoking `size_calculations()`. 167 168 correction_factor : float, optional 169 Correction term used during intensity normalization. Must satisfy 170 0 < correction_factor < 1. Default is 0.1. 171 172 img_type : str, optional 173 Optional metadata about the image type (e.g., "tiff", "png"). 174 175 scale : float, optional 176 Pixel resolution in physical units (e.g., µm/px). Required for 177 real-size estimation in `size_calculations()`. 178 179 stack_selection : list of int, optional 180 Indices of z-planes to exclude during projection of a 3D stack. 181 182 Notes 183 ----- 184 Values not provided are initialized to `None`, except for `typ`, which 185 defaults to `"avg"`, and `correction_factor`, which defaults to 0.1. 186 187 The class is designed to be populated by loading functions: 188 `load_image_()`, `load_image_3D()`, `load_mask_()`, 189 and optionally `load_normalization_mask_()` and `load_JIMG_project_()`. 190 """ 191 192 self.input_image = input_image or None 193 """ Input image or 3D stack used for analysis. If the image is 3D, a 194 projection will be computed depending on the `typ` parameter.""" 195 196 self.image = image or None 197 """ 2D image buffer used internally after projection of the input image. 
198 Should not be set manually.""" 199 200 self.normalized_image_values = normalized_image_values or None 201 """Dictionary containing normalized intensity statistics. Usually filled 202 automatically after running `run_calculations()`.""" 203 204 self.mask = mask or None 205 """Binary mask of the target region of interest (ROI). Required for 206 intensity and size calculations.""" 207 208 self.background_mask = background_mask or None 209 """ Binary mask specifying the background region used to compute the 210 normalization threshold. If not provided, the ROI mask is also used 211 as the background reference.""" 212 213 self.typ = typ or "avg" 214 """Projection method for 3D images. Determines how the z-stack is 215 collapsed into a 2D image. Default is `"avg"`.""" 216 217 self.size_info = size_info or None 218 """Dictionary storing computed size metrics of the ROI. Populated after 219 invoking `size_calculations()`.""" 220 221 self.correction_factor = correction_factor or 0.1 222 """ Correction term used during intensity normalization. Must satisfy 223 0 < correction_factor < 1. Default is 0.1.""" 224 225 self.scale = scale or None 226 """ Pixel resolution in physical units (e.g., µm/px). Required for 227 real-size estimation in `size_calculations()`.""" 228 229 self.stack_selection = stack_selection or [] 230 """Indices of z-planes to exclude during projection of a 3D stack.""" 231 232 @property 233 def current_metadata(self): 234 r""" 235 Return current metadata parameters used in image processing and normalization. 236 237 Returns 238 ------- 239 tuple 240 A tuple containing: 241 242 projection_type : str 243 Projection method used for 3D image reduction (e.g., "avg", "median"). 244 245 correction_factor : float 246 Correction factor used for background subtraction during intensity 247 normalization. The applied formula is: 248 249 .. 
math:: 250 251 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 252 253 where 254 * ``R_{i,j}`` — normalized pixel intensity 255 * ``T_{i,j}`` — original pixel intensity 256 * ``μ_B`` — mean background intensity 257 * ``c`` — correction factor 258 scale : float or None 259 Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually 260 using `set_scale()`. 261 262 stack_selection : list of int 263 Indices of z-slices excluded from projection of a 3D image. 264 265 Notes 266 ----- 267 This property also prints the metadata values to the console for quick inspection. 268 """ 269 270 print(f"Projection type: {self.typ}") 271 print(f"Correction factor: {self.correction_factor}") 272 print(f"Scale (unit/px): {self.scale}") 273 print(f"Selected stac to remove: {self.stack_selection}") 274 275 return self.typ, self.correction_factor, self.scale, self.stack_selection 276 277 def set_projection(self, projection: str): 278 """ 279 Set the projection method for 3D image stack reduction. 280 281 Parameters 282 ---------- 283 projection : {"avg", "median", "std", "var", "max", "min"} 284 Projection method to reduce a 3D image stack to a 2D image. Default is `"avg"`. 285 286 Notes 287 ----- 288 This method updates the `typ` attribute of the class. The selected projection 289 determines how the z-stack is collapsed: 290 - `"avg"` : average intensity across slices 291 - `"median"` : median intensity across slices 292 - `"std"` : standard deviation across slices 293 - `"var"` : variance across slices 294 - `"max"` : maximum intensity across slices 295 - `"min"` : minimum intensity across slices 296 """ 297 298 t = ["avg", "median", "std", "var", "max", "min"] 299 if projection in t: 300 self.typ = projection 301 else: 302 print(f"\nProvided parameter is incorrect. Avaiable projection types: {t}") 303 304 def set_correction_factorn(self, factor: float): 305 r""" 306 Set the correction factor for background subtraction during intensity normalization. 
307 308 Parameters 309 ---------- 310 factor : float 311 Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1. 312 Default is 0.1. 313 314 Notes 315 ----- 316 The correction is applied per pixel in the target mask using the formula: 317 318 .. math:: 319 320 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 321 322 where 323 * ``R_{i,j}`` — normalized pixel intensity 324 * ``T_{i,j}`` — original pixel intensity 325 * ``μ_B`` — mean intensity in the background mask 326 * ``c`` — correction factor 327 """ 328 329 if factor < 1 and factor > 0: 330 self.correction_factor = factor 331 else: 332 print( 333 "\nProvided parameter is incorrect. The factor should be a floating-point value within the range of 0 to 1." 334 ) 335 336 def set_scale(self, scale): 337 """ 338 Set the scale for converting pixel measurements to physical units. 339 340 Parameters 341 ---------- 342 scale : float 343 Pixel resolution in physical units (e.g., µm/px). Used to calculate the 344 actual size of the tissue or organ. 345 346 Notes 347 ----- 348 The scale can also be automatically loaded from a JIMG project using 349 `load_JIMG_project_()`. This value is required for size calculations in 350 `size_calculations()`. 351 """ 352 353 self.scale = scale 354 355 def set_selection_list(self, rm_list: list): 356 """ 357 Set the list of z-slices to exclude when projecting a 3D image stack. 358 359 Parameters 360 ---------- 361 rm_list : list of int 362 List of indices corresponding to z-slices that should be removed from 363 the full 3D image stack before projection. 364 365 Notes 366 ----- 367 This updates the `stack_selection` attribute, which is used by the 368 `stack_selection_()` method during projection. 369 """ 370 371 self.stack_selection = rm_list 372 373 def load_JIMG_project_(self, path): 374 """ 375 Load a JIMG project from a `.pjm` file. 376 377 Parameters 378 ---------- 379 file_path : str 380 Path to the JIMG project file. The file must have a `.pjm` extension. 
381 382 Returns 383 ------- 384 project : object 385 Loaded project object containing images and metadata. 386 387 Raises 388 ------ 389 ValueError 390 If the provided file path does not point to a `.pjm` file. 391 392 Notes 393 ----- 394 The method attempts to automatically set the `scale` and `stack_selection` 395 attributes from the project metadata if available. 396 """ 397 398 path = os.path.abspath(path) 399 400 if ".pjm" in path: 401 metadata = self.load_JIMG_project(path) 402 403 try: 404 self.scale = metadata.metadata["X_resolution[um/px]"] 405 except: 406 407 try: 408 self.scale = metadata.images_dict["metadata"][0][ 409 "X_resolution[um/px]" 410 ] 411 412 except: 413 print( 414 '\nUnable to set scale on this project! Set scale using "set_scale()"' 415 ) 416 417 self.stack_selection = metadata.removal_list 418 419 else: 420 print( 421 "\nWrong path. The provided path does not point to a JIMG project (*.pjm)." 422 ) 423 424 def stack_selection_(self): 425 """ 426 Remove selected z-slices from a 3D image stack based on `stack_selection`. 427 428 Notes 429 ----- 430 Only works if `input_image` is a 3D ndarray. The slices with indices listed 431 in `stack_selection` are excluded from the stack. Updates `input_image` 432 in-place. 433 434 Prints a warning if `stack_selection` is empty. 435 """ 436 437 if len(self.input_image.shape) == 3: 438 if len(self.stack_selection) > 0: 439 self.input_image = self.input_image[ 440 [ 441 x 442 for x in range(self.input_image.shape[0]) 443 if x not in self.stack_selection 444 ] 445 ] 446 else: 447 print("\nImages to remove from the stack were not selected!") 448 449 def projection(self): 450 """ 451 Project a 3D image stack into a 2D image using the method defined by `typ`. 452 453 Notes 454 ----- 455 Updates the `image` attribute with the projected 2D result. 
456 457 Supported projection types (`typ`): 458 - "avg" : mean intensity across slices 459 - "median" : median intensity across slices 460 - "std" : standard deviation across slices 461 - "var" : variance across slices 462 - "max" : maximum intensity across slices 463 - "min" : minimum intensity across slices 464 465 Raises 466 ------ 467 AttributeError 468 If `input_image` is not defined. 469 """ 470 471 if self.typ == "avg": 472 img = np.mean(self.input_image, axis=0) 473 474 elif self.typ == "std": 475 img = np.std(self.input_image, axis=0) 476 477 elif self.typ == "median": 478 img = np.median(self.input_image, axis=0) 479 480 elif self.typ == "var": 481 img = np.var(self.input_image, axis=0) 482 483 elif self.typ == "max": 484 img = np.max(self.input_image, axis=0) 485 486 elif self.typ == "min": 487 img = np.min(self.input_image, axis=0) 488 489 self.image = img 490 491 def detect_img(self): 492 """ 493 Detect whether the input image is 2D or 3D and perform appropriate preprocessing. 494 495 Notes 496 ----- 497 - For 3D images, applies `stack_selection_()` and then `projection()`. 498 - For 2D images, no projection is applied. 499 - Prints status messages indicating the type of image and applied operations. 500 501 Raises 502 ------ 503 AttributeError 504 If `input_image` is not defined. 505 """ 506 check = len(self.input_image.shape) 507 508 if check == 3: 509 print("\n3D image detected! Starting processing for 3D image...") 510 print(f"Projection - {self.typ}...") 511 512 self.stack_selection_() 513 self.projection() 514 515 elif check == 2: 516 print("\n2D image detected! Starting processing for 2D image...") 517 518 else: 519 print("\nData does not match any image type!") 520 521 def load_image_3D(self, path): 522 """ 523 Load a 3D image stack from a TIFF file. 524 525 Parameters 526 ---------- 527 path : str 528 Path to the 3D image file (*.tiff) to be loaded. 
529 530 Notes 531 ----- 532 The loaded image is stored in the `input_image` attribute as a 3D ndarray. 533 """ 534 path = os.path.abspath(path) 535 536 self.input_image = self.load_3D_tiff(path) 537 538 def load_image_(self, path): 539 """ 540 Load a 2D image into the class. 541 542 Parameters 543 ---------- 544 path : str 545 Path to the image file to be loaded. 546 547 Notes 548 ----- 549 The loaded image is stored in the `input_image` attribute as a 2D ndarray. 550 """ 551 path = os.path.abspath(path) 552 553 self.input_image = self.load_image(path) 554 555 def load_mask_(self, path): 556 r""" 557 Load a binary mask into the class and optionally set it as the normalization mask. 558 559 Parameters 560 ---------- 561 path : str 562 Path to the mask image file. Supported formats include 8-bit or 16-bit images 563 with extensions such as `.png` or `.jpeg`. The mask must be binary 564 (e.g., 0/255, 0/2**16-1, 0/1). 565 566 Notes 567 ----- 568 - If `load_normalization_mask_()` is not called, this mask is also used as the 569 background mask for intensity normalization. 570 - Normalization is applied per pixel using the formula: 571 572 .. math:: 573 574 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 575 576 where 577 * ``R_{i,j}`` — normalized pixel intensity 578 * ``T_{i,j}`` — pixel intensity in the target mask 579 * ``μ_B`` — mean intensity of the background (reversed mask) 580 * ``c`` — correction factor 581 """ 582 583 path = os.path.abspath(path) 584 585 self.mask = self.load_mask(path) 586 587 print( 588 "\nThis mask was also set as the reverse background mask. If you want a different background mask for normalization, use 'load_normalization_mask()'." 589 ) 590 self.background_mask = self.load_mask(path) 591 592 def load_normalization_mask_(self, path): 593 r""" 594 Load a binary mask for normalization into the class. 595 596 Parameters 597 ---------- 598 path : str 599 Path to the mask image file. 
Supported formats include 8-bit or 16-bit 600 images (e.g., `.png`, `.jpeg`). The mask must be binary (0/255, 0/2**16-1, 0/1). 601 602 Notes 603 ----- 604 - The mask defines the area of interest. Normalization is applied to the inverse 605 of this area (reversed mask). 606 - Normalization formula applied per pixel: 607 608 .. math:: 609 610 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 611 612 where 613 * ``R_{i,j}`` — normalized pixel intensity 614 * ``T_{i,j}`` — pixel intensity in the target mask 615 * ``μ_B`` — mean intensity of the background (reversed mask) 616 * ``c`` — correction factor 617 """ 618 619 path = os.path.abspath(path) 620 621 self.background_mask = self.load_mask(path) 622 623 def intensity_calculations(self): 624 """ 625 Calculate normalized and raw intensity statistics from the image based on masks. 626 627 This method performs intensity calculations using the main mask (`self.mask`) 628 and the background mask (`self.background_mask`). The pixel intensities within 629 the mask of interest are normalized by subtracting a threshold derived from the 630 background region and applying a correction factor (`self.correction_factor`). 631 Negative values after normalization are clipped to zero. 632 633 The following statistics are computed for both normalized and raw values: 634 - Minimum 635 - Maximum 636 - Mean 637 - Median 638 - Standard deviation 639 - Variance 640 - List of all normalized values (only for normalized data) 641 642 Notes 643 ----- 644 - The method updates the instance attribute `self.normalized_image_values` 645 with a dictionary containing both normalized and raw statistics. 646 - Normalization formula applied for each pixel in the selected mask: 647 final_val = selected_value - (threshold + threshold * correction_factor) 648 where threshold is the mean intensity in the background mask. 649 - Negative values after normalization are set to zero. 
650 """ 651 652 tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask) 653 tmp_bmask = self.ajd_mask_size(image=self.image, mask=self.background_mask) 654 655 selected_values = self.image[tmp_mask == np.max(tmp_mask)] 656 657 threshold = np.mean(self.image[tmp_bmask == np.min(tmp_bmask)]) 658 659 # normalization 660 final_val = selected_values - (threshold + (threshold * self.correction_factor)) 661 662 final_val[final_val < 0] = 0 663 664 tmp_dict = { 665 "norm_min": np.min(final_val), 666 "norm_max": np.max(final_val), 667 "norm_mean": np.mean(final_val), 668 "norm_median": np.median(final_val), 669 "norm_std": np.std(final_val), 670 "norm_var": np.var(final_val), 671 "norm_values": final_val.tolist(), 672 "min": np.min(selected_values), 673 "max": np.max(selected_values), 674 "mean": np.mean(selected_values), 675 "median": np.median(selected_values), 676 "std": np.std(selected_values), 677 "var": np.var(selected_values), 678 } 679 680 self.normalized_image_values = tmp_dict 681 682 def size_calculations(self): 683 """ 684 Calculates the size and bounding dimensions of the masked region in the image. 685 686 This method computes the following metrics based on the current mask: 687 - Total number of pixels in the mask (`px_size`) 688 - Real-world size if a scale is provided (`size`) 689 - Maximum lengths along x and y axes (`max_length_x_axis`, `max_length_y_axis`) 690 691 If `self.scale` is defined (unit per pixel), the real-world size is calculated. 692 If not, `size` will be `None` and a warning message is printed. 
693 694 Returns: 695 Updates the following attributes in the class: 696 - self.size_info (dict) containing: 697 - 'size' (float or None): real-world size of the mask 698 - 'px_size' (int): number of pixels in the masked region 699 - 'max_length_x_axis' (int): length of the bounding box along the x-axis 700 - 'max_length_y_axis' (int): length of the bounding box along the y-axis 701 702 Example: 703 analysis.size_calculations() 704 print(analysis.size_info) 705 """ 706 707 tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask) 708 709 size_px = int(len(tmp_mask[tmp_mask > np.min(tmp_mask)])) 710 711 if self.scale is not None: 712 size = float(size_px * self.scale) 713 else: 714 size = None 715 print( 716 '\nUnable to calculate real size, scale (unit/px) not provided, use "set_scale()" or load JIMG project .pjm metadata "load_pjm()" to set scale for calculations!' 717 ) 718 719 non_zero_indices = np.where(tmp_mask == np.max(tmp_mask)) 720 721 min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0]) 722 min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1]) 723 724 max_length_x_axis = int(max_x - min_x + 1) 725 max_length_y_axis = int(max_y - min_y + 1) 726 727 tmp_val = { 728 "size": size, 729 "px_size": size_px, 730 "max_length_x_axis": max_length_x_axis, 731 "max_length_y_axis": max_length_y_axis, 732 } 733 734 self.size_info = tmp_val 735 736 def run_calculations(self): 737 """ 738 Run the full analysis pipeline on the loaded image using the provided masks. 739 740 Notes 741 ----- 742 - The input image must be loaded via `load_image_()` or `load_image_3D()`. 743 - The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization 744 mask can be loaded via `load_normalization_mask_()`. 745 - Parameters such as projection type and correction factor can be set with 746 `set_projection()` and `set_correction_factor()`. 747 - Scale and stack selection can also influence calculations if defined. 
748 - To view current parameters, use the `current_metadata` property. 749 750 Returns 751 ------- 752 None 753 The results are stored internally and can be retrieved using 754 `get_results()`. 755 """ 756 757 if self.input_image is not None: 758 759 if self.mask is not None: 760 761 print("\nStart...") 762 self.detect_img() 763 self.intensity_calculations() 764 self.size_calculations() 765 print("\nCompleted!") 766 767 def get_results(self): 768 """ 769 Return the results from the analysis performed by `run_calculations()`. 770 771 Returns 772 ------- 773 results_dict : dict or None 774 Dictionary containing intensity and size results. Structure: 775 - 'intensity' : dict with normalized and raw intensity statistics 776 - 'size' : dict with ROI size metrics 777 778 Notes 779 ----- 780 If analysis has not been run yet, prints a message and returns None. 781 """ 782 783 if self.normalized_image_values is not None and self.size_info is not None: 784 785 results = { 786 "intensity": self.normalized_image_values, 787 "size": self.size_info, 788 } 789 790 return results 791 792 else: 793 print('\nAnalysis were not conducted. Run analysis "run_calculations()"') 794 795 def save_results( 796 self, 797 path="", 798 mask_region: str = "", 799 feature_name: str = "", 800 individual_number: int = 0, 801 individual_name: str = "", 802 ): 803 """ 804 Save the analysis results to a `.int` (JSON) file. 805 806 Parameters 807 ---------- 808 path : str, optional 809 Directory path where the file will be saved. Defaults to the current working directory. 810 811 mask_region : str 812 Name or identifier of the mask region (e.g., tissue, part of tissue). 813 814 feature_name : str 815 Name of the feature being analyzed. Underscores or spaces are replaced with periods. 816 817 individual_number : int 818 Unique identifier for the individual in the analysis (e.g., 1, 2, 3). 819 820 individual_name : str 821 Name of the individual (e.g., species name, tissue, organoid). 
822 823 Notes 824 ----- 825 - The method validates that all required parameters are provided and that 826 analysis results exist (`normalized_image_values` and `size_info`). 827 - Creates the directory if it does not exist. 828 - File name format: 829 '<individual_name>_<individual_number>_<mask_region>_<feature_name>.int' 830 831 Raises 832 ------ 833 FileNotFoundError 834 If the specified path cannot be created or accessed. 835 836 ValueError 837 If any of `mask_region`, `feature_name`, `individual_number`, or 838 `individual_name` are missing or invalid. 839 """ 840 841 path = os.path.abspath(path) 842 843 if ( 844 len(mask_region) > 1 845 and len(feature_name) > 1 846 and individual_number != 0 847 and len(individual_name) > 1 848 ): 849 850 if self.normalized_image_values is not None and self.size_info is not None: 851 852 results = { 853 "intensity": self.normalized_image_values, 854 "size": self.size_info, 855 } 856 857 mask_region = re.sub(r"[_\s]+", ".", mask_region) 858 feature_name = re.sub(r"[_\s]+", ".", feature_name) 859 individual_number = re.sub(r"[_\s]+", ".", str(individual_number)) 860 individual_name = re.sub(r"[_\s]+", ".", individual_name) 861 862 full_name = f"{individual_name}_{individual_number}_{mask_region}_{feature_name}" 863 864 isExist = os.path.exists(path) 865 if not isExist: 866 os.makedirs(path, exist_ok=True) 867 868 full_path = os.path.join( 869 path, re.sub("\\.json", "", full_name) + ".int" 870 ) 871 872 with open(full_path, "w") as file: 873 json.dump(results, file, indent=4) 874 875 else: 876 print( 877 '\nAnalysis were not conducted. Run analysis "run_calculations()"' 878 ) 879 880 else: 881 print( 882 "\nAny of 'mask_region', 'feature_name', 'individual_number', 'individual_name' parameters were provided wrong!" 883 ) 884 885 def concatenate_intensity_data(self, directory: str = "", name: str = ""): 886 """ 887 Concatenate intensity data from multiple `.int` files and save as CSV. 
888 889 Parameters 890 ---------- 891 directory : str, optional 892 Path to the directory containing `.int` files. Defaults to the current working directory. 893 894 name : str 895 Prefix for the output CSV file names. CSV files are saved in the format 896 '<name>_<gene>_<region>.csv'. 897 898 Raises 899 ------ 900 FileNotFoundError 901 If the directory cannot be accessed or no `.int` files are found. 902 903 ValueError 904 If an `.int` file is missing expected data or has an incorrect format. 905 906 Notes 907 ----- 908 - The method groups intensity data by gene (feature) and mask region. 909 - Outputs one CSV file per unique gene-region combination, saved in the specified directory. 910 """ 911 912 directory = os.path.abspath(directory) 913 914 files_list = [f for f in os.listdir(directory) if f.endswith(".int")] 915 916 genes_set = set([re.sub("\\.int", "", x.split("_")[3]) for x in files_list]) 917 regions_set = set([re.sub("\\.int", "", x.split("_")[2]) for x in files_list]) 918 919 for g in genes_set: 920 for r in regions_set: 921 json_to_save = { 922 "individual_name": [], 923 "individual_number": [], 924 "norm_intensity": [], 925 "size": [], 926 } 927 928 for f in tqdm(files_list): 929 if g in f and r in f: 930 with open(os.path.join(directory, f), "r") as file: 931 data = json.load(file) 932 933 json_to_save["norm_intensity"] = ( 934 json_to_save["norm_intensity"] 935 + data["intensity"]["norm_values"] 936 ) 937 json_to_save["individual_name"] = json_to_save[ 938 "individual_name" 939 ] + [f.split("_")[0]] * len( 940 data["intensity"]["norm_values"] 941 ) 942 json_to_save["individual_number"] = json_to_save[ 943 "individual_number" 944 ] + [f.split("_")[1]] * len( 945 data["intensity"]["norm_values"] 946 ) 947 json_to_save["size"] = json_to_save["size"] + [ 948 data["size"]["px_size"] 949 ] * len(data["intensity"]["norm_values"]) 950 951 pd.DataFrame(json_to_save).to_csv(f"{name}_{g}_{r}.csv", index=False) 952 953 954class IntensityAnalysis: 955 """ 956 
Class for performing percentile-based statistical analysis on grouped data. 957 958 This class provides methods to calculate percentiles, remove outliers, aggregate 959 data into percentile bins, perform Welch's ANOVA and Chi-squared tests, and 960 visualize results via comparative histograms. It is designed to handle both 961 single-column and multi-column combinations of values for group-based analysis. 962 963 Methods 964 ------- 965 drop_up_df(data, group_col, values_col) 966 Removes upper outliers from a DataFrame based on a grouping column. 967 968 percentiles_calculation(values, sep_perc=1) 969 Calculates percentiles and creates loopable percentile ranges. 970 971 to_percentil(values, percentiles, percentiles_loop) 972 Aggregates statistics based on percentile ranges. 973 974 df_to_percentiles(data, group_col, values_col, sep_perc=1, drop_outlires=True) 975 Computes percentile statistics for grouped DataFrame data. 976 977 round_to_scientific_notation(num) 978 Formats a number in scientific notation or standard format. 979 980 aov_percentiles(data, testes_col, comb="*") 981 Performs Welch's ANOVA on percentile-based group data. 982 983 post_aov_percentiles(data, testes_col, comb="*") 984 Performs Welch's ANOVA with pairwise t-tests. 985 986 chi2_percentiles(input_hist) 987 Performs Chi-squared test on percentile histogram data. 988 989 post_ch2_percentiles(input_hist) 990 Performs Chi-squared test with pairwise comparisons. 991 992 hist_compare_plot(data, queue, tested_value, p_adj=True, txt_size=20) 993 Generates comparative histograms with statistical test results. 994 """ 995 996 def drop_up_df(self, data: pd.DataFrame, group_col: str, values_col: str): 997 """ 998 Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column. 999 1000 Outliers are calculated and removed separately for each group defined by `group_col`. 
1001 The upper outliers are defined using the interquartile range (IQR) method: 1002 values greater than Q3 + 1.5 * IQR are considered outliers. 1003 1004 Parameters 1005 ---------- 1006 data : pd.DataFrame 1007 The input DataFrame containing the data. 1008 1009 group_col : str 1010 The name of the column used for grouping the data. 1011 1012 values_col : str 1013 The column containing the values from which upper outliers will be removed. 1014 1015 Returns 1016 ------- 1017 filtered_data : pd.DataFrame 1018 A filtered DataFrame with the upper outliers removed for each group. 1019 1020 Notes 1021 ----- 1022 - Outliers are removed separately within each group. 1023 - The original DataFrame is not modified; a new filtered DataFrame is returned. 1024 """ 1025 1026 def iqr_filter(group): 1027 q75 = np.quantile(group[values_col], 0.75) 1028 q25 = np.quantile(group[values_col], 0.25) 1029 itq = q75 - q25 1030 return group[group[values_col] <= (q75 + 1.5 * itq)] 1031 1032 filtered_data = data.groupby(group_col).apply(iqr_filter).reset_index(drop=True) 1033 1034 return filtered_data 1035 1036 def percentiles_calculation(self, values, sep_perc: int = 1): 1037 """ 1038 Calculate percentiles for a set of values and generate consecutive percentile ranges. 1039 1040 This function computes percentiles from 0 to 100 at intervals defined by `sep_perc`. 1041 It also generates a list of consecutive percentile ranges that can be used for further analysis or binning. 1042 1043 Parameters 1044 ---------- 1045 values : array-like 1046 The input data values for which the percentiles are calculated. 1047 1048 sep_perc : int, optional 1049 Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%). 1050 1051 Returns 1052 ------- 1053 percentiles : np.ndarray 1054 Array of calculated percentile values. 1055 1056 percentiles_loop : list of tuple 1057 List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)]. 
1058 1059 Notes 1060 ----- 1061 - The first percentile is set to 0 to avoid issues with zero values. 1062 - `percentiles_loop` is useful for iterating through percentile ranges when aggregating statistics. 1063 """ 1064 1065 per_vector = values.copy() 1066 1067 percentiles = np.percentile(per_vector, np.arange(0, 101, sep_perc)) 1068 percentiles[0] = 0 1069 1070 percentiles_loop = [(i, i + 1) for i in range(int(100 / sep_perc))] 1071 1072 return percentiles, percentiles_loop 1073 1074 def to_percentil(self, values, percentiles, percentiles_loop): 1075 """ 1076 Aggregate statistics for a set of values based on percentile ranges. 1077 1078 This function calculates summary statistics for each percentile range defined in `percentiles_loop`, 1079 using the percentile values calculated by `percentiles_calculation()`. Statistics include count, proportion, 1080 mean, median, standard deviation, and variance. 1081 1082 Parameters 1083 ---------- 1084 values : array-like 1085 Input data values for which the statistics are calculated. 1086 1087 percentiles : np.ndarray 1088 Array of percentile values used to define the ranges. 1089 1090 percentiles_loop : list of tuple 1091 List of consecutive percentile ranges, e.g., [(0, 1), (1, 2), ..., (99, 100)]. 1092 1093 Returns 1094 ------- 1095 data : dict 1096 Dictionary containing the following keys: 1097 - 'n' : list 1098 Number of elements in each percentile range. 1099 1100 - 'n_standarized' : list 1101 Proportion of elements in each percentile range relative to the total number of elements. 1102 1103 - 'avg' : list 1104 Mean value of elements within each percentile range. 1105 1106 - 'median' : list 1107 Median value of elements within each percentile range. 1108 1109 - 'std' : list 1110 Standard deviation of elements within each percentile range. 1111 1112 - 'var' : list 1113 Variance of elements within each percentile range. 
1114 1115 Notes 1116 ----- 1117 - If a percentile range contains no elements, statistics are set to 0 and count is set to 1 to avoid empty lists. 1118 """ 1119 1120 per_vector = values.copy() 1121 1122 data = { 1123 "n": [], 1124 "n_standarized": [], 1125 "avg": [], 1126 "median": [], 1127 "std": [], 1128 "var": [], 1129 } 1130 1131 amount = len(per_vector) 1132 1133 for x in percentiles_loop: 1134 if ( 1135 len( 1136 per_vector[ 1137 (per_vector > percentiles[x[0]]) 1138 & (per_vector <= percentiles[x[1]]) 1139 ] 1140 ) 1141 > 0 1142 ): 1143 data["n"].append( 1144 len( 1145 per_vector[ 1146 (per_vector > percentiles[x[0]]) 1147 & (per_vector <= percentiles[x[1]]) 1148 ] 1149 ) 1150 ) 1151 data["n_standarized"].append( 1152 len( 1153 per_vector[ 1154 (per_vector > percentiles[x[0]]) 1155 & (per_vector <= percentiles[x[1]]) 1156 ] 1157 ) 1158 / amount 1159 ) 1160 data["avg"].append( 1161 np.mean( 1162 per_vector[ 1163 (per_vector > percentiles[x[0]]) 1164 & (per_vector <= percentiles[x[1]]) 1165 ] 1166 ) 1167 ) 1168 data["std"].append( 1169 np.std( 1170 per_vector[ 1171 (per_vector > percentiles[x[0]]) 1172 & (per_vector <= percentiles[x[1]]) 1173 ] 1174 ) 1175 ) 1176 data["median"].append( 1177 np.median( 1178 per_vector[ 1179 (per_vector > percentiles[x[0]]) 1180 & (per_vector <= percentiles[x[1]]) 1181 ] 1182 ) 1183 ) 1184 data["var"].append( 1185 np.var( 1186 per_vector[ 1187 (per_vector > percentiles[x[0]]) 1188 & (per_vector <= percentiles[x[1]]) 1189 ] 1190 ) 1191 ) 1192 else: 1193 data["n"].append(1) 1194 data["n_standarized"].append(0) 1195 data["avg"].append(0) 1196 data["std"].append(0) 1197 data["median"].append(0) 1198 data["var"].append(0) 1199 1200 return data 1201 1202 def df_to_percentiles( 1203 self, 1204 data: pd.DataFrame, 1205 group_col: str, 1206 values_col: str, 1207 sep_perc: int = 1, 1208 drop_outlires: bool = True, 1209 ): 1210 """ 1211 Calculate summary statistics based on percentile ranges for each group in a DataFrame. 
1212 1213 This method groups the input DataFrame by `group_col`, computes percentile ranges for each group's values 1214 in `values_col`, and aggregates statistics (count, proportion, mean, median, standard deviation, variance) 1215 for each percentile range. Optionally, upper outliers can be removed before calculation. 1216 1217 Parameters 1218 ---------- 1219 data : pd.DataFrame 1220 Input DataFrame containing the data. 1221 1222 group_col : str 1223 Column name used to define groups. 1224 1225 values_col : str 1226 Column name containing the values for percentile calculations. 1227 1228 sep_perc : int, optional 1229 Separation interval for percentiles (default is 1, meaning percentiles are calculated at every 1%). 1230 1231 drop_outlires : bool, optional 1232 If True, removes upper outliers from the data before performing calculations (default is True). 1233 1234 Returns 1235 ------- 1236 full_data : dict 1237 Dictionary where each key is a group name and each value is a dictionary containing: 1238 - 'n' : list 1239 Number of elements in each percentile range. 1240 1241 - 'n_standarized' : list 1242 Proportion of elements in each percentile range relative to the total number of elements. 1243 1244 - 'avg' : list 1245 Mean value of elements within each percentile range. 1246 1247 - 'median' : list 1248 Median value of elements within each percentile range. 1249 1250 - 'std' : list 1251 Standard deviation of elements within each percentile range. 1252 1253 - 'var' : list 1254 Variance of elements within each percentile range. 1255 1256 Notes 1257 ----- 1258 - Outlier removal uses the IQR method within each group if `drop_outlires` is True. 
1259 """ 1260 1261 full_data = {} 1262 1263 if drop_outlires == True: 1264 data = self.drop_up_df( 1265 data=data, group_col=group_col, values_col=values_col 1266 ) 1267 1268 groups = set(data[group_col]) 1269 1270 percentiles, percentiles_loop = self.percentiles_calculation( 1271 data[values_col], sep_perc=sep_perc 1272 ) 1273 1274 for g in groups: 1275 1276 print(f"Group: {g} ...") 1277 1278 tmp_values = data[values_col][data[group_col] == g] 1279 1280 per_dat = self.to_percentil(tmp_values, percentiles, percentiles_loop) 1281 1282 full_data[g] = per_dat 1283 1284 return full_data 1285 1286 def round_to_scientific_notation(self, num): 1287 """ 1288 Round a number to scientific notation if very small, otherwise to one decimal place. 1289 1290 Parameters 1291 ---------- 1292 num : float 1293 The number to round. 1294 1295 Returns 1296 ------- 1297 str 1298 The rounded number as a string. 1299 - If `num` is 0, returns "0.0". 1300 - If `abs(num) < 1e-4`, returns scientific notation with 1 decimal and 1-digit exponent. 1301 - Otherwise, returns the number rounded to one decimal place. 1302 """ 1303 1304 if num == 0: 1305 return "0.0" 1306 1307 if abs(num) < 0.0001: 1308 rounded_num = np.format_float_scientific(num, precision=1, exp_digits=1) 1309 return rounded_num 1310 else: 1311 return f"{num:.1f}" 1312 1313 def aov_percentiles(self, data, testes_col, comb: str = "*"): 1314 """ 1315 Perform Welch's ANOVA on percentile-based group data. 1316 1317 This method calculates group values by combining the columns specified in `testes_col` 1318 according to the operation defined in `comb`, then performs Welch's ANOVA to test for 1319 differences in means between the groups. Welch's ANOVA is suitable when the groups 1320 have unequal variances. 1321 1322 Parameters 1323 ---------- 1324 data : dict of pd.DataFrame 1325 Dictionary where keys are group names and values are DataFrames containing the data. 
1326 1327 testes_col : str or list of str 1328 Column name(s) from which the group values are derived. If a list is provided, columns 1329 will be combined based on the `comb` operation. 1330 1331 comb : str, optional 1332 Operation used to combine multiple columns if `testes_col` is a list. Options include: 1333 '*' : multiplication 1334 '+' : addition 1335 '**': exponentiation 1336 '-' : subtraction 1337 '/' : division 1338 Default is '*'. 1339 1340 Returns 1341 ------- 1342 F : float 1343 F-statistic from Welch's ANOVA. 1344 1345 p_val : float 1346 Uncorrected p-value from Welch's ANOVA, testing for significant differences between groups. 1347 1348 Notes 1349 ----- 1350 - If `testes_col` is a single string, no combination is performed, and the group values 1351 are taken directly from that column. 1352 - Welch's ANOVA is used as it accounts for unequal variances between groups. 1353 - The `df.melt()` method is used to reshape the data, allowing the ANOVA to be applied to all groups. 1354 1355 Examples 1356 -------- 1357 >>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+') 1358 >>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}") 1359 """ 1360 1361 groups = [] 1362 1363 for d in data.keys(): 1364 1365 if isinstance(testes_col, str): 1366 g = data[d][testes_col] 1367 elif isinstance(testes_col, list): 1368 g = [1] * len(data[d][testes_col[0]]) 1369 for t in testes_col: 1370 if comb == "*": 1371 g = [a * b for a, b in zip(g, data[d][t])] 1372 elif comb == "+": 1373 g = [a + b for a, b in zip(g, data[d][t])] 1374 elif comb == "**": 1375 g = [a**b for a, b in zip(g, data[d][t])] 1376 elif comb == "-": 1377 g = [a - b for a, b in zip(g, data[d][t])] 1378 elif comb == "/": 1379 g = [a / b for a, b in zip(g, data[d][t])] 1380 1381 groups.append(g) 1382 1383 df = pd.DataFrame({f"group_{i}": group for i, group in enumerate(groups)}) 1384 1385 df_melted = df.melt(var_name="group", value_name="value") 1386 1387 
welch_results = pg.welch_anova(data=df_melted, dv="value", between="group") 1388 1389 return welch_results["F"].values[0], welch_results["p-unc"].values[0] 1390 1391 def post_aov_percentiles(self, data, testes_col, comb: str = "*"): 1392 """ 1393 Perform Welch's ANOVA on percentile-based group data and pairwise Welch's t-tests. 1394 1395 This method first performs Welch's ANOVA to assess differences in group means, and 1396 then conducts pairwise Welch's t-tests between all group combinations. P-values are 1397 adjusted using the Bonferroni correction for multiple comparisons. 1398 1399 Parameters 1400 ---------- 1401 data : dict of pd.DataFrame 1402 Dictionary where keys are group names and values are DataFrames containing the data. 1403 1404 testes_col : str or list of str 1405 Column name(s) from which the group values are derived. If a list is provided, 1406 columns will be combined according to the `comb` operation. 1407 1408 comb : str, optional 1409 Operation used to combine multiple columns if `testes_col` is a list. Options include: 1410 '*' : multiplication 1411 '+' : addition 1412 '**': exponentiation 1413 '-' : subtraction 1414 '/' : division 1415 Default is '*'. 1416 1417 Returns 1418 ------- 1419 p_val : float 1420 Uncorrected p-value from the Welch's ANOVA. 
1421 1422 final_results : dict 1423 Dictionary containing results of pairwise Welch's t-tests with keys: 1424 'group1' : list of first group names in each comparison 1425 'group2' : list of second group names in each comparison 1426 'stat' : list of t-statistics for each comparison 1427 'p_val' : list of uncorrected p-values for each comparison 1428 'adj_p_val' : list of Bonferroni-adjusted p-values for multiple comparisons 1429 """ 1430 1431 p_val = self.aov_percentiles(data=data, testes_col=testes_col, comb=comb)[1] 1432 1433 pairs = list(combinations(data, 2)) 1434 final_results = { 1435 "group1": [], 1436 "group2": [], 1437 "stat": [], 1438 "p_val": [], 1439 "adj_p_val": [], 1440 } 1441 1442 for group1, group2 in pairs: 1443 if isinstance(testes_col, str): 1444 g1 = data[group1][testes_col] 1445 elif isinstance(testes_col, list): 1446 g1 = [1] * len(data[group1][testes_col[0]]) 1447 for t in testes_col: 1448 if comb == "*": 1449 g1 = [a * b for a, b in zip(g1, data[group1][t])] 1450 elif comb == "+": 1451 g1 = [a + b for a, b in zip(g1, data[group1][t])] 1452 elif comb == "**": 1453 g1 = [a**b for a, b in zip(g1, data[group1][t])] 1454 elif comb == "-": 1455 g1 = [a - b for a, b in zip(g1, data[group1][t])] 1456 elif comb == "/": 1457 g1 = [a / b for a, b in zip(g1, data[group1][t])] 1458 1459 if isinstance(testes_col, str): 1460 g2 = data[group2][testes_col] 1461 elif isinstance(testes_col, list): 1462 g2 = [1] * len(data[group2][testes_col[0]]) 1463 for t in testes_col: 1464 if comb == "*": 1465 g2 = [a * b for a, b in zip(g2, data[group2][t])] 1466 elif comb == "+": 1467 g2 = [a + b for a, b in zip(g2, data[group2][t])] 1468 elif comb == "**": 1469 g2 = [a**b for a, b in zip(g2, data[group2][t])] 1470 elif comb == "-": 1471 g2 = [a - b for a, b in zip(g2, data[group2][t])] 1472 elif comb == "/": 1473 g2 = [a / b for a, b in zip(g2, data[group2][t])] 1474 1475 stat, p_val = stats.ttest_ind( 1476 g1, g2, alternative="two-sided", equal_var=False 1477 ) 1478 g = 
sorted([group1, group2]) 1479 final_results["group1"].append(g[0]) 1480 final_results["group2"].append(g[1]) 1481 final_results["stat"].append(stat) 1482 final_results["p_val"].append(p_val) 1483 adj = p_val * len(pairs) 1484 if adj > 1: 1485 final_results["adj_p_val"].append(1) 1486 else: 1487 final_results["adj_p_val"].append(adj) 1488 1489 return p_val, final_results 1490 1491 def chi2_percentiles(self, input_hist): 1492 """ 1493 Perform a Chi-squared test on percentile-based group data. 1494 1495 This method reformats the input histogram data into a contingency table and performs 1496 a Chi-squared test to evaluate whether there is a significant association between groups. 1497 1498 Parameters 1499 ---------- 1500 input_hist : dict of pd.DataFrame 1501 Dictionary where keys are group names and values are DataFrames containing histogram data. 1502 The DataFrame must include a column 'n' representing counts for each percentile/bin. 1503 1504 Returns 1505 ------- 1506 chi2_statistic : float 1507 Chi-squared test statistic. 1508 1509 p_value : float 1510 P-value from the Chi-squared test. 1511 1512 dof : int 1513 Degrees of freedom for the test. 1514 1515 expected : np.ndarray 1516 Expected frequencies for each group/bin under the null hypothesis. 1517 1518 chi_data : dict 1519 Formatted data used in the Chi-squared test, with group names as keys and bin counts as values. 
1520 1521 Example 1522 ------- 1523 chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist) 1524 print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}") 1525 """ 1526 1527 chi_data = {} 1528 1529 for d in input_hist.keys(): 1530 tmp_dic = {} 1531 1532 for n, c in enumerate(input_hist[d]["n"]): 1533 tmp_dic[f"p{n+1}"] = c 1534 1535 chi_data[d] = tmp_dic 1536 1537 chi2_statistic, p_value, dof, expected = chi2_contingency( 1538 pd.DataFrame(chi_data).T, correction=True 1539 ) 1540 1541 return chi2_statistic, p_value, dof, expected, chi_data 1542 1543 def post_ch2_percentiles(self, input_hist): 1544 """ 1545 Perform a Chi-squared test on percentile-based group data, including pairwise comparisons. 1546 1547 This method first performs a Chi-squared test across all groups to check for a significant association. 1548 It then performs pairwise Chi-squared tests between groups to identify specific differences. 1549 P-values for multiple comparisons are adjusted using the Bonferroni correction. 1550 1551 Parameters 1552 ---------- 1553 input_hist : dict of pd.DataFrame 1554 Dictionary where keys are group names and values are DataFrames containing histogram data. 1555 Each DataFrame must include a column 'n' with counts for each percentile/bin. 1556 1557 Returns 1558 ------- 1559 p_val : float 1560 Overall p-value from the initial Chi-squared test across all groups. 
1561 1562 final_results : dict 1563 Results of pairwise Chi-squared tests, with keys: 1564 - 'group1' (list): Name of the first group in each comparison 1565 - 'group2' (list): Name of the second group in each comparison 1566 - 'chi2' (list): Chi-squared statistic for each pairwise comparison 1567 - 'p_val' (list): P-value for each pairwise comparison 1568 - 'adj_p_val' (list): Adjusted p-value (Bonferroni correction) for multiple comparisons 1569 1570 Example 1571 ------- 1572 p_val, final_results = self.post_ch2_percentiles(input_hist) 1573 print(f"Overall Chi-squared p-value: {p_val}") 1574 for i in range(len(final_results['group1'])): 1575 print(f"Comparison: {final_results['group1'][i]} vs {final_results['group2'][i]}") 1576 print(f"Chi2 stat: {final_results['chi2'][i]}, p-value: {final_results['p_val'][i]}, adj. p-value: {final_results['adj_p_val'][i]}") 1577 """ 1578 1579 res = self.chi2_percentiles(input_hist) 1580 1581 pairs = list(combinations(res[4], 2)) 1582 results = [] 1583 1584 for group1, group2 in pairs: 1585 table_pair = pd.DataFrame(res[4])[[group1, group2]] 1586 chi2_stat, p_val, _, _ = chi2_contingency(table_pair, correction=True) 1587 results.append((group1, group2, chi2_stat, p_val)) 1588 1589 final_results = { 1590 "group1": [], 1591 "group2": [], 1592 "chi2": [], 1593 "p_val": [], 1594 "adj_p_val": [], 1595 } 1596 1597 for group1, group2, chi2_stat, p_val in results: 1598 g = sorted([group1, group2]) 1599 final_results["group1"].append(g[0]) 1600 final_results["group2"].append(g[1]) 1601 final_results["chi2"].append(chi2_stat) 1602 final_results["p_val"].append(p_val) 1603 adj = p_val * len(results) 1604 if adj > 1: 1605 final_results["adj_p_val"].append(1) 1606 else: 1607 final_results["adj_p_val"].append(adj) 1608 1609 return res[1], final_results 1610 1611 def hist_compare_plot( 1612 self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20 1613 ): 1614 """ 1615 Generate comparative histograms and display results of 
    def hist_compare_plot(
        self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20
    ):
        """
        Generate comparative histograms and display results of statistical
        tests (ANOVA and Chi-squared).

        Box-Cox transforms the tested values, draws a 3-row grid of bar charts
        (standarized frequency, transformed value, and their product) with one
        column per group, and annotates a final column with Welch's ANOVA and
        Chi-squared test results (optionally Bonferroni-adjusted).

        Parameters
        ----------
        data : dict of pd.DataFrame
            Mapping of group name -> histogram data. Each entry must include
            the column given by `tested_value` and 'n_standarized'.

        queue : list of str
            Order of groups to plot. If it does not match `data`'s keys, a
            sorted default order is used instead.

        tested_value : str
            Column name in `data` to test and visualize.

        p_adj : bool, optional
            If True, annotate Bonferroni-adjusted pairwise p-values
            (default True).

        txt_size : int, optional
            Font size for plot annotations (default 20).

        Returns
        -------
        fig : matplotlib.figure.Figure
            Figure with the histograms and statistical test results.
        """

        # Box-Cox transform of the tested value; +1 shift keeps zeros valid
        # (Box-Cox requires strictly positive input).
        # NOTE(review): this writes the transformed values back into the
        # caller's `data` dict -- the input is mutated in place; confirm
        # callers expect this side effect.
        for i in data.keys():
            values = np.array(data[i][tested_value])
            values += 1
            transformed_values, fitted_lambda = stats.boxcox(values)
            data[i][tested_value] = transformed_values.tolist()

        # Fall back to a sorted group order when `queue` does not match the data.
        if sorted(queue) != sorted(data.keys()):
            print(
                "\n Wrong queue provided! The queue will be sorted with default settings!"
            )
            queue = sorted(data.keys())

        # parametric selected value
        pk, dfk = self.post_aov_percentiles(data, testes_col=tested_value)

        dfk = pd.DataFrame(dfk)

        # Order the pairwise rows to match the plotting order in `queue`.
        dfk = dfk.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

        # parametric standarized selected value
        pkc, dfkc = self.post_aov_percentiles(
            data, testes_col=[tested_value, "n_standarized"], comb="*"
        )

        dfkc = pd.DataFrame(dfkc)

        dfkc = dfkc.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

        # chi2
        pchi, dfchi = self.post_ch2_percentiles(data)

        dfchi = pd.DataFrame(dfchi)

        dfchi = dfchi.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

        ##############################################################################

        # Global min/max across groups so every column shares the same y-limits.
        standarized_max, standarized_min, value_max, value_min = [], [], [], []
        for d in queue:
            standarized_max.append(max(data[d]["n_standarized"]))
            standarized_min.append(min(data[d]["n_standarized"]))
            value_max.append(max(data[d][tested_value]))
            value_min.append(min(data[d][tested_value]))

        # One column per group plus a narrower trailing column for test summaries.
        num_columns = len(queue) + 1

        fig, axs = plt.subplots(
            3,
            num_columns,
            figsize=(8 * num_columns, 10),
            gridspec_kw={"width_ratios": [1] * len(queue) + [0.5], "wspace": 0.05},
        )

        for i, d in enumerate(queue):
            tmp_data = data[d]

            # Row 0: standarized frequency per percentile bin.
            axs[0, i].bar(
                [str(n) for n in range(len(tmp_data["n_standarized"]))],
                tmp_data["n_standarized"],
                width=0.95,
                color="gold",
            )
            axs[0, i].set_ylim(
                min(standarized_min) * 0.9995, max(standarized_max) * 1.0005
            )

            # Only the first column keeps its y-axis label/ticks.
            if i == 0:
                axs[0, i].set_ylabel("Standarized\nfrequency", fontsize=txt_size)
            else:
                axs[0, i].set_yticks([])

            axs[0, i].set_xticks([])
            axs[0, i].tick_params(axis="y", labelsize=txt_size * 0.7)

            # Row 1: Box-Cox-transformed tested value per percentile bin,
            # with a dashed line at the mean.
            axs[1, i].bar(
                [str(n) for n in range(len(tmp_data[tested_value]))],
                tmp_data[tested_value],
                width=0.95,
                color="orange",
            )

            mean_value = np.mean(tmp_data[tested_value])
            axs[1, i].axhline(y=mean_value, color="red", linestyle="--")

            # axs[1, i].set_ylim(min(value_min) - 1, max(value_max) + 1)
            axs[1, i].set_ylim(min(value_min) * 0.9995, max(value_max) * 1.0005)

            if i == 0:
                axs[1, i].set_ylabel(f"Normalized\n{tested_value}", fontsize=txt_size)
            else:
                axs[1, i].set_yticks([])

            axs[1, i].set_xticks([])
            axs[1, i].tick_params(axis="y", labelsize=txt_size * 0.7)

            # Row 2: tested value weighted by standarized frequency.
            axs[2, i].bar(
                [str(n) for n in range(len(tmp_data["n_standarized"]))],
                [
                    a * b
                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
                ],
                width=0.95,
                color="goldenrod",
            )

            mean_value = np.mean(
                [
                    a * b
                    for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
                ]
            )
            axs[2, i].axhline(y=mean_value, color="red", linestyle="--")

            axs[2, i].set_ylim(
                (min(standarized_min) * min(value_min)) * 0.9995,
                (max(standarized_max) * max(value_max) * 1.0005),
            )
            axs[2, i].set_xlabel(d, fontsize=txt_size)

            if i == 0:
                axs[2, i].set_ylabel(
                    f"Standarized\nnorm_{tested_value}", fontsize=txt_size
                )
            else:
                axs[2, i].set_yticks([])

            axs[2, i].set_xticks([])
            axs[2, i].tick_params(axis="y", labelsize=txt_size * 0.7)

        # Significance stars for the ANOVA on the raw tested value.
        # NOTE(review): the p-value is string-rounded *before* thresholding,
        # so borderline values compare via their rounded form -- confirm intended.
        sign = "ns"
        if float(self.round_to_scientific_notation(pk)) < 0.001:
            sign = "***"
        elif float(self.round_to_scientific_notation(pk)) < 0.01:
            sign = "**"
        elif float(self.round_to_scientific_notation(pk)) < 0.05:
            sign = "*"

        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pk)} - {sign}\n"

        # Append one line per pairwise comparison (adjusted or raw p-values).
        if p_adj == True:
            for i in range(len(dfk["group1"])):
                sign = "ns"
                if dfk["adj_p_val"][i] < 0.001:
                    sign = "***"
                elif dfk["adj_p_val"][i] < 0.01:
                    sign = "**"
                elif dfk["adj_p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['adj_p_val'][i])} - {sign}\n"
        else:
            for i in range(len(dfk["group1"])):
                sign = "ns"
                if dfk["p_val"][i] < 0.001:
                    sign = "***"
                elif dfk["p_val"][i] < 0.01:
                    sign = "**"
                elif dfk["p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['p_val'][i])} - {sign}\n"

        # Summary column for row 1 (raw tested value ANOVA).
        axs[1, -1].text(
            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
        )
        axs[1, -1].set_axis_off()

        # Same annotation for the standarized-weighted ANOVA (row 2).
        sign = "ns"
        if float(self.round_to_scientific_notation(pkc)) < 0.001:
            sign = "***"
        elif float(self.round_to_scientific_notation(pkc)) < 0.01:
            sign = "**"
        elif float(self.round_to_scientific_notation(pkc)) < 0.05:
            sign = "*"

        text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pkc)} - {sign}\n"

        if p_adj == True:
            for i in range(len(dfkc["group1"])):
                sign = "ns"
                if dfkc["adj_p_val"][i] < 0.001:
                    sign = "***"
                elif dfkc["adj_p_val"][i] < 0.01:
                    sign = "**"
                elif dfkc["adj_p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['adj_p_val'][i])} - {sign}\n"
        else:
            for i in range(len(dfkc["group1"])):
                sign = "ns"
                if dfkc["p_val"][i] < 0.001:
                    sign = "***"
                elif dfkc["p_val"][i] < 0.01:
                    sign = "**"
                elif dfkc["p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['p_val'][i])} - {sign}\n"

        axs[2, -1].text(
            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
        )
        axs[2, -1].set_axis_off()

        # Chi-squared annotation for the frequency row (row 0).
        sign = "ns"
        if float(self.round_to_scientific_notation(pchi)) < 0.001:
            sign = "***"
        elif float(self.round_to_scientific_notation(pchi)) < 0.01:
            sign = "**"
        elif float(self.round_to_scientific_notation(pchi)) < 0.05:
            sign = "*"

        text = f"Test Chi-squared\np-value: {self.round_to_scientific_notation(pchi)} - {sign}\n"

        if p_adj == True:
            for i in range(len(dfchi["group1"])):
                sign = "ns"
                if dfchi["adj_p_val"][i] < 0.001:
                    sign = "***"
                elif dfchi["adj_p_val"][i] < 0.01:
                    sign = "**"
                elif dfchi["adj_p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['adj_p_val'][i])} - {sign}\n"
        else:
            for i in range(len(dfchi["group1"])):
                sign = "ns"
                if dfchi["p_val"][i] < 0.001:
                    sign = "***"
                elif dfchi["p_val"][i] < 0.01:
                    sign = "**"
                elif dfchi["p_val"][i] < 0.05:
                    sign = "*"

                text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['p_val'][i])} - {sign}\n"

        axs[0, -1].text(
            0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
        )
        axs[0, -1].set_axis_off()

        plt.tight_layout()

        # Only open an interactive window when the package is configured for display.
        if cfg._DISPLAY_MODE:
            plt.show()

        return fig
23class FeatureIntensity(ImageTools): 24 r""" 25 Class for quantitative analysis of pixel intensity and size measurements 26 in 2D/3D biological images. Supports projection of 3D stacks, mask-based 27 intensity normalization, region size estimation and metadata extraction. 28 29 Parameters 30 ---------- 31 input_image : ndarray, optional 32 Input image or 3D stack for analysis. If 3D, projection must be applied. 33 34 image : ndarray, optional 35 2D projected image (internal use). 36 37 normalized_image_values : dict, optional 38 Dictionary storing normalized intensity statistics. 39 40 mask : ndarray, optional 41 Binary mask of region of interest (ROI). 42 43 background_mask : ndarray, optional 44 Binary mask used for background estimation. If not provided, `mask` is used. 45 46 typ : {"avg", "median", "std", "var", "max", "min"}, optional 47 Projection type for 3D stacks. Default is `"avg"`. 48 49 size_info : dict, optional 50 Dictionary storing ROI size measurements. 51 52 correction_factor : float, optional 53 Normalization correction factor applied to background intensity. 54 Must satisfy 0 < factor < 1. Default is 0.1. 55 56 img_type : str, optional 57 Image type metadata. 58 59 scale : float, optional 60 Pixel resolution in physical units (e.g. µm/px). Used in size calculations. 61 62 stack_selection : list of int, optional 63 List of Z-indices to remove when projecting a 3D image. 64 65 Attributes 66 ---------- 67 input_image : ndarray or None 68 Loaded input image. 69 70 image : ndarray or None 71 Projected 2D image. 72 73 mask : ndarray or None 74 Region of interest mask. 75 76 background_mask : ndarray or None 77 Background normalization mask. 78 79 scale : float or None 80 Scale value for size estimation. 81 82 normalized_image_values : dict or None 83 Dictionary containing intensity metrics. 84 85 size_info : dict or None 86 Dictionary with ROI size information. 87 88 typ : str 89 Selected projection type for 3D images. 
90 91 stack_selection : list of int 92 Z-levels excluded from projection. 93 94 Notes 95 ----- 96 The intensity normalization formula applied per pixel is: 97 98 .. math:: 99 100 R_{i,j} = T_{i,j} - \\left( \\mu_B (1 + c) \\right) 101 102 where 103 * ``T_{i,j}`` – pixel intensity in ROI 104 * ``μ_B`` – mean intensity in background region 105 * ``c`` – correction factor 106 * ``R_{i,j}`` – normalized pixel intensity 107 108 Examples 109 -------- 110 Load a 3D image, mask and compute statistics: 111 112 >>> fi = FeatureIntensity() 113 >>> fi.load_image_3D("stack.tiff") 114 >>> fi.load_mask_("mask.png") 115 >>> fi.set_projection("median") 116 >>> fi.run_calculations() 117 >>> results = fi.get_results() 118 >>> results["intensity"]["norm_mean"] 119 """ 120 121 def __init__( 122 self, 123 input_image=None, 124 image=None, 125 normalized_image_values=None, 126 mask=None, 127 background_mask=None, 128 typ=None, 129 size_info=None, 130 correction_factor=None, 131 img_type=None, 132 scale=None, 133 stack_selection=None, 134 ): 135 """ 136 Initialize a FeatureIntensity analysis instance. 137 138 Parameters 139 ---------- 140 input_image : ndarray, optional 141 Input image or 3D stack used for analysis. If the image is 3D, a 142 projection will be computed depending on the `typ` parameter. 143 144 image : ndarray, optional 145 2D image buffer used internally after projection of the input image. 146 Should not be set manually. 147 148 normalized_image_values : dict, optional 149 Dictionary containing normalized intensity statistics. Usually filled 150 automatically after running `run_calculations()`. 151 152 mask : ndarray, optional 153 Binary mask of the target region of interest (ROI). Required for 154 intensity and size calculations. 155 156 background_mask : ndarray, optional 157 Binary mask specifying the background region used to compute the 158 normalization threshold. If not provided, the ROI mask is also used 159 as the background reference. 
160 161 typ : {"avg", "median", "std", "var", "max", "min"}, optional 162 Projection method for 3D images. Determines how the z-stack is 163 collapsed into a 2D image. Default is `"avg"`. 164 165 size_info : dict, optional 166 Dictionary storing computed size metrics of the ROI. Populated after 167 invoking `size_calculations()`. 168 169 correction_factor : float, optional 170 Correction term used during intensity normalization. Must satisfy 171 0 < correction_factor < 1. Default is 0.1. 172 173 img_type : str, optional 174 Optional metadata about the image type (e.g., "tiff", "png"). 175 176 scale : float, optional 177 Pixel resolution in physical units (e.g., µm/px). Required for 178 real-size estimation in `size_calculations()`. 179 180 stack_selection : list of int, optional 181 Indices of z-planes to exclude during projection of a 3D stack. 182 183 Notes 184 ----- 185 Values not provided are initialized to `None`, except for `typ`, which 186 defaults to `"avg"`, and `correction_factor`, which defaults to 0.1. 187 188 The class is designed to be populated by loading functions: 189 `load_image_()`, `load_image_3D()`, `load_mask_()`, 190 and optionally `load_normalization_mask_()` and `load_JIMG_project_()`. 191 """ 192 193 self.input_image = input_image or None 194 """ Input image or 3D stack used for analysis. If the image is 3D, a 195 projection will be computed depending on the `typ` parameter.""" 196 197 self.image = image or None 198 """ 2D image buffer used internally after projection of the input image. 199 Should not be set manually.""" 200 201 self.normalized_image_values = normalized_image_values or None 202 """Dictionary containing normalized intensity statistics. Usually filled 203 automatically after running `run_calculations()`.""" 204 205 self.mask = mask or None 206 """Binary mask of the target region of interest (ROI). 
Required for 207 intensity and size calculations.""" 208 209 self.background_mask = background_mask or None 210 """ Binary mask specifying the background region used to compute the 211 normalization threshold. If not provided, the ROI mask is also used 212 as the background reference.""" 213 214 self.typ = typ or "avg" 215 """Projection method for 3D images. Determines how the z-stack is 216 collapsed into a 2D image. Default is `"avg"`.""" 217 218 self.size_info = size_info or None 219 """Dictionary storing computed size metrics of the ROI. Populated after 220 invoking `size_calculations()`.""" 221 222 self.correction_factor = correction_factor or 0.1 223 """ Correction term used during intensity normalization. Must satisfy 224 0 < correction_factor < 1. Default is 0.1.""" 225 226 self.scale = scale or None 227 """ Pixel resolution in physical units (e.g., µm/px). Required for 228 real-size estimation in `size_calculations()`.""" 229 230 self.stack_selection = stack_selection or [] 231 """Indices of z-planes to exclude during projection of a 3D stack.""" 232 233 @property 234 def current_metadata(self): 235 r""" 236 Return current metadata parameters used in image processing and normalization. 237 238 Returns 239 ------- 240 tuple 241 A tuple containing: 242 243 projection_type : str 244 Projection method used for 3D image reduction (e.g., "avg", "median"). 245 246 correction_factor : float 247 Correction factor used for background subtraction during intensity 248 normalization. The applied formula is: 249 250 .. math:: 251 252 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 253 254 where 255 * ``R_{i,j}`` — normalized pixel intensity 256 * ``T_{i,j}`` — original pixel intensity 257 * ``μ_B`` — mean background intensity 258 * ``c`` — correction factor 259 scale : float or None 260 Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually 261 using `set_scale()`. 
262 263 stack_selection : list of int 264 Indices of z-slices excluded from projection of a 3D image. 265 266 Notes 267 ----- 268 This property also prints the metadata values to the console for quick inspection. 269 """ 270 271 print(f"Projection type: {self.typ}") 272 print(f"Correction factor: {self.correction_factor}") 273 print(f"Scale (unit/px): {self.scale}") 274 print(f"Selected stac to remove: {self.stack_selection}") 275 276 return self.typ, self.correction_factor, self.scale, self.stack_selection 277 278 def set_projection(self, projection: str): 279 """ 280 Set the projection method for 3D image stack reduction. 281 282 Parameters 283 ---------- 284 projection : {"avg", "median", "std", "var", "max", "min"} 285 Projection method to reduce a 3D image stack to a 2D image. Default is `"avg"`. 286 287 Notes 288 ----- 289 This method updates the `typ` attribute of the class. The selected projection 290 determines how the z-stack is collapsed: 291 - `"avg"` : average intensity across slices 292 - `"median"` : median intensity across slices 293 - `"std"` : standard deviation across slices 294 - `"var"` : variance across slices 295 - `"max"` : maximum intensity across slices 296 - `"min"` : minimum intensity across slices 297 """ 298 299 t = ["avg", "median", "std", "var", "max", "min"] 300 if projection in t: 301 self.typ = projection 302 else: 303 print(f"\nProvided parameter is incorrect. Avaiable projection types: {t}") 304 305 def set_correction_factorn(self, factor: float): 306 r""" 307 Set the correction factor for background subtraction during intensity normalization. 308 309 Parameters 310 ---------- 311 factor : float 312 Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1. 313 Default is 0.1. 314 315 Notes 316 ----- 317 The correction is applied per pixel in the target mask using the formula: 318 319 .. 
math:: 320 321 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 322 323 where 324 * ``R_{i,j}`` — normalized pixel intensity 325 * ``T_{i,j}`` — original pixel intensity 326 * ``μ_B`` — mean intensity in the background mask 327 * ``c`` — correction factor 328 """ 329 330 if factor < 1 and factor > 0: 331 self.correction_factor = factor 332 else: 333 print( 334 "\nProvided parameter is incorrect. The factor should be a floating-point value within the range of 0 to 1." 335 ) 336 337 def set_scale(self, scale): 338 """ 339 Set the scale for converting pixel measurements to physical units. 340 341 Parameters 342 ---------- 343 scale : float 344 Pixel resolution in physical units (e.g., µm/px). Used to calculate the 345 actual size of the tissue or organ. 346 347 Notes 348 ----- 349 The scale can also be automatically loaded from a JIMG project using 350 `load_JIMG_project_()`. This value is required for size calculations in 351 `size_calculations()`. 352 """ 353 354 self.scale = scale 355 356 def set_selection_list(self, rm_list: list): 357 """ 358 Set the list of z-slices to exclude when projecting a 3D image stack. 359 360 Parameters 361 ---------- 362 rm_list : list of int 363 List of indices corresponding to z-slices that should be removed from 364 the full 3D image stack before projection. 365 366 Notes 367 ----- 368 This updates the `stack_selection` attribute, which is used by the 369 `stack_selection_()` method during projection. 370 """ 371 372 self.stack_selection = rm_list 373 374 def load_JIMG_project_(self, path): 375 """ 376 Load a JIMG project from a `.pjm` file. 377 378 Parameters 379 ---------- 380 file_path : str 381 Path to the JIMG project file. The file must have a `.pjm` extension. 382 383 Returns 384 ------- 385 project : object 386 Loaded project object containing images and metadata. 387 388 Raises 389 ------ 390 ValueError 391 If the provided file path does not point to a `.pjm` file. 
392 393 Notes 394 ----- 395 The method attempts to automatically set the `scale` and `stack_selection` 396 attributes from the project metadata if available. 397 """ 398 399 path = os.path.abspath(path) 400 401 if ".pjm" in path: 402 metadata = self.load_JIMG_project(path) 403 404 try: 405 self.scale = metadata.metadata["X_resolution[um/px]"] 406 except: 407 408 try: 409 self.scale = metadata.images_dict["metadata"][0][ 410 "X_resolution[um/px]" 411 ] 412 413 except: 414 print( 415 '\nUnable to set scale on this project! Set scale using "set_scale()"' 416 ) 417 418 self.stack_selection = metadata.removal_list 419 420 else: 421 print( 422 "\nWrong path. The provided path does not point to a JIMG project (*.pjm)." 423 ) 424 425 def stack_selection_(self): 426 """ 427 Remove selected z-slices from a 3D image stack based on `stack_selection`. 428 429 Notes 430 ----- 431 Only works if `input_image` is a 3D ndarray. The slices with indices listed 432 in `stack_selection` are excluded from the stack. Updates `input_image` 433 in-place. 434 435 Prints a warning if `stack_selection` is empty. 436 """ 437 438 if len(self.input_image.shape) == 3: 439 if len(self.stack_selection) > 0: 440 self.input_image = self.input_image[ 441 [ 442 x 443 for x in range(self.input_image.shape[0]) 444 if x not in self.stack_selection 445 ] 446 ] 447 else: 448 print("\nImages to remove from the stack were not selected!") 449 450 def projection(self): 451 """ 452 Project a 3D image stack into a 2D image using the method defined by `typ`. 453 454 Notes 455 ----- 456 Updates the `image` attribute with the projected 2D result. 
457 458 Supported projection types (`typ`): 459 - "avg" : mean intensity across slices 460 - "median" : median intensity across slices 461 - "std" : standard deviation across slices 462 - "var" : variance across slices 463 - "max" : maximum intensity across slices 464 - "min" : minimum intensity across slices 465 466 Raises 467 ------ 468 AttributeError 469 If `input_image` is not defined. 470 """ 471 472 if self.typ == "avg": 473 img = np.mean(self.input_image, axis=0) 474 475 elif self.typ == "std": 476 img = np.std(self.input_image, axis=0) 477 478 elif self.typ == "median": 479 img = np.median(self.input_image, axis=0) 480 481 elif self.typ == "var": 482 img = np.var(self.input_image, axis=0) 483 484 elif self.typ == "max": 485 img = np.max(self.input_image, axis=0) 486 487 elif self.typ == "min": 488 img = np.min(self.input_image, axis=0) 489 490 self.image = img 491 492 def detect_img(self): 493 """ 494 Detect whether the input image is 2D or 3D and perform appropriate preprocessing. 495 496 Notes 497 ----- 498 - For 3D images, applies `stack_selection_()` and then `projection()`. 499 - For 2D images, no projection is applied. 500 - Prints status messages indicating the type of image and applied operations. 501 502 Raises 503 ------ 504 AttributeError 505 If `input_image` is not defined. 506 """ 507 check = len(self.input_image.shape) 508 509 if check == 3: 510 print("\n3D image detected! Starting processing for 3D image...") 511 print(f"Projection - {self.typ}...") 512 513 self.stack_selection_() 514 self.projection() 515 516 elif check == 2: 517 print("\n2D image detected! Starting processing for 2D image...") 518 519 else: 520 print("\nData does not match any image type!") 521 522 def load_image_3D(self, path): 523 """ 524 Load a 3D image stack from a TIFF file. 525 526 Parameters 527 ---------- 528 path : str 529 Path to the 3D image file (*.tiff) to be loaded. 
530 531 Notes 532 ----- 533 The loaded image is stored in the `input_image` attribute as a 3D ndarray. 534 """ 535 path = os.path.abspath(path) 536 537 self.input_image = self.load_3D_tiff(path) 538 539 def load_image_(self, path): 540 """ 541 Load a 2D image into the class. 542 543 Parameters 544 ---------- 545 path : str 546 Path to the image file to be loaded. 547 548 Notes 549 ----- 550 The loaded image is stored in the `input_image` attribute as a 2D ndarray. 551 """ 552 path = os.path.abspath(path) 553 554 self.input_image = self.load_image(path) 555 556 def load_mask_(self, path): 557 r""" 558 Load a binary mask into the class and optionally set it as the normalization mask. 559 560 Parameters 561 ---------- 562 path : str 563 Path to the mask image file. Supported formats include 8-bit or 16-bit images 564 with extensions such as `.png` or `.jpeg`. The mask must be binary 565 (e.g., 0/255, 0/2**16-1, 0/1). 566 567 Notes 568 ----- 569 - If `load_normalization_mask_()` is not called, this mask is also used as the 570 background mask for intensity normalization. 571 - Normalization is applied per pixel using the formula: 572 573 .. math:: 574 575 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 576 577 where 578 * ``R_{i,j}`` — normalized pixel intensity 579 * ``T_{i,j}`` — pixel intensity in the target mask 580 * ``μ_B`` — mean intensity of the background (reversed mask) 581 * ``c`` — correction factor 582 """ 583 584 path = os.path.abspath(path) 585 586 self.mask = self.load_mask(path) 587 588 print( 589 "\nThis mask was also set as the reverse background mask. If you want a different background mask for normalization, use 'load_normalization_mask()'." 590 ) 591 self.background_mask = self.load_mask(path) 592 593 def load_normalization_mask_(self, path): 594 r""" 595 Load a binary mask for normalization into the class. 596 597 Parameters 598 ---------- 599 path : str 600 Path to the mask image file. 
Supported formats include 8-bit or 16-bit 601 images (e.g., `.png`, `.jpeg`). The mask must be binary (0/255, 0/2**16-1, 0/1). 602 603 Notes 604 ----- 605 - The mask defines the area of interest. Normalization is applied to the inverse 606 of this area (reversed mask). 607 - Normalization formula applied per pixel: 608 609 .. math:: 610 611 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 612 613 where 614 * ``R_{i,j}`` — normalized pixel intensity 615 * ``T_{i,j}`` — pixel intensity in the target mask 616 * ``μ_B`` — mean intensity of the background (reversed mask) 617 * ``c`` — correction factor 618 """ 619 620 path = os.path.abspath(path) 621 622 self.background_mask = self.load_mask(path) 623 624 def intensity_calculations(self): 625 """ 626 Calculate normalized and raw intensity statistics from the image based on masks. 627 628 This method performs intensity calculations using the main mask (`self.mask`) 629 and the background mask (`self.background_mask`). The pixel intensities within 630 the mask of interest are normalized by subtracting a threshold derived from the 631 background region and applying a correction factor (`self.correction_factor`). 632 Negative values after normalization are clipped to zero. 633 634 The following statistics are computed for both normalized and raw values: 635 - Minimum 636 - Maximum 637 - Mean 638 - Median 639 - Standard deviation 640 - Variance 641 - List of all normalized values (only for normalized data) 642 643 Notes 644 ----- 645 - The method updates the instance attribute `self.normalized_image_values` 646 with a dictionary containing both normalized and raw statistics. 647 - Normalization formula applied for each pixel in the selected mask: 648 final_val = selected_value - (threshold + threshold * correction_factor) 649 where threshold is the mean intensity in the background mask. 650 - Negative values after normalization are set to zero. 
651 """ 652 653 tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask) 654 tmp_bmask = self.ajd_mask_size(image=self.image, mask=self.background_mask) 655 656 selected_values = self.image[tmp_mask == np.max(tmp_mask)] 657 658 threshold = np.mean(self.image[tmp_bmask == np.min(tmp_bmask)]) 659 660 # normalization 661 final_val = selected_values - (threshold + (threshold * self.correction_factor)) 662 663 final_val[final_val < 0] = 0 664 665 tmp_dict = { 666 "norm_min": np.min(final_val), 667 "norm_max": np.max(final_val), 668 "norm_mean": np.mean(final_val), 669 "norm_median": np.median(final_val), 670 "norm_std": np.std(final_val), 671 "norm_var": np.var(final_val), 672 "norm_values": final_val.tolist(), 673 "min": np.min(selected_values), 674 "max": np.max(selected_values), 675 "mean": np.mean(selected_values), 676 "median": np.median(selected_values), 677 "std": np.std(selected_values), 678 "var": np.var(selected_values), 679 } 680 681 self.normalized_image_values = tmp_dict 682 683 def size_calculations(self): 684 """ 685 Calculates the size and bounding dimensions of the masked region in the image. 686 687 This method computes the following metrics based on the current mask: 688 - Total number of pixels in the mask (`px_size`) 689 - Real-world size if a scale is provided (`size`) 690 - Maximum lengths along x and y axes (`max_length_x_axis`, `max_length_y_axis`) 691 692 If `self.scale` is defined (unit per pixel), the real-world size is calculated. 693 If not, `size` will be `None` and a warning message is printed. 
694 695 Returns: 696 Updates the following attributes in the class: 697 - self.size_info (dict) containing: 698 - 'size' (float or None): real-world size of the mask 699 - 'px_size' (int): number of pixels in the masked region 700 - 'max_length_x_axis' (int): length of the bounding box along the x-axis 701 - 'max_length_y_axis' (int): length of the bounding box along the y-axis 702 703 Example: 704 analysis.size_calculations() 705 print(analysis.size_info) 706 """ 707 708 tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask) 709 710 size_px = int(len(tmp_mask[tmp_mask > np.min(tmp_mask)])) 711 712 if self.scale is not None: 713 size = float(size_px * self.scale) 714 else: 715 size = None 716 print( 717 '\nUnable to calculate real size, scale (unit/px) not provided, use "set_scale()" or load JIMG project .pjm metadata "load_pjm()" to set scale for calculations!' 718 ) 719 720 non_zero_indices = np.where(tmp_mask == np.max(tmp_mask)) 721 722 min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0]) 723 min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1]) 724 725 max_length_x_axis = int(max_x - min_x + 1) 726 max_length_y_axis = int(max_y - min_y + 1) 727 728 tmp_val = { 729 "size": size, 730 "px_size": size_px, 731 "max_length_x_axis": max_length_x_axis, 732 "max_length_y_axis": max_length_y_axis, 733 } 734 735 self.size_info = tmp_val 736 737 def run_calculations(self): 738 """ 739 Run the full analysis pipeline on the loaded image using the provided masks. 740 741 Notes 742 ----- 743 - The input image must be loaded via `load_image_()` or `load_image_3D()`. 744 - The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization 745 mask can be loaded via `load_normalization_mask_()`. 746 - Parameters such as projection type and correction factor can be set with 747 `set_projection()` and `set_correction_factor()`. 748 - Scale and stack selection can also influence calculations if defined. 
749 - To view current parameters, use the `current_metadata` property. 750 751 Returns 752 ------- 753 None 754 The results are stored internally and can be retrieved using 755 `get_results()`. 756 """ 757 758 if self.input_image is not None: 759 760 if self.mask is not None: 761 762 print("\nStart...") 763 self.detect_img() 764 self.intensity_calculations() 765 self.size_calculations() 766 print("\nCompleted!") 767 768 def get_results(self): 769 """ 770 Return the results from the analysis performed by `run_calculations()`. 771 772 Returns 773 ------- 774 results_dict : dict or None 775 Dictionary containing intensity and size results. Structure: 776 - 'intensity' : dict with normalized and raw intensity statistics 777 - 'size' : dict with ROI size metrics 778 779 Notes 780 ----- 781 If analysis has not been run yet, prints a message and returns None. 782 """ 783 784 if self.normalized_image_values is not None and self.size_info is not None: 785 786 results = { 787 "intensity": self.normalized_image_values, 788 "size": self.size_info, 789 } 790 791 return results 792 793 else: 794 print('\nAnalysis were not conducted. Run analysis "run_calculations()"') 795 796 def save_results( 797 self, 798 path="", 799 mask_region: str = "", 800 feature_name: str = "", 801 individual_number: int = 0, 802 individual_name: str = "", 803 ): 804 """ 805 Save the analysis results to a `.int` (JSON) file. 806 807 Parameters 808 ---------- 809 path : str, optional 810 Directory path where the file will be saved. Defaults to the current working directory. 811 812 mask_region : str 813 Name or identifier of the mask region (e.g., tissue, part of tissue). 814 815 feature_name : str 816 Name of the feature being analyzed. Underscores or spaces are replaced with periods. 817 818 individual_number : int 819 Unique identifier for the individual in the analysis (e.g., 1, 2, 3). 820 821 individual_name : str 822 Name of the individual (e.g., species name, tissue, organoid). 
823 824 Notes 825 ----- 826 - The method validates that all required parameters are provided and that 827 analysis results exist (`normalized_image_values` and `size_info`). 828 - Creates the directory if it does not exist. 829 - File name format: 830 '<individual_name>_<individual_number>_<mask_region>_<feature_name>.int' 831 832 Raises 833 ------ 834 FileNotFoundError 835 If the specified path cannot be created or accessed. 836 837 ValueError 838 If any of `mask_region`, `feature_name`, `individual_number`, or 839 `individual_name` are missing or invalid. 840 """ 841 842 path = os.path.abspath(path) 843 844 if ( 845 len(mask_region) > 1 846 and len(feature_name) > 1 847 and individual_number != 0 848 and len(individual_name) > 1 849 ): 850 851 if self.normalized_image_values is not None and self.size_info is not None: 852 853 results = { 854 "intensity": self.normalized_image_values, 855 "size": self.size_info, 856 } 857 858 mask_region = re.sub(r"[_\s]+", ".", mask_region) 859 feature_name = re.sub(r"[_\s]+", ".", feature_name) 860 individual_number = re.sub(r"[_\s]+", ".", str(individual_number)) 861 individual_name = re.sub(r"[_\s]+", ".", individual_name) 862 863 full_name = f"{individual_name}_{individual_number}_{mask_region}_{feature_name}" 864 865 isExist = os.path.exists(path) 866 if not isExist: 867 os.makedirs(path, exist_ok=True) 868 869 full_path = os.path.join( 870 path, re.sub("\\.json", "", full_name) + ".int" 871 ) 872 873 with open(full_path, "w") as file: 874 json.dump(results, file, indent=4) 875 876 else: 877 print( 878 '\nAnalysis were not conducted. Run analysis "run_calculations()"' 879 ) 880 881 else: 882 print( 883 "\nAny of 'mask_region', 'feature_name', 'individual_number', 'individual_name' parameters were provided wrong!" 884 ) 885 886 def concatenate_intensity_data(self, directory: str = "", name: str = ""): 887 """ 888 Concatenate intensity data from multiple `.int` files and save as CSV. 
889 890 Parameters 891 ---------- 892 directory : str, optional 893 Path to the directory containing `.int` files. Defaults to the current working directory. 894 895 name : str 896 Prefix for the output CSV file names. CSV files are saved in the format 897 '<name>_<gene>_<region>.csv'. 898 899 Raises 900 ------ 901 FileNotFoundError 902 If the directory cannot be accessed or no `.int` files are found. 903 904 ValueError 905 If an `.int` file is missing expected data or has an incorrect format. 906 907 Notes 908 ----- 909 - The method groups intensity data by gene (feature) and mask region. 910 - Outputs one CSV file per unique gene-region combination, saved in the specified directory. 911 """ 912 913 directory = os.path.abspath(directory) 914 915 files_list = [f for f in os.listdir(directory) if f.endswith(".int")] 916 917 genes_set = set([re.sub("\\.int", "", x.split("_")[3]) for x in files_list]) 918 regions_set = set([re.sub("\\.int", "", x.split("_")[2]) for x in files_list]) 919 920 for g in genes_set: 921 for r in regions_set: 922 json_to_save = { 923 "individual_name": [], 924 "individual_number": [], 925 "norm_intensity": [], 926 "size": [], 927 } 928 929 for f in tqdm(files_list): 930 if g in f and r in f: 931 with open(os.path.join(directory, f), "r") as file: 932 data = json.load(file) 933 934 json_to_save["norm_intensity"] = ( 935 json_to_save["norm_intensity"] 936 + data["intensity"]["norm_values"] 937 ) 938 json_to_save["individual_name"] = json_to_save[ 939 "individual_name" 940 ] + [f.split("_")[0]] * len( 941 data["intensity"]["norm_values"] 942 ) 943 json_to_save["individual_number"] = json_to_save[ 944 "individual_number" 945 ] + [f.split("_")[1]] * len( 946 data["intensity"]["norm_values"] 947 ) 948 json_to_save["size"] = json_to_save["size"] + [ 949 data["size"]["px_size"] 950 ] * len(data["intensity"]["norm_values"]) 951 952 pd.DataFrame(json_to_save).to_csv(f"{name}_{g}_{r}.csv", index=False)
Class for quantitative analysis of pixel intensity and size measurements in 2D/3D biological images. Supports projection of 3D stacks, mask-based intensity normalization, region size estimation and metadata extraction.
Parameters
input_image : ndarray, optional Input image or 3D stack for analysis. If 3D, projection must be applied.
image : ndarray, optional 2D projected image (internal use).
normalized_image_values : dict, optional Dictionary storing normalized intensity statistics.
mask : ndarray, optional Binary mask of region of interest (ROI).
background_mask : ndarray, optional
Binary mask used for background estimation. If not provided, mask is used.
typ : {"avg", "median", "std", "var", "max", "min"}, optional
Projection type for 3D stacks. Default is "avg".
size_info : dict, optional Dictionary storing ROI size measurements.
correction_factor : float, optional Normalization correction factor applied to background intensity. Must satisfy 0 < factor < 1. Default is 0.1.
img_type : str, optional Image type metadata.
scale : float, optional Pixel resolution in physical units (e.g. µm/px). Used in size calculations.
stack_selection : list of int, optional List of Z-indices to remove when projecting a 3D image.
Attributes
input_image : ndarray or None Loaded input image.
image : ndarray or None Projected 2D image.
mask : ndarray or None Region of interest mask.
background_mask : ndarray or None Background normalization mask.
scale : float or None Scale value for size estimation.
normalized_image_values : dict or None Dictionary containing intensity metrics.
size_info : dict or None Dictionary with ROI size information.
typ : str Selected projection type for 3D images.
stack_selection : list of int Z-levels excluded from projection.
Notes
The intensity normalization formula applied per pixel is:
$$R_{i,j} = T_{i,j} - \left( \mu_B (1 + c) \right)$$
where
* ``T_{i,j}`` – pixel intensity in ROI
* ``μ_B`` – mean intensity in background region
* ``c`` – correction factor
* ``R_{i,j}`` – normalized pixel intensity
Examples
Load a 3D image, mask and compute statistics:
>>> fi = FeatureIntensity()
>>> fi.load_image_3D("stack.tiff")
>>> fi.load_mask_("mask.png")
>>> fi.set_projection("median")
>>> fi.run_calculations()
>>> results = fi.get_results()
>>> results["intensity"]["norm_mean"]
def __init__(
    self,
    input_image=None,
    image=None,
    normalized_image_values=None,
    mask=None,
    background_mask=None,
    typ=None,
    size_info=None,
    correction_factor=None,
    img_type=None,
    scale=None,
    stack_selection=None,
):
    """
    Initialize a FeatureIntensity analysis instance.

    Parameters
    ----------
    input_image : ndarray, optional
        Input image or 3D stack used for analysis. If the image is 3D, a
        projection will be computed depending on the `typ` parameter.
    image : ndarray, optional
        2D image buffer used internally after projection of the input
        image. Should not be set manually.
    normalized_image_values : dict, optional
        Dictionary containing normalized intensity statistics. Usually
        filled automatically after running `run_calculations()`.
    mask : ndarray, optional
        Binary mask of the target region of interest (ROI). Required for
        intensity and size calculations.
    background_mask : ndarray, optional
        Binary mask specifying the background region used to compute the
        normalization threshold. If not provided, the ROI mask is also
        used as the background reference.
    typ : {"avg", "median", "std", "var", "max", "min"}, optional
        Projection method for 3D images. Default is `"avg"`.
    size_info : dict, optional
        Dictionary storing computed size metrics of the ROI. Populated
        after invoking `size_calculations()`.
    correction_factor : float, optional
        Correction term used during intensity normalization. Must satisfy
        0 < correction_factor < 1. Default is 0.1.
    img_type : str, optional
        Optional metadata about the image type (e.g., "tiff", "png").
    scale : float, optional
        Pixel resolution in physical units (e.g., µm/px). Required for
        real-size estimation in `size_calculations()`.
    stack_selection : list of int, optional
        Indices of z-planes to exclude during projection of a 3D stack.
    """

    # Plain assignment replaces the previous `value or None` idiom:
    # `ndarray or None` raises "The truth value of an array with more
    # than one element is ambiguous" for any non-trivial numpy array.
    self.input_image = input_image
    self.image = image
    self.normalized_image_values = normalized_image_values
    self.mask = mask
    self.background_mask = background_mask
    self.typ = "avg" if typ is None else typ
    self.size_info = size_info
    self.correction_factor = 0.1 if correction_factor is None else correction_factor
    # previously accepted but silently discarded; now stored
    self.img_type = img_type
    self.scale = scale
    self.stack_selection = [] if stack_selection is None else stack_selection
Initialize a FeatureIntensity analysis instance.
Parameters
input_image : ndarray, optional
Input image or 3D stack used for analysis. If the image is 3D, a
projection will be computed depending on the typ parameter.
image : ndarray, optional 2D image buffer used internally after projection of the input image. Should not be set manually.
normalized_image_values : dict, optional
Dictionary containing normalized intensity statistics. Usually filled
automatically after running run_calculations().
mask : ndarray, optional Binary mask of the target region of interest (ROI). Required for intensity and size calculations.
background_mask : ndarray, optional Binary mask specifying the background region used to compute the normalization threshold. If not provided, the ROI mask is also used as the background reference.
typ : {"avg", "median", "std", "var", "max", "min"}, optional
Projection method for 3D images. Determines how the z-stack is
collapsed into a 2D image. Default is "avg".
size_info : dict, optional
Dictionary storing computed size metrics of the ROI. Populated after
invoking size_calculations().
correction_factor : float, optional Correction term used during intensity normalization. Must satisfy 0 < correction_factor < 1. Default is 0.1.
img_type : str, optional Optional metadata about the image type (e.g., "tiff", "png").
scale : float, optional
Pixel resolution in physical units (e.g., µm/px). Required for
real-size estimation in size_calculations().
stack_selection : list of int, optional Indices of z-planes to exclude during projection of a 3D stack.
Notes
Values not provided are initialized to None, except for typ, which
defaults to "avg", and correction_factor, which defaults to 0.1.
The class is designed to be populated by loading functions:
load_image_(), load_image_3D(), load_mask_(),
and optionally load_normalization_mask_() and load_JIMG_project_().
Input image or 3D stack used for analysis. If the image is 3D, a
projection will be computed depending on the typ parameter.
2D image buffer used internally after projection of the input image. Should not be set manually.
Dictionary containing normalized intensity statistics. Usually filled
automatically after running run_calculations().
Binary mask of the target region of interest (ROI). Required for intensity and size calculations.
Binary mask specifying the background region used to compute the normalization threshold. If not provided, the ROI mask is also used as the background reference.
Projection method for 3D images. Determines how the z-stack is
collapsed into a 2D image. Default is "avg".
Dictionary storing computed size metrics of the ROI. Populated after
invoking size_calculations().
Correction term used during intensity normalization. Must satisfy 0 < correction_factor < 1. Default is 0.1.
Pixel resolution in physical units (e.g., µm/px). Required for
real-size estimation in size_calculations().
@property
def current_metadata(self):
    r"""
    Return (and print) the current processing parameters.

    Returns
    -------
    tuple
        ``(typ, correction_factor, scale, stack_selection)`` where
        ``typ`` is the 3D projection method, ``correction_factor`` the
        background-subtraction term ``c`` in

        .. math::

            R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )

        ``scale`` the pixel resolution (unit/px) and ``stack_selection``
        the z-slice indices excluded from projection of a 3D image.

    Notes
    -----
    This property also prints the metadata values to the console for
    quick inspection.
    """

    print(f"Projection type: {self.typ}")
    print(f"Correction factor: {self.correction_factor}")
    print(f"Scale (unit/px): {self.scale}")
    # typo fixed in user-facing message: "stac" -> "stack"
    print(f"Selected stack to remove: {self.stack_selection}")

    return self.typ, self.correction_factor, self.scale, self.stack_selection
Return current metadata parameters used in image processing and normalization.
Returns
tuple A tuple containing:
projection_type : str
Projection method used for 3D image reduction (e.g., "avg", "median").
correction_factor : float
Correction factor used for background subtraction during intensity
normalization. The applied formula is:
$$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$
where
* ``R_{i,j}`` — normalized pixel intensity
* ``T_{i,j}`` — original pixel intensity
* ``μ_B`` — mean background intensity
* ``c`` — correction factor
scale : float or None
Pixel resolution (unit/px), loaded via `load_JIMG_project_()` or set manually
using `set_scale()`.
stack_selection : list of int
Indices of z-slices excluded from projection of a 3D image.
Notes
This property also prints the metadata values to the console for quick inspection.
278 def set_projection(self, projection: str): 279 """ 280 Set the projection method for 3D image stack reduction. 281 282 Parameters 283 ---------- 284 projection : {"avg", "median", "std", "var", "max", "min"} 285 Projection method to reduce a 3D image stack to a 2D image. Default is `"avg"`. 286 287 Notes 288 ----- 289 This method updates the `typ` attribute of the class. The selected projection 290 determines how the z-stack is collapsed: 291 - `"avg"` : average intensity across slices 292 - `"median"` : median intensity across slices 293 - `"std"` : standard deviation across slices 294 - `"var"` : variance across slices 295 - `"max"` : maximum intensity across slices 296 - `"min"` : minimum intensity across slices 297 """ 298 299 t = ["avg", "median", "std", "var", "max", "min"] 300 if projection in t: 301 self.typ = projection 302 else: 303 print(f"\nProvided parameter is incorrect. Avaiable projection types: {t}")
Set the projection method for 3D image stack reduction.
Parameters
projection : {"avg", "median", "std", "var", "max", "min"}
Projection method to reduce a 3D image stack to a 2D image. Default is "avg".
Notes
This method updates the typ attribute of the class. The selected projection
determines how the z-stack is collapsed:
"avg": average intensity across slices"median": median intensity across slices"std": standard deviation across slices"var": variance across slices"max": maximum intensity across slices"min": minimum intensity across slices
305 def set_correction_factorn(self, factor: float): 306 r""" 307 Set the correction factor for background subtraction during intensity normalization. 308 309 Parameters 310 ---------- 311 factor : float 312 Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1. 313 Default is 0.1. 314 315 Notes 316 ----- 317 The correction is applied per pixel in the target mask using the formula: 318 319 .. math:: 320 321 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 322 323 where 324 * ``R_{i,j}`` — normalized pixel intensity 325 * ``T_{i,j}`` — original pixel intensity 326 * ``μ_B`` — mean intensity in the background mask 327 * ``c`` — correction factor 328 """ 329 330 if factor < 1 and factor > 0: 331 self.correction_factor = factor 332 else: 333 print( 334 "\nProvided parameter is incorrect. The factor should be a floating-point value within the range of 0 to 1." 335 )
Set the correction factor for background subtraction during intensity normalization.
Parameters
factor : float Correction factor to adjust background subtraction. Must satisfy 0 < factor < 1. Default is 0.1.
Notes
The correction is applied per pixel in the target mask using the formula:
$$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$
where
R_{i,j}— normalized pixel intensityT_{i,j}— original pixel intensityμ_B— mean intensity in the background maskc— correction factor
337 def set_scale(self, scale): 338 """ 339 Set the scale for converting pixel measurements to physical units. 340 341 Parameters 342 ---------- 343 scale : float 344 Pixel resolution in physical units (e.g., µm/px). Used to calculate the 345 actual size of the tissue or organ. 346 347 Notes 348 ----- 349 The scale can also be automatically loaded from a JIMG project using 350 `load_JIMG_project_()`. This value is required for size calculations in 351 `size_calculations()`. 352 """ 353 354 self.scale = scale
Set the scale for converting pixel measurements to physical units.
Parameters
scale : float Pixel resolution in physical units (e.g., µm/px). Used to calculate the actual size of the tissue or organ.
Notes
The scale can also be automatically loaded from a JIMG project using
load_JIMG_project_(). This value is required for size calculations in
size_calculations().
356 def set_selection_list(self, rm_list: list): 357 """ 358 Set the list of z-slices to exclude when projecting a 3D image stack. 359 360 Parameters 361 ---------- 362 rm_list : list of int 363 List of indices corresponding to z-slices that should be removed from 364 the full 3D image stack before projection. 365 366 Notes 367 ----- 368 This updates the `stack_selection` attribute, which is used by the 369 `stack_selection_()` method during projection. 370 """ 371 372 self.stack_selection = rm_list
Set the list of z-slices to exclude when projecting a 3D image stack.
Parameters
rm_list : list of int List of indices corresponding to z-slices that should be removed from the full 3D image stack before projection.
Notes
This updates the stack_selection attribute, which is used by the
stack_selection_() method during projection.
374 def load_JIMG_project_(self, path): 375 """ 376 Load a JIMG project from a `.pjm` file. 377 378 Parameters 379 ---------- 380 file_path : str 381 Path to the JIMG project file. The file must have a `.pjm` extension. 382 383 Returns 384 ------- 385 project : object 386 Loaded project object containing images and metadata. 387 388 Raises 389 ------ 390 ValueError 391 If the provided file path does not point to a `.pjm` file. 392 393 Notes 394 ----- 395 The method attempts to automatically set the `scale` and `stack_selection` 396 attributes from the project metadata if available. 397 """ 398 399 path = os.path.abspath(path) 400 401 if ".pjm" in path: 402 metadata = self.load_JIMG_project(path) 403 404 try: 405 self.scale = metadata.metadata["X_resolution[um/px]"] 406 except: 407 408 try: 409 self.scale = metadata.images_dict["metadata"][0][ 410 "X_resolution[um/px]" 411 ] 412 413 except: 414 print( 415 '\nUnable to set scale on this project! Set scale using "set_scale()"' 416 ) 417 418 self.stack_selection = metadata.removal_list 419 420 else: 421 print( 422 "\nWrong path. The provided path does not point to a JIMG project (*.pjm)." 423 )
Load a JIMG project from a .pjm file.
Parameters
file_path : str
Path to the JIMG project file. The file must have a .pjm extension.
Returns
project : object Loaded project object containing images and metadata.
Raises
ValueError
If the provided file path does not point to a .pjm file.
Notes
The method attempts to automatically set the scale and stack_selection
attributes from the project metadata if available.
425 def stack_selection_(self): 426 """ 427 Remove selected z-slices from a 3D image stack based on `stack_selection`. 428 429 Notes 430 ----- 431 Only works if `input_image` is a 3D ndarray. The slices with indices listed 432 in `stack_selection` are excluded from the stack. Updates `input_image` 433 in-place. 434 435 Prints a warning if `stack_selection` is empty. 436 """ 437 438 if len(self.input_image.shape) == 3: 439 if len(self.stack_selection) > 0: 440 self.input_image = self.input_image[ 441 [ 442 x 443 for x in range(self.input_image.shape[0]) 444 if x not in self.stack_selection 445 ] 446 ] 447 else: 448 print("\nImages to remove from the stack were not selected!")
Remove selected z-slices from a 3D image stack based on stack_selection.
Notes
Only works if input_image is a 3D ndarray. The slices with indices listed
in stack_selection are excluded from the stack. Updates input_image
in-place.
Prints a warning if stack_selection is empty.
450 def projection(self): 451 """ 452 Project a 3D image stack into a 2D image using the method defined by `typ`. 453 454 Notes 455 ----- 456 Updates the `image` attribute with the projected 2D result. 457 458 Supported projection types (`typ`): 459 - "avg" : mean intensity across slices 460 - "median" : median intensity across slices 461 - "std" : standard deviation across slices 462 - "var" : variance across slices 463 - "max" : maximum intensity across slices 464 - "min" : minimum intensity across slices 465 466 Raises 467 ------ 468 AttributeError 469 If `input_image` is not defined. 470 """ 471 472 if self.typ == "avg": 473 img = np.mean(self.input_image, axis=0) 474 475 elif self.typ == "std": 476 img = np.std(self.input_image, axis=0) 477 478 elif self.typ == "median": 479 img = np.median(self.input_image, axis=0) 480 481 elif self.typ == "var": 482 img = np.var(self.input_image, axis=0) 483 484 elif self.typ == "max": 485 img = np.max(self.input_image, axis=0) 486 487 elif self.typ == "min": 488 img = np.min(self.input_image, axis=0) 489 490 self.image = img
Project a 3D image stack into a 2D image using the method defined by typ.
Notes
Updates the image attribute with the projected 2D result.
Supported projection types (typ):
- "avg" : mean intensity across slices
- "median" : median intensity across slices
- "std" : standard deviation across slices
- "var" : variance across slices
- "max" : maximum intensity across slices
- "min" : minimum intensity across slices
Raises
AttributeError
If input_image is not defined.
492 def detect_img(self): 493 """ 494 Detect whether the input image is 2D or 3D and perform appropriate preprocessing. 495 496 Notes 497 ----- 498 - For 3D images, applies `stack_selection_()` and then `projection()`. 499 - For 2D images, no projection is applied. 500 - Prints status messages indicating the type of image and applied operations. 501 502 Raises 503 ------ 504 AttributeError 505 If `input_image` is not defined. 506 """ 507 check = len(self.input_image.shape) 508 509 if check == 3: 510 print("\n3D image detected! Starting processing for 3D image...") 511 print(f"Projection - {self.typ}...") 512 513 self.stack_selection_() 514 self.projection() 515 516 elif check == 2: 517 print("\n2D image detected! Starting processing for 2D image...") 518 519 else: 520 print("\nData does not match any image type!")
Detect whether the input image is 2D or 3D and perform appropriate preprocessing.
Notes
- For 3D images, applies
`stack_selection_()` and then `projection()`.
- For 2D images, no projection is applied.
- Prints status messages indicating the type of image and applied operations.
Raises
AttributeError
If input_image is not defined.
522 def load_image_3D(self, path): 523 """ 524 Load a 3D image stack from a TIFF file. 525 526 Parameters 527 ---------- 528 path : str 529 Path to the 3D image file (*.tiff) to be loaded. 530 531 Notes 532 ----- 533 The loaded image is stored in the `input_image` attribute as a 3D ndarray. 534 """ 535 path = os.path.abspath(path) 536 537 self.input_image = self.load_3D_tiff(path)
Load a 3D image stack from a TIFF file.
Parameters
path : str Path to the 3D image file (*.tiff) to be loaded.
Notes
The loaded image is stored in the input_image attribute as a 3D ndarray.
539 def load_image_(self, path): 540 """ 541 Load a 2D image into the class. 542 543 Parameters 544 ---------- 545 path : str 546 Path to the image file to be loaded. 547 548 Notes 549 ----- 550 The loaded image is stored in the `input_image` attribute as a 2D ndarray. 551 """ 552 path = os.path.abspath(path) 553 554 self.input_image = self.load_image(path)
Load a 2D image into the class.
Parameters
path : str Path to the image file to be loaded.
Notes
The loaded image is stored in the input_image attribute as a 2D ndarray.
556 def load_mask_(self, path): 557 r""" 558 Load a binary mask into the class and optionally set it as the normalization mask. 559 560 Parameters 561 ---------- 562 path : str 563 Path to the mask image file. Supported formats include 8-bit or 16-bit images 564 with extensions such as `.png` or `.jpeg`. The mask must be binary 565 (e.g., 0/255, 0/2**16-1, 0/1). 566 567 Notes 568 ----- 569 - If `load_normalization_mask_()` is not called, this mask is also used as the 570 background mask for intensity normalization. 571 - Normalization is applied per pixel using the formula: 572 573 .. math:: 574 575 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 576 577 where 578 * ``R_{i,j}`` — normalized pixel intensity 579 * ``T_{i,j}`` — pixel intensity in the target mask 580 * ``μ_B`` — mean intensity of the background (reversed mask) 581 * ``c`` — correction factor 582 """ 583 584 path = os.path.abspath(path) 585 586 self.mask = self.load_mask(path) 587 588 print( 589 "\nThis mask was also set as the reverse background mask. If you want a different background mask for normalization, use 'load_normalization_mask()'." 590 ) 591 self.background_mask = self.load_mask(path)
Load a binary mask into the class and optionally set it as the normalization mask.
Parameters
path : str
Path to the mask image file. Supported formats include 8-bit or 16-bit images
with extensions such as .png or .jpeg. The mask must be binary
(e.g., 0/255, 0/2**16-1, 0/1).
Notes
- If
- If `load_normalization_mask_()` is not called, this mask is also used as the background mask for intensity normalization.
- Normalization is applied per pixel using the formula:
$$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$
where
R_{i,j}— normalized pixel intensityT_{i,j}— pixel intensity in the target maskμ_B— mean intensity of the background (reversed mask)c— correction factor
593 def load_normalization_mask_(self, path): 594 r""" 595 Load a binary mask for normalization into the class. 596 597 Parameters 598 ---------- 599 path : str 600 Path to the mask image file. Supported formats include 8-bit or 16-bit 601 images (e.g., `.png`, `.jpeg`). The mask must be binary (0/255, 0/2**16-1, 0/1). 602 603 Notes 604 ----- 605 - The mask defines the area of interest. Normalization is applied to the inverse 606 of this area (reversed mask). 607 - Normalization formula applied per pixel: 608 609 .. math:: 610 611 R_{i,j} = T_{i,j} - ( \mu_B (1 + c) ) 612 613 where 614 * ``R_{i,j}`` — normalized pixel intensity 615 * ``T_{i,j}`` — pixel intensity in the target mask 616 * ``μ_B`` — mean intensity of the background (reversed mask) 617 * ``c`` — correction factor 618 """ 619 620 path = os.path.abspath(path) 621 622 self.background_mask = self.load_mask(path)
Load a binary mask for normalization into the class.
Parameters
path : str
Path to the mask image file. Supported formats include 8-bit or 16-bit
images (e.g., .png, .jpeg). The mask must be binary (0/255, 0/2**16-1, 0/1).
Notes
- The mask defines the area of interest. Normalization is applied to the inverse of this area (reversed mask).
Normalization formula applied per pixel:
$$R_{i,j} = T_{i,j} - ( \mu_B (1 + c) )$$
where
R_{i,j}— normalized pixel intensityT_{i,j}— pixel intensity in the target maskμ_B— mean intensity of the background (reversed mask)c— correction factor
624 def intensity_calculations(self): 625 """ 626 Calculate normalized and raw intensity statistics from the image based on masks. 627 628 This method performs intensity calculations using the main mask (`self.mask`) 629 and the background mask (`self.background_mask`). The pixel intensities within 630 the mask of interest are normalized by subtracting a threshold derived from the 631 background region and applying a correction factor (`self.correction_factor`). 632 Negative values after normalization are clipped to zero. 633 634 The following statistics are computed for both normalized and raw values: 635 - Minimum 636 - Maximum 637 - Mean 638 - Median 639 - Standard deviation 640 - Variance 641 - List of all normalized values (only for normalized data) 642 643 Notes 644 ----- 645 - The method updates the instance attribute `self.normalized_image_values` 646 with a dictionary containing both normalized and raw statistics. 647 - Normalization formula applied for each pixel in the selected mask: 648 final_val = selected_value - (threshold + threshold * correction_factor) 649 where threshold is the mean intensity in the background mask. 650 - Negative values after normalization are set to zero. 
651 """ 652 653 tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask) 654 tmp_bmask = self.ajd_mask_size(image=self.image, mask=self.background_mask) 655 656 selected_values = self.image[tmp_mask == np.max(tmp_mask)] 657 658 threshold = np.mean(self.image[tmp_bmask == np.min(tmp_bmask)]) 659 660 # normalization 661 final_val = selected_values - (threshold + (threshold * self.correction_factor)) 662 663 final_val[final_val < 0] = 0 664 665 tmp_dict = { 666 "norm_min": np.min(final_val), 667 "norm_max": np.max(final_val), 668 "norm_mean": np.mean(final_val), 669 "norm_median": np.median(final_val), 670 "norm_std": np.std(final_val), 671 "norm_var": np.var(final_val), 672 "norm_values": final_val.tolist(), 673 "min": np.min(selected_values), 674 "max": np.max(selected_values), 675 "mean": np.mean(selected_values), 676 "median": np.median(selected_values), 677 "std": np.std(selected_values), 678 "var": np.var(selected_values), 679 } 680 681 self.normalized_image_values = tmp_dict
Calculate normalized and raw intensity statistics from the image based on masks.
This method performs intensity calculations using the main mask (self.mask)
and the background mask (self.background_mask). The pixel intensities within
the mask of interest are normalized by subtracting a threshold derived from the
background region and applying a correction factor (self.correction_factor).
Negative values after normalization are clipped to zero.
The following statistics are computed for both normalized and raw values:
- Minimum
- Maximum
- Mean
- Median
- Standard deviation
- Variance
- List of all normalized values (only for normalized data)
Notes
- The method updates the instance attribute
`self.normalized_image_values` with a dictionary containing both normalized and raw statistics.
- Normalization formula applied for each pixel in the selected mask: final_val = selected_value - (threshold + threshold * correction_factor), where threshold is the mean intensity in the background mask.
- Negative values after normalization are set to zero.
683 def size_calculations(self): 684 """ 685 Calculates the size and bounding dimensions of the masked region in the image. 686 687 This method computes the following metrics based on the current mask: 688 - Total number of pixels in the mask (`px_size`) 689 - Real-world size if a scale is provided (`size`) 690 - Maximum lengths along x and y axes (`max_length_x_axis`, `max_length_y_axis`) 691 692 If `self.scale` is defined (unit per pixel), the real-world size is calculated. 693 If not, `size` will be `None` and a warning message is printed. 694 695 Returns: 696 Updates the following attributes in the class: 697 - self.size_info (dict) containing: 698 - 'size' (float or None): real-world size of the mask 699 - 'px_size' (int): number of pixels in the masked region 700 - 'max_length_x_axis' (int): length of the bounding box along the x-axis 701 - 'max_length_y_axis' (int): length of the bounding box along the y-axis 702 703 Example: 704 analysis.size_calculations() 705 print(analysis.size_info) 706 """ 707 708 tmp_mask = self.ajd_mask_size(image=self.image, mask=self.mask) 709 710 size_px = int(len(tmp_mask[tmp_mask > np.min(tmp_mask)])) 711 712 if self.scale is not None: 713 size = float(size_px * self.scale) 714 else: 715 size = None 716 print( 717 '\nUnable to calculate real size, scale (unit/px) not provided, use "set_scale()" or load JIMG project .pjm metadata "load_pjm()" to set scale for calculations!' 718 ) 719 720 non_zero_indices = np.where(tmp_mask == np.max(tmp_mask)) 721 722 min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0]) 723 min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1]) 724 725 max_length_x_axis = int(max_x - min_x + 1) 726 max_length_y_axis = int(max_y - min_y + 1) 727 728 tmp_val = { 729 "size": size, 730 "px_size": size_px, 731 "max_length_x_axis": max_length_x_axis, 732 "max_length_y_axis": max_length_y_axis, 733 } 734 735 self.size_info = tmp_val
Calculates the size and bounding dimensions of the masked region in the image.
This method computes the following metrics based on the current mask:
- Total number of pixels in the mask (px_size)
- Real-world size if a scale is provided (size)
- Maximum lengths along x and y axes (max_length_x_axis, max_length_y_axis)
If self.scale is defined (unit per pixel), the real-world size is calculated.
If not, size will be None and a warning message is printed.
Returns: Updates the following attributes in the class: - self.size_info (dict) containing: - 'size' (float or None): real-world size of the mask - 'px_size' (int): number of pixels in the masked region - 'max_length_x_axis' (int): length of the bounding box along the x-axis - 'max_length_y_axis' (int): length of the bounding box along the y-axis
Example: analysis.size_calculations() print(analysis.size_info)
737 def run_calculations(self): 738 """ 739 Run the full analysis pipeline on the loaded image using the provided masks. 740 741 Notes 742 ----- 743 - The input image must be loaded via `load_image_()` or `load_image_3D()`. 744 - The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization 745 mask can be loaded via `load_normalization_mask_()`. 746 - Parameters such as projection type and correction factor can be set with 747 `set_projection()` and `set_correction_factor()`. 748 - Scale and stack selection can also influence calculations if defined. 749 - To view current parameters, use the `current_metadata` property. 750 751 Returns 752 ------- 753 None 754 The results are stored internally and can be retrieved using 755 `get_results()`. 756 """ 757 758 if self.input_image is not None: 759 760 if self.mask is not None: 761 762 print("\nStart...") 763 self.detect_img() 764 self.intensity_calculations() 765 self.size_calculations() 766 print("\nCompleted!")
Run the full analysis pipeline on the loaded image using the provided masks.
Notes
- The input image must be loaded via `load_image_()` or `load_image_3D()`.
- The ROI mask must be loaded via `load_mask_()`. Optionally, a normalization mask can be loaded via `load_normalization_mask_()`.
- Parameters such as projection type and correction factor can be set with `set_projection()` and `set_correction_factor()`.
- Scale and stack selection can also influence calculations if defined.
- To view current parameters, use the `current_metadata` property.
Returns
None
The results are stored internally and can be retrieved using
get_results().
768 def get_results(self): 769 """ 770 Return the results from the analysis performed by `run_calculations()`. 771 772 Returns 773 ------- 774 results_dict : dict or None 775 Dictionary containing intensity and size results. Structure: 776 - 'intensity' : dict with normalized and raw intensity statistics 777 - 'size' : dict with ROI size metrics 778 779 Notes 780 ----- 781 If analysis has not been run yet, prints a message and returns None. 782 """ 783 784 if self.normalized_image_values is not None and self.size_info is not None: 785 786 results = { 787 "intensity": self.normalized_image_values, 788 "size": self.size_info, 789 } 790 791 return results 792 793 else: 794 print('\nAnalysis were not conducted. Run analysis "run_calculations()"')
Return the results from the analysis performed by run_calculations().
Returns
results_dict : dict or None Dictionary containing intensity and size results. Structure: - 'intensity' : dict with normalized and raw intensity statistics - 'size' : dict with ROI size metrics
Notes
If analysis has not been run yet, prints a message and returns None.
796 def save_results( 797 self, 798 path="", 799 mask_region: str = "", 800 feature_name: str = "", 801 individual_number: int = 0, 802 individual_name: str = "", 803 ): 804 """ 805 Save the analysis results to a `.int` (JSON) file. 806 807 Parameters 808 ---------- 809 path : str, optional 810 Directory path where the file will be saved. Defaults to the current working directory. 811 812 mask_region : str 813 Name or identifier of the mask region (e.g., tissue, part of tissue). 814 815 feature_name : str 816 Name of the feature being analyzed. Underscores or spaces are replaced with periods. 817 818 individual_number : int 819 Unique identifier for the individual in the analysis (e.g., 1, 2, 3). 820 821 individual_name : str 822 Name of the individual (e.g., species name, tissue, organoid). 823 824 Notes 825 ----- 826 - The method validates that all required parameters are provided and that 827 analysis results exist (`normalized_image_values` and `size_info`). 828 - Creates the directory if it does not exist. 829 - File name format: 830 '<individual_name>_<individual_number>_<mask_region>_<feature_name>.int' 831 832 Raises 833 ------ 834 FileNotFoundError 835 If the specified path cannot be created or accessed. 836 837 ValueError 838 If any of `mask_region`, `feature_name`, `individual_number`, or 839 `individual_name` are missing or invalid. 
840 """ 841 842 path = os.path.abspath(path) 843 844 if ( 845 len(mask_region) > 1 846 and len(feature_name) > 1 847 and individual_number != 0 848 and len(individual_name) > 1 849 ): 850 851 if self.normalized_image_values is not None and self.size_info is not None: 852 853 results = { 854 "intensity": self.normalized_image_values, 855 "size": self.size_info, 856 } 857 858 mask_region = re.sub(r"[_\s]+", ".", mask_region) 859 feature_name = re.sub(r"[_\s]+", ".", feature_name) 860 individual_number = re.sub(r"[_\s]+", ".", str(individual_number)) 861 individual_name = re.sub(r"[_\s]+", ".", individual_name) 862 863 full_name = f"{individual_name}_{individual_number}_{mask_region}_{feature_name}" 864 865 isExist = os.path.exists(path) 866 if not isExist: 867 os.makedirs(path, exist_ok=True) 868 869 full_path = os.path.join( 870 path, re.sub("\\.json", "", full_name) + ".int" 871 ) 872 873 with open(full_path, "w") as file: 874 json.dump(results, file, indent=4) 875 876 else: 877 print( 878 '\nAnalysis were not conducted. Run analysis "run_calculations()"' 879 ) 880 881 else: 882 print( 883 "\nAny of 'mask_region', 'feature_name', 'individual_number', 'individual_name' parameters were provided wrong!" 884 )
Save the analysis results to a .int (JSON) file.
Parameters
path : str, optional Directory path where the file will be saved. Defaults to the current working directory.
mask_region : str Name or identifier of the mask region (e.g., tissue, part of tissue).
feature_name : str Name of the feature being analyzed. Underscores or spaces are replaced with periods.
individual_number : int Unique identifier for the individual in the analysis (e.g., 1, 2, 3).
individual_name : str Name of the individual (e.g., species name, tissue, organoid).
Notes
- The method validates that all required parameters are provided and that
analysis results exist (
normalized_image_valuesandsize_info). - Creates the directory if it does not exist.
- File name format:
'&lt;individual_name&gt;_&lt;individual_number&gt;_&lt;mask_region&gt;_&lt;feature_name&gt;.int'
Raises
FileNotFoundError If the specified path cannot be created or accessed.
ValueError
If any of mask_region, feature_name, individual_number, or
individual_name are missing or invalid.
886 def concatenate_intensity_data(self, directory: str = "", name: str = ""): 887 """ 888 Concatenate intensity data from multiple `.int` files and save as CSV. 889 890 Parameters 891 ---------- 892 directory : str, optional 893 Path to the directory containing `.int` files. Defaults to the current working directory. 894 895 name : str 896 Prefix for the output CSV file names. CSV files are saved in the format 897 '<name>_<gene>_<region>.csv'. 898 899 Raises 900 ------ 901 FileNotFoundError 902 If the directory cannot be accessed or no `.int` files are found. 903 904 ValueError 905 If an `.int` file is missing expected data or has an incorrect format. 906 907 Notes 908 ----- 909 - The method groups intensity data by gene (feature) and mask region. 910 - Outputs one CSV file per unique gene-region combination, saved in the specified directory. 911 """ 912 913 directory = os.path.abspath(directory) 914 915 files_list = [f for f in os.listdir(directory) if f.endswith(".int")] 916 917 genes_set = set([re.sub("\\.int", "", x.split("_")[3]) for x in files_list]) 918 regions_set = set([re.sub("\\.int", "", x.split("_")[2]) for x in files_list]) 919 920 for g in genes_set: 921 for r in regions_set: 922 json_to_save = { 923 "individual_name": [], 924 "individual_number": [], 925 "norm_intensity": [], 926 "size": [], 927 } 928 929 for f in tqdm(files_list): 930 if g in f and r in f: 931 with open(os.path.join(directory, f), "r") as file: 932 data = json.load(file) 933 934 json_to_save["norm_intensity"] = ( 935 json_to_save["norm_intensity"] 936 + data["intensity"]["norm_values"] 937 ) 938 json_to_save["individual_name"] = json_to_save[ 939 "individual_name" 940 ] + [f.split("_")[0]] * len( 941 data["intensity"]["norm_values"] 942 ) 943 json_to_save["individual_number"] = json_to_save[ 944 "individual_number" 945 ] + [f.split("_")[1]] * len( 946 data["intensity"]["norm_values"] 947 ) 948 json_to_save["size"] = json_to_save["size"] + [ 949 data["size"]["px_size"] 950 ] * 
len(data["intensity"]["norm_values"]) 951 952 pd.DataFrame(json_to_save).to_csv(f"{name}_{g}_{r}.csv", index=False)
Concatenate intensity data from multiple .int files and save as CSV.
Parameters
directory : str, optional
Path to the directory containing .int files. Defaults to the current working directory.
name : str
Prefix for the output CSV file names. CSV files are saved in the format
'&lt;name&gt;_&lt;gene&gt;_&lt;region&gt;.csv'.
Raises
FileNotFoundError
If the directory cannot be accessed or no .int files are found.
ValueError
If an .int file is missing expected data or has an incorrect format.
Notes
- The method groups intensity data by gene (feature) and mask region.
- Outputs one CSV file per unique gene-region combination, saved in the specified directory.
class IntensityAnalysis:
    """
    Percentile-based statistical analysis of grouped data.

    Provides methods to calculate percentiles, remove upper outliers,
    aggregate values into percentile bins, run Welch's ANOVA and
    Chi-squared tests (overall and pairwise with Bonferroni correction),
    and visualize the results as comparative histograms.

    Methods
    -------
    drop_up_df(data, group_col, values_col)
        Removes upper outliers from a DataFrame based on a grouping column.

    percentiles_calculation(values, sep_perc=1)
        Calculates percentiles and creates loopable percentile ranges.

    to_percentil(values, percentiles, percentiles_loop)
        Aggregates statistics based on percentile ranges.

    df_to_percentiles(data, group_col, values_col, sep_perc=1, drop_outlires=True)
        Computes percentile statistics for grouped DataFrame data.

    round_to_scientific_notation(num)
        Formats a number in scientific notation or standard format.

    aov_percentiles(data, testes_col, comb="*")
        Performs Welch's ANOVA on percentile-based group data.

    post_aov_percentiles(data, testes_col, comb="*")
        Performs Welch's ANOVA with pairwise t-tests.

    chi2_percentiles(input_hist)
        Performs Chi-squared test on percentile histogram data.

    post_ch2_percentiles(input_hist)
        Performs Chi-squared test with pairwise comparisons.

    hist_compare_plot(data, queue, tested_value, p_adj=True, txt_size=20)
        Generates comparative histograms with statistical test results.
    """

    def drop_up_df(self, data: pd.DataFrame, group_col: str, values_col: str):
        """
        Remove upper outliers from `values_col`, separately for each group in `group_col`.

        Upper outliers are defined by the IQR rule: values greater than
        Q3 + 1.5 * IQR are dropped.

        Parameters
        ----------
        data : pd.DataFrame
            Input DataFrame.

        group_col : str
            Column used for grouping.

        values_col : str
            Column from which upper outliers are removed.

        Returns
        -------
        filtered_data : pd.DataFrame
            New DataFrame with upper outliers removed per group; the input
            DataFrame is not modified.
        """

        def iqr_filter(group):
            q75 = np.quantile(group[values_col], 0.75)
            q25 = np.quantile(group[values_col], 0.25)
            itq = q75 - q25
            return group[group[values_col] <= (q75 + 1.5 * itq)]

        filtered_data = data.groupby(group_col).apply(iqr_filter).reset_index(drop=True)

        return filtered_data

    def percentiles_calculation(self, values, sep_perc: int = 1):
        """
        Calculate percentiles of `values` and build consecutive index ranges.

        Percentiles are computed from 0 to 100 at intervals of `sep_perc`.
        The returned ranges are *index pairs* into the percentile array,
        e.g. [(0, 1), (1, 2), ..., (99, 100)] for ``sep_perc=1``.

        Parameters
        ----------
        values : array-like
            Input data values.

        sep_perc : int, optional
            Separation interval between percentiles (default 1 = every 1%).

        Returns
        -------
        percentiles : np.ndarray
            Calculated percentile values; the first entry is forced to 0 so
            that a strict lower bound ``value > percentiles[lo]`` does not
            exclude minimal values.

        percentiles_loop : list of tuple
            Consecutive (lo, hi) index pairs for binning.
        """

        per_vector = values.copy()

        percentiles = np.percentile(per_vector, np.arange(0, 101, sep_perc))
        # Force the lower bound to 0 so the first bin's open interval
        # (percentiles[0], percentiles[1]] does not drop the smallest values.
        percentiles[0] = 0

        percentiles_loop = [(i, i + 1) for i in range(int(100 / sep_perc))]

        return percentiles, percentiles_loop

    def to_percentil(self, values, percentiles, percentiles_loop):
        """
        Aggregate summary statistics of `values` per percentile range.

        For each (lo, hi) pair in `percentiles_loop`, values in the interval
        (percentiles[lo], percentiles[hi]] are summarized by count,
        proportion, mean, median, standard deviation and variance.

        Parameters
        ----------
        values : array-like
            Input data values.

        percentiles : np.ndarray
            Percentile values defining the bin edges.

        percentiles_loop : list of tuple
            Consecutive (lo, hi) index pairs into `percentiles`.

        Returns
        -------
        data : dict
            Keys 'n', 'n_standarized', 'avg', 'median', 'std', 'var', each a
            list with one entry per percentile range.

        Notes
        -----
        - For an empty range the statistics are set to 0 and the count to 1
          (kept from the original implementation to avoid empty bins).
        """

        per_vector = values.copy()

        data = {
            "n": [],
            "n_standarized": [],
            "avg": [],
            "median": [],
            "std": [],
            "var": [],
        }

        amount = len(per_vector)

        for lo, hi in percentiles_loop:
            # Select the bin once instead of recomputing the mask per statistic.
            subset = per_vector[
                (per_vector > percentiles[lo]) & (per_vector <= percentiles[hi])
            ]
            if len(subset) > 0:
                data["n"].append(len(subset))
                data["n_standarized"].append(len(subset) / amount)
                data["avg"].append(np.mean(subset))
                data["std"].append(np.std(subset))
                data["median"].append(np.median(subset))
                data["var"].append(np.var(subset))
            else:
                data["n"].append(1)
                data["n_standarized"].append(0)
                data["avg"].append(0)
                data["std"].append(0)
                data["median"].append(0)
                data["var"].append(0)

        return data

    def df_to_percentiles(
        self,
        data: pd.DataFrame,
        group_col: str,
        values_col: str,
        sep_perc: int = 1,
        drop_outlires: bool = True,
    ):
        """
        Compute percentile-binned statistics of `values_col` for each group.

        Percentile edges are computed on the *whole* (optionally
        outlier-filtered) column, then each group's values are aggregated
        into those shared bins.

        Parameters
        ----------
        data : pd.DataFrame
            Input DataFrame.

        group_col : str
            Column defining the groups.

        values_col : str
            Column with the values to bin.

        sep_perc : int, optional
            Percentile separation interval (default 1).

        drop_outlires : bool, optional
            If True (default), remove per-group upper outliers (IQR rule)
            before binning.

        Returns
        -------
        full_data : dict
            Mapping group name -> statistics dict as produced by
            :meth:`to_percentil`.
        """

        full_data = {}

        if drop_outlires:
            data = self.drop_up_df(
                data=data, group_col=group_col, values_col=values_col
            )

        groups = set(data[group_col])

        # Shared bin edges across groups so the histograms are comparable.
        percentiles, percentiles_loop = self.percentiles_calculation(
            data[values_col], sep_perc=sep_perc
        )

        for g in groups:

            print(f"Group: {g} ...")

            tmp_values = data[values_col][data[group_col] == g]

            per_dat = self.to_percentil(tmp_values, percentiles, percentiles_loop)

            full_data[g] = per_dat

        return full_data

    def round_to_scientific_notation(self, num):
        """
        Round a number to scientific notation if very small, else one decimal.

        Parameters
        ----------
        num : float
            The number to round.

        Returns
        -------
        str
            - "0.0" if `num` is 0.
            - Scientific notation (1 decimal, 1-digit exponent) if |num| < 1e-4.
            - Otherwise the number rounded to one decimal place.
        """

        if num == 0:
            return "0.0"

        if abs(num) < 0.0001:
            return np.format_float_scientific(num, precision=1, exp_digits=1)
        return f"{num:.1f}"

    def _combine_columns(self, group_data, testes_col, comb):
        """
        Derive a single value vector from one column or a combination of columns.

        If `testes_col` is a string, that column is returned as-is. If it is a
        list, the columns are folded element-wise with the operator `comb`,
        starting from a vector of ones.

        NOTE(review): the starting vector of 1s is neutral only for '*'; for
        '+', '-' and '**' it shifts/dominates the result. Kept as-is to
        preserve the original behavior — confirm intent with the author.
        """
        if isinstance(testes_col, str):
            return group_data[testes_col]

        combined = [1] * len(group_data[testes_col[0]])
        for col in testes_col:
            if comb == "*":
                combined = [a * b for a, b in zip(combined, group_data[col])]
            elif comb == "+":
                combined = [a + b for a, b in zip(combined, group_data[col])]
            elif comb == "**":
                combined = [a**b for a, b in zip(combined, group_data[col])]
            elif comb == "-":
                combined = [a - b for a, b in zip(combined, group_data[col])]
            elif comb == "/":
                combined = [a / b for a, b in zip(combined, group_data[col])]
        return combined

    def aov_percentiles(self, data, testes_col, comb: str = "*"):
        """
        Perform Welch's ANOVA on percentile-based group data.

        Group vectors are derived from `testes_col` (combined with `comb` if a
        list of columns is given), reshaped to long format, and tested with
        Welch's ANOVA, which tolerates unequal group variances.

        Parameters
        ----------
        data : dict of pd.DataFrame
            Mapping group name -> DataFrame with the data.

        testes_col : str or list of str
            Column name(s) from which group values are derived.

        comb : str, optional
            Operator used to combine columns when `testes_col` is a list:
            '*', '+', '**', '-' or '/'. Default '*'.

        Returns
        -------
        F : float
            F-statistic from Welch's ANOVA.

        p_val : float
            Uncorrected p-value from Welch's ANOVA.

        Examples
        --------
        >>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+')
        >>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}")
        """

        groups = [self._combine_columns(data[d], testes_col, comb) for d in data.keys()]

        df = pd.DataFrame({f"group_{i}": group for i, group in enumerate(groups)})

        # Long format so pingouin can treat the columns as one grouping factor.
        df_melted = df.melt(var_name="group", value_name="value")

        welch_results = pg.welch_anova(data=df_melted, dv="value", between="group")

        return welch_results["F"].values[0], welch_results["p-unc"].values[0]

    def post_aov_percentiles(self, data, testes_col, comb: str = "*"):
        """
        Welch's ANOVA followed by pairwise Welch's t-tests.

        Pairwise p-values are Bonferroni-adjusted (multiplied by the number of
        comparisons and capped at 1).

        Parameters
        ----------
        data : dict of pd.DataFrame
            Mapping group name -> DataFrame with the data.

        testes_col : str or list of str
            Column name(s) from which group values are derived; lists are
            combined with `comb`.

        comb : str, optional
            Combination operator: '*', '+', '**', '-' or '/'. Default '*'.

        Returns
        -------
        p_val : float
            Uncorrected p-value from the overall Welch's ANOVA.

        final_results : dict
            Pairwise Welch's t-test results with keys 'group1', 'group2',
            'stat', 'p_val' and 'adj_p_val' (Bonferroni-adjusted).
        """

        # BUGFIX: the ANOVA p-value used to be stored in `p_val` and then
        # clobbered by the pairwise t-test loop, so the method returned the
        # LAST pairwise p-value instead of the documented ANOVA p-value.
        anova_p = self.aov_percentiles(data=data, testes_col=testes_col, comb=comb)[1]

        pairs = list(combinations(data, 2))
        final_results = {
            "group1": [],
            "group2": [],
            "stat": [],
            "p_val": [],
            "adj_p_val": [],
        }

        for group1, group2 in pairs:
            g1 = self._combine_columns(data[group1], testes_col, comb)
            g2 = self._combine_columns(data[group2], testes_col, comb)

            stat, pair_p = stats.ttest_ind(
                g1, g2, alternative="two-sided", equal_var=False
            )

            first, second = sorted([group1, group2])
            final_results["group1"].append(first)
            final_results["group2"].append(second)
            final_results["stat"].append(stat)
            final_results["p_val"].append(pair_p)
            # Bonferroni correction, capped at 1.
            adj = pair_p * len(pairs)
            final_results["adj_p_val"].append(min(adj, 1))

        return anova_p, final_results

    def chi2_percentiles(self, input_hist):
        """
        Perform a Chi-squared test on percentile-based group counts.

        The per-group 'n' vectors are reformatted into a contingency table
        (groups as rows, bins 'p1'..'pN' as columns) and tested for
        association.

        Parameters
        ----------
        input_hist : dict of pd.DataFrame
            Mapping group name -> data containing a column 'n' of bin counts.

        Returns
        -------
        chi2_statistic : float
            Chi-squared test statistic.

        p_value : float
            P-value from the Chi-squared test.

        dof : int
            Degrees of freedom.

        expected : np.ndarray
            Expected frequencies under the null hypothesis.

        chi_data : dict
            Contingency data used in the test (group -> {bin: count}).

        Example
        -------
        chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist)
        print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}")
        """

        chi_data = {}

        for d in input_hist.keys():
            chi_data[d] = {f"p{n+1}": c for n, c in enumerate(input_hist[d]["n"])}

        chi2_statistic, p_value, dof, expected = chi2_contingency(
            pd.DataFrame(chi_data).T, correction=True
        )

        return chi2_statistic, p_value, dof, expected, chi_data

    def post_ch2_percentiles(self, input_hist):
        """
        Overall Chi-squared test plus pairwise group comparisons.

        Pairwise p-values are Bonferroni-adjusted (multiplied by the number of
        comparisons and capped at 1).

        Parameters
        ----------
        input_hist : dict of pd.DataFrame
            Mapping group name -> data containing a column 'n' of bin counts.

        Returns
        -------
        p_val : float
            Overall p-value from the Chi-squared test across all groups.

        final_results : dict
            Pairwise results with keys 'group1', 'group2', 'chi2', 'p_val'
            and 'adj_p_val'.

        Example
        -------
        p_val, final_results = self.post_ch2_percentiles(input_hist)
        print(f"Overall Chi-squared p-value: {p_val}")
        """

        res = self.chi2_percentiles(input_hist)

        pairs = list(combinations(res[4], 2))
        results = []

        for group1, group2 in pairs:
            table_pair = pd.DataFrame(res[4])[[group1, group2]]
            chi2_stat, p_val, _, _ = chi2_contingency(table_pair, correction=True)
            results.append((group1, group2, chi2_stat, p_val))

        final_results = {
            "group1": [],
            "group2": [],
            "chi2": [],
            "p_val": [],
            "adj_p_val": [],
        }

        for group1, group2, chi2_stat, p_val in results:
            first, second = sorted([group1, group2])
            final_results["group1"].append(first)
            final_results["group2"].append(second)
            final_results["chi2"].append(chi2_stat)
            final_results["p_val"].append(p_val)
            # Bonferroni correction, capped at 1.
            adj = p_val * len(results)
            final_results["adj_p_val"].append(min(adj, 1))

        return res[1], final_results

    def _sig_marker(self, p):
        """Return the significance marker ('***', '**', '*' or 'ns') for p-value `p`."""
        if p < 0.001:
            return "***"
        if p < 0.01:
            return "**"
        if p < 0.05:
            return "*"
        return "ns"

    def _sorted_pairs(self, pairwise_results, queue):
        """Convert pairwise test results to a DataFrame ordered by the plot queue."""
        df = pd.DataFrame(pairwise_results)
        return df.sort_values(
            by=["group1", "group2"],
            key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
        ).reset_index(drop=True)

    def _stats_text(self, test_name, overall_p, pairwise_df, p_adj):
        """Build the annotation text (overall + pairwise p-values) for one test."""
        col = "adj_p_val" if p_adj else "p_val"

        sign = self._sig_marker(float(self.round_to_scientific_notation(overall_p)))
        text = f"Test {test_name}\np-value: {self.round_to_scientific_notation(overall_p)} - {sign}\n"

        for i in range(len(pairwise_df["group1"])):
            sign = self._sig_marker(pairwise_df[col][i])
            text += f"{pairwise_df['group1'][i]} vs. {pairwise_df['group2'][i]}\np-value: {self.round_to_scientific_notation(pairwise_df[col][i])} - {sign}\n"

        return text

    def hist_compare_plot(
        self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20
    ):
        """
        Generate comparative histograms annotated with ANOVA and Chi-squared results.

        The tested values are Box-Cox transformed (after a +1 shift), plotted
        as three rows of histograms per group (standardized frequency, tested
        value, and their product), and annotated with Welch's ANOVA and
        Chi-squared test results in an extra column.

        Parameters
        ----------
        data : dict of pd.DataFrame
            Mapping group name -> histogram data containing `tested_value`
            and 'n_standarized'. NOTE: `tested_value` is Box-Cox transformed
            **in place** in the caller's dict (original behavior kept).

        queue : list of str
            Plotting order of the groups; falls back to sorted group names if
            it does not match the data keys.

        tested_value : str
            Column of `data` to test and visualize.

        p_adj : bool, optional
            If True (default), annotate Bonferroni-adjusted pairwise p-values.

        txt_size : int, optional
            Font size for plot annotations (default 20).

        Returns
        -------
        fig : matplotlib.figure.Figure
            Figure with the histograms and statistical annotations.

        Example
        -------
        fig = self.hist_compare_plot(
            data,
            queue=['group1', 'group2', 'group3'],
            tested_value='n',
            p_adj=True,
            txt_size=18
        )
        plt.show()
        """

        # Box-Cox requires strictly positive input, hence the +1 shift.
        for key in data.keys():
            shifted = np.array(data[key][tested_value]) + 1
            transformed_values, _fitted_lambda = stats.boxcox(shifted)
            data[key][tested_value] = transformed_values.tolist()

        if sorted(queue) != sorted(data.keys()):
            print(
                "\n Wrong queue provided! The queue will be sorted with default settings!"
            )
            queue = sorted(data.keys())

        # Parametric test on the selected value.
        pk, dfk = self.post_aov_percentiles(data, testes_col=tested_value)
        dfk = self._sorted_pairs(dfk, queue)

        # Parametric test on the standardized selected value.
        pkc, dfkc = self.post_aov_percentiles(
            data, testes_col=[tested_value, "n_standarized"], comb="*"
        )
        dfkc = self._sorted_pairs(dfkc, queue)

        # Chi-squared test on the bin counts.
        pchi, dfchi = self.post_ch2_percentiles(data)
        dfchi = self._sorted_pairs(dfchi, queue)

        ##############################################################################

        # Shared axis limits across groups so rows are visually comparable.
        standarized_max, standarized_min, value_max, value_min = [], [], [], []
        for name in queue:
            standarized_max.append(max(data[name]["n_standarized"]))
            standarized_min.append(min(data[name]["n_standarized"]))
            value_max.append(max(data[name][tested_value]))
            value_min.append(min(data[name][tested_value]))

        num_columns = len(queue) + 1

        fig, axs = plt.subplots(
            3,
            num_columns,
            figsize=(8 * num_columns, 10),
            gridspec_kw={"width_ratios": [1] * len(queue) + [0.5], "wspace": 0.05},
        )

        for i, name in enumerate(queue):
            tmp_data = data[name]

            # Row 0: standardized frequency per percentile bin.
            axs[0, i].bar(
                [str(n) for n in range(len(tmp_data["n_standarized"]))],
                tmp_data["n_standarized"],
                width=0.95,
                color="gold",
            )
            axs[0, i].set_ylim(
                min(standarized_min) * 0.9995, max(standarized_max) * 1.0005
            )

            if i == 0:
                axs[0, i].set_ylabel("Standarized\nfrequency", fontsize=txt_size)
            else:
                axs[0, i].set_yticks([])

            axs[0, i].set_xticks([])
            axs[0, i].tick_params(axis="y", labelsize=txt_size * 0.7)

            # Row 1: transformed tested value per bin, with its mean.
            axs[1, i].bar(
                [str(n) for n in range(len(tmp_data[tested_value]))],
                tmp_data[tested_value],
                width=0.95,
                color="orange",
            )

            axs[1, i].axhline(
                y=np.mean(tmp_data[tested_value]), color="red", linestyle="--"
            )
            axs[1, i].set_ylim(min(value_min) * 0.9995, max(value_max) * 1.0005)

            if i == 0:
                axs[1, i].set_ylabel(f"Normalized\n{tested_value}", fontsize=txt_size)
            else:
                axs[1, i].set_yticks([])

            axs[1, i].set_xticks([])
            axs[1, i].tick_params(axis="y", labelsize=txt_size * 0.7)

            # Row 2: tested value weighted by standardized frequency.
            weighted = [
                a * b
                for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
            ]

            axs[2, i].bar(
                [str(n) for n in range(len(tmp_data["n_standarized"]))],
                weighted,
                width=0.95,
                color="goldenrod",
            )

            axs[2, i].axhline(y=np.mean(weighted), color="red", linestyle="--")

            axs[2, i].set_ylim(
                (min(standarized_min) * min(value_min)) * 0.9995,
                (max(standarized_max) * max(value_max) * 1.0005),
            )
            axs[2, i].set_xlabel(name, fontsize=txt_size)

            if i == 0:
                axs[2, i].set_ylabel(
                    f"Standarized\nnorm_{tested_value}", fontsize=txt_size
                )
            else:
                axs[2, i].set_yticks([])

            axs[2, i].set_xticks([])
            axs[2, i].tick_params(axis="y", labelsize=txt_size * 0.7)

        # Annotation column: row 1 <- ANOVA, row 2 <- standardized ANOVA, row 0 <- chi2.
        axs[1, -1].text(
            0.5,
            0.5,
            self._stats_text("Welch's ANOVA", pk, dfk, p_adj),
            ha="center",
            va="center",
            fontsize=txt_size * 0.7,
            wrap=True,
        )
        axs[1, -1].set_axis_off()

        axs[2, -1].text(
            0.5,
            0.5,
            self._stats_text("Welch's ANOVA", pkc, dfkc, p_adj),
            ha="center",
            va="center",
            fontsize=txt_size * 0.7,
            wrap=True,
        )
        axs[2, -1].set_axis_off()

        axs[0, -1].text(
            0.5,
            0.5,
            self._stats_text("Chi-squared", pchi, dfchi, p_adj),
            ha="center",
            va="center",
            fontsize=txt_size * 0.7,
            wrap=True,
        )
        axs[0, -1].set_axis_off()

        plt.tight_layout()

        if cfg._DISPLAY_MODE:
            plt.show()

        return fig
Class for performing percentile-based statistical analysis on grouped data.
This class provides methods to calculate percentiles, remove outliers, aggregate data into percentile bins, perform Welch's ANOVA and Chi-squared tests, and visualize results via comparative histograms. It is designed to handle both single-column and multi-column combinations of values for group-based analysis.
Methods
drop_up_df(data, group_col, values_col) Removes upper outliers from a DataFrame based on a grouping column.
percentiles_calculation(values, sep_perc=1) Calculates percentiles and creates loopable percentile ranges.
to_percentil(values, percentiles, percentiles_loop) Aggregates statistics based on percentile ranges.
df_to_percentiles(data, group_col, values_col, sep_perc=1, drop_outlires=True) Computes percentile statistics for grouped DataFrame data.
round_to_scientific_notation(num) Formats a number in scientific notation or standard format.
aov_percentiles(data, testes_col, comb="*") Performs Welch's ANOVA on percentile-based group data.
post_aov_percentiles(data, testes_col, comb="*") Performs Welch's ANOVA with pairwise t-tests.
chi2_percentiles(input_hist) Performs Chi-squared test on percentile histogram data.
post_ch2_percentiles(input_hist) Performs Chi-squared test with pairwise comparisons.
hist_compare_plot(data, queue, tested_value, p_adj=True, txt_size=20) Generates comparative histograms with statistical test results.
997 def drop_up_df(self, data: pd.DataFrame, group_col: str, values_col: str): 998 """ 999 Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column. 1000 1001 Outliers are calculated and removed separately for each group defined by `group_col`. 1002 The upper outliers are defined using the interquartile range (IQR) method: 1003 values greater than Q3 + 1.5 * IQR are considered outliers. 1004 1005 Parameters 1006 ---------- 1007 data : pd.DataFrame 1008 The input DataFrame containing the data. 1009 1010 group_col : str 1011 The name of the column used for grouping the data. 1012 1013 values_col : str 1014 The column containing the values from which upper outliers will be removed. 1015 1016 Returns 1017 ------- 1018 filtered_data : pd.DataFrame 1019 A filtered DataFrame with the upper outliers removed for each group. 1020 1021 Notes 1022 ----- 1023 - Outliers are removed separately within each group. 1024 - The original DataFrame is not modified; a new filtered DataFrame is returned. 1025 """ 1026 1027 def iqr_filter(group): 1028 q75 = np.quantile(group[values_col], 0.75) 1029 q25 = np.quantile(group[values_col], 0.25) 1030 itq = q75 - q25 1031 return group[group[values_col] <= (q75 + 1.5 * itq)] 1032 1033 filtered_data = data.groupby(group_col).apply(iqr_filter).reset_index(drop=True) 1034 1035 return filtered_data
Remove upper outliers from a DataFrame based on a specified value column, grouped by a grouping column.
Outliers are calculated and removed separately for each group defined by group_col.
The upper outliers are defined using the interquartile range (IQR) method:
values greater than Q3 + 1.5 * IQR are considered outliers.
Parameters
data : pd.DataFrame The input DataFrame containing the data.
group_col : str The name of the column used for grouping the data.
values_col : str The column containing the values from which upper outliers will be removed.
Returns
filtered_data : pd.DataFrame A filtered DataFrame with the upper outliers removed for each group.
Notes
- Outliers are removed separately within each group.
- The original DataFrame is not modified; a new filtered DataFrame is returned.
1037 def percentiles_calculation(self, values, sep_perc: int = 1): 1038 """ 1039 Calculate percentiles for a set of values and generate consecutive percentile ranges. 1040 1041 This function computes percentiles from 0 to 100 at intervals defined by `sep_perc`. 1042 It also generates a list of consecutive percentile ranges that can be used for further analysis or binning. 1043 1044 Parameters 1045 ---------- 1046 values : array-like 1047 The input data values for which the percentiles are calculated. 1048 1049 sep_perc : int, optional 1050 Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%). 1051 1052 Returns 1053 ------- 1054 percentiles : np.ndarray 1055 Array of calculated percentile values. 1056 1057 percentiles_loop : list of tuple 1058 List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)]. 1059 1060 Notes 1061 ----- 1062 - The first percentile is set to 0 to avoid issues with zero values. 1063 - `percentiles_loop` is useful for iterating through percentile ranges when aggregating statistics. 1064 """ 1065 1066 per_vector = values.copy() 1067 1068 percentiles = np.percentile(per_vector, np.arange(0, 101, sep_perc)) 1069 percentiles[0] = 0 1070 1071 percentiles_loop = [(i, i + 1) for i in range(int(100 / sep_perc))] 1072 1073 return percentiles, percentiles_loop
Calculate percentiles for a set of values and generate consecutive percentile ranges.
This function computes percentiles from 0 to 100 at intervals defined by sep_perc.
It also generates a list of consecutive percentile ranges that can be used for further analysis or binning.
Parameters
values : array-like The input data values for which the percentiles are calculated.
sep_perc : int, optional Separation interval between percentiles (default is 1, meaning percentiles are calculated every 1%).
Returns
percentiles : np.ndarray Array of calculated percentile values.
percentiles_loop : list of tuple List of consecutive percentile ranges as tuples, e.g., [(0, 1), (1, 2), ..., (99, 100)].
Notes
- The first percentile is set to 0 to avoid issues with zero values.
`percentiles_loop` is useful for iterating through percentile ranges when aggregating statistics.
def to_percentil(self, values, percentiles, percentiles_loop):
    """
    Aggregate summary statistics of `values` within each percentile range.

    For every (lo, hi) index pair in `percentiles_loop`, the subset of
    `values` strictly greater than ``percentiles[lo]`` and at most
    ``percentiles[hi]`` is selected and summarised.

    Parameters
    ----------
    values : array-like
        Input data values for which the statistics are calculated.

    percentiles : np.ndarray
        Percentile cut points (as produced by `percentiles_calculation`).

    percentiles_loop : list of tuple
        Consecutive index pairs into `percentiles`.

    Returns
    -------
    data : dict
        Keys 'n', 'n_standarized', 'avg', 'median', 'std', 'var'; each maps
        to a list with one entry per percentile range.

    Notes
    -----
    - Empty ranges get count 1 (not 0) and all statistics set to 0,
      mirroring the previous behaviour and avoiding empty lists downstream.
    - Improvement over the previous version: the boolean-mask subset is
      computed once per bin instead of six times, and plain lists are
      accepted via `np.asarray`.
    """

    # asarray generalizes input to lists/Series without copying ndarrays;
    # the vector is never mutated here.
    per_vector = np.asarray(values)

    data = {
        "n": [],
        "n_standarized": [],
        "avg": [],
        "median": [],
        "std": [],
        "var": [],
    }

    amount = len(per_vector)

    for lo, hi in percentiles_loop:
        # Select the bin once instead of recomputing the mask per statistic.
        subset = per_vector[
            (per_vector > percentiles[lo]) & (per_vector <= percentiles[hi])
        ]
        if len(subset) > 0:
            data["n"].append(len(subset))
            data["n_standarized"].append(len(subset) / amount)
            data["avg"].append(np.mean(subset))
            data["std"].append(np.std(subset))
            data["median"].append(np.median(subset))
            data["var"].append(np.var(subset))
        else:
            # Placeholder stats for empty bins (count 1 avoids empty lists).
            data["n"].append(1)
            data["n_standarized"].append(0)
            data["avg"].append(0)
            data["std"].append(0)
            data["median"].append(0)
            data["var"].append(0)

    return data
Aggregate statistics for a set of values based on percentile ranges.
This function calculates summary statistics for each percentile range defined in percentiles_loop,
using the percentile values calculated by percentiles_calculation(). Statistics include count, proportion,
mean, median, standard deviation, and variance.
Parameters
values : array-like Input data values for which the statistics are calculated.
percentiles : np.ndarray Array of percentile values used to define the ranges.
percentiles_loop : list of tuple List of consecutive percentile ranges, e.g., [(0, 1), (1, 2), ..., (99, 100)].
Returns
data : dict Dictionary containing the following keys: - 'n' : list Number of elements in each percentile range.
- 'n_standarized' : list
Proportion of elements in each percentile range relative to the total number of elements.
- 'avg' : list
Mean value of elements within each percentile range.
- 'median' : list
Median value of elements within each percentile range.
- 'std' : list
Standard deviation of elements within each percentile range.
- 'var' : list
Variance of elements within each percentile range.
Notes
- If a percentile range contains no elements, statistics are set to 0 and count is set to 1 to avoid empty lists.
def df_to_percentiles(
    self,
    data: pd.DataFrame,
    group_col: str,
    values_col: str,
    sep_perc: int = 1,
    drop_outlires: bool = True,
):
    """
    Calculate percentile-range summary statistics for each group in a DataFrame.

    Groups `data` by `group_col`, computes percentile cut points on the
    pooled `values_col`, then aggregates per-bin statistics for each group.
    Optionally removes upper (IQR) outliers first.

    Parameters
    ----------
    data : pd.DataFrame
        Input DataFrame containing the data.

    group_col : str
        Column name used to define groups.

    values_col : str
        Column name containing the values for percentile calculations.

    sep_perc : int, optional
        Separation interval for percentiles (default 1, i.e. every 1%).

    drop_outlires : bool, optional
        If True, removes upper outliers (IQR method, per group) before the
        calculation (default True). Parameter name keeps the historical
        spelling for backward compatibility.

    Returns
    -------
    full_data : dict
        Group name -> dict with keys 'n', 'n_standarized', 'avg', 'median',
        'std', 'var' (lists, one entry per percentile range).
    """

    full_data = {}

    # Optionally strip upper (IQR) outliers per group before binning.
    if drop_outlires:
        data = self.drop_up_df(data=data, group_col=group_col, values_col=values_col)

    groups = set(data[group_col])

    # Cut points are computed on the pooled values so every group is
    # binned against the same percentile ranges.
    percentiles, percentiles_loop = self.percentiles_calculation(
        data[values_col], sep_perc=sep_perc
    )

    for g in groups:
        print(f"Group: {g} ...")

        tmp_values = data[values_col][data[group_col] == g]
        full_data[g] = self.to_percentil(tmp_values, percentiles, percentiles_loop)

    return full_data
Calculate summary statistics based on percentile ranges for each group in a DataFrame.
This method groups the input DataFrame by group_col, computes percentile ranges for each group's values
in values_col, and aggregates statistics (count, proportion, mean, median, standard deviation, variance)
for each percentile range. Optionally, upper outliers can be removed before calculation.
Parameters
data : pd.DataFrame Input DataFrame containing the data.
group_col : str Column name used to define groups.
values_col : str Column name containing the values for percentile calculations.
sep_perc : int, optional Separation interval for percentiles (default is 1, meaning percentiles are calculated at every 1%).
drop_outlires : bool, optional If True, removes upper outliers from the data before performing calculations (default is True).
Returns
full_data : dict Dictionary where each key is a group name and each value is a dictionary containing: - 'n' : list Number of elements in each percentile range.
- 'n_standarized' : list
Proportion of elements in each percentile range relative to the total number of elements.
- 'avg' : list
Mean value of elements within each percentile range.
- 'median' : list
Median value of elements within each percentile range.
- 'std' : list
Standard deviation of elements within each percentile range.
- 'var' : list
Variance of elements within each percentile range.
Notes
- Outlier removal uses the IQR method within each group if `drop_outlires` is True.
def round_to_scientific_notation(self, num):
    """
    Format a number: scientific notation when very small, else one decimal.

    Parameters
    ----------
    num : float
        The number to format.

    Returns
    -------
    str
        "0.0" when `num` is 0; scientific notation with 1 decimal and a
        1-digit exponent when ``abs(num) < 1e-4``; otherwise the number
        rounded to one decimal place.
    """

    if num == 0:
        return "0.0"

    # Very small magnitudes would render as "0.0" with fixed-point
    # formatting, so switch to scientific notation.
    if abs(num) < 0.0001:
        return np.format_float_scientific(num, precision=1, exp_digits=1)

    return f"{num:.1f}"
Round a number to scientific notation if very small, otherwise to one decimal place.
Parameters
num : float The number to round.
Returns
str
The rounded number as a string.
- If num is 0, returns "0.0".
- If abs(num) < 1e-4, returns scientific notation with 1 decimal and 1-digit exponent.
- Otherwise, returns the number rounded to one decimal place.
def aov_percentiles(self, data, testes_col, comb: str = "*"):
    """
    Run Welch's ANOVA across groups built from percentile data.

    Group vectors are taken directly from `testes_col` when it is a single
    column name, or combined element-wise across several columns using the
    `comb` operation. Welch's ANOVA is used as it tolerates unequal group
    variances.

    Parameters
    ----------
    data : dict of pd.DataFrame
        Group name -> DataFrame containing the percentile statistics.

    testes_col : str or list of str
        Column name(s) supplying the group values.

    comb : str, optional
        One of '*', '+', '**', '-', '/' for combining multiple columns.
        Default is '*'.

    Returns
    -------
    F : float
        F-statistic from Welch's ANOVA.

    p_val : float
        Uncorrected p-value.

    Notes
    -----
    - NOTE(review): when `testes_col` is a list, combination starts from a
      vector of ones, so '+' yields 1 + sum and '**' always yields 1 —
      confirm this seeding is intended.
    - An unrecognised `comb` leaves the ones vector untouched (historical
      behaviour, preserved here).
    """

    # Element-wise combination operations for multi-column group values.
    pairwise_ops = {
        "*": lambda a, b: a * b,
        "+": lambda a, b: a + b,
        "**": lambda a, b: a**b,
        "-": lambda a, b: a - b,
        "/": lambda a, b: a / b,
    }

    groups = []
    for name in data.keys():
        frame = data[name]
        if isinstance(testes_col, str):
            combined = frame[testes_col]
        elif isinstance(testes_col, list):
            combined = [1] * len(frame[testes_col[0]])
            op = pairwise_ops.get(comb)
            if op is not None:
                for col in testes_col:
                    combined = [op(a, b) for a, b in zip(combined, frame[col])]
        groups.append(combined)

    df = pd.DataFrame({f"group_{i}": grp for i, grp in enumerate(groups)})

    # Long format so the ANOVA can see all groups in a single column.
    df_melted = df.melt(var_name="group", value_name="value")

    welch = pg.welch_anova(data=df_melted, dv="value", between="group")

    return welch["F"].values[0], welch["p-unc"].values[0]
Perform Welch's ANOVA on percentile-based group data.
This method calculates group values by combining the columns specified in testes_col
according to the operation defined in comb, then performs Welch's ANOVA to test for
differences in means between the groups. Welch's ANOVA is suitable when the groups
have unequal variances.
Parameters
data : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing the data.
testes_col : str or list of str
Column name(s) from which the group values are derived. If a list is provided, columns
will be combined based on the comb operation.
comb : str, optional
Operation used to combine multiple columns if testes_col is a list. Options include:
'*' : multiplication
'+' : addition
'**' : exponentiation
'-' : subtraction
'/' : division
Default is '*'.
Returns
F : float F-statistic from Welch's ANOVA.
p_val : float Uncorrected p-value from Welch's ANOVA, testing for significant differences between groups.
Notes
- If `testes_col` is a single string, no combination is performed, and the group values are taken directly from that column.
- Welch's ANOVA is used as it accounts for unequal variances between groups.
- The `df.melt()` method is used to reshape the data, allowing the ANOVA to be applied to all groups.
Examples
>>> welch_F, welch_p = self.aov_percentiles(data, testes_col=['col1', 'col2'], comb='+')
>>> print(f"Welch's ANOVA F-statistic: {welch_F}, p-value: {welch_p}")
def post_aov_percentiles(self, data, testes_col, comb: str = "*"):
    """
    Welch's ANOVA plus pairwise Welch's t-tests with Bonferroni correction.

    First performs Welch's ANOVA across all groups, then runs pairwise
    Welch's t-tests for every pair of groups; pairwise p-values are
    Bonferroni-adjusted (capped at 1).

    Parameters
    ----------
    data : dict of pd.DataFrame
        Group name -> DataFrame containing the percentile statistics.

    testes_col : str or list of str
        Column name(s) supplying group values; a list is combined
        element-wise with `comb`.

    comb : str, optional
        One of '*', '+', '**', '-', '/'. Default is '*'.

    Returns
    -------
    p_val : float
        Uncorrected p-value of the overall Welch's ANOVA.

    final_results : dict
        Pairwise results with keys 'group1', 'group2', 'stat', 'p_val',
        'adj_p_val' (Bonferroni-adjusted, capped at 1).

    Notes
    -----
    - Bug fix: previously the loop variable holding each pairwise t-test
      p-value shadowed the ANOVA p-value, so the returned "ANOVA" p-value
      was actually the last pairwise one. They are now kept separate.
    - The duplicated g1/g2 combination code was factored into one helper.
    """

    def _group_values(frame):
        # Build the tested vector from one column, or combine several
        # columns element-wise according to `comb`.
        if isinstance(testes_col, str):
            return frame[testes_col]
        g = [1] * len(frame[testes_col[0]])
        for col in testes_col:
            if comb == "*":
                g = [a * b for a, b in zip(g, frame[col])]
            elif comb == "+":
                g = [a + b for a, b in zip(g, frame[col])]
            elif comb == "**":
                g = [a**b for a, b in zip(g, frame[col])]
            elif comb == "-":
                g = [a - b for a, b in zip(g, frame[col])]
            elif comb == "/":
                g = [a / b for a, b in zip(g, frame[col])]
        return g

    # Overall ANOVA p-value, kept distinct from pairwise p-values.
    anova_p_val = self.aov_percentiles(data=data, testes_col=testes_col, comb=comb)[1]

    pairs = list(combinations(data, 2))
    final_results = {
        "group1": [],
        "group2": [],
        "stat": [],
        "p_val": [],
        "adj_p_val": [],
    }

    for group1, group2 in pairs:
        g1 = _group_values(data[group1])
        g2 = _group_values(data[group2])

        stat, pair_p = stats.ttest_ind(
            g1, g2, alternative="two-sided", equal_var=False
        )

        # NOTE(review): group names are sorted for stable output, but the
        # t statistic keeps the sign of the original (group1, group2)
        # order — confirm downstream consumers rely only on |stat|.
        g = sorted([group1, group2])
        final_results["group1"].append(g[0])
        final_results["group2"].append(g[1])
        final_results["stat"].append(stat)
        final_results["p_val"].append(pair_p)
        # Bonferroni adjustment, capped at 1.
        adj = pair_p * len(pairs)
        if adj > 1:
            final_results["adj_p_val"].append(1)
        else:
            final_results["adj_p_val"].append(adj)

    return anova_p_val, final_results
Perform Welch's ANOVA on percentile-based group data and pairwise Welch's t-tests.
This method first performs Welch's ANOVA to assess differences in group means, and then conducts pairwise Welch's t-tests between all group combinations. P-values are adjusted using the Bonferroni correction for multiple comparisons.
Parameters
data : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing the data.
testes_col : str or list of str
Column name(s) from which the group values are derived. If a list is provided,
columns will be combined according to the comb operation.
comb : str, optional
Operation used to combine multiple columns if testes_col is a list. Options include:
'*' : multiplication
'+' : addition
'**' : exponentiation
'-' : subtraction
'/' : division
Default is '*'.
Returns
p_val : float Uncorrected p-value from the Welch's ANOVA.
final_results : dict Dictionary containing results of pairwise Welch's t-tests with keys: 'group1' : list of first group names in each comparison 'group2' : list of second group names in each comparison 'stat' : list of t-statistics for each comparison 'p_val' : list of uncorrected p-values for each comparison 'adj_p_val' : list of Bonferroni-adjusted p-values for multiple comparisons
def chi2_percentiles(self, input_hist):
    """
    Chi-squared test of association on percentile bin counts.

    Reformats the per-group histogram counts into a contingency table and
    runs a Chi-squared test (with Yates continuity correction).

    Parameters
    ----------
    input_hist : dict of pd.DataFrame
        Group name -> DataFrame with an 'n' column of per-bin counts.

    Returns
    -------
    chi2_statistic : float
        Chi-squared test statistic.

    p_value : float
        P-value from the Chi-squared test.

    dof : int
        Degrees of freedom for the test.

    expected : np.ndarray
        Expected frequencies under the null hypothesis.

    chi_data : dict
        Group name -> {'p1': count, 'p2': count, ...} contingency data.
    """

    # Reformat bin counts into a group -> {p1: count, ...} mapping.
    chi_data = {
        group: {f"p{idx + 1}": count for idx, count in enumerate(hist["n"])}
        for group, hist in input_hist.items()
    }

    # Rows = groups, columns = percentile bins.
    contingency = pd.DataFrame(chi_data).T
    chi2_statistic, p_value, dof, expected = chi2_contingency(
        contingency, correction=True
    )

    return chi2_statistic, p_value, dof, expected, chi_data
Perform a Chi-squared test on percentile-based group data.
This method reformats the input histogram data into a contingency table and performs a Chi-squared test to evaluate whether there is a significant association between groups.
Parameters
input_hist : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing histogram data. The DataFrame must include a column 'n' representing counts for each percentile/bin.
Returns
chi2_statistic : float Chi-squared test statistic.
p_value : float P-value from the Chi-squared test.
dof : int Degrees of freedom for the test.
expected : np.ndarray Expected frequencies for each group/bin under the null hypothesis.
chi_data : dict Formatted data used in the Chi-squared test, with group names as keys and bin counts as values.
Example
chi2_stat, p_val, dof, expected, chi_data = self.chi2_percentiles(input_hist) print(f"Chi-squared statistic: {chi2_stat}, p-value: {p_val}")
def post_ch2_percentiles(self, input_hist):
    """
    Overall Chi-squared test plus pairwise Chi-squared comparisons.

    Runs a Chi-squared test across all groups, then tests every pair of
    groups; pairwise p-values are Bonferroni-adjusted (capped at 1).

    Parameters
    ----------
    input_hist : dict of pd.DataFrame
        Group name -> DataFrame with an 'n' column of per-bin counts.

    Returns
    -------
    p_val : float
        P-value of the all-groups Chi-squared test.

    final_results : dict
        Pairwise results with keys 'group1', 'group2', 'chi2', 'p_val',
        'adj_p_val' (Bonferroni-adjusted, capped at 1).
    """

    overall = self.chi2_percentiles(input_hist)
    chi_data = overall[4]

    group_pairs = list(combinations(chi_data, 2))
    n_tests = len(group_pairs)

    final_results = {
        "group1": [],
        "group2": [],
        "chi2": [],
        "p_val": [],
        "adj_p_val": [],
    }

    # Build the full contingency frame once; each pairwise test uses a
    # two-column slice of it.
    contingency = pd.DataFrame(chi_data)

    for first, second in group_pairs:
        chi2_stat, pair_p, _, _ = chi2_contingency(
            contingency[[first, second]], correction=True
        )

        low, high = sorted([first, second])
        final_results["group1"].append(low)
        final_results["group2"].append(high)
        final_results["chi2"].append(chi2_stat)
        final_results["p_val"].append(pair_p)
        # Bonferroni adjustment, capped at 1.
        adjusted = pair_p * n_tests
        final_results["adj_p_val"].append(1 if adjusted > 1 else adjusted)

    return overall[1], final_results
Perform a Chi-squared test on percentile-based group data, including pairwise comparisons.
This method first performs a Chi-squared test across all groups to check for a significant association. It then performs pairwise Chi-squared tests between groups to identify specific differences. P-values for multiple comparisons are adjusted using the Bonferroni correction.
Parameters
input_hist : dict of pd.DataFrame Dictionary where keys are group names and values are DataFrames containing histogram data. Each DataFrame must include a column 'n' with counts for each percentile/bin.
Returns
p_val : float Overall p-value from the initial Chi-squared test across all groups.
final_results : dict Results of pairwise Chi-squared tests, with keys: - 'group1' (list): Name of the first group in each comparison - 'group2' (list): Name of the second group in each comparison - 'chi2' (list): Chi-squared statistic for each pairwise comparison - 'p_val' (list): P-value for each pairwise comparison - 'adj_p_val' (list): Adjusted p-value (Bonferroni correction) for multiple comparisons
Example
p_val, final_results = self.post_ch2_percentiles(input_hist) print(f"Overall Chi-squared p-value: {p_val}") for i in range(len(final_results['group1'])): print(f"Comparison: {final_results['group1'][i]} vs {final_results['group2'][i]}") print(f"Chi2 stat: {final_results['chi2'][i]}, p-value: {final_results['p_val'][i]}, adj. p-value: {final_results['adj_p_val'][i]}")
def hist_compare_plot(
    self, data, queue, tested_value, p_adj: bool = True, txt_size: int = 20
):
    """
    Generate comparative histograms and annotate statistical test results.

    Box-Cox-transforms the tested values, draws three rows of bar charts
    per group (standardized frequency, normalized tested value, and their
    product) and annotates Welch's ANOVA and Chi-squared test results in a
    final text column.

    Parameters
    ----------
    data : dict of pd.DataFrame
        Group name -> DataFrame of histogram data; must include the
        `tested_value` column and 'n_standarized'.

    queue : list of str
        Plotting order of the groups. If it does not match the data keys,
        the sorted key order is used instead (with a printed warning).

    tested_value : str
        Column of `data` to test and visualize.

    p_adj : bool, optional
        If True, display Bonferroni-adjusted p-values (default True).

    txt_size : int, optional
        Font size for plot annotations (default 20).

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure containing the histograms and statistical annotations.

    Notes
    -----
    - WARNING: `data` is mutated in place — each group's `tested_value`
      column is replaced by its Box-Cox transform (computed on values + 1).
    - NOTE(review): p-value significance thresholds are applied to the
      *rounded* overall p-values (via `round_to_scientific_notation`) but
      to the raw pairwise p-values — confirm this asymmetry is intended.
    """

    # Box-Cox-transform each group's tested values in place; +1 guards
    # against zeros (Box-Cox requires strictly positive input).
    for i in data.keys():
        values = np.array(data[i][tested_value])
        values += 1
        transformed_values, fitted_lambda = stats.boxcox(values)
        data[i][tested_value] = transformed_values.tolist()

    # Fall back to sorted key order when the provided queue is invalid.
    if sorted(queue) != sorted(data.keys()):
        print(
            "\n Wrong queue provided! The queue will be sorted with default settings!"
        )
        queue = sorted(data.keys())

    # Welch's ANOVA + pairwise tests on the selected value.
    pk, dfk = self.post_aov_percentiles(data, testes_col=tested_value)

    dfk = pd.DataFrame(dfk)

    # Sort pairwise results to follow the plotting queue.
    dfk = dfk.sort_values(
        by=["group1", "group2"],
        key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
    ).reset_index(drop=True)

    # Welch's ANOVA + pairwise tests on value * standardized frequency.
    pkc, dfkc = self.post_aov_percentiles(
        data, testes_col=[tested_value, "n_standarized"], comb="*"
    )

    dfkc = pd.DataFrame(dfkc)

    dfkc = dfkc.sort_values(
        by=["group1", "group2"],
        key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
    ).reset_index(drop=True)

    # Chi-squared test (overall + pairwise) on the bin counts.
    pchi, dfchi = self.post_ch2_percentiles(data)

    dfchi = pd.DataFrame(dfchi)

    dfchi = dfchi.sort_values(
        by=["group1", "group2"],
        key=lambda col: [queue.index(val) if val in queue else -1 for val in col],
    ).reset_index(drop=True)

    ##############################################################################

    # Shared y-limits across groups so the panels are comparable.
    standarized_max, standarized_min, value_max, value_min = [], [], [], []
    for d in queue:
        standarized_max.append(max(data[d]["n_standarized"]))
        standarized_min.append(min(data[d]["n_standarized"]))
        value_max.append(max(data[d][tested_value]))
        value_min.append(min(data[d][tested_value]))

    # One column per group plus a narrower column for the test summaries.
    num_columns = len(queue) + 1

    fig, axs = plt.subplots(
        3,
        num_columns,
        figsize=(8 * num_columns, 10),
        gridspec_kw={"width_ratios": [1] * len(queue) + [0.5], "wspace": 0.05},
    )

    for i, d in enumerate(queue):
        tmp_data = data[d]

        # Row 0: standardized frequency per percentile bin.
        axs[0, i].bar(
            [str(n) for n in range(len(tmp_data["n_standarized"]))],
            tmp_data["n_standarized"],
            width=0.95,
            color="gold",
        )
        axs[0, i].set_ylim(
            min(standarized_min) * 0.9995, max(standarized_max) * 1.0005
        )

        # Only the left-most column keeps its y-axis labelling.
        if i == 0:
            axs[0, i].set_ylabel("Standarized\nfrequency", fontsize=txt_size)
        else:
            axs[0, i].set_yticks([])

        axs[0, i].set_xticks([])
        axs[0, i].tick_params(axis="y", labelsize=txt_size * 0.7)

        # Row 1: Box-Cox-normalized tested value per bin.
        axs[1, i].bar(
            [str(n) for n in range(len(tmp_data[tested_value]))],
            tmp_data[tested_value],
            width=0.95,
            color="orange",
        )

        # Dashed red line marks the mean of the plotted values.
        mean_value = np.mean(tmp_data[tested_value])
        axs[1, i].axhline(y=mean_value, color="red", linestyle="--")

        # axs[1, i].set_ylim(min(value_min) - 1, max(value_max) + 1)
        axs[1, i].set_ylim(min(value_min) * 0.9995, max(value_max) * 1.0005)

        if i == 0:
            axs[1, i].set_ylabel(f"Normalized\n{tested_value}", fontsize=txt_size)
        else:
            axs[1, i].set_yticks([])

        axs[1, i].set_xticks([])
        axs[1, i].tick_params(axis="y", labelsize=txt_size * 0.7)

        # Row 2: tested value weighted by standardized frequency.
        axs[2, i].bar(
            [str(n) for n in range(len(tmp_data["n_standarized"]))],
            [
                a * b
                for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
            ],
            width=0.95,
            color="goldenrod",
        )

        mean_value = np.mean(
            [
                a * b
                for a, b in zip(tmp_data[tested_value], tmp_data["n_standarized"])
            ]
        )
        axs[2, i].axhline(y=mean_value, color="red", linestyle="--")

        axs[2, i].set_ylim(
            (min(standarized_min) * min(value_min)) * 0.9995,
            (max(standarized_max) * max(value_max) * 1.0005),
        )
        axs[2, i].set_xlabel(d, fontsize=txt_size)

        if i == 0:
            axs[2, i].set_ylabel(
                f"Standarized\nnorm_{tested_value}", fontsize=txt_size
            )
        else:
            axs[2, i].set_yticks([])

        axs[2, i].set_xticks([])
        axs[2, i].tick_params(axis="y", labelsize=txt_size * 0.7)

    # Significance stars for the overall ANOVA on the tested value.
    sign = "ns"
    if float(self.round_to_scientific_notation(pk)) < 0.001:
        sign = "***"
    elif float(self.round_to_scientific_notation(pk)) < 0.01:
        sign = "**"
    elif float(self.round_to_scientific_notation(pk)) < 0.05:
        sign = "*"

    text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pk)} - {sign}\n"

    # Append pairwise comparisons (adjusted or raw p-values).
    if p_adj == True:
        for i in range(len(dfk["group1"])):
            sign = "ns"
            if dfk["adj_p_val"][i] < 0.001:
                sign = "***"
            elif dfk["adj_p_val"][i] < 0.01:
                sign = "**"
            elif dfk["adj_p_val"][i] < 0.05:
                sign = "*"

            text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['adj_p_val'][i])} - {sign}\n"
    else:
        for i in range(len(dfk["group1"])):
            sign = "ns"
            if dfk["p_val"][i] < 0.001:
                sign = "***"
            elif dfk["p_val"][i] < 0.01:
                sign = "**"
            elif dfk["p_val"][i] < 0.05:
                sign = "*"

            text += f"{dfk['group1'][i]} vs. {dfk['group2'][i]}\np-value: {self.round_to_scientific_notation(dfk['p_val'][i])} - {sign}\n"

    # Middle-row summary column: ANOVA on the tested value.
    axs[1, -1].text(
        0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
    )
    axs[1, -1].set_axis_off()

    # Significance stars for the ANOVA on the frequency-weighted value.
    sign = "ns"
    if float(self.round_to_scientific_notation(pkc)) < 0.001:
        sign = "***"
    elif float(self.round_to_scientific_notation(pkc)) < 0.01:
        sign = "**"
    elif float(self.round_to_scientific_notation(pkc)) < 0.05:
        sign = "*"

    text = f"Test Welch's ANOVA\np-value: {self.round_to_scientific_notation(pkc)} - {sign}\n"

    if p_adj == True:
        for i in range(len(dfkc["group1"])):
            sign = "ns"
            if dfkc["adj_p_val"][i] < 0.001:
                sign = "***"
            elif dfkc["adj_p_val"][i] < 0.01:
                sign = "**"
            elif dfkc["adj_p_val"][i] < 0.05:
                sign = "*"

            text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['adj_p_val'][i])} - {sign}\n"
    else:
        for i in range(len(dfkc["group1"])):
            sign = "ns"
            if dfkc["p_val"][i] < 0.001:
                sign = "***"
            elif dfkc["p_val"][i] < 0.01:
                sign = "**"
            elif dfkc["p_val"][i] < 0.05:
                sign = "*"

            text += f"{dfkc['group1'][i]} vs. {dfkc['group2'][i]}\np-value: {self.round_to_scientific_notation(dfkc['p_val'][i])} - {sign}\n"

    # Bottom-row summary column: ANOVA on the weighted value.
    axs[2, -1].text(
        0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
    )
    axs[2, -1].set_axis_off()

    # Significance stars for the overall Chi-squared test.
    sign = "ns"
    if float(self.round_to_scientific_notation(pchi)) < 0.001:
        sign = "***"
    elif float(self.round_to_scientific_notation(pchi)) < 0.01:
        sign = "**"
    elif float(self.round_to_scientific_notation(pchi)) < 0.05:
        sign = "*"

    text = f"Test Chi-squared\np-value: {self.round_to_scientific_notation(pchi)} - {sign}\n"

    if p_adj == True:
        for i in range(len(dfchi["group1"])):
            sign = "ns"
            if dfchi["adj_p_val"][i] < 0.001:
                sign = "***"
            elif dfchi["adj_p_val"][i] < 0.01:
                sign = "**"
            elif dfchi["adj_p_val"][i] < 0.05:
                sign = "*"

            text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['adj_p_val'][i])} - {sign}\n"
    else:
        for i in range(len(dfchi["group1"])):
            sign = "ns"
            if dfchi["p_val"][i] < 0.001:
                sign = "***"
            elif dfchi["p_val"][i] < 0.01:
                sign = "**"
            elif dfchi["p_val"][i] < 0.05:
                sign = "*"

            text += f"{dfchi['group1'][i]} vs. {dfchi['group2'][i]}\np-value: {self.round_to_scientific_notation(dfchi['p_val'][i])} - {sign}\n"

    # Top-row summary column: Chi-squared results.
    axs[0, -1].text(
        0.5, 0.5, text, ha="center", va="center", fontsize=txt_size * 0.7, wrap=True
    )
    axs[0, -1].set_axis_off()

    plt.tight_layout()

    # Display only when the package-level display mode is enabled.
    if cfg._DISPLAY_MODE:
        plt.show()

    return fig
Generate comparative histograms and display results of statistical tests (ANOVA and Chi-squared).
This method performs transformations on the input data, generates comparative histograms for each group, and annotates statistical test results, including Welch's ANOVA and Chi-squared tests. Multiple comparison corrections can be applied using the Bonferroni method.
Parameters
----------
data : dict of pd.DataFrame
    Dictionary where keys are group names and values are DataFrames
    containing histogram data. Each DataFrame should include the column
    specified by ``tested_value``.
queue : list of str
    Defines the order of groups to be plotted.
tested_value : str
    The column name in ``data`` representing the variable to test and
    visualize.
p_adj : bool, optional
    If True, applies Bonferroni correction for multiple comparisons
    (default is True).
txt_size : int, optional
    Font size for text annotations in the plot (default is 20).
Returns
-------
fig : matplotlib.figure.Figure
    Matplotlib figure object containing the generated histograms and
    statistical test results.
Examples
--------
>>> fig = self.hist_compare_plot(
...     data,
...     queue=['group1', 'group2', 'group3'],
...     tested_value='n',
...     p_adj=True,
...     txt_size=18,
... )
>>> plt.show()