cfi_toolkit.CellFunctionality
import copy
import os
import pickle
import sys

import pandas as pd
from tqdm import tqdm

# gedspy prints to stdout on import; silence it, but always restore stdout
# (and close the devnull handle) even when the import itself fails.
_old_stdout = sys.stdout
sys.stdout = open(os.devnull, "w")
try:
    from gedspy import Analysis, Enrichment
finally:
    sys.stdout.close()
    sys.stdout = _old_stdout


class CellFunCon:
    """
    Perform cell-type functional analysis and enrichment on a JDtI-COMPsc object.

    This class provides methods to calculate marker genes for cell types, perform
    functional enrichment (GO, KEGG, REACTOME, STRING, IntAct), and compute
    cell-cell interaction networks. Projects can also be saved and loaded via pickle.

    Attributes
    ----------
    jdti : object
        JDtI-COMPsc object containing normalized single-cell data.

    cells_markers : pd.DataFrame or None
        DataFrame containing marker genes per cell type after calculation.

    enr_full_info : Enrichment
        Enrichment object containing all genes available for enrichment analysis.

    cells_enrichment : dict or None
        Dictionary storing enrichment results per cell type.

    cells_connection : pd.DataFrame or None
        DataFrame storing calculated cell-cell interaction information.

    mt_genes : bool
        Whether mitochondrial genes are included (default False).

    ribo_genes : bool
        Whether ribosomal genes are included (default False).
    """

    # Interaction types kept when no explicit `connection_type` filter is given.
    _CONNECTION_TYPES = (
        "Adhesion-Adhesion",
        "Gap-Gap",
        "Ligand-Ligand",
        "Ligand-Receptor",
        "Receptor-Receptor",
        "Undefined",
    )

    def __init__(self, jdti_object, mt_genes=False, ribo_genes=False):
        """
        Initializes the CellFunCon object with a COMPsc/JDTI object.

        Parameters
        ----------
        jdti_object : object
            A COMPsc or JDTI object with normalized single-cell data
            (``normalized_data`` must be a DataFrame indexed by gene name).

        mt_genes : bool
            Whether mitochondrial genes are included (default False).

        ribo_genes : bool
            Whether ribosomal genes are included (default False).
        """

        self.jdti = jdti_object
        """JDtI-COMPsc object containing normalized single-cell data."""

        self.cells_markers = None
        """DataFrame containing marker genes per cell type after calculation."""

        self.cells_connection = None
        """DataFrame storing calculated cell-cell interaction information."""

        self.cells_enrichment = None
        """Dictionary storing enrichment results per cell type."""

        self.mt_genes = mt_genes
        """Whether mitochondrial genes are included (default False)."""

        self.ribo_genes = ribo_genes
        """Whether ribosomal genes are included (default False)."""

        # Keep only genes with a positive summed expression over numeric columns;
        # dict.fromkeys de-duplicates while keeping a reproducible order.
        data = self.jdti.normalized_data
        expressed = data.select_dtypes(include="number").sum(axis=1) > 0
        names = list(dict.fromkeys(data.loc[expressed].index.tolist()))

        if not self.mt_genes:
            # NOTE(review): substring match (not prefix) is kept from the original.
            names = [x for x in names if "MT-" not in x.upper()]
        if not self.ribo_genes:
            names = [x for x in names if x[:3].upper() not in ("RPS", "RPL")]

        enr = Enrichment()
        enr.select_features(names)

        self.enr_full_info = enr
        """Enrichment object containing all genes available for enrichment analysis."""

    def save_project(self, filename):
        """
        Saves the current CellFunCon project as a pickle file.

        Parameters
        ----------
        filename : str
            Path to save the project without extension (e.g., 'project_name');
            the '.psc' suffix is appended automatically.

        Example
        -------
        >>> self.save_project('my_project')
        """

        path = f"{filename}.psc"
        with open(path, "wb") as f:
            pickle.dump(self, f)
        # Report the file that was actually written (including the suffix).
        print(f"Project saved as {path}")

    @classmethod
    def load_project(cls, filename):
        """
        Loads a previously saved CellFunCon project from a pickle file.

        Parameters
        ----------
        filename : str
            Path to the saved pickle file; must end with '.psc'.

        Returns
        -------
        CellFunCon
            Loaded CellFunCon instance.

        Raises
        ------
        TypeError
            If the loaded object is not a CellFunCon instance.

        ValueError
            If the file name does not end with '.psc'.

        Example
        -------
        >>> self = CellFunCon.load_project('my_project.psc')
        """

        filename = str(filename)
        # Check the suffix, not a substring: 'x.psc.bak' is not a project file.
        if not filename.endswith(".psc"):
            raise ValueError("Project not belong to CellFunCon project data.")

        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        with open(filename, "rb") as f:
            obj = pickle.load(f)

        if not isinstance(obj, cls):
            raise TypeError("File does not include project.psc")

        print(f"Project loaded from {filename}")
        return obj

    def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10):
        """
        Calculates marker genes for each cell type based on expression thresholds.

        Delegates to ``self.jdti.calculate_difference_markers(..., force=True)``
        and stores ``self.jdti.var_data`` in `cells_markers`.

        Parameters
        ----------
        min_exp : float, optional
            Minimum expression level to consider a gene (default 0).

        min_pct : float, optional
            Minimum fraction of cells expressing a gene (default 0.05).

        n_proc : int, optional
            Number of parallel processes to use (default 10).

        Notes
        -----
        The results are stored in the `cells_markers` attribute.
        """

        self.jdti.calculate_difference_markers(
            min_exp=min_exp, min_pct=min_pct, n_proc=n_proc, force=True
        )

        self.cells_markers = self.jdti.var_data

    def enrich_cells_fucntionality(
        self, p_value=0.05, adj=True, log_fc=0.1, top_max=500
    ):
        """
        Performs functional enrichment analysis for each cell type based on marker genes.

        Parameters
        ----------
        p_value : float
            Maximum p-value for significant genes (default 0.05).

        adj : bool
            If True, the 'adj_pval' column is used to select significant genes;
            if False, the raw 'p_val' column is used instead.

        log_fc : float
            Minimum log fold-change threshold for marker genes (default 0.1).

        top_max : int
            Maximum number of top marker genes (by 'esm') per cell type (default 500).

        Raises
        ------
        ValueError
            If `cells_markers` is not defined.

        Notes
        -----
        Populates `cells_enrichment` with one entry per cell type; cell types
        without any marker passing the filters are stored as ``None``.
        """

        if not isinstance(self.cells_markers, pd.DataFrame):
            raise ValueError(
                "`self.cells_markers` not defined. "
                "Use `self.calculate_cells_markers()` to compute markers."
            )

        markers = self.cells_markers
        pval_col = "adj_pval" if adj else "p_val"

        # First-seen order keeps runs reproducible (a set has arbitrary order).
        cells = list(dict.fromkeys(markers["valid_group"]))
        max_c = len(cells)

        data_dict = {}
        for n, c in enumerate(cells, start=1):
            print(f"\nAnalysis {n} of {max_c} cells --> {c} \n")

            tmp = markers[
                (markers["valid_group"] == c)
                & (markers[pval_col] <= p_value)
                & (markers["log(FC)"] > log_fc)
            ]
            tmp = tmp.sort_values("esm", ascending=False).head(top_max)

            if len(tmp.index) == 0:
                print(
                    f"Cell {c} was not enriched. No specific markers were found in this dataset."
                )
                data_dict[c] = None
                continue

            # Shallow copy: only `genome` is rebound, the shared object is untouched.
            enr = copy.copy(self.enr_full_info)
            enr.genome = enr.genome[
                enr.genome["found_names"].isin(tmp["feature"])
            ].reset_index(drop=True)
            # NOTE(review): specificity is requested twice in the original call
            # sequence; kept as-is because gedspy semantics are not visible here.
            enr.enriche_specificiti()
            enr.enriche_KEGG()
            enr.enriche_GOTERM()
            enr.enriche_REACTOME()
            enr.enriche_IntAct()
            enr.enriche_STRING()
            enr.enriche_specificiti()

            data = enr.get_results()
            del enr

            ans = Analysis(data)
            ans.gene_interaction()
            ans.features_specificity()
            ans.REACTOME_overrepresentation()
            ans.KEGG_overrepresentation()
            ans.GO_overrepresentation()
            ans.features_specificity()

            data_dict[c] = ans.get_full_results()

        self.cells_enrichment = data_dict

    # Correctly spelled alias; the original (misspelled) name stays for callers.
    enrich_cells_functionality = enrich_cells_fucntionality

    def get_enrichment_data(
        self,
        data_type="GO-TERM",
        p_value=0.05,
        test="FISH",
        adj="BH",
        parent_inc=False,
        top_n=50,
    ):
        """
        Retrieves enrichment results for all cells in a unified DataFrame.

        Parameters
        ----------
        data_type : str
            Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').

        p_value : float, optional
            Maximum p-value threshold (default 0.05).

        test : str, optional
            Substring identifying the statistical test columns (default 'FISH').

        adj : str, optional
            Substring identifying the p-value adjustment columns (default 'BH').

        parent_inc : bool, optional
            Whether parent-term p-value columns also have to pass the threshold
            (default False).

        top_n : int, optional
            Maximum number of terms per cell type to include (default 50).

        Returns
        -------
        pd.DataFrame
            Filtered enrichment results with a 'cell' column (cell type) and a
            'source' column (`data_type`). Empty if no cell has results.

        Raises
        ------
        ValueError
            If `data_type` is not one of the expected values or if
            `cells_enrichment` has not been calculated yet.
        """

        # Name fragment of the parent-level columns for each enrichment type.
        parent_cols = {
            "GO-TERM": "parent",
            "KEGG": "2nd",
            "REACTOME": "top_level",
            "specificity": None,
        }

        # Exact match: a substring test would accept e.g. 'KEGG2' and fail later.
        if data_type not in parent_cols:
            raise ValueError(
                "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'."
            )

        if self.cells_enrichment is None:
            raise ValueError(
                "`self.cells_enrichment` not defined. "
                "Use `self.enrich_cells_fucntionality()` first."
            )

        parent_col = parent_cols[data_type]

        pdl = []
        for cell, result in self.cells_enrichment.items():
            if result is None:
                continue

            print(cell)
            stats = result["statistics"][data_type]
            if data_type == "specificity":
                frames = [
                    pd.DataFrame(v)
                    for k, v in stats.items()
                    if k != "HPA_subcellular_location"
                ]
                if not frames:
                    continue
                tmp = pd.concat(frames)
            else:
                tmp = pd.DataFrame(stats)

            cols = sorted(
                (x for x in tmp.columns if test in x and adj in x), reverse=True
            )
            if not parent_inc and parent_col is not None:
                cols = [x for x in cols if parent_col not in x.lower()]

            # Copy the filtered rows so adding 'cell' never writes into a view.
            tmp = tmp.loc[(tmp[cols] <= p_value).all(axis=1)].copy()
            tmp["cell"] = cell
            tmp = tmp.sort_values(by=["cell"] + cols, ascending=True)

            pdl.append(tmp.head(top_n))

        if not pdl:
            # pd.concat([]) raises; return an empty, well-formed frame instead.
            return pd.DataFrame(columns=["cell", "source"])

        df = pd.concat(pdl)
        df["source"] = data_type
        return df.reset_index(drop=True)

    def get_included_cells(self):
        """
        Returns the list of cell types included in the enrichment analysis.

        Each name is also printed. Cell types whose enrichment result is
        ``None`` are listed as well.

        Returns
        -------
        list
            List of cell type names.

        Raises
        ------
        ValueError
            If `cells_enrichment` has not been calculated yet.

        Example
        -------
        >>> self.get_included_cells()
        ['CellType1', 'CellType2', ...]
        """

        if self.cells_enrichment is None:
            raise ValueError(
                "`self.cells_enrichment` not defined. "
                "Use `self.enrich_cells_fucntionality()` first."
            )

        cl = list(self.cells_enrichment)
        for name in cl:
            print(name)

        return cl

    def get_gene_interactions(self, cell_name):
        """
        Retrieves gene or protein interaction data for a specific cell type.

        Parameters
        ----------
        cell_name : str
            Name of the cell type.

        Returns
        -------
        pd.DataFrame
            DataFrame containing interactions for the specified cell.

        Raises
        ------
        ValueError
            If `cells_enrichment` is not calculated or the cell has no results.

        KeyError
            If `cell_name` is not a known cell type.

        Example
        -------
        >>> self.get_gene_interactions('CellType1')
        """

        if self.cells_enrichment is None:
            raise ValueError(
                "`self.cells_enrichment` not defined. "
                "Use `self.enrich_cells_fucntionality()` first."
            )

        result = self.cells_enrichment[cell_name]
        if result is None:
            raise ValueError(
                f"No enrichment results for cell '{cell_name}': "
                "no specific markers were found."
            )

        return pd.DataFrame(result["statistics"]["interactions"])

    def calculate_cell_connections(self):
        """
        Calculates cell-cell interaction connections based on gene/protein co-expression.

        Notes
        -----
        Populates `cells_connection` with a DataFrame containing interactions
        between all ordered pairs of distinct cells (columns 'cell1', 'cell2',
        'interactor1', 'interactor2' plus the interaction annotations).

        Raises
        ------
        ValueError
            If `normalized_data` is not a DataFrame in the JDTI object, or if it
            holds fewer than two distinct cell names.
        """

        data = self.jdti.normalized_data
        if not isinstance(data, pd.DataFrame):
            raise ValueError(
                "`self.jdti.normalized_data` not defined. "
                "Provide a JDtI object with normalized data."
            )

        # Distinct column labels in first-seen order (reproducible output order).
        cells = list(dict.fromkeys(data.columns))
        if len(cells) < 2:
            raise ValueError(
                "At least two distinct cell names are required to calculate connections."
            )

        # Build each per-cell table once instead of once per cell pair.
        first_side = {}
        second_side = {}
        for c in tqdm(cells):
            # Boolean column mask always yields a DataFrame, even when the label
            # occurs once (`.loc[:, c]` would give a Series without select_dtypes).
            tmp = data.loc[:, data.columns == c]
            expressed = tmp.select_dtypes(include="number").sum(axis=1) > 0
            names = list(set(tmp.index[expressed].tolist()))

            enr = copy.copy(self.enr_full_info)
            enr.genome = enr.genome[
                enr.genome["found_names"].isin(names)
            ].reset_index(drop=True)
            enr.enriche_CellCon()
            connections = enr.get_results()["CellConnections"]
            del enr

            first_side[c] = pd.DataFrame(connections["interactor2"])
            second_side[c] = pd.DataFrame(connections["interactor1"])

        full_data = []
        for c1 in tqdm(cells):
            c1_d = first_side[c1]
            for c2 in cells:
                if c1 == c2:
                    continue
                c2_d = second_side[c2]

                # Interactions present on both sides of the pair.
                shared = c1_d["interaction"].isin(c2_d["interaction"])

                to_ret = (
                    c1_d[shared]
                    .drop(
                        [
                            "Species",
                            "protein_id_1",
                            "protein_id_2",
                            "found_names_2",
                        ],
                        axis=1,
                    )
                    .reset_index(drop=True)
                    .rename(columns={"found_names_1": "interactor1"})
                )
                c2_subset = c2_d[["interaction", "found_names_2"]].rename(
                    columns={"found_names_2": "interactor2"}
                )

                to_ret = to_ret.merge(c2_subset, on="interaction", how="left")
                to_ret["cell1"] = c1
                to_ret["cell2"] = c2

                full_data.append(to_ret)

        # A fresh RangeIndex avoids duplicated row labels across cell pairs.
        self.cells_connection = pd.concat(full_data, ignore_index=True)

    @classmethod
    def _filter_connections(cls, connections, connection_type=None):
        """Return a filtered copy of `connections`; the input is never modified."""

        if not isinstance(connections, pd.DataFrame):
            raise ValueError(
                "`cells_connection` not defined. "
                "Use `calculate_cell_connections()` first."
            )
        if connection_type is None:
            connection_type = cls._CONNECTION_TYPES

        out = connections.copy()
        # fillna covers both None and NaN placeholders for missing directionality.
        out["directionality"] = out["directionality"].fillna("Undefined")
        return out[out["directionality"].isin(list(connection_type))]

    def get_cell_connections(self, connection_type: list | None = None):
        """
        Returns the calculated cell-cell interaction connections.

        Parameters
        ----------
        connection_type : list of str, optional
            List of interaction types used to filter the returned cell–cell connections
            based on the molecular directionality of the interaction. Possible values:

            - "Adhesion-Adhesion" : interaction between two adhesion molecules.
            - "Gap-Gap" : connection through gap junction proteins.
            - "Ligand-Ligand" : interaction between two ligand molecules.
            - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
            - "Receptor-Receptor" : interaction between two receptor molecules.
            - "Undefined" : interactions where the directionality could not be determined.

            By default (``None``), all interaction types listed above are included.

        Returns
        -------
        pd.DataFrame
            Filtered copy of `cells_connection`; missing directionality values
            are reported as "Undefined". The stored DataFrame is left unchanged.

        Raises
        ------
        ValueError
            If `cells_connection` has not been calculated yet.

        Example
        -------
        >>> connections = self.get_cell_connections()
        """

        return self._filter_connections(self.cells_connection, connection_type)


def compare_connections(
    instances_dict: dict,
    cells_compartment: dict | None = None,
    connection_type: list | None = None,
    min_exp=0,
    min_pct=0,
    n_proc=10,
):
    """
    Compare gene expression between two instances based on their cell connections.

    This function compares normalized gene expression data from exactly two
    instances stored in ``instances_dict``. Optionally, the comparison can be
    restricted to specific cell compartments for each instance. Differential
    expression analysis is performed using ``jdti.calc_DEG``.

    Parameters
    ----------
    instances_dict : dict
        Dictionary containing exactly two objects. Each object must have:

        - ``jdti.normalized_data`` : pandas.DataFrame
            Gene expression matrix with genes as rows and cells as columns.

        - ``cells_connection`` : pandas.DataFrame
            DataFrame containing at least the columns ``'interactor1'``,
            ``'interactor2'`` and ``'directionality'``.

        The dictionary keys are used as group labels in the comparison.

    cells_compartment : dict or None, optional
        Dictionary mapping each key in ``instances_dict`` to a list of cell names
        to be used for the comparison. If ``None``, all cells are used.

    connection_type : list of str, optional
        Interaction types whose genes are considered in the comparison.
        Possible values:

        - "Adhesion-Adhesion" : interaction between two adhesion molecules.
        - "Gap-Gap" : connection mediated by gap junction proteins.
        - "Ligand-Ligand" : interaction between two ligand molecules.
        - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
        - "Receptor-Receptor" : interaction between two receptor molecules.
        - "Undefined" : interactions where the directionality could not be determined.

        By default (``None``), all interaction types listed above are included.

    min_exp : float, optional
        ``min_exp`` forwarded to ``calc_DEG`` (default 0).

    min_pct : float, optional
        ``min_pct`` forwarded to ``calc_DEG`` (default 0).

    n_proc : int, optional
        ``n_proc`` forwarded to ``calc_DEG`` (default 10).

    Returns
    -------
    pandas.DataFrame
        Differential expression results returned by ``calc_DEG``, filtered to
        include only rows where ``valid_group`` matches the first key in
        ``instances_dict``.

    Raises
    ------
    ValueError
        If ``instances_dict`` does not hold exactly two entries, if any cell
        specified in ``cells_compartment`` is not present in the corresponding
        ``normalized_data`` columns, or if an instance has no calculated
        ``cells_connection``.

    Notes
    -----
    - Only genes common to both instances are considered.

    - Genes are always further restricted to those appearing in the (filtered)
      cell–cell interaction networks of either instance.

    - The ``cells_connection`` DataFrames of the instances are not modified.

    See Also
    --------
    jdti.calc_DEG : Function used to compute differential expression.
    """

    # Local import keeps `jdti` optional until this function is actually used.
    from jdti import calc_DEG

    keys_list = list(instances_dict.keys())
    if len(keys_list) != 2:
        raise ValueError("`instances_dict` must contain exactly two instances.")

    frames = []
    for key in keys_list:
        data = instances_dict[key].jdti.normalized_data
        if isinstance(cells_compartment, dict):
            cells = cells_compartment[key]
            if any(cell not in data.columns for cell in cells):
                raise ValueError(
                    f'Any of {key} cells in dictionary "cells_compartment" do not occur!'
                )
            data = data.loc[:, cells]
        data = data.copy()
        # Every column is relabelled with its group key for calc_DEG.
        data.columns = [key] * len(data.columns)
        frames.append(data)

    common_idx = frames[0].index.intersection(frames[1].index)
    concat_df = pd.concat([frame.loc[common_idx] for frame in frames], axis=1)

    # Genes taking part in any retained interaction of either instance.
    genes = set()
    for key in keys_list:
        connections = CellFunCon._filter_connections(
            instances_dict[key].cells_connection, connection_type
        )
        genes.update(connections["interactor1"])
        genes.update(connections["interactor2"])

    # Boolean mask keeps the rows in index order (deterministic output).
    concat_df = concat_df.loc[concat_df.index.isin(genes)]

    results = calc_DEG(
        data=concat_df,
        metadata_list=None,
        entities="All",
        sets=None,
        min_exp=min_exp,
        min_pct=min_pct,
        n_proc=n_proc,
    )

    return results[results["valid_group"] == keys_list[0]]
19class CellFunCon: 20 """ 21 A class to perform cell-type functional analysis and enrichment based on a JDtI-COMPsc objects. 22 23 This class provides methods to calculate marker genes for cell types, perform functional enrichment 24 (GO, KEGG, REACTOME, STRING, IntAct), and compute cell-cell interaction networks. 25 Projects can also be saved and loaded via pickle. 26 27 Attributes 28 ---------- 29 jdti : object 30 JDtI-COMPsc object containing normalized single-cell data. 31 32 cells_markers : pd.DataFrame or None 33 DataFrame containing marker genes per cell type after calculation. 34 35 enr_full_info : Enrichment 36 Enrichment object containing all genes available for enrichment analysis. 37 38 cells_enrichment : dict or None 39 Dictionary storing enrichment results per cell type. 40 41 cells_connection : pd.DataFrame or None 42 DataFrame storing calculated cell-cell interaction information. 43 44 mt_genes : bool 45 Whether mitochondrial genes are included (default False). 46 47 ribo_genes : bool 48 Whether ribosomal genes are included (default False). 49 """ 50 51 def __init__(self, jdti_object, mt_genes=False, ribo_genes=False): 52 """ 53 Initializes the CellFunCon object with a COMPsc/JDTI object. 54 55 Parameters 56 ---------- 57 jdti_object : object 58 A COMPsc or JDTI object with normalized single-cell data. 59 60 mt_genes : bool 61 Whether mitochondrial genes are included (default False). 62 63 ribo_genes : bool 64 Whether ribosomal genes are included (default False). 
65 """ 66 67 self.jdti = jdti_object 68 """JDtI-COMPsc object containing normalized single-cell data.""" 69 70 self.cells_markers = None 71 """DataFrame containing marker genes per cell type after calculation.""" 72 73 self.cells_connection = None 74 """DataFrame storing calculated cell-cell interaction information.""" 75 76 self.cells_enrichment = None 77 """Dictionary storing enrichment results per cell type.""" 78 79 self.mt_genes = mt_genes 80 """Whether mitochondrial genes are included (default False).""" 81 82 self.ribo_genes = ribo_genes 83 """Whether ribosomal genes are included (default False).""" 84 85 names = self.jdti.normalized_data.loc[ 86 self.jdti.normalized_data.select_dtypes(include="number").sum(axis=1) > 0 87 ].index.tolist() 88 names = list(set(names)) 89 90 if self.mt_genes is False: 91 names = [x for x in names if "MT-" not in x.upper()] 92 if self.ribo_genes is False: 93 names = [x for x in names if "RPS" != x[:3].upper()] 94 names = [x for x in names if "RPL" != x[:3].upper()] 95 96 enr = Enrichment() 97 enr.select_features(names) 98 99 self.enr_full_info = enr 100 """Enrichment object containing all genes available for enrichment analysis.""" 101 102 def save_project(self, filename): 103 """ 104 Saves the current CellFunCon project as a pickle file. 105 106 Parameters 107 ---------- 108 filename : str 109 Path to save the project (e.g., 'project_name'). 110 111 Example 112 ------- 113 >>> self.save_project('my_project') 114 """ 115 116 with open(f"{filename}.psc", "wb") as f: 117 pickle.dump(self, f) 118 print(f"Project saved as {filename}") 119 120 @classmethod 121 def load_project(cls, filename): 122 """ 123 Loads a previously saved CellFunCon project from a pickle file. 124 125 Parameters 126 ---------- 127 filename : str 128 Path to the saved pickle file. 129 130 Returns 131 ------- 132 CellFunCon 133 Loaded CellFunCon self. 134 135 Raises 136 ------ 137 TypeError 138 If the loaded object is not a CellFunCon self. 
139 140 ValueError 141 If the file is not a valid CellFunCon project file. 142 143 Example 144 ------- 145 >>> self = CellFunCon.load_project('my_project.psc') 146 """ 147 148 if ".psc" in filename: 149 with open(filename, "rb") as f: 150 obj = pickle.load(f) 151 if not isinstance(obj, cls): 152 raise TypeError("File does not include project.psc") 153 print(f"Project loaded from {filename}") 154 return obj 155 else: 156 raise ValueError("Project not belong to CellFunCon project data.") 157 158 def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10): 159 """ 160 Calculates marker genes for each cell type based on expression thresholds. 161 162 Perform differential gene expression (DEG) analysis on gene expression data. 163 164 The function compares groups of cells or samples (defined by `entities` or 165 `sets`) using the Mann–Whitney U test. It computes p-values, adjusted 166 p-values, fold changes, standardized effect sizes, and other statistics. 167 168 169 Parameters 170 ---------- 171 min_exp : float, optional 172 Minimum expression level to consider a gene (default 0). 173 174 min_pct : float, optional 175 Minimum fraction of cells expressing a gene (default 0.05). 176 177 n_proc : int, optional 178 Number of parallel processes to use (default 10). 179 180 Notes 181 ----- 182 The results are stored in the `cells_markers` attribute. 183 """ 184 185 self.jdti.calculate_difference_markers( 186 min_exp=min_exp, min_pct=min_pct, n_proc=n_proc, force=True 187 ) 188 189 self.cells_markers = self.jdti.var_data 190 191 def enrich_cells_fucntionality( 192 self, p_value=0.05, adj=True, log_fc=0.1, top_max=500 193 ): 194 """ 195 Performs functional enrichment analysis for each cell type based on marker genes. 196 197 Parameters 198 ---------- 199 p_value : float 200 Maximum p-value for significant genes (default 0.05). 201 202 adj : bool 203 If True, the adjusted p-values are used to determine significant genes. 
204 Adjusted p-values are calculated using the Benjamini–Hochberg false 205 discovery rate (FDR) correction. If False, raw p-values are used instead. 206 207 log_fc : float 208 Minimum log fold-change threshold for marker genes (default 0.1). 209 210 top_max : int 211 Maximum number of top marker genes per cell type to consider (default 500). 212 213 Raises 214 ------ 215 ValueError 216 If `cells_markers` is not defined. 217 218 Notes 219 ----- 220 This method populates `cells_enrichment` with results for GO-TERM, KEGG, REACTOME, 221 STRING, IntAct, and specificity analyses. 222 """ 223 224 if isinstance(self.cells_markers, pd.DataFrame): 225 226 markers = self.cells_markers 227 cells = set(markers["valid_group"]) 228 229 data_dict = {} 230 231 max_c = len(cells) 232 for n, c in enumerate(cells): 233 print(f"\nAnalysis {n+1} of {max_c} cells --> {c} \n") 234 235 if adj: 236 tmp = markers[ 237 (markers["valid_group"] == c) 238 & (markers["adj_pval"] <= p_value) 239 & (markers["log(FC)"] > log_fc) 240 ] 241 names = list(set(tmp["feature"])) 242 243 tmp = tmp[tmp["feature"].isin(names)] 244 245 else: 246 tmp = markers[ 247 (markers["valid_group"] == c) 248 & (markers["p_val"] <= p_value) 249 & (markers["log(FC)"] > log_fc) 250 ] 251 names = list(set(tmp["feature"])) 252 253 tmp = tmp[tmp["feature"].isin(names)] 254 255 tmp = tmp.sort_values("esm", ascending=False).head(top_max) 256 257 if len(tmp.index) > 0: 258 data_dict[c] = {} 259 enr = copy.copy(self.enr_full_info) 260 enr.genome = enr.genome[ 261 enr.genome["found_names"].isin(list(set(tmp["feature"]))) 262 ].reset_index(drop=True) 263 enr.enriche_specificiti() 264 enr.enriche_KEGG() 265 enr.enriche_GOTERM() 266 enr.enriche_REACTOME() 267 enr.enriche_IntAct() 268 enr.enriche_STRING() 269 enr.enriche_specificiti() 270 271 data = enr.get_results() 272 del enr 273 274 ans = Analysis(data) 275 ans.gene_interaction() 276 ans.features_specificity() 277 ans.REACTOME_overrepresentation() 278 ans.KEGG_overrepresentation() 
279 ans.GO_overrepresentation() 280 ans.features_specificity() 281 282 data_dict[c] = ans.get_full_results() 283 else: 284 print( 285 f"Cell {c} was not enriched. No specific markers were found in this dataset." 286 ) 287 data_dict[c] = None 288 289 self.cells_enrichment = data_dict 290 291 else: 292 raise ValueError( 293 "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers." 294 ) 295 296 def get_enrichment_data( 297 self, 298 data_type="GO-TERM", 299 p_value=0.05, 300 test="FISH", 301 adj="BH", 302 parent_inc=False, 303 top_n=50, 304 ): 305 """ 306 Retrieves enrichment results for all cells in a unified DataFrame. 307 308 Parameters 309 ---------- 310 data_type : str 311 Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity'). 312 313 p_value : float, optional 314 Maximum p-value threshold (default 0.05). 315 316 test : str, optional 317 Name of the statistical test column to use (default 'FISH'). 318 319 adj : str, optional 320 P-value adjustment method (default 'BH'). 321 322 parent_inc : bool, optional 323 Whether to include parent terms in the results (default False). 324 325 top_n : int, optional 326 Maximum number of terms per cell type to include (default 50). 327 328 Returns 329 ------- 330 pd.DataFrame 331 DataFrame containing filtered enrichment results with a 'cell' column indicating cell type. 332 333 Raises 334 ------ 335 ValueError 336 If `data_type` is not one of the expected values. 337 """ 338 339 if not any( 340 x in data_type for x in ("GO-TERM", "KEGG", "REACTOME", "specificity") 341 ): 342 raise ValueError( 343 "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'." 
344 ) 345 346 if data_type == "GO-TERM": 347 parent_col = "parent" 348 349 elif data_type == "KEGG": 350 parent_col = "2nd" 351 352 elif data_type == "REACTOME": 353 parent_col = "top_level" 354 355 elif data_type == "specificity": 356 parent_col = "None" 357 358 pdl = [] 359 for i in self.cells_enrichment.keys(): 360 if self.cells_enrichment[i] is None: 361 continue 362 363 print(i) 364 if data_type == "specificity": 365 tmp_dict = self.cells_enrichment[i]["statistics"][data_type] 366 tmp = [] 367 for k in tmp_dict.keys(): 368 if k != "HPA_subcellular_location": 369 tmp.append(pd.DataFrame(tmp_dict[k])) 370 371 tmp = pd.concat(tmp) 372 373 else: 374 tmp = pd.DataFrame(self.cells_enrichment[i]["statistics"][data_type]) 375 376 cols = [x for x in tmp.columns if test in x and adj in x] 377 cols = sorted(cols, reverse=True) 378 if parent_inc is False: 379 cols = [x for x in cols if parent_col not in x.lower()] 380 381 mask = (tmp[cols] <= p_value).all(axis=1) 382 tmp = tmp.loc[mask] 383 tmp["cell"] = i 384 tmp = tmp.sort_values(by=["cell"] + cols, ascending=True) 385 386 pdl.append(tmp.head(top_n)) 387 388 df = pd.concat(pdl) 389 df["source"] = data_type 390 df = df.reset_index(drop=True) 391 392 return df 393 394 def get_included_cells(self): 395 """ 396 Returns the list of cell types included in the enrichment analysis. 397 398 Returns 399 ------- 400 list 401 List of cell type names. 402 403 Example 404 ------- 405 >>> self.get_included_cells() 406 ['CellType1', 'CellType2', ...] 407 """ 408 409 cl = [] 410 for i in self.cells_enrichment.keys(): 411 print(i) 412 cl.append(i) 413 414 return cl 415 416 def get_gene_interactions(self, cell_name): 417 """ 418 Retrieves gene or protein interaction data for a specific cell type. 419 420 Parameters 421 ---------- 422 cell_name : str 423 Name of the cell type. 424 425 Returns 426 ------- 427 pd.DataFrame 428 DataFrame containing interactions for the specified cell. 
429 430 Example 431 ------- 432 >>> self.get_gene_interactions('CellType1') 433 """ 434 435 tmp = pd.DataFrame( 436 self.cells_enrichment[cell_name]["statistics"]["interactions"] 437 ) 438 439 return tmp 440 441 def calculate_cell_connections(self): 442 """ 443 Calculates cell-cell interaction connections based on gene/protein co-expression. 444 445 Notes 446 ----- 447 Populates `cells_connection` with a DataFrame containing interactions between all pairs of cells. 448 449 Each row represents an interaction between two cells and the involved genes/proteins. 450 451 Raises 452 ------ 453 ValueError 454 If `normalized_data` is not defined in the JDTI object. 455 """ 456 457 if isinstance(self.jdti.normalized_data, pd.DataFrame): 458 459 cells = set(self.jdti.normalized_data.columns) 460 461 data_dict = {} 462 463 for c in tqdm(cells): 464 465 tmp = self.jdti.normalized_data.loc[:, c] 466 names = tmp.loc[ 467 tmp.select_dtypes(include="number").sum(axis=1) > 0 468 ].index.tolist() 469 names = list(set(names)) 470 471 enr = copy.copy(self.enr_full_info) 472 enr.genome = enr.genome[ 473 enr.genome["found_names"].isin(names) 474 ].reset_index(drop=True) 475 enr.enriche_CellCon() 476 data = enr.get_results() 477 del enr 478 479 data_dict[c] = data["CellConnections"] 480 481 full_data = [] 482 for c1 in tqdm(cells): 483 for c2 in cells: 484 if c1 != c2: 485 c1_d = pd.DataFrame(data_dict[c1]["interactor2"]) 486 c2_d = pd.DataFrame(data_dict[c2]["interactor1"]) 487 488 mutual_lr = c1_d["interaction"][ 489 c1_d["interaction"].isin(list(c2_d["interaction"])) 490 ] 491 492 to_ret = ( 493 c1_d[c1_d["interaction"].isin(list(mutual_lr))] 494 .drop( 495 [ 496 "Species", 497 "protein_id_1", 498 "protein_id_2", 499 "found_names_2", 500 ], 501 axis=1, 502 ) 503 .reset_index(drop=True) 504 ) 505 506 to_ret = to_ret.rename(columns={"found_names_1": "interactor1"}) 507 c2_subset = c2_d[["interaction", "found_names_2"]].rename( 508 columns={"found_names_2": "interactor2"} 509 ) 510 511 
to_ret = to_ret.merge(c2_subset, on="interaction", how="left") 512 to_ret["cell1"] = c1 513 to_ret["cell2"] = c2 514 515 full_data.append(to_ret) 516 517 self.cells_connection = pd.concat(full_data) 518 519 else: 520 raise ValueError( 521 "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers." 522 ) 523 524 def get_cell_connections( 525 self, 526 connection_type: list = [ 527 "Adhesion-Adhesion", 528 "Gap-Gap", 529 "Ligand-Ligand", 530 "Ligand-Receptor", 531 "Receptor-Receptor", 532 "Undefined", 533 ], 534 ): 535 """ 536 Returns the calculated cell-cell interaction connections. 537 538 Parameters 539 ---------- 540 connection_type : list of str, optional 541 List of interaction types used to filter the returned cell–cell connections 542 based on the molecular directionality of the interaction. Possible values: 543 544 - "Adhesion-Adhesion" : interaction between two adhesion molecules. 545 - "Gap-Gap" : connection through gap junction proteins. 546 - "Ligand-Ligand" : interaction between two ligand molecules. 547 - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor. 548 - "Receptor-Receptor" : interaction between two receptor molecules. 549 - "Undefined" : interactions where the directionality could not be determined. 550 551 By default, all interaction types are included. 552 553 Returns 554 ------- 555 pd.DataFrame 556 DataFrame containing cell-cell interactions. 557 558 Example 559 ------- 560 >>> connections = self.get_cell_connections() 561 """ 562 563 tmp = self.cells_connection 564 565 tmp["directionality"] = [ 566 x if x is not None else "Undefined" for x in tmp["directionality"] 567 ] 568 569 tmp = tmp[tmp["directionality"].isin(connection_type)] 570 571 return tmp
A class to perform cell-type functional analysis and enrichment based on a JDtI-COMPsc object.
This class provides methods to calculate marker genes for cell types, perform functional enrichment (GO, KEGG, REACTOME, STRING, IntAct), and compute cell-cell interaction networks. Projects can also be saved and loaded via pickle.
Attributes
jdti : object JDtI-COMPsc object containing normalized single-cell data.
cells_markers : pd.DataFrame or None DataFrame containing marker genes per cell type after calculation.
enr_full_info : Enrichment Enrichment object containing all genes available for enrichment analysis.
cells_enrichment : dict or None Dictionary storing enrichment results per cell type.
cells_connection : pd.DataFrame or None DataFrame storing calculated cell-cell interaction information.
mt_genes : bool Whether mitochondrial genes are included (default False).
ribo_genes : bool Whether ribosomal genes are included (default False).
def __init__(self, jdti_object, mt_genes=False, ribo_genes=False):
    """
    Set up a CellFunCon analysis around a COMPsc/JDTI object.

    Feature names whose numeric values in ``jdti_object.normalized_data``
    sum to more than zero across all columns are collected, optionally
    filtered, and registered in a new ``Enrichment`` object.

    Parameters
    ----------
    jdti_object : object
        A COMPsc or JDTI object exposing a ``normalized_data`` DataFrame.

    mt_genes : bool
        When exactly ``False`` (default), names containing ``"MT-"``
        (case-insensitive) are dropped.

    ribo_genes : bool
        When exactly ``False`` (default), names starting with ``"RPS"`` or
        ``"RPL"`` (case-insensitive) are dropped.
    """

    self.jdti = jdti_object
    """JDtI-COMPsc object containing normalized single-cell data."""

    self.cells_markers = None
    """DataFrame containing marker genes per cell type after calculation."""

    self.cells_connection = None
    """DataFrame storing calculated cell-cell interaction information."""

    self.cells_enrichment = None
    """Dictionary storing enrichment results per cell type."""

    self.mt_genes = mt_genes
    """Whether mitochondrial genes are included (default False)."""

    self.ribo_genes = ribo_genes
    """Whether ribosomal genes are included (default False)."""

    # Keep only features with a positive total over the numeric columns.
    expressed = self.jdti.normalized_data.loc[
        self.jdti.normalized_data.select_dtypes(include="number").sum(axis=1) > 0
    ]
    names = list(set(expressed.index.tolist()))

    if self.mt_genes is False:
        names = [name for name in names if "MT-" not in name.upper()]
    if self.ribo_genes is False:
        names = [
            name for name in names if name[:3].upper() not in ("RPS", "RPL")
        ]

    enrichment = Enrichment()
    enrichment.select_features(names)

    self.enr_full_info = enrichment
    """Enrichment object containing all genes available for enrichment analysis."""
Initializes the CellFunCon object with a COMPsc/JDTI object.
Parameters
jdti_object : object A COMPsc or JDTI object with normalized single-cell data.
mt_genes : bool Whether mitochondrial genes are included (default False).
ribo_genes : bool Whether ribosomal genes are included (default False).
def save_project(self, filename):
    """
    Pickle this CellFunCon project to ``<filename>.psc``.

    Parameters
    ----------
    filename : str
        Target path without the extension (e.g., 'project_name'); the
        ``.psc`` suffix is appended automatically.

    Example
    -------
    >>> self.save_project('my_project')
    """

    target = f"{filename}.psc"
    with open(target, "wb") as handle:
        pickle.dump(self, handle)

    print(f"Project saved as {filename}")
Saves the current CellFunCon project as a pickle file.
Parameters
filename : str Path to save the project (e.g., 'project_name').
Example
>>> self.save_project('my_project')
@classmethod
def load_project(cls, filename):
    """
    Load a previously saved CellFunCon project from a ``.psc`` pickle file.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to the saved project file; it must end with ``.psc``.

    Returns
    -------
    CellFunCon
        The unpickled project instance.

    Raises
    ------
    TypeError
        If the unpickled object is not an instance of this class.

    ValueError
        If ``filename`` does not end with the ``.psc`` extension.

    Example
    -------
    >>> self = CellFunCon.load_project('my_project.psc')
    """

    path = os.fspath(filename)

    # Check the real extension: a substring test would also accept names
    # such as 'project.psc.bak'.
    if not path.endswith(".psc"):
        raise ValueError(
            f"'{path}' is not a CellFunCon project file (expected a '.psc' file)."
        )

    # NOTE: unpickling executes arbitrary code embedded in the file; only
    # load project files that come from a trusted source.
    with open(path, "rb") as handle:
        obj = pickle.load(handle)

    if not isinstance(obj, cls):
        raise TypeError(f"'{path}' does not contain a {cls.__name__} project.")

    print(f"Project loaded from {path}")
    return obj
Loads a previously saved CellFunCon project from a pickle file.
Parameters
filename : str Path to the saved pickle file.
Returns
CellFunCon Loaded CellFunCon instance.
Raises
TypeError If the loaded object is not a CellFunCon instance.
ValueError If the file is not a valid CellFunCon project file.
Example
>>> self = CellFunCon.load_project('my_project.psc')
def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10):
    """
    Compute marker genes for every cell type and store them on the project.

    The work is delegated to ``self.jdti.calculate_difference_markers``
    (always called with ``force=True``); afterwards ``self.jdti.var_data``
    is exposed as ``self.cells_markers``.

    Parameters
    ----------
    min_exp : float, optional
        Minimum expression level forwarded to the marker calculation
        (default 0).

    min_pct : float, optional
        Minimum fraction of expressing cells forwarded to the marker
        calculation (default 0.05).

    n_proc : int, optional
        Number of parallel processes forwarded to the marker calculation
        (default 10).

    Notes
    -----
    The results are stored in the `cells_markers` attribute.
    """

    marker_options = {
        "min_exp": min_exp,
        "min_pct": min_pct,
        "n_proc": n_proc,
        "force": True,
    }
    self.jdti.calculate_difference_markers(**marker_options)

    self.cells_markers = self.jdti.var_data
Calculates marker genes for each cell type based on expression thresholds.
Perform differential gene expression (DEG) analysis on gene expression data.
The function compares groups of cells or samples (defined by entities or
sets) using the Mann–Whitney U test. It computes p-values, adjusted
p-values, fold changes, standardized effect sizes, and other statistics.
Parameters
min_exp : float, optional Minimum expression level to consider a gene (default 0).
min_pct : float, optional Minimum fraction of cells expressing a gene (default 0.05).
n_proc : int, optional Number of parallel processes to use (default 10).
Notes
The results are stored in the cells_markers attribute.
def enrich_cells_fucntionality(
    self, p_value=0.05, adj=True, log_fc=0.1, top_max=500
):
    """
    Run functional enrichment for each cell type based on its marker genes.

    For every group in ``cells_markers["valid_group"]`` the markers are
    filtered by p-value and log fold-change, ranked by the ``esm`` column
    (descending) and truncated to ``top_max`` rows. The remaining features
    are enriched with a copy of ``enr_full_info`` and analysed with
    ``Analysis``.

    Parameters
    ----------
    p_value : float
        Maximum p-value for significant genes (default 0.05).

    adj : bool
        If True (default) the ``adj_pval`` column is compared against
        ``p_value``; otherwise the raw ``p_val`` column is used.

    log_fc : float
        Marker genes must have ``log(FC)`` strictly above this value
        (default 0.1).

    top_max : int
        Maximum number of top marker genes per cell type to consider
        (default 500).

    Raises
    ------
    ValueError
        If `cells_markers` is not a DataFrame (markers not calculated yet).

    Notes
    -----
    Populates `cells_enrichment` with one entry per cell type: the full
    analysis results, or ``None`` when no marker passed the filters.
    """

    if not isinstance(self.cells_markers, pd.DataFrame):
        raise ValueError(
            "`self.cells_markers` not defined. "
            "Run `calculate_cells_markers()` first to compute markers."
        )

    markers = self.cells_markers
    pval_col = "adj_pval" if adj else "p_val"

    # First-appearance order keeps the progress output and the result
    # dictionary deterministic (a plain set has arbitrary order).
    cells = list(dict.fromkeys(markers["valid_group"]))

    data_dict = {}
    max_c = len(cells)

    for n, c in enumerate(cells):
        print(f"\nAnalysis {n+1} of {max_c} cells --> {c} \n")

        selected = markers[
            (markers["valid_group"] == c)
            & (markers[pval_col] <= p_value)
            & (markers["log(FC)"] > log_fc)
        ]
        selected = selected.sort_values("esm", ascending=False).head(top_max)

        if selected.empty:
            print(
                f"Cell {c} was not enriched. No specific markers were found in this dataset."
            )
            data_dict[c] = None
            continue

        # Shallow copy so the shared `enr_full_info` keeps its full genome.
        enr = copy.copy(self.enr_full_info)
        enr.genome = enr.genome[
            enr.genome["found_names"].isin(list(set(selected["feature"])))
        ].reset_index(drop=True)

        # NOTE(review): `enriche_specificiti` (and `features_specificity`
        # below) are each invoked twice, exactly as before; the second call
        # looks redundant - confirm against gedspy before removing it.
        enr.enriche_specificiti()
        enr.enriche_KEGG()
        enr.enriche_GOTERM()
        enr.enriche_REACTOME()
        enr.enriche_IntAct()
        enr.enriche_STRING()
        enr.enriche_specificiti()

        data = enr.get_results()
        del enr

        ans = Analysis(data)
        ans.gene_interaction()
        ans.features_specificity()
        ans.REACTOME_overrepresentation()
        ans.KEGG_overrepresentation()
        ans.GO_overrepresentation()
        ans.features_specificity()

        data_dict[c] = ans.get_full_results()

    self.cells_enrichment = data_dict
Performs functional enrichment analysis for each cell type based on marker genes.
Parameters
p_value : float Maximum p-value for significant genes (default 0.05).
adj : bool If True, the adjusted p-values are used to determine significant genes. Adjusted p-values are calculated using the Benjamini–Hochberg false discovery rate (FDR) correction. If False, raw p-values are used instead.
log_fc : float Minimum log fold-change threshold for marker genes (default 0.1).
top_max : int Maximum number of top marker genes per cell type to consider (default 500).
Raises
ValueError
If cells_markers is not defined.
Notes
This method populates cells_enrichment with results for GO-TERM, KEGG, REACTOME,
STRING, IntAct, and specificity analyses.
def get_enrichment_data(
    self,
    data_type="GO-TERM",
    p_value=0.05,
    test="FISH",
    adj="BH",
    parent_inc=False,
    top_n=50,
):
    """
    Collect enrichment results of one type for all cells in a single DataFrame.

    Parameters
    ----------
    data_type : str
        Type of enrichment to retrieve; exactly one of 'GO-TERM', 'KEGG',
        'REACTOME' or 'specificity'.

    p_value : float, optional
        Maximum value allowed in every selected p-value column
        (default 0.05).

    test : str, optional
        Substring identifying the statistical test in column names
        (default 'FISH').

    adj : str, optional
        Substring identifying the adjustment method in column names
        (default 'BH').

    parent_inc : bool, optional
        When exactly ``False`` (default), columns belonging to parent terms
        are not used for filtering and sorting.

    top_n : int, optional
        Maximum number of rows kept per cell type (default 50).

    Returns
    -------
    pd.DataFrame
        Filtered enrichment rows with a 'cell' column naming the cell type
        and a 'source' column holding ``data_type``. An empty DataFrame with
        those two columns is returned when no cell has enrichment results.

    Raises
    ------
    ValueError
        If `data_type` is not one of the expected values, or if
        `cells_enrichment` has not been calculated yet.
    """

    # Substring that marks parent-term columns for each enrichment type
    # (None: the type has no parent-term columns).
    parent_markers = {
        "GO-TERM": "parent",
        "KEGG": "2nd",
        "REACTOME": "top_level",
        "specificity": None,
    }

    # Exact match: a substring check would let values like 'KEGG-x' through
    # and fail later with an unrelated error.
    if data_type not in parent_markers:
        raise ValueError(
            "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'."
        )

    if self.cells_enrichment is None:
        raise ValueError(
            "`self.cells_enrichment` not defined. "
            "Run `enrich_cells_fucntionality()` first."
        )

    parent_col = parent_markers[data_type]

    frames = []
    for cell, enrichment in self.cells_enrichment.items():
        if enrichment is None:
            continue

        print(cell)
        stats = enrichment["statistics"][data_type]

        if data_type == "specificity":
            parts = [
                pd.DataFrame(values)
                for key, values in stats.items()
                if key != "HPA_subcellular_location"
            ]
            if not parts:
                continue
            tmp = pd.concat(parts)
        else:
            tmp = pd.DataFrame(stats)

        cols = sorted(
            (x for x in tmp.columns if test in x and adj in x), reverse=True
        )
        if parent_inc is False and parent_col is not None:
            cols = [x for x in cols if parent_col not in x.lower()]

        mask = (tmp[cols] <= p_value).all(axis=1)
        # Copy before adding a column so the assignment never targets a
        # view of the stored enrichment table.
        tmp = tmp.loc[mask].copy()
        tmp["cell"] = cell
        tmp = tmp.sort_values(by=["cell"] + cols, ascending=True)

        frames.append(tmp.head(top_n))

    if not frames:
        return pd.DataFrame(columns=["cell", "source"])

    df = pd.concat(frames)
    df["source"] = data_type

    return df.reset_index(drop=True)
Retrieves enrichment results for all cells in a unified DataFrame.
Parameters
data_type : str Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').
p_value : float, optional Maximum p-value threshold (default 0.05).
test : str, optional Name of the statistical test column to use (default 'FISH').
adj : str, optional P-value adjustment method (default 'BH').
parent_inc : bool, optional Whether to include parent terms in the results (default False).
top_n : int, optional Maximum number of terms per cell type to include (default 50).
Returns
pd.DataFrame DataFrame containing filtered enrichment results with a 'cell' column indicating cell type.
Raises
ValueError
If data_type is not one of the expected values.
def get_included_cells(self):
    """
    List the cell types present in the enrichment results.

    Every name is also printed to standard output, one per line.

    Returns
    -------
    list
        Keys of `cells_enrichment`, in dictionary order.

    Example
    -------
    >>> self.get_included_cells()
    ['CellType1', 'CellType2', ...]
    """

    included = list(self.cells_enrichment.keys())
    for cell_name in included:
        print(cell_name)

    return included
Returns the list of cell types included in the enrichment analysis.
Returns
list List of cell type names.
Example
>>> self.get_included_cells()
['CellType1', 'CellType2', ...]
def get_gene_interactions(self, cell_name):
    """
    Return the interaction table stored for one cell type.

    Parameters
    ----------
    cell_name : str
        Key of the cell type in `cells_enrichment`.

    Returns
    -------
    pd.DataFrame
        DataFrame built from the ``["statistics"]["interactions"]`` entry
        of the selected cell.

    Example
    -------
    >>> self.get_gene_interactions('CellType1')
    """

    cell_statistics = self.cells_enrichment[cell_name]["statistics"]
    return pd.DataFrame(cell_statistics["interactions"])
Retrieves gene or protein interaction data for a specific cell type.
Parameters
cell_name : str Name of the cell type.
Returns
pd.DataFrame DataFrame containing interactions for the specified cell.
Example
>>> self.get_gene_interactions('CellType1')
def calculate_cell_connections(self):
    """
    Calculate cell-cell interaction connections for every ordered pair of cells.

    For each distinct column label of ``jdti.normalized_data`` the features
    with a positive numeric total are enriched via ``enriche_CellCon`` on a
    copy of ``enr_full_info``. For every ordered pair ``(cell1, cell2)`` the
    ``interactor2`` table of ``cell1`` is matched with the ``interactor1``
    table of ``cell2`` on the ``interaction`` column.

    Notes
    -----
    Populates `cells_connection` with a DataFrame containing interactions
    between all pairs of cells. When fewer than two distinct cells exist,
    an empty DataFrame is stored instead.

    Each row represents an interaction between two cells and the involved
    genes/proteins.

    Raises
    ------
    ValueError
        If `normalized_data` of the JDTI object is not a DataFrame.
    """

    expression = self.jdti.normalized_data

    if not isinstance(expression, pd.DataFrame):
        raise ValueError(
            "`self.jdti.normalized_data` not defined. "
            "Provide a JDTI object with normalized data."
        )

    # Distinct column labels in first-appearance order (deterministic,
    # unlike iterating a set).
    cells = list(dict.fromkeys(expression.columns))

    side2_tables = {}
    side1_tables = {}

    for c in tqdm(cells):
        # A boolean mask always yields a DataFrame, even when the label
        # occurs once (`.loc[:, c]` would return a Series, which has no
        # `select_dtypes`).
        cell_data = expression.loc[:, expression.columns == c]
        totals = cell_data.select_dtypes(include="number").sum(axis=1)
        names = list(set(cell_data.loc[totals > 0].index.tolist()))

        # Shallow copy so the shared `enr_full_info` keeps its full genome.
        enr = copy.copy(self.enr_full_info)
        enr.genome = enr.genome[
            enr.genome["found_names"].isin(names)
        ].reset_index(drop=True)
        enr.enriche_CellCon()
        connections = enr.get_results()["CellConnections"]
        del enr

        # Build each table once instead of once per cell pair.
        side2_tables[c] = pd.DataFrame(connections["interactor2"])
        side1_tables[c] = pd.DataFrame(connections["interactor1"])

    dropped = ["Species", "protein_id_1", "protein_id_2", "found_names_2"]

    full_data = []
    for c1 in tqdm(cells):
        c1_d = side2_tables[c1]
        for c2 in cells:
            if c1 == c2:
                continue

            c2_d = side1_tables[c2]

            # Rows of cell1 whose interaction also occurs in cell2.
            shared = c1_d["interaction"].isin(list(c2_d["interaction"]))

            to_ret = (
                c1_d[shared]
                .drop(dropped, axis=1)
                .reset_index(drop=True)
                .rename(columns={"found_names_1": "interactor1"})
            )

            c2_subset = c2_d[["interaction", "found_names_2"]].rename(
                columns={"found_names_2": "interactor2"}
            )

            to_ret = to_ret.merge(c2_subset, on="interaction", how="left")
            to_ret["cell1"] = c1
            to_ret["cell2"] = c2

            full_data.append(to_ret)

    if full_data:
        self.cells_connection = pd.concat(full_data)
    else:
        # No cell pair to compare: store an empty table carrying the
        # columns the rest of the class reads.
        self.cells_connection = pd.DataFrame(
            columns=[
                "interaction",
                "interactor1",
                "interactor2",
                "directionality",
                "cell1",
                "cell2",
            ]
        )
Calculates cell-cell interaction connections based on gene/protein co-expression.
Notes
Populates cells_connection with a DataFrame containing interactions between all pairs of cells.
Each row represents an interaction between two cells and the involved genes/proteins.
Raises
ValueError
If normalized_data is not defined in the JDTI object.
524 def get_cell_connections( 525 self, 526 connection_type: list = [ 527 "Adhesion-Adhesion", 528 "Gap-Gap", 529 "Ligand-Ligand", 530 "Ligand-Receptor", 531 "Receptor-Receptor", 532 "Undefined", 533 ], 534 ): 535 """ 536 Returns the calculated cell-cell interaction connections. 537 538 Parameters 539 ---------- 540 connection_type : list of str, optional 541 List of interaction types used to filter the returned cell–cell connections 542 based on the molecular directionality of the interaction. Possible values: 543 544 - "Adhesion-Adhesion" : interaction between two adhesion molecules. 545 - "Gap-Gap" : connection through gap junction proteins. 546 - "Ligand-Ligand" : interaction between two ligand molecules. 547 - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor. 548 - "Receptor-Receptor" : interaction between two receptor molecules. 549 - "Undefined" : interactions where the directionality could not be determined. 550 551 By default, all interaction types are included. 552 553 Returns 554 ------- 555 pd.DataFrame 556 DataFrame containing cell-cell interactions. 557 558 Example 559 ------- 560 >>> connections = self.get_cell_connections() 561 """ 562 563 tmp = self.cells_connection 564 565 tmp["directionality"] = [ 566 x if x is not None else "Undefined" for x in tmp["directionality"] 567 ] 568 569 tmp = tmp[tmp["directionality"].isin(connection_type)] 570 571 return tmp
Returns the calculated cell-cell interaction connections.
Parameters
connection_type : list of str, optional List of interaction types used to filter the returned cell–cell connections based on the molecular directionality of the interaction. Possible values:
- "Adhesion-Adhesion" : interaction between two adhesion molecules.
- "Gap-Gap" : connection through gap junction proteins.
- "Ligand-Ligand" : interaction between two ligand molecules.
- "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
- "Receptor-Receptor" : interaction between two receptor molecules.
- "Undefined" : interactions where the directionality could not be determined.
By default, all interaction types are included.
Returns
pd.DataFrame DataFrame containing cell-cell interactions.
Example
>>> connections = self.get_cell_connections()
574def compare_connections( 575 instances_dict: dict, 576 cells_compartment: dict | None = None, 577 connection_type: list = [ 578 "Adhesion-Adhesion", 579 "Gap-Gap", 580 "Ligand-Ligand", 581 "Ligand-Receptor", 582 "Receptor-Receptor", 583 "Undefined", 584 ], 585): 586 """ 587 Compare gene expression between two instances based on their cell connections. 588 589 This function compares normalized gene expression data from exactly two 590 instances stored in ``instances_dict``. Optionally, the comparison can be 591 restricted to specific cell compartments for each instance. Differential 592 expression analysis is performed using ``jdti.calc_DEG``. 593 594 Parameters 595 ---------- 596 instances_dict : dict 597 Dictionary containing exactly two objects. Each object must have: 598 599 - ``jdti.normalized_data`` : pandas.DataFrame 600 Gene expression matrix with genes as rows and cells as columns. 601 602 - ``cells_connection`` : pandas.DataFrame 603 DataFrame containing at least the columns ``'interactor1'`` and 604 ``'interactor2'``. 605 606 The dictionary keys are used as group labels in the comparison. 607 608 cells_compartment : dict or None, optional 609 Dictionary mapping each key in ``instances_dict`` to a list of cell names 610 to be used for the comparison. If ``None``, all cells are used and genes 611 are filtered based on cell–cell connections. 612 613 connection_type : list of str, optional 614 List of interaction types used to filter cell–cell connections that are 615 considered in the gene expression comparison. Only connections with the 616 specified molecular interaction types will be used to define interacting 617 cells between the two instances. 618 619 Possible values: 620 621 - "Adhesion-Adhesion" : interaction between two adhesion molecules. 622 - "Gap-Gap" : connection mediated by gap junction proteins. 623 - "Ligand-Ligand" : interaction between two ligand molecules. 
624 - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor. 625 - "Receptor-Receptor" : interaction between two receptor molecules. 626 - "Undefined" : interactions where the directionality could not be determined. 627 628 By default, all interaction types are included in the comparison. 629 630 Returns 631 ------- 632 pandas.DataFrame 633 Differential expression results returned by ``calc_DEG``, filtered to 634 include only rows where ``valid_group`` matches the first key in 635 ``instances_dict``. 636 637 Raises 638 ------ 639 ValueError 640 If any cell specified in ``cells_compartment`` is not present in the 641 corresponding ``normalized_data`` columns. 642 643 Notes 644 ----- 645 - Only genes common to both instances are considered. 646 647 - When ``cells_compartment`` is ``None``, genes are further restricted to 648 those appearing in the cell–cell interaction networks of either instance. 649 650 - The function assumes exactly two entries in ``instances_dict``. 651 652 - Differential expression is computed with ``min_exp=0`` and ``min_pct=0.1``. 653 654 See Also 655 -------- 656 jdti.calc_DEG : Function used to compute differential expression. 657 """ 658 659 import pandas as pd 660 from jdti import calc_DEG 661 662 if isinstance(cells_compartment, dict): 663 664 keys_list = list(instances_dict.keys()) 665 tmp1 = instances_dict[keys_list[0]].jdti.normalized_data.copy() 666 cells = cells_compartment[keys_list[0]] 667 if any(cell not in tmp1.columns for cell in cells): 668 raise ValueError( 669 'Any of {keys_list[0]} cells in dictionary "cells_compartment" do not occur!' 670 ) 671 tmp1 = tmp1.loc[:, cells] 672 tmp1.columns = [keys_list[0]] * len(tmp1.columns) 673 674 tmp2 = instances_dict[keys_list[1]].jdti.normalized_data.copy() 675 cells = cells_compartment[keys_list[1]] 676 if any(cell not in tmp2.columns for cell in cells): 677 raise ValueError( 678 'Any of {keys_list[1]} cells in dictionary "cells_compartment" do not occur!' 
679 ) 680 tmp2 = tmp2.loc[:, cells] 681 tmp2.columns = [keys_list[1]] * len(tmp2.columns) 682 683 common_idx = tmp1.index.intersection(tmp2.index) 684 685 tmp1 = tmp1.loc[common_idx] 686 tmp2 = tmp2.loc[common_idx] 687 688 concat_df = pd.concat([tmp1, tmp2], axis=1) 689 690 else: 691 692 keys_list = list(instances_dict.keys()) 693 tmp1 = instances_dict[keys_list[0]].jdti.normalized_data.copy() 694 tmp1.columns = [keys_list[0]] * len(tmp1.columns) 695 696 tmp2 = instances_dict[keys_list[1]].jdti.normalized_data.copy() 697 tmp2.columns = [keys_list[1]] * len(tmp2.columns) 698 699 common_idx = tmp1.index.intersection(tmp2.index) 700 701 tmp1 = tmp1.loc[common_idx] 702 tmp2 = tmp2.loc[common_idx] 703 704 concat_df = pd.concat([tmp1, tmp2], axis=1) 705 706 tmp_df_1 = instances_dict[keys_list[0]].cells_connection 707 tmp_df_2 = instances_dict[keys_list[1]].cells_connection 708 709 tmp_df_1["directionality"] = [ 710 x if x is not None else "Undefined" for x in tmp_df_1["directionality"] 711 ] 712 tmp_df_2["directionality"] = [ 713 x if x is not None else "Undefined" for x in tmp_df_2["directionality"] 714 ] 715 716 tmp_df_1 = tmp_df_1[tmp_df_1["directionality"].isin(connection_type)] 717 tmp_df_2 = tmp_df_2[tmp_df_2["directionality"].isin(connection_type)] 718 719 tmp_con1 = list(set(list(tmp_df_1["interactor1"]) + list(tmp_df_1["interactor2"]))) 720 721 tmp_con2 = list(set(list(tmp_df_2["interactor1"]) + list(tmp_df_2["interactor2"]))) 722 723 genes = list(set(tmp_con1 + tmp_con2)) 724 725 genes2 = [x for x in genes if x in common_idx] 726 727 concat_df = concat_df.loc[genes2, :] 728 729 results = calc_DEG( 730 data=concat_df, 731 metadata_list=None, 732 entities="All", 733 sets=None, 734 min_exp=0, 735 min_pct=0, 736 n_proc=10, 737 ) 738 739 results = results[results["valid_group"] == keys_list[0]] 740 741 return results
Compare gene expression between two instances based on their cell connections.
This function compares normalized gene expression data from exactly two
instances stored in instances_dict. Optionally, the comparison can be
restricted to specific cell compartments for each instance. Differential
expression analysis is performed using jdti.calc_DEG.
Parameters
instances_dict : dict Dictionary containing exactly two objects. Each object must have:
- ``jdti.normalized_data`` : pandas.DataFrame
Gene expression matrix with genes as rows and cells as columns.
- ``cells_connection`` : pandas.DataFrame
DataFrame containing at least the columns ``'interactor1'`` and
``'interactor2'``.
The dictionary keys are used as group labels in the comparison.
cells_compartment : dict or None, optional
Dictionary mapping each key in instances_dict to a list of cell names
to be used for the comparison. If None, all cells are used and genes
are filtered based on cell–cell connections.
connection_type : list of str, optional List of interaction types used to filter cell–cell connections that are considered in the gene expression comparison. Only connections with the specified molecular interaction types will be used to define interacting cells between the two instances.
Possible values:
- "Adhesion-Adhesion" : interaction between two adhesion molecules.
- "Gap-Gap" : connection mediated by gap junction proteins.
- "Ligand-Ligand" : interaction between two ligand molecules.
- "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
- "Receptor-Receptor" : interaction between two receptor molecules.
- "Undefined" : interactions where the directionality could not be determined.
By default, all interaction types are included in the comparison.
Returns
pandas.DataFrame
Differential expression results returned by calc_DEG, filtered to
include only rows where valid_group matches the first key in
instances_dict.
Raises
ValueError
If any cell specified in cells_compartment is not present in the
corresponding normalized_data columns.
Notes
Only genes common to both instances are considered.
When cells_compartment is None, genes are further restricted to those appearing in the cell–cell interaction networks of either instance.
The function assumes exactly two entries in instances_dict.
Differential expression is computed with min_exp=0 and min_pct=0.
See Also
jdti.calc_DEG : Function used to compute differential expression.