cfi_toolkit.CellFunctionality

View Source

  1import copy
  2import os
  3import pickle
  4import sys
  5
  6import pandas as pd
  7from tqdm import tqdm
  8
  9_old_stdout = sys.stdout
 10sys.stdout = open(os.devnull, "w")
 11
 12from gedspy import Analysis, Enrichment
 13
 14sys.stdout.close()
 15sys.stdout = _old_stdout
 16
 17
 18class CellFunCon:
 19    """
 20    A class to perform cell-type functional analysis and enrichment based on JDtI-COMPsc objects.
 21
 22    This class provides methods to calculate marker genes for cell types, perform functional enrichment
 23    (GO, KEGG, REACTOME, STRING, IntAct), and compute cell-cell interaction networks.
 24    Projects can also be saved and loaded via pickle.
 25
 26    Attributes
 27    ----------
 28    jdti : object
 29        JDtI-COMPsc object containing normalized single-cell data.
 30
 31    cells_markers : pd.DataFrame or None
 32        DataFrame containing marker genes per cell type after calculation.
 33
 34    enr_full_info : Enrichment
 35        Enrichment object containing all genes available for enrichment analysis.
 36
 37    cells_enrichment : dict or None
 38        Dictionary storing enrichment results per cell type.
 39
 40    cells_connection : pd.DataFrame or None
 41        DataFrame storing calculated cell-cell interaction information.
 42
 43    mt_genes : bool
 44        Whether mitochondrial genes are included (default False).
 45
 46    ribo_genes : bool
 47        Whether ribosomal genes are included (default False).
 48    """
 49
 50    def __init__(self, jdti_object, mt_genes=False, ribo_genes=False):
 51        """
 52        Initializes the CellFunCon object with a COMPsc/JDTI object.
 53
 54        Parameters
 55        ----------
 56        jdti_object : object
 57            A COMPsc or JDTI object with normalized single-cell data.
 58
 59        mt_genes : bool
 60            Whether mitochondrial genes are included (default False).
 61
 62        ribo_genes : bool
 63            Whether ribosomal genes are included (default False).
 64        """
 65
 66        self.jdti = jdti_object
 67        """JDtI-COMPsc object containing normalized single-cell data."""
 68
 69        self.cells_markers = None
 70        """DataFrame containing marker genes per cell type after calculation."""
 71
 72        self.cells_connection = None
 73        """DataFrame storing calculated cell-cell interaction information."""
 74
 75        self.cells_enrichment = None
 76        """Dictionary storing enrichment results per cell type."""
 77
 78        self.mt_genes = mt_genes
 79        """Whether mitochondrial genes are included (default False)."""
 80
 81        self.ribo_genes = ribo_genes
 82        """Whether ribosomal genes are included (default False)."""
 83
 84        names = self.jdti.normalized_data.loc[
 85            self.jdti.normalized_data.select_dtypes(include="number").sum(axis=1) > 0
 86        ].index.tolist()
 87        names = list(set(names))
 88
 89        if self.mt_genes is False:
 90            names = [x for x in names if "MT-" not in x.upper()]
 91        if self.ribo_genes is False:
 92            names = [x for x in names if "RPS" != x[:3].upper()]
 93            names = [x for x in names if "RPL" != x[:3].upper()]
 94
 95        enr = Enrichment()
 96        enr.select_features(names)
 97
 98        self.enr_full_info = enr
 99        """Enrichment object containing all genes available for enrichment analysis."""
100
101    def save_project(self, filename):
102        """
103        Saves the current CellFunCon project as a pickle file.
104
105        Parameters
106        ----------
107        filename : str
108            Path to save the project (e.g., 'project_name').
109
110        Example
111        -------
112        >>> self.save_project('my_project')
113        """
114
115        with open(f"{filename}.psc", "wb") as f:
116            pickle.dump(self, f)
117        print(f"Project saved as {filename}")
118
119    @classmethod
120    def load_project(cls, filename):
121        """
122        Loads a previously saved CellFunCon project from a pickle file.
123
124        Parameters
125        ----------
126        filename : str
127            Path to the saved pickle file.
128
129        Returns
130        -------
131        CellFunCon
132            Loaded CellFunCon self.
133
134        Raises
135        ------
136        TypeError
137            If the loaded object is not a CellFunCon self.
138
139        ValueError
140            If the file is not a valid CellFunCon project file.
141
142        Example
143        -------
144        >>> self = CellFunCon.load_project('my_project.psc')
145        """
146
147        if ".psc" in filename:
148            with open(filename, "rb") as f:
149                obj = pickle.load(f)
150            if not isinstance(obj, cls):
151                raise TypeError("File does not include project.psc")
152            print(f"Project loaded from {filename}")
153            return obj
154        else:
155            raise ValueError("Project not belong to CellFunCon project data.")
156
157    def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10):
158        """
159        Calculates marker genes for each cell type based on expression thresholds.
160
161        Perform differential gene expression (DEG) analysis on gene expression data.
162
163        The function compares groups of cells or samples (defined by `entities` or
164        `sets`) using the Mann–Whitney U test. It computes p-values, adjusted
165        p-values, fold changes, standardized effect sizes, and other statistics.
166
167
168        Parameters
169        ----------
170        min_exp : float, optional
171            Minimum expression level to consider a gene (default 0).
172
173        min_pct : float, optional
174            Minimum fraction of cells expressing a gene (default 0.05).
175
176        n_proc : int, optional
177            Number of parallel processes to use (default 10).
178
179        Notes
180        -----
181        The results are stored in the `cells_markers` attribute.
182        """
183
184        self.jdti.calculate_difference_markers(
185            min_exp=min_exp, min_pct=min_pct, n_proc=n_proc, force=True
186        )
187
188        self.cells_markers = self.jdti.var_data
189
190    def enrich_cells_fucntionality(
191        self, p_value=0.05, adj=True, log_fc=0.1, top_max=500
192    ):
193        """
194        Performs functional enrichment analysis for each cell type based on marker genes.
195
196        Parameters
197        ----------
198        p_value : float
199            Maximum p-value for significant genes (default 0.05).
200
201        adj : bool
202            If True, the adjusted p-values are used to determine significant genes.
203            Adjusted p-values are calculated using the Benjamini–Hochberg false
204            discovery rate (FDR) correction. If False, raw p-values are used instead.
205
206        log_fc : float
207            Minimum log fold-change threshold for marker genes (default 0.1).
208
209        top_max : int
210            Maximum number of top marker genes per cell type to consider (default 500).
211
212        Raises
213        ------
214        ValueError
215            If `cells_markers` is not defined.
216
217        Notes
218        -----
219        This method populates `cells_enrichment` with results for GO-TERM, KEGG, REACTOME,
220        STRING, IntAct, and specificity analyses.
221        """
222
223        if isinstance(self.cells_markers, pd.DataFrame):
224
225            markers = self.cells_markers
226            cells = set(markers["valid_group"])
227
228            data_dict = {}
229
230            max_c = len(cells)
231            for n, c in enumerate(cells):
232                print(f"\nAnalysis {n+1} of {max_c} cells --> {c} \n")
233
234                if adj:
235                    tmp = markers[
236                        (markers["valid_group"] == c)
237                        & (markers["adj_pval"] <= p_value)
238                        & (markers["log(FC)"] > log_fc)
239                    ]
240                    names = list(set(tmp["feature"]))
241
242                    tmp = tmp[tmp["feature"].isin(names)]
243
244                else:
245                    tmp = markers[
246                        (markers["valid_group"] == c)
247                        & (markers["p_val"] <= p_value)
248                        & (markers["log(FC)"] > log_fc)
249                    ]
250                    names = list(set(tmp["feature"]))
251
252                    tmp = tmp[tmp["feature"].isin(names)]
253
254                tmp = tmp.sort_values("esm", ascending=False).head(top_max)
255
256                if len(tmp.index) > 0:
257                    data_dict[c] = {}
258                    enr = copy.copy(self.enr_full_info)
259                    enr.genome = enr.genome[
260                        enr.genome["found_names"].isin(list(set(tmp["feature"])))
261                    ].reset_index(drop=True)
262                    enr.enriche_specificiti()
263                    enr.enriche_KEGG()
264                    enr.enriche_GOTERM()
265                    enr.enriche_REACTOME()
266                    enr.enriche_IntAct()
267                    enr.enriche_STRING()
268                    enr.enriche_specificiti()
269
270                    data = enr.get_results()
271                    del enr
272
273                    ans = Analysis(data)
274                    ans.gene_interaction()
275                    ans.features_specificity()
276                    ans.REACTOME_overrepresentation()
277                    ans.KEGG_overrepresentation()
278                    ans.GO_overrepresentation()
279                    ans.features_specificity()
280
281                    data_dict[c] = ans.get_full_results()
282                else:
283                    print(
284                        f"Cell {c} was not enriched. No specific markers were found in this dataset."
285                    )
286                    data_dict[c] = None
287
288            self.cells_enrichment = data_dict
289
290        else:
291            raise ValueError(
292                "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers."
293            )
294
295    def get_enrichment_data(
296        self,
297        data_type="GO-TERM",
298        p_value=0.05,
299        test="FISH",
300        adj="BH",
301        parent_inc=False,
302        top_n=50,
303    ):
304        """
305        Retrieves enrichment results for all cells in a unified DataFrame.
306
307        Parameters
308        ----------
309        data_type : str
310            Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').
311
312        p_value : float, optional
313            Maximum p-value threshold (default 0.05).
314
315        test : str, optional
316            Name of the statistical test column to use (default 'FISH').
317
318        adj : str, optional
319            P-value adjustment method (default 'BH').
320
321        parent_inc : bool, optional
322            Whether to include parent terms in the results (default False).
323
324        top_n : int, optional
325            Maximum number of terms per cell type to include (default 50).
326
327        Returns
328        -------
329        pd.DataFrame
330            DataFrame containing filtered enrichment results with a 'cell' column indicating cell type.
331
332        Raises
333        ------
334        ValueError
335            If `data_type` is not one of the expected values.
336        """
337
338        if not any(
339            x in data_type for x in ("GO-TERM", "KEGG", "REACTOME", "specificity")
340        ):
341            raise ValueError(
342                "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'."
343            )
344
345        if data_type == "GO-TERM":
346            parent_col = "parent"
347
348        elif data_type == "KEGG":
349            parent_col = "2nd"
350
351        elif data_type == "REACTOME":
352            parent_col = "top_level"
353
354        elif data_type == "specificity":
355            parent_col = "None"
356
357        pdl = []
358        for i in self.cells_enrichment.keys():
359            if self.cells_enrichment[i] is None:
360                continue
361
362            print(i)
363            if data_type == "specificity":
364                tmp_dict = self.cells_enrichment[i]["statistics"][data_type]
365                tmp = []
366                for k in tmp_dict.keys():
367                    if k != "HPA_subcellular_location":
368                        tmp.append(pd.DataFrame(tmp_dict[k]))
369
370                tmp = pd.concat(tmp)
371
372            else:
373                tmp = pd.DataFrame(self.cells_enrichment[i]["statistics"][data_type])
374
375            cols = [x for x in tmp.columns if test in x and adj in x]
376            cols = sorted(cols, reverse=True)
377            if parent_inc is False:
378                cols = [x for x in cols if parent_col not in x.lower()]
379
380            mask = (tmp[cols] <= p_value).all(axis=1)
381            tmp = tmp.loc[mask]
382            tmp["cell"] = i
383            tmp = tmp.sort_values(by=["cell"] + cols, ascending=True)
384
385            pdl.append(tmp.head(top_n))
386
387        df = pd.concat(pdl)
388        df["source"] = data_type
389        df = df.reset_index(drop=True)
390
391        return df
392
393    def get_included_cells(self):
394        """
395        Returns the list of cell types included in the enrichment analysis.
396
397        Returns
398        -------
399        list
400            List of cell type names.
401
402        Example
403        -------
404        >>> self.get_included_cells()
405        ['CellType1', 'CellType2', ...]
406        """
407
408        cl = []
409        for i in self.cells_enrichment.keys():
410            print(i)
411            cl.append(i)
412
413        return cl
414
415    def get_gene_interactions(self, cell_name):
416        """
417        Retrieves gene or protein interaction data for a specific cell type.
418
419        Parameters
420        ----------
421        cell_name : str
422            Name of the cell type.
423
424        Returns
425        -------
426        pd.DataFrame
427            DataFrame containing interactions for the specified cell.
428
429        Example
430        -------
431        >>> self.get_gene_interactions('CellType1')
432        """
433
434        tmp = pd.DataFrame(
435            self.cells_enrichment[cell_name]["statistics"]["interactions"]
436        )
437
438        return tmp
439
440    def calculate_cell_connections(self):
441        """
442        Calculates cell-cell interaction connections based on gene/protein co-expression.
443
444        Notes
445        -----
446        Populates `cells_connection` with a DataFrame containing interactions between all pairs of cells.
447
448        Each row represents an interaction between two cells and the involved genes/proteins.
449
450        Raises
451        ------
452        ValueError
453            If `normalized_data` is not defined in the JDTI object.
454        """
455
456        if isinstance(self.jdti.normalized_data, pd.DataFrame):
457
458            cells = set(self.jdti.normalized_data.columns)
459
460            data_dict = {}
461
462            for c in tqdm(cells):
463
464                tmp = self.jdti.normalized_data.loc[:, c]
465                names = tmp.loc[
466                    tmp.select_dtypes(include="number").sum(axis=1) > 0
467                ].index.tolist()
468                names = list(set(names))
469
470                enr = copy.copy(self.enr_full_info)
471                enr.genome = enr.genome[
472                    enr.genome["found_names"].isin(names)
473                ].reset_index(drop=True)
474                enr.enriche_CellCon()
475                data = enr.get_results()
476                del enr
477
478                data_dict[c] = data["CellConnections"]
479
480            full_data = []
481            for c1 in tqdm(cells):
482                for c2 in cells:
483                    if c1 != c2:
484                        c1_d = pd.DataFrame(data_dict[c1]["interactor2"])
485                        c2_d = pd.DataFrame(data_dict[c2]["interactor1"])
486
487                        mutual_lr = c1_d["interaction"][
488                            c1_d["interaction"].isin(list(c2_d["interaction"]))
489                        ]
490
491                        to_ret = (
492                            c1_d[c1_d["interaction"].isin(list(mutual_lr))]
493                            .drop(
494                                [
495                                    "Species",
496                                    "protein_id_1",
497                                    "protein_id_2",
498                                    "found_names_2",
499                                ],
500                                axis=1,
501                            )
502                            .reset_index(drop=True)
503                        )
504
505                        to_ret = to_ret.rename(columns={"found_names_1": "interactor1"})
506                        c2_subset = c2_d[["interaction", "found_names_2"]].rename(
507                            columns={"found_names_2": "interactor2"}
508                        )
509
510                        to_ret = to_ret.merge(c2_subset, on="interaction", how="left")
511                        to_ret["cell1"] = c1
512                        to_ret["cell2"] = c2
513
514                        full_data.append(to_ret)
515
516            self.cells_connection = pd.concat(full_data)
517
518        else:
519            raise ValueError(
520                "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers."
521            )
522
523    def get_cell_connections(
524        self,
525        connection_type: list = [
526            "Adhesion-Adhesion",
527            "Gap-Gap",
528            "Ligand-Ligand",
529            "Ligand-Receptor",
530            "Receptor-Receptor",
531            "Undefined",
532        ],
533    ):
534        """
535        Returns the calculated cell-cell interaction connections.
536
537        Parameters
538        ----------
539        connection_type : list of str, optional
540            List of interaction types used to filter the returned cell–cell connections
541            based on the molecular directionality of the interaction. Possible values:
542
543            - "Adhesion-Adhesion" : interaction between two adhesion molecules.
544            - "Gap-Gap" : connection through gap junction proteins.
545            - "Ligand-Ligand" : interaction between two ligand molecules.
546            - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
547            - "Receptor-Receptor" : interaction between two receptor molecules.
548            - "Undefined" : interactions where the directionality could not be determined.
549
550            By default, all interaction types are included.
551
552        Returns
553        -------
554        pd.DataFrame
555            DataFrame containing cell-cell interactions.
556
557        Example
558        -------
559        >>> connections = self.get_cell_connections()
560        """
561
562        tmp = self.cells_connection
563
564        tmp["directionality"] = [
565            x if x is not None else "Undefined" for x in tmp["directionality"]
566        ]
567
568        tmp = tmp[tmp["directionality"].isin(connection_type)]
569
570        return tmp
571
572
573def compare_connections(
574    instances_dict: dict,
575    cells_compartment: dict | None = None,
576    connection_type: list = [
577        "Adhesion-Adhesion",
578        "Gap-Gap",
579        "Ligand-Ligand",
580        "Ligand-Receptor",
581        "Receptor-Receptor",
582        "Undefined",
583    ],
584):
585    """
586    Compare gene expression between two instances based on their cell connections.
587
588    This function compares normalized gene expression data from exactly two
589    instances stored in ``instances_dict``. Optionally, the comparison can be
590    restricted to specific cell compartments for each instance. Differential
591    expression analysis is performed using ``jdti.calc_DEG``.
592
593    Parameters
594    ----------
595    instances_dict : dict
596        Dictionary containing exactly two objects. Each object must have:
597
598        - ``jdti.normalized_data`` : pandas.DataFrame
599            Gene expression matrix with genes as rows and cells as columns.
600
601        - ``cells_connection`` : pandas.DataFrame
602            DataFrame containing at least the columns ``'interactor1'`` and
603            ``'interactor2'``.
604
605        The dictionary keys are used as group labels in the comparison.
606
607    cells_compartment : dict or None, optional
608        Dictionary mapping each key in ``instances_dict`` to a list of cell names
609        to be used for the comparison. If ``None``, all cells are used and genes
610        are filtered based on cell–cell connections.
611
612    connection_type : list of str, optional
613        List of interaction types used to filter cell–cell connections that are
614        considered in the gene expression comparison. Only connections with the
615        specified molecular interaction types will be used to define interacting
616        cells between the two instances.
617
618        Possible values:
619
620        - "Adhesion-Adhesion" : interaction between two adhesion molecules.
621        - "Gap-Gap" : connection mediated by gap junction proteins.
622        - "Ligand-Ligand" : interaction between two ligand molecules.
623        - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
624        - "Receptor-Receptor" : interaction between two receptor molecules.
625        - "Undefined" : interactions where the directionality could not be determined.
626
627        By default, all interaction types are included in the comparison.
628
629    Returns
630    -------
631    pandas.DataFrame
632        Differential expression results returned by ``calc_DEG``, filtered to
633        include only rows where ``valid_group`` matches the first key in
634        ``instances_dict``.
635
636    Raises
637    ------
638    ValueError
639        If any cell specified in ``cells_compartment`` is not present in the
640        corresponding ``normalized_data`` columns.
641
642    Notes
643    -----
644    - Only genes common to both instances are considered.
645
646    - When ``cells_compartment`` is ``None``, genes are further restricted to
647      those appearing in the cell–cell interaction networks of either instance.
648
649    - The function assumes exactly two entries in ``instances_dict``.
650
651    - Differential expression is computed with ``min_exp=0`` and ``min_pct=0.1``.
652
653    See Also
654    --------
655    jdti.calc_DEG : Function used to compute differential expression.
656    """
657
658    import pandas as pd
659    from jdti import calc_DEG
660
661    if isinstance(cells_compartment, dict):
662
663        keys_list = list(instances_dict.keys())
664        tmp1 = instances_dict[keys_list[0]].jdti.normalized_data.copy()
665        cells = cells_compartment[keys_list[0]]
666        if any(cell not in tmp1.columns for cell in cells):
667            raise ValueError(
668                'Any of {keys_list[0]} cells in dictionary "cells_compartment" do not occur!'
669            )
670        tmp1 = tmp1.loc[:, cells]
671        tmp1.columns = [keys_list[0]] * len(tmp1.columns)
672
673        tmp2 = instances_dict[keys_list[1]].jdti.normalized_data.copy()
674        cells = cells_compartment[keys_list[1]]
675        if any(cell not in tmp2.columns for cell in cells):
676            raise ValueError(
677                'Any of {keys_list[1]} cells in dictionary "cells_compartment" do not occur!'
678            )
679        tmp2 = tmp2.loc[:, cells]
680        tmp2.columns = [keys_list[1]] * len(tmp2.columns)
681
682        common_idx = tmp1.index.intersection(tmp2.index)
683
684        tmp1 = tmp1.loc[common_idx]
685        tmp2 = tmp2.loc[common_idx]
686
687        concat_df = pd.concat([tmp1, tmp2], axis=1)
688
689    else:
690
691        keys_list = list(instances_dict.keys())
692        tmp1 = instances_dict[keys_list[0]].jdti.normalized_data.copy()
693        tmp1.columns = [keys_list[0]] * len(tmp1.columns)
694
695        tmp2 = instances_dict[keys_list[1]].jdti.normalized_data.copy()
696        tmp2.columns = [keys_list[1]] * len(tmp2.columns)
697
698        common_idx = tmp1.index.intersection(tmp2.index)
699
700        tmp1 = tmp1.loc[common_idx]
701        tmp2 = tmp2.loc[common_idx]
702
703        concat_df = pd.concat([tmp1, tmp2], axis=1)
704
705    tmp_df_1 = instances_dict[keys_list[0]].cells_connection
706    tmp_df_2 = instances_dict[keys_list[1]].cells_connection
707
708    tmp_df_1["directionality"] = [
709        x if x is not None else "Undefined" for x in tmp_df_1["directionality"]
710    ]
711    tmp_df_2["directionality"] = [
712        x if x is not None else "Undefined" for x in tmp_df_2["directionality"]
713    ]
714
715    tmp_df_1 = tmp_df_1[tmp_df_1["directionality"].isin(connection_type)]
716    tmp_df_2 = tmp_df_2[tmp_df_2["directionality"].isin(connection_type)]
717
718    tmp_con1 = list(set(list(tmp_df_1["interactor1"]) + list(tmp_df_1["interactor2"])))
719
720    tmp_con2 = list(set(list(tmp_df_2["interactor1"]) + list(tmp_df_2["interactor2"])))
721
722    genes = list(set(tmp_con1 + tmp_con2))
723
724    genes2 = [x for x in genes if x in common_idx]
725
726    concat_df = concat_df.loc[genes2, :]
727
728    results = calc_DEG(
729        data=concat_df,
730        metadata_list=None,
731        entities="All",
732        sets=None,
733        min_exp=0,
734        min_pct=0,
735        n_proc=10,
736    )
737
738    results = results[results["valid_group"] == keys_list[0]]
739
740    return results

class CellFunCon: View Source

 19class CellFunCon:
 20    """
 21    A class to perform cell-type functional analysis and enrichment based on a JDtI-COMPsc objects.
 22
 23    This class provides methods to calculate marker genes for cell types, perform functional enrichment
 24    (GO, KEGG, REACTOME, STRING, IntAct), and compute cell-cell interaction networks.
 25    Projects can also be saved and loaded via pickle.
 26
 27    Attributes
 28    ----------
 29    jdti : object
 30        JDtI-COMPsc object containing normalized single-cell data.
 31
 32    cells_markers : pd.DataFrame or None
 33        DataFrame containing marker genes per cell type after calculation.
 34
 35    enr_full_info : Enrichment
 36        Enrichment object containing all genes available for enrichment analysis.
 37
 38    cells_enrichment : dict or None
 39        Dictionary storing enrichment results per cell type.
 40
 41    cells_connection : pd.DataFrame or None
 42        DataFrame storing calculated cell-cell interaction information.
 43
 44    mt_genes : bool
 45        Whether mitochondrial genes are included (default False).
 46
 47    ribo_genes : bool
 48        Whether ribosomal genes are included (default False).
 49    """
 50
 51    def __init__(self, jdti_object, mt_genes=False, ribo_genes=False):
 52        """
 53        Initializes the CellFunCon object with a COMPsc/JDTI object.
 54
 55        Parameters
 56        ----------
 57        jdti_object : object
 58            A COMPsc or JDTI object with normalized single-cell data.
 59
 60        mt_genes : bool
 61            Whether mitochondrial genes are included (default False).
 62
 63        ribo_genes : bool
 64            Whether ribosomal genes are included (default False).
 65        """
 66
 67        self.jdti = jdti_object
 68        """JDtI-COMPsc object containing normalized single-cell data."""
 69
 70        self.cells_markers = None
 71        """DataFrame containing marker genes per cell type after calculation."""
 72
 73        self.cells_connection = None
 74        """DataFrame storing calculated cell-cell interaction information."""
 75
 76        self.cells_enrichment = None
 77        """Dictionary storing enrichment results per cell type."""
 78
 79        self.mt_genes = mt_genes
 80        """Whether mitochondrial genes are included (default False)."""
 81
 82        self.ribo_genes = ribo_genes
 83        """Whether ribosomal genes are included (default False)."""
 84
 85        names = self.jdti.normalized_data.loc[
 86            self.jdti.normalized_data.select_dtypes(include="number").sum(axis=1) > 0
 87        ].index.tolist()
 88        names = list(set(names))
 89
 90        if self.mt_genes is False:
 91            names = [x for x in names if "MT-" not in x.upper()]
 92        if self.ribo_genes is False:
 93            names = [x for x in names if "RPS" != x[:3].upper()]
 94            names = [x for x in names if "RPL" != x[:3].upper()]
 95
 96        enr = Enrichment()
 97        enr.select_features(names)
 98
 99        self.enr_full_info = enr
100        """Enrichment object containing all genes available for enrichment analysis."""
101
102    def save_project(self, filename):
103        """
104        Saves the current CellFunCon project as a pickle file.
105
106        Parameters
107        ----------
108        filename : str
109            Path to save the project (e.g., 'project_name').
110
111        Example
112        -------
113        >>> self.save_project('my_project')
114        """
115
116        with open(f"{filename}.psc", "wb") as f:
117            pickle.dump(self, f)
118        print(f"Project saved as {filename}")
119
120    @classmethod
121    def load_project(cls, filename):
122        """
123        Loads a previously saved CellFunCon project from a pickle file.
124
125        Parameters
126        ----------
127        filename : str
128            Path to the saved pickle file.
129
130        Returns
131        -------
132        CellFunCon
133            Loaded CellFunCon self.
134
135        Raises
136        ------
137        TypeError
138            If the loaded object is not a CellFunCon self.
139
140        ValueError
141            If the file is not a valid CellFunCon project file.
142
143        Example
144        -------
145        >>> self = CellFunCon.load_project('my_project.psc')
146        """
147
148        if ".psc" in filename:
149            with open(filename, "rb") as f:
150                obj = pickle.load(f)
151            if not isinstance(obj, cls):
152                raise TypeError("File does not include project.psc")
153            print(f"Project loaded from {filename}")
154            return obj
155        else:
156            raise ValueError("Project not belong to CellFunCon project data.")
157
158    def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10):
159        """
160        Calculates marker genes for each cell type based on expression thresholds.
161
162        Perform differential gene expression (DEG) analysis on gene expression data.
163
164        The function compares groups of cells or samples (defined by `entities` or
165        `sets`) using the Mann–Whitney U test. It computes p-values, adjusted
166        p-values, fold changes, standardized effect sizes, and other statistics.
167
168
169        Parameters
170        ----------
171        min_exp : float, optional
172            Minimum expression level to consider a gene (default 0).
173
174        min_pct : float, optional
175            Minimum fraction of cells expressing a gene (default 0.05).
176
177        n_proc : int, optional
178            Number of parallel processes to use (default 10).
179
180        Notes
181        -----
182        The results are stored in the `cells_markers` attribute.
183        """
184
185        self.jdti.calculate_difference_markers(
186            min_exp=min_exp, min_pct=min_pct, n_proc=n_proc, force=True
187        )
188
189        self.cells_markers = self.jdti.var_data
190
191    def enrich_cells_fucntionality(
192        self, p_value=0.05, adj=True, log_fc=0.1, top_max=500
193    ):
194        """
195        Performs functional enrichment analysis for each cell type based on marker genes.
196
197        Parameters
198        ----------
199        p_value : float
200            Maximum p-value for significant genes (default 0.05).
201
202        adj : bool
203            If True, the adjusted p-values are used to determine significant genes.
204            Adjusted p-values are calculated using the Benjamini–Hochberg false
205            discovery rate (FDR) correction. If False, raw p-values are used instead.
206
207        log_fc : float
208            Minimum log fold-change threshold for marker genes (default 0.1).
209
210        top_max : int
211            Maximum number of top marker genes per cell type to consider (default 500).
212
213        Raises
214        ------
215        ValueError
216            If `cells_markers` is not defined.
217
218        Notes
219        -----
220        This method populates `cells_enrichment` with results for GO-TERM, KEGG, REACTOME,
221        STRING, IntAct, and specificity analyses.
222        """
223
224        if isinstance(self.cells_markers, pd.DataFrame):
225
226            markers = self.cells_markers
227            cells = set(markers["valid_group"])
228
229            data_dict = {}
230
231            max_c = len(cells)
232            for n, c in enumerate(cells):
233                print(f"\nAnalysis {n+1} of {max_c} cells --> {c} \n")
234
235                if adj:
236                    tmp = markers[
237                        (markers["valid_group"] == c)
238                        & (markers["adj_pval"] <= p_value)
239                        & (markers["log(FC)"] > log_fc)
240                    ]
241                    names = list(set(tmp["feature"]))
242
243                    tmp = tmp[tmp["feature"].isin(names)]
244
245                else:
246                    tmp = markers[
247                        (markers["valid_group"] == c)
248                        & (markers["p_val"] <= p_value)
249                        & (markers["log(FC)"] > log_fc)
250                    ]
251                    names = list(set(tmp["feature"]))
252
253                    tmp = tmp[tmp["feature"].isin(names)]
254
255                tmp = tmp.sort_values("esm", ascending=False).head(top_max)
256
257                if len(tmp.index) > 0:
258                    data_dict[c] = {}
259                    enr = copy.copy(self.enr_full_info)
260                    enr.genome = enr.genome[
261                        enr.genome["found_names"].isin(list(set(tmp["feature"])))
262                    ].reset_index(drop=True)
263                    enr.enriche_specificiti()
264                    enr.enriche_KEGG()
265                    enr.enriche_GOTERM()
266                    enr.enriche_REACTOME()
267                    enr.enriche_IntAct()
268                    enr.enriche_STRING()
269                    enr.enriche_specificiti()
270
271                    data = enr.get_results()
272                    del enr
273
274                    ans = Analysis(data)
275                    ans.gene_interaction()
276                    ans.features_specificity()
277                    ans.REACTOME_overrepresentation()
278                    ans.KEGG_overrepresentation()
279                    ans.GO_overrepresentation()
280                    ans.features_specificity()
281
282                    data_dict[c] = ans.get_full_results()
283                else:
284                    print(
285                        f"Cell {c} was not enriched. No specific markers were found in this dataset."
286                    )
287                    data_dict[c] = None
288
289            self.cells_enrichment = data_dict
290
291        else:
292            raise ValueError(
293                "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers."
294            )
295
296    def get_enrichment_data(
297        self,
298        data_type="GO-TERM",
299        p_value=0.05,
300        test="FISH",
301        adj="BH",
302        parent_inc=False,
303        top_n=50,
304    ):
305        """
306        Retrieves enrichment results for all cells in a unified DataFrame.
307
308        Parameters
309        ----------
310        data_type : str
311            Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').
312
313        p_value : float, optional
314            Maximum p-value threshold (default 0.05).
315
316        test : str, optional
317            Name of the statistical test column to use (default 'FISH').
318
319        adj : str, optional
320            P-value adjustment method (default 'BH').
321
322        parent_inc : bool, optional
323            Whether to include parent terms in the results (default False).
324
325        top_n : int, optional
326            Maximum number of terms per cell type to include (default 50).
327
328        Returns
329        -------
330        pd.DataFrame
331            DataFrame containing filtered enrichment results with a 'cell' column indicating cell type.
332
333        Raises
334        ------
335        ValueError
336            If `data_type` is not one of the expected values.
337        """
338
339        if not any(
340            x in data_type for x in ("GO-TERM", "KEGG", "REACTOME", "specificity")
341        ):
342            raise ValueError(
343                "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'."
344            )
345
346        if data_type == "GO-TERM":
347            parent_col = "parent"
348
349        elif data_type == "KEGG":
350            parent_col = "2nd"
351
352        elif data_type == "REACTOME":
353            parent_col = "top_level"
354
355        elif data_type == "specificity":
356            parent_col = "None"
357
358        pdl = []
359        for i in self.cells_enrichment.keys():
360            if self.cells_enrichment[i] is None:
361                continue
362
363            print(i)
364            if data_type == "specificity":
365                tmp_dict = self.cells_enrichment[i]["statistics"][data_type]
366                tmp = []
367                for k in tmp_dict.keys():
368                    if k != "HPA_subcellular_location":
369                        tmp.append(pd.DataFrame(tmp_dict[k]))
370
371                tmp = pd.concat(tmp)
372
373            else:
374                tmp = pd.DataFrame(self.cells_enrichment[i]["statistics"][data_type])
375
376            cols = [x for x in tmp.columns if test in x and adj in x]
377            cols = sorted(cols, reverse=True)
378            if parent_inc is False:
379                cols = [x for x in cols if parent_col not in x.lower()]
380
381            mask = (tmp[cols] <= p_value).all(axis=1)
382            tmp = tmp.loc[mask]
383            tmp["cell"] = i
384            tmp = tmp.sort_values(by=["cell"] + cols, ascending=True)
385
386            pdl.append(tmp.head(top_n))
387
388        df = pd.concat(pdl)
389        df["source"] = data_type
390        df = df.reset_index(drop=True)
391
392        return df
393
394    def get_included_cells(self):
395        """
396        Returns the list of cell types included in the enrichment analysis.
397
398        Returns
399        -------
400        list
401            List of cell type names.
402
403        Example
404        -------
405        >>> self.get_included_cells()
406        ['CellType1', 'CellType2', ...]
407        """
408
409        cl = []
410        for i in self.cells_enrichment.keys():
411            print(i)
412            cl.append(i)
413
414        return cl
415
416    def get_gene_interactions(self, cell_name):
417        """
418        Retrieves gene or protein interaction data for a specific cell type.
419
420        Parameters
421        ----------
422        cell_name : str
423            Name of the cell type.
424
425        Returns
426        -------
427        pd.DataFrame
428            DataFrame containing interactions for the specified cell.
429
430        Example
431        -------
432        >>> self.get_gene_interactions('CellType1')
433        """
434
435        tmp = pd.DataFrame(
436            self.cells_enrichment[cell_name]["statistics"]["interactions"]
437        )
438
439        return tmp
440
441    def calculate_cell_connections(self):
442        """
443        Calculates cell-cell interaction connections based on gene/protein co-expression.
444
445        Notes
446        -----
447        Populates `cells_connection` with a DataFrame containing interactions between all pairs of cells.
448
449        Each row represents an interaction between two cells and the involved genes/proteins.
450
451        Raises
452        ------
453        ValueError
454            If `normalized_data` is not defined in the JDTI object.
455        """
456
457        if isinstance(self.jdti.normalized_data, pd.DataFrame):
458
459            cells = set(self.jdti.normalized_data.columns)
460
461            data_dict = {}
462
463            for c in tqdm(cells):
464
465                tmp = self.jdti.normalized_data.loc[:, c]
466                names = tmp.loc[
467                    tmp.select_dtypes(include="number").sum(axis=1) > 0
468                ].index.tolist()
469                names = list(set(names))
470
471                enr = copy.copy(self.enr_full_info)
472                enr.genome = enr.genome[
473                    enr.genome["found_names"].isin(names)
474                ].reset_index(drop=True)
475                enr.enriche_CellCon()
476                data = enr.get_results()
477                del enr
478
479                data_dict[c] = data["CellConnections"]
480
481            full_data = []
482            for c1 in tqdm(cells):
483                for c2 in cells:
484                    if c1 != c2:
485                        c1_d = pd.DataFrame(data_dict[c1]["interactor2"])
486                        c2_d = pd.DataFrame(data_dict[c2]["interactor1"])
487
488                        mutual_lr = c1_d["interaction"][
489                            c1_d["interaction"].isin(list(c2_d["interaction"]))
490                        ]
491
492                        to_ret = (
493                            c1_d[c1_d["interaction"].isin(list(mutual_lr))]
494                            .drop(
495                                [
496                                    "Species",
497                                    "protein_id_1",
498                                    "protein_id_2",
499                                    "found_names_2",
500                                ],
501                                axis=1,
502                            )
503                            .reset_index(drop=True)
504                        )
505
506                        to_ret = to_ret.rename(columns={"found_names_1": "interactor1"})
507                        c2_subset = c2_d[["interaction", "found_names_2"]].rename(
508                            columns={"found_names_2": "interactor2"}
509                        )
510
511                        to_ret = to_ret.merge(c2_subset, on="interaction", how="left")
512                        to_ret["cell1"] = c1
513                        to_ret["cell2"] = c2
514
515                        full_data.append(to_ret)
516
517            self.cells_connection = pd.concat(full_data)
518
519        else:
520            raise ValueError(
521                "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers."
522            )
523
524    def get_cell_connections(
525        self,
526        connection_type: list = [
527            "Adhesion-Adhesion",
528            "Gap-Gap",
529            "Ligand-Ligand",
530            "Ligand-Receptor",
531            "Receptor-Receptor",
532            "Undefined",
533        ],
534    ):
535        """
536        Returns the calculated cell-cell interaction connections.
537
538        Parameters
539        ----------
540        connection_type : list of str, optional
541            List of interaction types used to filter the returned cell–cell connections
542            based on the molecular directionality of the interaction. Possible values:
543
544            - "Adhesion-Adhesion" : interaction between two adhesion molecules.
545            - "Gap-Gap" : connection through gap junction proteins.
546            - "Ligand-Ligand" : interaction between two ligand molecules.
547            - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
548            - "Receptor-Receptor" : interaction between two receptor molecules.
549            - "Undefined" : interactions where the directionality could not be determined.
550
551            By default, all interaction types are included.
552
553        Returns
554        -------
555        pd.DataFrame
556            DataFrame containing cell-cell interactions.
557
558        Example
559        -------
560        >>> connections = self.get_cell_connections()
561        """
562
563        tmp = self.cells_connection
564
565        tmp["directionality"] = [
566            x if x is not None else "Undefined" for x in tmp["directionality"]
567        ]
568
569        tmp = tmp[tmp["directionality"].isin(connection_type)]
570
571        return tmp

A class to perform cell-type functional analysis and enrichment based on JDtI-COMPsc objects.

This class provides methods to calculate marker genes for cell types, perform functional enrichment (GO, KEGG, REACTOME, STRING, IntAct), and compute cell-cell interaction networks. Projects can also be saved and loaded via pickle.

Attributes

jdti : object JDtI-COMPsc object containing normalized single-cell data.

cells_markers : pd.DataFrame or None DataFrame containing marker genes per cell type after calculation.

enr_full_info : Enrichment Enrichment object containing all genes available for enrichment analysis.

cells_enrichment : dict or None Dictionary storing enrichment results per cell type.

cells_connection : pd.DataFrame or None DataFrame storing calculated cell-cell interaction information.

mt_genes : bool Whether mitochondrial genes are included (default False).

ribo_genes : bool Whether ribosomal genes are included (default False).

CellFunCon(jdti_object, mt_genes=False, ribo_genes=False) View Source

 51    def __init__(self, jdti_object, mt_genes=False, ribo_genes=False):
 52        """
 53        Initializes the CellFunCon object with a COMPsc/JDTI object.
 54
 55        Parameters
 56        ----------
 57        jdti_object : object
 58            A COMPsc or JDTI object with normalized single-cell data.
 59
 60        mt_genes : bool
 61            Whether mitochondrial genes are included (default False).
 62
 63        ribo_genes : bool
 64            Whether ribosomal genes are included (default False).
 65        """
 66
 67        self.jdti = jdti_object
 68        """JDtI-COMPsc object containing normalized single-cell data."""
 69
 70        self.cells_markers = None
 71        """DataFrame containing marker genes per cell type after calculation."""
 72
 73        self.cells_connection = None
 74        """DataFrame storing calculated cell-cell interaction information."""
 75
 76        self.cells_enrichment = None
 77        """Dictionary storing enrichment results per cell type."""
 78
 79        self.mt_genes = mt_genes
 80        """Whether mitochondrial genes are included (default False)."""
 81
 82        self.ribo_genes = ribo_genes
 83        """Whether ribosomal genes are included (default False)."""
 84
 85        names = self.jdti.normalized_data.loc[
 86            self.jdti.normalized_data.select_dtypes(include="number").sum(axis=1) > 0
 87        ].index.tolist()
 88        names = list(set(names))
 89
 90        if self.mt_genes is False:
 91            names = [x for x in names if "MT-" not in x.upper()]
 92        if self.ribo_genes is False:
 93            names = [x for x in names if "RPS" != x[:3].upper()]
 94            names = [x for x in names if "RPL" != x[:3].upper()]
 95
 96        enr = Enrichment()
 97        enr.select_features(names)
 98
 99        self.enr_full_info = enr
100        """Enrichment object containing all genes available for enrichment analysis."""

Initializes the CellFunCon object with a COMPsc/JDTI object.

Parameters

jdti_object : object A COMPsc or JDTI object with normalized single-cell data.

mt_genes : bool Whether mitochondrial genes are included (default False).

ribo_genes : bool Whether ribosomal genes are included (default False).

jdti

JDtI-COMPsc object containing normalized single-cell data.

cells_markers

DataFrame containing marker genes per cell type after calculation.

cells_connection

DataFrame storing calculated cell-cell interaction information.

cells_enrichment

Dictionary storing enrichment results per cell type.

mt_genes

Whether mitochondrial genes are included (default False).

ribo_genes

Whether ribosomal genes are included (default False).

enr_full_info

Enrichment object containing all genes available for enrichment analysis.

def save_project(self, filename): View Source

102    def save_project(self, filename):
103        """
104        Saves the current CellFunCon project as a pickle file.
105
106        Parameters
107        ----------
108        filename : str
109            Path to save the project (e.g., 'project_name').
110
111        Example
112        -------
113        >>> self.save_project('my_project')
114        """
115
116        with open(f"{filename}.psc", "wb") as f:
117            pickle.dump(self, f)
118        print(f"Project saved as {filename}")

Saves the current CellFunCon project as a pickle file.

Parameters

filename : str Path to save the project (e.g., 'project_name').

Example

>>> self.save_project('my_project')

@classmethod

def load_project(cls, filename): View Source

120    @classmethod
121    def load_project(cls, filename):
122        """
123        Loads a previously saved CellFunCon project from a pickle file.
124
125        Parameters
126        ----------
127        filename : str
128            Path to the saved pickle file.
129
130        Returns
131        -------
132        CellFunCon
133            Loaded CellFunCon self.
134
135        Raises
136        ------
137        TypeError
138            If the loaded object is not a CellFunCon self.
139
140        ValueError
141            If the file is not a valid CellFunCon project file.
142
143        Example
144        -------
145        >>> self = CellFunCon.load_project('my_project.psc')
146        """
147
148        if ".psc" in filename:
149            with open(filename, "rb") as f:
150                obj = pickle.load(f)
151            if not isinstance(obj, cls):
152                raise TypeError("File does not include project.psc")
153            print(f"Project loaded from {filename}")
154            return obj
155        else:
156            raise ValueError("Project not belong to CellFunCon project data.")

Loads a previously saved CellFunCon project from a pickle file.

Parameters

filename : str Path to the saved pickle file.

Returns

CellFunCon Loaded CellFunCon instance.

Raises

TypeError If the loaded object is not a CellFunCon instance.

ValueError If the file is not a valid CellFunCon project file.

Example

>>> self = CellFunCon.load_project('my_project.psc')

def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10): View Source

158    def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10):
159        """
160        Calculates marker genes for each cell type based on expression thresholds.
161
162        Perform differential gene expression (DEG) analysis on gene expression data.
163
164        The function compares groups of cells or samples (defined by `entities` or
165        `sets`) using the Mann–Whitney U test. It computes p-values, adjusted
166        p-values, fold changes, standardized effect sizes, and other statistics.
167
168
169        Parameters
170        ----------
171        min_exp : float, optional
172            Minimum expression level to consider a gene (default 0).
173
174        min_pct : float, optional
175            Minimum fraction of cells expressing a gene (default 0.05).
176
177        n_proc : int, optional
178            Number of parallel processes to use (default 10).
179
180        Notes
181        -----
182        The results are stored in the `cells_markers` attribute.
183        """
184
185        self.jdti.calculate_difference_markers(
186            min_exp=min_exp, min_pct=min_pct, n_proc=n_proc, force=True
187        )
188
189        self.cells_markers = self.jdti.var_data

Calculates marker genes for each cell type based on expression thresholds.

Perform differential gene expression (DEG) analysis on gene expression data.

The function compares groups of cells or samples (defined by entities or sets) using the Mann–Whitney U test. It computes p-values, adjusted p-values, fold changes, standardized effect sizes, and other statistics.

Parameters

min_exp : float, optional Minimum expression level to consider a gene (default 0).

min_pct : float, optional Minimum fraction of cells expressing a gene (default 0.05).

n_proc : int, optional Number of parallel processes to use (default 10).

Notes

The results are stored in the cells_markers attribute.

def enrich_cells_fucntionality(self, p_value=0.05, adj=True, log_fc=0.1, top_max=500): View Source

191    def enrich_cells_fucntionality(
192        self, p_value=0.05, adj=True, log_fc=0.1, top_max=500
193    ):
194        """
195        Performs functional enrichment analysis for each cell type based on marker genes.
196
197        Parameters
198        ----------
199        p_value : float
200            Maximum p-value for significant genes (default 0.05).
201
202        adj : bool
203            If True, the adjusted p-values are used to determine significant genes.
204            Adjusted p-values are calculated using the Benjamini–Hochberg false
205            discovery rate (FDR) correction. If False, raw p-values are used instead.
206
207        log_fc : float
208            Minimum log fold-change threshold for marker genes (default 0.1).
209
210        top_max : int
211            Maximum number of top marker genes per cell type to consider (default 500).
212
213        Raises
214        ------
215        ValueError
216            If `cells_markers` is not defined.
217
218        Notes
219        -----
220        This method populates `cells_enrichment` with results for GO-TERM, KEGG, REACTOME,
221        STRING, IntAct, and specificity analyses.
222        """
223
224        if isinstance(self.cells_markers, pd.DataFrame):
225
226            markers = self.cells_markers
227            cells = set(markers["valid_group"])
228
229            data_dict = {}
230
231            max_c = len(cells)
232            for n, c in enumerate(cells):
233                print(f"\nAnalysis {n+1} of {max_c} cells --> {c} \n")
234
235                if adj:
236                    tmp = markers[
237                        (markers["valid_group"] == c)
238                        & (markers["adj_pval"] <= p_value)
239                        & (markers["log(FC)"] > log_fc)
240                    ]
241                    names = list(set(tmp["feature"]))
242
243                    tmp = tmp[tmp["feature"].isin(names)]
244
245                else:
246                    tmp = markers[
247                        (markers["valid_group"] == c)
248                        & (markers["p_val"] <= p_value)
249                        & (markers["log(FC)"] > log_fc)
250                    ]
251                    names = list(set(tmp["feature"]))
252
253                    tmp = tmp[tmp["feature"].isin(names)]
254
255                tmp = tmp.sort_values("esm", ascending=False).head(top_max)
256
257                if len(tmp.index) > 0:
258                    data_dict[c] = {}
259                    enr = copy.copy(self.enr_full_info)
260                    enr.genome = enr.genome[
261                        enr.genome["found_names"].isin(list(set(tmp["feature"])))
262                    ].reset_index(drop=True)
263                    enr.enriche_specificiti()
264                    enr.enriche_KEGG()
265                    enr.enriche_GOTERM()
266                    enr.enriche_REACTOME()
267                    enr.enriche_IntAct()
268                    enr.enriche_STRING()
269                    enr.enriche_specificiti()
270
271                    data = enr.get_results()
272                    del enr
273
274                    ans = Analysis(data)
275                    ans.gene_interaction()
276                    ans.features_specificity()
277                    ans.REACTOME_overrepresentation()
278                    ans.KEGG_overrepresentation()
279                    ans.GO_overrepresentation()
280                    ans.features_specificity()
281
282                    data_dict[c] = ans.get_full_results()
283                else:
284                    print(
285                        f"Cell {c} was not enriched. No specific markers were found in this dataset."
286                    )
287                    data_dict[c] = None
288
289            self.cells_enrichment = data_dict
290
291        else:
292            raise ValueError(
293                "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers."
294            )

Performs functional enrichment analysis for each cell type based on marker genes.

Parameters

p_value : float Maximum p-value for significant genes (default 0.05).

adj : bool If True, the adjusted p-values are used to determine significant genes. Adjusted p-values are calculated using the Benjamini–Hochberg false discovery rate (FDR) correction. If False, raw p-values are used instead.

log_fc : float Minimum log fold-change threshold for marker genes (default 0.1).

top_max : int Maximum number of top marker genes per cell type to consider (default 500).

Raises

ValueError If cells_markers is not defined.

Notes

This method populates cells_enrichment with results for GO-TERM, KEGG, REACTOME, STRING, IntAct, and specificity analyses.

def get_enrichment_data( self, data_type='GO-TERM', p_value=0.05, test='FISH', adj='BH', parent_inc=False, top_n=50): View Source

296    def get_enrichment_data(
297        self,
298        data_type="GO-TERM",
299        p_value=0.05,
300        test="FISH",
301        adj="BH",
302        parent_inc=False,
303        top_n=50,
304    ):
305        """
306        Retrieves enrichment results for all cells in a unified DataFrame.
307
308        Parameters
309        ----------
310        data_type : str
311            Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').
312
313        p_value : float, optional
314            Maximum p-value threshold (default 0.05).
315
316        test : str, optional
317            Name of the statistical test column to use (default 'FISH').
318
319        adj : str, optional
320            P-value adjustment method (default 'BH').
321
322        parent_inc : bool, optional
323            Whether to include parent terms in the results (default False).
324
325        top_n : int, optional
326            Maximum number of terms per cell type to include (default 50).
327
328        Returns
329        -------
330        pd.DataFrame
331            DataFrame containing filtered enrichment results with a 'cell' column indicating cell type.
332
333        Raises
334        ------
335        ValueError
336            If `data_type` is not one of the expected values.
337        """
338
339        if not any(
340            x in data_type for x in ("GO-TERM", "KEGG", "REACTOME", "specificity")
341        ):
342            raise ValueError(
343                "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'."
344            )
345
346        if data_type == "GO-TERM":
347            parent_col = "parent"
348
349        elif data_type == "KEGG":
350            parent_col = "2nd"
351
352        elif data_type == "REACTOME":
353            parent_col = "top_level"
354
355        elif data_type == "specificity":
356            parent_col = "None"
357
358        pdl = []
359        for i in self.cells_enrichment.keys():
360            if self.cells_enrichment[i] is None:
361                continue
362
363            print(i)
364            if data_type == "specificity":
365                tmp_dict = self.cells_enrichment[i]["statistics"][data_type]
366                tmp = []
367                for k in tmp_dict.keys():
368                    if k != "HPA_subcellular_location":
369                        tmp.append(pd.DataFrame(tmp_dict[k]))
370
371                tmp = pd.concat(tmp)
372
373            else:
374                tmp = pd.DataFrame(self.cells_enrichment[i]["statistics"][data_type])
375
376            cols = [x for x in tmp.columns if test in x and adj in x]
377            cols = sorted(cols, reverse=True)
378            if parent_inc is False:
379                cols = [x for x in cols if parent_col not in x.lower()]
380
381            mask = (tmp[cols] <= p_value).all(axis=1)
382            tmp = tmp.loc[mask]
383            tmp["cell"] = i
384            tmp = tmp.sort_values(by=["cell"] + cols, ascending=True)
385
386            pdl.append(tmp.head(top_n))
387
388        df = pd.concat(pdl)
389        df["source"] = data_type
390        df = df.reset_index(drop=True)
391
392        return df

Retrieves enrichment results for all cells in a unified DataFrame.

Parameters

data_type : str Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').

p_value : float, optional Maximum p-value threshold (default 0.05).

test : str, optional Name of the statistical test column to use (default 'FISH').

adj : str, optional P-value adjustment method (default 'BH').

parent_inc : bool, optional Whether to include parent terms in the results (default False).

top_n : int, optional Maximum number of terms per cell type to include (default 50).

Returns

pd.DataFrame DataFrame containing filtered enrichment results with a 'cell' column indicating cell type.

Raises

ValueError If data_type is not one of the expected values.

def get_included_cells(self): View Source

394    def get_included_cells(self):
395        """
396        Returns the list of cell types included in the enrichment analysis.
397
398        Returns
399        -------
400        list
401            List of cell type names.
402
403        Example
404        -------
405        >>> self.get_included_cells()
406        ['CellType1', 'CellType2', ...]
407        """
408
409        cl = []
410        for i in self.cells_enrichment.keys():
411            print(i)
412            cl.append(i)
413
414        return cl

Returns the list of cell types included in the enrichment analysis.

Returns

list List of cell type names.

Example

>>> self.get_included_cells()
['CellType1', 'CellType2', ...]

def get_gene_interactions(self, cell_name): View Source

416    def get_gene_interactions(self, cell_name):
417        """
418        Retrieves gene or protein interaction data for a specific cell type.
419
420        Parameters
421        ----------
422        cell_name : str
423            Name of the cell type.
424
425        Returns
426        -------
427        pd.DataFrame
428            DataFrame containing interactions for the specified cell.
429
430        Example
431        -------
432        >>> self.get_gene_interactions('CellType1')
433        """
434
435        tmp = pd.DataFrame(
436            self.cells_enrichment[cell_name]["statistics"]["interactions"]
437        )
438
439        return tmp

Retrieves gene or protein interaction data for a specific cell type.

Parameters

cell_name : str Name of the cell type.

Returns

pd.DataFrame DataFrame containing interactions for the specified cell.

Example

>>> self.get_gene_interactions('CellType1')

def calculate_cell_connections(self): View Source

441    def calculate_cell_connections(self):
442        """
443        Calculates cell-cell interaction connections based on gene/protein co-expression.
444
445        Notes
446        -----
447        Populates `cells_connection` with a DataFrame containing interactions between all pairs of cells.
448
449        Each row represents an interaction between two cells and the involved genes/proteins.
450
451        Raises
452        ------
453        ValueError
454            If `normalized_data` is not defined in the JDTI object.
455        """
456
457        if isinstance(self.jdti.normalized_data, pd.DataFrame):
458
459            cells = set(self.jdti.normalized_data.columns)
460
461            data_dict = {}
462
463            for c in tqdm(cells):
464
465                tmp = self.jdti.normalized_data.loc[:, c]
466                names = tmp.loc[
467                    tmp.select_dtypes(include="number").sum(axis=1) > 0
468                ].index.tolist()
469                names = list(set(names))
470
471                enr = copy.copy(self.enr_full_info)
472                enr.genome = enr.genome[
473                    enr.genome["found_names"].isin(names)
474                ].reset_index(drop=True)
475                enr.enriche_CellCon()
476                data = enr.get_results()
477                del enr
478
479                data_dict[c] = data["CellConnections"]
480
481            full_data = []
482            for c1 in tqdm(cells):
483                for c2 in cells:
484                    if c1 != c2:
485                        c1_d = pd.DataFrame(data_dict[c1]["interactor2"])
486                        c2_d = pd.DataFrame(data_dict[c2]["interactor1"])
487
488                        mutual_lr = c1_d["interaction"][
489                            c1_d["interaction"].isin(list(c2_d["interaction"]))
490                        ]
491
492                        to_ret = (
493                            c1_d[c1_d["interaction"].isin(list(mutual_lr))]
494                            .drop(
495                                [
496                                    "Species",
497                                    "protein_id_1",
498                                    "protein_id_2",
499                                    "found_names_2",
500                                ],
501                                axis=1,
502                            )
503                            .reset_index(drop=True)
504                        )
505
506                        to_ret = to_ret.rename(columns={"found_names_1": "interactor1"})
507                        c2_subset = c2_d[["interaction", "found_names_2"]].rename(
508                            columns={"found_names_2": "interactor2"}
509                        )
510
511                        to_ret = to_ret.merge(c2_subset, on="interaction", how="left")
512                        to_ret["cell1"] = c1
513                        to_ret["cell2"] = c2
514
515                        full_data.append(to_ret)
516
517            self.cells_connection = pd.concat(full_data)
518
519        else:
520            raise ValueError(
521                "`self.cells_markers` not defined. Use `self.cells_markers` to provide markers."
522            )

Calculates cell-cell interaction connections based on gene/protein co-expression.

Notes

Populates cells_connection with a DataFrame containing interactions between all pairs of cells.

Each row represents an interaction between two cells and the involved genes/proteins.

Raises

ValueError If normalized_data is not defined in the JDTI object.

def get_cell_connections( self, connection_type: list = ['Adhesion-Adhesion', 'Gap-Gap', 'Ligand-Ligand', 'Ligand-Receptor', 'Receptor-Receptor', 'Undefined']): View Source

524    def get_cell_connections(
525        self,
526        connection_type: list = [
527            "Adhesion-Adhesion",
528            "Gap-Gap",
529            "Ligand-Ligand",
530            "Ligand-Receptor",
531            "Receptor-Receptor",
532            "Undefined",
533        ],
534    ):
535        """
536        Returns the calculated cell-cell interaction connections.
537
538        Parameters
539        ----------
540        connection_type : list of str, optional
541            List of interaction types used to filter the returned cell–cell connections
542            based on the molecular directionality of the interaction. Possible values:
543
544            - "Adhesion-Adhesion" : interaction between two adhesion molecules.
545            - "Gap-Gap" : connection through gap junction proteins.
546            - "Ligand-Ligand" : interaction between two ligand molecules.
547            - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
548            - "Receptor-Receptor" : interaction between two receptor molecules.
549            - "Undefined" : interactions where the directionality could not be determined.
550
551            By default, all interaction types are included.
552
553        Returns
554        -------
555        pd.DataFrame
556            DataFrame containing cell-cell interactions.
557
558        Example
559        -------
560        >>> connections = self.get_cell_connections()
561        """
562
563        tmp = self.cells_connection
564
565        tmp["directionality"] = [
566            x if x is not None else "Undefined" for x in tmp["directionality"]
567        ]
568
569        tmp = tmp[tmp["directionality"].isin(connection_type)]
570
571        return tmp

Returns the calculated cell-cell interaction connections.

Parameters

connection_type : list of str, optional List of interaction types used to filter the returned cell–cell connections based on the molecular directionality of the interaction. Possible values:

- "Adhesion-Adhesion" : interaction between two adhesion molecules.
- "Gap-Gap" : connection through gap junction proteins.
- "Ligand-Ligand" : interaction between two ligand molecules.
- "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
- "Receptor-Receptor" : interaction between two receptor molecules.
- "Undefined" : interactions where the directionality could not be determined.

By default, all interaction types are included.

Returns

pd.DataFrame DataFrame containing cell-cell interactions.

Example

>>> connections = self.get_cell_connections()

def compare_connections( instances_dict: dict, cells_compartment: dict | None = None, connection_type: list = ['Adhesion-Adhesion', 'Gap-Gap', 'Ligand-Ligand', 'Ligand-Receptor', 'Receptor-Receptor', 'Undefined']): View Source

574def compare_connections(
575    instances_dict: dict,
576    cells_compartment: dict | None = None,
577    connection_type: list = [
578        "Adhesion-Adhesion",
579        "Gap-Gap",
580        "Ligand-Ligand",
581        "Ligand-Receptor",
582        "Receptor-Receptor",
583        "Undefined",
584    ],
585):
586    """
587    Compare gene expression between two instances based on their cell connections.
588
589    This function compares normalized gene expression data from exactly two
590    instances stored in ``instances_dict``. Optionally, the comparison can be
591    restricted to specific cell compartments for each instance. Differential
592    expression analysis is performed using ``jdti.calc_DEG``.
593
594    Parameters
595    ----------
596    instances_dict : dict
597        Dictionary containing exactly two objects. Each object must have:
598
599        - ``jdti.normalized_data`` : pandas.DataFrame
600            Gene expression matrix with genes as rows and cells as columns.
601
602        - ``cells_connection`` : pandas.DataFrame
603            DataFrame containing at least the columns ``'interactor1'`` and
604            ``'interactor2'``.
605
606        The dictionary keys are used as group labels in the comparison.
607
608    cells_compartment : dict or None, optional
609        Dictionary mapping each key in ``instances_dict`` to a list of cell names
610        to be used for the comparison. If ``None``, all cells are used and genes
611        are filtered based on cell–cell connections.
612
613    connection_type : list of str, optional
614        List of interaction types used to filter cell–cell connections that are
615        considered in the gene expression comparison. Only connections with the
616        specified molecular interaction types will be used to define interacting
617        cells between the two instances.
618
619        Possible values:
620
621        - "Adhesion-Adhesion" : interaction between two adhesion molecules.
622        - "Gap-Gap" : connection mediated by gap junction proteins.
623        - "Ligand-Ligand" : interaction between two ligand molecules.
624        - "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
625        - "Receptor-Receptor" : interaction between two receptor molecules.
626        - "Undefined" : interactions where the directionality could not be determined.
627
628        By default, all interaction types are included in the comparison.
629
630    Returns
631    -------
632    pandas.DataFrame
633        Differential expression results returned by ``calc_DEG``, filtered to
634        include only rows where ``valid_group`` matches the first key in
635        ``instances_dict``.
636
637    Raises
638    ------
639    ValueError
640        If any cell specified in ``cells_compartment`` is not present in the
641        corresponding ``normalized_data`` columns.
642
643    Notes
644    -----
645    - Only genes common to both instances are considered.
646
647    - When ``cells_compartment`` is ``None``, genes are further restricted to
648      those appearing in the cell–cell interaction networks of either instance.
649
650    - The function assumes exactly two entries in ``instances_dict``.
651
652    - Differential expression is computed with ``min_exp=0`` and ``min_pct=0.1``.
653
654    See Also
655    --------
656    jdti.calc_DEG : Function used to compute differential expression.
657    """
658
659    import pandas as pd
660    from jdti import calc_DEG
661
662    if isinstance(cells_compartment, dict):
663
664        keys_list = list(instances_dict.keys())
665        tmp1 = instances_dict[keys_list[0]].jdti.normalized_data.copy()
666        cells = cells_compartment[keys_list[0]]
667        if any(cell not in tmp1.columns for cell in cells):
668            raise ValueError(
669                'Any of {keys_list[0]} cells in dictionary "cells_compartment" do not occur!'
670            )
671        tmp1 = tmp1.loc[:, cells]
672        tmp1.columns = [keys_list[0]] * len(tmp1.columns)
673
674        tmp2 = instances_dict[keys_list[1]].jdti.normalized_data.copy()
675        cells = cells_compartment[keys_list[1]]
676        if any(cell not in tmp2.columns for cell in cells):
677            raise ValueError(
678                'Any of {keys_list[1]} cells in dictionary "cells_compartment" do not occur!'
679            )
680        tmp2 = tmp2.loc[:, cells]
681        tmp2.columns = [keys_list[1]] * len(tmp2.columns)
682
683        common_idx = tmp1.index.intersection(tmp2.index)
684
685        tmp1 = tmp1.loc[common_idx]
686        tmp2 = tmp2.loc[common_idx]
687
688        concat_df = pd.concat([tmp1, tmp2], axis=1)
689
690    else:
691
692        keys_list = list(instances_dict.keys())
693        tmp1 = instances_dict[keys_list[0]].jdti.normalized_data.copy()
694        tmp1.columns = [keys_list[0]] * len(tmp1.columns)
695
696        tmp2 = instances_dict[keys_list[1]].jdti.normalized_data.copy()
697        tmp2.columns = [keys_list[1]] * len(tmp2.columns)
698
699        common_idx = tmp1.index.intersection(tmp2.index)
700
701        tmp1 = tmp1.loc[common_idx]
702        tmp2 = tmp2.loc[common_idx]
703
704        concat_df = pd.concat([tmp1, tmp2], axis=1)
705
706    tmp_df_1 = instances_dict[keys_list[0]].cells_connection
707    tmp_df_2 = instances_dict[keys_list[1]].cells_connection
708
709    tmp_df_1["directionality"] = [
710        x if x is not None else "Undefined" for x in tmp_df_1["directionality"]
711    ]
712    tmp_df_2["directionality"] = [
713        x if x is not None else "Undefined" for x in tmp_df_2["directionality"]
714    ]
715
716    tmp_df_1 = tmp_df_1[tmp_df_1["directionality"].isin(connection_type)]
717    tmp_df_2 = tmp_df_2[tmp_df_2["directionality"].isin(connection_type)]
718
719    tmp_con1 = list(set(list(tmp_df_1["interactor1"]) + list(tmp_df_1["interactor2"])))
720
721    tmp_con2 = list(set(list(tmp_df_2["interactor1"]) + list(tmp_df_2["interactor2"])))
722
723    genes = list(set(tmp_con1 + tmp_con2))
724
725    genes2 = [x for x in genes if x in common_idx]
726
727    concat_df = concat_df.loc[genes2, :]
728
729    results = calc_DEG(
730        data=concat_df,
731        metadata_list=None,
732        entities="All",
733        sets=None,
734        min_exp=0,
735        min_pct=0,
736        n_proc=10,
737    )
738
739    results = results[results["valid_group"] == keys_list[0]]
740
741    return results

Compare gene expression between two instances based on their cell connections.

This function compares normalized gene expression data from exactly two instances stored in instances_dict. Optionally, the comparison can be restricted to specific cell compartments for each instance. Differential expression analysis is performed using jdti.calc_DEG.

Parameters

instances_dict : dict Dictionary containing exactly two objects. Each object must have:

- ``jdti.normalized_data`` : pandas.DataFrame
    Gene expression matrix with genes as rows and cells as columns.

- ``cells_connection`` : pandas.DataFrame
    DataFrame containing at least the columns ``'interactor1'`` and
    ``'interactor2'``.

The dictionary keys are used as group labels in the comparison.

cells_compartment : dict or None, optional Dictionary mapping each key in instances_dict to a list of cell names to be used for the comparison. If None, all cells are used. In both cases, genes are filtered based on cell–cell connections.

connection_type : list of str, optional List of interaction types used to filter cell–cell connections that are considered in the gene expression comparison. Only connections with the specified molecular interaction types will be used to define interacting cells between the two instances.

Possible values:

- "Adhesion-Adhesion" : interaction between two adhesion molecules.
- "Gap-Gap" : connection mediated by gap junction proteins.
- "Ligand-Ligand" : interaction between two ligand molecules.
- "Ligand-Receptor" : directional interaction where a ligand binds to a receptor.
- "Receptor-Receptor" : interaction between two receptor molecules.
- "Undefined" : interactions where the directionality could not be determined.

By default, all interaction types are included in the comparison.

Returns

pandas.DataFrame Differential expression results returned by calc_DEG, filtered to include only rows where valid_group matches the first key in instances_dict.

Raises

ValueError If any cell specified in cells_compartment is not present in the corresponding normalized_data columns.

Notes

Only genes common to both instances are considered.
Genes are further restricted to those appearing in the cell–cell interaction networks of either instance, whether or not cells_compartment is given.
The function assumes exactly two entries in instances_dict.
Differential expression is computed with min_exp=0 and min_pct=0.