Commit
ensuring that rank_by_binding_effect is passed through in rank response functionality

update to version 1.7.2
cmatKhan committed Sep 10, 2024
1 parent 7cf1b01 commit 96d4294
Showing 11 changed files with 331 additions and 291 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ __pycache__/
*.pytest_cache
*.log
tests/test_data/yeast/run_6021_sample
tmp/*
@@ -1,15 +1,17 @@
import logging

import pandas as pd

from .create_partitions import create_partitions

logger = logging.getLogger(__name__)


def bin_by_binding_rank(df: pd.DataFrame,
bin_size: int,
order_by_effect: bool = False):
def bin_by_binding_rank(
df: pd.DataFrame, bin_size: int, rank_by_binding_effect: bool = False
):
"""
Assigns a rank bin to each row in a DataFrame based on binding signal.
This function divides the DataFrame into partitions based on the specified
bin size, assigns a rank to each row within these partitions, and then
@@ -22,40 +24,42 @@ def bin_by_binding_rank(df: pd.DataFrame,
It must contain 'binding_effect' and 'binding_pvalue' columns.
bin_size (int): The size of each bin for partitioning the DataFrame
for ranking.
order_by_effect (bool, optional): If True, the DataFrame is sorted by
rank_by_binding_effect (bool, optional): If True, the DataFrame is sorted by
abs('binding_effect') in descending order first, with ties broken by 'binding_pvalue'.
If False, sort by 'binding_pvalue' first, with ties broken by effect size.
Defaults to False.
Returns:
pd.DataFrame: The input DataFrame with an added 'rank_bin' column, sorted
by 'effect' in descending order and 'binding_pvalue' in
ascending order.
by 'binding_effect' in descending order or 'binding_pvalue' in
ascending order, depending on `rank_by_binding_effect`.
Example:
>>> df = pd.DataFrame({'binding_effect': [1.2, 0.5, 0.8],
... 'binding_pvalue': [5, 3, 4]})
>>> bin_by_binding_rank(df, 2)
# Returns a DataFrame with an added 'rank_bin' column, sorted as per
# the specified criteria.
"""
if 'binding_pvalue' not in df.columns:
if "binding_pvalue" not in df.columns:
raise KeyError("Column 'binding_pvalue' is not in the data")
if 'binding_effect' not in df.columns:
if "binding_effect" not in df.columns:
raise KeyError("Column 'binding_effect' is not in the data")

parts = min(len(df), bin_size)
df_abs = df.assign(abs_binding_effect=df['binding_effect'].abs())
df_abs = df.assign(abs_binding_effect=df["binding_effect"].abs())

df_sorted = df_abs.sort_values(
by=['abs_binding_effect', 'binding_pvalue']
if order_by_effect
else ['binding_pvalue', 'abs_binding_effect'],
ascending=[False, True]
if order_by_effect
else [True, False])

return df_sorted\
.drop(columns=['abs_binding_effect'])\
.reset_index(drop=True)\
.assign(rank_bin=create_partitions(len(df_sorted), parts) * parts)
by=(
["abs_binding_effect", "binding_pvalue"]
if rank_by_binding_effect
else ["binding_pvalue", "abs_binding_effect"]
),
ascending=[False, True] if rank_by_binding_effect else [True, False],
)

return (
df_sorted.drop(columns=["abs_binding_effect"])
.reset_index(drop=True)
.assign(rank_bin=create_partitions(len(df_sorted), parts) * parts)
)
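
For context, a minimal usage sketch (not part of the commit): the toy DataFrame and values below are hypothetical, and bin_by_binding_rank is assumed to have been imported from its module (the package path is not shown in this diff). It illustrates how the renamed rank_by_binding_effect keyword switches the sort order.

import pandas as pd

toy = pd.DataFrame(
    {
        "feature": ["geneA", "geneB", "geneC"],
        "binding_effect": [1.2, -0.5, 0.8],
        "binding_pvalue": [0.04, 0.01, 0.02],
    }
)

# Default: rows ordered by ascending binding_pvalue (geneB, geneC, geneA),
# with ties broken by descending |binding_effect|.
by_pvalue = bin_by_binding_rank(toy, bin_size=2)

# With the new flag: rows ordered by descending |binding_effect|
# (geneA, geneC, geneB), with ties broken by ascending binding_pvalue.
by_effect = bin_by_binding_rank(toy, bin_size=2, rank_by_binding_effect=True)

# Either way, the result gains a 'rank_bin' column built from create_partitions.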
@@ -1,15 +1,17 @@
import logging

import pandas as pd

from .rank_response_ratio_summarize import rank_response_ratio_summarize
from .read_in_data import read_in_data
from .validate_config import validate_config

logger = logging.getLogger(__name__)


def create_rank_response_table(config_dict: dict) -> (pd.DataFrame,
pd.DataFrame,
pd.DataFrame):
def create_rank_response_table(
config_dict: dict,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
"""
Create a rank response table from a dictionary which contains the
configuration parameters. See docs at
@@ -40,56 +42,61 @@ def create_rank_response_table(config_dict: dict) -> (pd.DataFrame,
# read in the binding data
try:
binding_data = read_in_data(
args['binding_data_path'],
args['binding_identifier_col'],
args['binding_effect_col'],
args['binding_pvalue_col'],
args['binding_source'],
'binding')
args["binding_data_path"],
args["binding_identifier_col"],
args["binding_effect_col"],
args["binding_pvalue_col"],
args["binding_source"],
"binding",
)
except (KeyError, FileExistsError, AttributeError) as exc:
logger.error("Error reading in binding data: %s", exc)
raise

# read in the expression data
try:
expression_data = read_in_data(
args['expression_data_path'],
args['expression_identifier_col'],
args['expression_effect_col'],
args['expression_pvalue_col'],
args['expression_source'],
'expression')
args["expression_data_path"],
args["expression_identifier_col"],
args["expression_effect_col"],
args["expression_pvalue_col"],
args["expression_source"],
"expression",
)
except (KeyError, FileExistsError, AttributeError) as exc:
logger.error("Error reading in expression data: %s", exc)
raise

df = expression_data.merge(binding_data[['binding_effect',
'binding_pvalue',
'binding_source',
'feature']],
how='inner',
on='feature')
df = expression_data.merge(
binding_data[["binding_effect", "binding_pvalue", "binding_source", "feature"]],
how="inner",
on="feature",
)
# test that there are no incomplete cases; raise an error if there are
if df.isnull().values.any():
raise ValueError("There are incomplete cases in the data")

logger.info('There are %s genes in the data after merging '
'the %s binding data and '
' %s expression data',
str(df.shape[0]),
args['binding_source'],
args['expression_source'])
logger.info(
"There are %s genes in the data after merging "
"the %s binding data and "
" %s expression data",
str(df.shape[0]),
args["binding_source"],
args["expression_source"],
)

try:
# the first two items in the return tuple aren't passed out of
# this function, hence _, _
_, _, rank_response_df = rank_response_ratio_summarize(
df,
effect_expression_thres=args['expression_effect_thres'],
p_expression_thres=args['expression_pvalue_thres'],
normalize=args['normalize'],
bin_size=args['rank_bin_size'])
except (KeyError) as exc:
effect_expression_thres=args["expression_effect_thres"],
p_expression_thres=args["expression_pvalue_thres"],
normalize=args["normalize"],
bin_size=args["rank_bin_size"],
rank_by_binding_effect=args["rank_by_binding_effect"],
)
except KeyError as exc:
logger.error("Error summarizing data: %s", exc)
raise
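
A practical consequence of this hunk: the configuration consumed by create_rank_response_table must now supply a rank_by_binding_effect entry, since it is read from args and forwarded to rank_response_ratio_summarize. A minimal, hypothetical sketch of that fragment follows; the key names mirror the args lookups above, the values are illustrative only, and whether validate_config renames or defaults any of them is not visible in this diff.

# Hypothetical config fragment: only keys touched by this change are shown;
# the real config also carries the binding/expression paths and column names
# read earlier in this function.
rank_response_config = {
    "expression_effect_thres": 0.0,
    "expression_pvalue_thres": 0.05,
    "normalize": False,
    "rank_bin_size": 5,
    # new in 1.7.2: forwarded to rank_response_ratio_summarize
    "rank_by_binding_effect": True,
}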

@@ -1,19 +1,23 @@
import logging

import pandas as pd

from .bin_by_binding_rank import bin_by_binding_rank
from .calculate_random_expectation import calculate_random_expectation
from .compute_rank_response import compute_rank_response
from .label_responsive_genes import label_responsive_genes

logger = logging.getLogger(__name__)


def rank_response_ratio_summarize(
df: pd.DataFrame,
effect_expression_thres: float = 0,
p_expression_thres: float = 0.05,
normalize: bool = False,
bin_size: int = 5) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
df: pd.DataFrame,
effect_expression_thres: float = 0,
p_expression_thres: float = 0.05,
normalize: bool = False,
bin_size: int = 5,
rank_by_binding_effect: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Processes a DataFrame to compute and summarize rank response ratios.
@@ -54,22 +58,27 @@ def rank_response_ratio_summarize(
(a, b) # a and b depend on the structure of rank response calculations
"""
df_expression_labeled = label_responsive_genes(
df,
effect_expression_thres,
p_expression_thres, normalize)
df, effect_expression_thres, p_expression_thres, normalize
)

random_expectation_df = calculate_random_expectation(df_expression_labeled)

df_expression_labeled_binding_ranked = \
bin_by_binding_rank(df_expression_labeled, bin_size)
df_expression_labeled_binding_ranked = bin_by_binding_rank(
df_expression_labeled, bin_size, rank_by_binding_effect
)

df_expression_labeled_binding_ranked_with_random = \
df_expression_labeled_binding_ranked\
.assign(random=float(random_expectation_df['random']))
df_expression_labeled_binding_ranked_with_random = (
df_expression_labeled_binding_ranked.assign(
random=float(random_expectation_df["random"])
)
)

rank_response_df = compute_rank_response(
df_expression_labeled_binding_ranked_with_random)
df_expression_labeled_binding_ranked_with_random
)

return (df_expression_labeled_binding_ranked_with_random,
random_expectation_df,
rank_response_df)
return (
df_expression_labeled_binding_ranked_with_random,
random_expectation_df,
rank_response_df,
)
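
A minimal call sketch (not part of the commit) showing the updated signature and the three-part return; labeled_df is a hypothetical placeholder for a merged binding/expression DataFrame with the columns the pipeline expects.

# labeled_df stands in for a merged binding/expression DataFrame such as the
# one assembled by create_rank_response_table before summarization.
ranked_with_random, random_expectation, rank_response = rank_response_ratio_summarize(
    labeled_df,
    effect_expression_thres=0.0,
    p_expression_thres=0.05,
    normalize=False,
    bin_size=5,
    rank_by_binding_effect=True,  # new keyword, forwarded to bin_by_binding_rank
)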