Commit
ensuring that rank_by_binding_effect is passed through in rank response functionality

update to version 1.7.2
cmatKhan committed Sep 10, 2024
1 parent 7cf1b01 commit 96d4294
Showing 11 changed files with 331 additions and 291 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ __pycache__/
*.pytest_cache
*.log
tests/test_data/yeast/run_6021_sample
tmp/*
@@ -1,15 +1,17 @@
import logging

import pandas as pd

from .create_partitions import create_partitions

logger = logging.getLogger(__name__)


def bin_by_binding_rank(df: pd.DataFrame,
bin_size: int,
order_by_effect: bool = False):
def bin_by_binding_rank(
df: pd.DataFrame, bin_size: int, rank_by_binding_effect: bool = False
):
"""
Assigns a rank bin to each row in a DataFrame based on binding signal.
This function divides the DataFrame into partitions based on the specified
bin size, assigns a rank to each row within these partitions, and then
@@ -22,40 +24,42 @@ def bin_by_binding_rank(df: pd.DataFrame,
It must contain 'binding_effect' and 'binding_pvalue' columns.
bin_size (int): The size of each bin for partitioning the DataFrame
for ranking.
order_by_effect (bool, optional): If True, the DataFrame is sorted by
rank_by_binding_effect (bool, optional): If True, the DataFrame is sorted by
abs('binding_effect') in descending order first, with ties broken by 'binding_pvalue'.
If False, sort by 'binding_pvalue' first, with ties broken by effect size.
Defaults to False.
Returns:
pd.DataFrame: The input DataFrame with an added 'rank_bin' column, sorted
by 'effect' in descending order and 'binding_pvalue' in
ascending order.
by 'binding_effect' in descending order or 'binding_pvalue' in
ascending order, depending on `rank_by_binding_effect`.
Example:
>>> df = pd.DataFrame({'binding_effect': [1.2, 0.5, 0.8],
... 'binding_pvalue': [5, 3, 4]})
>>> bin_by_binding_rank(df, 2)
# Returns a DataFrame with an added 'rank_bin' column, sorted as per
# the specified criteria.
"""
if 'binding_pvalue' not in df.columns:
if "binding_pvalue" not in df.columns:
raise KeyError("Column 'binding_pvalue' is not in the data")
if 'binding_effect' not in df.columns:
if "binding_effect" not in df.columns:
raise KeyError("Column 'binding_effect' is not in the data")

parts = min(len(df), bin_size)
df_abs = df.assign(abs_binding_effect=df['binding_effect'].abs())
df_abs = df.assign(abs_binding_effect=df["binding_effect"].abs())

df_sorted = df_abs.sort_values(
by=['abs_binding_effect', 'binding_pvalue']
if order_by_effect
else ['binding_pvalue', 'abs_binding_effect'],
ascending=[False, True]
if order_by_effect
else [True, False])

return df_sorted\
.drop(columns=['abs_binding_effect'])\
.reset_index(drop=True)\
.assign(rank_bin=create_partitions(len(df_sorted), parts) * parts)
by=(
["abs_binding_effect", "binding_pvalue"]
if rank_by_binding_effect
else ["binding_pvalue", "abs_binding_effect"]
),
ascending=[False, True] if rank_by_binding_effect else [True, False],
)

return (
df_sorted.drop(columns=["abs_binding_effect"])
.reset_index(drop=True)
.assign(rank_bin=create_partitions(len(df_sorted), parts) * parts)
)
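
For context, a minimal usage sketch (not part of the commit): the toy DataFrame and values below are hypothetical, and bin_by_binding_rank is assumed to have been imported from its module (the package path is not shown in this diff). It illustrates how the renamed rank_by_binding_effect keyword switches the sort order.

import pandas as pd

toy = pd.DataFrame(
    {
        "feature": ["geneA", "geneB", "geneC"],
        "binding_effect": [1.2, -0.5, 0.8],
        "binding_pvalue": [0.04, 0.01, 0.02],
    }
)

# Default: rows ordered by ascending binding_pvalue (geneB, geneC, geneA),
# with ties broken by descending |binding_effect|.
by_pvalue = bin_by_binding_rank(toy, bin_size=2)

# With the new flag: rows ordered by descending |binding_effect|
# (geneA, geneC, geneB), with ties broken by ascending binding_pvalue.
by_effect = bin_by_binding_rank(toy, bin_size=2, rank_by_binding_effect=True)

# Either way, the result gains a 'rank_bin' column built from create_partitions.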
@@ -1,15 +1,17 @@
import logging

import pandas as pd

from .rank_response_ratio_summarize import rank_response_ratio_summarize
from .read_in_data import read_in_data
from .validate_config import validate_config

logger = logging.getLogger(__name__)


def create_rank_response_table(config_dict: dict) -> (pd.DataFrame,
pd.DataFrame,
pd.DataFrame):
def create_rank_response_table(
config_dict: dict,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
"""
Create a rank response table from a dictionary which contains the
configuration parameters. See docs at
@@ -40,56 +42,61 @@ def create_rank_response_table(config_dict: dict) -> (pd.DataFrame,
# read in the binding data
try:
binding_data = read_in_data(
args['binding_data_path'],
args['binding_identifier_col'],
args['binding_effect_col'],
args['binding_pvalue_col'],
args['binding_source'],
'binding')
args["binding_data_path"],
args["binding_identifier_col"],
args["binding_effect_col"],
args["binding_pvalue_col"],
args["binding_source"],
"binding",
)
except (KeyError, FileExistsError, AttributeError) as exc:
logger.error("Error reading in binding data: %s", exc)
raise

# read in the expression data
try:
expression_data = read_in_data(
args['expression_data_path'],
args['expression_identifier_col'],
args['expression_effect_col'],
args['expression_pvalue_col'],
args['expression_source'],
'expression')
args["expression_data_path"],
args["expression_identifier_col"],
args["expression_effect_col"],
args["expression_pvalue_col"],
args["expression_source"],
"expression",
)
except (KeyError, FileExistsError, AttributeError) as exc:
logger.error("Error reading in expression data: %s", exc)
raise

df = expression_data.merge(binding_data[['binding_effect',
'binding_pvalue',
'binding_source',
'feature']],
how='inner',
on='feature')
df = expression_data.merge(
binding_data[["binding_effect", "binding_pvalue", "binding_source", "feature"]],
how="inner",
on="feature",
)
# test that there are no incomplete cases; raise an error if there are
if df.isnull().values.any():
raise ValueError("There are incomplete cases in the data")

logger.info('There are %s genes in the data after merging '
'the %s binding data and '
' %s expression data',
str(df.shape[0]),
args['binding_source'],
args['expression_source'])
logger.info(
"There are %s genes in the data after merging "
"the %s binding data and "
" %s expression data",
str(df.shape[0]),
args["binding_source"],
args["expression_source"],
)

try:
# the first two items in the return tuple aren't passed out of
# this function, hence _, _
_, _, rank_response_df = rank_response_ratio_summarize(
df,
effect_expression_thres=args['expression_effect_thres'],
p_expression_thres=args['expression_pvalue_thres'],
normalize=args['normalize'],
bin_size=args['rank_bin_size'])
except (KeyError) as exc:
effect_expression_thres=args["expression_effect_thres"],
p_expression_thres=args["expression_pvalue_thres"],
normalize=args["normalize"],
bin_size=args["rank_bin_size"],
rank_by_binding_effect=args["rank_by_binding_effect"],
)
except KeyError as exc:
logger.error("Error summarizing data: %s", exc)
raise
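
A practical consequence of this hunk: the configuration consumed by create_rank_response_table must now supply a rank_by_binding_effect entry, since it is read from args and forwarded to rank_response_ratio_summarize. A minimal, hypothetical sketch of that fragment follows; the key names mirror the args lookups above, the values are illustrative only, and whether validate_config renames or defaults any of them is not visible in this diff.

# Hypothetical config fragment: only keys touched by this change are shown;
# the real config also carries the binding/expression paths and column names
# read earlier in this function.
rank_response_config = {
    "expression_effect_thres": 0.0,
    "expression_pvalue_thres": 0.05,
    "normalize": False,
    "rank_bin_size": 5,
    # new in 1.7.2: forwarded to rank_response_ratio_summarize
    "rank_by_binding_effect": True,
}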

@@ -1,19 +1,23 @@
import logging

import pandas as pd

from .bin_by_binding_rank import bin_by_binding_rank
from .calculate_random_expectation import calculate_random_expectation
from .compute_rank_response import compute_rank_response
from .label_responsive_genes import label_responsive_genes

logger = logging.getLogger(__name__)


def rank_response_ratio_summarize(
df: pd.DataFrame,
effect_expression_thres: float = 0,
p_expression_thres: float = 0.05,
normalize: bool = False,
bin_size: int = 5) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
df: pd.DataFrame,
effect_expression_thres: float = 0,
p_expression_thres: float = 0.05,
normalize: bool = False,
bin_size: int = 5,
rank_by_binding_effect: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Processes a DataFrame to compute and summarize rank response ratios.
@@ -54,22 +58,27 @@ def rank_response_ratio_summarize(
(a, b) # a and b depend on the structure of rank response calculations
"""
df_expression_labeled = label_responsive_genes(
df,
effect_expression_thres,
p_expression_thres, normalize)
df, effect_expression_thres, p_expression_thres, normalize
)

random_expectation_df = calculate_random_expectation(df_expression_labeled)

df_expression_labeled_binding_ranked = \
bin_by_binding_rank(df_expression_labeled, bin_size)
df_expression_labeled_binding_ranked = bin_by_binding_rank(
df_expression_labeled, bin_size, rank_by_binding_effect
)

df_expression_labeled_binding_ranked_with_random = \
df_expression_labeled_binding_ranked\
.assign(random=float(random_expectation_df['random']))
df_expression_labeled_binding_ranked_with_random = (
df_expression_labeled_binding_ranked.assign(
random=float(random_expectation_df["random"])
)
)

rank_response_df = compute_rank_response(
df_expression_labeled_binding_ranked_with_random)
df_expression_labeled_binding_ranked_with_random
)

return (df_expression_labeled_binding_ranked_with_random,
random_expectation_df,
rank_response_df)
return (
df_expression_labeled_binding_ranked_with_random,
random_expectation_df,
rank_response_df,
)
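
A minimal call sketch (not part of the commit) showing the updated signature and the three-part return; labeled_df is a hypothetical placeholder for a merged binding/expression DataFrame with the columns the pipeline expects.

# labeled_df stands in for a merged binding/expression DataFrame such as the
# one assembled by create_rank_response_table before summarization.
ranked_with_random, random_expectation, rank_response = rank_response_ratio_summarize(
    labeled_df,
    effect_expression_thres=0.0,
    p_expression_thres=0.05,
    normalize=False,
    bin_size=5,
    rank_by_binding_effect=True,  # new keyword, forwarded to bin_by_binding_rank
)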