Skip to content

Commit

Permalink
add docstring to partitioner modules
Browse files Browse the repository at this point in the history
  • Loading branch information
chungmuen committed Nov 28, 2024
1 parent 735909a commit 34f02db
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 9 deletions.
165 changes: 161 additions & 4 deletions seqteleporter/partitioner/compute_best_partitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,49 @@
annotate_mutations_in_fragments, prepare_0idx_mutations)


def prepare_output_dirs(output_dir):
"""create output dirs if not exist"""
"""
This module contains functions to compute the best possible partitions of a given sequence based on various criteria, including the number of cuts, fragment lengths, costs, and ligation fidelity. It utilizes experimental data for fusion site information and manages the generation of optimal partitions.
Functions:
1. `prepare_output_dirs(output_dir)`:
- Creates necessary output directories for logs and results if they do not exist, and cleans the log directory before starting the run.
2. `prepare_compute_best_partitions_params(input_file_path: str) -> dict`:
- Loads input parameters from a specified file, prepares mutation data, and constructs a dictionary of parameters needed for computing best partitions.
3. `write_compute_best_partitions_log_header(...)`:
- Writes a header for the log file that contains details about the sequence, mutations, fusion sites, and various constraints.
4. `write_compute_best_partitions_log_body(...)`:
- Writes the results of the partitioning process to the log file, including details about the number of cuts, elapsed time, and selected partitions.
5. `validate_inputs(s: str, fusion_sites_used_by_backbone: Tuple[str, ...]) -> None`:
- Validates the input sequence and fusion sites to ensure they meet the required criteria.
6. `pick_top_n_partitions(res_per_count: List[dict], select_top_n_partitions: int, sort_by_cost: bool) -> List[dict]`:
- Selects the top N partitions based on specified sorting criteria.
7. `compute_best_partitions(...)`:
- Main function that identifies optimal partitions for a given sequence by iterating over a specified range of cut numbers and applying various constraints.
8. `get_lowest_cost_from_best_partitions(...)`:
- Identifies and returns the lowest cost partition from the results generated by the compute_best_partitions function.
Dependencies:
- Relies on helper functions and modules from the `seqteleporter` package for input validation, partition property finding, and other operations.
"""


def prepare_output_dirs(output_dir) -> Tuple[str, str]:
"""
Creates necessary output directories for logs and results if they do not exist.
Args:
output_dir (str): The base output directory where logs and results will be stored.
Returns:
tuple: A tuple containing the paths to the log directory and the result directory.
"""

log_dir = join(output_dir, 'logs')
result_dir = join(output_dir, 'results')
Expand All @@ -38,6 +79,16 @@ def prepare_output_dirs(output_dir):


def prepare_compute_best_partitions_params(input_file_path: str) -> dict:
"""
Prepares the parameters required for computing the best partitions from an input file.
Args:
input_file_path (str): The file path to the input parameters file.
Returns:
dict: A dictionary containing the prepared parameters for the partition computation.
"""

input_params = load_input_params(input_file_path=input_file_path, supress_output=True)
print(input_params['mutations_1idx'])
all_mutations_0idx, linked_mutations_0idx = prepare_0idx_mutations(
Expand Down Expand Up @@ -86,7 +137,27 @@ def write_compute_best_partitions_log_header(
sort_by_cost: bool,
compute_best_partitions_log_file_path: str
) -> None:

"""
Writes the header information to the log file for the best partitions computation.
Args:
s (str): The input sequence.
mutations_0idx (Union[list, None]): List of mutations in 0-indexed format.
linked_mutations_0idx (Union[list, None]): List of linked mutations in 0-indexed format.
fusion_sites_used_by_backbone (Tuple[str, ...]): Fusion sites used by the backbone.
min_aa_length (int): Minimum amino acid length for fragments.
max_cost (int): Maximum cost allowed for partitions.
max_unevenness (float): Maximum unevenness allowed in fragment lengths.
min_ligation_fidelity (float): Minimum ligation fidelity required.
satisfaction_fidelity (float): Satisfaction fidelity for partitions.
search_method (str): The method used for searching partitions.
host (str): The host organism for the sequence.
sort_by_cost (bool): Flag to indicate if sorting should be done by cost.
compute_best_partitions_log_file_path (str): The path to the log file.
Returns:
None
"""
with open(compute_best_partitions_log_file_path, 'a') as fd:
fd.write(f'\n Sequence: {s}'
f'\n Mutations(0-indexed): {mutations_0idx}'
Expand Down Expand Up @@ -115,6 +186,25 @@ def write_compute_best_partitions_log_body(
linked_mutations_0idx: Union[list, None],
supress_output: bool
) -> None:
"""
Writes the results of the partitioning process to the log file.
Args:
compute_best_partitions_log_file_path (str): The path to the log file.
number_of_cuts (int): The number of cuts used in the partitioning.
elapsed_time_number_of_cuts (float): The time taken to compute partitions for this number of cuts.
num_of_checked_partitions (int): The total number of partitions checked.
num_of_checked_unique_partitions (int): The number of unique partitions checked.
hard_constraint_violations (dict): Dictionary of hard constraint violations.
select_top_n_partitions (int): The number of top partitions selected.
sel_partitions (List[dict]): List of selected partitions.
mutations_0idx (Union[list, None]): List of mutations in 0-indexed format.
linked_mutations_0idx (Union[list, None]): List of linked mutations in 0-indexed format.
supress_output (bool): Flag to suppress output to the console.
Returns:
None
"""

with open(compute_best_partitions_log_file_path, 'a') as fd:
compute_best_partitions_log_header = \
Expand Down Expand Up @@ -152,6 +242,19 @@ def write_compute_best_partitions_log_body(


def validate_inputs(s: str, fusion_sites_used_by_backbone: Tuple[str, ...]) -> None:
"""
Validates the input sequence and fusion sites to ensure they meet the required criteria.
Args:
s (str): The input sequence to validate.
fusion_sites_used_by_backbone (Tuple[str, ...]): Fusion sites used by the backbone.
Raises:
ValueError: If the input sequence or fusion sites are invalid.
Returns:
None
"""
if not is_aa(s):
raise ValueError(f"The provided input sequence is not a valid amino acid sequence!")

Expand All @@ -166,6 +269,17 @@ def validate_inputs(s: str, fusion_sites_used_by_backbone: Tuple[str, ...]) -> N


def pick_top_n_partitions(res_per_count: List[dict], select_top_n_partitions: int, sort_by_cost: bool) -> List[dict]:
"""
Selects the top N partitions based on specified sorting criteria.
Args:
res_per_count (List[dict]): List of partition results to select from.
select_top_n_partitions (int): The number of top partitions to select.
sort_by_cost (bool): Flag to indicate if sorting should be done by cost.
Returns:
List[dict]: A list of the selected top partitions.
"""
if sort_by_cost:
sorted_partitions = sorted(res_per_count, key=lambda x: (
x['cost'], -x['ligation_fidelity'], x['fragment_length_unevenness'], x['partition']))
Expand Down Expand Up @@ -195,6 +309,39 @@ def compute_best_partitions(s: str, mutations_0idx: Union[list, None], linked_mu
partition_search_mode: str, select_top_n_partitions: int,
cost_per_nt: float, provider_min_frag_len: int, provider_max_frag_len: int,
max_partition_number_checked: int) -> tuple:
"""
Identifies the optimal partitions for a given sequence based on various criteria.
Args:
s (str): The input sequence to partition.
mutations_0idx (Union[list, None]): List of mutations in 0-indexed format.
linked_mutations_0idx (Union[list, None]): List of linked mutations in 0-indexed format.
cut_number_range (Tuple[int, int]): The range of cut numbers to consider.
fidelity_data_path (str): Path to the fidelity data.
fusion_sites_used_by_backbone (Tuple[str, ...]): Fusion sites used by the backbone.
min_aa_length (int): Minimum amino acid length for fragments.
max_cost (int): Maximum cost allowed for partitions.
max_unevenness (float): Maximum unevenness allowed in fragment lengths.
min_ligation_fidelity (float): Minimum ligation fidelity required.
satisfaction_fidelity (float): Satisfaction fidelity for partitions.
output_dir (str): Directory to store output files.
supress_output (bool): Flag to suppress console output.
search_method (str): The method used for searching partitions.
codon_usage_table_path (str): Path to the codon usage table.
host (str): The host organism for the sequence.
sort_by_cost (bool): Flag to indicate if sorting should be done by cost.
enzyme (str): The enzyme used for the cuts.
allowed_cut_positions_1idx (list): List of allowed cut positions in 1-indexed format.
partition_search_mode (str): The mode used for partition searching.
select_top_n_partitions (int): The number of top partitions to select.
cost_per_nt (float): Cost per nucleotide for the computation.
provider_min_frag_len (int): Minimum fragment length set by the provider.
provider_max_frag_len (int): Maximum fragment length set by the provider.
max_partition_number_checked (int): Maximum number of partitions to check.
Returns:
tuple: A tuple containing the best partitions and the output path.
"""

print('\033[1m=============================================================================================\033[0m')
print(' \033[1m IDENTIFYING OPTIMAL PARTITIONS \033[0m ')
Expand Down Expand Up @@ -371,7 +518,17 @@ def compute_best_partitions(s: str, mutations_0idx: Union[list, None], linked_mu
each specified number of cuts, and selects the partition with the lowest cost."""


def get_lowest_cost_from_best_partitions(best_partitions_by_cut_number, supress_output=False):
def get_lowest_cost_from_best_partitions(best_partitions_by_cut_number, supress_output=False) -> dict:
"""
Identifies and returns the lowest cost partition from a collection of partitioning results.
Args:
best_partitions_by_cut_number (dict): Mapping whose 'best_partitions_by_cut_number' entry holds the best partitions grouped by cut number.
supress_output (bool): Flag to suppress console output.
Returns:
dict: The partition with the lowest cost.
"""

res_all = best_partitions_by_cut_number["best_partitions_by_cut_number"]
sel_cost = float('inf')
Expand Down
38 changes: 33 additions & 5 deletions seqteleporter/partitioner/partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,39 @@
breadth_first_product)


"""
This module contains functions for partitioning sequences into fragments based on specified cut positions, mutations, and other criteria. It provides utilities for finding cuttable positions and managing the distribution of mutations across fragments.
Functions:
1. `find_cuttable_positions(...)`:
- Computes and returns a list of positions in a sequence where cuts can be made while respecting minimum and maximum fragment lengths and avoiding cuts at mutation sites.
2. `find_even_cuts(string: str, regions: List) -> List`:
- Determines optimal cut positions within specified regions to create evenly sized fragments.
3. `partitioner(s: str, cuttable_positions: list[int], number_of_cuts: int, mutations_0idx: list, linked_mutations_0idx: list, pre_distribute_mutations: bool, one_dist: bool) -> Union[list, str]`:
- Generates partitions of a sequence based on cuttable positions and specified mutation distributions.
4. `sort_cut_sites_by_eveness(string: str, regions: List) -> List[List]`:
- Sorts cut sites based on their evenness to optimize fragment sizes.
5. `generate_cut_ranges_from_a_mutation_distribution(mutation_distribution: dict) -> list`:
- Generates allowable cut ranges based on a given mutation distribution.
6. `count_bases_in_a_mutation_distribution(s: str, distributed_mutations_0idx_lists: List) -> int`:
- Counts the number of bases in a sequence based on a specified mutation distribution.
7. `distribute_mutations(s: str, mutations_0idx: list, linked_mutations_0idx: Optional[list], n_fragments: int) -> List[dict]`:
- Distributes mutations across specified fragments and checks for constraints.
8. `generate_n_set_of_cut_ranges_from_a_list_of_mutation_distributions(mutation_distribution_dicts: list) -> list`:
- Generates a set of cut ranges from a list of mutation distributions.
Dependencies:
- Relies on helper functions and types from the `seqteleporter` package and standard libraries for handling sequences and combinations.
"""


def find_cuttable_positions(s: str, mutations_0idx: Optional[List[Any]], linked_mutations_0idx: Optional[List[Any]],
min_aa_length: int, provider_max_dna_len: int, enzyme: str,
allowed_cut_positions_1idx: list, enzyme_info_dic: Dict[str, dict]) -> list[int]:
Expand Down Expand Up @@ -247,8 +280,3 @@ def generate_n_set_of_cut_ranges_from_a_list_of_mutation_distributions(mutation_
set_of_cut_ranges.append(allow_cut_ranges)
return set_of_cut_ranges






0 comments on commit 34f02db

Please sign in to comment.