From 6f22d73283e2bad66d8852cbc53742bfff8d7231 Mon Sep 17 00:00:00 2001 From: cesen Date: Tue, 7 Dec 2021 18:07:48 +0100 Subject: [PATCH 01/26] Bugfix in expert config: use "no" instead of False --- ksrates/fc_configfile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ksrates/fc_configfile.py b/ksrates/fc_configfile.py index 91c2e34..f737856 100644 --- a/ksrates/fc_configfile.py +++ b/ksrates/fc_configfile.py @@ -788,7 +788,7 @@ class Configuration: :return rank_type: string ("lambda" or "combined"). Default is "lambda". """ if self.expert_config is not None: - rank_type = self.expert_config.get("EXPERT PARAMETERS", "reciprocal_retention_rank_type", fallback="lambda") + rank_type = self.expert_config.get("EXPERT PARAMETERS", "reciprocal_retention_rank_type", fallback="lambda").lower() if rank_type not in ["lambda", "combined"]: logging.warning(f'Unrecognized field in expert configuration file [reciprocal_retention_rank_type = {rank_type}]. \ Please choose between "lambda" and "combined". Default choice will be applied ["lambda"]') @@ -834,7 +834,7 @@ class Configuration: :return boolean: boolean """ if self.expert_config is not None: - filter_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_false_predictions", fallback=False).lower() + filter_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_false_predictions", fallback="no").lower() if filter_false_predictions == "yes": return True @@ -856,7 +856,7 @@ class Configuration: :return boolean: boolean """ if self.expert_config is not None: - filter_zero_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_zero_false_predictions", fallback=False).lower() + filter_zero_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_zero_false_predictions", fallback="no").lower() if filter_zero_false_predictions == "yes": return True -- GitLab From 782d0699ada575ae87e7410f78c1b2d372af0aca Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 16:17:59 +0100 Subject: [PATCH 02/26] Activate rec.ret. in test config file --- test/config_elaeis.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/config_elaeis.txt b/test/config_elaeis.txt index e3da2f8..19b5760 100755 --- a/test/config_elaeis.txt +++ b/test/config_elaeis.txt @@ -20,6 +20,7 @@ ks_list_database_path = ortholog_ks_list_db.tsv [ANALYSIS SETTING] paranome = yes collinearity = yes +reciprocal_retention = yes # analysis type for paralog data; allowed values: 'yes' or 'no' gff_feature = mrna -- GitLab From 5a0fa1fd93cdfc9fa051a80bb9bb8e1a359dcd58 Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 16:59:59 +0100 Subject: [PATCH 03/26] Peform LMM on rec.ret. paralogs if required - "paralog-analyses" command now takes also the rec.ret. Ks TSV file as input - "paralogs_analyses_methods()" is reorganized to handle which methods are by default and which are "extra" now that there are also rec.ret. data - "fc_exp_log_mixture.py" integrates recret - "lognormal_mixture.py" runs LMM also on rec.ret, when asked. 
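Note on the fallback fix in PATCH 01 above: configparser returns the fallback object untouched when an option is missing, so a boolean False would crash on the subsequent .lower(); using the string "no" keeps the yes/no parsing uniform. A minimal, self-contained sketch of that pattern (hypothetical helper name, not ksrates code):

    # Hypothetical sketch, not ksrates code: the fallback must be the string "no",
    # not False, because configparser hands the fallback back unchanged and a
    # boolean has no .lower() method.
    import configparser

    def get_yes_no(cfg, section, option, default="no"):
        """Return True only if the option reads 'yes' (case-insensitive)."""
        return cfg.get(section, option, fallback=default).lower() == "yes"

    cfg = configparser.ConfigParser()
    cfg.read_string("[EXPERT PARAMETERS]\nfilter_by_false_predictions = Yes\n")
    print(get_yes_no(cfg, "EXPERT PARAMETERS", "filter_by_false_predictions"))       # True
    print(get_yes_no(cfg, "EXPERT PARAMETERS", "filter_by_zero_false_predictions"))  # False (falls back to "no")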
--- ksrates/fc_exp_log_mixture.py | 2 +- ksrates/fc_lognormal_mixture.py | 6 ++- ksrates/lognormal_mixture.py | 57 ++++++++++++++++++++--- ksrates/paralogs_analyses.py | 80 ++++++++++++++++++++++++++------- ksrates_cli.py | 10 +++-- 5 files changed, 127 insertions(+), 28 deletions(-) diff --git a/ksrates/fc_exp_log_mixture.py b/ksrates/fc_exp_log_mixture.py index 255d4a0..0d11b00 100644 --- a/ksrates/fc_exp_log_mixture.py +++ b/ksrates/fc_exp_log_mixture.py @@ -437,7 +437,7 @@ def deconvolute_data(tsv_file, max_ks, data_type): """ tail_length = 0.5 # tail spans for 0.5 extra Ks range - if data_type == "paralogs" or data_type == "anchor pairs": + if data_type == "paralogs" or data_type == "anchor pairs" or data_type == "reciprocally retained": ks_data, ks_weights = fc_extract_ks_list.ks_list_from_tsv(tsv_file, max_ks, data_type) elif data_type == "orthologs": ks_data = fc_extract_ks_list.ks_list_from_tsv(tsv_file, max_ks, data_type) diff --git a/ksrates/fc_lognormal_mixture.py b/ksrates/fc_lognormal_mixture.py index d3cea6d..68b51ef 100644 --- a/ksrates/fc_lognormal_mixture.py +++ b/ksrates/fc_lognormal_mixture.py @@ -251,7 +251,7 @@ def lmm( histograms. Please interpret mixture model results with caution. :param fig: figure object :param max_x_axis_lim: upper limit in the x-axis - :param data_type: strings stating whether the data are "paralogs" or "anchor pairs" + :param data_type: strings stating whether the data are "paralogs" or "anchor pairs" or "reciprocally retained" :param tsv_file: wgd output file containing either paranome or anchor pairs Ks values (suffix formats: ".ks.tsv", "ks_anchors.tsv") :param species: informal name of the focal species :param axis: axis object @@ -264,7 +264,7 @@ def lmm( :param output_dir: output folder :param outfile: output file for logging details :param parameter_table: list collecting the component parameters - :param datatype_tag: string for figure title stating if data is paranome or anchors + :param datatype_tag: string for figure title stating if data is paranome or anchors or reciprocally retained :param peak_stats: states whether the cluster peak is intended as median or mode :param correction_table_available: boolean stating whether the adjustment results are available or not :param plot_correction_arrows: boolean stating whether there will be plotted adjustment arrows or not @@ -313,6 +313,8 @@ def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, data paralog_rect = Patch(facecolor=fcPlot.COLOR_PARANOME_HISTOGRAM, edgecolor="w") elif datatype == "anchors": paralog_rect = Patch(facecolor=fcPlot.COLOR_ANCHOR_HISTOGRAM, edgecolor="w") + elif datatype == "reciprocally_retained": + paralog_rect = Patch(facecolor=fcPlot.COLOR_REC_RET_HISTOGRAM, edgecolor="w") # empty patch used as spacer between histograms and divergence line legend entries empty_rect = Patch(fill=False, edgecolor='none', visible=False) diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 53405f0..8ef4691 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -10,11 +10,12 @@ import ksrates.fc_check_input as fcCheck import ksrates.fc_plotting as fcPlot import ksrates.fc_extract_ks_list as fc_extract_ks_list import ksrates.fc_lognormal_mixture as fcLMM -from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM +from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM, COLOR_REC_RET_HISTOGRAM from ksrates.fc_cluster_anchors import subfolder from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE -def 
lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correction_table_file): +def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_ret_tsv_file, correction_table_file, + ignore_paranome=False, ignore_anchors=False, ignore_reciprocally_retained=False): # INPUT config = fcConf.Configuration(config_file) init_logging(f"Lognormal mixture model on Ks distribution", config.get_logging_level()) @@ -22,6 +23,17 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc paranome_analysis = config.get_paranome() colinearity_analysis = config.get_colinearity() + reciprocal_retention_analysis = config.get_reciprocal_retention() + + # Use these optional arguments to avoid performing LMM on a certain data type, overwriting what written in the config file; + # This is used in paralogs_analysis.py when LMM is called as extra method and allows to fine tune what is activated and what not + if ignore_paranome: + paranome_analysis = False + if ignore_anchors: + colinearity_analysis = False + if ignore_reciprocally_retained: + reciprocal_retention_analysis = False + species = config.get_species() latin_names = config.get_latin_names() latinSpecies = latin_names[species] @@ -86,14 +98,25 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc anchors_ks_tsv_file = fcCheck.get_argument_path(anchors_ks_tsv_file, default_path_anchors_tsv_file, "Anchor pair Ks TSV file") if anchors_ks_tsv_file == "": logging.error(f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}].") - if paralog_tsv_file == "" or anchors_ks_tsv_file == "": + if reciprocal_retention_analysis: + default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret.tsv") + rec_ret_tsv_file = fcCheck.get_argument_path(rec_ret_tsv_file, default_path_rec_ret_tsv_file, "Reciprocally retained paralog Ks TSV file") + if rec_ret_tsv_file == "": + logging.error(f"Reciprocally retained paralog Ks TSV file not found at default position [{default_path_rec_ret_tsv_file}].") + + if paralog_tsv_file == "" or anchors_ks_tsv_file == "" or rec_ret_tsv_file == "": logging.error("Exiting") sys.exit(1) fig_para, axis_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=True, colinearity_data=False) + correction_table_available, plot_correction_arrows, paranome_data=True, colinearity_data=False, + reciprocal_retention_data=False) fig_colin, axis_colin = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=True) + correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=True, + reciprocal_retention_data=False) + fig_rec_ret, axis_rec_ret = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", + correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=False, + reciprocal_retention_data=True) parameter_table = [] @@ -140,7 +163,26 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "anchors") - for axis in [axis_para, axis_colin]: + if reciprocal_retention_analysis: + with open (os.path.join("rate_adjustment", 
f"{species}", subfolder, f"lmm_{species}_parameters_reciprocally_retained.txt"), "w+") as outfile: + logging.info("Performing lognormal mixture model on reciprocally retained paralog Ks distribution") + rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") + hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", axis_rec_ret, rec_ret_list, bin_list, + bin_width_para, max_ks_para, kde_bandwidth_modifier, rec_ret_weights, color=COLOR_REC_RET_HISTOGRAM, plot_kde=False) + # Setting the plot height based on tallest histogram bin + if y_lim is None: + fcPlot.set_mixed_plot_height(axis_rec_ret, y_lim, hist_rec_ret) + + best_model_rec_ret = fcLMM.lmm( + fig_rec_ret, x_max_lim, "reciprocally retained", rec_ret_tsv_file, species, axis_rec_ret, (0, max_ks_EM), + (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, + output_dir, outfile, parameter_table, "reciprocally_retained", peak_stats, correction_table_available, plot_correction_arrows) + + # Generating tabular text file with all model parameters + fcLMM.make_parameter_table_file(parameter_table, species, "reciprocally_retained") + + + for axis in [axis_para, axis_colin, axis_rec_ret]: # PLOTTING THE ORTHOLOG DIVERGENCE LINES if correction_table_available: dummy_fig, dummy_axis = plt.subplots() @@ -154,6 +196,9 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc if colinearity_analysis: fcLMM.save_lmm(fig_colin, axis_colin, species, best_model_anchors, "anchors", correction_table_available) logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_anchors.pdf]") + if reciprocal_retention_analysis: + fcLMM.save_lmm(fig_rec_ret, axis_rec_ret, species, best_model_rec_ret, "reciprocally_retained", correction_table_available) + logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_reciprocally_retained.pdf]") logging.info("") logging.info("All done") \ No newline at end of file diff --git a/ksrates/paralogs_analyses.py b/ksrates/paralogs_analyses.py index b8900be..296b024 100644 --- a/ksrates/paralogs_analyses.py +++ b/ksrates/paralogs_analyses.py @@ -1,4 +1,5 @@ +import collections import sys import logging from ksrates.utils import init_logging @@ -7,38 +8,87 @@ from ksrates.exp_log_mixture import exp_log_mixture from ksrates.lognormal_mixture import lognormal_mixture import ksrates.fc_configfile as fcConf -def paralogs_analyses_methods(config_file, paranome_table, anchors_table, correction_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): +def paralogs_analyses_methods(config_file, paranome_table, anchors_table, reciprocal_retention_table, + correction_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): # INPUT config = fcConf.Configuration(config_file) logging.basicConfig(format='%(levelname)s\t%(message)s', level=config.get_logging_level(), stream=sys.stdout) paranome = config.get_paranome() colinearity = config.get_colinearity() + reciprocal_retention_analysis = config.get_reciprocal_retention() extra_paralogs_analyses_methods = config.get_extra_paralogs_analyses_methods() - if paranome and not colinearity: - # Only exp-log mixture model by default + # 1) ONLY WHOLE-PARANOME + if paranome and not colinearity and not reciprocal_retention_analysis: + # Default: perform only exponential-lognormal mixture modeling (ELMM) on paranome 
exp_log_mixture(config_file, paranome_table, correction_table) + if extra_paralogs_analyses_methods: logging.info(f"\n") - # Lognormal mixture model on paranome - lognormal_mixture(config_file, paranome_table, anchors_table, correction_table) - - if colinearity and not paranome: - # Only anchor clustering by default + # If additional methods are asked: perform lognormal mixture modeling (LMM) on paranome + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + + # 2) ONLY COLLINEARITY + if colinearity and not paranome and not reciprocal_retention_analysis: + # Default: perform only anchor clustering cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + if extra_paralogs_analyses_methods: logging.info(f"\n") - # Lognormal mixture model on anchors - lognormal_mixture(config_file, paranome_table, anchors_table, correction_table) + # If additional methods are asked: perform LMM on anchors + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + + # 3) ONLY REC.RET. + if reciprocal_retention_analysis and not colinearity and not paranome: + # Default: perform LMM on rec.ret. paralogs + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + # There are no additional methods for rec.ret. paralogs + + + # 4) COLLINEARITY AND REC.RET. + if colinearity and reciprocal_retention_analysis and not paranome: + # Default: perform anchor clustering and LMM on rec.ret. paralogs + cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + logging.info(f"\n") + if not extra_paralogs_analyses_methods: + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, force_anchors=False) + + elif extra_paralogs_analyses_methods: + # If additional methods are asked: perform LMM both on rec.ret. paralogs and - as extra - on anchors + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, force_anchors=True) - if colinearity and paranome: - # Only anchor clustering by default + # 5) COLLINEARITY AND PARANOME + if colinearity and not reciprocal_retention_analysis and paranome: + # Default: perform only anchor clustering cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + if extra_paralogs_analyses_methods: + # If additional methods are asked: perform ELMM on paranome, and LMM on paranome and anchors + logging.info(f"\n") + exp_log_mixture(config_file, paranome_table, correction_table) + logging.info(f"\n") + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + # 6) REC.RET. AND PARANOME + if reciprocal_retention_analysis and not colinearity and paranome: + # Default: perform LMM only on rec.ret. paralogs and on paranome, and ELMM on paranome + exp_log_mixture(config_file, paranome_table, correction_table) logging.info(f"\n") - # Exp-log mixture model on paranome + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + # 7) COLLINEARITY AND REC.RET. AND PARANOME + if colinearity and reciprocal_retention_analysis and paranome: + # Default: perform anchor clustering and LMM on rec.ret. 
paralogs + cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + logging.info(f"\n") + if not extra_paralogs_analyses_methods: + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, ignore_anchors=True, ignore_paranome=True) + + elif extra_paralogs_analyses_methods: + # If additional methods are asked: perform ELMM on paranome, and LMM on rec.ret. paralogs, anchors and paranome exp_log_mixture(config_file, paranome_table, correction_table) logging.info(f"\n") - # Lognormal mixture model on both - lognormal_mixture(config_file, paranome_table, anchors_table, correction_table) + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) diff --git a/ksrates_cli.py b/ksrates_cli.py index 11b2b14..500fcf4 100644 --- a/ksrates_cli.py +++ b/ksrates_cli.py @@ -182,13 +182,14 @@ def plot_orthologs(config_file, trios): @click.argument('config_file', type=click.Path(exists=True)) @click.option("--paranome-table", type=click.Path(exists=True), help="User-defined path to file containing paranome Ks (default: paralog_distributions/wgd_species/species.ks.tsv)") @click.option("--anchors-table", type=click.Path(exists=True), help="User-defined path to file containing anchor pair Ks (default: paralog_distribution/wgd_species/species.ks_anchors.tsv)") +@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret.tsv)") @click.option("--adjustment-table", type=click.Path(exists=True), help="User-defined path to file containing adjustment results (default: rate_adjustment/species/adjustment_table_species.tsv)") @click.option("--anchorpoints", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file anchorpoints.txt (default: paralog_distributions/wgd_species/species_i-adhore/anchorpoints.txt)") @click.option("--multiplicons", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file multiplicons.txt (default: paralog_distributions/wgd_species/species_i-adhore/multiplicons.txt)") @click.option("--segments", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file segments.txt (default: paralog_distributions/wgd_species/species_i-adhore/segments.txt)") @click.option("--list-elements", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file list_elements.txt (default: paralog_distributions/wgd_species/species_i-adhore/list_elements.txt)") @click.option("--multiplicon-pairs", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file multiplicons_pairs.txt (default: paralog_distributions/wgd_species/species_i-adhore/multiplicons_pairs.txt)") -def paralogs_analyses(config_file, paranome_table, anchors_table, adjustment_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): +def paralogs_analyses(config_file, paranome_table, anchors_table, reciprocal_retention_table, adjustment_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): """ Reconstructs potential WGD peaks in the paralog Ks distributions. 
@@ -206,6 +207,8 @@ def paralogs_analyses(config_file, paranome_table, anchors_table, adjustment_tab click.format_filename(paranome_table) if anchors_table: click.format_filename(anchors_table) + if reciprocal_retention_table: + click.format_filename(reciprocal_retention_table) if adjustment_table: click.format_filename(adjustment_table) if anchorpoints: @@ -218,9 +221,8 @@ def paralogs_analyses(config_file, paranome_table, anchors_table, adjustment_tab click.format_filename(list_elements) if multiplicon_pairs: click.format_filename(multiplicon_pairs) - paralogs_analyses_methods(config_file, paranome_table, anchors_table, adjustment_table, - anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs) - + paralogs_analyses_methods(config_file, paranome_table, anchors_table, reciprocal_retention_table, + adjustment_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs) # For debugging -- GitLab From 3422cf8bc194ac8e66f05dd4399df5e06bf1a277 Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 17:04:46 +0100 Subject: [PATCH 04/26] Generate mixed plot PDFs required by config file New dynamics in plotting mixed plots - Any data type that is active (i.e. paranome, anchors or rec.ret.) will be plotted separately in its own PDF figure. - Any pair of active data types (e.g. paranome AND anchors) will be plotted together in a single PDF figure. The same holds for when all three types are acitve. However, to avoid having too many figures generated in the directory, these "composed" plots are saved into a subdirectory called "other_mixed_plots". - The code now looks really redundant, because I had to repeat the same functions with slightly different arguments depending on the active data types; ideally should make it less redundant in the future - Still work in progress, needs to be tested --- ksrates/fc_plotting.py | 28 +++-- ksrates/plot_paralogs.py | 248 ++++++++++++++++++++++++++++++--------- 2 files changed, 212 insertions(+), 64 deletions(-) diff --git a/ksrates/fc_plotting.py b/ksrates/fc_plotting.py index 36a7eed..8d2f5ed 100755 --- a/ksrates/fc_plotting.py +++ b/ksrates/fc_plotting.py @@ -49,8 +49,9 @@ LINEWIDTH_CIRCLE_LABEL_BORDER = 1 NEGATIVE_Y_FRACTION = 0.08 # Filenames -_MIXED_ADJUSTED_PLOT_FILENAME = "mixed_{}_adjusted.pdf" -_MIXED_UNADJUSTED_PLOT_FILENAME = "mixed_{}_unadjusted.pdf" +_MIXED_ADJUSTED_PLOT_FILENAME = "mixed_{}_adjusted_{}.pdf" +_MIXED_UNADJUSTED_PLOT_FILENAME = "mixed_{}_unadjusted_{}.pdf" +_OTHER_MIXED_PLOTS_SUBDIR = "other_mixed_plots" def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not, correction_table_available, @@ -98,12 +99,12 @@ def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not, seaborn.despine(offset=10) ax.set_xlabel("$K_\mathregular{S}$") - if paranome_data: # If paranome (with or without anchors) is going to be plotted + if paranome_data or reciprocal_retention_data: + # If paranome or rec.ret paralogs appear in the plot, use "duplicates" ax.set_ylabel("Number of retained duplicates (weighted)") - elif colinearity_data: # If only anchor pairs are going to plotted + elif colinearity_data: + # If instead only anchor pairs are going to plotted, use "anchor pairs" ax.set_ylabel("Number of retained anchor pairs (weighted)") - elif reciprocal_retention_data: # If only reciprocally retained data are going to be plotted - ax.set_ylabel("Number of retained duplicates (weighted)") ax.set_xlim(0, x_max_lim) if isinstance(y_max_lim, float): @@ -531,7 +532,7 @@ def 
create_legend(axis, paranome, colinearity, reciprocal_retention, legend_size def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available, - paranome, colinearity, reciprocal_retention): + paranome, colinearity, reciprocal_retention, filename_adjusted, filename_unadjusted): """ This function must be called to save the mixed distribution figure in order to adjust the figure layout: the plot area is shrunk to the left and some reasonable space is left on the right side for the legend. @@ -544,27 +545,32 @@ def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correctio :param paranome: the config file field that states if the whole-paranome has to be plotted [True/False] :param colinearity: the config file field that states if the anchor pairs have to be plotted [True/False] """ + directory_path = os.path.join("rate_adjustment", f"{species}") + # Generate the subdirectory for mixed plot composed of two or three data types (e.g. paranome & anchor pairs) + if not os.path.exists(os.path.join(directory_path, _OTHER_MIXED_PLOTS_SUBDIR)): + os.makedirs(os.path.join(directory_path, _OTHER_MIXED_PLOTS_SUBDIR)) + if correction_table_available: legend_size = define_legend_size(ax_corr) chart_box = ax_uncorr.get_position() ax_uncorr.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height]) lgd = create_legend(ax_uncorr, paranome, colinearity, reciprocal_retention, legend_size) - fig_uncorr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_UNADJUSTED_PLOT_FILENAME.format(species)), + fig_uncorr.savefig(os.path.join(directory_path, filename_unadjusted), bbox_extra_artists=(lgd, fig_uncorr._suptitle), bbox_inches="tight", transparent=True, format="pdf") ax_corr.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height]) lgd = create_legend(ax_corr, paranome, colinearity, reciprocal_retention, legend_size) - fig_corr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_ADJUSTED_PLOT_FILENAME.format(species)), + fig_corr.savefig(os.path.join(directory_path, filename_adjusted.format(species)), bbox_extra_artists=(lgd, fig_corr._suptitle), bbox_inches="tight", transparent=True, format="pdf") else: # if not correction_table_available use a simpler layout with the legend # inside the plot and no right margin lgd = ax_uncorr.legend(handlelength=1.5, loc="upper right") - fig_uncorr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_UNADJUSTED_PLOT_FILENAME.format(species)), + fig_uncorr.savefig(os.path.join(directory_path, filename_unadjusted), transparent=True, format="pdf") lgd = ax_corr.legend(handlelength=1.5, loc="upper right") - fig_corr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_ADJUSTED_PLOT_FILENAME.format(species)), + fig_corr.savefig(os.path.join(directory_path, filename_adjusted), transparent=True, format="pdf") diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index a57c8df..d2c1677 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -9,6 +9,7 @@ import ksrates.fc_extract_ks_list as fc_extract_ks_list import ksrates.fc_check_input as fcCheck import ksrates.fc_configfile as fcConf from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_plotting import _MIXED_ADJUSTED_PLOT_FILENAME, _MIXED_UNADJUSTED_PLOT_FILENAME, _OTHER_MIXED_PLOTS_SUBDIR def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, anchors_ks_tsv_file, rec_ret_tsv_file): # INPUT @@ -98,44 +99,137 @@ def 
plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an # GENERATING MIXED DISTRIBUTION PLOT logging.info("") - # TODO: REMOVE THE FOLLOWING COMMENTED BLOCK ONCE THE IF-ELIF LIST IS PROVED TO BE CORRECT - # if not colinearity_analysis: - # logging.info(f"Plotting paranome Ks distribution for species [{species}]") - # elif not paranome_analysis: - # logging.info(f"Plotting anchor pair Ks distribution for species [{species}]") - # else: - # logging.info(f"Plotting paranome and anchor pairs Ks distributions for species [{species}]") - if not colinearity_analysis and not reciprocal_retention_analysis: + ax_uncorr_list, ax_corr_list = [], [] + ax_uncorr_include_para, ax_corr_include_para= [], [] + ax_uncorr_include_col, ax_corr_include_col= [], [] + ax_uncorr_include_rr, ax_corr_include_rr= [], [] + + # Generate the mixed plot with a single data type (e.g. paranome) for any required data_type + if paranome_analysis: logging.info(f"Plotting paranome Ks distribution for species [{species}]") - elif not paranome_analysis and not reciprocal_retention_analysis: + fig_uncorr_para, ax_uncorr_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=False, + reciprocal_retention_data=False, top=top, rank_type=rank_type) + fig_corr_para, ax_corr_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=False, + reciprocal_retention_data=False, top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para) + ax_corr_include_para.append(ax_corr_para) + ax_uncorr_list.append(ax_uncorr_para) + ax_corr_list.append(ax_corr_para) + + if colinearity_analysis: logging.info(f"Plotting anchor pair Ks distribution for species [{species}]") - elif not paranome_analysis and not colinearity_analysis: + fig_uncorr_col, ax_uncorr_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=False, colinearity_data=colinearity_analysis, + reciprocal_retention_data=False, top=top, rank_type=rank_type) + fig_corr_col, ax_corr_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=False, colinearity_data=colinearity_analysis, + reciprocal_retention_data=False, top=top, rank_type=rank_type) + ax_uncorr_include_col.append(ax_uncorr_col) + ax_corr_include_col.append(ax_corr_col) + ax_uncorr_list.append(ax_uncorr_col) + ax_corr_list.append(ax_corr_col) + + if reciprocal_retention_analysis: logging.info(f"Plotting reciprocally retained paralog Ks distribution for species [{species}]") - elif paranome_analysis and colinearity_analysis: + fig_uncorr_rr, ax_uncorr_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=False, colinearity_data=False, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + fig_corr_rr, ax_corr_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=False, colinearity_data=False, + 
reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_rr.append(ax_uncorr_rr) + ax_corr_include_rr.append(ax_corr_rr) + ax_uncorr_list.append(ax_uncorr_rr) + ax_corr_list.append(ax_corr_rr) + + # Generate the mixed plot for any required pair of data types (e.g. paranome and anchors) + if paranome_analysis and colinearity_analysis: logging.info(f"Plotting paranome and anchor pairs Ks distributions for species [{species}]") - elif paranome_analysis and reciprocal_retention_analysis: + fig_uncorr_para_col, ax_uncorr_para_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + reciprocal_retention_data=False, top=top, rank_type=rank_type) + fig_corr_para_col, ax_corr_para_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + reciprocal_retention_data=False, top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para_col) + ax_corr_include_para.append(ax_corr_para_col) + ax_uncorr_include_col.append(ax_uncorr_para_col) + ax_corr_include_col.append(ax_corr_para_col) + ax_uncorr_list.append(ax_uncorr_para_col) + ax_corr_list.append(ax_corr_para_col) + + if paranome_analysis and reciprocal_retention_analysis: logging.info(f"Plotting paranome and reciprocally retained paralog Ks distributions for species [{species}]") - elif colinearity_analysis and reciprocal_retention_analysis: - logging.info(f"Plotting anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") - else: - logging.info(f"Plotting paranome, anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") + fig_uncorr_para_rr, ax_uncorr_para_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=False, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + fig_corr_para_rr, ax_corr_para_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=False, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para_rr) + ax_corr_include_para.append(ax_corr_para_rr) + ax_uncorr_include_rr.append(ax_uncorr_para_rr) + ax_corr_include_rr.append(ax_corr_para_rr) + ax_uncorr_list.append(ax_uncorr_para_rr) + ax_corr_list.append(ax_corr_para_rr) - # PLOTTING THE BACKGROUND PARALOG AND/OR ANCHOR DISTRIBUTION - fig_uncorr, ax_uncorr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, - "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + if colinearity_analysis and reciprocal_retention_analysis: + logging.info(f"Plotting anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") + fig_uncorr_col_rr, ax_uncorr_col_rr = 
fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=False, colinearity_data=colinearity_analysis, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + fig_corr_col_rr, ax_corr_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=False, colinearity_data=colinearity_analysis, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_col.append(ax_uncorr_col_rr) + ax_corr_include_col.append(ax_corr_col_rr) + ax_uncorr_include_rr.append(ax_uncorr_col_rr) + ax_corr_include_rr.append(ax_corr_col_rr) + ax_uncorr_list.append(ax_uncorr_col_rr) + ax_corr_list.append(ax_corr_col_rr) - fig_corr, ax_corr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, - "corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + # Generate the mixed plot for the three data types altogether if required by configuration + if paranome_analysis and colinearity_analysis and reciprocal_retention_analysis: + logging.info(f"Plotting paranome, anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") + fig_uncorr_para_col_rr, ax_uncorr_para_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + fig_corr_para_col_rr, ax_corr_para_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para_col_rr) + ax_corr_include_para.append(ax_corr_para_col_rr) + ax_uncorr_include_col.append(ax_uncorr_para_col_rr) + ax_corr_include_col.append(ax_corr_para_col_rr) + ax_uncorr_include_rr.append(ax_uncorr_para_col_rr) + ax_corr_include_rr.append(ax_corr_para_col_rr) + ax_uncorr_list.append(ax_uncorr_para_col_rr) + ax_corr_list.append(ax_corr_para_col_rr) + # PLOTTING THE BACKGROUND PARALOG DISTRIBUTION(S) if paranome_analysis: paranome_list, paranome_weights = fc_extract_ks_list.ks_list_from_tsv(paralog_tsv_file, max_ks_para, "paralogs") - hist_paranome = fcPlot.plot_histogram("Whole-paranome", ax_uncorr, paranome_list, bin_list, bin_width_para, - max_ks_para, kde_bandwidth_modifier, weight_list=paranome_weights) - fcPlot.plot_histogram("Whole-paranome", ax_corr, paranome_list, bin_list, bin_width_para, + for ax_uncorr in ax_uncorr_include_para: + hist_paranome = fcPlot.plot_histogram("Whole-paranome", ax_uncorr, paranome_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, weight_list=paranome_weights) + for ax_corr in ax_corr_include_para: + fcPlot.plot_histogram("Whole-paranome", ax_corr, paranome_list, bin_list, bin_width_para, max_ks_para, kde_bandwidth_modifier, weight_list=paranome_weights) if colinearity_analysis: @@ 
-143,43 +237,91 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an if len(anchors_list) == 0: logging.warning(f"No anchor pairs found! Maybe check your (gene) IDs between " f"the [{species}.ks_anchors.tsv] file and the [{species}.ks.tsv] files.") - # FOR NOW USE AN UNWEIGHTED HISTOGRAM - hist_anchors = fcPlot.plot_histogram("Anchor pairs", ax_uncorr, anchors_list, bin_list, bin_width_para, max_ks_para, - kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) - fcPlot.plot_histogram("Anchor pairs", ax_corr, anchors_list, bin_list, bin_width_para, max_ks_para, - kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) + for ax_uncorr in ax_uncorr_include_col: + hist_anchors = fcPlot.plot_histogram("Anchor pairs", ax_uncorr, anchors_list, bin_list, bin_width_para, max_ks_para, + kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) + for ax_corr in ax_corr_include_col: + fcPlot.plot_histogram("Anchor pairs", ax_corr, anchors_list, bin_list, bin_width_para, max_ks_para, + kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) if reciprocal_retention_analysis: rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") - hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", ax_uncorr, rec_ret_list, bin_list, bin_width_para, - max_ks_para, kde_bandwidth_modifier, weight_list=rec_ret_weights) - fcPlot.plot_histogram("Reciprocally retained paralogs", ax_corr, rec_ret_list, bin_list, bin_width_para, - max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) - - # Setting plot height based on tallest histogram (if paranome analysis is on, it will come from that distribution) + for ax_uncorr in ax_uncorr_include_rr: + hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", ax_uncorr, rec_ret_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) + for ax_corr in ax_corr_include_rr: + fcPlot.plot_histogram("Reciprocally retained paralogs", ax_corr, rec_ret_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) + + # Setting plot height based on tallest histogram among the paralog distribution(s) if y_lim is None: - if paranome_analysis: + # Set the height based on paranome distribution when this is for sure the tallest + for ax_uncorr, ax_corr in zip(ax_uncorr_include_para, ax_corr_include_para): fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome) fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome) - elif colinearity_analysis and not reciprocal_retention_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors) - elif not colinearity_analysis and reciprocal_retention_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_rec_ret) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_rec_ret) - elif colinearity_analysis and reciprocal_retention_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors, hist_rec_ret) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors, hist_rec_ret) + + # Set height based on anchor pair distributions if that's the only one in the plot + 
fcPlot.set_mixed_plot_height(ax_uncorr_col, y_lim, hist_anchors) + fcPlot.set_mixed_plot_height(ax_corr_col, y_lim, hist_anchors) + + # Set height based on reciprocally retained paralog distribution if that's the only one in the plot + fcPlot.set_mixed_plot_height(ax_uncorr_rr, y_lim, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr_rr, y_lim, hist_rec_ret) + + # Set height to tallest distribution between anchor pair and reciprocally retained paralogs + fcPlot.set_mixed_plot_height(ax_uncorr_col_rr, y_lim, hist_anchors, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr_col_rr, y_lim, hist_anchors, hist_rec_ret) + # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution if correction_table_available: logging.info("Plotting ortholog divergence lines in the mixed plot") - fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr, ax_corr, color_list, plot_correction_arrows) + for ax_uncorr, ax_corr in zip(ax_uncorr_list, ax_corr_list): + fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr, ax_corr, color_list, plot_correction_arrows) logging.info("") - logging.info(f"Saving PDF figures of mixed plots [{fcPlot._MIXED_ADJUSTED_PLOT_FILENAME.format(species)}, {fcPlot._MIXED_UNADJUSTED_PLOT_FILENAME.format(species)}]") - fcPlot.save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available, paranome=paranome_analysis, - colinearity=colinearity_analysis, reciprocal_retention=reciprocal_retention_analysis) + logging.info(f"Saving PDF figures of mixed plots") + if paranome_analysis: + fcPlot.save_mixed_plot(fig_corr_para, fig_uncorr_para, ax_corr_para, ax_uncorr_para, species, correction_table_available, paranome=paranome_analysis, + colinearity=False, reciprocal_retention=False, + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome")) + + if colinearity_analysis: + fcPlot.save_mixed_plot(fig_corr_col, fig_uncorr_col, ax_corr_col, ax_uncorr_col, species, correction_table_available, paranome=False, + colinearity=colinearity_analysis, reciprocal_retention=False, + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, "anchors"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "anchors")) + + if reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_rr, fig_uncorr_rr, ax_corr_rr, ax_uncorr_rr, species, correction_table_available, paranome=False, + colinearity=False, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, "recret"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "recret")) + + if paranome_analysis and colinearity_analysis: + fcPlot.save_mixed_plot(fig_corr_para_col, fig_uncorr_para_col, ax_corr_para_col, ax_uncorr_para_col, species, correction_table_available, + paranome=paranome_analysis, colinearity=colinearity_analysis, reciprocal_retention=False, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors"))) + + if paranome_analysis and reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_para_rr, fig_uncorr_para_rr, ax_corr_para_rr, ax_uncorr_para_rr, species, correction_table_available, + 
paranome=paranome_analysis, colinearity=False, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome_recret")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome_recret"))) + + if colinearity_analysis and reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_col_rr, fig_uncorr_col_rr, ax_corr_col_rr, ax_uncorr_col_rr, species, correction_table_available, paranome=False, + colinearity=reciprocal_retention_analysis, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "anchors_recret")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "anchors_recret"))) + + if paranome_analysis and colinearity_analysis and reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_para_col_rr, fig_uncorr_para_col_rr, ax_corr_para_col_rr, ax_uncorr_para_col_rr, species, correction_table_available, + paranome=paranome_analysis, colinearity=reciprocal_retention_analysis, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors_recret")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors_recret"))) logging.info("") logging.info("All done") -- GitLab From d31084e90934b27b94db3e050a011346e43b2ada Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 17:15:06 +0100 Subject: [PATCH 05/26] Fix absolute path of cds sequences --- ksrates/fc_wgd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index b01fcbe..d6c54ac 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -349,7 +349,8 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type if do_ks: logging.info('---') logging.info('Running reciprocally retained gene families Ks analysis...') - + cds_sequences = read_fasta(os.path.abspath(cds_fasta)) + # tmp directory cw_dir = os.getcwd() if not os.path.exists(tmp_ks_paralogs): @@ -357,7 +358,6 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir logging.info(f'Translating CDS file {cds_fasta}...') - cds_sequences = read_fasta(os.path.abspath(cds_fasta)) protein_sequences = translate_cds(cds_sequences) if not filter_false_predictions and not filter_zero_false_predictions: -- GitLab From a7b5780943e1ae42c1fbfcc1c7a1178242906c6a Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 17:16:26 +0100 Subject: [PATCH 06/26] Longer elaeis FASTA to get enough rec.ret paralogs --- test/elaeis.fasta | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/elaeis.fasta b/test/elaeis.fasta index 1dbfafa..8c17004 100644 --- a/test/elaeis.fasta +++ b/test/elaeis.fasta @@ -998,3 +998,11 @@ ATGCCAATCTCTTATGCAAACTCTGGTGACAGTCTTGCAGTTTTTCCACGAAGAATTGAGTTCGATTCCGACATCATTGC 
ATGGTTTCCCCAGAAGTCATCCGGACCGCGGTCGGGATTCTAGGAAATGCCATTGCACTAGGGTTGTTCTTGTCCCCAGTGCCAACTTTCATGAGAATACGGAAGAAGGGTTCGGTGGAGGAATTCTCACCATTCCCATACCTAGCCACCCTCCTCAACTGCATGATGTGGGTGGTGTATGGGCTGCCCATGGTGCACCCCCATAGCATCCTGGTCATAACCATTAATGGGTCAGGAGTTATCATAGAGCTCTCCTTCGTGCTTTTCTTCCTCCTCTACTCCCAGGGGAGGAAGAGATTCTTGGTGCTTGCGATGCTGCTCGCTGAGATCGCCGTCGTCGGCATCGTCACCCTGCTTGTACTGACACTTGCCCACACATATGACCGTCGCTCCTTGATCGTTGGCGTTGTCTGCGTCTTCTTGTGTGCCATGATGTATGCTTCTCCCTTATCCGTCATGAAAATGGTGATTCAGACAAGGAGCGTGGAATTCATGCCTCTCTTTCTCTCGCTGGCCTCTTTCTTCAACGGCACTTGCTGGACAGCCTATGCTCTCATCCGTTTCGACCCATACATCATTATTCCAAATGCCCTTGGAGTAATCTTTGCGGTGGCACAATTGATATTGCATGTGGTGTACCGCAAGTCCACCAAGGAACAGTTAGAGGCGAAGAAGGCGGCCCCGGCAGCAGATATGGGATTGGCCGAGGTGGTGGTGGTGAAAGAAGAGACAAGCAAGATGAGCAATGCATCTTCTAATGGCTTCTAA >p5.00_sc00001_p0500.1 ATGGAAAGATCGAGGTCGAAGAGGAGCTACCATTACGAACACGACTCCAACTCCCAAAGCCCTCCCCACCGCTCCAAGCCCCGCTACGATGGCGGCCACCACCGCCGCGGAAACCACCACCGCCGGGGCGGCGACCGCCGGGCCCCGCCCCCTCCGCCTCCCCCTCCCCCGCCACCGGTGCCGGGGGCGGCGGCGCTGGCCTCCTCATCCTCTGCCTCCTCCCCCAAGATCCAGGACGGCGCGGGGCCACCCAGCGCGGTTACCACCTTTTTCCGCATTCTGTGCCCCGACTCGAAGGTCGGCGGCGTGATCGGCAAGTCCGGGAGCATCATCAAGAACTTCCGTCAGCAGACCGGCGCGTGGATCAACGTCCACCAGCTGGTCCTCGGTGACGACGAGCGGATCATCGAGACGGCGGACAACCGCCGCCGGGAGCCCGACGGCCGGCCCCCGCAGTACTCCCCAGCCCAGGAGGCCCTGCTGCTGATCCACGAGCGGATCGTGGATGCAGAGTTCGAGTATAGCGGCGGCGGGGACGAGGAGGACGAGTATAGCGCGGGCGGCAGTGGTGGAGGCGGAGGAGGGTGGGGCGGTGGAGGAGGAGAGAGGCGAAGGGATAGGGGACGGGTAACGACGAGGCTGGTGGTGCCGAGGACACACGTCGGGTGCTTGCTTGGGAGGGGAGGGAAGATCATTGAGCAGATGAGGATCGAGACCAAAACCCACATCCGAATCTTGCCGCGGGATCAGTACACTCCTCAGTGTGTCTCGACTTCAGAAGAAGTCGTTCAGGTTGTTGGAGAAGGTAATTGTGTGAAGAAAGCTGTTGCAACCATCTCATCTCGTTTAAAGGAGAGCTTGCATCGTGATCGTGGAGCTTTTCGTGGGCGATTGCATTCACCAGACCATTACATTACCCCTGATGATGAGTTTGCCAACAGTACGCAACATGCATTGACAGTGGAAGAATCTGATTTGGGGTCACGATCTTCTGTGGGACAAAATAGAGCTAGAATCAATTCTTATAGTTCAGAACCATCTGGATATGCATTTGATTCTGATGGAAATACTCTGAATGACCACTCACAGTCATTATCCTATGAGGACCTTGTGTTTCGAATTCTTTGCCCCAATGATAAAGTTGAGAGTGTTATGGAAGCACCAAATGGAATTATTGAAATGCTCCGAGCTGATATAGGTGTGGATGTGCAGGTTACTGATCCTGTACCCGGTTCTGATGAAAGAATCATAATCATCACTTCTGAAGAGGGCCCGGATGATGATCTTTTTCCAGCTCAGGAAGCTTTGTTGCATATTCAAACTCACATTGTAGACCTTGGTCCTGACAAGGACAACATCATAACAACAAGGCTTCTTGTTCCAGCAACAGAAATTGCTTGTTTGGAGGGGAAGGATGGATCATTATCTGATATACAAAGATTCACTAATGCCAATGTACAAATCTTACCAAAAGAGGATCTTCCCCCGTGTGCATTGGAGGCTGATGAACTTGTACAGATTGTAGGGGAGATCAGAGCAGCTCGAAATGCTCTTGTCCATGTTACTGCAAAGTTGCGGAGTTATTTGTATCGTGACATCTCTGGACCAAAGGATATGCTTCCACCATCTATTGCTGCCCCAAGCCATGTTGGTAGCATTGGTGGGCATGAATCTGGCTCTCCTATTAAAGCTTCAACTCGTGAAACTTATCTGGGAAGTGATCCTCCCATTGCAATGTATCAAAACATGCATACTGCAACAACTGCCTGGCAACCAAAGGACACTGGAGGATGTGCAAGTGGATCATTTGAGCAGGAAGAGAGCAATGTTAATGATGAAGGGAGGCAGAGTGGTGTGAAAAGACTCAGTGTACCGCTAATCACCAGAAGCACACTGGAAGTTATAATACCCAGACATGCAGTTCCAAGCCTGATATTGAGATCAGGAAGCAAGCTTGCTCAGATCAGTGAGATGTCTGGGGCAACTGTCACCCTTATTGAAGATAAATCTGAATTGGCTGAAAAGGTTGTTCAAATATCAGGAAGTCCAGAGCAGGCTGAGAGAGCCCAGAGCCTGCTCCAGGGATTTATACTTAGTACACAAGATGACATTCCTTCGAGCTAA +>p5.00_sc00001_p0707.1 +ATGCCATCTCTCTGCTCCCCGTATGATTCTCAGCTGAATGATCGGCAATGTGAACCTCAAGAGGTGTCAATCACCGATGCACCAGTGGTTTGGCAGAACTACACGTGCGAAGTTTCGGTTTCAGGTTTATGTATCAGCAGTGGAAGGATTACTCCAGATATATATTTTCATCTGTCAGAGGCAGTAAATGCGAGCTATGCCCTCTACCACTACACTCCGCTATTACTCAGTCTTCAGGATTGCAGATTCGTCCGAGAAACGTTCGACACCATCACCACAAACTACTGCCCCAATCTGGAGCACGACCTCCGCATGGTCAGTGTTGGACTGGCCCTGATATCCATCGGGGTCTTGCTCTGCCTCATCTTGTGGATTTTCTATGCGAACCGCCCCCAAAGGGAGGAGGTGTTTGCGCCGCTGTCTGAAGTGAAGTTTGCTCCAAATAGCAACATTTCCGTAAGAATCGATGTATCATCCTAG +>p5.00_sc00001_p0708.1 
+ATGGATGCCCGCGGCGCCGCCGGCTCCGGAGGAAGCGGCGGAAGGGGGATCGCCTTCTGCCTCCTCGTCTTGGTCTCTCTTTTCGTGGCCTCCGAGGCGTTGCCGGCTCTCGCTGGTGGAGAGGAAGGCGCTAGGAAGATTAGCCTAGATTTAAAGAGTTTGCCGGCATGGGCGAAAGATCTCTTAAAGTTGACACCAATGGCACCAGAACCGACAACTTCGGCTCCAAGTGCAAAATTCCCGCTGGTTTTAGCGGCAAACAGAACAAGGAGGCCGGATGTGCTTGATAACTTCAGAATGTATGGCGGAGGTTGGAACATTACTAACAAGCATTACTGGGTGTCTGTTGCGTTTACAGGAGCTGCTGGATTTGTTCTTTCAGCATTGTGGTTTGTTTCTTTTGGGTTGGTTTTAGGTGTACATCTCTGCTGTAGACGGGGAATGGGAACCAAAGAGGGAGGATCATTCTGTGCACGACGTATTTGTCTTGCATTGTTGATGCTCTTCACATGTGCTGCAGCGACCGGGTGCATTCTTCTCTCTGTGGGCCAGAATGAGTTCCATGAGGAAGTTTTGGATACTTTAAAGTTTGTTGTCAATCAATCAGATTTCACTATTCAGATACTAAGGAATGTCACAGATTTTCTGTCCTCTGCCAAGACAATTAATGTGGCAGAGGTCTATCTCCCGCCGGAAATTTGGAATGAAATTGACAAGTTAAATGGAGAGTTAAATGGTGCAGCTAGTATCCTATCAGAGAAAACAAATGAAAGTTCTGAGAAAATCAGAGCCGTCATCGATGACTTGCGTTGCATATTGATTGTGGTAGCATCACTGATGCTTTTGCTGGCCATACTTGGTTTCTTGCTTTCAGTTTGGGGTCACAAACATGCAATTTATGTATTTATAATGAGTGGTTGGCTCCTTGTGGCAGTTACCTTCATTCTCTATGGATTTTTTGTGATCCTTAATAAGAGATTGGCAAATGCATGTGCCAGGCAGACCTCTCATTCTGCACATACTGTGTGA +>p5.00_sc00002_p0130.1 +ATGGCGGACGAGCAATGGCGCGAGGAGGACTACTCCCAGCTGAGAGATCTCCGTGTGAGCCTCGATCAAGTAGGCGAAGACGGCGAAGGAAGAGGAGATGGGGGCGGTGATGGCGGCAGCGGCTTCTCCCTCTGCCTCTGGCTTTACCTCTCGAGCTCCGCTCGCCCTTCTTCCATGATCCTTCGACAGATGACTTCGGAAAATGAAGGTGAAGTTCCCTTTCTGGCCTTGAGTGAGCAAAACAAGCTGATGCTCTTTCCTTTGATGTTCTTACATAAAGAAGCTCCATCTACTGGCAGTTCCTTTACATGGACTGACATGCCCCACATATCTTCCAAGATTGAATTCCCACTGGAGAAATGGGTTCATGTTGGGTGTGAGGTAGCCACCCATCATATGCGCCTTCACATTGATGGAAAACTAGTTGGAGAGGAACCTGTAACACTCCTAACCAATAAGCATCACTATCGGGACAATTTCAAGAGAATAAATTTTGTTGGTAATGATGGGAAACTTGATGGTTATGTCTATCATGCTCAAGTTTTGCCTATATCAGCATCGATAACAGATCAGTTTGTAAAGGATCCGCCAGTCAAATTATCTCTTGATAGTTCAGGCATCTCTGATGGAGTTGAAGAATGTGGGGATGGTGTCTGGAGCATTGTTGGTGGCAAGGCATCTTGTCGCAGGAATTTTTCGCTGGATGTTGTTCTCTTAAATGCTTTTGGTCGGACTGTAAACAAGGAGATGGAGATTGTTGCTTCACTTGTATATGCCGATAATGGAACTCCTGTTGAAAAGACTAGAGATGATGCAGAAGCTCCATTACTGACGAGCTATGATGGACTTGAACACCCTTCCACGGACAGGCCTGTTACACTTCTTCGTGGATGTGCAACCTTTAAGCTGAAGATATCTCAGCTGTCTTCTAAATCTGACAATAGGCTGTTCCGTGTTTGTTTTCGTACAATGCATGCTCAAAGATATCCATTTCTAGAAGCATACTCCCATCCAATCCATTGCATCTCAAGGAATCGCAGTAACCGACCATTGGCTTTAGGGAAAAGATTGGCTTCCACAAGGGTGCTACTTGATGAAATTCAGTCACTCAAAGGCAATGATGGGTCTAGAGTAATTCATGATACCTATGGAGATGATCGTTTCCGCATCCCGAATCAGAGCGTATTGAGATGCAATACCCCATCAAAGCATTCTAAACTAGAACATGATAGCTCACCTTCGGCAATTGATGCTAATGGAATCTTGGAAAAGGATGAGATGTCACTGAAGACAAGTCTAGAAGTAAGGTCGAATAATTTTGAAGGAACGGATAGTGCACCATCAGATTCTGAGAGCACTGATGCTAGGAACTATGAATCCAGATGGACAAAAGAGGCTATGAACCCAATTTCAGATGCTATTATGTTTAAATACTGTTTAGAAGGCACAAATGAGCGGTTGATGTTGCTCAAAGAGATGATAACCTCTGCCTGTGATGAAGATATCACTAACTTCGCAGAGCAGGTTTCCTTGTATGCCGGATGTTCTCATCACCAATATCAAATATTGATTTCAAAGCGGTTGCTCCAAGAGGGGGCGGATACTTGGAACTCAATTACTCAGAACAACTGCCATGTTCTCTGGAAAGATGCAGTTCCTGAGATTGATAAAAGATTCAGGAACACTGCCCATTCTAACAGGGGTTTGTCAGGAGAGGACCTGGAGGTTTTGCGAGGAATTGCTGGATGTGGTGATCATTTGGGTCGAGAAGAATTTGATAGAATGTGGTACTGGTTATACCCTGTAGCTTTTTCTTTGTCGAACGAACAAATAAATACCATGTGGGCATGCATCTCACCCAAATGGATAGAAGGATTAATTACTAGGGAAGAAGCTGAGAATGCGCTGAAAGGTCCTAGAGGGCTGCAGAGACCAGGAACCTTTGTTCTCAGGTTTCCAACCTCTCGCAGCTGGCCTCACCCAGATGCTGGTAGCTTAGTTGTCACCTATGTCGCTGCTGATTCCACACTTCATCATAAGCTTCTTTCTTTTGATTATAGGAAAAAGGACACCAGCCCACTTCAAGAGTTGCTTTTGGAAGAGCCTGAATTATCGCATTTGGGAAGGCAAGTTACACTGAAATATTATTTCATGCTTGTCTATGGTGCTTTTGTCTGTAGTTAA +>p5.00_sc00002_p0131.1 
+ATGAGGCCTCTCATCCCTCTCTCTCACACTACCATCCTCTCTCTCCTCCTCATTTCCTCCATTGCCAAGGCTTCCATCCTCGAAGAAACTTGCAAGAGCGTCGCGGCAGCCCGCCCCAACATCGGCTACGAATTTTGCGCGACATCTCTCCGAGCGGATCCAGGAAGCAGCTCCGCAGATCTCCATGGTCTCGCGGTCATCGCGACGAGATTGTCCCTGGCCGATGCTACAAACACGGAATCCAGAATAAATCAGCTGATGGTGGTGGAGTCCAAGAAGCCTTTCTTAAAAGACTGCCTCAGCGTTTGCTCAGAAGTGTACTCGGAAGCGATCGACCACTTGAACGACGCCAGCCGCAACCTGGAAGCCAGGCTCTACCGCGAGGCCGTGACATTTCTTAGCGCAGCACTCGACGCGCCTGATAACTGTGAGGATGCGTTCAGCGATGCCGGCAGTGGGGGGTTGTCGCCGCTGGCGAGGGAGGATGAGGATTATGGCAGGCTTGCTGACGTTGCTCTAACCATCACCGCGTCTCTGGGATGA -- GitLab From 267c132c025e2bc66c7224439deebda5acbe5fd3 Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 18:18:05 +0100 Subject: [PATCH 07/26] Fix default mixture models for recret & paranome - When configuration requires rec.ret. AND paranome, the default methods are 1) ELMM for paranome and 2) LMM for rec.ret. paralogs, while before LMM was by default set for both rec.ret. and paranome. - This latter option is still possible when asking for "extra" methods, namely all the available ones for the data types involved. --- ksrates/paralogs_analyses.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ksrates/paralogs_analyses.py b/ksrates/paralogs_analyses.py index 296b024..1a01622 100644 --- a/ksrates/paralogs_analyses.py +++ b/ksrates/paralogs_analyses.py @@ -31,7 +31,7 @@ def paralogs_analyses_methods(config_file, paranome_table, anchors_table, recipr # 2) ONLY COLLINEARITY - if colinearity and not paranome and not reciprocal_retention_analysis: + elif not paranome and colinearity and not reciprocal_retention_analysis: # Default: perform only anchor clustering cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) @@ -42,26 +42,26 @@ def paralogs_analyses_methods(config_file, paranome_table, anchors_table, recipr # 3) ONLY REC.RET. - if reciprocal_retention_analysis and not colinearity and not paranome: + elif not paranome and not colinearity and reciprocal_retention_analysis: # Default: perform LMM on rec.ret. paralogs lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) # There are no additional methods for rec.ret. paralogs # 4) COLLINEARITY AND REC.RET. - if colinearity and reciprocal_retention_analysis and not paranome: + elif not paranome and colinearity and reciprocal_retention_analysis: # Default: perform anchor clustering and LMM on rec.ret. paralogs cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) logging.info(f"\n") if not extra_paralogs_analyses_methods: - lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, force_anchors=False) + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, ignore_anchors=True) elif extra_paralogs_analyses_methods: # If additional methods are asked: perform LMM both on rec.ret. 
paralogs and - as extra - on anchors - lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, force_anchors=True) + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) # 5) COLLINEARITY AND PARANOME - if colinearity and not reciprocal_retention_analysis and paranome: + elif paranome and colinearity and not reciprocal_retention_analysis: # Default: perform only anchor clustering cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) @@ -73,14 +73,19 @@ def paralogs_analyses_methods(config_file, paranome_table, anchors_table, recipr lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) # 6) REC.RET. AND PARANOME - if reciprocal_retention_analysis and not colinearity and paranome: - # Default: perform LMM only on rec.ret. paralogs and on paranome, and ELMM on paranome + elif paranome and not colinearity and reciprocal_retention_analysis: + # Default: perform ELMM on paranome and LMM on rec.ret. paralogs exp_log_mixture(config_file, paranome_table, correction_table) logging.info(f"\n") - lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + if not extra_paralogs_analyses_methods: + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, ignore_paranome=True) + + elif extra_paralogs_analyses_methods: + # If additional methods are asked: perform LMM both on rec.ret. paralogs and on paranome + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) # 7) COLLINEARITY AND REC.RET. AND PARANOME - if colinearity and reciprocal_retention_analysis and paranome: + elif paranome and colinearity and reciprocal_retention_analysis: # Default: perform anchor clustering and LMM on rec.ret. 
paralogs cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) logging.info(f"\n") -- GitLab From 407ad14043e71f7fad97fe737013c7954f069a83 Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 18:24:06 +0100 Subject: [PATCH 08/26] Bugfix: parameter_table must be empty before lmm() --- ksrates/lognormal_mixture.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 8ef4691..bb8bbce 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -77,6 +77,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only Ks distribution will be plotted.") + logging.info("") correction_table = None correction_table_available = False else: @@ -118,9 +119,8 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=False, reciprocal_retention_data=True) - parameter_table = [] - if paranome_analysis: + parameter_table = [] with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_paranome.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on whole-paranome Ks distribution") paranome_list, paranome_weights = fc_extract_ks_list.ks_list_from_tsv(paralog_tsv_file, max_ks_para, "paralogs") @@ -140,9 +140,9 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if paranome_analysis and colinearity_analysis: logging.info("") - parameter_table = [] if colinearity_analysis: + parameter_table = [] with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_anchors.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on anchor pair Ks distribution") anchors_list, anchors_weights = fc_extract_ks_list.ks_list_from_tsv(anchors_ks_tsv_file, max_ks_para, "anchor pairs") @@ -163,7 +163,11 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "anchors") + if (paranome_analysis or colinearity_analysis) and reciprocal_retention_analysis: + logging.info("") + if reciprocal_retention_analysis: + parameter_table = [] with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_reciprocally_retained.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on reciprocally retained paralog Ks distribution") rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") -- GitLab From 48b9b028be9adab2d05060252940e38902ad68a3 Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 19:02:46 +0100 Subject: [PATCH 09/26] Add spacing lines between text for readability --- ksrates/cluster_anchor_ks.py | 3 ++- ksrates/exp_log_mixture.py | 1 + ksrates/extend_orthogroups.py | 8 +++++--- ksrates/fc_reciprocal_retention.py | 3 ++- ksrates/plot_paralogs.py | 1 + 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py 
index 9a6eff3..238e25f 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -41,7 +41,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, num_EM_initializations = config.get_num_EM_initializations() # how many times the fitting with N given components is initialized logging.info(f" - maximum EM iterations: {max_EM_iterations}") logging.info(f" - number of EM initializations: {num_EM_initializations}") - + logging.info("") # Getting the statistical measure for how to determine the representative value of an ortholog distribution peak_stats = config.get_peak_stats() # default is mode (other option, median) @@ -53,6 +53,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only anchor pair distribution will be plotted.") + logging.info("") correction_table_available = False else: with open(correction_table_file, "r") as f: diff --git a/ksrates/exp_log_mixture.py b/ksrates/exp_log_mixture.py index 8a48c58..e55bea0 100644 --- a/ksrates/exp_log_mixture.py +++ b/ksrates/exp_log_mixture.py @@ -77,6 +77,7 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only Ks paranome distribution will be plotted.") + logging.info("") correction_table = None correction_table_available = False else: diff --git a/ksrates/extend_orthogroups.py b/ksrates/extend_orthogroups.py index dbca999..e17aa0e 100755 --- a/ksrates/extend_orthogroups.py +++ b/ksrates/extend_orthogroups.py @@ -183,7 +183,8 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t else: # Else open the already existing diamond table file logging.info("Found already existing homology result table with gene families associated to matched sequences") dmd_with_orthogroups = read_csv(dmd_with_orthogroups_filename, sep="\t", header=None, names=column_names + ["GF"]) - logging.info("Done\n") + logging.info("Done") + logging.info("") # ----------------------------------------------------------------------------- @@ -213,11 +214,12 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl, query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, base_dir_rec_ret) - logging.info("") - if not test_run: logging.info("Done") + logging.info("") return assigned_gfs_file + + logging.info("") logging.info("-------------------------------------------------------------------------------") diff --git a/ksrates/fc_reciprocal_retention.py b/ksrates/fc_reciprocal_retention.py index 3b888bf..aa62f10 100755 --- a/ksrates/fc_reciprocal_retention.py +++ b/ksrates/fc_reciprocal_retention.py @@ -481,7 +481,8 @@ def assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_fil Assign to the query gene the orthogroup of the first (and best) hit, but only if there is a reciprocal best hit correspondence between the two genes. 
""" - logging.info(f"Temporary diamond directory: {dmd_tmp_dir}\n") + logging.info(f"Temporary diamond directory: {dmd_tmp_dir}") + logging.info("") filename = os.path.join(basefolder_output_files, f"assigned_gfs_{species}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index d2c1677..a7cf24a 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -65,6 +65,7 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": # it means that the correction_table is not present or available yet logging.warning("Rate-adjustment data are not available yet, only paralog distribution will be plotted.") + logging.info("") correction_table_available = False else: with open(correction_table_file, "r") as f: -- GitLab From 1ea3c383f6713780a8fbbe8d734e2883cf163e4b Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 19:03:49 +0100 Subject: [PATCH 10/26] Add spacing and lines for readability --- ksrates/fc_wgd.py | 7 +++++++ ksrates/wgd_paralogs.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index d6c54ac..495b7ba 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -215,6 +215,7 @@ def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflat logging.info('---') logging.info('Done') + logging.info('---') def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type="lambda", pident_threshold=40, @@ -344,6 +345,8 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type elif filter_zero_false_predictions: top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() generate_mcl_rec_ret(top_rec_ret_gf_list, assigned_rec_ret_gfs, output_mcl_path) + logging.info("Done") + logging.info("") # whole paranome Ks analysis if do_ks: @@ -387,6 +390,7 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type logging.info('---') logging.info('Done') + logging.info('---') def ks_orthologs(species1, species2, cds_fasta1, cds_fasta2, base_dir='.', eval_cutoff=1e-10, aligner='muscle', @@ -567,6 +571,7 @@ def ks_orthologs(species1, species2, cds_fasta1, cds_fasta2, base_dir='.', eval_ logging.info('---') logging.info('Done') + logging.info('---') def all_v_all_blast_1way(species1, species2, protein_seq_dict1, protein_seq_dict2, tmp_dir, output_dir, output_file, @@ -834,7 +839,9 @@ def ks_colinearity(species_name, gff_file, base_dir='.', gff_feature='mRNA', gff shutil.copy2(os.path.join(tmp_dir, "i-adhore_out", "multiplicon_pairs.txt"), subfolder) shutil.rmtree(tmp_dir) + logging.info('---') logging.info("Done") + logging.info('---') def _write_gene_lists(genome, output_dir='gene_lists'): diff --git a/ksrates/wgd_paralogs.py b/ksrates/wgd_paralogs.py index 09f4f89..7028a2f 100644 --- a/ksrates/wgd_paralogs.py +++ b/ksrates/wgd_paralogs.py @@ -67,11 +67,13 @@ def wgd_paralogs(config_file, n_threads): logging.error("Please add the missing information to the configuration file and rerun the analysis. 
Exiting.") sys.exit(1) logging.info("Completed") + logging.info("") # Creating folder for output files of wgd paralog pipeline paralog_dists_dir = os.path.join("paralog_distributions", "") if not os.path.isdir(paralog_dists_dir): logging.info(f"Creating directory [{paralog_dists_dir}]") + logging.info("") os.makedirs(paralog_dists_dir) # ----------------------------------------------------------------------------- @@ -81,23 +83,26 @@ def wgd_paralogs(config_file, n_threads): logging.info("Running wgd reciprocal retention paralog Ks pipeline...") fc_wgd.ks_paralogs(species, species_fasta_file, max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads) + logging.info(datetime.datetime.today().ctime()) + logging.info("") # ESTIMATING RECIPROCALLY RETAINED GENE FAMILIES Ks VALUES if reciprocal_retention: - logging.info('---') logging.info("Running wgd whole-paranome Ks pipeline...") fc_wgd.ks_paralogs_rec_ret(species, species_fasta_file, latin_name, top=top, rank_type=rank_type, pident_threshold=pident_threshold, min_qcov_threshold=min_qcov_threshold, filter_false_predictions=filter_false_predictions, filter_zero_false_predictions=filter_zero_false_predictions, max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads) + logging.info(datetime.datetime.today().ctime()) + logging.info("") # EXTRACTING COLLINEARITY ANCHOR PAIRS Ks VALUES if colinearity: - logging.info('---') logging.info("Running wgd collinearity Ks pipeline...") fc_wgd.ks_colinearity(species, gff, base_dir=paralog_dists_dir, gff_feature=gff_feature, gff_gene_attribute=gff_gene_attribute, n_threads=n_threads) + logging.info(datetime.datetime.today().ctime()) + logging.info("") - logging.info(datetime.datetime.today().ctime()) logging.info("Done") -- GitLab From 0536d4dfd6e33f89c9a9b93c0799eea35956355c Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 Dec 2021 19:04:48 +0100 Subject: [PATCH 11/26] Fix description in strings --- ksrates/wgd_paralogs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ksrates/wgd_paralogs.py b/ksrates/wgd_paralogs.py index 7028a2f..204ed45 100644 --- a/ksrates/wgd_paralogs.py +++ b/ksrates/wgd_paralogs.py @@ -79,8 +79,8 @@ def wgd_paralogs(config_file, n_threads): # ----------------------------------------------------------------------------- # ESTIMATING PARANOME Ks VALUES - if paranome or colinearity: # prerequisite for colinearity pipeline, but not for reciprocally retention pipeline - logging.info("Running wgd reciprocal retention paralog Ks pipeline...") + if paranome or colinearity: # prerequisite for colinearity pipeline, but not for reciprocally retention pipeline + logging.info("Running wgd whole-paranome Ks pipeline...") fc_wgd.ks_paralogs(species, species_fasta_file, max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads) logging.info(datetime.datetime.today().ctime()) @@ -88,7 +88,7 @@ def wgd_paralogs(config_file, n_threads): # ESTIMATING RECIPROCALLY RETAINED GENE FAMILIES Ks VALUES if reciprocal_retention: - logging.info("Running wgd whole-paranome Ks pipeline...") + logging.info("Running wgd reciprocal retention paralog Ks pipeline...") fc_wgd.ks_paralogs_rec_ret(species, species_fasta_file, latin_name, top=top, rank_type=rank_type, pident_threshold=pident_threshold, min_qcov_threshold=min_qcov_threshold, filter_false_predictions=filter_false_predictions, -- GitLab From 242ef0dbffd343e9c135d9d3859c88d8289a4ffc Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 8 
Dec 2021 19:10:17 +0100 Subject: [PATCH 12/26] Add "all done" message --- ksrates/wgd_paralogs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ksrates/wgd_paralogs.py b/ksrates/wgd_paralogs.py index 204ed45..bc553d3 100644 --- a/ksrates/wgd_paralogs.py +++ b/ksrates/wgd_paralogs.py @@ -105,4 +105,4 @@ def wgd_paralogs(config_file, n_threads): logging.info(datetime.datetime.today().ctime()) logging.info("") - logging.info("Done") + logging.info("All done") -- GitLab From 74fed34d345f32165e65e07782b7fcf56e7f7025 Mon Sep 17 00:00:00 2001 From: cesen Date: Fri, 10 Dec 2021 11:39:13 +0100 Subject: [PATCH 13/26] Fix: add "if" conditions --- ksrates/plot_paralogs.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index a7cf24a..42661e2 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -257,21 +257,25 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an # Setting plot height based on tallest histogram among the paralog distribution(s) if y_lim is None: # Set the height based on paranome distribution when this is for sure the tallest - for ax_uncorr, ax_corr in zip(ax_uncorr_include_para, ax_corr_include_para): - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome) + if paranome_analysis: + for ax_uncorr, ax_corr in zip(ax_uncorr_include_para, ax_corr_include_para): + fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome) + fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome) # Set height based on anchor pair distributions if that's the only one in the plot - fcPlot.set_mixed_plot_height(ax_uncorr_col, y_lim, hist_anchors) - fcPlot.set_mixed_plot_height(ax_corr_col, y_lim, hist_anchors) + if colinearity_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr_col, y_lim, hist_anchors) + fcPlot.set_mixed_plot_height(ax_corr_col, y_lim, hist_anchors) # Set height based on reciprocally retained paralog distribution if that's the only one in the plot - fcPlot.set_mixed_plot_height(ax_uncorr_rr, y_lim, hist_rec_ret) - fcPlot.set_mixed_plot_height(ax_corr_rr, y_lim, hist_rec_ret) + if reciprocal_retention_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr_rr, y_lim, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr_rr, y_lim, hist_rec_ret) # Set height to tallest distribution between anchor pair and reciprocally retained paralogs - fcPlot.set_mixed_plot_height(ax_uncorr_col_rr, y_lim, hist_anchors, hist_rec_ret) - fcPlot.set_mixed_plot_height(ax_corr_col_rr, y_lim, hist_anchors, hist_rec_ret) + if colinearity_analysis and reciprocal_retention_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr_col_rr, y_lim, hist_anchors, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr_col_rr, y_lim, hist_anchors, hist_rec_ret) # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution -- GitLab From 83a46b7b62d4525dcdcddb495b63e24bf9ab7f60 Mon Sep 17 00:00:00 2001 From: cesen Date: Fri, 10 Dec 2021 12:16:42 +0100 Subject: [PATCH 14/26] Minor --- ksrates/exp_log_mixture.py | 3 ++- ksrates/fc_exp_log_mixture.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ksrates/exp_log_mixture.py b/ksrates/exp_log_mixture.py index e55bea0..b8c24f5 100644 --- a/ksrates/exp_log_mixture.py +++ b/ksrates/exp_log_mixture.py @@ -137,7 +137,8 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): 
reduced_gaussians_flag=reduced_gaussians, EM_data=True) all_models_fitted_parameters[model_id] = [means_peaks, stdevs_peaks, lambd_peaks, weights_peaks] bic_dict[model_id] = bic_peaks - fcEM.plot_fitted_comp(ax_peaks_ks, ax_peaks_logks, means_peaks, stdevs_peaks, lambd_peaks, weights_peaks, x_max_lim, peak_stats, correction_table_available, plot_correction_arrows) + fcEM.plot_fitted_comp(ax_peaks_ks, ax_peaks_logks, means_peaks, stdevs_peaks, lambd_peaks, + weights_peaks, x_max_lim, peak_stats, correction_table_available, plot_correction_arrows) # ----------------------------------------------------------------- diff --git a/ksrates/fc_exp_log_mixture.py b/ksrates/fc_exp_log_mixture.py index 0d11b00..1b751e5 100644 --- a/ksrates/fc_exp_log_mixture.py +++ b/ksrates/fc_exp_log_mixture.py @@ -803,7 +803,9 @@ def plot_init_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, plot_logtranf ax_logks.plot(x_points, weights[comp+1] * norm.pdf(x_points, means[comp], stdevs[comp]), style, lw=1.5, alpha=0.4) -def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, scaling=1, plot_peak_markers=False, plot_logtranformed=True): +def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_lim, + peak_stats, correction_table_available, plot_correction_arrows, scaling=1, + plot_peak_markers=False, plot_logtranformed=True): """ Plots the mixture components with fitted parameters plus their overall PDF curve. On the left axis it plots the lognormal and the exponential components, @@ -1104,7 +1106,9 @@ def plot_best_model(fig_best_model, ax_best_model, species, ks_data, ks_weights, final_means, final_stdevs, final_lambd, final_weights = all_models_fitted_parameters[best_model_id] scaling = bin_width * len(deconvoluted_data[deconvoluted_data <= max_ks_EM]) - plot_fitted_comp(ax_best_model, None, final_means, final_stdevs, final_lambd, final_weights, max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, scaling=scaling, plot_peak_markers=True, plot_logtranformed=False) + plot_fitted_comp(ax_best_model, None, final_means, final_stdevs, final_lambd, final_weights, + max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, + scaling=scaling, plot_peak_markers=True, plot_logtranformed=False) if correction_table_available: legend_size = fcPlot.define_legend_size(ax_best_model) -- GitLab From 3d5deea9d29cead3ea25fd7cf594752e990c0ef3 Mon Sep 17 00:00:00 2001 From: cesen Date: Mon, 13 Dec 2021 11:03:39 +0100 Subject: [PATCH 15/26] Add automatic interpretaion to adjusted plots - New text block after the completion of any method used to infer putative WGD peaks (i.e. cluster anchors, ELMM or LMM) in the focal species Ks plot; the text states which other species in the plot share the given WGD signatures with the focal sp. - Species are assumed to share the peak if the ortholog Ks mode with the focal species is smaller than the WGD peak Ks (i.e. they diverged chronologically after the WGD). - NOTE: since inferring peaks through mixture models is known to have overfitting problems (i.e. too many components/peaks), it is recommended to manually check the plot before drawing any conclusion. 
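To make the sharing criterion above concrete, the rule can be sketched with hypothetical Ks values: a divergent species is counted as sharing a (putative) WGD peak when its rate-adjusted ortholog Ks mode is smaller than the peak's Ks coordinate. The function actually added in this patch, interpretation_adjusted_plot(), additionally selects the adjusted modes according to the consensus-peak strategy and formats the logging output; the snippet below only illustrates the comparison itself.

# Minimal sketch of the sharing rule (hypothetical peak and mode values).
wgd_peaks = {"a": 0.3, "b": 0.9}                 # peak letter -> Ks coordinate of the inferred WGD
ortholog_modes = {"Oryza sativa": 0.75,          # species -> rate-adjusted ortholog Ks mode
                  "Asparagus officinalis": 1.2}

for letter, peak_ks in sorted(wgd_peaks.items()):
    sharing = [sp for sp, mode in ortholog_modes.items() if mode < peak_ks]
    if not sharing:
        print(f'WGD signature "{letter}" is specific to the focal species')
    else:
        print(f'WGD signature "{letter}" is shared with: {", ".join(sharing)}')

With these values, peak "a" predates no speciation event and is reported as focal-species specific, while peak "b" is shared with Oryza sativa only; as the commit note warns, such output should still be checked against the plot because mixture models tend to overfit.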
--- ksrates/cluster_anchor_ks.py | 22 +++++++--- ksrates/exp_log_mixture.py | 12 ++++-- ksrates/fc_cluster_anchors.py | 18 ++++++-- ksrates/fc_exp_log_mixture.py | 41 +++++++++--------- ksrates/fc_lognormal_mixture.py | 20 ++++++--- ksrates/fc_rrt_correction.py | 76 +++++++++++++++++++++++++++++++++ ksrates/lognormal_mixture.py | 27 +++++++++--- 7 files changed, 171 insertions(+), 45 deletions(-) diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index 238e25f..5e89271 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -14,7 +14,7 @@ import ksrates.fc_check_input as fcCheck import ksrates.fc_extract_ks_list as fc_extract_ks_list from ksrates.utils import init_logging from ksrates.fc_cluster_anchors import subfolder -from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE, interpretation_adjusted_plot def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, path_multiplicons_txt, path_segments_txt, path_list_elements_txt, path_ks_anchor_file, path_multiplicon_pair_txt): config = fcConf.Configuration(config_file) @@ -23,7 +23,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, colinearity_analysis = config.get_colinearity() if not colinearity_analysis: - logging.warning("Colinearity non required in configuration file; the anchor Ks clustering will be skipped.") + logging.warning("Collinearity non required in configuration file; the anchor Ks clustering will be skipped.") sys.exit(0) # exit code 0 because no actual errors were thrown species = config.get_species() @@ -301,7 +301,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - clusters_sorted_by_median, cluster_color_letter_list = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) + clusters_sorted_by_median, cluster_color_letter_list, letter_to_x_coord_dict = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) fcCluster.plot_clusters(ax_uncorr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) # Plotting the ortholog peaks coming from the adjustment, if available @@ -324,6 +324,10 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, if updated_n_clusters == n_clusters: logging.info("All clusters were retained") + if correction_table_available: + interpretation_adjusted_plot(latin_names[species], consensus_peak_for_multiple_outgroups, + letter_to_x_coord_dict, correction_table) + # Saving the first plot with the final name (leaving away "unfiltered") logging.info(f"Saving mixed Ks plot with anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "second") @@ -349,7 +353,6 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, clean_medians_list = array(clean_medians_list) # Clustering with GaussianMixtureModel, k-means or lognormal mixture modeling - logging.info("") 
logging.info(f"Performing a second Ks clustering round with {updated_n_clusters} clusters through {clustering_method} for the remaining Ks values") if clustering_method == "Gaussian mixture modeling": gmm_clustered_medians2 = fcCluster.gmm(clean_medians_list, updated_n_clusters, max_EM_iterations, num_EM_initializations) @@ -374,14 +377,21 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_second, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - fcCluster.plot_clusters(ax_corr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) - fcCluster.plot_clusters(ax_uncorr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) + __, __, letter_to_x_coord_dict = fcCluster.plot_clusters(ax_corr_second, + filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, + correction_table_available, plot_correction_arrows) + fcCluster.plot_clusters(ax_uncorr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, + correction_table_available, plot_correction_arrows) # Plotting the ortholog peaks coming from the adjustment, if available if correction_table_available: logging.info("Plotting divergence lines") fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr_second, ax_corr_second, color_list, plot_correction_arrows) + + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latin_names[species], consensus_peak_for_multiple_outgroups, + letter_to_x_coord_dict, correction_table) logging.info(f"Saving mixed Ks plot with filtered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") logging.info("") diff --git a/ksrates/exp_log_mixture.py b/ksrates/exp_log_mixture.py index b8c24f5..18bdbe0 100644 --- a/ksrates/exp_log_mixture.py +++ b/ksrates/exp_log_mixture.py @@ -12,7 +12,7 @@ import ksrates.fc_extract_ks_list as fc_extract_ks_list from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM import ksrates.fc_exp_log_mixture as fcEM from ksrates.fc_cluster_anchors import subfolder -from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE, interpretation_adjusted_plot def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): @@ -78,8 +78,8 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only Ks paranome distribution will be plotted.") logging.info("") - correction_table = None correction_table_available = False + correction_table = None else: with open(correction_table_file, "r") as f: correction_table = read_csv(f, sep="\t") @@ -250,10 +250,16 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): logging.info("Models are evaluated according to their BIC score.") # Get best model by lowest BIC score and plot it; print comparison with the other models best_model_id = fcEM.eval_best_model(bic_dict, outfile) - fcEM.plot_best_model(fig_best_model, ax_best_ks, species, ks_data, ks_weights, bin_list, bin_width, x_max_lim, + letter_to_peak_dict = fcEM.plot_best_model(fig_best_model, ax_best_ks, species, ks_data, ks_weights, bin_list, bin_width, 
x_max_lim, y_max_lim, best_model_id, all_models_init_parameters, all_models_fitted_parameters, correction_table, correction_table_available, consensus_peak_for_multiple_outgroups, peak_stats, color_list, plot_correction_arrows, deconvoluted_data, max_ks_EM) + + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict, correction_table) + logging.info(f"Saving PDF figure of best mixture model [mixed_{species}_elmm.pdf]") logging.info("") logging.info("All done") \ No newline at end of file diff --git a/ksrates/fc_cluster_anchors.py b/ksrates/fc_cluster_anchors.py index 1600bb7..150a739 100644 --- a/ksrates/fc_cluster_anchors.py +++ b/ksrates/fc_cluster_anchors.py @@ -335,7 +335,8 @@ def get_clusters_of_ks(labels_of_clustered_medians, all_medians_list, all_segmen return cluster_of_ks, medians_per_cluster, segments_medians_per_cluster, cluster_color_letter_list -def plot_clusters_of_medians(medians_per_cluster, cluster_color_letter_list, x_axis_max_limit_mixed_plot, bin_list, species, latin_name, output): +def plot_clusters_of_medians(medians_per_cluster, cluster_color_letter_list, x_axis_max_limit_mixed_plot, + bin_list, species, latin_name, output): """ Generates a figure showing the clusters of segment pair medians. @@ -389,7 +390,8 @@ def assign_cluster_colors(cluster_of_ks): return clusters_sorted_by_median, cluster_color_letter_list -def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows): +def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, + correction_table_available, plot_correction_arrows): """ Plots the anchor Ks clusters, their KDE lines a marker to show their median and . @@ -402,9 +404,12 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre :param plot_correction_arrows: boolean tag to state whether to plot the adjustment arrows at the bottom of the adjusted plot (default: False) :return clusters_sorted_by_median: list of cluster IDs sorted according to their age :return cluster_color_letter_list: dictionary assigning to each cluster the right color and letter according to age + :return letter_to_x_coord_dict: dictionary associating cluster names (letters) to their peak x-coord (e.g. mode) """ clusters_sorted_by_median, cluster_color_letter_list = assign_cluster_colors(cluster_of_ks) + letter_to_x_coord_dict = {} + zorder_ID = 50 for cluster_id in clusters_sorted_by_median: @@ -430,11 +435,14 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre axis.plot(kde_x, kde_y, color=cluster_color_letter_list[cluster_id][0], zorder=zorder_ID + 1) # plot a marker to highlight the median of each cluster - plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, cluster_color_letter_list[cluster_id][0], + x_coord_cluster_peak = plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, cluster_color_letter_list[cluster_id][0], cluster_color_letter_list[cluster_id][1], zorder_ID + 2, peak_stats, correction_table_available, plot_correction_arrows) + + # Assign to the cluster letter its peak (e.g. 
mode) x-coordinate + letter_to_x_coord_dict[cluster_color_letter_list[cluster_id][1]] = x_coord_cluster_peak zorder_ID += 10 - return clusters_sorted_by_median, cluster_color_letter_list + return clusters_sorted_by_median, cluster_color_letter_list, letter_to_x_coord_dict def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter, zorder_ID, peak_stats, correction_table_available, plot_correction_arrows): @@ -456,6 +464,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter :param peak_stats: states whether the cluster peak is intended as median or mode :param correction_table_available: boolean tag to state if the adjustment table data are available or not :param plot_correction_arrows: boolean tag to state whether to plot the adjustment arrows at the bottom of the adjusted plot (default: False) + :return: the x-coordinate of the mode / median of the current anchor cluster """ if peak_stats == "median": # Get the point of the KDE line that is as closest as possible to the median of the raw data @@ -496,6 +505,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter axis.text(x=x_value, y=y_value + (y_height * marker_y_height_fraction), s=letter, fontsize=10, horizontalalignment='center', verticalalignment='center', clip_on=True, zorder=zorder_ID + 5, color='w') + return x_value def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list): """ diff --git a/ksrates/fc_exp_log_mixture.py b/ksrates/fc_exp_log_mixture.py index 1b751e5..12f6f51 100644 --- a/ksrates/fc_exp_log_mixture.py +++ b/ksrates/fc_exp_log_mixture.py @@ -839,7 +839,8 @@ def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_ linestyle = ":" alpha = 1 - ax_ks.plot(x_points_strictly_positive, scaling * weights[0] * expon.pdf(x_points_strictly_positive, scale=1/lambd), c=color, ls=linestyle, lw=1.5, alpha=alpha, label='Exponential') + ax_ks.plot(x_points_strictly_positive, scaling * weights[0] * expon.pdf(x_points_strictly_positive, + scale=1/lambd), c=color, ls=linestyle, lw=1.5, alpha=alpha, label='Exponential') # Getting the representative value of the lognormal peak and sort lognormals accordingly lognormal_peaks = {} # key: lognormal component ID, value: peak @@ -851,17 +852,25 @@ def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_ lognormal_peaks[comp] = peak_coord lognormals_sorted_by_peak = sorted(lognormal_peaks, key=lognormal_peaks.get) # sort by values (peaks) letter_dict = dict(zip(lognormals_sorted_by_peak, [ "a", "b", "c", "d", "e", "f", "g"][:len(lognormals_sorted_by_peak)])) # assign progressive letter + letter_to_peak_dict = {} colors = ["b-", "r-", "c-", "m-", "k-"][:len(means)-1] + ["y-"] for comp, style in zip(lognormals_sorted_by_peak, colors): + # Letter associated to its peak x-coordinate: key a, value float + letter_to_peak_dict[letter_dict[comp]] = lognormal_peaks[comp] + color = style[0] linestyle = style[1:] if plot_peak_markers: # True only for the final plot color = "dimgrey" linestyle = "--" - plot_marker(ax_ks, lognormal_peaks[comp], scaling * weights[comp+1] * lognorm.pdf(lognormal_peaks[comp], scale=exp(means[comp]), s=stdevs[comp]), "k", letter_dict[comp], correction_table_available, plot_correction_arrows) - ax_ks.plot(x_points_strictly_positive, scaling * weights[comp+1] * lognorm.pdf(x_points_strictly_positive, scale=exp(means[comp]), s=stdevs[comp]), c=color, ls=linestyle, lw=1.5, alpha=alpha, label=f'Lognormal 
{letter_dict[comp]} ({peak_stats} {lognormal_peaks[comp]})') + plot_marker(ax_ks, lognormal_peaks[comp], scaling * weights[comp+1] * lognorm.pdf(lognormal_peaks[comp], + scale=exp(means[comp]), s=stdevs[comp]), "k", letter_dict[comp], correction_table_available, + plot_correction_arrows) + ax_ks.plot(x_points_strictly_positive, scaling * weights[comp+1] * lognorm.pdf(x_points_strictly_positive, + scale=exp(means[comp]), s=stdevs[comp]), c=color, ls=linestyle, lw=1.5, alpha=alpha, + label=f'Lognormal {letter_dict[comp]} ({peak_stats} {lognormal_peaks[comp]})') if plot_logtranformed: ax_logks.plot(x_points, scaling * weights[comp+1] * norm.pdf(x_points, means[comp], stdevs[comp]), c=color, ls=linestyle, lw=1.5, alpha=alpha, label=f'Norm {letter_dict[comp]}') @@ -889,6 +898,7 @@ def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_ ax_logks.legend(handles_tuple_list, ax_logks.get_legend_handles_labels()[1], handler_map={tuple: HandlerTuple(ndivide=None)}, handlelength=3, loc="upper left") + return letter_to_peak_dict def make_tuple_handles(axis): """ @@ -1099,16 +1109,18 @@ def plot_best_model(fig_best_model, ax_best_model, species, ks_data, ks_weights, # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution if correction_table_available: dummy_fig, dummy_axis = plt.subplots() - fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, dummy_axis, ax_best_model, color_list, plot_correction_arrows) + fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, + dummy_axis, ax_best_model, color_list, plot_correction_arrows) init_means, init_stdevs, init_lambd, init_weights = all_models_init_parameters[best_model_id] final_means, final_stdevs, final_lambd, final_weights = all_models_fitted_parameters[best_model_id] scaling = bin_width * len(deconvoluted_data[deconvoluted_data <= max_ks_EM]) - plot_fitted_comp(ax_best_model, None, final_means, final_stdevs, final_lambd, final_weights, - max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, - scaling=scaling, plot_peak_markers=True, plot_logtranformed=False) + letter_to_peak_dict = plot_fitted_comp(ax_best_model, None, final_means, final_stdevs, final_lambd, final_weights, + max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, + scaling=scaling, plot_peak_markers=True, plot_logtranformed=False) + logging.info("Done") if correction_table_available: legend_size = fcPlot.define_legend_size(ax_best_model) @@ -1127,17 +1139,4 @@ def plot_best_model(fig_best_model, ax_best_model, species, ks_data, ks_weights, fig_best_model.savefig(os.path.join("rate_adjustment", f"{species}", f"mixed_{species}_elmm.pdf"), transparent=True, format="pdf") - - # # TEMPORARY FOR A FIGURE PLOT WITH DENSITY FOR COMPARISON AFTER SCALING - # fig, ax = generate_best_model_figure("elaeis", "Elaeis guineensis", 3, None, True, True) - # fcPlot.plot_histogram("Whole-paranome", ax, ks_data, bin_list, None, None, None, ks_weights, plot_kde=False, density=True) - # if correction_table_available: - # fcPlot.plot_divergences(correction_table, consensus_peak_for_multiple_outgroups, dummy_axis, ax, color_list, plot_correction_arrows) - # plot_fitted_comp(ax, None, final_means, final_stdevs, final_lambd, final_weights, peak_stats, scaling=1, plot_peak_markers=True, plot_logtranformed=False) - - # legend_size = fcPlot.define_legend_size(ax) - # chart_box = ax.get_position() - # ax.set_position([chart_box.x0, chart_box.y0, 
chart_box.width*0.65, chart_box.height]) - # lgd = create_legend_mixture_model(ax, legend_size, len(init_means)+2) # number of plotted lines is: exp + lognormals + total PDF - # fig.savefig(os.path.join("rate_adjustment", f"{species}", f"mixed_density_elmm.pdf"), - # bbox_extra_artists=(lgd, fig._suptitle), bbox_inches="tight", transparent=True, format="pdf") + return letter_to_peak_dict \ No newline at end of file diff --git a/ksrates/fc_lognormal_mixture.py b/ksrates/fc_lognormal_mixture.py index 68b51ef..e7bd9bd 100644 --- a/ksrates/fc_lognormal_mixture.py +++ b/ksrates/fc_lognormal_mixture.py @@ -166,8 +166,9 @@ def fit_gmm(X, n1, n2, outfile, parameter_table, max_iter=300, n_init=1, **kwarg return models, bic, aic, best -def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak_stats, correction_table_available, plot_correction_arrows, l=0, u=5, color='black', alpha=0.2, - log=False, bins=25): +def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak_stats, + correction_table_available, plot_correction_arrows, l=0, u=5, + color='black', alpha=0.2, log=False, bins=25): """ Plot a mixture model. Assumes a log-transformed model and data and will back-transform. @@ -222,8 +223,14 @@ def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak lognormals_sorted_by_peak = sorted(lognormal_peaks, key=lognormal_peaks.get) # sort by values (peaks) letter_dict = dict(zip(lognormals_sorted_by_peak, [ "a", "b", "c", "d", "e", "f", "g"][:len(lognormals_sorted_by_peak)])) # assign progressive letter + letter_to_peak_dict = {} + for comp in lognormals_sorted_by_peak: - fcEM.plot_marker(ax, lognormal_peaks[comp], scaling * weights[comp] * ss.lognorm.pdf(lognormal_peaks[comp], scale=exp(means[comp][0]), s=sqrt(varcs[comp][0][0])), "k", letter_dict[comp], correction_table_available, plot_correction_arrows) + fcEM.plot_marker(ax, lognormal_peaks[comp], scaling * weights[comp] * ss.lognorm.pdf(lognormal_peaks[comp], + scale=exp(means[comp][0]), s=sqrt(varcs[comp][0][0])), "k", letter_dict[comp], + correction_table_available, plot_correction_arrows) + # Fill in the dictionary with all peak names and their x-coordinate + letter_to_peak_dict[letter_dict[comp]] = lognormal_peaks[comp] for comp in lognormals_sorted_by_peak: curve = scaling * weights[comp] * ss.lognorm.pdf(x, scale=exp(means[comp]), s=sqrt(varcs[comp])) @@ -235,7 +242,7 @@ def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak else: total_pdf += curve ax.plot(x, total_pdf, '-k', lw=1.5, label="Lognormal mixture model") - return ax + return ax, letter_to_peak_dict def lmm( @@ -291,9 +298,10 @@ def lmm( # Plotting the components of the best model on the final picture; # Components are scaled up to the size of actual count data and not to density histogram scaling = bin_width_para * len(deconvoluted_data) - plot_mixture_model(best, deconvoluted_data, max_x_axis_lim, axis, bin_width_para, scaling, peak_stats, + ax, letter_to_peak_dict = plot_mixture_model(best, deconvoluted_data, max_x_axis_lim, axis, bin_width_para, scaling, peak_stats, correction_table_available, plot_correction_arrows, ks_range[0], ks_range[1], bins=bins) - return best + + return best, letter_to_peak_dict def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, datatype): diff --git a/ksrates/fc_rrt_correction.py b/ksrates/fc_rrt_correction.py index 6d47b4b..88877ae 100644 --- a/ksrates/fc_rrt_correction.py +++ b/ksrates/fc_rrt_correction.py @@ -1,4 +1,5 @@ from math 
import sqrt +import logging # Filenames _ADJUSTMENT_TABLE_ALL = "adjustment_table_{}_all.tsv" @@ -63,3 +64,78 @@ def compute_corrected_ks_species_sister(rel_rate_species, rel_rate_species_sd): corrected_peak_sd = sqrt(2) * rel_rate_species_sd return corrected_peak, corrected_peak_sd + + +def interpretation_adjusted_plot(focal_species_latin, consensus_peak_for_multiple_outgroups, + wgd_peaks, correction_table): + """ + Prints an automatic interpretation of the rate-adjusted mixed Ks plots in which for each + inferred (putative) WGD peak it is said which species share it with the focal species. + This is done by comparing the x-coordinate of WGD peaks coming from mixture + modeling analyses and the rate-adjusted ortholog modes. + The WGD x-coordinate (its Ks age) is either coming from a component when the method is + exponential-lognormal mixture modeling and lognormal mixture modeling, or from an + anchor cluster when the method is anchor clustering. + + :param focal_species_latin: latin name of focal species + :param consensus_peak_for_multiple_outgroups: choice of how to deal with multiple adjustments + for the same divergence (expected values: either "mean among outgroups" or "best outgroup") + :param wgd_peaks: dictionary associating to a wgd letter the coordinate of its peak (either component or anchor cluster) + :param correction_table: adjustment results in DataFrame format (contains both possible types of consensus strategy for how to deal with multiple outgroups) + """ + if consensus_peak_for_multiple_outgroups == "mean among outgroups": + # Then consider the columns in the correction_table that are generated using the average method + column_header_peak = "Adjusted_Mode_Mean" + elif consensus_peak_for_multiple_outgroups == "best outgroup": + # Then consider the columns in the correction_table that are generated using the best outgroup method + column_header_peak = "Adjusted_Mode_Best" + + species_interpretation = {} + shared_wgd_interpretation = {} + + # Generate dictionary associating each divergent species to its mode + ortholog_modes = {} + for diverging_species, mode in zip(correction_table["Sister_Species"], correction_table[column_header_peak]): + ortholog_modes[diverging_species] = mode + # Also initialize a dictionary for the final interpretation logging output + species_interpretation[diverging_species] = [] + + logging.info("") + logging.info(f"Automatic interpretation of rate-adjusted mixed Ks plot:") + logging.info("Please revise it manually due to known overfitting of mixture modeling methods!") + # Figuring out which species share which (putative) WGDs + if len(wgd_peaks) == 0: + logging.info(" - There are no inferred (putative) WGD peaks.") + return + + for peak in wgd_peaks: # For each wgd peak + shared_wgd_interpretation[peak] = [] + for diverging_species in ortholog_modes: # For each divergent species + # What about when it's "=="? 
+ if ortholog_modes[diverging_species] < wgd_peaks[peak]: + species_interpretation[diverging_species].append(peak) + shared_wgd_interpretation[peak].append(diverging_species) + + # For every peak, say which species share it with the focal species + for peak_letter in wgd_peaks: + # If wgd peak is not shared with any other species + if len(shared_wgd_interpretation[peak_letter]) == 0: + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is ' + + f'specific to focal species {focal_species_latin}') + continue + # If wgd peak is shared with all of the other species in the plots + elif len(shared_wgd_interpretation[peak_letter]) == len(ortholog_modes): + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is shared ' + + f"with all involved species in the provided phylogeny") + continue + # If wgd peak is shared with only one other species + elif len(shared_wgd_interpretation[peak_letter]) == 1: + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is shared with {shared_wgd_interpretation[peak_letter][0]}') + else: # If wgd peak is shared with some other species + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is shared ' + + f"with the following species:") + for species in shared_wgd_interpretation[peak_letter]: + logging.info(f" {species}") + + + logging.info("") diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index bb8bbce..7e16c03 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -12,7 +12,7 @@ import ksrates.fc_extract_ks_list as fc_extract_ks_list import ksrates.fc_lognormal_mixture as fcLMM from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM, COLOR_REC_RET_HISTOGRAM from ksrates.fc_cluster_anchors import subfolder -from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE, interpretation_adjusted_plot def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_ret_tsv_file, correction_table_file, ignore_paranome=False, ignore_anchors=False, ignore_reciprocally_retained=False): @@ -78,7 +78,6 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only Ks distribution will be plotted.") logging.info("") - correction_table = None correction_table_available = False else: with open(correction_table_file, "r") as f: @@ -130,15 +129,22 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if y_lim is None: fcPlot.set_mixed_plot_height(axis_para, y_lim, hist_paranome) - best_model_paranome = fcLMM.lmm( + best_model_paranome, letter_to_peak_dict_para = fcLMM.lmm( fig_para, x_max_lim, "paralogs", paralog_tsv_file, species, axis_para, (0, max_ks_EM), (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, output_dir, outfile, parameter_table, "paranome", peak_stats, correction_table_available, plot_correction_arrows) # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "paranome") + + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict_para, correction_table) + if paranome_analysis and colinearity_analysis: + 
logging.info("Done") logging.info("") if colinearity_analysis: @@ -155,7 +161,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if y_lim is None: fcPlot.set_mixed_plot_height(axis_colin, y_lim, hist_anchors) - best_model_anchors = fcLMM.lmm( + best_model_anchors, letter_to_peak_dict_anchors = fcLMM.lmm( fig_colin, x_max_lim, "anchor pairs", anchors_ks_tsv_file, species, axis_colin, (0, max_ks_EM), (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, output_dir, outfile, parameter_table, "anchors", peak_stats, correction_table_available, plot_correction_arrows) @@ -163,7 +169,13 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "anchors") + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict_anchors, correction_table) + if (paranome_analysis or colinearity_analysis) and reciprocal_retention_analysis: + logging.info("Done") logging.info("") if reciprocal_retention_analysis: @@ -177,7 +189,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if y_lim is None: fcPlot.set_mixed_plot_height(axis_rec_ret, y_lim, hist_rec_ret) - best_model_rec_ret = fcLMM.lmm( + best_model_rec_ret, letter_to_peak_dict_recret = fcLMM.lmm( fig_rec_ret, x_max_lim, "reciprocally retained", rec_ret_tsv_file, species, axis_rec_ret, (0, max_ks_EM), (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, output_dir, outfile, parameter_table, "reciprocally_retained", peak_stats, correction_table_available, plot_correction_arrows) @@ -185,6 +197,11 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "reciprocally_retained") + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict_recret, correction_table) + logging.info("Done") for axis in [axis_para, axis_colin, axis_rec_ret]: # PLOTTING THE ORTHOLOG DIVERGENCE LINES -- GitLab From b19df4fff55e9433140606831efe28af7bcf60e3 Mon Sep 17 00:00:00 2001 From: cesen Date: Mon, 13 Dec 2021 11:14:02 +0100 Subject: [PATCH 16/26] Minor (logging) --- ksrates/plot_paralogs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index 42661e2..72f3967 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -65,7 +65,6 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": # it means that the correction_table is not present or available yet logging.warning("Rate-adjustment data are not available yet, only paralog distribution will be plotted.") - logging.info("") correction_table_available = False else: with 
open(correction_table_file, "r") as f: -- GitLab From 0b4be035aaa093c3a6f4285c689c6571a769c3b5 Mon Sep 17 00:00:00 2001 From: cesen Date: Thu, 16 Dec 2021 11:05:16 +0100 Subject: [PATCH 17/26] Tolerate warning by I-ADHoRe instead of exit - The integration of I-ADHoRe in ksrates was interrupting and exit the pipeline when i-ADHoRe would return an error message; such error output however sometimes is just a warning that shoulnd't make the pipeline stop. Thus, if the error message begins with "Warning", ksrates will now tolerate it and proceed. - This issue arose in a few species from an error with message "Warning: Dubious indirect gene relationship between..." that is coded in AlignmentDrawer.cpp line 155. When running directly I-ADHoRe (i.e. not through ksrates) this message doesn't exit, it just gives that warning. --- ksrates/fc_wgd.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index 495b7ba..052ba47 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -976,8 +976,15 @@ def _run_iadhore(config_file): if err: # i-ADHoRe can fail without a proper exit/return code, but just with an error on stderr logging.error("i-ADHoRe execution failed with standard error output:\n" + err) - logging.error("Exiting.") - sys.exit(1) + + # Some i-ADHoRe error output is an actual fatal error and the pipeline must be interrupted, + # but there is at least one i-ADHoRe message that is passed as error output ("cerr" in source code) + # that is only a warning (it starts with "Warning:"): this kind of cerr is not supposed + # to interrupt ksrates pipeline! To take this into account, the following if-clause allows any + # i-ADHoRe error output message starting with "Warning" to be tolerated. + if not err.lower().startswith("warning"): + logging.error("Exiting.") + sys.exit(1) except subprocess.CalledProcessError as e: logging.error(f"i-ADHoRe execution failed with return code: {e.returncode}") if e.stderr: -- GitLab From 6a9244b8b037f6f0e42199f3ebabb2aedddcf56b Mon Sep 17 00:00:00 2001 From: cesen Date: Mon, 20 Dec 2021 15:58:18 +0100 Subject: [PATCH 18/26] Rename function; fix output path In fc_reciprocal_retention.py: - There were two functions (dna_to_amin()) with identical name; got two different names - One of the two functions was called too soon; got moved later in the code In extend_orthogroups.py: - Because of the changes above, removed one useless argument of assign() function - Fix: output path for orthogroup database has been set to paralog_distrib/rec_ret instead of being generated in the rec_ret folder of the focal species --- ksrates/extend_orthogroups.py | 7 ++----- ksrates/fc_reciprocal_retention.py | 11 +++++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ksrates/extend_orthogroups.py b/ksrates/extend_orthogroups.py index e17aa0e..0631672 100755 --- a/ksrates/extend_orthogroups.py +++ b/ksrates/extend_orthogroups.py @@ -83,9 +83,6 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t # INPUT DATABASES AND GENE FAMILY FILES - # Translate it into the aminoacid alphabet - species_sequences_filename = fc_rec_ret.dna_to_aminoacid(fasta_file, output_folder=base_dir_rec_ret) - if test_run: # Nucleotidic and aminoacidi databases of species used to make the orthogroups (i.e. the remaining 35 species) # Note that this database contains only the top e.g. 
2000 orthogroup from the complete list of 9178 @@ -161,7 +158,7 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t # Matching the test species to the top 2000 orthogroups through diamond logging.info(f"Running homology search between {latin_name} and the gene family database. Will take some time...") with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: - orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=base_dir_rec_ret) + orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=basefolder_db_and_files) dmd = fc_rec_ret.run_diamond(fasta_file, orthogroup_database_amin, dmd_test_species_to_top_orthogroup, dmd_tmp_dir, pident_threshold, column_names=column_names, n_threads=n_threads) else: @@ -211,7 +208,7 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t query_seqs = dmd_with_orthogroups.loc[:, "qseqid"].drop_duplicates() with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: - assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl, + assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_nucl, query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, base_dir_rec_ret) if not test_run: diff --git a/ksrates/fc_reciprocal_retention.py b/ksrates/fc_reciprocal_retention.py index aa62f10..c7bb39c 100755 --- a/ksrates/fc_reciprocal_retention.py +++ b/ksrates/fc_reciprocal_retention.py @@ -147,7 +147,7 @@ def flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database) logging.info(f"- {orthorgoups_database} already exists") -def dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid): +def database_dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid): """ Translates the database of nucleotidic sequences into aminoacid sequences, to be given to diamond. 
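The rename above also avoids a subtle Python pitfall: when a module defines two functions with the same name, the definition that appears later silently rebinds the name and the earlier one becomes unreachable. A minimal, self-contained sketch of that behaviour (illustration only, not ksrates code; the bodies are placeholders):

```python
# Two defs sharing one name in the same module: the second definition
# silently replaces the first, so only one of them can ever be called.
def dna_to_aminoacid(path):
    return f"first definition, translating {path}"

def dna_to_aminoacid(path):
    return f"second definition, translating {path}"

print(dna_to_aminoacid("orthogroups.fasta"))
# -> second definition, translating orthogroups.fasta
```

Giving the database-level translator its own name (database_dna_to_aminoacid()) removes this ambiguity.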
@@ -234,7 +234,7 @@ def make_gene_family_database(top, rank_type, filter_false_predictions, filter_z
     flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database)

     logging.info("Translate into aminoacidic sequence database (ready for diamond)")
-    dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid)
+    database_dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid)


 #------------------------------------------------------------------------------
@@ -475,7 +475,7 @@ def dna_to_aminoacid(database_dna_path, output_folder="."):
             continue
     return database_aa_path

-def assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl_test,
+def assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_nucl_test,
                                       query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, basefolder_output_files):
     """
     Assign to the query gene the orthogroup of the first (and best) hit, but only
@@ -492,7 +492,10 @@ def assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_n
         orthogroup_database_test = read_csv(orthogroup_database_nucl_test, sep="\t", header=None).loc[:,0]

         # Get/Make the Atha FASTA diamond database
-        if not os.path.exists(os.path.join(f"{fasta_file}.aa.db.dmnd")):
+        if not os.path.exists(os.path.join(basefolder_output_files, f"{fasta_file}.aa.db.dmnd")):
+            # Translate it into the aminoacid alphabet
+            species_sequences_filename = dna_to_aminoacid(fasta_file, output_folder=basefolder_output_files)
+            # Make aminoacidic diamond database
             test_species_database = make_diamond_db(species_sequences_filename, output_folder=basefolder_output_files)
         else:
             test_species_database = os.path.join(basefolder_output_files, f"{fasta_file}.aa.db.dmnd")
--
GitLab


From 23baa2059ab0778996dc253f6329408af2e7328b Mon Sep 17 00:00:00 2001
From: cesen
Date: Tue, 21 Dec 2021 15:52:06 +0100
Subject: [PATCH 19/26] Generate required files; use combined rank

- The pipeline now generates all files needed to perform the reciprocal
  retention analysis (previously still a WIP that used pre-generated files).
  The user will generate these files the first time they launch a reciprocal
  retention analysis. NOTE: This is still a WIP; it will be decided later how
  exactly these files are stored or generated, especially the databases, for
  which the FASTA files of all 35 species are needed.
- Integrated the "combined" rank as an alternative to the "lambda" rank;
  output file names reflect the two rank types.
- The function setup_for_extend_orthogroups() takes care of producing the
  required files; it is now called within fc_wgd.ks_paralogs_rec_ret() and no
  longer within extend_orthogroups().
- All required files are now stored in a single folder under the ksrates base
  dir (ksrates/reciprocal_retention) instead of being generated inside the
  paralog_distribution dir of the focal species.
- Fix bug in database_dna_to_aminoacid(): the aminoacid database now gets
  correctly translated, without the first sequence being duplicated and empty.
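Because the output file names now encode both the number of top gene families and the rank type, downstream commands can predict where the reciprocally retained Ks table lives. A small illustrative sketch of the naming scheme introduced by this patch (the values are examples: "elaeis" is the focal species of the test configuration, and 2000/"lambda" are the defaults used in the patches; the pattern mirrors _OUTPUT_KS_REC_RET_FILE_PATTERN in fc_wgd.py and the default paths in lognormal_mixture.py and plot_paralogs.py):

```python
import os

# Example values only; any focal species / top / rank_type combination works.
species, top, rank_type = "elaeis", 2000, "lambda"

rec_ret_ks_tsv = os.path.join("paralog_distributions", f"wgd_{species}",
                              f"{species}.ks_rec_ret_top_{top}_{rank_type}.tsv")
print(rec_ret_ks_tsv)
# paralog_distributions/wgd_elaeis/elaeis.ks_rec_ret_top_2000_lambda.tsv
```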
--- ksrates/extend_orthogroups.py | 149 +++++++++++++++----------- ksrates/fc_reciprocal_retention.py | 163 +++++++++++++++++------------ ksrates/fc_wgd.py | 32 +++--- ksrates/lognormal_mixture.py | 5 +- ksrates/plot_paralogs.py | 7 +- ksrates/wgd_paralogs.py | 2 +- ksrates_cli.py | 4 +- test/config_expert.txt | 9 +- 8 files changed, 218 insertions(+), 153 deletions(-) diff --git a/ksrates/extend_orthogroups.py b/ksrates/extend_orthogroups.py index 0631672..78757c6 100755 --- a/ksrates/extend_orthogroups.py +++ b/ksrates/extend_orthogroups.py @@ -9,6 +9,40 @@ import ksrates.fc_reciprocal_retention as fc_rec_ret set_option("display.max_rows", 20, "display.max_columns", 20) + +def setup_for_extend_orthogroups(top, rank_type, filter_false_predictions, filter_zero_false_predictions): + """ + Generates files needed for extend_orthogroups.py: + - gf_genes_top_2000_rank.tsv + - gf_genes_list_top_2000_rank.tsv + - gf_genes_with_gf_list_top_2000_rank.tsv + - orthogroup nucl database + - orthogroup amino database + + Generates files needed for fc_wgd.ks_paralogs_rec_ret: + - gf_list_top_2000_lambda.tsv + + """ + # Path where output files will be generated or read + ksrates_rec_ret_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reciprocal_retention") + + if not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}.tsv")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}.tsv")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta")): + + logging.info("Generating required files for reciprocal retention analysis...") + # Generate or read input files required to perform RR analysis + # (files listing gene family IDs and gene members, orthogroup databases) + fc_rec_ret.make_gene_family_files_and_databases(top, rank_type, filter_false_predictions, + filter_zero_false_predictions, ksrates_rec_ret_dir) + else: + logging.info("All required files for reciprocal retention analysis exist") + return + + + def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_threshold, min_qcov_threshold, filter_false_predictions, filter_zero_false_predictions, base_dir='.', n_threads=4): """ @@ -29,7 +63,7 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t """ # Extend the original GFs with the new sequences from diamond matched homology search results. 
- # ACTIVATE TO CHOOSE A TEST SPECIES (atha, pdac or vvin) INSTEAD OF THE NEW SPECIES FROM CONFIGURATION FILE + # TO BE ACTIVATED TO CHOOSE A TEST SPECIES (atha, pdac or vvin) INSTEAD OF THE NEW SPECIES FROM CONFIGURATION FILE test_run = False if test_run: # the species is a test species (atha, pdac or vvin) @@ -61,83 +95,74 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t #------------------------------------------------------------------------------ - # BASEFOLDERS for input and output - - # Path to database and files to be used in the workflow - # TODO: this is temporarily because we will probably have to let the user first make this files - # themselves in a specific folder; alternatively we could provide this file already ready, but that - # would make the GitHub folder (too) heavy - # THE PATH IS paralog_distribution/reciprocal_retention - basefolder_db_and_files = os.path.join(os.path.dirname(base_dir), "reciprocal_retention") - - # THIS IS THE FUNCTION TO GENERATE THE D - # # The user generates the FASTA database files for the rec.ret. sequences on their own in paralog_distributions - # fc_rec_ret.make_gene_family_database(top, rank_type, filter_false_predictions, filter_zero_false_predictions, base_dir) - - # Path to output files - base_dir_rec_ret = os.path.join(base_dir, "reciprocal_retention", rank_type) - if not os.path.exists(base_dir_rec_ret): - os.makedirs(base_dir_rec_ret) + # GENERATE OR READ INPUT GENE FAMILY FILES AND DATABASES - #------------------------------------------------------------------------------ + # PATH TO THE DIRECTORY WHERE REQUIRED FILES FOR RECIPROCAL RETENTION ANALYSES ARE STORED + # It's within the ksrates directory (for now): /home/cesen/ksrates_rec_ret/ksrates/reciprocal_retention. + # TODO: Still to be decided whether the user will have to generate this files themselves or if they will be + # already provided; in any of the cases, to be decided the location of these files: within the GitHub + # directory could become heavy. + ksrates_rec_ret_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reciprocal_retention") - # INPUT DATABASES AND GENE FAMILY FILES + # Get the list of the top 2000 GFs + if not filter_zero_false_predictions and not filter_false_predictions: + # Get the list of genes together with their GFs + gf_genes_with_gf_list = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) + elif filter_zero_false_predictions: + # Get the list of genes together with their GFs + gf_genes_with_gf_list = read_table(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) + elif filter_false_predictions: + # Get the list of genes together with their GFs + gf_genes_with_gf_list = read_table(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) if test_run: # Nucleotidic and aminoacidi databases of species used to make the orthogroups (i.e. the remaining 35 species) # Note that this database contains only the top e.g. 
2000 orthogroup from the complete list of 9178 if not filter_zero_false_predictions and not filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_test_without_{species}.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_test_without_{species}.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_test_without_{species}.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_test_without_{species}.fasta") elif filter_zero_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") elif filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") - else: + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") + else: # Nucleotidic and aminoacidi databases of species used to make the orthogroups (all 36 species) if not filter_zero_false_predictions and not filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta") elif filter_zero_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, 
f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") elif filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") - + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") if test_run: # Collection of all genes in the orthogroups, including those of test species # Note: if using the top 2000 orthogroups, this file lists only the genes coming from top 2000 orthogroups # if using the subset of top 2000, there will be only genes coming from that subset if not filter_zero_false_predictions and not filter_false_predictions: - gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) + gene_ids_in_orthogroups = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) elif filter_zero_false_predictions: - gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) + gene_ids_in_orthogroups = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) elif filter_false_predictions: - gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) + gene_ids_in_orthogroups = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) - # Get the list of the top 2000 GFs - if not filter_zero_false_predictions and not filter_false_predictions: - # Get the list of genes together with their GFs - gf_genes_with_gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) - elif filter_zero_false_predictions: - # Get the list of genes together with their GFs - gf_genes_with_gf_list = read_table(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) - elif filter_false_predictions: - # Get the list of genes together with their GFs - gf_genes_with_gf_list = read_table(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) + # ------------------------------------------------------------------------- - # ----------------------------------------------------------------------------- + # Path to output files + wgd_focal_rec_ret_rank_dir = os.path.join(base_dir, "reciprocal_retention", rank_type) + if not os.path.exists(wgd_focal_rec_ret_rank_dir): + os.makedirs(wgd_focal_rec_ret_rank_dir) if not filter_zero_false_predictions and not filter_false_predictions: - dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, 
f"diamond_{species}_top_{top}_{rank_type}_with_orthogroups.dmd") + dmd_with_orthogroups_filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_with_orthogroups.dmd") elif filter_zero_false_predictions: - dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions_with_orthogroups.dmd") + dmd_with_orthogroups_filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions_with_orthogroups.dmd") elif filter_false_predictions: - dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions_with_orthogroups.dmd") + dmd_with_orthogroups_filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions_with_orthogroups.dmd") column_names = ["qseqid", "sseqid", "pident", "length", "qlen", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"] @@ -148,22 +173,22 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t # First step is to generate or open the diamond result table if not filter_zero_false_predictions and not filter_false_predictions: - dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}.dmd") + dmd_species_to_top_orthogroup = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}.dmd") elif filter_zero_false_predictions: - dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions.dmd") + dmd_species_to_top_orthogroup = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions.dmd") elif filter_false_predictions: - dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions.dmd") + dmd_species_to_top_orthogroup = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions.dmd") - if not os.path.exists(dmd_test_species_to_top_orthogroup): + if not os.path.exists(dmd_species_to_top_orthogroup): # Matching the test species to the top 2000 orthogroups through diamond logging.info(f"Running homology search between {latin_name} and the gene family database. 
Will take some time...") - with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: - orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=basefolder_db_and_files) - dmd = fc_rec_ret.run_diamond(fasta_file, orthogroup_database_amin, dmd_test_species_to_top_orthogroup, + with tempfile.TemporaryDirectory(dir=base_dir) as dmd_tmp_dir: + orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=ksrates_rec_ret_dir) + dmd = fc_rec_ret.run_diamond(fasta_file, orthogroup_database_amin, dmd_species_to_top_orthogroup, dmd_tmp_dir, pident_threshold, column_names=column_names, n_threads=n_threads) else: logging.info(f"Found already existing homology search result table between {latin_name} and the gene family database") - dmd = read_csv(dmd_test_species_to_top_orthogroup, sep="\t", header=None, names=column_names) + dmd = read_csv(dmd_species_to_top_orthogroup, sep="\t", header=None, names=column_names) logging.info("") # Second step is generating the diamond table file with en extra column containing the gene family associated to the matched sequences @@ -207,9 +232,9 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t # Get the list of sequences coming from the main species (column 0) in the diamond result table query_seqs = dmd_with_orthogroups.loc[:, "qseqid"].drop_duplicates() - with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: - assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_nucl, - query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, base_dir_rec_ret) + with tempfile.TemporaryDirectory(dir=base_dir) as dmd_tmp_dir: + assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(top, rank_type, species, fasta_file, orthogroup_database_nucl, + query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, wgd_focal_rec_ret_rank_dir) if not test_run: logging.info("Done") diff --git a/ksrates/fc_reciprocal_retention.py b/ksrates/fc_reciprocal_retention.py index c7bb39c..9fd6ed2 100755 --- a/ksrates/fc_reciprocal_retention.py +++ b/ksrates/fc_reciprocal_retention.py @@ -24,7 +24,8 @@ ids = {'alyr':'Arabidopsis lyrata', 'atha': 'Arabidopsis thaliana', 'atri':'Ambo # TODO: TEST AND FIX THE INTEGRATION OF THIS FUNCTIONS IN THE KSRATES PIPELINE IN "extend_orthogroups.py" # where the main function should be called (now it's commented). -def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_false_predictions, rank_type, top=None): +def get_all_gf_genes(ksrates_rec_ret_basefolder, filter_zero_false_predictions, + filter_false_predictions, rank_type, top=None): """ If top parameter equal to a number, then consider only the top XXXX gene families: e.g. top 1000. This avoids including weak reciprocally retained gene families. 
If asked, also filter away those GFs that @@ -36,16 +37,18 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ """ # Generate or open the file listing all the GFs with all their genes if top == None: - gf_genes_tsv = "gf_genes_all.tsv" + gf_genes_tsv = os.path.join(ksrates_rec_ret_basefolder, "gf_genes_all.tsv") + gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_list_top_{rank_type}.tsv") else: - gf_genes_tsv = f"gf_genes_top_{top}_{rank_type}.tsv" + gf_genes_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_top_{top}_{rank_type}.tsv") + gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_list_top_{top}_{rank_type}.tsv") - if not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_tsv)): + if not os.path.exists(gf_genes_tsv) or not os.path.exists(gf_list_tsv): # Get the ranked gene families - gf_rank = read_excel(open("gene_families.xlsx", "rb"), "gene family rankings", header=None) - + gene_families_xlsx = os.path.join(os.path.join(ksrates_rec_ret_basefolder, "gene_families.xlsx")) + gf_rank = read_excel(open(gene_families_xlsx, "rb"), "gene family rankings", header=None) # Get the genes associated to all gene families - gf_genes_raw = read_excel(open("gene_families.xlsx", "rb"), "orthogroup member genes", header=None) + gf_genes_raw = read_excel(open(gene_families_xlsx, "rb"), "orthogroup member genes", header=None) if top != None: if rank_type == "combined": @@ -64,14 +67,21 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ genes_columns = gf_genes_raw.iloc[:,2].str.split(",", expand=True) # Process third column (genes) gf_genes = concat([gf_genes, genes_columns], axis=1, ignore_index=True) - with open(os.path.join(path_for_top_gf_tsv, gf_genes_tsv), "w+") as f: - f.write(gf_genes.to_csv(sep="\t", header=False, index=False)) + if not os.path.exists(gf_genes_tsv): + with open(gf_genes_tsv, "w+") as f: + f.write(gf_genes.to_csv(sep="\t", header=False, index=False)) + + if not os.path.exists(gf_list_tsv): + with open(gf_list_tsv, "w+") as f: + f.write(top_gf_ids.to_csv(sep="\t", header=False, index=False)) + else: - logging.info(f"- {gf_genes_tsv} already exists") - gf_genes = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_tsv), sep="\t", header=None) + logging.info(f"- {os.path.basename(gf_genes_tsv)} already exists") + gf_genes = read_csv(gf_genes_tsv, sep="\t", header=None) # If asked, remove the GFs from the top 2000 that performed bad (too many false predictions) + # TODO: SET THIS PATH RIGHT IF THE FILTERED DATA WILL BE USED! 
basefolder_filtering_files = f"../extended_orthogroups_top_{top}_{rank_type}_filtered/" if filter_zero_false_predictions: gf_ids_filtered_zero_false_predictions = read_csv(os.path.join(basefolder_filtering_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) @@ -84,16 +94,19 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ # Generate or open the list of genes and the list of genes with their GF if filter_zero_false_predictions: # remaining from top 2000 after filtering away for at least one false prediction - gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv" + gf_genes_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") + gf_genes_with_gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") elif filter_false_predictions: # remaining GFs from the top 2000 after filtering by too many false predictions gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv" elif top == None: gf_genes_list_tsv, gf_genes_with_gf_list_tsv = "gf_genes_list.tsv", "gf_genes_with_gf_list.tsv" else: # top = 2000 - gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv" + gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_list_top_{top}_{rank_type}.tsv") + gf_genes_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_list_top_{top}_{rank_type}.tsv") + gf_genes_with_gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv") - if not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv)) or not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv)): - logging.info(f"- Generating {gf_genes_list_tsv} and {gf_genes_with_gf_list_tsv}...") + if not os.path.exists(gf_list_tsv) or not os.path.exists(gf_genes_list_tsv) or not os.path.exists(gf_genes_with_gf_list_tsv): + logging.info(f"- Generating {os.path.basename(gf_list_tsv)}, {os.path.basename(gf_genes_list_tsv)} and {os.path.basename(gf_genes_with_gf_list_tsv)}...") # Get the genes associated to the GFs in both rank types gf_genes_list = [] @@ -110,68 +123,68 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ # Save the GF together with their genes in a TSV file gf_genes_list = Series(gf_genes_list).sort_values(ignore_index=True) - gf_genes_list.to_csv(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv), sep="\t", header=False, index=False) + gf_genes_list.to_csv(gf_genes_list_tsv, sep="\t", header=False, index=False) gf_genes_with_gf_list = DataFrame.from_records(gf_genes_with_gf_list) gf_genes_with_gf_list.columns = ["gene_id", "gf"] gf_genes_with_gf_list = gf_genes_with_gf_list.sort_values(by=["gene_id"], ignore_index=True) - gf_genes_with_gf_list.to_csv(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv), sep="\t", header=False, index=False) + gf_genes_with_gf_list.to_csv(gf_genes_with_gf_list_tsv, sep="\t", header=False, index=False) - return gf_genes_list, gf_genes_with_gf_list + return gf_genes_list else: - 
logging.info(f"- {gf_genes_list_tsv} and {gf_genes_with_gf_list_tsv} already exist ") - gf_genes_list = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv), sep="\t", header=None) - gf_genes_with_gf_list = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv), sep="\t", header=None) - return gf_genes_list.squeeze().values.tolist(), gf_genes_with_gf_list + logging.info(f"- {os.path.basename(gf_genes_list_tsv)} already exists") + logging.info(f"- {os.path.basename(gf_genes_with_gf_list_tsv)} already exists") + gf_genes_list = read_csv(gf_genes_list_tsv, sep="\t", header=None) + gf_genes_with_gf_list = read_csv(gf_genes_with_gf_list_tsv, sep="\t", header=None) + return gf_genes_list.squeeze().values.tolist() -def flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database): +def flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database_path): """ Generate the database of nucleotidic sequences from the orthogroup genes (FASTA format). """ database = {} - c=0 - if not os.path.exists(os.path.join("..", orthorgoups_database)): + if not os.path.exists(orthorgoups_database_path): species_fasta = {} for species_id in ids_names: - species_fasta.update(read_fasta(os.path.join("..", "sequences", "old_genome_versions", f"{species_id}.fasta"))) + species_fasta.update(read_fasta(os.path.join("/", "home", "cesen", "wgm_predictor_midas", "sequences", "old_genome_versions", f"{species_id}.fasta"))) - with open (os.path.join("..", orthorgoups_database), "w+") as f: + with open (orthorgoups_database_path, "w+") as f: for gene_id in gf_genes_list: if gene_id in species_fasta.keys(): database[gene_id] = species_fasta[gene_id] f.write(f">{gene_id}\n") f.write(f"{species_fasta[gene_id]}\n") else: - logging.info(f"- {orthorgoups_database} already exists") - + logging.info(f"- {os.path.basename(orthorgoups_database_path)} already exists") + return -def database_dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid): +def database_dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid_path): """ Translates the database of nucleotidic sequences into aminoacid sequences, to be given to diamond. 
""" - with open(os.path.join("..", orthorgoups_database), "r") as f: - database_dna = f.readlines() - - if not os.path.exists(os.path.join("..", orthorgoups_database_aminoacid)): - with open (os.path.join("..", orthorgoups_database_aminoacid), "w+") as f: - for i, seq in enumerate(SeqIO.parse(database_dna, 'fasta')): - try: - aa_seq = seq.translate(to_stop=False, cds=False, id=seq.id) - f.write(f">{seq.id}\n") - f.write(f"{aa_seq.seq}\n") - except TranslationError as e: - logging.error("Translation error ({}) in seq {}".format( - e, seq.id)) - continue + if not os.path.exists(orthorgoups_database_aminoacid_path) or os.path.getsize(orthorgoups_database_aminoacid_path) == 0: + with open(orthorgoups_database_aminoacid_path, "w+") as database_aa: + + with open(orthorgoups_database, "r") as database_dna: + for seq in SeqIO.parse(database_dna, 'fasta'): + try: + aa_seq = seq.translate(to_stop=False, cds=False, id=seq.id) + database_aa.write(f">{seq.id}\n") + database_aa.write(f"{aa_seq.seq}\n") + except TranslationError as e: + logging.error("Translation error ({}) in seq {}".format( + e, seq.id)) + continue else: - logging.info(f"- {orthorgoups_database_aminoacid} already exists") - + logging.info(f"- {os.path.basename(orthorgoups_database_aminoacid_path)} already exists") + return # MAIN FUNCTION THAT USES ALL FUNCTIONS ABOVE -def make_gene_family_database(top, rank_type, filter_false_predictions, filter_zero_false_predictions, base_dir, remove_test_species=False): +def make_gene_family_files_and_databases(top, rank_type, filter_false_predictions, filter_zero_false_predictions, + ksrates_rec_ret_basefolder, remove_test_species=False): """ Generates a nucleotidic FASTA file and a translated FASTA file that are databases for all sequences from the 36 species used to make the top e.g. 2000 reciprocally retained gene families out of the total 9178 core angiosperm gene families. 
@@ -195,13 +208,15 @@ def make_gene_family_database(top, rank_type, filter_false_predictions, filter_z 'sbic':'Sorghum bicolor', 'sita':'Setaria italica', 'slyc':'Solanum lycopersicum', 'stub':'Solanum tuberosum', \ 'tcac':'Theobroma cacao', 'tpar':'Thellungiella parvula', 'vvin': 'Vitis vinifera', 'zmay':'Zea mays'} - logging.info("Get a list of all GF genes") + logging.info(f"Getting a list of top {top} ({rank_type}) reciprocally retained gene family genes") + if filter_zero_false_predictions: logging.info("Filter away from database those GFs that have at least one false prediction") elif filter_false_predictions: logging.info("Filter away from database those GFs that lead to too many false predictions") - gf_genes_list, gf_genes_with_gf_list = get_all_gf_genes("..", filter_zero_false_predictions, filter_false_predictions, rank_type, top) + gf_genes_list = get_all_gf_genes(ksrates_rec_ret_basefolder, filter_zero_false_predictions, + filter_false_predictions, rank_type, top) # Default names for databases (will be changed if necessary in this code later on) if not filter_zero_false_predictions and not filter_false_predictions: @@ -229,13 +244,19 @@ def make_gene_family_database(top, rank_type, filter_false_predictions, filter_z if filter_false_predictions: orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{scientific_name}.fasta" orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{scientific_name}.fasta" + logging.info("") - logging.info("Generate the nucleotidic database of GFs") - flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database) + logging.info("Generating the nucleotidic database of GFs") + orthorgoups_database_path = os.path.join(ksrates_rec_ret_basefolder, orthorgoups_database) + flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database_path) + logging.info("") - logging.info("Translate into aminoacidic sequence database (ready for diamond)") - database_dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid) + logging.info("Translating into aminoacidic sequence database (ready for diamond)") + orthorgoups_database_aminoacid_path = os.path.join(ksrates_rec_ret_basefolder, orthorgoups_database_aminoacid) + database_dna_to_aminoacid(orthorgoups_database_path, orthorgoups_database_aminoacid_path) + logging.info("") + return #------------------------------------------------------------------------------ @@ -304,15 +325,18 @@ def make_diamond_db(fasta_aa, output_folder=".", n_threads=8, dmd_tmp_dir=False) else: fasta_aa_db = f"{os.path.basename(fasta_aa)}.db" - fasta_aa_db = os.path.join(output_folder, fasta_aa_db) - cmd = ["diamond", "makedb", "--threads", f"{n_threads}", "--in", fasta_aa, "-d", fasta_aa_db] - # capture_output option required Python 3.7 - # out = sp.run(cmd, capture_output=True) - # logging.debug(out.stderr.decode()) - sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) - # if out.returncode == 1: - # logging.error(out.stderr.decode()) - return f"{fasta_aa_db}.dmnd" + output_filename = os.path.join(output_folder, f"{fasta_aa_db}.dmnd") + + if not os.path.exists(output_filename): + fasta_aa_db = os.path.join(output_folder, fasta_aa_db) + cmd = ["diamond", "makedb", "--threads", f"{n_threads}", "--in", fasta_aa, "-d", fasta_aa_db] + # capture_output option required Python 3.7 + # out = sp.run(cmd, capture_output=True) + # logging.debug(out.stderr.decode()) + sp.run(cmd, stdout=sp.PIPE, 
stderr=sp.PIPE) + # if out.returncode == 1: + # logging.error(out.stderr.decode()) + return output_filename def run_diamond(query_fasta, subject_db, outfile, dmd_tmp_dir, pident_threshold, column_names=[], @@ -475,8 +499,9 @@ def dna_to_aminoacid(database_dna_path, output_folder="."): continue return database_aa_path -def assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_nucl_test, - query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, basefolder_output_files): +def assign_first_diamond_hit_with_rbh(top, rank_type, species, fasta_file, orthogroup_database_nucl_test, + query_seqs, dmd_with_orthogroups, pident_threshold, + column_names, dmd_tmp_dir, wgd_focal_rec_ret_rank_dir): """ Assign to the query gene the orthogroup of the first (and best) hit, but only if there is a reciprocal best hit correspondence between the two genes. @@ -484,7 +509,7 @@ def assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_n logging.info(f"Temporary diamond directory: {dmd_tmp_dir}") logging.info("") - filename = os.path.join(basefolder_output_files, f"assigned_gfs_{species}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") + filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"assigned_gfs_{species}_top_{top}_{rank_type}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") if not os.path.exists(filename) or os.path.getsize(filename) == 0: @@ -492,13 +517,13 @@ def assign_first_diamond_hit_with_rbh(species, fasta_file, orthogroup_database_n orthogroup_database_test = read_csv(orthogroup_database_nucl_test, sep="\t", header=None).loc[:,0] # Get/Make the Atha FASTA diamond database - if not os.path.exists(os.path.join(basefolder_output_files, f"{fasta_file}.aa.db.dmnd")): + if not os.path.exists(os.path.join(wgd_focal_rec_ret_rank_dir, f"{fasta_file}.aa.db.dmnd")): # Translate it into the aminoacid alphabet - species_sequences_filename = dna_to_aminoacid(fasta_file, output_folder=basefolder_output_files) + species_sequences_filename = dna_to_aminoacid(fasta_file, output_folder=wgd_focal_rec_ret_rank_dir) # Make aminoacidic diamond database - test_species_database = make_diamond_db(species_sequences_filename, output_folder=basefolder_output_files) + test_species_database = make_diamond_db(species_sequences_filename, output_folder=wgd_focal_rec_ret_rank_dir) else: - test_species_database = os.path.join(basefolder_output_files, f"{fasta_file}.aa.db.dmnd") + test_species_database = os.path.join(wgd_focal_rec_ret_rank_dir, f"{fasta_file}.aa.db.dmnd") assigned_gfs_file = list() no_rbh = 0 diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index 052ba47..677f1a6 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -16,14 +16,14 @@ _OUTPUT_BLAST_FILE_PATTERN = '{}.blast.tsv' _OUTPUT_MCL_FILE_PATTERN = '{}.mcl.tsv' _OUTPUT_KS_FILE_PATTERN = '{}.ks.tsv' _OUTPUT_MCL_REC_RET_FILE_PATTERN = '{}_rec_ret_top{}_{}_rank.mcl.tsv' -_OUTPUT_KS_REC_RET_FILE_PATTERN = '{}.ks_rec_ret.tsv' +_OUTPUT_KS_REC_RET_FILE_PATTERN = '{}.ks_rec_ret_top_{}_{}.tsv' _PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' _ORTHOLOGS_OUTPUT_DIR_PATTERN = 'wgd_{}_{}' _TMP_BLAST = '{}.blast_tmp' _TMP_KS = '{}.ks_tmp' -_TMP_KS_REC_RET = '{}.ks_rec_ret_tmp' +_TMP_KS_REC_RET = '{}.ks_rec_ret_top_{}_{}_tmp' def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflation_factor=2.0, aligner='muscle', min_msa_length=100, codeml='codeml', codeml_times=1, pairwise=False, @@ -252,20 +252,20 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type 
sys.exit(1) output_mcl_rec_ret_file = _OUTPUT_MCL_REC_RET_FILE_PATTERN.format(species_name, top, rank_type) - output_ks_file = _OUTPUT_KS_REC_RET_FILE_PATTERN.format(species_name) + output_ks_file = _OUTPUT_KS_REC_RET_FILE_PATTERN.format(species_name, top, rank_type) output_dir = os.path.abspath(os.path.join(base_dir, _PARALOGS_OUTPUT_DIR_PATTERN.format(species_name))) output_mcl_path = os.path.join(output_dir, output_mcl_rec_ret_file) - tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS_REC_RET.format(species_name)) + tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS_REC_RET.format(species_name, top, rank_type)) # BASEFOLDERS for input and output # Path to database and files to be used in the workflow # TODO: this is temporarily because we will probably have to let the user first make this files # themselves in a specific folder; alternatively we could provide this file already ready, but that # would make the GitHub folder (too) heavy - basefolder_db_and_files = os.path.join("/", "home", "cesen", "ksrates_rec_ret", "ksrates", "reciprocal_retention") + ksrates_rec_ret_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reciprocal_retention") # determine which parts to run do_rec_ret = True @@ -292,7 +292,7 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type logging.info(f'Paralog Ks data {output_ks_file} already exists, will skip wgd Ks analysis') do_ks = False else: - logging.info('No paralog Ks data, will run wgd Ks analysis') + logging.info('No reciprocally retained paralog Ks data, will run wgd Ks analysis') if not do_rec_ret and not do_ks: logging.info('All reciprocally retained paralog data already exist, nothing to do') @@ -324,10 +324,14 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type logging.info(f'Creating output directory {output_dir}') os.makedirs(output_dir) - # get reciprocally retained gene families (MCL-like format) + # Get reciprocally retained gene families (MCL-like format) if do_rec_ret: logging.info('---') - assigned_rec_ret_gfs_path = os.path.join(output_dir, "reciprocal_retention", rank_type, f"assigned_gfs_{species_name}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") + + # Preparing necessary scripts for RR analyses + recRet.setup_for_extend_orthogroups(top, rank_type, filter_false_predictions, filter_zero_false_predictions) + + assigned_rec_ret_gfs_path = os.path.join(output_dir, "reciprocal_retention", rank_type, f"assigned_gfs_{species_name}_top_{top}_{rank_type}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") if not os.path.exists(assigned_rec_ret_gfs_path): logging.info(f'Running diamond homology search against reciprocally retained gene family database...') assigned_rec_ret_gfs = recRet.extend_orthogroups(species_name, latin_name, cds_fasta, top, rank_type, pident_threshold, @@ -339,11 +343,11 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type if not os.path.exists(output_mcl_path): logging.info(f'Organizing reciprocally retained gene families in mcl-like format for wgd Ks pipeline') if not filter_false_predictions and not filter_zero_false_predictions: - top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() + top_rec_ret_gf_list = pd.read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() elif filter_false_predictions: - top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, 
f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() + top_rec_ret_gf_list = pd.read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() elif filter_zero_false_predictions: - top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() + top_rec_ret_gf_list = pd.read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() generate_mcl_rec_ret(top_rec_ret_gf_list, assigned_rec_ret_gfs, output_mcl_path) logging.info("Done") logging.info("") @@ -364,11 +368,11 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type protein_sequences = translate_cds(cds_sequences) if not filter_false_predictions and not filter_zero_false_predictions: - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv") + top_gfs_filtered_with_rank = os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}.tsv") elif filter_false_predictions: - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") + top_gfs_filtered_with_rank = os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") elif filter_zero_false_predictions: - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") + top_gfs_filtered_with_rank = os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, top=top, top_gfs_filtered_with_rank=top_gfs_filtered_with_rank, diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 7e16c03..0cbc092 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -25,6 +25,9 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re colinearity_analysis = config.get_colinearity() reciprocal_retention_analysis = config.get_reciprocal_retention() + top = config.get_reciprocal_retention_top() + rank_type = config.get_reciprocal_retention_rank_type() + # Use these optional arguments to avoid performing LMM on a certain data type, overwriting what written in the config file; # This is used in paralogs_analysis.py when LMM is called as extra method and allows to fine tune what is activated and what not if ignore_paranome: @@ -99,7 +102,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if anchors_ks_tsv_file == "": logging.error(f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}].") if reciprocal_retention_analysis: - default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret.tsv") + default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret_top_{top}_{rank_type}.tsv") rec_ret_tsv_file = fcCheck.get_argument_path(rec_ret_tsv_file, default_path_rec_ret_tsv_file, "Reciprocally retained paralog Ks TSV file") if rec_ret_tsv_file == "": logging.error(f"Reciprocally retained paralog Ks TSV file not found at default position [{default_path_rec_ret_tsv_file}].") diff 
--git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index 72f3967..1e52568 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -25,6 +25,9 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an colinearity_analysis = config.get_colinearity() reciprocal_retention_analysis = config.get_reciprocal_retention() + top = config.get_reciprocal_retention_top() + rank_type = config.get_reciprocal_retention_rank_type() + if not colinearity_analysis and not paranome_analysis and not reciprocal_retention_analysis: logging.error('At least one of the "paranome" or "collinearity" or "reciprocal_retention" parameters in the configuration file needs to be set to "yes".') logging.error("Exiting.") @@ -44,7 +47,7 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an if anchors_ks_tsv_file == "": logging.error(f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}].") if reciprocal_retention_analysis: - default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret.tsv") + default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret_top_{top}_{rank_type}.tsv") rec_ret_tsv_file = fcCheck.get_argument_path(rec_ret_tsv_file, default_path_rec_ret_tsv_file, "Reciprocally retained paralog Ks TSV file") if rec_ret_tsv_file == "": logging.error(f"Reciprocally retained paralog Ks TSV file not found at default position [{default_path_rec_ret_tsv_file}].") @@ -93,8 +96,6 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an plot_correction_arrows = config.plot_correction_arrows() peak_stats = config.get_peak_stats() # default is mode (other option, median) - top = config.get_reciprocal_retention_top() - rank_type = config.get_reciprocal_retention_rank_type() # ----------------------------------------------------------------------------- # GENERATING MIXED DISTRIBUTION PLOT diff --git a/ksrates/wgd_paralogs.py b/ksrates/wgd_paralogs.py index bc553d3..d59cd46 100644 --- a/ksrates/wgd_paralogs.py +++ b/ksrates/wgd_paralogs.py @@ -88,7 +88,7 @@ def wgd_paralogs(config_file, n_threads): # ESTIMATING RECIPROCALLY RETAINED GENE FAMILIES Ks VALUES if reciprocal_retention: - logging.info("Running wgd reciprocal retention paralog Ks pipeline...") + logging.info(f"Running wgd reciprocal retention Ks pipeline ({rank_type} ranking)...") fc_wgd.ks_paralogs_rec_ret(species, species_fasta_file, latin_name, top=top, rank_type=rank_type, pident_threshold=pident_threshold, min_qcov_threshold=min_qcov_threshold, filter_false_predictions=filter_false_predictions, diff --git a/ksrates_cli.py b/ksrates_cli.py index 500fcf4..c6e0817 100644 --- a/ksrates_cli.py +++ b/ksrates_cli.py @@ -118,7 +118,7 @@ def orthologs_adjustment(config_file, trios): @click.option("--adjustment-table", type=click.Path(exists=True), help="User-defined path to file containing adjustment results (default: rate_adjustment/species/adjustment_table_species.tsv)") @click.option("--paranome-table", type=click.Path(exists=True), help="User-defined path to file containing paranome Ks (default: paralog_distributions/wgd_species/species.ks.tsv)") @click.option("--anchors-table", type=click.Path(exists=True), help="User-defined path to file containing anchor pair Ks (default: paralog_distribution/wgd_species/species.ks_anchors.tsv)") -@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path 
to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret.tsv)") +@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret_top_2000_lambda.tsv)") def plot_paralogs(config_file, adjustment_table, paranome_table, anchors_table, reciprocal_retention_table): """ Plots rate-adjusted mixed paralog-ortholog Ks distribution. @@ -182,7 +182,7 @@ def plot_orthologs(config_file, trios): @click.argument('config_file', type=click.Path(exists=True)) @click.option("--paranome-table", type=click.Path(exists=True), help="User-defined path to file containing paranome Ks (default: paralog_distributions/wgd_species/species.ks.tsv)") @click.option("--anchors-table", type=click.Path(exists=True), help="User-defined path to file containing anchor pair Ks (default: paralog_distribution/wgd_species/species.ks_anchors.tsv)") -@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret.tsv)") +@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret_top_2000_lambda.tsv)") @click.option("--adjustment-table", type=click.Path(exists=True), help="User-defined path to file containing adjustment results (default: rate_adjustment/species/adjustment_table_species.tsv)") @click.option("--anchorpoints", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file anchorpoints.txt (default: paralog_distributions/wgd_species/species_i-adhore/anchorpoints.txt)") @click.option("--multiplicons", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file multiplicons.txt (default: paralog_distributions/wgd_species/species_i-adhore/multiplicons.txt)") diff --git a/test/config_expert.txt b/test/config_expert.txt index 16a74c6..0340662 100644 --- a/test/config_expert.txt +++ b/test/config_expert.txt @@ -8,4 +8,11 @@ num_mixture_model_initializations = 1 extra_paralogs_analyses_methods = yes max_mixture_model_components = 3 max_mixture_model_ks = 5 -max_gene_family_size = 200 \ No newline at end of file +max_gene_family_size = 200 + +top_reciprocally_retained_gfs = 2000 +reciprocal_retention_rank_type = lambda +pident_threshold = 40 +min_qcov_threshold = 50 +filter_by_false_predictions = no +filter_by_zero_false_predictions = no \ No newline at end of file -- GitLab From 96bb132808ea4162f4591a6555153076bae4a053 Mon Sep 17 00:00:00 2001 From: cesen Date: Tue, 21 Dec 2021 16:19:11 +0100 Subject: [PATCH 20/26] Move old scripts (will be probably deleted later) --- .../{ => old_scripts}/compute_ks_from_ortho_gfs.py | 0 .../{ => old_scripts}/construct_ks_plot_ortho_gfs.py | 0 .../{ => old_scripts}/get_top5000_gfs_mcl_format.py | 0 ksrates/reciprocal_retention/{ => old_scripts}/ks.mplstyle | 0 .../{ => old_scripts}/rec_retained_ks_plot_pipeline.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename ksrates/reciprocal_retention/{ => old_scripts}/compute_ks_from_ortho_gfs.py (100%) rename ksrates/reciprocal_retention/{ => old_scripts}/construct_ks_plot_ortho_gfs.py (100%) rename ksrates/reciprocal_retention/{ => old_scripts}/get_top5000_gfs_mcl_format.py (100%) rename ksrates/reciprocal_retention/{ => 
old_scripts}/ks.mplstyle (100%) rename ksrates/reciprocal_retention/{ => old_scripts}/rec_retained_ks_plot_pipeline.py (100%) diff --git a/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py b/ksrates/reciprocal_retention/old_scripts/compute_ks_from_ortho_gfs.py similarity index 100% rename from ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py rename to ksrates/reciprocal_retention/old_scripts/compute_ks_from_ortho_gfs.py diff --git a/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py b/ksrates/reciprocal_retention/old_scripts/construct_ks_plot_ortho_gfs.py similarity index 100% rename from ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py rename to ksrates/reciprocal_retention/old_scripts/construct_ks_plot_ortho_gfs.py diff --git a/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py b/ksrates/reciprocal_retention/old_scripts/get_top5000_gfs_mcl_format.py similarity index 100% rename from ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py rename to ksrates/reciprocal_retention/old_scripts/get_top5000_gfs_mcl_format.py diff --git a/ksrates/reciprocal_retention/ks.mplstyle b/ksrates/reciprocal_retention/old_scripts/ks.mplstyle similarity index 100% rename from ksrates/reciprocal_retention/ks.mplstyle rename to ksrates/reciprocal_retention/old_scripts/ks.mplstyle diff --git a/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py b/ksrates/reciprocal_retention/old_scripts/rec_retained_ks_plot_pipeline.py similarity index 100% rename from ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py rename to ksrates/reciprocal_retention/old_scripts/rec_retained_ks_plot_pipeline.py -- GitLab From cb4509a1cd7c4069ab78379d4b8cb006acd5eb6b Mon Sep 17 00:00:00 2001 From: cesen Date: Wed, 22 Dec 2021 17:28:14 +0100 Subject: [PATCH 21/26] Remove outdated scripts about RRs --- .../old_scripts/compute_ks_from_ortho_gfs.py | 283 ------------------ .../construct_ks_plot_ortho_gfs.py | 117 -------- .../old_scripts/get_top5000_gfs_mcl_format.py | 118 -------- .../old_scripts/ks.mplstyle | 18 -- .../rec_retained_ks_plot_pipeline.py | 149 --------- 5 files changed, 685 deletions(-) delete mode 100755 ksrates/reciprocal_retention/old_scripts/compute_ks_from_ortho_gfs.py delete mode 100755 ksrates/reciprocal_retention/old_scripts/construct_ks_plot_ortho_gfs.py delete mode 100755 ksrates/reciprocal_retention/old_scripts/get_top5000_gfs_mcl_format.py delete mode 100755 ksrates/reciprocal_retention/old_scripts/ks.mplstyle delete mode 100755 ksrates/reciprocal_retention/old_scripts/rec_retained_ks_plot_pipeline.py diff --git a/ksrates/reciprocal_retention/old_scripts/compute_ks_from_ortho_gfs.py b/ksrates/reciprocal_retention/old_scripts/compute_ks_from_ortho_gfs.py deleted file mode 100755 index e33474e..0000000 --- a/ksrates/reciprocal_retention/old_scripts/compute_ks_from_ortho_gfs.py +++ /dev/null @@ -1,283 +0,0 @@ -from wgd_ksrates.utils import read_fasta -from wgd_ksrates.ks_distribution import ks_analysis_paranome -import os -import logging -import sys -import subprocess -import tempfile -import math -import datetime - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -_OUTPUT_KS_FILE_PATTERN = '{}_ortho_gf_{}_{}_rank.ks.tsv' -_PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' -_TMP_KS = '{}_ortho_gf.ks_tmp' - - -def ks_paralogs(species_name, cds_fasta, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, base_dir='.', eval_cutoff=1e-10, - inflation_factor=2.0, aligner='muscle', min_msa_length=100, codeml='codeml', 
codeml_times=1, pairwise=False, - max_gene_family_size=200, weighting_method='fasttree', n_threads=4, overwrite=False): - """ - Among the edits, this function temporarily takes also the path and filename to the output_ks_file, in order to - provide a different name when top 2000 or a subset of it are used. - """ - - # input checks - if not species_name: - logging.error('No species name provided. Exiting.') - sys.exit(1) - - # TMP removed in order to use a custom name dependent on whether a subset of the top2000 is used or all of the top 2000 are used - # output_ks_file = _OUTPUT_KS_FILE_PATTERN.format(f"{species_name.lower()}", f"top{top}", rank_type) - output_ks_file = os.path.basename(ks_tsv_path) - - output_dir = os.path.abspath(os.path.join(base_dir, _PARALOGS_OUTPUT_DIR_PATTERN.format(species_name.lower()))) - - tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS.format(f"{species_name.lower()}_{rank_type}")) - - # determine which parts to run - do_ks = True - if os.path.exists(output_dir): - - # REMOVED THE BLAST AND MCL STEPS - - # Check if Ks tmp folder of previous failed run exists (in this case the Ks data will be incomplete) - if os.path.exists(tmp_ks_paralogs): # if there is a Ks tmp folder from previous failed run - logging.error(f'tmp directory [{tmp_ks_paralogs}] was found: leftover from a failed earlier run?') - logging.error('Paralog Ks data may be incomplete. Please delete the tmp directory and rerun ' - 'the analysis. Exiting.') - sys.exit(1) - - if os.path.exists(os.path.join(output_dir, output_ks_file)): - if overwrite: - logging.warning(f'Paralog Ks file {output_ks_file} exists, will overwrite') - else: - logging.info(f'Paralog Ks data {output_ks_file} already exists, will skip wgd Ks analysis') - do_ks = False - else: - logging.info('No paralog Ks data, will run wgd Ks analysis') - - if not do_ks: - logging.info('All paralog data already exist, nothing to do') - logging.info('Done') - return - - # software checks - logging.info('---') - logging.info('Checking external software...') - software = [] - if do_ks: - software += [aligner, codeml] - if weighting_method == 'fasttree': - software += ['FastTree'] - elif weighting_method == 'phyml': - software += ['phyml'] - if can_i_run_software(software) == 1: - logging.error('Could not run all required external software. Exiting.') - sys.exit(1) - - # input checks - if not cds_fasta: - logging.error('No CDS fasta file provided. 
Exiting.') - sys.exit(1) - - if not os.path.exists(output_dir): - logging.info(f'Creating output directory {output_dir}') - os.makedirs(output_dir) - - # read and translate CDS file - cds_sequences = None - protein_sequences = None - if do_ks: - logging.info(f'Translating CDS file {cds_fasta}...') - cds_sequences = read_fasta(os.path.abspath(cds_fasta)) - protein_sequences = translate_cds(cds_sequences) - - # REMOVED BLAST and MCL STEPS - - # whole paranome Ks analysis - if do_ks: - logging.info('---') - logging.info('Running whole paranome Ks analysis...') - - # tmp directory - cw_dir = os.getcwd() - if not os.path.exists(tmp_ks_paralogs): - os.mkdir(tmp_ks_paralogs) - os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir - - test_run = False - if test_run: - output_mcl_path = os.path.join(output_dir, f"{species_name.lower()}_ortho_gf_top5000_{rank_type}_rank.mcl.tsv") - else: - output_mcl_path = os.path.join(output_dir, f"{species_name.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv") - - results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, top, top_gfs_filtered_with_rank, - tmp_ks_paralogs, output_dir, codeml, times=codeml_times, aligner=aligner, - ignore_prefixes=False, min_length=min_msa_length, pairwise=pairwise, - max_gene_family_size=max_gene_family_size, method=weighting_method, - preserve=False, n_threads=n_threads) - if os.path.exists(tmp_ks_paralogs): - logging.error(f'tmp directory {tmp_ks_paralogs} still exists after analysis, ' - f'paralog Ks data may be incomplete and will not be written to paralog Ks file!') - logging.error('Please delete the tmp directory and rerun the analysis. Exiting.') - sys.exit(1) - if isinstance(results_df, type(None)) or results_df.empty: - logging.warning('No paralog Ks data computed, will write empty paralog Ks file!') - with open(os.path.join(output_dir, output_ks_file), 'w+') as o: - o.write(results_df.round(5).to_csv(sep='\t')) - # change back to current directory as tmp dir got deleted and subsequent os.getcwd() may fail - os.chdir(cw_dir) - - logging.info('---') - logging.info('Done') - - - -def can_i_run_software(software): - """ - Copied and modified from wgd.utils - - Test if external software is executable - :param software: list or string of executable(s) - :return: 1 (failure) or 0 (success) - """ - if type(software) == str: - software = [software] - ex = 0 - for s in software: - # codeml needs input otherwise it prompts the user for input, so a dummy - # file is created - if s == 'codeml': - tmp_file = "codeml.ctl" - command = ['codeml', tmp_file] - elif s == 'prank': - command = [s, '--help'] - elif s == 'FastTree': - command = s - elif s in ['blastp', 'makeblastdb', 'blast', 'muscle', 'i-adhore']: - command = [s, '-version'] - else: - command = [s, '--version'] - try: - # logging.info(command) - if s == "codeml": - # Since the Nextflow pipeline processes multiple wgd runs at the same time, - # let's generate for each run the dummy "codeml.ctl" files within a different - # temporary directory named with a unique ID. This way, the several parallel - # wgd runs will not interfere with each other when creating and removing - # the dummy codeml.ctl file. 
- with tempfile.TemporaryDirectory(dir = ".") as tmp_dir: - with open(os.path.join(tmp_dir, tmp_file), 'w') as o: # write the codeml.ctl file in it - o.write('seqfile = test') - sp = subprocess.run(command, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # tmp_dir is removed both if the subprocess succeeds or fails, thanks to the "with" statement - else: - sp = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - out = sp.stdout.decode("utf-8").strip() - err = sp.stderr.decode("utf-8").strip() - if out: - logging.info(out.splitlines()[0]) - elif err: - logging.info(err.splitlines()[0]) - - except FileNotFoundError: - logging.error('{} executable not found!'.format(s)) - ex = 1 - return ex - - -def translate_cds(sequence_dict, skip_invalid=False): - """ - Copied and modified from wgd.utils - - Just another CDS to protein translater. Will give warnings when in-frame - stop codons are found, invalid codons are found, or when the sequence length - is not a multiple of three. Will translate the full sequence or until an - unspecified or in-frame codon is encountered. - :param sequence_dict: dictionary with gene IDs and CDS sequences - :param skip_invalid: bool, skip invalid CDS? (default translates to first - stop codon or end) - :return: dictionary with gene IDs and proteins sequences - """ - # TODO I should just use the Biopython translator - aa_dict = { - 'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', - 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', - 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', - 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', - 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', - 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', - 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', - 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', - 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', - 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', - 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', - 'TAC': 'Y', 'TAT': 'Y', 'TAA': '', 'TAG': '', - 'TGC': 'C', 'TGT': 'C', 'TGA': '', 'TGG': 'W', - } - protein_dict = {} - - j = 0 - total = 0 - for key, val in sequence_dict.items(): - j += 1 - aa_seq = '' - if len(val) % 3 != 0: - logging.warning('Sequence length != multiple of 3 for {}!'.format(key)) - total += 1 - invalid = False - for i in range(0, len(val), 3): - if val[i:i + 3] not in aa_dict.keys(): - logging.warning('Invalid codon {0:>3} in {1}'.format(val[i:i+3], key)) - invalid = True - total += 1 - break - else: - if aa_dict[val[i:i + 3]] == '' and i+3 != len(val): - logging.warning('In-frame STOP codon in {0} at position {1}:{2}'.format(key, i, i+3)) - invalid = True - total += 1 - break - aa_seq += aa_dict[val[i:i + 3]] - if invalid and skip_invalid: - continue - protein_dict[key] = aa_seq - - if total: - logging.warning("There were {} warnings during translation".format(total)) - return protein_dict - -def init_logging(log_heading, logging_level): - """ - Initialize the logging environment and print a header to the log. 
- - :param log_heading: heading for the log - :return: nothing - """ - logging.basicConfig(format='%(levelname)s\t%(message)s', level=logging_level, stream=sys.stdout) - length = math.ceil((len(log_heading)/2)) - logging.info('- ' * length) - logging.info(log_heading) - logging.info(datetime.datetime.today().ctime()) - logging.info('- ' * length) - - - - -def compute_ks_from_ortho_gfs(species, species_fasta_file, top, top_gfs_filtered_with_rank, \ - rank_type, ks_tsv_path, max_gene_family_size, n_threads, paralog_dists_dir): - """ - Function to be called in the main Python script. - """ - logging.info(f" Estimate Ks from the top {top} GFs ({rank_type} ranking)") - - if not os.path.exists(ks_tsv_path): - - ks_paralogs(species, species_fasta_file, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, max_gene_family_size=max_gene_family_size, \ - base_dir=paralog_dists_dir, n_threads=n_threads) diff --git a/ksrates/reciprocal_retention/old_scripts/construct_ks_plot_ortho_gfs.py b/ksrates/reciprocal_retention/old_scripts/construct_ks_plot_ortho_gfs.py deleted file mode 100755 index 70a89f6..0000000 --- a/ksrates/reciprocal_retention/old_scripts/construct_ks_plot_ortho_gfs.py +++ /dev/null @@ -1,117 +0,0 @@ -from pandas import read_csv -from numpy import float64, arange -import matplotlib.pyplot as plt -import seaborn -import logging -import os -import sys -from numpy import zeros -import matplotlib -# Use the Agg (Anti-Grain Geometry) backend to avoid needing a graphical user interface (X11 backend) -matplotlib.use('Agg') - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -plt.style.use(os.path.join(f"{os.path.dirname(os.path.abspath(__file__))}", "ks.mplstyle")) - -def ks_list_from_tsv(tsv_file, max_ks): - """ - """ - with open(tsv_file, "r") as tsv_file: - tsv = read_csv(tsv_file, sep="\t") - - filtered_tsv = tsv.loc[(tsv["Ks"].dtypes == float64) & (tsv["Ks"] <= max_ks)] - - ks_list_filtered = filtered_tsv["Ks"].to_list() - weight_list_filtered = filtered_tsv["WeightOutliersExcluded"].to_list() - - return ks_list_filtered, weight_list_filtered - - -def generate_ks_plot(id, ids_names, top, rank_type, ks_list, weight_list, max_ks, bin_width, y_axis_lim, path_to_output_folder, ks_plot_filename): - """ - :param id: species id - :param ids_names: dictionary with scientific name associated to id - """ - # ks_per_species = read_csv(open(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_{id}_top{top}_{rank_type}.tsv"), "r"), sep="\t", header=None, index_col=0) - # weights_per_species = read_csv(open(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"weights_{id}_top{top}_{rank_type}.tsv"), "r"), sep="\t", header=None, index_col=0) - - # ks_list = ks_per_species.loc[id].tolist() - # weight_list = weights_per_species.loc[id].tolist() - fig, axis = generate_mixed_plot_figure(ids_names[id], max_ks, y_axis_lim, top, "corrected", False, False, top, rank_type, colinearity_data=False) - - bin_list = arange(0.0, max_ks + bin_width, bin_width) - hist = axis.hist(ks_list, bins=bin_list, weights=weight_list, histtype='stepfilled', color="0.4", - label="Reciprocally retained genes", density=False) - axis.legend() - fig.savefig(os.path.join(path_to_output_folder, f"{rank_type}_rank", ks_plot_filename), format="pdf") - plt.close(fig) - - -def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, top, corrected_or_not, correction_table_available, - plot_correction_arrows, top_gf, rank_type, paranome_data=True, colinearity_data=True): - """ - 
""" - species_escape_whitespaces = species.replace(' ', '\ ') - - # Increase figure size in height if extra space for the correction arrows is required and - # any divergence line will be actually plotted (thus, the "and" condition) - if correction_table_available: - if plot_correction_arrows: - # make the figure a bit taller to make room for the arrows - fig, ax = plt.subplots(1, 1, figsize=(14.0, 7.6)) - else: - fig, ax = plt.subplots(1, 1, figsize=(14.0, 7.0)) - else: - # if not correction_table_available a simpler less-wide layout with - # the legend inside the plot and no right margin is used - fig, ax = plt.subplots(1, 1, figsize=(10.0, 7.0)) - - fig.suptitle("$K_\mathregular{S}$" + f" distribution for ${species_escape_whitespaces}$") - - ax.set_title(f"Top {top} GFs from {rank_type} ranking") - - seaborn.despine(offset=10) - ax.set_xlabel("$K_\mathregular{S}$") - - ax.set_ylabel("Number of retained duplicates (weighted)") - - ax.set_xlim(0, x_max_lim) - if isinstance(y_max_lim, float): - y_max_lim = float(y_max_lim) - ax.set_ylim(0, y_max_lim) - plt.setp(ax.yaxis.get_majorticklabels(), rotation=90, verticalalignment='center') - if not correction_table_available: - # if not correction_table_available tighten the layout - # to reduce clipping - plt.tight_layout() - - return fig, ax - - -# ----------------------------------------------------------------------------- - - - -def construct_ks_plot_ortho_gfs(id, top, rank_type, ks_plot_filename, max_ks, bin_width, y_axis_lim, ids_names): - """ - """ - logging.info(f" Construct Ks distribution for the top {top} GFs ({rank_type} ranking)") - - path_to_output_folder = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "ks_distributions", id.lower(), "weighted_with_ortho_GFs") - if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank")): - os.makedirs(os.path.join(path_to_output_folder, f"{rank_type}_rank")) - - if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", ks_plot_filename)): - # Extract the Ks estimates from tsv file - ks_tsv = os.path.join("/", "home", "cesen", "wgm_predictor_midas", f"paralog_distributions", f"wgd_{id.lower()}", f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.ks.tsv") - # if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_{id}_top{top}_{rank_type}.tsv")) or \ - # not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"weights_{id}_top{top}_{rank_type}.tsv")) or \ - # not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_tsv_{id}_top{top}_{rank_type}.tsv")): - ks_list, weight_list = ks_list_from_tsv(ks_tsv, max_ks) - - # Build Ks plot - logging.info(" Plotting Ks distribution") - generate_ks_plot(id, ids_names, top, rank_type, ks_list, weight_list, max_ks, bin_width, y_axis_lim, path_to_output_folder, ks_plot_filename) - else: - logging.info(" Ks plot already present") \ No newline at end of file diff --git a/ksrates/reciprocal_retention/old_scripts/get_top5000_gfs_mcl_format.py b/ksrates/reciprocal_retention/old_scripts/get_top5000_gfs_mcl_format.py deleted file mode 100755 index e8242bc..0000000 --- a/ksrates/reciprocal_retention/old_scripts/get_top5000_gfs_mcl_format.py +++ /dev/null @@ -1,118 +0,0 @@ -import logging -from pandas import read_excel, read_csv, concat, DataFrame -import os -import sys -import matplotlib -# Use the Agg (Anti-Grain Geometry) backend to avoid needing a graphical user interface (X11 backend) -matplotlib.use('Agg') - 
-logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -def get_top_gf_genes(top, rank_type, path_for_top_gf_tsv): - """ - :param top: number of top GFs considered for building the Ks distribution - """ - - # Get the ranked gene families - gf_rank = read_excel(open("gene_families.xlsx", "rb"), "gene family rankings", header=None) - - # Get the genes associated to all gene families - gf_genes_all_raw = read_excel(open("gene_families.xlsx", "rb"), "orthogroup member genes", header=None) - - # Expand each gene in the GFs as an independent column entry - gf_genes_all = gf_genes_all_raw.iloc[:,:2] # First two columns (GF ID and number of species) - genes_columns = gf_genes_all_raw.iloc[:,2].str.split(",", expand=True) # Process third column (genes) - gf_genes_all = concat([gf_genes_all, genes_columns], axis=1, ignore_index=True) - - if rank_type == "combined": - # Get the top Gfs from the COMBINED RANKING (lambda & blocks percentage) - gf_top = gf_rank.iloc[:top,1] - elif rank_type == "lambda": - # Get the top GFs from the LAMBDA-BASED RANKING - gf_top = gf_rank.loc[gf_rank.iloc[:, 8] <= top,].sort_values(axis=0, by=[8]).iloc[:,1] - - # Get the genes associated to the top GFs in both rank types - gf_genes_top = DataFrame() - for ortho_gf in gf_top.values: - gf_genes_top = gf_genes_top.append(gf_genes_all.loc[gf_genes_all.iloc[:,0] == ortho_gf.strip()], ignore_index=True) - - # Save the top GF together with their genes in a TSV file - with open (os.path.join(path_for_top_gf_tsv, f"gf_top{top}_{rank_type}_rank_conserved_order.tsv"), "w+") as f: - f.write(gf_genes_top.to_csv(sep="\t", header=False, index=False)) - return gf_genes_top - - -def top_gfs_in_mcl_format(top, id, gf_genes_top, rank_type, path_to_output_folder, mcl_like_filename): - """ - """ - ortho_gf_gene_lists = [] - gf_top_per_species = {} - gf_top_per_species[id] = DataFrame() - - for i in range(top): - # Move to next GF if the current GF contains no genes for the current species - if len(gf_genes_top.iloc[i][gf_genes_top.iloc[i].str.contains(id).fillna(False)]) == 0: - ortho_gf_gene_lists.append("") - continue - - # Get the genes associated to the current top GF - gene_list = gf_genes_top.iloc[i][gf_genes_top.iloc[i].str.contains(id).fillna(False)].reset_index(drop=True).str.split("|", expand=True)[1].dropna().tolist() - ortho_gf_gene_lists.append(gene_list) - - # Write the genes in a file, one GF per row (MCL format) - # ortho_gf_gene_lists.sort(key=len, reverse=True) - with open(os.path.join(path_to_output_folder, mcl_like_filename), "w+") as f: - for genes_in_current_family in ortho_gf_gene_lists: - f.write('\t'.join(genes_in_current_family) + "\n") - - - -def get_top5000_gfs_mcl_format(id, top=5000): - """ - Function called by the main Python script - """ - - path_for_top_gf_tsv = ".." 
- - path_to_output_folder = os.path.join("..", "paralog_distributions", f"wgd_{id.lower()}") - if not os.path.exists(path_to_output_folder): - os.makedirs(path_to_output_folder) - - logging.info(f"Maximum number of gene families considered: top {top} GFs") - - for rank_type in ["combined", "lambda"]: - logging.info(f"- From {rank_type} ranking...") - - # Get the top XXXX GFs (ordered as in the current ranking) - logging.info(f" Getting the top {top} GFs") - if not os.path.exists(os.path.join("..", f"gf_top{top}_{rank_type}_rank_conserved_order.tsv")): - gf_genes_top = get_top_gf_genes(top, rank_type, path_for_top_gf_tsv) - else: - logging.info(f"- File gf_top{top}_{rank_type}_rank_conserved_order.tsv already exists") - gf_genes_top = read_csv(open(os.path.join("..", f"gf_top{top}_{rank_type}_rank_conserved_order.tsv"), "r"), sep="\t", header=None) - - logging.info("") - - # Generate MCL-like file with ORTHO GFs - logging.info(f" Generating the MCL-like file with top {top} ORTHO GFs") - mcl_like_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv" - if not os.path.exists(os.path.join(path_to_output_folder, mcl_like_filename)): - top_gfs_in_mcl_format(top, id, gf_genes_top, rank_type, path_to_output_folder, mcl_like_filename) - else: - logging.info(" (MCL-link file for ORTHO GFs already exists)") - - logging.info("") - - -def generate_mcl_like_file_for_new_species(id, top, rank_type, gf_list, assigned_gfs_file, mcl_like_filename): - """ - For the current new species, generate the mcl-like file for the gene families. - Each line of the file contains the genes belonging to a certain gene family, and the gene families - are sorted according to their reciprocally retention ranking (top 2000 gene families only are used). - """ - if not os.path.exists(mcl_like_filename): - mcl_like_output = [] - for gf in list(gf_list): - genes_assigned_to_current_gf = assigned_gfs_file.loc[assigned_gfs_file[1] == gf] - mcl_like_output.append(list(genes_assigned_to_current_gf.iloc[:, 0])) - DataFrame.from_records(mcl_like_output).to_csv(mcl_like_filename, sep="\t", header=False, index=False) diff --git a/ksrates/reciprocal_retention/old_scripts/ks.mplstyle b/ksrates/reciprocal_retention/old_scripts/ks.mplstyle deleted file mode 100755 index 9613ea1..0000000 --- a/ksrates/reciprocal_retention/old_scripts/ks.mplstyle +++ /dev/null @@ -1,18 +0,0 @@ -axes.titlesize : 16 -axes.labelsize : 19 -axes.linewidth : 1.2 -axes.labelpad : 14 -xtick.labelsize : 16 -xtick.major.size : 8 -xtick.major.width : 1.2 -xtick.major.pad : 8 -ytick.labelsize : 16 -ytick.major.size : 8 -ytick.major.width : 1.2 -ytick.major.pad : 8 -legend.frameon : False -legend.fontsize : 14 -figure.subplot.bottom : 0.15 -figure.subplot.left : 0.09 -figure.titlesize : 20 -savefig.pad_inches : 0.15 \ No newline at end of file diff --git a/ksrates/reciprocal_retention/old_scripts/rec_retained_ks_plot_pipeline.py b/ksrates/reciprocal_retention/old_scripts/rec_retained_ks_plot_pipeline.py deleted file mode 100755 index a77ef9c..0000000 --- a/ksrates/reciprocal_retention/old_scripts/rec_retained_ks_plot_pipeline.py +++ /dev/null @@ -1,149 +0,0 @@ -from get_top5000_gfs_mcl_format import get_top5000_gfs_mcl_format, generate_mcl_like_file_for_new_species -from compute_ks_from_ortho_gfs import compute_ks_from_ortho_gfs -from construct_ks_plot_ortho_gfs import construct_ks_plot_ortho_gfs -import sys -import os -import logging -from pandas import read_csv - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - 
-logging.info("-----------------------------------------------") -logging.info("Ks distributions with reciprocally retained GFs") -logging.info("-----------------------------------------------") - -test_run = False - -if test_run: - # Species IDs - all_species_ids = ['Alyr', 'Atha', 'Atri', 'Brap', 'Cari', 'Ccaj', 'Clan', 'Cmel', \ - 'Cpap', 'Crub', 'Csat', 'Fves', 'Gmax', 'Grai', 'Hvul', 'Jcur', 'Ljap', 'Lusi', \ - 'Macu', 'Mesc', 'Mtru', 'Osat', 'Pbre', 'Pdac', 'Pmum', 'Pper', 'Ptri', 'Rcom', \ - 'Sbic', 'Sita', 'Slyc', 'Stub', 'Tcac', 'Tpar', 'Vvin', 'Zmay'] - - ids_names = {'Alyr':'Arabidopsis lyrata', 'Atha': 'Arabidopsis thaliana', 'Atri':'Amborella trichopoda', \ - 'Bdis':'Brachypodium distachyon', 'Brap':'Brassica rapa', 'Cari':'Cicer arietinum', 'Ccaj':'Cajanus cajan', \ - 'Clan':'Citrullus lanatus', 'Cpap':'Carica papaya', 'Crub':'Capsella rubella', 'Csat':'Cucumis sativus', \ - 'Fves': 'Fragaria vesca', 'Gmax':'Glycine maxima', 'Grai':'Gossypium raimondii', 'Jcur':'Jatropha curcas', \ - 'Ljap':'Lotus japonicus', 'Lusi':'Linum usitatissiumum', 'Macu':'Musa acuminata', 'Mesc': 'Manihot esculenta', \ - 'Mtru':'Medicago truncatula', 'Osat': 'Oryza sativa', 'Pbre':'Pyrus bretschneideri', 'Pdac':'Phoenix dactylifera', \ - 'Pmum':'Prunus mume', 'Pper':'Prunus persica', 'Ptri':'Populus trichocarpa', 'Rcom':'Ricinus communis', \ - 'Sbic':'Sorghum bicolor', 'Sita':'Setaria italica', 'Slyc':'Solanum lycopersicum', 'Stub':'Solanum tuberosum', \ - 'Tcac':'Theobroma cacao', 'Tpar':'Thellungiella parvula', 'Vvin': 'Vitis vinifera', 'Zmay':'Zea mays'} - - id_list = ["Atha", "Pdac", "Vvin"] - -else: - all_species_ids = ["CMFF", "GEHT", "HJMP", "JETM", "JTQQ", "KEGA", "KNMB", "KZED", "MYMP", \ - "NXOH", "OHAE", "OQHZ", "PEZP", "PJSX", "QZXQ", "RKFX", "RKLL", "RMWJ", \ - "SLYR", "SUAK", "TTRG", "TVSH", "VHZV", "VLNB", "XOOE", "ZCDJ"] - - ids_names = { - "RMWJ" : "Wisteria floribunda", "CMFF" : "Lupinus polyphyllus", "GEHT" : "Gleditsia triacanthos", \ - "HJMP" : "Astragalus membranaceus", "JETM" : "Bauhinia tomentosa", "JTQQ" : "Glycyrrhiza lepidota", \ - "KEGA" : "Glycine soja", "KNMB" : "Lathyrus sativus", "KZED" : "Senna hebecarpa", \ - "MYMP" : "Astragalus propinquus", "NXOH" : "Apios americana", "OHAE" : "Polygala lutea", \ - "OQHZ" : "Quillaja saponaria", "PEZP" : "Glycyrrhiza glabra", "PJSX" : "Acacia pycnantha", \ - "QZXQ" : "Gymnocladus dioicus", "RKFX" : "Cercis canadensis", "RKLL" : "Copaifera officinalis", \ - "RMWJ" : "Wisteria floribunda", "SLYR" : "Cladrastis lutea", "SUAK" : "Codariocalyx motorius", \ - "TTRG" : "Lupinus angustifolius", "TVSH" : "Bituminaria bituminosa", "VHZV" : "Gleditsia sinensis", \ - "VLNB" : "Gompholobium polymorphum", "XOOE" : "Desmanthus illinoensis", "ZCDJ" : "Acacia argyrophylla"} - - id_list = [sys.argv[1].upper()] - -# ANALYSIS SETTINGS - -top = 2000 -rank_type = "lambda" - -filter_by_false_predictions = False -filter_by_zero_false_predictions = False - -basefolder_sequence_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "sequences") -basefolder_db_and_files = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "databases_and_gf_files") - -pident_threshold = 40 - -# PARAMETERS -y_axis_lim = float(300) -n_threads = 8 - -max_ks = 5 -bin_width = 0.1 -max_gene_family_size = 200 -paralog_dists_dir = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", "") - -for id in id_list: - logging.info(f"Analyzing species: {ids_names[id]}\n") - - basefolder_output_files = os.path.join("/", "home", "cesen", 
"wgm_predictor_midas", "fabales_1kp", id, rank_type) - - path_to_parlogs_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", f"wgd_{id.lower()}") - if not os.path.exists(path_to_parlogs_data): - os.makedirs(path_to_parlogs_data) - - # for top in [2000]: # [50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000]: - if not filter_by_false_predictions and not filter_by_zero_false_predictions: - logging.info(f"Analyzing the top {top} GFs ({rank_type} rank):") - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv") - gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() - assigned_gfs_file = read_csv(os.path.join(basefolder_output_files, f"assigned_gfs_{id}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv"), sep="\t", header=None) - - elif filter_by_false_predictions: - logging.info(f"Analyzing the top {top} GFs ({rank_type} rank) after filtering away those with false predictions >= half of the true positives") - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") - gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() - - elif filter_by_zero_false_predictions: - logging.info(f"Analyzing the top {top} GFs ({rank_type} rank) after filtering away those with at least one false prediction") - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") - gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() - - # GENERATE OR OPEN MCL-LIKE FILE - logging.info("Generating mcl-like output per gene family...") - if test_run: - # Get the top 5000 GFs (maximum number of GFs considered) - get_top5000_gfs_mcl_format(id) - else: - mcl_like_filename = os.path.join(path_to_parlogs_data, f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv") - generate_mcl_like_file_for_new_species(id, top, rank_type, gf_list, assigned_gfs_file, mcl_like_filename) - - logging.info("") - - # for rank_type in ["lambda"]: # ["combined", "lambda"]: - logging.info(f"- From {rank_type} ranking...") - - # Estimate Ks from the top XX GFs (current rank type)") - if test_run: - species_fasta_file = os.path.join(basefolder_sequence_data, "old_genome_versions", f"{id.lower()}.fasta") - else: - species_fasta_file = os.path.join(basefolder_sequence_data, "new_species_data", f"{id.upper()}.fna") - - - - if not filter_by_false_predictions and not filter_by_zero_false_predictions: - ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.ks.tsv" - - elif filter_by_false_predictions: - ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank_filtered_false_predictions.ks.tsv" - - elif filter_by_zero_false_predictions: - ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank_filtered_zero_false_predictions.ks.tsv" - ks_tsv_path = os.path.join(path_to_parlogs_data, f"wgd_{id.lower()}", ks_tsv_filename) - compute_ks_from_ortho_gfs(id, species_fasta_file, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, max_gene_family_size, n_threads, paralog_dists_dir) - logging.info("") - - - - # Construct Ks distribution for the top XX GFs (current rank type) - if not 
filter_by_false_predictions and not filter_by_zero_false_predictions: - ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}.pdf" - - elif filter_by_false_predictions: - ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}_filtered_false_predictions.pdf" - - elif filter_by_zero_false_predictions: - ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}_filtered_zero_false_predictions.pdf" - construct_ks_plot_ortho_gfs(id, top, rank_type, ks_plot_filename, max_ks, bin_width, y_axis_lim, ids_names) - logging.info("") - logging.info("Done") -- GitLab From 9832707e64f6ac641f4bccc064da61be80e763f5 Mon Sep 17 00:00:00 2001 From: cesen Date: Thu, 23 Dec 2021 13:53:10 +0100 Subject: [PATCH 22/26] Use "recret" in file names instead of long version --- ksrates/fc_lognormal_mixture.py | 4 ++-- ksrates/lognormal_mixture.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ksrates/fc_lognormal_mixture.py b/ksrates/fc_lognormal_mixture.py index e7bd9bd..e779833 100644 --- a/ksrates/fc_lognormal_mixture.py +++ b/ksrates/fc_lognormal_mixture.py @@ -271,7 +271,7 @@ def lmm( :param output_dir: output folder :param outfile: output file for logging details :param parameter_table: list collecting the component parameters - :param datatype_tag: string for figure title stating if data is paranome or anchors or reciprocally retained + :param datatype_tag: string for figure title stating if data is paranome or anchors or reciprocally retained ("recret") :param peak_stats: states whether the cluster peak is intended as median or mode :param correction_table_available: boolean stating whether the adjustment results are available or not :param plot_correction_arrows: boolean stating whether there will be plotted adjustment arrows or not @@ -321,7 +321,7 @@ def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, data paralog_rect = Patch(facecolor=fcPlot.COLOR_PARANOME_HISTOGRAM, edgecolor="w") elif datatype == "anchors": paralog_rect = Patch(facecolor=fcPlot.COLOR_ANCHOR_HISTOGRAM, edgecolor="w") - elif datatype == "reciprocally_retained": + elif datatype == "recret": paralog_rect = Patch(facecolor=fcPlot.COLOR_REC_RET_HISTOGRAM, edgecolor="w") # empty patch used as spacer between histograms and divergence line legend entries empty_rect = Patch(fill=False, edgecolor='none', visible=False) diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 0cbc092..625664a 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -183,7 +183,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if reciprocal_retention_analysis: parameter_table = [] - with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_reciprocally_retained.txt"), "w+") as outfile: + with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_recret.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on reciprocally retained paralog Ks distribution") rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", axis_rec_ret, rec_ret_list, bin_list, @@ -195,10 +195,10 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re best_model_rec_ret, letter_to_peak_dict_recret = fcLMM.lmm( fig_rec_ret, x_max_lim, "reciprocally retained", rec_ret_tsv_file, species, axis_rec_ret, (0, 
max_ks_EM), (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, - output_dir, outfile, parameter_table, "reciprocally_retained", peak_stats, correction_table_available, plot_correction_arrows) + output_dir, outfile, parameter_table, "recret", peak_stats, correction_table_available, plot_correction_arrows) # Generating tabular text file with all model parameters - fcLMM.make_parameter_table_file(parameter_table, species, "reciprocally_retained") + fcLMM.make_parameter_table_file(parameter_table, species, "recret") if correction_table_available: # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks @@ -221,8 +221,8 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re fcLMM.save_lmm(fig_colin, axis_colin, species, best_model_anchors, "anchors", correction_table_available) logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_anchors.pdf]") if reciprocal_retention_analysis: - fcLMM.save_lmm(fig_rec_ret, axis_rec_ret, species, best_model_rec_ret, "reciprocally_retained", correction_table_available) - logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_reciprocally_retained.pdf]") + fcLMM.save_lmm(fig_rec_ret, axis_rec_ret, species, best_model_rec_ret, "recret", correction_table_available) + logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_recret.pdf]") logging.info("") logging.info("All done") \ No newline at end of file -- GitLab From 6676d031cd95bd756e3f4208666f09c8e65a44fa Mon Sep 17 00:00:00 2001 From: cesen Date: Thu, 23 Dec 2021 13:53:40 +0100 Subject: [PATCH 23/26] Filenames specify whether it's lambda or combined --- ksrates/plot_paralogs.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index 1e52568..63d5a40 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -301,8 +301,8 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an if reciprocal_retention_analysis: fcPlot.save_mixed_plot(fig_corr_rr, fig_uncorr_rr, ax_corr_rr, ax_uncorr_rr, species, correction_table_available, paranome=False, colinearity=False, reciprocal_retention=reciprocal_retention_analysis, - filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, "recret"), - filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "recret")) + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"recret_{rank_type}"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"recret_{rank_type}")) if paranome_analysis and colinearity_analysis: fcPlot.save_mixed_plot(fig_corr_para_col, fig_uncorr_para_col, ax_corr_para_col, ax_uncorr_para_col, species, correction_table_available, @@ -313,20 +313,20 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an if paranome_analysis and reciprocal_retention_analysis: fcPlot.save_mixed_plot(fig_corr_para_rr, fig_uncorr_para_rr, ax_corr_para_rr, ax_uncorr_para_rr, species, correction_table_available, paranome=paranome_analysis, colinearity=False, reciprocal_retention=reciprocal_retention_analysis, - filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome_recret")), - filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species,
"paranome_recret"))) + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"paranome_recret_{rank_type}")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"paranome_recret_{rank_type}"))) if colinearity_analysis and reciprocal_retention_analysis: fcPlot.save_mixed_plot(fig_corr_col_rr, fig_uncorr_col_rr, ax_corr_col_rr, ax_uncorr_col_rr, species, correction_table_available, paranome=False, colinearity=reciprocal_retention_analysis, reciprocal_retention=reciprocal_retention_analysis, - filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "anchors_recret")), - filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "anchors_recret"))) + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"anchors_recret_{rank_type}")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"anchors_recret_{rank_type}"))) if paranome_analysis and colinearity_analysis and reciprocal_retention_analysis: fcPlot.save_mixed_plot(fig_corr_para_col_rr, fig_uncorr_para_col_rr, ax_corr_para_col_rr, ax_uncorr_para_col_rr, species, correction_table_available, paranome=paranome_analysis, colinearity=reciprocal_retention_analysis, reciprocal_retention=reciprocal_retention_analysis, - filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors_recret")), - filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors_recret"))) + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"paranome_anchors_recret_{rank_type}")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"paranome_anchors_recret_{rank_type}"))) logging.info("") logging.info("All done") -- GitLab From ef230d1cffb04e56d0560d7b4db83508420bbcd1 Mon Sep 17 00:00:00 2001 From: cesen Date: Thu, 23 Dec 2021 13:54:03 +0100 Subject: [PATCH 24/26] Conform filename for recret mcl TSV file --- ksrates/fc_wgd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index 677f1a6..6938c5e 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -15,7 +15,7 @@ from ksrates.utils import merge_dicts, concat_files, can_i_run_software, transla _OUTPUT_BLAST_FILE_PATTERN = '{}.blast.tsv' _OUTPUT_MCL_FILE_PATTERN = '{}.mcl.tsv' _OUTPUT_KS_FILE_PATTERN = '{}.ks.tsv' -_OUTPUT_MCL_REC_RET_FILE_PATTERN = '{}_rec_ret_top{}_{}_rank.mcl.tsv' +_OUTPUT_MCL_REC_RET_FILE_PATTERN = '{}_rec_ret_top_{}_{}.mcl.tsv' _OUTPUT_KS_REC_RET_FILE_PATTERN = '{}.ks_rec_ret_top_{}_{}.tsv' _PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' -- GitLab From 53fc48cd427de8b347f8d386c88aaf4ccd932209 Mon Sep 17 00:00:00 2001 From: cesen Date: Fri, 24 Dec 2021 13:26:51 +0100 Subject: [PATCH 25/26] Fix fig title; rearrange arguments --- ksrates/cluster_anchor_ks.py | 8 ++++---- ksrates/fc_exp_log_mixture.py | 2 +- ksrates/fc_plotting.py | 4 ++-- ksrates/lognormal_mixture.py | 9 +++------ ksrates/plot_paralogs.py | 34 ++++++++++++++-------------------- 5 files changed, 24 insertions(+), 33 deletions(-) diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index 5e89271..679e5e0 100644 --- 
a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -291,10 +291,10 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Generate the plot for the mixed distribution with clusters of Ks fig_corr_first, ax_corr_first = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) fig_uncorr_first, ax_uncorr_first = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) # Plot the original complete anchor distribution in the background fcPlot.plot_histogram_for_anchor_clustering(ax_corr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) @@ -367,10 +367,10 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Generate the plot for the mixed distribution with clusters of Ks fig_corr_second, ax_corr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) fig_uncorr_second, ax_uncorr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) # Plot the original complete anchor distribution in the background fcPlot.plot_histogram_for_anchor_clustering(ax_corr_second, anchor_ks_list, anchors_weights, bin_list, y_max_lim) diff --git a/ksrates/fc_exp_log_mixture.py b/ksrates/fc_exp_log_mixture.py index 12f6f51..d4b1968 100644 --- a/ksrates/fc_exp_log_mixture.py +++ b/ksrates/fc_exp_log_mixture.py @@ -86,7 +86,7 @@ def generate_best_model_figure(latin_species, x_max_lim, y_max_lim, correction_t :return figure, axes and figure title """ fig_best_model, ax_best_ks = fcPlot.generate_mixed_plot_figure(latin_species, x_max_lim, y_max_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=True, colinearity_data=False) + correction_table_available, plot_correction_arrows, paranome_data=True) return fig_best_model, ax_best_ks diff --git a/ksrates/fc_plotting.py b/ksrates/fc_plotting.py index 8d2f5ed..a792e03 100755 --- a/ksrates/fc_plotting.py +++ b/ksrates/fc_plotting.py @@ -55,8 +55,8 @@ _OTHER_MIXED_PLOTS_SUBDIR = "other_mixed_plots" def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not, correction_table_available, - plot_correction_arrows, paranome_data=True, colinearity_data=True, - reciprocal_retention_data=True, top=None, rank_type=None): + plot_correction_arrows, paranome_data=False, colinearity_data=False, + reciprocal_retention_data=False, top=None, rank_type=None): """ Initializes a figure with a single empty plot for the mixed distribution. 
diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 625664a..6bd0782 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -112,14 +112,11 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re sys.exit(1) fig_para, axis_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=True, colinearity_data=False, - reciprocal_retention_data=False) + correction_table_available, plot_correction_arrows, paranome_data=True) fig_colin, axis_colin = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=True, - reciprocal_retention_data=False) + correction_table_available, plot_correction_arrows, colinearity_data=True) fig_rec_ret, axis_rec_ret = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=False, - reciprocal_retention_data=True) + correction_table_available, plot_correction_arrows, reciprocal_retention_data=True, top=top, rank_type=rank_type) if paranome_analysis: parameter_table = [] diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index 63d5a40..ec78134 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -110,12 +110,10 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an logging.info(f"Plotting paranome Ks distribution for species [{species}]") fig_uncorr_para, ax_uncorr_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=False, - reciprocal_retention_data=False, top=top, rank_type=rank_type) + paranome_data=paranome_analysis, top=top, rank_type=rank_type) fig_corr_para, ax_corr_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=False, - reciprocal_retention_data=False, top=top, rank_type=rank_type) + paranome_data=paranome_analysis, top=top, rank_type=rank_type) ax_uncorr_include_para.append(ax_uncorr_para) ax_corr_include_para.append(ax_corr_para) ax_uncorr_list.append(ax_uncorr_para) @@ -125,12 +123,10 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an logging.info(f"Plotting anchor pair Ks distribution for species [{species}]") fig_uncorr_col, ax_uncorr_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=colinearity_analysis, - reciprocal_retention_data=False, top=top, rank_type=rank_type) + colinearity_data=colinearity_analysis, top=top, rank_type=rank_type) fig_corr_col, ax_corr_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=colinearity_analysis, - reciprocal_retention_data=False, top=top, rank_type=rank_type) + colinearity_data=colinearity_analysis, top=top, rank_type=rank_type) ax_uncorr_include_col.append(ax_uncorr_col) ax_corr_include_col.append(ax_corr_col) 
ax_uncorr_list.append(ax_uncorr_col) @@ -140,11 +136,9 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an logging.info(f"Plotting reciprocally retained paralog Ks distribution for species [{species}]") fig_uncorr_rr, ax_uncorr_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=False, reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) fig_corr_rr, ax_corr_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=False, reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) ax_uncorr_include_rr.append(ax_uncorr_rr) ax_corr_include_rr.append(ax_corr_rr) @@ -157,11 +151,11 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an fig_uncorr_para_col, ax_uncorr_para_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "un-corrected", correction_table_available, plot_correction_arrows, paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, - reciprocal_retention_data=False, top=top, rank_type=rank_type) + top=top, rank_type=rank_type) fig_corr_para_col, ax_corr_para_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", correction_table_available, plot_correction_arrows, paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, - reciprocal_retention_data=False, top=top, rank_type=rank_type) + top=top, rank_type=rank_type) ax_uncorr_include_para.append(ax_uncorr_para_col) ax_corr_include_para.append(ax_corr_para_col) ax_uncorr_include_col.append(ax_uncorr_para_col) @@ -173,12 +167,12 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an logging.info(f"Plotting paranome and reciprocally retained paralog Ks distributions for species [{species}]") fig_uncorr_para_rr, ax_uncorr_para_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=False, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + paranome_data=paranome_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) fig_corr_para_rr, ax_corr_para_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=False, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + paranome_data=paranome_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) ax_uncorr_include_para.append(ax_uncorr_para_rr) ax_corr_include_para.append(ax_corr_para_rr) ax_uncorr_include_rr.append(ax_uncorr_para_rr) @@ -190,12 +184,12 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an logging.info(f"Plotting anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") fig_uncorr_col_rr, ax_uncorr_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "un-corrected", correction_table_available, plot_correction_arrows, - 
paranome_data=False, colinearity_data=colinearity_analysis, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + colinearity_data=colinearity_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) fig_corr_col_rr, ax_corr_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=colinearity_analysis, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + colinearity_data=colinearity_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) ax_uncorr_include_col.append(ax_uncorr_col_rr) ax_corr_include_col.append(ax_corr_col_rr) ax_uncorr_include_rr.append(ax_uncorr_col_rr) -- GitLab From 6a6eceec62091802f3cd83d89ac2448c36018d04 Mon Sep 17 00:00:00 2001 From: cesen Date: Fri, 24 Dec 2021 13:40:24 +0100 Subject: [PATCH 26/26] LMM output files for recret display also rank type --- ksrates/fc_lognormal_mixture.py | 8 ++++---- ksrates/lognormal_mixture.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ksrates/fc_lognormal_mixture.py b/ksrates/fc_lognormal_mixture.py index e779833..5517c6d 100644 --- a/ksrates/fc_lognormal_mixture.py +++ b/ksrates/fc_lognormal_mixture.py @@ -312,7 +312,7 @@ def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, data :param axis: matplotlib axis object from which the legend is taken :param legend_size: size of the legend box as a tuple of format "(x, y, width, height)" :param num_mixture_model_lines: number of lines generated by mixture models (components + total PDF) - :param datatype: string for figure title stating if data is "paranome" or comes from "colinearity" analysis + :param datatype: string stating if data is paranome, anchors or reciprocally retained paralogs (lambda or combined ranks) :return: the updated legend object """ handles, labels = axis.get_legend_handles_labels() @@ -321,7 +321,7 @@ def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, data paralog_rect = Patch(facecolor=fcPlot.COLOR_PARANOME_HISTOGRAM, edgecolor="w") elif datatype == "anchors": paralog_rect = Patch(facecolor=fcPlot.COLOR_ANCHOR_HISTOGRAM, edgecolor="w") - elif datatype == "recret": + elif datatype == "recret" or datatype == "recret_lambda" or datatype == "recret_combined": paralog_rect = Patch(facecolor=fcPlot.COLOR_REC_RET_HISTOGRAM, edgecolor="w") # empty patch used as spacer between histograms and divergence line legend entries empty_rect = Patch(fill=False, edgecolor='none', visible=False) @@ -343,7 +343,7 @@ def save_lmm(fig, axis, species, best_model, datatype, correction_table_availabl :param fig: figure object of the adjusted mixed distribution :param axis: axis object of the adjusted mixed distribution :param species: focal species - :param datatype: string for figure title stating whether data is paranome or comes from "colinearity" analysis + :param datatype: string for filename stating whether data is paranome, anchors or reciprocally retained paralogs """ if correction_table_available: num_mixture_model_lines = len(best_model.means_) + 1 # components + total PDF @@ -370,7 +370,7 @@ def make_parameter_table_file(parameter_table, species, datatype): :param parameter_table: list collecting the component parameters :param species: informal name of the focal species - :param datatype: string for figure title 
stating whether data is paranome or comes from "colinearity" analysis + :param datatype: string for filename stating whether data is paranome, anchors or reciprocally retained paralogs """ #headers = ["Model", "Iteration", "BIC", "Loglikelihood", "Convergence", # "Exponential_Rate", "Exponential_Weight", "Normal_Mean", "Normal_SD", "Normal_Weight"] diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 6bd0782..1e42010 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -180,7 +180,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re if reciprocal_retention_analysis: parameter_table = [] - with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_recret.txt"), "w+") as outfile: + with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_recret_{rank_type}.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on reciprocally retained paralog Ks distribution") rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", axis_rec_ret, rec_ret_list, bin_list, @@ -195,7 +195,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re output_dir, outfile, parameter_table, "recret", peak_stats, correction_table_available, plot_correction_arrows) # Generating tabular text file with all model parameters - fcLMM.make_parameter_table_file(parameter_table, species, "recret") + fcLMM.make_parameter_table_file(parameter_table, species, f"recret_{rank_type}") if correction_table_available: # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks @@ -218,8 +218,8 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_re fcLMM.save_lmm(fig_colin, axis_colin, species, best_model_anchors, "anchors", correction_table_available) logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_anchors.pdf]") if reciprocal_retention_analysis: - fcLMM.save_lmm(fig_rec_ret, axis_rec_ret, species, best_model_rec_ret, "recret", correction_table_available) - logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_recret.pdf]") + fcLMM.save_lmm(fig_rec_ret, axis_rec_ret, species, best_model_rec_ret, f"recret_{rank_type}", correction_table_available) + logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_recret_{rank_type}.pdf]") logging.info("") logging.info("All done") \ No newline at end of file -- GitLab
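
For reference, the filename convention that patches 22 to 26 converge on (the short "recret" tag for reciprocally retained paralog data plus a "lambda" or "combined" rank-type suffix) can be summarized with a minimal, self-contained Python sketch. The sketch is illustrative only and is not part of the patch series or of the ksrates code base: the helper name recret_lmm_filenames, its out_dir parameter and the placeholder species tag are invented for this example, while the two filename patterns are copied from the diffs above.

import os

def recret_lmm_filenames(species, rank_type, out_dir="."):
    # Illustrative helper (not ksrates code): compose the lognormal mixture model
    # output names used for reciprocally retained ("recret") paralog data, with the
    # rank type embedded in the filename as introduced in patches 23 and 26.
    if rank_type not in ("lambda", "combined"):
        raise ValueError('rank_type must be "lambda" or "combined"')
    parameters_txt = os.path.join(out_dir, f"lmm_{species}_parameters_recret_{rank_type}.txt")
    mixed_plot_pdf = os.path.join(out_dir, f"mixed_{species}_lmm_recret_{rank_type}.pdf")
    return parameters_txt, mixed_plot_pdf

# Example usage with a placeholder species tag; on a POSIX system this prints:
# ('./lmm_myspecies_parameters_recret_lambda.txt', './mixed_myspecies_lmm_recret_lambda.pdf')
print(recret_lmm_filenames("myspecies", "lambda"))

Embedding the rank type in these names keeps the output of a lambda-ranked run separate from that of a combined-ranked run of the same species, so the two cannot overwrite each other.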