diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index 3cf5eafb840a527d88cb7c183d570c203fb1f4b0..36181b78266dddea1184cb99142723ca8d2bc00e 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -14,7 +14,7 @@ import ksrates.fc_check_input as fcCheck import ksrates.fc_extract_ks_list as fc_extract_ks_list from ksrates.utils import init_logging from ksrates.fc_cluster_anchors import subfolder -from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE, interpretation_adjusted_plot def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, path_multiplicons_txt, path_segments_txt, path_list_elements_txt, path_ks_anchor_file, path_multiplicon_pair_txt): config = fcConf.Configuration(config_file) @@ -23,7 +23,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, colinearity_analysis = config.get_colinearity() if not colinearity_analysis: - logging.warning("Colinearity non required in configuration file; the anchor Ks clustering will be skipped.") + logging.warning("Collinearity non required in configuration file; the anchor Ks clustering will be skipped.") sys.exit(0) # exit code 0 because no actual errors were thrown species = config.get_species() @@ -41,7 +41,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, num_EM_initializations = config.get_num_EM_initializations() # how many times the fitting with N given components is initialized logging.info(f" - maximum EM iterations: {max_EM_iterations}") logging.info(f" - number of EM initializations: {num_EM_initializations}") - + logging.info("") # Getting the statistical measure for how to determine the representative value of an ortholog distribution peak_stats = config.get_peak_stats() # default is mode (other option, median) @@ -53,6 +53,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only anchor pair distribution will be plotted.") + logging.info("") correction_table_available = False else: with open(correction_table_file, "r") as f: @@ -290,17 +291,17 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Generate the plot for the mixed distribution with clusters of Ks fig_corr_first, ax_corr_first = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) fig_uncorr_first, ax_uncorr_first = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) # Plot the original complete anchor distribution in the background fcPlot.plot_histogram_for_anchor_clustering(ax_corr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, + clusters_sorted_by_median, cluster_color_letter_list, letter_to_x_coord_dict, cluster_peak_heights = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) fcCluster.plot_clusters(ax_uncorr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) @@ -325,6 +326,10 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, if updated_n_clusters == n_clusters: logging.info("All clusters were retained") + if correction_table_available: + interpretation_adjusted_plot(latin_names[species], consensus_peak_for_multiple_outgroups, + letter_to_x_coord_dict, correction_table) + # Saving the first plot with the final name (leaving away "unfiltered") logging.info(f"Saving mixed Ks plot with anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, @@ -352,7 +357,6 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, clean_medians_list = array(clean_medians_list) # Clustering with GaussianMixtureModel, k-means or lognormal mixture modeling - logging.info("") logging.info(f"Performing a second Ks clustering round with {updated_n_clusters} clusters through {clustering_method} for the remaining Ks values") if clustering_method == "Gaussian mixture modeling": gmm_clustered_medians2 = fcCluster.gmm(clean_medians_list, updated_n_clusters, max_EM_iterations, num_EM_initializations) @@ -368,24 +372,31 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Generate the plot for the mixed distribution with clusters of Ks fig_corr_second, ax_corr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) fig_uncorr_second, ax_uncorr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_max_lim, "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=False, colinearity_data=True) + colinearity_data=True) # Plot the original complete anchor distribution in the background fcPlot.plot_histogram_for_anchor_clustering(ax_corr_second, anchor_ks_list, anchors_weights, bin_list, y_max_lim) fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_second, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - fcCluster.plot_clusters(ax_corr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) - fcCluster.plot_clusters(ax_uncorr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) + __, __, letter_to_x_coord_dict, __ = fcCluster.plot_clusters(ax_corr_second, + filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, + correction_table_available, plot_correction_arrows) + fcCluster.plot_clusters(ax_uncorr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, + correction_table_available, plot_correction_arrows) # Plotting the ortholog peaks coming from the adjustment, if available if correction_table_available: logging.info("Plotting divergence lines") fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr_second, ax_corr_second, color_list, plot_correction_arrows) + + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latin_names[species], consensus_peak_for_multiple_outgroups, + letter_to_x_coord_dict, correction_table) logging.info(f"Saving mixed Ks plot with filtered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") logging.info("") diff --git a/ksrates/exp_log_mixture.py b/ksrates/exp_log_mixture.py index 8a48c58bb814a454bc08192b6a9aeb0ae83140bc..18bdbe04243b516bb9521bb8cd11dcdb72bab66a 100644 --- a/ksrates/exp_log_mixture.py +++ b/ksrates/exp_log_mixture.py @@ -12,7 +12,7 @@ import ksrates.fc_extract_ks_list as fc_extract_ks_list from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM import ksrates.fc_exp_log_mixture as fcEM from ksrates.fc_cluster_anchors import subfolder -from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE, interpretation_adjusted_plot def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): @@ -77,8 +77,9 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only Ks paranome distribution will be plotted.") - correction_table = None + logging.info("") correction_table_available = False + correction_table = None else: with open(correction_table_file, "r") as f: correction_table = read_csv(f, sep="\t") @@ -136,7 +137,8 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): reduced_gaussians_flag=reduced_gaussians, EM_data=True) all_models_fitted_parameters[model_id] = [means_peaks, stdevs_peaks, lambd_peaks, weights_peaks] bic_dict[model_id] = bic_peaks - fcEM.plot_fitted_comp(ax_peaks_ks, ax_peaks_logks, means_peaks, stdevs_peaks, lambd_peaks, weights_peaks, x_max_lim, peak_stats, correction_table_available, plot_correction_arrows) + fcEM.plot_fitted_comp(ax_peaks_ks, ax_peaks_logks, means_peaks, stdevs_peaks, lambd_peaks, + weights_peaks, x_max_lim, peak_stats, correction_table_available, plot_correction_arrows) # ----------------------------------------------------------------- @@ -248,10 +250,16 @@ def exp_log_mixture(config_file, paralog_tsv_file, correction_table_file): logging.info("Models are evaluated according to their BIC score.") # Get best model by lowest BIC score and plot it; print comparison with the other models best_model_id = fcEM.eval_best_model(bic_dict, outfile) - fcEM.plot_best_model(fig_best_model, ax_best_ks, species, ks_data, ks_weights, bin_list, bin_width, x_max_lim, + letter_to_peak_dict = fcEM.plot_best_model(fig_best_model, ax_best_ks, species, ks_data, ks_weights, bin_list, bin_width, x_max_lim, y_max_lim, best_model_id, all_models_init_parameters, all_models_fitted_parameters, correction_table, correction_table_available, consensus_peak_for_multiple_outgroups, peak_stats, color_list, plot_correction_arrows, deconvoluted_data, max_ks_EM) + + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict, correction_table) + logging.info(f"Saving PDF figure of best mixture model [mixed_{species}_elmm.pdf]") logging.info("") logging.info("All done") \ No newline at end of file diff --git a/ksrates/extend_orthogroups.py b/ksrates/extend_orthogroups.py index dbca999293b95e43f749ada44398874eae8d8af9..78757c668c8e4c64ce63e67aa2e68a074d597276 100755 --- a/ksrates/extend_orthogroups.py +++ b/ksrates/extend_orthogroups.py @@ -9,6 +9,40 @@ import ksrates.fc_reciprocal_retention as fc_rec_ret set_option("display.max_rows", 20, "display.max_columns", 20) + +def setup_for_extend_orthogroups(top, rank_type, filter_false_predictions, filter_zero_false_predictions): + """ + Generates files needed for extend_orthogroups.py: + - gf_genes_top_2000_rank.tsv + - gf_genes_list_top_2000_rank.tsv + - gf_genes_with_gf_list_top_2000_rank.tsv + - orthogroup nucl database + - orthogroup amino database + + Generates files needed for fc_wgd.ks_paralogs_rec_ret: + - gf_list_top_2000_lambda.tsv + + """ + # Path where output files will be generated or read + ksrates_rec_ret_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reciprocal_retention") + + if not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}.tsv")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}.tsv")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta")) \ + or not os.path.exists(os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta")): + + logging.info("Generating required files for reciprocal retention analysis...") + # Generate or read input files required to perform RR analysis + # (files listing gene family IDs and gene members, orthogroup databases) + fc_rec_ret.make_gene_family_files_and_databases(top, rank_type, filter_false_predictions, + filter_zero_false_predictions, ksrates_rec_ret_dir) + else: + logging.info("All required files for reciprocal retention analysis exist") + return + + + def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_threshold, min_qcov_threshold, filter_false_predictions, filter_zero_false_predictions, base_dir='.', n_threads=4): """ @@ -29,7 +63,7 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t """ # Extend the original GFs with the new sequences from diamond matched homology search results. - # ACTIVATE TO CHOOSE A TEST SPECIES (atha, pdac or vvin) INSTEAD OF THE NEW SPECIES FROM CONFIGURATION FILE + # TO BE ACTIVATED TO CHOOSE A TEST SPECIES (atha, pdac or vvin) INSTEAD OF THE NEW SPECIES FROM CONFIGURATION FILE test_run = False if test_run: # the species is a test species (atha, pdac or vvin) @@ -61,86 +95,74 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t #------------------------------------------------------------------------------ - # BASEFOLDERS for input and output + # GENERATE OR READ INPUT GENE FAMILY FILES AND DATABASES - # Path to database and files to be used in the workflow - # TODO: this is temporarily because we will probably have to let the user first make this files - # themselves in a specific folder; alternatively we could provide this file already ready, but that - # would make the GitHub folder (too) heavy - # THE PATH IS paralog_distribution/reciprocal_retention - basefolder_db_and_files = os.path.join(os.path.dirname(base_dir), "reciprocal_retention") + # PATH TO THE DIRECTORY WHERE REQUIRED FILES FOR RECIPROCAL RETENTION ANALYSES ARE STORED + # It's within the ksrates directory (for now): /home/cesen/ksrates_rec_ret/ksrates/reciprocal_retention. + # TODO: Still to be decided whether the user will have to generate this files themselves or if they will be + # already provided; in any of the cases, to be decided the location of these files: within the GitHub + # directory could become heavy. + ksrates_rec_ret_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reciprocal_retention") - # THIS IS THE FUNCTION TO GENERATE THE D - # # The user generates the FASTA database files for the rec.ret. sequences on their own in paralog_distributions - # fc_rec_ret.make_gene_family_database(top, rank_type, filter_false_predictions, filter_zero_false_predictions, base_dir) - - # Path to output files - base_dir_rec_ret = os.path.join(base_dir, "reciprocal_retention", rank_type) - if not os.path.exists(base_dir_rec_ret): - os.makedirs(base_dir_rec_ret) - - #------------------------------------------------------------------------------ - - # INPUT DATABASES AND GENE FAMILY FILES - - # Translate it into the aminoacid alphabet - species_sequences_filename = fc_rec_ret.dna_to_aminoacid(fasta_file, output_folder=base_dir_rec_ret) + # Get the list of the top 2000 GFs + if not filter_zero_false_predictions and not filter_false_predictions: + # Get the list of genes together with their GFs + gf_genes_with_gf_list = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) + elif filter_zero_false_predictions: + # Get the list of genes together with their GFs + gf_genes_with_gf_list = read_table(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) + elif filter_false_predictions: + # Get the list of genes together with their GFs + gf_genes_with_gf_list = read_table(os.path.join(ksrates_rec_ret_dir, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) if test_run: # Nucleotidic and aminoacidi databases of species used to make the orthogroups (i.e. the remaining 35 species) # Note that this database contains only the top e.g. 2000 orthogroup from the complete list of 9178 if not filter_zero_false_predictions and not filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_test_without_{species}.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_test_without_{species}.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_test_without_{species}.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_test_without_{species}.fasta") elif filter_zero_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta") elif filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") - else: + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta") + else: # Nucleotidic and aminoacidi databases of species used to make the orthogroups (all 36 species) if not filter_zero_false_predictions and not filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta") elif filter_zero_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta") elif filter_false_predictions: - orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") - orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") - + orthogroup_database_nucl = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") + orthogroup_database_amin_path = os.path.join(ksrates_rec_ret_dir, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta") if test_run: # Collection of all genes in the orthogroups, including those of test species # Note: if using the top 2000 orthogroups, this file lists only the genes coming from top 2000 orthogroups # if using the subset of top 2000, there will be only genes coming from that subset if not filter_zero_false_predictions and not filter_false_predictions: - gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) + gene_ids_in_orthogroups = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) elif filter_zero_false_predictions: - gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) + gene_ids_in_orthogroups = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) elif filter_false_predictions: - gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) + gene_ids_in_orthogroups = read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) - # Get the list of the top 2000 GFs - if not filter_zero_false_predictions and not filter_false_predictions: - # Get the list of genes together with their GFs - gf_genes_with_gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None) - elif filter_zero_false_predictions: - # Get the list of genes together with their GFs - gf_genes_with_gf_list = read_table(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) - elif filter_false_predictions: - # Get the list of genes together with their GFs - gf_genes_with_gf_list = read_table(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None) + # ------------------------------------------------------------------------- - # ----------------------------------------------------------------------------- + # Path to output files + wgd_focal_rec_ret_rank_dir = os.path.join(base_dir, "reciprocal_retention", rank_type) + if not os.path.exists(wgd_focal_rec_ret_rank_dir): + os.makedirs(wgd_focal_rec_ret_rank_dir) if not filter_zero_false_predictions and not filter_false_predictions: - dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_with_orthogroups.dmd") + dmd_with_orthogroups_filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_with_orthogroups.dmd") elif filter_zero_false_predictions: - dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions_with_orthogroups.dmd") + dmd_with_orthogroups_filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions_with_orthogroups.dmd") elif filter_false_predictions: - dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions_with_orthogroups.dmd") + dmd_with_orthogroups_filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions_with_orthogroups.dmd") column_names = ["qseqid", "sseqid", "pident", "length", "qlen", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"] @@ -151,22 +173,22 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t # First step is to generate or open the diamond result table if not filter_zero_false_predictions and not filter_false_predictions: - dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}.dmd") + dmd_species_to_top_orthogroup = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}.dmd") elif filter_zero_false_predictions: - dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions.dmd") + dmd_species_to_top_orthogroup = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions.dmd") elif filter_false_predictions: - dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions.dmd") + dmd_species_to_top_orthogroup = os.path.join(wgd_focal_rec_ret_rank_dir, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions.dmd") - if not os.path.exists(dmd_test_species_to_top_orthogroup): + if not os.path.exists(dmd_species_to_top_orthogroup): # Matching the test species to the top 2000 orthogroups through diamond logging.info(f"Running homology search between {latin_name} and the gene family database. Will take some time...") - with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: - orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=base_dir_rec_ret) - dmd = fc_rec_ret.run_diamond(fasta_file, orthogroup_database_amin, dmd_test_species_to_top_orthogroup, + with tempfile.TemporaryDirectory(dir=base_dir) as dmd_tmp_dir: + orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=ksrates_rec_ret_dir) + dmd = fc_rec_ret.run_diamond(fasta_file, orthogroup_database_amin, dmd_species_to_top_orthogroup, dmd_tmp_dir, pident_threshold, column_names=column_names, n_threads=n_threads) else: logging.info(f"Found already existing homology search result table between {latin_name} and the gene family database") - dmd = read_csv(dmd_test_species_to_top_orthogroup, sep="\t", header=None, names=column_names) + dmd = read_csv(dmd_species_to_top_orthogroup, sep="\t", header=None, names=column_names) logging.info("") # Second step is generating the diamond table file with en extra column containing the gene family associated to the matched sequences @@ -183,7 +205,8 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t else: # Else open the already existing diamond table file logging.info("Found already existing homology result table with gene families associated to matched sequences") dmd_with_orthogroups = read_csv(dmd_with_orthogroups_filename, sep="\t", header=None, names=column_names + ["GF"]) - logging.info("Done\n") + logging.info("Done") + logging.info("") # ----------------------------------------------------------------------------- @@ -209,15 +232,16 @@ def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_t # Get the list of sequences coming from the main species (column 0) in the diamond result table query_seqs = dmd_with_orthogroups.loc[:, "qseqid"].drop_duplicates() - with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: - assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl, - query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, base_dir_rec_ret) - - logging.info("") + with tempfile.TemporaryDirectory(dir=base_dir) as dmd_tmp_dir: + assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(top, rank_type, species, fasta_file, orthogroup_database_nucl, + query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, wgd_focal_rec_ret_rank_dir) if not test_run: logging.info("Done") + logging.info("") return assigned_gfs_file + + logging.info("") logging.info("-------------------------------------------------------------------------------") diff --git a/ksrates/fc_cluster_anchors.py b/ksrates/fc_cluster_anchors.py index 417bd6a777cb74504a41d754ee49792b41a98529..3b8632e2eff20bbdcd4e6fbd02f002c1072082af 100644 --- a/ksrates/fc_cluster_anchors.py +++ b/ksrates/fc_cluster_anchors.py @@ -335,7 +335,8 @@ def get_clusters_of_ks(labels_of_clustered_medians, all_medians_list, all_segmen return cluster_of_ks, medians_per_cluster, segments_medians_per_cluster, cluster_color_letter_list -def plot_clusters_of_medians(medians_per_cluster, cluster_color_letter_list, x_axis_max_limit_mixed_plot, bin_list, species, latin_name, output): +def plot_clusters_of_medians(medians_per_cluster, cluster_color_letter_list, x_axis_max_limit_mixed_plot, + bin_list, species, latin_name, output): """ Generates a figure showing the clusters of segment pair medians. @@ -389,7 +390,8 @@ def assign_cluster_colors(cluster_of_ks): return clusters_sorted_by_median, cluster_color_letter_list -def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows): +def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, + correction_table_available, plot_correction_arrows): """ Plots the anchor Ks clusters, their KDE lines a marker to show their median and . @@ -402,9 +404,11 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre :param plot_correction_arrows: boolean tag to state whether to plot the adjustment arrows at the bottom of the adjusted plot (default: False) :return clusters_sorted_by_median: list of cluster IDs sorted according to their age :return cluster_color_letter_list: dictionary assigning to each cluster the right color and letter according to age + :return letter_to_x_coord_dict: dictionary associating cluster names (letters) to their peak x-coord (e.g. mode) """ clusters_sorted_by_median, cluster_color_letter_list = assign_cluster_colors(cluster_of_ks) + letter_to_x_coord_dict = {} cluster_peak_heights = {} zorder_ID = 50 @@ -434,11 +438,14 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre axis.plot(kde_x, kde_y, color=cluster_color_letter_list[cluster_id][0], zorder=zorder_ID + 1) # plot a marker to highlight the median of each cluster - plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, cluster_color_letter_list[cluster_id][0], + x_coord_cluster_peak = plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, cluster_color_letter_list[cluster_id][0], cluster_color_letter_list[cluster_id][1], zorder_ID + 2, peak_stats, correction_table_available, plot_correction_arrows) + + # Assign to the cluster letter its peak (e.g. mode) x-coordinate + letter_to_x_coord_dict[cluster_color_letter_list[cluster_id][1]] = x_coord_cluster_peak zorder_ID += 10 - return clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights + return clusters_sorted_by_median, cluster_color_letter_list, letter_to_x_coord_dict, cluster_peak_heights def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter, zorder_ID, peak_stats, correction_table_available, plot_correction_arrows): @@ -460,6 +467,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter :param peak_stats: states whether the cluster peak is intended as median or mode :param correction_table_available: boolean tag to state if the adjustment table data are available or not :param plot_correction_arrows: boolean tag to state whether to plot the adjustment arrows at the bottom of the adjusted plot (default: False) + :return: the x-coordinate of the mode / median of the current anchor cluster """ if peak_stats == "median": # Get the point of the KDE line that is as closest as possible to the median of the raw data @@ -500,6 +508,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter axis.text(x=x_value, y=y_value + (y_height * marker_y_height_fraction), s=letter, fontsize=10, horizontalalignment='center', verticalalignment='center', clip_on=True, zorder=zorder_ID + 5, color='w') + return x_value def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights): """ diff --git a/ksrates/fc_configfile.py b/ksrates/fc_configfile.py index 91c2e34284b65a7a4f1056d4b2d106037cb7669c..f73785624829f0274fff3ad3855e390e7cff0360 100644 --- a/ksrates/fc_configfile.py +++ b/ksrates/fc_configfile.py @@ -788,7 +788,7 @@ class Configuration: :return rank_type: string ("lambda" or "combined"). Default is "lambda". """ if self.expert_config is not None: - rank_type = self.expert_config.get("EXPERT PARAMETERS", "reciprocal_retention_rank_type", fallback="lambda") + rank_type = self.expert_config.get("EXPERT PARAMETERS", "reciprocal_retention_rank_type", fallback="lambda").lower() if rank_type not in ["lambda", "combined"]: logging.warning(f'Unrecognized field in expert configuration file [reciprocal_retention_rank_type = {rank_type}]. \ Please choose between "lambda" and "combined". Default choice will be applied ["lambda"]') @@ -834,7 +834,7 @@ class Configuration: :return boolean: boolean """ if self.expert_config is not None: - filter_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_false_predictions", fallback=False).lower() + filter_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_false_predictions", fallback="no").lower() if filter_false_predictions == "yes": return True @@ -856,7 +856,7 @@ class Configuration: :return boolean: boolean """ if self.expert_config is not None: - filter_zero_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_zero_false_predictions", fallback=False).lower() + filter_zero_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_zero_false_predictions", fallback="no").lower() if filter_zero_false_predictions == "yes": return True diff --git a/ksrates/fc_exp_log_mixture.py b/ksrates/fc_exp_log_mixture.py index 255d4a0596f0f78787756efad8e3b3b455270a9e..d4b1968bbb58c1f5b7648b6fb85e6cc87f923084 100644 --- a/ksrates/fc_exp_log_mixture.py +++ b/ksrates/fc_exp_log_mixture.py @@ -86,7 +86,7 @@ def generate_best_model_figure(latin_species, x_max_lim, y_max_lim, correction_t :return figure, axes and figure title """ fig_best_model, ax_best_ks = fcPlot.generate_mixed_plot_figure(latin_species, x_max_lim, y_max_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=True, colinearity_data=False) + correction_table_available, plot_correction_arrows, paranome_data=True) return fig_best_model, ax_best_ks @@ -437,7 +437,7 @@ def deconvolute_data(tsv_file, max_ks, data_type): """ tail_length = 0.5 # tail spans for 0.5 extra Ks range - if data_type == "paralogs" or data_type == "anchor pairs": + if data_type == "paralogs" or data_type == "anchor pairs" or data_type == "reciprocally retained": ks_data, ks_weights = fc_extract_ks_list.ks_list_from_tsv(tsv_file, max_ks, data_type) elif data_type == "orthologs": ks_data = fc_extract_ks_list.ks_list_from_tsv(tsv_file, max_ks, data_type) @@ -803,7 +803,9 @@ def plot_init_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, plot_logtranf ax_logks.plot(x_points, weights[comp+1] * norm.pdf(x_points, means[comp], stdevs[comp]), style, lw=1.5, alpha=0.4) -def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, scaling=1, plot_peak_markers=False, plot_logtranformed=True): +def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_lim, + peak_stats, correction_table_available, plot_correction_arrows, scaling=1, + plot_peak_markers=False, plot_logtranformed=True): """ Plots the mixture components with fitted parameters plus their overall PDF curve. On the left axis it plots the lognormal and the exponential components, @@ -837,7 +839,8 @@ def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_ linestyle = ":" alpha = 1 - ax_ks.plot(x_points_strictly_positive, scaling * weights[0] * expon.pdf(x_points_strictly_positive, scale=1/lambd), c=color, ls=linestyle, lw=1.5, alpha=alpha, label='Exponential') + ax_ks.plot(x_points_strictly_positive, scaling * weights[0] * expon.pdf(x_points_strictly_positive, + scale=1/lambd), c=color, ls=linestyle, lw=1.5, alpha=alpha, label='Exponential') # Getting the representative value of the lognormal peak and sort lognormals accordingly lognormal_peaks = {} # key: lognormal component ID, value: peak @@ -849,17 +852,25 @@ def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_ lognormal_peaks[comp] = peak_coord lognormals_sorted_by_peak = sorted(lognormal_peaks, key=lognormal_peaks.get) # sort by values (peaks) letter_dict = dict(zip(lognormals_sorted_by_peak, [ "a", "b", "c", "d", "e", "f", "g"][:len(lognormals_sorted_by_peak)])) # assign progressive letter + letter_to_peak_dict = {} colors = ["b-", "r-", "c-", "m-", "k-"][:len(means)-1] + ["y-"] for comp, style in zip(lognormals_sorted_by_peak, colors): + # Letter associated to its peak x-coordinate: key a, value float + letter_to_peak_dict[letter_dict[comp]] = lognormal_peaks[comp] + color = style[0] linestyle = style[1:] if plot_peak_markers: # True only for the final plot color = "dimgrey" linestyle = "--" - plot_marker(ax_ks, lognormal_peaks[comp], scaling * weights[comp+1] * lognorm.pdf(lognormal_peaks[comp], scale=exp(means[comp]), s=stdevs[comp]), "k", letter_dict[comp], correction_table_available, plot_correction_arrows) - ax_ks.plot(x_points_strictly_positive, scaling * weights[comp+1] * lognorm.pdf(x_points_strictly_positive, scale=exp(means[comp]), s=stdevs[comp]), c=color, ls=linestyle, lw=1.5, alpha=alpha, label=f'Lognormal {letter_dict[comp]} ({peak_stats} {lognormal_peaks[comp]})') + plot_marker(ax_ks, lognormal_peaks[comp], scaling * weights[comp+1] * lognorm.pdf(lognormal_peaks[comp], + scale=exp(means[comp]), s=stdevs[comp]), "k", letter_dict[comp], correction_table_available, + plot_correction_arrows) + ax_ks.plot(x_points_strictly_positive, scaling * weights[comp+1] * lognorm.pdf(x_points_strictly_positive, + scale=exp(means[comp]), s=stdevs[comp]), c=color, ls=linestyle, lw=1.5, alpha=alpha, + label=f'Lognormal {letter_dict[comp]} ({peak_stats} {lognormal_peaks[comp]})') if plot_logtranformed: ax_logks.plot(x_points, scaling * weights[comp+1] * norm.pdf(x_points, means[comp], stdevs[comp]), c=color, ls=linestyle, lw=1.5, alpha=alpha, label=f'Norm {letter_dict[comp]}') @@ -887,6 +898,7 @@ def plot_fitted_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, max_x_axis_ ax_logks.legend(handles_tuple_list, ax_logks.get_legend_handles_labels()[1], handler_map={tuple: HandlerTuple(ndivide=None)}, handlelength=3, loc="upper left") + return letter_to_peak_dict def make_tuple_handles(axis): """ @@ -1097,14 +1109,18 @@ def plot_best_model(fig_best_model, ax_best_model, species, ks_data, ks_weights, # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution if correction_table_available: dummy_fig, dummy_axis = plt.subplots() - fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, dummy_axis, ax_best_model, color_list, plot_correction_arrows) + fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, + dummy_axis, ax_best_model, color_list, plot_correction_arrows) init_means, init_stdevs, init_lambd, init_weights = all_models_init_parameters[best_model_id] final_means, final_stdevs, final_lambd, final_weights = all_models_fitted_parameters[best_model_id] scaling = bin_width * len(deconvoluted_data[deconvoluted_data <= max_ks_EM]) - plot_fitted_comp(ax_best_model, None, final_means, final_stdevs, final_lambd, final_weights, max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, scaling=scaling, plot_peak_markers=True, plot_logtranformed=False) + letter_to_peak_dict = plot_fitted_comp(ax_best_model, None, final_means, final_stdevs, final_lambd, final_weights, + max_x_axis_lim, peak_stats, correction_table_available, plot_correction_arrows, + scaling=scaling, plot_peak_markers=True, plot_logtranformed=False) + logging.info("Done") if correction_table_available: legend_size = fcPlot.define_legend_size(ax_best_model) @@ -1123,17 +1139,4 @@ def plot_best_model(fig_best_model, ax_best_model, species, ks_data, ks_weights, fig_best_model.savefig(os.path.join("rate_adjustment", f"{species}", f"mixed_{species}_elmm.pdf"), transparent=True, format="pdf") - - # # TEMPORARY FOR A FIGURE PLOT WITH DENSITY FOR COMPARISON AFTER SCALING - # fig, ax = generate_best_model_figure("elaeis", "Elaeis guineensis", 3, None, True, True) - # fcPlot.plot_histogram("Whole-paranome", ax, ks_data, bin_list, None, None, None, ks_weights, plot_kde=False, density=True) - # if correction_table_available: - # fcPlot.plot_divergences(correction_table, consensus_peak_for_multiple_outgroups, dummy_axis, ax, color_list, plot_correction_arrows) - # plot_fitted_comp(ax, None, final_means, final_stdevs, final_lambd, final_weights, peak_stats, scaling=1, plot_peak_markers=True, plot_logtranformed=False) - - # legend_size = fcPlot.define_legend_size(ax) - # chart_box = ax.get_position() - # ax.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height]) - # lgd = create_legend_mixture_model(ax, legend_size, len(init_means)+2) # number of plotted lines is: exp + lognormals + total PDF - # fig.savefig(os.path.join("rate_adjustment", f"{species}", f"mixed_density_elmm.pdf"), - # bbox_extra_artists=(lgd, fig._suptitle), bbox_inches="tight", transparent=True, format="pdf") + return letter_to_peak_dict \ No newline at end of file diff --git a/ksrates/fc_lognormal_mixture.py b/ksrates/fc_lognormal_mixture.py index d3cea6d39aa70d5d231913bd97f5a5279367b923..5517c6d165ba86b7731055be41430b0fb54f5007 100644 --- a/ksrates/fc_lognormal_mixture.py +++ b/ksrates/fc_lognormal_mixture.py @@ -166,8 +166,9 @@ def fit_gmm(X, n1, n2, outfile, parameter_table, max_iter=300, n_init=1, **kwarg return models, bic, aic, best -def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak_stats, correction_table_available, plot_correction_arrows, l=0, u=5, color='black', alpha=0.2, - log=False, bins=25): +def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak_stats, + correction_table_available, plot_correction_arrows, l=0, u=5, + color='black', alpha=0.2, log=False, bins=25): """ Plot a mixture model. Assumes a log-transformed model and data and will back-transform. @@ -222,8 +223,14 @@ def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak lognormals_sorted_by_peak = sorted(lognormal_peaks, key=lognormal_peaks.get) # sort by values (peaks) letter_dict = dict(zip(lognormals_sorted_by_peak, [ "a", "b", "c", "d", "e", "f", "g"][:len(lognormals_sorted_by_peak)])) # assign progressive letter + letter_to_peak_dict = {} + for comp in lognormals_sorted_by_peak: - fcEM.plot_marker(ax, lognormal_peaks[comp], scaling * weights[comp] * ss.lognorm.pdf(lognormal_peaks[comp], scale=exp(means[comp][0]), s=sqrt(varcs[comp][0][0])), "k", letter_dict[comp], correction_table_available, plot_correction_arrows) + fcEM.plot_marker(ax, lognormal_peaks[comp], scaling * weights[comp] * ss.lognorm.pdf(lognormal_peaks[comp], + scale=exp(means[comp][0]), s=sqrt(varcs[comp][0][0])), "k", letter_dict[comp], + correction_table_available, plot_correction_arrows) + # Fill in the dictionary with all peak names and their x-coordinate + letter_to_peak_dict[letter_dict[comp]] = lognormal_peaks[comp] for comp in lognormals_sorted_by_peak: curve = scaling * weights[comp] * ss.lognorm.pdf(x, scale=exp(means[comp]), s=sqrt(varcs[comp])) @@ -235,7 +242,7 @@ def plot_mixture_model(model, data, max_x_axis_lim, ax, bin_width, scaling, peak else: total_pdf += curve ax.plot(x, total_pdf, '-k', lw=1.5, label="Lognormal mixture model") - return ax + return ax, letter_to_peak_dict def lmm( @@ -251,7 +258,7 @@ def lmm( histograms. Please interpret mixture model results with caution. :param fig: figure object :param max_x_axis_lim: upper limit in the x-axis - :param data_type: strings stating whether the data are "paralogs" or "anchor pairs" + :param data_type: strings stating whether the data are "paralogs" or "anchor pairs" or "reciprocally retained" :param tsv_file: wgd output file containing either paranome or anchor pairs Ks values (suffix formats: ".ks.tsv", "ks_anchors.tsv") :param species: informal name of the focal species :param axis: axis object @@ -264,7 +271,7 @@ def lmm( :param output_dir: output folder :param outfile: output file for logging details :param parameter_table: list collecting the component parameters - :param datatype_tag: string for figure title stating if data is paranome or anchors + :param datatype_tag: string for figure title stating if data is paranome or anchors or reciprocally retained ("recret") :param peak_stats: states whether the cluster peak is intended as median or mode :param correction_table_available: boolean stating whether the adjustment results are available or not :param plot_correction_arrows: boolean stating whether there will be plotted adjustment arrows or not @@ -291,9 +298,10 @@ def lmm( # Plotting the components of the best model on the final picture; # Components are scaled up to the size of actual count data and not to density histogram scaling = bin_width_para * len(deconvoluted_data) - plot_mixture_model(best, deconvoluted_data, max_x_axis_lim, axis, bin_width_para, scaling, peak_stats, + ax, letter_to_peak_dict = plot_mixture_model(best, deconvoluted_data, max_x_axis_lim, axis, bin_width_para, scaling, peak_stats, correction_table_available, plot_correction_arrows, ks_range[0], ks_range[1], bins=bins) - return best + + return best, letter_to_peak_dict def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, datatype): @@ -304,7 +312,7 @@ def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, data :param axis: matplotlib axis object from which the legend is taken :param legend_size: size of the legend box as a tuple of format "(x, y, width, height)" :param num_mixture_model_lines: number of lines generated by mixture models (components + total PDF) - :param datatype: string for figure title stating if data is "paranome" or comes from "colinearity" analysis + :param datatype: string stating if data is paranome, anchors or reciprocally retained paralogs (lambda or combined ranks) :return: the updated legend object """ handles, labels = axis.get_legend_handles_labels() @@ -313,6 +321,8 @@ def create_legend_mixture_model(axis, legend_size, num_mixture_model_lines, data paralog_rect = Patch(facecolor=fcPlot.COLOR_PARANOME_HISTOGRAM, edgecolor="w") elif datatype == "anchors": paralog_rect = Patch(facecolor=fcPlot.COLOR_ANCHOR_HISTOGRAM, edgecolor="w") + elif datatype == "recret" or datatype == "recret_lambda" or datatype == "recret_combined": + paralog_rect = Patch(facecolor=fcPlot.COLOR_REC_RET_HISTOGRAM, edgecolor="w") # empty patch used as spacer between histograms and divergence line legend entries empty_rect = Patch(fill=False, edgecolor='none', visible=False) @@ -333,7 +343,7 @@ def save_lmm(fig, axis, species, best_model, datatype, correction_table_availabl :param fig: figure object of the adjusted mixed distribution :param axis: axis object of the adjusted mixed distribution :param species: focal species - :param datatype: string for figure title stating whether data is paranome or comes from "colinearity" analysis + :param datatype: string for filename stating whether data is paranome, anchors or reciprocally retained paralogs """ if correction_table_available: num_mixture_model_lines = len(best_model.means_) + 1 # components + total PDF @@ -360,7 +370,7 @@ def make_parameter_table_file(parameter_table, species, datatype): :param parameter_table: list collecting the component parameters :param species: informal name of the focal species - :param datatype: string for figure title stating whether data is paranome or comes from "colinearity" analysis + :param datatype: string for filename stating whether data is paranome, anchors or reciprocally retained paralogs """ #headers = ["Model", "Iteration", "BIC", "Loglikelihood", "Convergence", # "Exponential_Rate", "Exponential_Weight", "Normal_Mean", "Normal_SD", "Normal_Weight"] diff --git a/ksrates/fc_plotting.py b/ksrates/fc_plotting.py index 36a7eedca12efa290b03a46f18b3a0e00caab05e..a792e036acb71eb614ca371992773c04a9dbf962 100755 --- a/ksrates/fc_plotting.py +++ b/ksrates/fc_plotting.py @@ -49,13 +49,14 @@ LINEWIDTH_CIRCLE_LABEL_BORDER = 1 NEGATIVE_Y_FRACTION = 0.08 # Filenames -_MIXED_ADJUSTED_PLOT_FILENAME = "mixed_{}_adjusted.pdf" -_MIXED_UNADJUSTED_PLOT_FILENAME = "mixed_{}_unadjusted.pdf" +_MIXED_ADJUSTED_PLOT_FILENAME = "mixed_{}_adjusted_{}.pdf" +_MIXED_UNADJUSTED_PLOT_FILENAME = "mixed_{}_unadjusted_{}.pdf" +_OTHER_MIXED_PLOTS_SUBDIR = "other_mixed_plots" def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not, correction_table_available, - plot_correction_arrows, paranome_data=True, colinearity_data=True, - reciprocal_retention_data=True, top=None, rank_type=None): + plot_correction_arrows, paranome_data=False, colinearity_data=False, + reciprocal_retention_data=False, top=None, rank_type=None): """ Initializes a figure with a single empty plot for the mixed distribution. @@ -98,12 +99,12 @@ def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not, seaborn.despine(offset=10) ax.set_xlabel("$K_\mathregular{S}$") - if paranome_data: # If paranome (with or without anchors) is going to be plotted + if paranome_data or reciprocal_retention_data: + # If paranome or rec.ret paralogs appear in the plot, use "duplicates" ax.set_ylabel("Number of retained duplicates (weighted)") - elif colinearity_data: # If only anchor pairs are going to plotted + elif colinearity_data: + # If instead only anchor pairs are going to plotted, use "anchor pairs" ax.set_ylabel("Number of retained anchor pairs (weighted)") - elif reciprocal_retention_data: # If only reciprocally retained data are going to be plotted - ax.set_ylabel("Number of retained duplicates (weighted)") ax.set_xlim(0, x_max_lim) if isinstance(y_max_lim, float): @@ -531,7 +532,7 @@ def create_legend(axis, paranome, colinearity, reciprocal_retention, legend_size def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available, - paranome, colinearity, reciprocal_retention): + paranome, colinearity, reciprocal_retention, filename_adjusted, filename_unadjusted): """ This function must be called to save the mixed distribution figure in order to adjust the figure layout: the plot area is shrunk to the left and some reasonable space is left on the right side for the legend. @@ -544,27 +545,32 @@ def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correctio :param paranome: the config file field that states if the whole-paranome has to be plotted [True/False] :param colinearity: the config file field that states if the anchor pairs have to be plotted [True/False] """ + directory_path = os.path.join("rate_adjustment", f"{species}") + # Generate the subdirectory for mixed plot composed of two or three data types (e.g. paranome & anchor pairs) + if not os.path.exists(os.path.join(directory_path, _OTHER_MIXED_PLOTS_SUBDIR)): + os.makedirs(os.path.join(directory_path, _OTHER_MIXED_PLOTS_SUBDIR)) + if correction_table_available: legend_size = define_legend_size(ax_corr) chart_box = ax_uncorr.get_position() ax_uncorr.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height]) lgd = create_legend(ax_uncorr, paranome, colinearity, reciprocal_retention, legend_size) - fig_uncorr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_UNADJUSTED_PLOT_FILENAME.format(species)), + fig_uncorr.savefig(os.path.join(directory_path, filename_unadjusted), bbox_extra_artists=(lgd, fig_uncorr._suptitle), bbox_inches="tight", transparent=True, format="pdf") ax_corr.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height]) lgd = create_legend(ax_corr, paranome, colinearity, reciprocal_retention, legend_size) - fig_corr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_ADJUSTED_PLOT_FILENAME.format(species)), + fig_corr.savefig(os.path.join(directory_path, filename_adjusted.format(species)), bbox_extra_artists=(lgd, fig_corr._suptitle), bbox_inches="tight", transparent=True, format="pdf") else: # if not correction_table_available use a simpler layout with the legend # inside the plot and no right margin lgd = ax_uncorr.legend(handlelength=1.5, loc="upper right") - fig_uncorr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_UNADJUSTED_PLOT_FILENAME.format(species)), + fig_uncorr.savefig(os.path.join(directory_path, filename_unadjusted), transparent=True, format="pdf") lgd = ax_corr.legend(handlelength=1.5, loc="upper right") - fig_corr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_ADJUSTED_PLOT_FILENAME.format(species)), + fig_corr.savefig(os.path.join(directory_path, filename_adjusted), transparent=True, format="pdf") diff --git a/ksrates/fc_reciprocal_retention.py b/ksrates/fc_reciprocal_retention.py index 3b888bff3911074aa95eb6eeeb91ef6898e335d3..9fd6ed29c23896e74b79a4dd38240c4e0a822526 100755 --- a/ksrates/fc_reciprocal_retention.py +++ b/ksrates/fc_reciprocal_retention.py @@ -24,7 +24,8 @@ ids = {'alyr':'Arabidopsis lyrata', 'atha': 'Arabidopsis thaliana', 'atri':'Ambo # TODO: TEST AND FIX THE INTEGRATION OF THIS FUNCTIONS IN THE KSRATES PIPELINE IN "extend_orthogroups.py" # where the main function should be called (now it's commented). -def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_false_predictions, rank_type, top=None): +def get_all_gf_genes(ksrates_rec_ret_basefolder, filter_zero_false_predictions, + filter_false_predictions, rank_type, top=None): """ If top parameter equal to a number, then consider only the top XXXX gene families: e.g. top 1000. This avoids including weak reciprocally retained gene families. If asked, also filter away those GFs that @@ -36,16 +37,18 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ """ # Generate or open the file listing all the GFs with all their genes if top == None: - gf_genes_tsv = "gf_genes_all.tsv" + gf_genes_tsv = os.path.join(ksrates_rec_ret_basefolder, "gf_genes_all.tsv") + gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_list_top_{rank_type}.tsv") else: - gf_genes_tsv = f"gf_genes_top_{top}_{rank_type}.tsv" + gf_genes_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_top_{top}_{rank_type}.tsv") + gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_list_top_{top}_{rank_type}.tsv") - if not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_tsv)): + if not os.path.exists(gf_genes_tsv) or not os.path.exists(gf_list_tsv): # Get the ranked gene families - gf_rank = read_excel(open("gene_families.xlsx", "rb"), "gene family rankings", header=None) - + gene_families_xlsx = os.path.join(os.path.join(ksrates_rec_ret_basefolder, "gene_families.xlsx")) + gf_rank = read_excel(open(gene_families_xlsx, "rb"), "gene family rankings", header=None) # Get the genes associated to all gene families - gf_genes_raw = read_excel(open("gene_families.xlsx", "rb"), "orthogroup member genes", header=None) + gf_genes_raw = read_excel(open(gene_families_xlsx, "rb"), "orthogroup member genes", header=None) if top != None: if rank_type == "combined": @@ -64,14 +67,21 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ genes_columns = gf_genes_raw.iloc[:,2].str.split(",", expand=True) # Process third column (genes) gf_genes = concat([gf_genes, genes_columns], axis=1, ignore_index=True) - with open(os.path.join(path_for_top_gf_tsv, gf_genes_tsv), "w+") as f: - f.write(gf_genes.to_csv(sep="\t", header=False, index=False)) + if not os.path.exists(gf_genes_tsv): + with open(gf_genes_tsv, "w+") as f: + f.write(gf_genes.to_csv(sep="\t", header=False, index=False)) + + if not os.path.exists(gf_list_tsv): + with open(gf_list_tsv, "w+") as f: + f.write(top_gf_ids.to_csv(sep="\t", header=False, index=False)) + else: - logging.info(f"- {gf_genes_tsv} already exists") - gf_genes = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_tsv), sep="\t", header=None) + logging.info(f"- {os.path.basename(gf_genes_tsv)} already exists") + gf_genes = read_csv(gf_genes_tsv, sep="\t", header=None) # If asked, remove the GFs from the top 2000 that performed bad (too many false predictions) + # TODO: SET THIS PATH RIGHT IF THE FILTERED DATA WILL BE USED! basefolder_filtering_files = f"../extended_orthogroups_top_{top}_{rank_type}_filtered/" if filter_zero_false_predictions: gf_ids_filtered_zero_false_predictions = read_csv(os.path.join(basefolder_filtering_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None) @@ -84,16 +94,19 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ # Generate or open the list of genes and the list of genes with their GF if filter_zero_false_predictions: # remaining from top 2000 after filtering away for at least one false prediction - gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv" + gf_genes_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") + gf_genes_with_gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") elif filter_false_predictions: # remaining GFs from the top 2000 after filtering by too many false predictions gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv" elif top == None: gf_genes_list_tsv, gf_genes_with_gf_list_tsv = "gf_genes_list.tsv", "gf_genes_with_gf_list.tsv" else: # top = 2000 - gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv" + gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_list_top_{top}_{rank_type}.tsv") + gf_genes_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_list_top_{top}_{rank_type}.tsv") + gf_genes_with_gf_list_tsv = os.path.join(ksrates_rec_ret_basefolder, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv") - if not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv)) or not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv)): - logging.info(f"- Generating {gf_genes_list_tsv} and {gf_genes_with_gf_list_tsv}...") + if not os.path.exists(gf_list_tsv) or not os.path.exists(gf_genes_list_tsv) or not os.path.exists(gf_genes_with_gf_list_tsv): + logging.info(f"- Generating {os.path.basename(gf_list_tsv)}, {os.path.basename(gf_genes_list_tsv)} and {os.path.basename(gf_genes_with_gf_list_tsv)}...") # Get the genes associated to the GFs in both rank types gf_genes_list = [] @@ -110,68 +123,68 @@ def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_ # Save the GF together with their genes in a TSV file gf_genes_list = Series(gf_genes_list).sort_values(ignore_index=True) - gf_genes_list.to_csv(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv), sep="\t", header=False, index=False) + gf_genes_list.to_csv(gf_genes_list_tsv, sep="\t", header=False, index=False) gf_genes_with_gf_list = DataFrame.from_records(gf_genes_with_gf_list) gf_genes_with_gf_list.columns = ["gene_id", "gf"] gf_genes_with_gf_list = gf_genes_with_gf_list.sort_values(by=["gene_id"], ignore_index=True) - gf_genes_with_gf_list.to_csv(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv), sep="\t", header=False, index=False) + gf_genes_with_gf_list.to_csv(gf_genes_with_gf_list_tsv, sep="\t", header=False, index=False) - return gf_genes_list, gf_genes_with_gf_list + return gf_genes_list else: - logging.info(f"- {gf_genes_list_tsv} and {gf_genes_with_gf_list_tsv} already exist ") - gf_genes_list = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv), sep="\t", header=None) - gf_genes_with_gf_list = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv), sep="\t", header=None) - return gf_genes_list.squeeze().values.tolist(), gf_genes_with_gf_list + logging.info(f"- {os.path.basename(gf_genes_list_tsv)} already exists") + logging.info(f"- {os.path.basename(gf_genes_with_gf_list_tsv)} already exists") + gf_genes_list = read_csv(gf_genes_list_tsv, sep="\t", header=None) + gf_genes_with_gf_list = read_csv(gf_genes_with_gf_list_tsv, sep="\t", header=None) + return gf_genes_list.squeeze().values.tolist() -def flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database): +def flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database_path): """ Generate the database of nucleotidic sequences from the orthogroup genes (FASTA format). """ database = {} - c=0 - if not os.path.exists(os.path.join("..", orthorgoups_database)): + if not os.path.exists(orthorgoups_database_path): species_fasta = {} for species_id in ids_names: - species_fasta.update(read_fasta(os.path.join("..", "sequences", "old_genome_versions", f"{species_id}.fasta"))) + species_fasta.update(read_fasta(os.path.join("/", "home", "cesen", "wgm_predictor_midas", "sequences", "old_genome_versions", f"{species_id}.fasta"))) - with open (os.path.join("..", orthorgoups_database), "w+") as f: + with open (orthorgoups_database_path, "w+") as f: for gene_id in gf_genes_list: if gene_id in species_fasta.keys(): database[gene_id] = species_fasta[gene_id] f.write(f">{gene_id}\n") f.write(f"{species_fasta[gene_id]}\n") else: - logging.info(f"- {orthorgoups_database} already exists") - + logging.info(f"- {os.path.basename(orthorgoups_database_path)} already exists") + return -def dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid): +def database_dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid_path): """ Translates the database of nucleotidic sequences into aminoacid sequences, to be given to diamond. """ - with open(os.path.join("..", orthorgoups_database), "r") as f: - database_dna = f.readlines() - - if not os.path.exists(os.path.join("..", orthorgoups_database_aminoacid)): - with open (os.path.join("..", orthorgoups_database_aminoacid), "w+") as f: - for i, seq in enumerate(SeqIO.parse(database_dna, 'fasta')): - try: - aa_seq = seq.translate(to_stop=False, cds=False, id=seq.id) - f.write(f">{seq.id}\n") - f.write(f"{aa_seq.seq}\n") - except TranslationError as e: - logging.error("Translation error ({}) in seq {}".format( - e, seq.id)) - continue + if not os.path.exists(orthorgoups_database_aminoacid_path) or os.path.getsize(orthorgoups_database_aminoacid_path) == 0: + with open(orthorgoups_database_aminoacid_path, "w+") as database_aa: + + with open(orthorgoups_database, "r") as database_dna: + for seq in SeqIO.parse(database_dna, 'fasta'): + try: + aa_seq = seq.translate(to_stop=False, cds=False, id=seq.id) + database_aa.write(f">{seq.id}\n") + database_aa.write(f"{aa_seq.seq}\n") + except TranslationError as e: + logging.error("Translation error ({}) in seq {}".format( + e, seq.id)) + continue else: - logging.info(f"- {orthorgoups_database_aminoacid} already exists") - + logging.info(f"- {os.path.basename(orthorgoups_database_aminoacid_path)} already exists") + return # MAIN FUNCTION THAT USES ALL FUNCTIONS ABOVE -def make_gene_family_database(top, rank_type, filter_false_predictions, filter_zero_false_predictions, base_dir, remove_test_species=False): +def make_gene_family_files_and_databases(top, rank_type, filter_false_predictions, filter_zero_false_predictions, + ksrates_rec_ret_basefolder, remove_test_species=False): """ Generates a nucleotidic FASTA file and a translated FASTA file that are databases for all sequences from the 36 species used to make the top e.g. 2000 reciprocally retained gene families out of the total 9178 core angiosperm gene families. @@ -195,13 +208,15 @@ def make_gene_family_database(top, rank_type, filter_false_predictions, filter_z 'sbic':'Sorghum bicolor', 'sita':'Setaria italica', 'slyc':'Solanum lycopersicum', 'stub':'Solanum tuberosum', \ 'tcac':'Theobroma cacao', 'tpar':'Thellungiella parvula', 'vvin': 'Vitis vinifera', 'zmay':'Zea mays'} - logging.info("Get a list of all GF genes") + logging.info(f"Getting a list of top {top} ({rank_type}) reciprocally retained gene family genes") + if filter_zero_false_predictions: logging.info("Filter away from database those GFs that have at least one false prediction") elif filter_false_predictions: logging.info("Filter away from database those GFs that lead to too many false predictions") - gf_genes_list, gf_genes_with_gf_list = get_all_gf_genes("..", filter_zero_false_predictions, filter_false_predictions, rank_type, top) + gf_genes_list = get_all_gf_genes(ksrates_rec_ret_basefolder, filter_zero_false_predictions, + filter_false_predictions, rank_type, top) # Default names for databases (will be changed if necessary in this code later on) if not filter_zero_false_predictions and not filter_false_predictions: @@ -229,13 +244,19 @@ def make_gene_family_database(top, rank_type, filter_false_predictions, filter_z if filter_false_predictions: orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{scientific_name}.fasta" orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{scientific_name}.fasta" + logging.info("") - logging.info("Generate the nucleotidic database of GFs") - flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database) + logging.info("Generating the nucleotidic database of GFs") + orthorgoups_database_path = os.path.join(ksrates_rec_ret_basefolder, orthorgoups_database) + flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database_path) + logging.info("") - logging.info("Translate into aminoacidic sequence database (ready for diamond)") - dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid) + logging.info("Translating into aminoacidic sequence database (ready for diamond)") + orthorgoups_database_aminoacid_path = os.path.join(ksrates_rec_ret_basefolder, orthorgoups_database_aminoacid) + database_dna_to_aminoacid(orthorgoups_database_path, orthorgoups_database_aminoacid_path) + logging.info("") + return #------------------------------------------------------------------------------ @@ -304,15 +325,18 @@ def make_diamond_db(fasta_aa, output_folder=".", n_threads=8, dmd_tmp_dir=False) else: fasta_aa_db = f"{os.path.basename(fasta_aa)}.db" - fasta_aa_db = os.path.join(output_folder, fasta_aa_db) - cmd = ["diamond", "makedb", "--threads", f"{n_threads}", "--in", fasta_aa, "-d", fasta_aa_db] - # capture_output option required Python 3.7 - # out = sp.run(cmd, capture_output=True) - # logging.debug(out.stderr.decode()) - sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) - # if out.returncode == 1: - # logging.error(out.stderr.decode()) - return f"{fasta_aa_db}.dmnd" + output_filename = os.path.join(output_folder, f"{fasta_aa_db}.dmnd") + + if not os.path.exists(output_filename): + fasta_aa_db = os.path.join(output_folder, fasta_aa_db) + cmd = ["diamond", "makedb", "--threads", f"{n_threads}", "--in", fasta_aa, "-d", fasta_aa_db] + # capture_output option required Python 3.7 + # out = sp.run(cmd, capture_output=True) + # logging.debug(out.stderr.decode()) + sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) + # if out.returncode == 1: + # logging.error(out.stderr.decode()) + return output_filename def run_diamond(query_fasta, subject_db, outfile, dmd_tmp_dir, pident_threshold, column_names=[], @@ -475,15 +499,17 @@ def dna_to_aminoacid(database_dna_path, output_folder="."): continue return database_aa_path -def assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl_test, - query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, basefolder_output_files): +def assign_first_diamond_hit_with_rbh(top, rank_type, species, fasta_file, orthogroup_database_nucl_test, + query_seqs, dmd_with_orthogroups, pident_threshold, + column_names, dmd_tmp_dir, wgd_focal_rec_ret_rank_dir): """ Assign to the query gene the orthogroup of the first (and best) hit, but only if there is a reciprocal best hit correspondence between the two genes. """ - logging.info(f"Temporary diamond directory: {dmd_tmp_dir}\n") + logging.info(f"Temporary diamond directory: {dmd_tmp_dir}") + logging.info("") - filename = os.path.join(basefolder_output_files, f"assigned_gfs_{species}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") + filename = os.path.join(wgd_focal_rec_ret_rank_dir, f"assigned_gfs_{species}_top_{top}_{rank_type}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") if not os.path.exists(filename) or os.path.getsize(filename) == 0: @@ -491,10 +517,13 @@ def assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_fil orthogroup_database_test = read_csv(orthogroup_database_nucl_test, sep="\t", header=None).loc[:,0] # Get/Make the Atha FASTA diamond database - if not os.path.exists(os.path.join(f"{fasta_file}.aa.db.dmnd")): - test_species_database = make_diamond_db(species_sequences_filename, output_folder=basefolder_output_files) + if not os.path.exists(os.path.join(wgd_focal_rec_ret_rank_dir, f"{fasta_file}.aa.db.dmnd")): + # Translate it into the aminoacid alphabet + species_sequences_filename = dna_to_aminoacid(fasta_file, output_folder=wgd_focal_rec_ret_rank_dir) + # Make aminoacidic diamond database + test_species_database = make_diamond_db(species_sequences_filename, output_folder=wgd_focal_rec_ret_rank_dir) else: - test_species_database = os.path.join(basefolder_output_files, f"{fasta_file}.aa.db.dmnd") + test_species_database = os.path.join(wgd_focal_rec_ret_rank_dir, f"{fasta_file}.aa.db.dmnd") assigned_gfs_file = list() no_rbh = 0 diff --git a/ksrates/fc_rrt_correction.py b/ksrates/fc_rrt_correction.py index 6d47b4bb5586dacc40f33a5788b83c7d03616c55..88877ae0cdeb405b970939f88fb13e8813ec5844 100644 --- a/ksrates/fc_rrt_correction.py +++ b/ksrates/fc_rrt_correction.py @@ -1,4 +1,5 @@ from math import sqrt +import logging # Filenames _ADJUSTMENT_TABLE_ALL = "adjustment_table_{}_all.tsv" @@ -63,3 +64,78 @@ def compute_corrected_ks_species_sister(rel_rate_species, rel_rate_species_sd): corrected_peak_sd = sqrt(2) * rel_rate_species_sd return corrected_peak, corrected_peak_sd + + +def interpretation_adjusted_plot(focal_species_latin, consensus_peak_for_multiple_outgroups, + wgd_peaks, correction_table): + """ + Prints an automatic interpretation of the rate-adjusted mixed Ks plots in which for each + inferred (putative) WGD peak it is said which species share it with the focal species. + This is done by comparing the x-coordinate of WGD peaks coming from mixture + modeling analyses and the rate-adjusted ortholog modes. + The WGD x-coordinate (its Ks age) is either coming from a component when the method is + exponential-lognormal mixture modeling and lognormal mixture modeling, or from an + anchor cluster when the method is anchor clustering. + + :param focal_species_latin: latin name of focal species + :param consensus_peak_for_multiple_outgroups: choice of how to deal with multiple adjustments + for the same divergence (expected values: either "mean among outgroups" or "best outgroup") + :param wgd_peaks: dictionary associating to a wgd letter the coordinate of its peak (either component or anchor cluster) + :param correction_table: adjustment results in DataFrame format (contains both possible types of consensus strategy for how to deal with multiple outgroups) + """ + if consensus_peak_for_multiple_outgroups == "mean among outgroups": + # Then consider the columns in the correction_table that are generated using the average method + column_header_peak = "Adjusted_Mode_Mean" + elif consensus_peak_for_multiple_outgroups == "best outgroup": + # Then consider the columns in the correction_table that are generated using the best outgroup method + column_header_peak = "Adjusted_Mode_Best" + + species_interpretation = {} + shared_wgd_interpretation = {} + + # Generate dictionary associating each divergent species to its mode + ortholog_modes = {} + for diverging_species, mode in zip(correction_table["Sister_Species"], correction_table[column_header_peak]): + ortholog_modes[diverging_species] = mode + # Also initialize a dictionary for the final interpretation logging output + species_interpretation[diverging_species] = [] + + logging.info("") + logging.info(f"Automatic interpretation of rate-adjusted mixed Ks plot:") + logging.info("Please revise it manually due to known overfitting of mixture modeling methods!") + # Figuring out which species share which (putative) WGDs + if len(wgd_peaks) == 0: + logging.info(" - There are no inferred (putative) WGD peaks.") + return + + for peak in wgd_peaks: # For each wgd peak + shared_wgd_interpretation[peak] = [] + for diverging_species in ortholog_modes: # For each divergent species + # What about when it's "=="? + if ortholog_modes[diverging_species] < wgd_peaks[peak]: + species_interpretation[diverging_species].append(peak) + shared_wgd_interpretation[peak].append(diverging_species) + + # For every peak, say which species share it with the focal species + for peak_letter in wgd_peaks: + # If wgd peak is not shared with any other species + if len(shared_wgd_interpretation[peak_letter]) == 0: + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is ' + + f'specific to focal species {focal_species_latin}') + continue + # If wgd peak is shared with all of the other species in the plots + elif len(shared_wgd_interpretation[peak_letter]) == len(ortholog_modes): + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is shared ' + + f"with all involved species in the provided phylogeny") + continue + # If wgd peak is shared with only one other species + elif len(shared_wgd_interpretation[peak_letter]) == 1: + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is shared with {shared_wgd_interpretation[peak_letter][0]}') + else: # If wgd peak is shared with some other species + logging.info(f' - Inferred (putative) WGD signature "{peak_letter}" is shared ' + + f"with the following species:") + for species in shared_wgd_interpretation[peak_letter]: + logging.info(f" {species}") + + + logging.info("") diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index b01fcbe3b44405e2cb37d6928aa44ac963eb0221..6938c5e7c50ec3b08eeb1be0d112c356ea9e30e5 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -15,15 +15,15 @@ from ksrates.utils import merge_dicts, concat_files, can_i_run_software, transla _OUTPUT_BLAST_FILE_PATTERN = '{}.blast.tsv' _OUTPUT_MCL_FILE_PATTERN = '{}.mcl.tsv' _OUTPUT_KS_FILE_PATTERN = '{}.ks.tsv' -_OUTPUT_MCL_REC_RET_FILE_PATTERN = '{}_rec_ret_top{}_{}_rank.mcl.tsv' -_OUTPUT_KS_REC_RET_FILE_PATTERN = '{}.ks_rec_ret.tsv' +_OUTPUT_MCL_REC_RET_FILE_PATTERN = '{}_rec_ret_top_{}_{}.mcl.tsv' +_OUTPUT_KS_REC_RET_FILE_PATTERN = '{}.ks_rec_ret_top_{}_{}.tsv' _PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' _ORTHOLOGS_OUTPUT_DIR_PATTERN = 'wgd_{}_{}' _TMP_BLAST = '{}.blast_tmp' _TMP_KS = '{}.ks_tmp' -_TMP_KS_REC_RET = '{}.ks_rec_ret_tmp' +_TMP_KS_REC_RET = '{}.ks_rec_ret_top_{}_{}_tmp' def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflation_factor=2.0, aligner='muscle', min_msa_length=100, codeml='codeml', codeml_times=1, pairwise=False, @@ -215,6 +215,7 @@ def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflat logging.info('---') logging.info('Done') + logging.info('---') def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type="lambda", pident_threshold=40, @@ -251,20 +252,20 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type sys.exit(1) output_mcl_rec_ret_file = _OUTPUT_MCL_REC_RET_FILE_PATTERN.format(species_name, top, rank_type) - output_ks_file = _OUTPUT_KS_REC_RET_FILE_PATTERN.format(species_name) + output_ks_file = _OUTPUT_KS_REC_RET_FILE_PATTERN.format(species_name, top, rank_type) output_dir = os.path.abspath(os.path.join(base_dir, _PARALOGS_OUTPUT_DIR_PATTERN.format(species_name))) output_mcl_path = os.path.join(output_dir, output_mcl_rec_ret_file) - tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS_REC_RET.format(species_name)) + tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS_REC_RET.format(species_name, top, rank_type)) # BASEFOLDERS for input and output # Path to database and files to be used in the workflow # TODO: this is temporarily because we will probably have to let the user first make this files # themselves in a specific folder; alternatively we could provide this file already ready, but that # would make the GitHub folder (too) heavy - basefolder_db_and_files = os.path.join("/", "home", "cesen", "ksrates_rec_ret", "ksrates", "reciprocal_retention") + ksrates_rec_ret_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reciprocal_retention") # determine which parts to run do_rec_ret = True @@ -291,7 +292,7 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type logging.info(f'Paralog Ks data {output_ks_file} already exists, will skip wgd Ks analysis') do_ks = False else: - logging.info('No paralog Ks data, will run wgd Ks analysis') + logging.info('No reciprocally retained paralog Ks data, will run wgd Ks analysis') if not do_rec_ret and not do_ks: logging.info('All reciprocally retained paralog data already exist, nothing to do') @@ -323,10 +324,14 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type logging.info(f'Creating output directory {output_dir}') os.makedirs(output_dir) - # get reciprocally retained gene families (MCL-like format) + # Get reciprocally retained gene families (MCL-like format) if do_rec_ret: logging.info('---') - assigned_rec_ret_gfs_path = os.path.join(output_dir, "reciprocal_retention", rank_type, f"assigned_gfs_{species_name}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") + + # Preparing necessary scripts for RR analyses + recRet.setup_for_extend_orthogroups(top, rank_type, filter_false_predictions, filter_zero_false_predictions) + + assigned_rec_ret_gfs_path = os.path.join(output_dir, "reciprocal_retention", rank_type, f"assigned_gfs_{species_name}_top_{top}_{rank_type}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") if not os.path.exists(assigned_rec_ret_gfs_path): logging.info(f'Running diamond homology search against reciprocally retained gene family database...') assigned_rec_ret_gfs = recRet.extend_orthogroups(species_name, latin_name, cds_fasta, top, rank_type, pident_threshold, @@ -338,18 +343,21 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type if not os.path.exists(output_mcl_path): logging.info(f'Organizing reciprocally retained gene families in mcl-like format for wgd Ks pipeline') if not filter_false_predictions and not filter_zero_false_predictions: - top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() + top_rec_ret_gf_list = pd.read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() elif filter_false_predictions: - top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() + top_rec_ret_gf_list = pd.read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() elif filter_zero_false_predictions: - top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() + top_rec_ret_gf_list = pd.read_csv(os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() generate_mcl_rec_ret(top_rec_ret_gf_list, assigned_rec_ret_gfs, output_mcl_path) + logging.info("Done") + logging.info("") # whole paranome Ks analysis if do_ks: logging.info('---') logging.info('Running reciprocally retained gene families Ks analysis...') - + cds_sequences = read_fasta(os.path.abspath(cds_fasta)) + # tmp directory cw_dir = os.getcwd() if not os.path.exists(tmp_ks_paralogs): @@ -357,15 +365,14 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir logging.info(f'Translating CDS file {cds_fasta}...') - cds_sequences = read_fasta(os.path.abspath(cds_fasta)) protein_sequences = translate_cds(cds_sequences) if not filter_false_predictions and not filter_zero_false_predictions: - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv") + top_gfs_filtered_with_rank = os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}.tsv") elif filter_false_predictions: - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") + top_gfs_filtered_with_rank = os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") elif filter_zero_false_predictions: - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") + top_gfs_filtered_with_rank = os.path.join(ksrates_rec_ret_dir, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, top=top, top_gfs_filtered_with_rank=top_gfs_filtered_with_rank, @@ -387,6 +394,7 @@ def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type logging.info('---') logging.info('Done') + logging.info('---') def ks_orthologs(species1, species2, cds_fasta1, cds_fasta2, base_dir='.', eval_cutoff=1e-10, aligner='muscle', @@ -567,6 +575,7 @@ def ks_orthologs(species1, species2, cds_fasta1, cds_fasta2, base_dir='.', eval_ logging.info('---') logging.info('Done') + logging.info('---') def all_v_all_blast_1way(species1, species2, protein_seq_dict1, protein_seq_dict2, tmp_dir, output_dir, output_file, @@ -834,7 +843,9 @@ def ks_colinearity(species_name, gff_file, base_dir='.', gff_feature='mRNA', gff shutil.copy2(os.path.join(tmp_dir, "i-adhore_out", "multiplicon_pairs.txt"), subfolder) shutil.rmtree(tmp_dir) + logging.info('---') logging.info("Done") + logging.info('---') def _write_gene_lists(genome, output_dir='gene_lists'): @@ -969,8 +980,15 @@ def _run_iadhore(config_file): if err: # i-ADHoRe can fail without a proper exit/return code, but just with an error on stderr logging.error("i-ADHoRe execution failed with standard error output:\n" + err) - logging.error("Exiting.") - sys.exit(1) + + # Some i-ADHoRe error output is an actual fatal error and the pipeline must be interrupted, + # but there is at least one i-ADHoRe message that is passed as error output ("cerr" in source code) + # that is only a warning (it starts with "Warning:"): this kind of cerr is not supposed + # to interrupt ksrates pipeline! To take this into account, the following if-clause allows any + # i-ADHoRe error output message starting with "Warning" to be tolerated. + if not err.lower().startswith("warning"): + logging.error("Exiting.") + sys.exit(1) except subprocess.CalledProcessError as e: logging.error(f"i-ADHoRe execution failed with return code: {e.returncode}") if e.stderr: diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 53405f0f30d42e98fd5452e483da4266e9ea6431..1e4201077aa29bfc4357731149f96fd2ee96a711 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -10,11 +10,12 @@ import ksrates.fc_check_input as fcCheck import ksrates.fc_plotting as fcPlot import ksrates.fc_extract_ks_list as fc_extract_ks_list import ksrates.fc_lognormal_mixture as fcLMM -from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM +from ksrates.fc_plotting import COLOR_ANCHOR_HISTOGRAM, COLOR_REC_RET_HISTOGRAM from ksrates.fc_cluster_anchors import subfolder -from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE, interpretation_adjusted_plot -def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correction_table_file): +def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, rec_ret_tsv_file, correction_table_file, + ignore_paranome=False, ignore_anchors=False, ignore_reciprocally_retained=False): # INPUT config = fcConf.Configuration(config_file) init_logging(f"Lognormal mixture model on Ks distribution", config.get_logging_level()) @@ -22,6 +23,20 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc paranome_analysis = config.get_paranome() colinearity_analysis = config.get_colinearity() + reciprocal_retention_analysis = config.get_reciprocal_retention() + + top = config.get_reciprocal_retention_top() + rank_type = config.get_reciprocal_retention_rank_type() + + # Use these optional arguments to avoid performing LMM on a certain data type, overwriting what written in the config file; + # This is used in paralogs_analysis.py when LMM is called as extra method and allows to fine tune what is activated and what not + if ignore_paranome: + paranome_analysis = False + if ignore_anchors: + colinearity_analysis = False + if ignore_reciprocally_retained: + reciprocal_retention_analysis = False + species = config.get_species() latin_names = config.get_latin_names() latinSpecies = latin_names[species] @@ -65,7 +80,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file") if correction_table_file == "": logging.warning("Rate-adjustment data are not available yet, only Ks distribution will be plotted.") - correction_table = None + logging.info("") correction_table_available = False else: with open(correction_table_file, "r") as f: @@ -86,18 +101,25 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc anchors_ks_tsv_file = fcCheck.get_argument_path(anchors_ks_tsv_file, default_path_anchors_tsv_file, "Anchor pair Ks TSV file") if anchors_ks_tsv_file == "": logging.error(f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}].") - if paralog_tsv_file == "" or anchors_ks_tsv_file == "": + if reciprocal_retention_analysis: + default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret_top_{top}_{rank_type}.tsv") + rec_ret_tsv_file = fcCheck.get_argument_path(rec_ret_tsv_file, default_path_rec_ret_tsv_file, "Reciprocally retained paralog Ks TSV file") + if rec_ret_tsv_file == "": + logging.error(f"Reciprocally retained paralog Ks TSV file not found at default position [{default_path_rec_ret_tsv_file}].") + + if paralog_tsv_file == "" or anchors_ks_tsv_file == "" or rec_ret_tsv_file == "": logging.error("Exiting") sys.exit(1) fig_para, axis_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=True, colinearity_data=False) + correction_table_available, plot_correction_arrows, paranome_data=True) fig_colin, axis_colin = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", - correction_table_available, plot_correction_arrows, paranome_data=False, colinearity_data=True) - - parameter_table = [] + correction_table_available, plot_correction_arrows, colinearity_data=True) + fig_rec_ret, axis_rec_ret = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, "corrected", + correction_table_available, plot_correction_arrows, reciprocal_retention_data=True, top=top, rank_type=rank_type) if paranome_analysis: + parameter_table = [] with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_paranome.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on whole-paranome Ks distribution") paranome_list, paranome_weights = fc_extract_ks_list.ks_list_from_tsv(paralog_tsv_file, max_ks_para, "paralogs") @@ -107,19 +129,26 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc if y_lim is None: fcPlot.set_mixed_plot_height(axis_para, y_lim, hist_paranome) - best_model_paranome = fcLMM.lmm( + best_model_paranome, letter_to_peak_dict_para = fcLMM.lmm( fig_para, x_max_lim, "paralogs", paralog_tsv_file, species, axis_para, (0, max_ks_EM), (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, output_dir, outfile, parameter_table, "paranome", peak_stats, correction_table_available, plot_correction_arrows) # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "paranome") + + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict_para, correction_table) + if paranome_analysis and colinearity_analysis: + logging.info("Done") logging.info("") - parameter_table = [] if colinearity_analysis: + parameter_table = [] with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_anchors.txt"), "w+") as outfile: logging.info("Performing lognormal mixture model on anchor pair Ks distribution") anchors_list, anchors_weights = fc_extract_ks_list.ks_list_from_tsv(anchors_ks_tsv_file, max_ks_para, "anchor pairs") @@ -132,7 +161,7 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc if y_lim is None: fcPlot.set_mixed_plot_height(axis_colin, y_lim, hist_anchors) - best_model_anchors = fcLMM.lmm( + best_model_anchors, letter_to_peak_dict_anchors = fcLMM.lmm( fig_colin, x_max_lim, "anchor pairs", anchors_ks_tsv_file, species, axis_colin, (0, max_ks_EM), (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, output_dir, outfile, parameter_table, "anchors", peak_stats, correction_table_available, plot_correction_arrows) @@ -140,7 +169,41 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc # Generating tabular text file with all model parameters fcLMM.make_parameter_table_file(parameter_table, species, "anchors") - for axis in [axis_para, axis_colin]: + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict_anchors, correction_table) + + if (paranome_analysis or colinearity_analysis) and reciprocal_retention_analysis: + logging.info("Done") + logging.info("") + + if reciprocal_retention_analysis: + parameter_table = [] + with open (os.path.join("rate_adjustment", f"{species}", subfolder, f"lmm_{species}_parameters_recret_{rank_type}.txt"), "w+") as outfile: + logging.info("Performing lognormal mixture model on reciprocally retained paralog Ks distribution") + rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") + hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", axis_rec_ret, rec_ret_list, bin_list, + bin_width_para, max_ks_para, kde_bandwidth_modifier, rec_ret_weights, color=COLOR_REC_RET_HISTOGRAM, plot_kde=False) + # Setting the plot height based on tallest histogram bin + if y_lim is None: + fcPlot.set_mixed_plot_height(axis_rec_ret, y_lim, hist_rec_ret) + + best_model_rec_ret, letter_to_peak_dict_recret = fcLMM.lmm( + fig_rec_ret, x_max_lim, "reciprocally retained", rec_ret_tsv_file, species, axis_rec_ret, (0, max_ks_EM), + (1, max_num_comp), arange(-10, max_ks_EM + bin_width_para, bin_width_para), bin_width_para, max_EM_iterations, num_EM_initializations, + output_dir, outfile, parameter_table, "recret", peak_stats, correction_table_available, plot_correction_arrows) + + # Generating tabular text file with all model parameters + fcLMM.make_parameter_table_file(parameter_table, species, f"recret_{rank_type}") + + if correction_table_available: + # Printing the automatic interpretation of the rate-adjusted mixed plot according to inferred (putative) WGD peaks + interpretation_adjusted_plot(latinSpecies, consensus_peak_for_multiple_outgroups, + letter_to_peak_dict_recret, correction_table) + logging.info("Done") + + for axis in [axis_para, axis_colin, axis_rec_ret]: # PLOTTING THE ORTHOLOG DIVERGENCE LINES if correction_table_available: dummy_fig, dummy_axis = plt.subplots() @@ -154,6 +217,9 @@ def lognormal_mixture(config_file, paralog_tsv_file, anchors_ks_tsv_file, correc if colinearity_analysis: fcLMM.save_lmm(fig_colin, axis_colin, species, best_model_anchors, "anchors", correction_table_available) logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_anchors.pdf]") + if reciprocal_retention_analysis: + fcLMM.save_lmm(fig_rec_ret, axis_rec_ret, species, best_model_rec_ret, f"recret_{rank_type}", correction_table_available) + logging.info(f"Saved PDF figure of lognormal mixture model [mixed_{species}_lmm_recret_{rank_type}.pdf]") logging.info("") logging.info("All done") \ No newline at end of file diff --git a/ksrates/paralogs_analyses.py b/ksrates/paralogs_analyses.py index b8900be031beddfc767525a95dc10c8b5fb7188a..1a01622a8a3c04dccfff59c534c944699d65d248 100644 --- a/ksrates/paralogs_analyses.py +++ b/ksrates/paralogs_analyses.py @@ -1,4 +1,5 @@ +import collections import sys import logging from ksrates.utils import init_logging @@ -7,38 +8,92 @@ from ksrates.exp_log_mixture import exp_log_mixture from ksrates.lognormal_mixture import lognormal_mixture import ksrates.fc_configfile as fcConf -def paralogs_analyses_methods(config_file, paranome_table, anchors_table, correction_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): +def paralogs_analyses_methods(config_file, paranome_table, anchors_table, reciprocal_retention_table, + correction_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): # INPUT config = fcConf.Configuration(config_file) logging.basicConfig(format='%(levelname)s\t%(message)s', level=config.get_logging_level(), stream=sys.stdout) paranome = config.get_paranome() colinearity = config.get_colinearity() + reciprocal_retention_analysis = config.get_reciprocal_retention() extra_paralogs_analyses_methods = config.get_extra_paralogs_analyses_methods() - if paranome and not colinearity: - # Only exp-log mixture model by default + # 1) ONLY WHOLE-PARANOME + if paranome and not colinearity and not reciprocal_retention_analysis: + # Default: perform only exponential-lognormal mixture modeling (ELMM) on paranome exp_log_mixture(config_file, paranome_table, correction_table) + if extra_paralogs_analyses_methods: logging.info(f"\n") - # Lognormal mixture model on paranome - lognormal_mixture(config_file, paranome_table, anchors_table, correction_table) - - if colinearity and not paranome: - # Only anchor clustering by default + # If additional methods are asked: perform lognormal mixture modeling (LMM) on paranome + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + + # 2) ONLY COLLINEARITY + elif not paranome and colinearity and not reciprocal_retention_analysis: + # Default: perform only anchor clustering cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + if extra_paralogs_analyses_methods: logging.info(f"\n") - # Lognormal mixture model on anchors - lognormal_mixture(config_file, paranome_table, anchors_table, correction_table) + # If additional methods are asked: perform LMM on anchors + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + + # 3) ONLY REC.RET. + elif not paranome and not colinearity and reciprocal_retention_analysis: + # Default: perform LMM on rec.ret. paralogs + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + # There are no additional methods for rec.ret. paralogs - if colinearity and paranome: - # Only anchor clustering by default + + # 4) COLLINEARITY AND REC.RET. + elif not paranome and colinearity and reciprocal_retention_analysis: + # Default: perform anchor clustering and LMM on rec.ret. paralogs cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + logging.info(f"\n") + if not extra_paralogs_analyses_methods: + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, ignore_anchors=True) + + elif extra_paralogs_analyses_methods: + # If additional methods are asked: perform LMM both on rec.ret. paralogs and - as extra - on anchors + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + # 5) COLLINEARITY AND PARANOME + elif paranome and colinearity and not reciprocal_retention_analysis: + # Default: perform only anchor clustering + cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + if extra_paralogs_analyses_methods: + # If additional methods are asked: perform ELMM on paranome, and LMM on paranome and anchors logging.info(f"\n") - # Exp-log mixture model on paranome exp_log_mixture(config_file, paranome_table, correction_table) logging.info(f"\n") - # Lognormal mixture model on both - lognormal_mixture(config_file, paranome_table, anchors_table, correction_table) + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + # 6) REC.RET. AND PARANOME + elif paranome and not colinearity and reciprocal_retention_analysis: + # Default: perform ELMM on paranome and LMM on rec.ret. paralogs + exp_log_mixture(config_file, paranome_table, correction_table) + logging.info(f"\n") + if not extra_paralogs_analyses_methods: + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, ignore_paranome=True) + + elif extra_paralogs_analyses_methods: + # If additional methods are asked: perform LMM both on rec.ret. paralogs and on paranome + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) + + # 7) COLLINEARITY AND REC.RET. AND PARANOME + elif paranome and colinearity and reciprocal_retention_analysis: + # Default: perform anchor clustering and LMM on rec.ret. paralogs + cluster_anchor_ks(config_file, correction_table, anchorpoints, multiplicons, segments, list_elements, anchors_table, multiplicon_pairs) + logging.info(f"\n") + if not extra_paralogs_analyses_methods: + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table, ignore_anchors=True, ignore_paranome=True) + + elif extra_paralogs_analyses_methods: + # If additional methods are asked: perform ELMM on paranome, and LMM on rec.ret. paralogs, anchors and paranome + exp_log_mixture(config_file, paranome_table, correction_table) + logging.info(f"\n") + lognormal_mixture(config_file, paranome_table, anchors_table, reciprocal_retention_table, correction_table) diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index a57c8df044ce459670d48ae986b79fea72ba71d0..ec78134ce30f6ba1c4f988e2af747e13b728ea8b 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -9,6 +9,7 @@ import ksrates.fc_extract_ks_list as fc_extract_ks_list import ksrates.fc_check_input as fcCheck import ksrates.fc_configfile as fcConf from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE +from ksrates.fc_plotting import _MIXED_ADJUSTED_PLOT_FILENAME, _MIXED_UNADJUSTED_PLOT_FILENAME, _OTHER_MIXED_PLOTS_SUBDIR def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, anchors_ks_tsv_file, rec_ret_tsv_file): # INPUT @@ -24,6 +25,9 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an colinearity_analysis = config.get_colinearity() reciprocal_retention_analysis = config.get_reciprocal_retention() + top = config.get_reciprocal_retention_top() + rank_type = config.get_reciprocal_retention_rank_type() + if not colinearity_analysis and not paranome_analysis and not reciprocal_retention_analysis: logging.error('At least one of the "paranome" or "collinearity" or "reciprocal_retention" parameters in the configuration file needs to be set to "yes".') logging.error("Exiting.") @@ -43,7 +47,7 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an if anchors_ks_tsv_file == "": logging.error(f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}].") if reciprocal_retention_analysis: - default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret.tsv") + default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret_top_{top}_{rank_type}.tsv") rec_ret_tsv_file = fcCheck.get_argument_path(rec_ret_tsv_file, default_path_rec_ret_tsv_file, "Reciprocally retained paralog Ks TSV file") if rec_ret_tsv_file == "": logging.error(f"Reciprocally retained paralog Ks TSV file not found at default position [{default_path_rec_ret_tsv_file}].") @@ -92,50 +96,135 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an plot_correction_arrows = config.plot_correction_arrows() peak_stats = config.get_peak_stats() # default is mode (other option, median) - top = config.get_reciprocal_retention_top() - rank_type = config.get_reciprocal_retention_rank_type() # ----------------------------------------------------------------------------- # GENERATING MIXED DISTRIBUTION PLOT logging.info("") - # TODO: REMOVE THE FOLLOWING COMMENTED BLOCK ONCE THE IF-ELIF LIST IS PROVED TO BE CORRECT - # if not colinearity_analysis: - # logging.info(f"Plotting paranome Ks distribution for species [{species}]") - # elif not paranome_analysis: - # logging.info(f"Plotting anchor pair Ks distribution for species [{species}]") - # else: - # logging.info(f"Plotting paranome and anchor pairs Ks distributions for species [{species}]") - if not colinearity_analysis and not reciprocal_retention_analysis: + ax_uncorr_list, ax_corr_list = [], [] + ax_uncorr_include_para, ax_corr_include_para= [], [] + ax_uncorr_include_col, ax_corr_include_col= [], [] + ax_uncorr_include_rr, ax_corr_include_rr= [], [] + + # Generate the mixed plot with a single data type (e.g. paranome) for any required data_type + if paranome_analysis: logging.info(f"Plotting paranome Ks distribution for species [{species}]") - elif not paranome_analysis and not reciprocal_retention_analysis: + fig_uncorr_para, ax_uncorr_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, top=top, rank_type=rank_type) + fig_corr_para, ax_corr_para = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para) + ax_corr_include_para.append(ax_corr_para) + ax_uncorr_list.append(ax_uncorr_para) + ax_corr_list.append(ax_corr_para) + + if colinearity_analysis: logging.info(f"Plotting anchor pair Ks distribution for species [{species}]") - elif not paranome_analysis and not colinearity_analysis: + fig_uncorr_col, ax_uncorr_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + colinearity_data=colinearity_analysis, top=top, rank_type=rank_type) + fig_corr_col, ax_corr_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + colinearity_data=colinearity_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_col.append(ax_uncorr_col) + ax_corr_include_col.append(ax_corr_col) + ax_uncorr_list.append(ax_uncorr_col) + ax_corr_list.append(ax_corr_col) + + if reciprocal_retention_analysis: logging.info(f"Plotting reciprocally retained paralog Ks distribution for species [{species}]") - elif paranome_analysis and colinearity_analysis: + fig_uncorr_rr, ax_uncorr_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + fig_corr_rr, ax_corr_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_rr.append(ax_uncorr_rr) + ax_corr_include_rr.append(ax_corr_rr) + ax_uncorr_list.append(ax_uncorr_rr) + ax_corr_list.append(ax_corr_rr) + + # Generate the mixed plot for any required pair of data types (e.g. paranome and anchors) + if paranome_analysis and colinearity_analysis: logging.info(f"Plotting paranome and anchor pairs Ks distributions for species [{species}]") - elif paranome_analysis and reciprocal_retention_analysis: + fig_uncorr_para_col, ax_uncorr_para_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + top=top, rank_type=rank_type) + fig_corr_para_col, ax_corr_para_col = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para_col) + ax_corr_include_para.append(ax_corr_para_col) + ax_uncorr_include_col.append(ax_uncorr_para_col) + ax_corr_include_col.append(ax_corr_para_col) + ax_uncorr_list.append(ax_uncorr_para_col) + ax_corr_list.append(ax_corr_para_col) + + if paranome_analysis and reciprocal_retention_analysis: logging.info(f"Plotting paranome and reciprocally retained paralog Ks distributions for species [{species}]") - elif colinearity_analysis and reciprocal_retention_analysis: - logging.info(f"Plotting anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") - else: - logging.info(f"Plotting paranome, anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") + fig_uncorr_para_rr, ax_uncorr_para_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) + fig_corr_para_rr, ax_corr_para_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para_rr) + ax_corr_include_para.append(ax_corr_para_rr) + ax_uncorr_include_rr.append(ax_uncorr_para_rr) + ax_corr_include_rr.append(ax_corr_para_rr) + ax_uncorr_list.append(ax_uncorr_para_rr) + ax_corr_list.append(ax_corr_para_rr) - # PLOTTING THE BACKGROUND PARALOG AND/OR ANCHOR DISTRIBUTION - fig_uncorr, ax_uncorr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, - "un-corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + if colinearity_analysis and reciprocal_retention_analysis: + logging.info(f"Plotting anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") + fig_uncorr_col_rr, ax_uncorr_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + colinearity_data=colinearity_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) + fig_corr_col_rr, ax_corr_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + colinearity_data=colinearity_analysis, reciprocal_retention_data=reciprocal_retention_analysis, + top=top, rank_type=rank_type) + ax_uncorr_include_col.append(ax_uncorr_col_rr) + ax_corr_include_col.append(ax_corr_col_rr) + ax_uncorr_include_rr.append(ax_uncorr_col_rr) + ax_corr_include_rr.append(ax_corr_col_rr) + ax_uncorr_list.append(ax_uncorr_col_rr) + ax_corr_list.append(ax_corr_col_rr) - fig_corr, ax_corr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, - "corrected", correction_table_available, plot_correction_arrows, - paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, - reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + # Generate the mixed plot for the three data types altogether if required by configuration + if paranome_analysis and colinearity_analysis and reciprocal_retention_analysis: + logging.info(f"Plotting paranome, anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]") + fig_uncorr_para_col_rr, ax_uncorr_para_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "un-corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + fig_corr_para_col_rr, ax_corr_para_col_rr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim, + "corrected", correction_table_available, plot_correction_arrows, + paranome_data=paranome_analysis, colinearity_data=colinearity_analysis, + reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type) + ax_uncorr_include_para.append(ax_uncorr_para_col_rr) + ax_corr_include_para.append(ax_corr_para_col_rr) + ax_uncorr_include_col.append(ax_uncorr_para_col_rr) + ax_corr_include_col.append(ax_corr_para_col_rr) + ax_uncorr_include_rr.append(ax_uncorr_para_col_rr) + ax_corr_include_rr.append(ax_corr_para_col_rr) + ax_uncorr_list.append(ax_uncorr_para_col_rr) + ax_corr_list.append(ax_corr_para_col_rr) + # PLOTTING THE BACKGROUND PARALOG DISTRIBUTION(S) if paranome_analysis: paranome_list, paranome_weights = fc_extract_ks_list.ks_list_from_tsv(paralog_tsv_file, max_ks_para, "paralogs") - hist_paranome = fcPlot.plot_histogram("Whole-paranome", ax_uncorr, paranome_list, bin_list, bin_width_para, - max_ks_para, kde_bandwidth_modifier, weight_list=paranome_weights) - fcPlot.plot_histogram("Whole-paranome", ax_corr, paranome_list, bin_list, bin_width_para, + for ax_uncorr in ax_uncorr_include_para: + hist_paranome = fcPlot.plot_histogram("Whole-paranome", ax_uncorr, paranome_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, weight_list=paranome_weights) + for ax_corr in ax_corr_include_para: + fcPlot.plot_histogram("Whole-paranome", ax_corr, paranome_list, bin_list, bin_width_para, max_ks_para, kde_bandwidth_modifier, weight_list=paranome_weights) if colinearity_analysis: @@ -143,43 +232,95 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an if len(anchors_list) == 0: logging.warning(f"No anchor pairs found! Maybe check your (gene) IDs between " f"the [{species}.ks_anchors.tsv] file and the [{species}.ks.tsv] files.") - # FOR NOW USE AN UNWEIGHTED HISTOGRAM - hist_anchors = fcPlot.plot_histogram("Anchor pairs", ax_uncorr, anchors_list, bin_list, bin_width_para, max_ks_para, - kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) - fcPlot.plot_histogram("Anchor pairs", ax_corr, anchors_list, bin_list, bin_width_para, max_ks_para, - kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) + for ax_uncorr in ax_uncorr_include_col: + hist_anchors = fcPlot.plot_histogram("Anchor pairs", ax_uncorr, anchors_list, bin_list, bin_width_para, max_ks_para, + kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) + for ax_corr in ax_corr_include_col: + fcPlot.plot_histogram("Anchor pairs", ax_corr, anchors_list, bin_list, bin_width_para, max_ks_para, + kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) if reciprocal_retention_analysis: rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") - hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", ax_uncorr, rec_ret_list, bin_list, bin_width_para, - max_ks_para, kde_bandwidth_modifier, weight_list=rec_ret_weights) - fcPlot.plot_histogram("Reciprocally retained paralogs", ax_corr, rec_ret_list, bin_list, bin_width_para, - max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) - - # Setting plot height based on tallest histogram (if paranome analysis is on, it will come from that distribution) + for ax_uncorr in ax_uncorr_include_rr: + hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", ax_uncorr, rec_ret_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) + for ax_corr in ax_corr_include_rr: + fcPlot.plot_histogram("Reciprocally retained paralogs", ax_corr, rec_ret_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) + + # Setting plot height based on tallest histogram among the paralog distribution(s) if y_lim is None: + # Set the height based on paranome distribution when this is for sure the tallest if paranome_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome) - elif colinearity_analysis and not reciprocal_retention_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors) - elif not colinearity_analysis and reciprocal_retention_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_rec_ret) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_rec_ret) - elif colinearity_analysis and reciprocal_retention_analysis: - fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors, hist_rec_ret) - fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors, hist_rec_ret) + for ax_uncorr, ax_corr in zip(ax_uncorr_include_para, ax_corr_include_para): + fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome) + fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome) + + # Set height based on anchor pair distributions if that's the only one in the plot + if colinearity_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr_col, y_lim, hist_anchors) + fcPlot.set_mixed_plot_height(ax_corr_col, y_lim, hist_anchors) + + # Set height based on reciprocally retained paralog distribution if that's the only one in the plot + if reciprocal_retention_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr_rr, y_lim, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr_rr, y_lim, hist_rec_ret) + + # Set height to tallest distribution between anchor pair and reciprocally retained paralogs + if colinearity_analysis and reciprocal_retention_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr_col_rr, y_lim, hist_anchors, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr_col_rr, y_lim, hist_anchors, hist_rec_ret) + # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution if correction_table_available: logging.info("Plotting ortholog divergence lines in the mixed plot") - fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr, ax_corr, color_list, plot_correction_arrows) + for ax_uncorr, ax_corr in zip(ax_uncorr_list, ax_corr_list): + fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr, ax_corr, color_list, plot_correction_arrows) logging.info("") - logging.info(f"Saving PDF figures of mixed plots [{fcPlot._MIXED_ADJUSTED_PLOT_FILENAME.format(species)}, {fcPlot._MIXED_UNADJUSTED_PLOT_FILENAME.format(species)}]") - fcPlot.save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available, paranome=paranome_analysis, - colinearity=colinearity_analysis, reciprocal_retention=reciprocal_retention_analysis) + logging.info(f"Saving PDF figures of mixed plots") + if paranome_analysis: + fcPlot.save_mixed_plot(fig_corr_para, fig_uncorr_para, ax_corr_para, ax_uncorr_para, species, correction_table_available, paranome=paranome_analysis, + colinearity=False, reciprocal_retention=False, + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome")) + + if colinearity_analysis: + fcPlot.save_mixed_plot(fig_corr_col, fig_uncorr_col, ax_corr_col, ax_uncorr_col, species, correction_table_available, paranome=False, + colinearity=colinearity_analysis, reciprocal_retention=False, + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, "anchors"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "anchors")) + + if reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_rr, fig_uncorr_rr, ax_corr_rr, ax_uncorr_rr, species, correction_table_available, paranome=False, + colinearity=False, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=_MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"recret_{rank_type}"), + filename_unadjusted=_MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"recret_{rank_type}")) + + if paranome_analysis and colinearity_analysis: + fcPlot.save_mixed_plot(fig_corr_para_col, fig_uncorr_para_col, ax_corr_para_col, ax_uncorr_para_col, species, correction_table_available, + paranome=paranome_analysis, colinearity=colinearity_analysis, reciprocal_retention=False, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, "paranome_anchors"))) + + if paranome_analysis and reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_para_rr, fig_uncorr_para_rr, ax_corr_para_rr, ax_uncorr_para_rr, species, correction_table_available, + paranome=paranome_analysis, colinearity=False, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"paranome_recret_{rank_type}")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"paranome_recret_{rank_type}"))) + + if colinearity_analysis and reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_col_rr, fig_uncorr_col_rr, ax_corr_col_rr, ax_uncorr_col_rr, species, correction_table_available, paranome=False, + colinearity=reciprocal_retention_analysis, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"anchors_recret_{rank_type}")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"anchors_recret_{rank_type}"))) + + if paranome_analysis and colinearity_analysis and reciprocal_retention_analysis: + fcPlot.save_mixed_plot(fig_corr_para_col_rr, fig_uncorr_para_col_rr, ax_corr_para_col_rr, ax_uncorr_para_col_rr, species, correction_table_available, + paranome=paranome_analysis, colinearity=reciprocal_retention_analysis, reciprocal_retention=reciprocal_retention_analysis, + filename_adjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_ADJUSTED_PLOT_FILENAME.format(species, f"paranome_anchors_recret_{rank_type}")), + filename_unadjusted=os.path.join(_OTHER_MIXED_PLOTS_SUBDIR, _MIXED_UNADJUSTED_PLOT_FILENAME.format(species, f"paranome_anchors_recret_{rank_type}"))) logging.info("") logging.info("All done") diff --git a/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py b/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py deleted file mode 100755 index e33474e252c9baaa3f1a31bb40cf3c02f7d82b47..0000000000000000000000000000000000000000 --- a/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py +++ /dev/null @@ -1,283 +0,0 @@ -from wgd_ksrates.utils import read_fasta -from wgd_ksrates.ks_distribution import ks_analysis_paranome -import os -import logging -import sys -import subprocess -import tempfile -import math -import datetime - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -_OUTPUT_KS_FILE_PATTERN = '{}_ortho_gf_{}_{}_rank.ks.tsv' -_PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' -_TMP_KS = '{}_ortho_gf.ks_tmp' - - -def ks_paralogs(species_name, cds_fasta, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, base_dir='.', eval_cutoff=1e-10, - inflation_factor=2.0, aligner='muscle', min_msa_length=100, codeml='codeml', codeml_times=1, pairwise=False, - max_gene_family_size=200, weighting_method='fasttree', n_threads=4, overwrite=False): - """ - Among the edits, this function temporarily takes also the path and filename to the output_ks_file, in order to - provide a different name when top 2000 or a subset of it are used. - """ - - # input checks - if not species_name: - logging.error('No species name provided. Exiting.') - sys.exit(1) - - # TMP removed in order to use a custom name dependent on whether a subset of the top2000 is used or all of the top 2000 are used - # output_ks_file = _OUTPUT_KS_FILE_PATTERN.format(f"{species_name.lower()}", f"top{top}", rank_type) - output_ks_file = os.path.basename(ks_tsv_path) - - output_dir = os.path.abspath(os.path.join(base_dir, _PARALOGS_OUTPUT_DIR_PATTERN.format(species_name.lower()))) - - tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS.format(f"{species_name.lower()}_{rank_type}")) - - # determine which parts to run - do_ks = True - if os.path.exists(output_dir): - - # REMOVED THE BLAST AND MCL STEPS - - # Check if Ks tmp folder of previous failed run exists (in this case the Ks data will be incomplete) - if os.path.exists(tmp_ks_paralogs): # if there is a Ks tmp folder from previous failed run - logging.error(f'tmp directory [{tmp_ks_paralogs}] was found: leftover from a failed earlier run?') - logging.error('Paralog Ks data may be incomplete. Please delete the tmp directory and rerun ' - 'the analysis. Exiting.') - sys.exit(1) - - if os.path.exists(os.path.join(output_dir, output_ks_file)): - if overwrite: - logging.warning(f'Paralog Ks file {output_ks_file} exists, will overwrite') - else: - logging.info(f'Paralog Ks data {output_ks_file} already exists, will skip wgd Ks analysis') - do_ks = False - else: - logging.info('No paralog Ks data, will run wgd Ks analysis') - - if not do_ks: - logging.info('All paralog data already exist, nothing to do') - logging.info('Done') - return - - # software checks - logging.info('---') - logging.info('Checking external software...') - software = [] - if do_ks: - software += [aligner, codeml] - if weighting_method == 'fasttree': - software += ['FastTree'] - elif weighting_method == 'phyml': - software += ['phyml'] - if can_i_run_software(software) == 1: - logging.error('Could not run all required external software. Exiting.') - sys.exit(1) - - # input checks - if not cds_fasta: - logging.error('No CDS fasta file provided. Exiting.') - sys.exit(1) - - if not os.path.exists(output_dir): - logging.info(f'Creating output directory {output_dir}') - os.makedirs(output_dir) - - # read and translate CDS file - cds_sequences = None - protein_sequences = None - if do_ks: - logging.info(f'Translating CDS file {cds_fasta}...') - cds_sequences = read_fasta(os.path.abspath(cds_fasta)) - protein_sequences = translate_cds(cds_sequences) - - # REMOVED BLAST and MCL STEPS - - # whole paranome Ks analysis - if do_ks: - logging.info('---') - logging.info('Running whole paranome Ks analysis...') - - # tmp directory - cw_dir = os.getcwd() - if not os.path.exists(tmp_ks_paralogs): - os.mkdir(tmp_ks_paralogs) - os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir - - test_run = False - if test_run: - output_mcl_path = os.path.join(output_dir, f"{species_name.lower()}_ortho_gf_top5000_{rank_type}_rank.mcl.tsv") - else: - output_mcl_path = os.path.join(output_dir, f"{species_name.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv") - - results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, top, top_gfs_filtered_with_rank, - tmp_ks_paralogs, output_dir, codeml, times=codeml_times, aligner=aligner, - ignore_prefixes=False, min_length=min_msa_length, pairwise=pairwise, - max_gene_family_size=max_gene_family_size, method=weighting_method, - preserve=False, n_threads=n_threads) - if os.path.exists(tmp_ks_paralogs): - logging.error(f'tmp directory {tmp_ks_paralogs} still exists after analysis, ' - f'paralog Ks data may be incomplete and will not be written to paralog Ks file!') - logging.error('Please delete the tmp directory and rerun the analysis. Exiting.') - sys.exit(1) - if isinstance(results_df, type(None)) or results_df.empty: - logging.warning('No paralog Ks data computed, will write empty paralog Ks file!') - with open(os.path.join(output_dir, output_ks_file), 'w+') as o: - o.write(results_df.round(5).to_csv(sep='\t')) - # change back to current directory as tmp dir got deleted and subsequent os.getcwd() may fail - os.chdir(cw_dir) - - logging.info('---') - logging.info('Done') - - - -def can_i_run_software(software): - """ - Copied and modified from wgd.utils - - Test if external software is executable - :param software: list or string of executable(s) - :return: 1 (failure) or 0 (success) - """ - if type(software) == str: - software = [software] - ex = 0 - for s in software: - # codeml needs input otherwise it prompts the user for input, so a dummy - # file is created - if s == 'codeml': - tmp_file = "codeml.ctl" - command = ['codeml', tmp_file] - elif s == 'prank': - command = [s, '--help'] - elif s == 'FastTree': - command = s - elif s in ['blastp', 'makeblastdb', 'blast', 'muscle', 'i-adhore']: - command = [s, '-version'] - else: - command = [s, '--version'] - try: - # logging.info(command) - if s == "codeml": - # Since the Nextflow pipeline processes multiple wgd runs at the same time, - # let's generate for each run the dummy "codeml.ctl" files within a different - # temporary directory named with a unique ID. This way, the several parallel - # wgd runs will not interfere with each other when creating and removing - # the dummy codeml.ctl file. - with tempfile.TemporaryDirectory(dir = ".") as tmp_dir: - with open(os.path.join(tmp_dir, tmp_file), 'w') as o: # write the codeml.ctl file in it - o.write('seqfile = test') - sp = subprocess.run(command, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # tmp_dir is removed both if the subprocess succeeds or fails, thanks to the "with" statement - else: - sp = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - out = sp.stdout.decode("utf-8").strip() - err = sp.stderr.decode("utf-8").strip() - if out: - logging.info(out.splitlines()[0]) - elif err: - logging.info(err.splitlines()[0]) - - except FileNotFoundError: - logging.error('{} executable not found!'.format(s)) - ex = 1 - return ex - - -def translate_cds(sequence_dict, skip_invalid=False): - """ - Copied and modified from wgd.utils - - Just another CDS to protein translater. Will give warnings when in-frame - stop codons are found, invalid codons are found, or when the sequence length - is not a multiple of three. Will translate the full sequence or until an - unspecified or in-frame codon is encountered. - :param sequence_dict: dictionary with gene IDs and CDS sequences - :param skip_invalid: bool, skip invalid CDS? (default translates to first - stop codon or end) - :return: dictionary with gene IDs and proteins sequences - """ - # TODO I should just use the Biopython translator - aa_dict = { - 'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', - 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', - 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', - 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', - 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', - 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', - 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', - 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', - 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', - 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', - 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', - 'TAC': 'Y', 'TAT': 'Y', 'TAA': '', 'TAG': '', - 'TGC': 'C', 'TGT': 'C', 'TGA': '', 'TGG': 'W', - } - protein_dict = {} - - j = 0 - total = 0 - for key, val in sequence_dict.items(): - j += 1 - aa_seq = '' - if len(val) % 3 != 0: - logging.warning('Sequence length != multiple of 3 for {}!'.format(key)) - total += 1 - invalid = False - for i in range(0, len(val), 3): - if val[i:i + 3] not in aa_dict.keys(): - logging.warning('Invalid codon {0:>3} in {1}'.format(val[i:i+3], key)) - invalid = True - total += 1 - break - else: - if aa_dict[val[i:i + 3]] == '' and i+3 != len(val): - logging.warning('In-frame STOP codon in {0} at position {1}:{2}'.format(key, i, i+3)) - invalid = True - total += 1 - break - aa_seq += aa_dict[val[i:i + 3]] - if invalid and skip_invalid: - continue - protein_dict[key] = aa_seq - - if total: - logging.warning("There were {} warnings during translation".format(total)) - return protein_dict - -def init_logging(log_heading, logging_level): - """ - Initialize the logging environment and print a header to the log. - - :param log_heading: heading for the log - :return: nothing - """ - logging.basicConfig(format='%(levelname)s\t%(message)s', level=logging_level, stream=sys.stdout) - length = math.ceil((len(log_heading)/2)) - logging.info('- ' * length) - logging.info(log_heading) - logging.info(datetime.datetime.today().ctime()) - logging.info('- ' * length) - - - - -def compute_ks_from_ortho_gfs(species, species_fasta_file, top, top_gfs_filtered_with_rank, \ - rank_type, ks_tsv_path, max_gene_family_size, n_threads, paralog_dists_dir): - """ - Function to be called in the main Python script. - """ - logging.info(f" Estimate Ks from the top {top} GFs ({rank_type} ranking)") - - if not os.path.exists(ks_tsv_path): - - ks_paralogs(species, species_fasta_file, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, max_gene_family_size=max_gene_family_size, \ - base_dir=paralog_dists_dir, n_threads=n_threads) diff --git a/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py b/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py deleted file mode 100755 index 70a89f6ee3f339b034fa6ebd11585f738c344b5f..0000000000000000000000000000000000000000 --- a/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py +++ /dev/null @@ -1,117 +0,0 @@ -from pandas import read_csv -from numpy import float64, arange -import matplotlib.pyplot as plt -import seaborn -import logging -import os -import sys -from numpy import zeros -import matplotlib -# Use the Agg (Anti-Grain Geometry) backend to avoid needing a graphical user interface (X11 backend) -matplotlib.use('Agg') - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -plt.style.use(os.path.join(f"{os.path.dirname(os.path.abspath(__file__))}", "ks.mplstyle")) - -def ks_list_from_tsv(tsv_file, max_ks): - """ - """ - with open(tsv_file, "r") as tsv_file: - tsv = read_csv(tsv_file, sep="\t") - - filtered_tsv = tsv.loc[(tsv["Ks"].dtypes == float64) & (tsv["Ks"] <= max_ks)] - - ks_list_filtered = filtered_tsv["Ks"].to_list() - weight_list_filtered = filtered_tsv["WeightOutliersExcluded"].to_list() - - return ks_list_filtered, weight_list_filtered - - -def generate_ks_plot(id, ids_names, top, rank_type, ks_list, weight_list, max_ks, bin_width, y_axis_lim, path_to_output_folder, ks_plot_filename): - """ - :param id: species id - :param ids_names: dictionary with scientific name associated to id - """ - # ks_per_species = read_csv(open(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_{id}_top{top}_{rank_type}.tsv"), "r"), sep="\t", header=None, index_col=0) - # weights_per_species = read_csv(open(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"weights_{id}_top{top}_{rank_type}.tsv"), "r"), sep="\t", header=None, index_col=0) - - # ks_list = ks_per_species.loc[id].tolist() - # weight_list = weights_per_species.loc[id].tolist() - fig, axis = generate_mixed_plot_figure(ids_names[id], max_ks, y_axis_lim, top, "corrected", False, False, top, rank_type, colinearity_data=False) - - bin_list = arange(0.0, max_ks + bin_width, bin_width) - hist = axis.hist(ks_list, bins=bin_list, weights=weight_list, histtype='stepfilled', color="0.4", - label="Reciprocally retained genes", density=False) - axis.legend() - fig.savefig(os.path.join(path_to_output_folder, f"{rank_type}_rank", ks_plot_filename), format="pdf") - plt.close(fig) - - -def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, top, corrected_or_not, correction_table_available, - plot_correction_arrows, top_gf, rank_type, paranome_data=True, colinearity_data=True): - """ - """ - species_escape_whitespaces = species.replace(' ', '\ ') - - # Increase figure size in height if extra space for the correction arrows is required and - # any divergence line will be actually plotted (thus, the "and" condition) - if correction_table_available: - if plot_correction_arrows: - # make the figure a bit taller to make room for the arrows - fig, ax = plt.subplots(1, 1, figsize=(14.0, 7.6)) - else: - fig, ax = plt.subplots(1, 1, figsize=(14.0, 7.0)) - else: - # if not correction_table_available a simpler less-wide layout with - # the legend inside the plot and no right margin is used - fig, ax = plt.subplots(1, 1, figsize=(10.0, 7.0)) - - fig.suptitle("$K_\mathregular{S}$" + f" distribution for ${species_escape_whitespaces}$") - - ax.set_title(f"Top {top} GFs from {rank_type} ranking") - - seaborn.despine(offset=10) - ax.set_xlabel("$K_\mathregular{S}$") - - ax.set_ylabel("Number of retained duplicates (weighted)") - - ax.set_xlim(0, x_max_lim) - if isinstance(y_max_lim, float): - y_max_lim = float(y_max_lim) - ax.set_ylim(0, y_max_lim) - plt.setp(ax.yaxis.get_majorticklabels(), rotation=90, verticalalignment='center') - if not correction_table_available: - # if not correction_table_available tighten the layout - # to reduce clipping - plt.tight_layout() - - return fig, ax - - -# ----------------------------------------------------------------------------- - - - -def construct_ks_plot_ortho_gfs(id, top, rank_type, ks_plot_filename, max_ks, bin_width, y_axis_lim, ids_names): - """ - """ - logging.info(f" Construct Ks distribution for the top {top} GFs ({rank_type} ranking)") - - path_to_output_folder = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "ks_distributions", id.lower(), "weighted_with_ortho_GFs") - if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank")): - os.makedirs(os.path.join(path_to_output_folder, f"{rank_type}_rank")) - - if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", ks_plot_filename)): - # Extract the Ks estimates from tsv file - ks_tsv = os.path.join("/", "home", "cesen", "wgm_predictor_midas", f"paralog_distributions", f"wgd_{id.lower()}", f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.ks.tsv") - # if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_{id}_top{top}_{rank_type}.tsv")) or \ - # not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"weights_{id}_top{top}_{rank_type}.tsv")) or \ - # not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_tsv_{id}_top{top}_{rank_type}.tsv")): - ks_list, weight_list = ks_list_from_tsv(ks_tsv, max_ks) - - # Build Ks plot - logging.info(" Plotting Ks distribution") - generate_ks_plot(id, ids_names, top, rank_type, ks_list, weight_list, max_ks, bin_width, y_axis_lim, path_to_output_folder, ks_plot_filename) - else: - logging.info(" Ks plot already present") \ No newline at end of file diff --git a/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py b/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py deleted file mode 100755 index e8242bc22421da0e214c8acd6b6d886260c0dbac..0000000000000000000000000000000000000000 --- a/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py +++ /dev/null @@ -1,118 +0,0 @@ -import logging -from pandas import read_excel, read_csv, concat, DataFrame -import os -import sys -import matplotlib -# Use the Agg (Anti-Grain Geometry) backend to avoid needing a graphical user interface (X11 backend) -matplotlib.use('Agg') - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -def get_top_gf_genes(top, rank_type, path_for_top_gf_tsv): - """ - :param top: number of top GFs considered for building the Ks distribution - """ - - # Get the ranked gene families - gf_rank = read_excel(open("gene_families.xlsx", "rb"), "gene family rankings", header=None) - - # Get the genes associated to all gene families - gf_genes_all_raw = read_excel(open("gene_families.xlsx", "rb"), "orthogroup member genes", header=None) - - # Expand each gene in the GFs as an independent column entry - gf_genes_all = gf_genes_all_raw.iloc[:,:2] # First two columns (GF ID and number of species) - genes_columns = gf_genes_all_raw.iloc[:,2].str.split(",", expand=True) # Process third column (genes) - gf_genes_all = concat([gf_genes_all, genes_columns], axis=1, ignore_index=True) - - if rank_type == "combined": - # Get the top Gfs from the COMBINED RANKING (lambda & blocks percentage) - gf_top = gf_rank.iloc[:top,1] - elif rank_type == "lambda": - # Get the top GFs from the LAMBDA-BASED RANKING - gf_top = gf_rank.loc[gf_rank.iloc[:, 8] <= top,].sort_values(axis=0, by=[8]).iloc[:,1] - - # Get the genes associated to the top GFs in both rank types - gf_genes_top = DataFrame() - for ortho_gf in gf_top.values: - gf_genes_top = gf_genes_top.append(gf_genes_all.loc[gf_genes_all.iloc[:,0] == ortho_gf.strip()], ignore_index=True) - - # Save the top GF together with their genes in a TSV file - with open (os.path.join(path_for_top_gf_tsv, f"gf_top{top}_{rank_type}_rank_conserved_order.tsv"), "w+") as f: - f.write(gf_genes_top.to_csv(sep="\t", header=False, index=False)) - return gf_genes_top - - -def top_gfs_in_mcl_format(top, id, gf_genes_top, rank_type, path_to_output_folder, mcl_like_filename): - """ - """ - ortho_gf_gene_lists = [] - gf_top_per_species = {} - gf_top_per_species[id] = DataFrame() - - for i in range(top): - # Move to next GF if the current GF contains no genes for the current species - if len(gf_genes_top.iloc[i][gf_genes_top.iloc[i].str.contains(id).fillna(False)]) == 0: - ortho_gf_gene_lists.append("") - continue - - # Get the genes associated to the current top GF - gene_list = gf_genes_top.iloc[i][gf_genes_top.iloc[i].str.contains(id).fillna(False)].reset_index(drop=True).str.split("|", expand=True)[1].dropna().tolist() - ortho_gf_gene_lists.append(gene_list) - - # Write the genes in a file, one GF per row (MCL format) - # ortho_gf_gene_lists.sort(key=len, reverse=True) - with open(os.path.join(path_to_output_folder, mcl_like_filename), "w+") as f: - for genes_in_current_family in ortho_gf_gene_lists: - f.write('\t'.join(genes_in_current_family) + "\n") - - - -def get_top5000_gfs_mcl_format(id, top=5000): - """ - Function called by the main Python script - """ - - path_for_top_gf_tsv = ".." - - path_to_output_folder = os.path.join("..", "paralog_distributions", f"wgd_{id.lower()}") - if not os.path.exists(path_to_output_folder): - os.makedirs(path_to_output_folder) - - logging.info(f"Maximum number of gene families considered: top {top} GFs") - - for rank_type in ["combined", "lambda"]: - logging.info(f"- From {rank_type} ranking...") - - # Get the top XXXX GFs (ordered as in the current ranking) - logging.info(f" Getting the top {top} GFs") - if not os.path.exists(os.path.join("..", f"gf_top{top}_{rank_type}_rank_conserved_order.tsv")): - gf_genes_top = get_top_gf_genes(top, rank_type, path_for_top_gf_tsv) - else: - logging.info(f"- File gf_top{top}_{rank_type}_rank_conserved_order.tsv already exists") - gf_genes_top = read_csv(open(os.path.join("..", f"gf_top{top}_{rank_type}_rank_conserved_order.tsv"), "r"), sep="\t", header=None) - - logging.info("") - - # Generate MCL-like file with ORTHO GFs - logging.info(f" Generating the MCL-like file with top {top} ORTHO GFs") - mcl_like_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv" - if not os.path.exists(os.path.join(path_to_output_folder, mcl_like_filename)): - top_gfs_in_mcl_format(top, id, gf_genes_top, rank_type, path_to_output_folder, mcl_like_filename) - else: - logging.info(" (MCL-link file for ORTHO GFs already exists)") - - logging.info("") - - -def generate_mcl_like_file_for_new_species(id, top, rank_type, gf_list, assigned_gfs_file, mcl_like_filename): - """ - For the current new species, generate the mcl-like file for the gene families. - Each line of the file contains the genes belonging to a certain gene family, and the gene families - are sorted according to their reciprocally retention ranking (top 2000 gene families only are used). - """ - if not os.path.exists(mcl_like_filename): - mcl_like_output = [] - for gf in list(gf_list): - genes_assigned_to_current_gf = assigned_gfs_file.loc[assigned_gfs_file[1] == gf] - mcl_like_output.append(list(genes_assigned_to_current_gf.iloc[:, 0])) - DataFrame.from_records(mcl_like_output).to_csv(mcl_like_filename, sep="\t", header=False, index=False) diff --git a/ksrates/reciprocal_retention/ks.mplstyle b/ksrates/reciprocal_retention/ks.mplstyle deleted file mode 100755 index 9613ea1cbf593326293b40aaf4568234f7a3e23f..0000000000000000000000000000000000000000 --- a/ksrates/reciprocal_retention/ks.mplstyle +++ /dev/null @@ -1,18 +0,0 @@ -axes.titlesize : 16 -axes.labelsize : 19 -axes.linewidth : 1.2 -axes.labelpad : 14 -xtick.labelsize : 16 -xtick.major.size : 8 -xtick.major.width : 1.2 -xtick.major.pad : 8 -ytick.labelsize : 16 -ytick.major.size : 8 -ytick.major.width : 1.2 -ytick.major.pad : 8 -legend.frameon : False -legend.fontsize : 14 -figure.subplot.bottom : 0.15 -figure.subplot.left : 0.09 -figure.titlesize : 20 -savefig.pad_inches : 0.15 \ No newline at end of file diff --git a/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py b/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py deleted file mode 100755 index a77ef9cee3d32ac3f530ebbe79d38c42ea65f2bc..0000000000000000000000000000000000000000 --- a/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py +++ /dev/null @@ -1,149 +0,0 @@ -from get_top5000_gfs_mcl_format import get_top5000_gfs_mcl_format, generate_mcl_like_file_for_new_species -from compute_ks_from_ortho_gfs import compute_ks_from_ortho_gfs -from construct_ks_plot_ortho_gfs import construct_ks_plot_ortho_gfs -import sys -import os -import logging -from pandas import read_csv - -logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) - -logging.info("-----------------------------------------------") -logging.info("Ks distributions with reciprocally retained GFs") -logging.info("-----------------------------------------------") - -test_run = False - -if test_run: - # Species IDs - all_species_ids = ['Alyr', 'Atha', 'Atri', 'Brap', 'Cari', 'Ccaj', 'Clan', 'Cmel', \ - 'Cpap', 'Crub', 'Csat', 'Fves', 'Gmax', 'Grai', 'Hvul', 'Jcur', 'Ljap', 'Lusi', \ - 'Macu', 'Mesc', 'Mtru', 'Osat', 'Pbre', 'Pdac', 'Pmum', 'Pper', 'Ptri', 'Rcom', \ - 'Sbic', 'Sita', 'Slyc', 'Stub', 'Tcac', 'Tpar', 'Vvin', 'Zmay'] - - ids_names = {'Alyr':'Arabidopsis lyrata', 'Atha': 'Arabidopsis thaliana', 'Atri':'Amborella trichopoda', \ - 'Bdis':'Brachypodium distachyon', 'Brap':'Brassica rapa', 'Cari':'Cicer arietinum', 'Ccaj':'Cajanus cajan', \ - 'Clan':'Citrullus lanatus', 'Cpap':'Carica papaya', 'Crub':'Capsella rubella', 'Csat':'Cucumis sativus', \ - 'Fves': 'Fragaria vesca', 'Gmax':'Glycine maxima', 'Grai':'Gossypium raimondii', 'Jcur':'Jatropha curcas', \ - 'Ljap':'Lotus japonicus', 'Lusi':'Linum usitatissiumum', 'Macu':'Musa acuminata', 'Mesc': 'Manihot esculenta', \ - 'Mtru':'Medicago truncatula', 'Osat': 'Oryza sativa', 'Pbre':'Pyrus bretschneideri', 'Pdac':'Phoenix dactylifera', \ - 'Pmum':'Prunus mume', 'Pper':'Prunus persica', 'Ptri':'Populus trichocarpa', 'Rcom':'Ricinus communis', \ - 'Sbic':'Sorghum bicolor', 'Sita':'Setaria italica', 'Slyc':'Solanum lycopersicum', 'Stub':'Solanum tuberosum', \ - 'Tcac':'Theobroma cacao', 'Tpar':'Thellungiella parvula', 'Vvin': 'Vitis vinifera', 'Zmay':'Zea mays'} - - id_list = ["Atha", "Pdac", "Vvin"] - -else: - all_species_ids = ["CMFF", "GEHT", "HJMP", "JETM", "JTQQ", "KEGA", "KNMB", "KZED", "MYMP", \ - "NXOH", "OHAE", "OQHZ", "PEZP", "PJSX", "QZXQ", "RKFX", "RKLL", "RMWJ", \ - "SLYR", "SUAK", "TTRG", "TVSH", "VHZV", "VLNB", "XOOE", "ZCDJ"] - - ids_names = { - "RMWJ" : "Wisteria floribunda", "CMFF" : "Lupinus polyphyllus", "GEHT" : "Gleditsia triacanthos", \ - "HJMP" : "Astragalus membranaceus", "JETM" : "Bauhinia tomentosa", "JTQQ" : "Glycyrrhiza lepidota", \ - "KEGA" : "Glycine soja", "KNMB" : "Lathyrus sativus", "KZED" : "Senna hebecarpa", \ - "MYMP" : "Astragalus propinquus", "NXOH" : "Apios americana", "OHAE" : "Polygala lutea", \ - "OQHZ" : "Quillaja saponaria", "PEZP" : "Glycyrrhiza glabra", "PJSX" : "Acacia pycnantha", \ - "QZXQ" : "Gymnocladus dioicus", "RKFX" : "Cercis canadensis", "RKLL" : "Copaifera officinalis", \ - "RMWJ" : "Wisteria floribunda", "SLYR" : "Cladrastis lutea", "SUAK" : "Codariocalyx motorius", \ - "TTRG" : "Lupinus angustifolius", "TVSH" : "Bituminaria bituminosa", "VHZV" : "Gleditsia sinensis", \ - "VLNB" : "Gompholobium polymorphum", "XOOE" : "Desmanthus illinoensis", "ZCDJ" : "Acacia argyrophylla"} - - id_list = [sys.argv[1].upper()] - -# ANALYSIS SETTINGS - -top = 2000 -rank_type = "lambda" - -filter_by_false_predictions = False -filter_by_zero_false_predictions = False - -basefolder_sequence_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "sequences") -basefolder_db_and_files = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "databases_and_gf_files") - -pident_threshold = 40 - -# PARAMETERS -y_axis_lim = float(300) -n_threads = 8 - -max_ks = 5 -bin_width = 0.1 -max_gene_family_size = 200 -paralog_dists_dir = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", "") - -for id in id_list: - logging.info(f"Analyzing species: {ids_names[id]}\n") - - basefolder_output_files = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "fabales_1kp", id, rank_type) - - path_to_parlogs_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", f"wgd_{id.lower()}") - if not os.path.exists(path_to_parlogs_data): - os.makedirs(path_to_parlogs_data) - - # for top in [2000]: # [50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000]: - if not filter_by_false_predictions and not filter_by_zero_false_predictions: - logging.info(f"Analyzing the top {top} GFs ({rank_type} rank):") - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv") - gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() - assigned_gfs_file = read_csv(os.path.join(basefolder_output_files, f"assigned_gfs_{id}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv"), sep="\t", header=None) - - elif filter_by_false_predictions: - logging.info(f"Analyzing the top {top} GFs ({rank_type} rank) after filtering away those with false predictions >= half of the true positives") - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") - gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() - - elif filter_by_zero_false_predictions: - logging.info(f"Analyzing the top {top} GFs ({rank_type} rank) after filtering away those with at least one false prediction") - top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") - gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() - - # GENERATE OR OPEN MCL-LIKE FILE - logging.info("Generating mcl-like output per gene family...") - if test_run: - # Get the top 5000 GFs (maximum number of GFs considered) - get_top5000_gfs_mcl_format(id) - else: - mcl_like_filename = os.path.join(path_to_parlogs_data, f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv") - generate_mcl_like_file_for_new_species(id, top, rank_type, gf_list, assigned_gfs_file, mcl_like_filename) - - logging.info("") - - # for rank_type in ["lambda"]: # ["combined", "lambda"]: - logging.info(f"- From {rank_type} ranking...") - - # Estimate Ks from the top XX GFs (current rank type)") - if test_run: - species_fasta_file = os.path.join(basefolder_sequence_data, "old_genome_versions", f"{id.lower()}.fasta") - else: - species_fasta_file = os.path.join(basefolder_sequence_data, "new_species_data", f"{id.upper()}.fna") - - - - if not filter_by_false_predictions and not filter_by_zero_false_predictions: - ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.ks.tsv" - - elif filter_by_false_predictions: - ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank_filtered_false_predictions.ks.tsv" - - elif filter_by_zero_false_predictions: - ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank_filtered_zero_false_predictions.ks.tsv" - ks_tsv_path = os.path.join(path_to_parlogs_data, f"wgd_{id.lower()}", ks_tsv_filename) - compute_ks_from_ortho_gfs(id, species_fasta_file, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, max_gene_family_size, n_threads, paralog_dists_dir) - logging.info("") - - - - # Construct Ks distribution for the top XX GFs (current rank type) - if not filter_by_false_predictions and not filter_by_zero_false_predictions: - ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}.pdf" - - elif filter_by_false_predictions: - ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}_filtered_false_predictions.pdf" - - elif filter_by_zero_false_predictions: - ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}_filtered_zero_false_predictions.pdf" - construct_ks_plot_ortho_gfs(id, top, rank_type, ks_plot_filename, max_ks, bin_width, y_axis_lim, ids_names) - logging.info("") - logging.info("Done") diff --git a/ksrates/wgd_paralogs.py b/ksrates/wgd_paralogs.py index 09f4f8915d4c0a3dd892fc31dd4db2f64d20be94..d59cd468cb074dfc50c2c212f2b1f3d7dd321af3 100644 --- a/ksrates/wgd_paralogs.py +++ b/ksrates/wgd_paralogs.py @@ -67,37 +67,42 @@ def wgd_paralogs(config_file, n_threads): logging.error("Please add the missing information to the configuration file and rerun the analysis. Exiting.") sys.exit(1) logging.info("Completed") + logging.info("") # Creating folder for output files of wgd paralog pipeline paralog_dists_dir = os.path.join("paralog_distributions", "") if not os.path.isdir(paralog_dists_dir): logging.info(f"Creating directory [{paralog_dists_dir}]") + logging.info("") os.makedirs(paralog_dists_dir) # ----------------------------------------------------------------------------- # ESTIMATING PARANOME Ks VALUES - if paranome or colinearity: # prerequisite for colinearity pipeline, but not for reciprocally retention pipeline - logging.info("Running wgd reciprocal retention paralog Ks pipeline...") + if paranome or colinearity: # prerequisite for colinearity pipeline, but not for reciprocally retention pipeline + logging.info("Running wgd whole-paranome Ks pipeline...") fc_wgd.ks_paralogs(species, species_fasta_file, max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads) + logging.info(datetime.datetime.today().ctime()) + logging.info("") # ESTIMATING RECIPROCALLY RETAINED GENE FAMILIES Ks VALUES if reciprocal_retention: - logging.info('---') - logging.info("Running wgd whole-paranome Ks pipeline...") + logging.info(f"Running wgd reciprocal retention Ks pipeline ({rank_type} ranking)...") fc_wgd.ks_paralogs_rec_ret(species, species_fasta_file, latin_name, top=top, rank_type=rank_type, pident_threshold=pident_threshold, min_qcov_threshold=min_qcov_threshold, filter_false_predictions=filter_false_predictions, filter_zero_false_predictions=filter_zero_false_predictions, max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads) + logging.info(datetime.datetime.today().ctime()) + logging.info("") # EXTRACTING COLLINEARITY ANCHOR PAIRS Ks VALUES if colinearity: - logging.info('---') logging.info("Running wgd collinearity Ks pipeline...") fc_wgd.ks_colinearity(species, gff, base_dir=paralog_dists_dir, gff_feature=gff_feature, gff_gene_attribute=gff_gene_attribute, n_threads=n_threads) + logging.info(datetime.datetime.today().ctime()) + logging.info("") - logging.info(datetime.datetime.today().ctime()) - logging.info("Done") + logging.info("All done") diff --git a/ksrates_cli.py b/ksrates_cli.py index 11b2b14f17e042c42ebb463cea62009263bf19c2..c6e0817a2a08353603a4d813c279d476f2605871 100644 --- a/ksrates_cli.py +++ b/ksrates_cli.py @@ -118,7 +118,7 @@ def orthologs_adjustment(config_file, trios): @click.option("--adjustment-table", type=click.Path(exists=True), help="User-defined path to file containing adjustment results (default: rate_adjustment/species/adjustment_table_species.tsv)") @click.option("--paranome-table", type=click.Path(exists=True), help="User-defined path to file containing paranome Ks (default: paralog_distributions/wgd_species/species.ks.tsv)") @click.option("--anchors-table", type=click.Path(exists=True), help="User-defined path to file containing anchor pair Ks (default: paralog_distribution/wgd_species/species.ks_anchors.tsv)") -@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret.tsv)") +@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret_top_2000_lambda.tsv)") def plot_paralogs(config_file, adjustment_table, paranome_table, anchors_table, reciprocal_retention_table): """ Plots rate-adjusted mixed paralog-ortholog Ks distribution. @@ -182,13 +182,14 @@ def plot_orthologs(config_file, trios): @click.argument('config_file', type=click.Path(exists=True)) @click.option("--paranome-table", type=click.Path(exists=True), help="User-defined path to file containing paranome Ks (default: paralog_distributions/wgd_species/species.ks.tsv)") @click.option("--anchors-table", type=click.Path(exists=True), help="User-defined path to file containing anchor pair Ks (default: paralog_distribution/wgd_species/species.ks_anchors.tsv)") +@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distribution/wgd_species/species.ks_rec_ret_top_2000_lambda.tsv)") @click.option("--adjustment-table", type=click.Path(exists=True), help="User-defined path to file containing adjustment results (default: rate_adjustment/species/adjustment_table_species.tsv)") @click.option("--anchorpoints", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file anchorpoints.txt (default: paralog_distributions/wgd_species/species_i-adhore/anchorpoints.txt)") @click.option("--multiplicons", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file multiplicons.txt (default: paralog_distributions/wgd_species/species_i-adhore/multiplicons.txt)") @click.option("--segments", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file segments.txt (default: paralog_distributions/wgd_species/species_i-adhore/segments.txt)") @click.option("--list-elements", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file list_elements.txt (default: paralog_distributions/wgd_species/species_i-adhore/list_elements.txt)") @click.option("--multiplicon-pairs", type=click.Path(exists=True), help="User-defined path to i-ADHoRe file multiplicons_pairs.txt (default: paralog_distributions/wgd_species/species_i-adhore/multiplicons_pairs.txt)") -def paralogs_analyses(config_file, paranome_table, anchors_table, adjustment_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): +def paralogs_analyses(config_file, paranome_table, anchors_table, reciprocal_retention_table, adjustment_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs): """ Reconstructs potential WGD peaks in the paralog Ks distributions. @@ -206,6 +207,8 @@ def paralogs_analyses(config_file, paranome_table, anchors_table, adjustment_tab click.format_filename(paranome_table) if anchors_table: click.format_filename(anchors_table) + if reciprocal_retention_table: + click.format_filename(reciprocal_retention_table) if adjustment_table: click.format_filename(adjustment_table) if anchorpoints: @@ -218,9 +221,8 @@ def paralogs_analyses(config_file, paranome_table, anchors_table, adjustment_tab click.format_filename(list_elements) if multiplicon_pairs: click.format_filename(multiplicon_pairs) - paralogs_analyses_methods(config_file, paranome_table, anchors_table, adjustment_table, - anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs) - + paralogs_analyses_methods(config_file, paranome_table, anchors_table, reciprocal_retention_table, + adjustment_table, anchorpoints, multiplicons, segments, list_elements, multiplicon_pairs) # For debugging diff --git a/test/config_elaeis.txt b/test/config_elaeis.txt index e3da2f89b0bce87b59db162a1f374df00b88d5f5..19b5760989a2208027e894cc9790a4f32a19fd3c 100755 --- a/test/config_elaeis.txt +++ b/test/config_elaeis.txt @@ -20,6 +20,7 @@ ks_list_database_path = ortholog_ks_list_db.tsv [ANALYSIS SETTING] paranome = yes collinearity = yes +reciprocal_retention = yes # analysis type for paralog data; allowed values: 'yes' or 'no' gff_feature = mrna diff --git a/test/config_expert.txt b/test/config_expert.txt index 16a74c6bb8ffaf39ff44a8df6d3303190338a37b..0340662ec0fe595a9d5a7252a9076932190e58fe 100644 --- a/test/config_expert.txt +++ b/test/config_expert.txt @@ -8,4 +8,11 @@ num_mixture_model_initializations = 1 extra_paralogs_analyses_methods = yes max_mixture_model_components = 3 max_mixture_model_ks = 5 -max_gene_family_size = 200 \ No newline at end of file +max_gene_family_size = 200 + +top_reciprocally_retained_gfs = 2000 +reciprocal_retention_rank_type = lambda +pident_threshold = 40 +min_qcov_threshold = 50 +filter_by_false_predictions = no +filter_by_zero_false_predictions = no \ No newline at end of file diff --git a/test/elaeis.fasta b/test/elaeis.fasta index 1dbfafaf7eeff375343db21c10f03d570401c043..8c170043b35b84c21f1aea26b2b146c4012a87b4 100644 --- a/test/elaeis.fasta +++ b/test/elaeis.fasta @@ -998,3 +998,11 @@ ATGCCAATCTCTTATGCAAACTCTGGTGACAGTCTTGCAGTTTTTCCACGAAGAATTGAGTTCGATTCCGACATCATTGC ATGGTTTCCCCAGAAGTCATCCGGACCGCGGTCGGGATTCTAGGAAATGCCATTGCACTAGGGTTGTTCTTGTCCCCAGTGCCAACTTTCATGAGAATACGGAAGAAGGGTTCGGTGGAGGAATTCTCACCATTCCCATACCTAGCCACCCTCCTCAACTGCATGATGTGGGTGGTGTATGGGCTGCCCATGGTGCACCCCCATAGCATCCTGGTCATAACCATTAATGGGTCAGGAGTTATCATAGAGCTCTCCTTCGTGCTTTTCTTCCTCCTCTACTCCCAGGGGAGGAAGAGATTCTTGGTGCTTGCGATGCTGCTCGCTGAGATCGCCGTCGTCGGCATCGTCACCCTGCTTGTACTGACACTTGCCCACACATATGACCGTCGCTCCTTGATCGTTGGCGTTGTCTGCGTCTTCTTGTGTGCCATGATGTATGCTTCTCCCTTATCCGTCATGAAAATGGTGATTCAGACAAGGAGCGTGGAATTCATGCCTCTCTTTCTCTCGCTGGCCTCTTTCTTCAACGGCACTTGCTGGACAGCCTATGCTCTCATCCGTTTCGACCCATACATCATTATTCCAAATGCCCTTGGAGTAATCTTTGCGGTGGCACAATTGATATTGCATGTGGTGTACCGCAAGTCCACCAAGGAACAGTTAGAGGCGAAGAAGGCGGCCCCGGCAGCAGATATGGGATTGGCCGAGGTGGTGGTGGTGAAAGAAGAGACAAGCAAGATGAGCAATGCATCTTCTAATGGCTTCTAA >p5.00_sc00001_p0500.1 ATGGAAAGATCGAGGTCGAAGAGGAGCTACCATTACGAACACGACTCCAACTCCCAAAGCCCTCCCCACCGCTCCAAGCCCCGCTACGATGGCGGCCACCACCGCCGCGGAAACCACCACCGCCGGGGCGGCGACCGCCGGGCCCCGCCCCCTCCGCCTCCCCCTCCCCCGCCACCGGTGCCGGGGGCGGCGGCGCTGGCCTCCTCATCCTCTGCCTCCTCCCCCAAGATCCAGGACGGCGCGGGGCCACCCAGCGCGGTTACCACCTTTTTCCGCATTCTGTGCCCCGACTCGAAGGTCGGCGGCGTGATCGGCAAGTCCGGGAGCATCATCAAGAACTTCCGTCAGCAGACCGGCGCGTGGATCAACGTCCACCAGCTGGTCCTCGGTGACGACGAGCGGATCATCGAGACGGCGGACAACCGCCGCCGGGAGCCCGACGGCCGGCCCCCGCAGTACTCCCCAGCCCAGGAGGCCCTGCTGCTGATCCACGAGCGGATCGTGGATGCAGAGTTCGAGTATAGCGGCGGCGGGGACGAGGAGGACGAGTATAGCGCGGGCGGCAGTGGTGGAGGCGGAGGAGGGTGGGGCGGTGGAGGAGGAGAGAGGCGAAGGGATAGGGGACGGGTAACGACGAGGCTGGTGGTGCCGAGGACACACGTCGGGTGCTTGCTTGGGAGGGGAGGGAAGATCATTGAGCAGATGAGGATCGAGACCAAAACCCACATCCGAATCTTGCCGCGGGATCAGTACACTCCTCAGTGTGTCTCGACTTCAGAAGAAGTCGTTCAGGTTGTTGGAGAAGGTAATTGTGTGAAGAAAGCTGTTGCAACCATCTCATCTCGTTTAAAGGAGAGCTTGCATCGTGATCGTGGAGCTTTTCGTGGGCGATTGCATTCACCAGACCATTACATTACCCCTGATGATGAGTTTGCCAACAGTACGCAACATGCATTGACAGTGGAAGAATCTGATTTGGGGTCACGATCTTCTGTGGGACAAAATAGAGCTAGAATCAATTCTTATAGTTCAGAACCATCTGGATATGCATTTGATTCTGATGGAAATACTCTGAATGACCACTCACAGTCATTATCCTATGAGGACCTTGTGTTTCGAATTCTTTGCCCCAATGATAAAGTTGAGAGTGTTATGGAAGCACCAAATGGAATTATTGAAATGCTCCGAGCTGATATAGGTGTGGATGTGCAGGTTACTGATCCTGTACCCGGTTCTGATGAAAGAATCATAATCATCACTTCTGAAGAGGGCCCGGATGATGATCTTTTTCCAGCTCAGGAAGCTTTGTTGCATATTCAAACTCACATTGTAGACCTTGGTCCTGACAAGGACAACATCATAACAACAAGGCTTCTTGTTCCAGCAACAGAAATTGCTTGTTTGGAGGGGAAGGATGGATCATTATCTGATATACAAAGATTCACTAATGCCAATGTACAAATCTTACCAAAAGAGGATCTTCCCCCGTGTGCATTGGAGGCTGATGAACTTGTACAGATTGTAGGGGAGATCAGAGCAGCTCGAAATGCTCTTGTCCATGTTACTGCAAAGTTGCGGAGTTATTTGTATCGTGACATCTCTGGACCAAAGGATATGCTTCCACCATCTATTGCTGCCCCAAGCCATGTTGGTAGCATTGGTGGGCATGAATCTGGCTCTCCTATTAAAGCTTCAACTCGTGAAACTTATCTGGGAAGTGATCCTCCCATTGCAATGTATCAAAACATGCATACTGCAACAACTGCCTGGCAACCAAAGGACACTGGAGGATGTGCAAGTGGATCATTTGAGCAGGAAGAGAGCAATGTTAATGATGAAGGGAGGCAGAGTGGTGTGAAAAGACTCAGTGTACCGCTAATCACCAGAAGCACACTGGAAGTTATAATACCCAGACATGCAGTTCCAAGCCTGATATTGAGATCAGGAAGCAAGCTTGCTCAGATCAGTGAGATGTCTGGGGCAACTGTCACCCTTATTGAAGATAAATCTGAATTGGCTGAAAAGGTTGTTCAAATATCAGGAAGTCCAGAGCAGGCTGAGAGAGCCCAGAGCCTGCTCCAGGGATTTATACTTAGTACACAAGATGACATTCCTTCGAGCTAA +>p5.00_sc00001_p0707.1 +ATGCCATCTCTCTGCTCCCCGTATGATTCTCAGCTGAATGATCGGCAATGTGAACCTCAAGAGGTGTCAATCACCGATGCACCAGTGGTTTGGCAGAACTACACGTGCGAAGTTTCGGTTTCAGGTTTATGTATCAGCAGTGGAAGGATTACTCCAGATATATATTTTCATCTGTCAGAGGCAGTAAATGCGAGCTATGCCCTCTACCACTACACTCCGCTATTACTCAGTCTTCAGGATTGCAGATTCGTCCGAGAAACGTTCGACACCATCACCACAAACTACTGCCCCAATCTGGAGCACGACCTCCGCATGGTCAGTGTTGGACTGGCCCTGATATCCATCGGGGTCTTGCTCTGCCTCATCTTGTGGATTTTCTATGCGAACCGCCCCCAAAGGGAGGAGGTGTTTGCGCCGCTGTCTGAAGTGAAGTTTGCTCCAAATAGCAACATTTCCGTAAGAATCGATGTATCATCCTAG +>p5.00_sc00001_p0708.1 +ATGGATGCCCGCGGCGCCGCCGGCTCCGGAGGAAGCGGCGGAAGGGGGATCGCCTTCTGCCTCCTCGTCTTGGTCTCTCTTTTCGTGGCCTCCGAGGCGTTGCCGGCTCTCGCTGGTGGAGAGGAAGGCGCTAGGAAGATTAGCCTAGATTTAAAGAGTTTGCCGGCATGGGCGAAAGATCTCTTAAAGTTGACACCAATGGCACCAGAACCGACAACTTCGGCTCCAAGTGCAAAATTCCCGCTGGTTTTAGCGGCAAACAGAACAAGGAGGCCGGATGTGCTTGATAACTTCAGAATGTATGGCGGAGGTTGGAACATTACTAACAAGCATTACTGGGTGTCTGTTGCGTTTACAGGAGCTGCTGGATTTGTTCTTTCAGCATTGTGGTTTGTTTCTTTTGGGTTGGTTTTAGGTGTACATCTCTGCTGTAGACGGGGAATGGGAACCAAAGAGGGAGGATCATTCTGTGCACGACGTATTTGTCTTGCATTGTTGATGCTCTTCACATGTGCTGCAGCGACCGGGTGCATTCTTCTCTCTGTGGGCCAGAATGAGTTCCATGAGGAAGTTTTGGATACTTTAAAGTTTGTTGTCAATCAATCAGATTTCACTATTCAGATACTAAGGAATGTCACAGATTTTCTGTCCTCTGCCAAGACAATTAATGTGGCAGAGGTCTATCTCCCGCCGGAAATTTGGAATGAAATTGACAAGTTAAATGGAGAGTTAAATGGTGCAGCTAGTATCCTATCAGAGAAAACAAATGAAAGTTCTGAGAAAATCAGAGCCGTCATCGATGACTTGCGTTGCATATTGATTGTGGTAGCATCACTGATGCTTTTGCTGGCCATACTTGGTTTCTTGCTTTCAGTTTGGGGTCACAAACATGCAATTTATGTATTTATAATGAGTGGTTGGCTCCTTGTGGCAGTTACCTTCATTCTCTATGGATTTTTTGTGATCCTTAATAAGAGATTGGCAAATGCATGTGCCAGGCAGACCTCTCATTCTGCACATACTGTGTGA +>p5.00_sc00002_p0130.1 +ATGGCGGACGAGCAATGGCGCGAGGAGGACTACTCCCAGCTGAGAGATCTCCGTGTGAGCCTCGATCAAGTAGGCGAAGACGGCGAAGGAAGAGGAGATGGGGGCGGTGATGGCGGCAGCGGCTTCTCCCTCTGCCTCTGGCTTTACCTCTCGAGCTCCGCTCGCCCTTCTTCCATGATCCTTCGACAGATGACTTCGGAAAATGAAGGTGAAGTTCCCTTTCTGGCCTTGAGTGAGCAAAACAAGCTGATGCTCTTTCCTTTGATGTTCTTACATAAAGAAGCTCCATCTACTGGCAGTTCCTTTACATGGACTGACATGCCCCACATATCTTCCAAGATTGAATTCCCACTGGAGAAATGGGTTCATGTTGGGTGTGAGGTAGCCACCCATCATATGCGCCTTCACATTGATGGAAAACTAGTTGGAGAGGAACCTGTAACACTCCTAACCAATAAGCATCACTATCGGGACAATTTCAAGAGAATAAATTTTGTTGGTAATGATGGGAAACTTGATGGTTATGTCTATCATGCTCAAGTTTTGCCTATATCAGCATCGATAACAGATCAGTTTGTAAAGGATCCGCCAGTCAAATTATCTCTTGATAGTTCAGGCATCTCTGATGGAGTTGAAGAATGTGGGGATGGTGTCTGGAGCATTGTTGGTGGCAAGGCATCTTGTCGCAGGAATTTTTCGCTGGATGTTGTTCTCTTAAATGCTTTTGGTCGGACTGTAAACAAGGAGATGGAGATTGTTGCTTCACTTGTATATGCCGATAATGGAACTCCTGTTGAAAAGACTAGAGATGATGCAGAAGCTCCATTACTGACGAGCTATGATGGACTTGAACACCCTTCCACGGACAGGCCTGTTACACTTCTTCGTGGATGTGCAACCTTTAAGCTGAAGATATCTCAGCTGTCTTCTAAATCTGACAATAGGCTGTTCCGTGTTTGTTTTCGTACAATGCATGCTCAAAGATATCCATTTCTAGAAGCATACTCCCATCCAATCCATTGCATCTCAAGGAATCGCAGTAACCGACCATTGGCTTTAGGGAAAAGATTGGCTTCCACAAGGGTGCTACTTGATGAAATTCAGTCACTCAAAGGCAATGATGGGTCTAGAGTAATTCATGATACCTATGGAGATGATCGTTTCCGCATCCCGAATCAGAGCGTATTGAGATGCAATACCCCATCAAAGCATTCTAAACTAGAACATGATAGCTCACCTTCGGCAATTGATGCTAATGGAATCTTGGAAAAGGATGAGATGTCACTGAAGACAAGTCTAGAAGTAAGGTCGAATAATTTTGAAGGAACGGATAGTGCACCATCAGATTCTGAGAGCACTGATGCTAGGAACTATGAATCCAGATGGACAAAAGAGGCTATGAACCCAATTTCAGATGCTATTATGTTTAAATACTGTTTAGAAGGCACAAATGAGCGGTTGATGTTGCTCAAAGAGATGATAACCTCTGCCTGTGATGAAGATATCACTAACTTCGCAGAGCAGGTTTCCTTGTATGCCGGATGTTCTCATCACCAATATCAAATATTGATTTCAAAGCGGTTGCTCCAAGAGGGGGCGGATACTTGGAACTCAATTACTCAGAACAACTGCCATGTTCTCTGGAAAGATGCAGTTCCTGAGATTGATAAAAGATTCAGGAACACTGCCCATTCTAACAGGGGTTTGTCAGGAGAGGACCTGGAGGTTTTGCGAGGAATTGCTGGATGTGGTGATCATTTGGGTCGAGAAGAATTTGATAGAATGTGGTACTGGTTATACCCTGTAGCTTTTTCTTTGTCGAACGAACAAATAAATACCATGTGGGCATGCATCTCACCCAAATGGATAGAAGGATTAATTACTAGGGAAGAAGCTGAGAATGCGCTGAAAGGTCCTAGAGGGCTGCAGAGACCAGGAACCTTTGTTCTCAGGTTTCCAACCTCTCGCAGCTGGCCTCACCCAGATGCTGGTAGCTTAGTTGTCACCTATGTCGCTGCTGATTCCACACTTCATCATAAGCTTCTTTCTTTTGATTATAGGAAAAAGGACACCAGCCCACTTCAAGAGTTGCTTTTGGAAGAGCCTGAATTATCGCATTTGGGAAGGCAAGTTACACTGAAATATTATTTCATGCTTGTCTATGGTGCTTTTGTCTGTAGTTAA +>p5.00_sc00002_p0131.1 +ATGAGGCCTCTCATCCCTCTCTCTCACACTACCATCCTCTCTCTCCTCCTCATTTCCTCCATTGCCAAGGCTTCCATCCTCGAAGAAACTTGCAAGAGCGTCGCGGCAGCCCGCCCCAACATCGGCTACGAATTTTGCGCGACATCTCTCCGAGCGGATCCAGGAAGCAGCTCCGCAGATCTCCATGGTCTCGCGGTCATCGCGACGAGATTGTCCCTGGCCGATGCTACAAACACGGAATCCAGAATAAATCAGCTGATGGTGGTGGAGTCCAAGAAGCCTTTCTTAAAAGACTGCCTCAGCGTTTGCTCAGAAGTGTACTCGGAAGCGATCGACCACTTGAACGACGCCAGCCGCAACCTGGAAGCCAGGCTCTACCGCGAGGCCGTGACATTTCTTAGCGCAGCACTCGACGCGCCTGATAACTGTGAGGATGCGTTCAGCGATGCCGGCAGTGGGGGGTTGTCGCCGCTGGCGAGGGAGGATGAGGATTATGGCAGGCTTGCTGACGTTGCTCTAACCATCACCGCGTCTCTGGGATGA