diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index 9a6eff34892d079f311dca1d6601b9f27cd482ee..3cf5eafb840a527d88cb7c183d570c203fb1f4b0 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -300,7 +300,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - clusters_sorted_by_median, cluster_color_letter_list = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) + clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, + bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) fcCluster.plot_clusters(ax_uncorr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) # Plotting the ortholog peaks coming from the adjustment, if available @@ -314,7 +315,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Removing the clusters that are poorly populated (Ks content <= 10%), very old (cluster median >= 3 Ks) or quite horizontally spread (IQR > 1.1 Ks) logging.info("") logging.info(f"Filtering away Ks clusters with unclear signal (poor Ks content, old Ks age or flat peak)...") - clean_clusters_of_ks = fcCluster.filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list) + clean_clusters_of_ks = fcCluster.filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, + cluster_color_letter_list, cluster_peak_heights) # ----------------------------------------------------------------------------- @@ -325,7 +327,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Saving the first plot with the final name (leaving away "unfiltered") logging.info(f"Saving mixed Ks plot with anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") - fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "second") + fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, + latin_names, correction_table_available, cluster_of_ks, output, "second") logging.info("") elif updated_n_clusters == 0: @@ -334,7 +337,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, else: # if one ore more clusters were removed logging.info(f"Saving mixed Ks plot with unfiltered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_UNFILTERED.format(species)}]") - fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "first") + fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, + latin_names, correction_table_available, cluster_of_ks, output, "first") logging.info("") @@ -358,7 +362,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Temporary not available because pomegranate is not on midas: #lognormal_clustered_medians2 = fcCluster.lognormalmm(clean_medians_list, updated_n_clusters) - filtered_cluster_of_ks, __, __, filtered_cluster_color_list = fcCluster.get_clusters_of_ks(gmm_clustered_medians2, clean_medians_list, clean_segment_medians_list, chosen_nonred_segment_pair_ks_list, "second") + filtered_cluster_of_ks, __, __, filtered_cluster_color_list = fcCluster.get_clusters_of_ks(gmm_clustered_medians2, clean_medians_list, + clean_segment_medians_list, chosen_nonred_segment_pair_ks_list, "second") # Generate the plot for the mixed distribution with clusters of Ks fig_corr_second, ax_corr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, @@ -384,7 +389,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, logging.info(f"Saving mixed Ks plot with filtered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") logging.info("") - fcCluster.save_anchor_cluster_plot(fig_corr_second, fig_uncorr_second, ax_corr_second, ax_uncorr_second, species, latin_names, correction_table_available, filtered_cluster_of_ks, output, "second") + fcCluster.save_anchor_cluster_plot(fig_corr_second, fig_uncorr_second, ax_corr_second, ax_uncorr_second, species, + latin_names, correction_table_available, filtered_cluster_of_ks, output, "second") logging.info("All done") diff --git a/ksrates/fc_cluster_anchors.py b/ksrates/fc_cluster_anchors.py index 1600bb71301919eab7251f538db1be57de0bedbb..417bd6a777cb74504a41d754ee49792b41a98529 100644 --- a/ksrates/fc_cluster_anchors.py +++ b/ksrates/fc_cluster_anchors.py @@ -405,13 +405,17 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre """ clusters_sorted_by_median, cluster_color_letter_list = assign_cluster_colors(cluster_of_ks) + cluster_peak_heights = {} + zorder_ID = 50 for cluster_id in clusters_sorted_by_median: median_of_the_cluster = median(cluster_of_ks[cluster_id]) __, kde_x, kde_y = fcPeak.compute_kde(cluster_of_ks[cluster_id], max_ks_para, bin_width) - mode_of_the_cluster_KDE, __ = fcPeak.get_mode_from_kde(kde_x, kde_y) + mode_of_the_cluster_KDE, peak_height_of_the_cluster_KDE = fcPeak.get_mode_from_kde(kde_x, kde_y) + + cluster_peak_heights[cluster_id] = peak_height_of_the_cluster_KDE # transparently color the area under the KDE curve kde_area_xy = [(kde_x[0], 0), *zip(kde_x, kde_y), (kde_x[-1], 0)] @@ -434,7 +438,7 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre cluster_color_letter_list[cluster_id][1], zorder_ID + 2, peak_stats, correction_table_available, plot_correction_arrows) zorder_ID += 10 - return clusters_sorted_by_median, cluster_color_letter_list + return clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter, zorder_ID, peak_stats, correction_table_available, plot_correction_arrows): @@ -497,7 +501,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter horizontalalignment='center', verticalalignment='center', clip_on=True, zorder=zorder_ID + 5, color='w') -def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list): +def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights): """ Removes the clusters that are likely to be just noise or that are too degenerated for the visualization. Criteria for degenerated clusters are: Ks content less than 10% of the total (poorly populated), @@ -515,22 +519,41 @@ def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluste tot_ks_list_in_clusters.extend(cluster_of_ks[cluster]) clean_clusters_of_ks = {} - for cluster in clusters_sorted_by_median: + for i in range(len(clusters_sorted_by_median)): + cluster = clusters_sorted_by_median[i] ks_list = cluster_of_ks[cluster] - # Get median, first quartile range, third quartile and interquartile range + # Get median, first quartile range, third quartile, interquartile range and height median_ks = percentile(ks_list, 50, interpolation="midpoint") q1 = percentile(ks_list, 25, interpolation="midpoint") q3 = percentile(ks_list, 75, interpolation="midpoint") iqr = q3 - q1 + height = cluster_peak_heights[cluster] evaluation = [] percentage_ks_content = len(ks_list) / tot_ks_in_clusters - if percentage_ks_content <= 0.10: # Discard clusters that are poorly populated ( less than 10% of the total Ks) + + # Discard if it looks like a small spurious peak close to 0 Ks + # If the current cluster is NOT the "oldest/last" cluster, compare its height with the successive cluster + j = i+1 + if j < len(clusters_sorted_by_median): + successive_cluster_id = clusters_sorted_by_median[j] + height_successive_peaks = cluster_peak_heights[successive_cluster_id] + # If it's lower than the successive (and older) peak, then it's very likely spurious, + # because younger peaks are (always) higher than older ones + if height < height_successive_peaks: + evaluation.append(f"smaller height than the successive WGD peak") + + + if percentage_ks_content <= 0.10: + # Discard clusters that are poorly populated ( less than 10% of the total Ks) evaluation.append(f"neglectable Ks content (< {round(10)}% of data)") - if median_ks >= 3: # Discard clusters whose median is very old - evaluation.append("high median (> 3 Ks)") - if iqr >= 1.1: # Discard clusters that tend to be flat, with no clear peak - evaluation.append("highly spread data (IQR > 1.1)") + if median_ks >= 3: + # Discard clusters whose median is very old (not reliable due to saturation and stochasticity) + evaluation.append("high median (>= 3 Ks)") + if median_ks >= 2.6 and iqr >= 1.1: + # Discard clusters that are widely spread with no clear peak because they make interpretation tricky; + # retain however those with young mode (< 2.6) because their large width could be just due to a long right tail up to e.g. 5 Ks + evaluation.append("highly spread data (IQR >= 1.1)") # If the clusters has no negative features, let's add it in the "cleaned list", otherwise just explain why is discarded cluster_letter = cluster_color_letter_list[cluster][1] @@ -543,6 +566,8 @@ def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluste logging.info(f"- Cluster {cluster_letter}: Ks content {round(percentage_ks_content * 100, 2)}%, median {round(median_ks, 2)}, IQR {round(iqr, 2)} --> Discarded due to {evaluation[0]} and {evaluation[1]}") elif len(evaluation) == 3: logging.info(f"- Cluster {cluster_letter}: Ks content {round(percentage_ks_content * 100, 2)}%, median {round(median_ks, 2)}, IQR {round(iqr, 2)} --> Discarded due to {evaluation[0]}, {evaluation[1]} and {evaluation[2]}") + elif len(evaluation) == 4: + logging.info(f"- Cluster {cluster_letter}: Ks content {round(percentage_ks_content * 100, 2)}%, median {round(median_ks, 2)}, IQR {round(iqr, 2)} --> Discarded due to {evaluation[0]}, {evaluation[1]}, {evaluation[2]} and {evaluation[3]}") return clean_clusters_of_ks