From 86968b8b0c8eae987f76242c2bcd90410e1377e6 Mon Sep 17 00:00:00 2001 From: cesen Date: Thu, 23 Dec 2021 12:19:29 +0100 Subject: [PATCH 1/2] Anchor clustering: improve filtering steps - The filtering steps are meant to remove anchor clusters that are likely to be artefact or are very unclear. - New filtering criterion: if a cluster is lower in height than the successive cluster, it is discarded. The motivation is that any WGD peak is generally taller than the successive and older WGD peak in the distribution, due to gene loss and Ks stochasticity (increased variabilty in substitution accumulation) among paralogs. - Modified existing filtering criterion: widely spread clusters (IQR >= 1.1) are now discarded only if their median Ks is old (>2.6); this change comes from the fact that some clusters whose median was actually acceptable (e.g. 2) were discarded because they were covering a wide range with a tail up to 5 Ks. Such species were Poaceae like rice, sorghum, were there is a wide sigma-tau signal between 2 and 3. --- ksrates/cluster_anchor_ks.py | 18 +++++++++----- ksrates/fc_cluster_anchors.py | 45 +++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index 9a6eff3..d0e82a1 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -300,7 +300,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - clusters_sorted_by_median, cluster_color_letter_list = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) + clusters_sorted_by_median, cluster_color_letter_list, clusters_sorted_by_peak_height = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, + bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) fcCluster.plot_clusters(ax_uncorr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) # Plotting the ortholog peaks coming from the adjustment, if available @@ -314,7 +315,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Removing the clusters that are poorly populated (Ks content <= 10%), very old (cluster median >= 3 Ks) or quite horizontally spread (IQR > 1.1 Ks) logging.info("") logging.info(f"Filtering away Ks clusters with unclear signal (poor Ks content, old Ks age or flat peak)...") - clean_clusters_of_ks = fcCluster.filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list) + clean_clusters_of_ks = fcCluster.filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, + cluster_color_letter_list, clusters_sorted_by_peak_height) # ----------------------------------------------------------------------------- @@ -325,7 +327,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Saving the first plot with the final name (leaving away "unfiltered") logging.info(f"Saving mixed Ks plot with anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") - fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "second") + fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, + latin_names, correction_table_available, cluster_of_ks, output, "second") logging.info("") elif updated_n_clusters == 0: @@ -334,7 +337,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, else: # if one ore more clusters were removed logging.info(f"Saving mixed Ks plot with unfiltered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_UNFILTERED.format(species)}]") - fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "first") + fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, + latin_names, correction_table_available, cluster_of_ks, output, "first") logging.info("") @@ -358,7 +362,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, # Temporary not available because pomegranate is not on midas: #lognormal_clustered_medians2 = fcCluster.lognormalmm(clean_medians_list, updated_n_clusters) - filtered_cluster_of_ks, __, __, filtered_cluster_color_list = fcCluster.get_clusters_of_ks(gmm_clustered_medians2, clean_medians_list, clean_segment_medians_list, chosen_nonred_segment_pair_ks_list, "second") + filtered_cluster_of_ks, __, __, filtered_cluster_color_list = fcCluster.get_clusters_of_ks(gmm_clustered_medians2, clean_medians_list, + clean_segment_medians_list, chosen_nonred_segment_pair_ks_list, "second") # Generate the plot for the mixed distribution with clusters of Ks fig_corr_second, ax_corr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, @@ -384,7 +389,8 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, logging.info(f"Saving mixed Ks plot with filtered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]") logging.info("") - fcCluster.save_anchor_cluster_plot(fig_corr_second, fig_uncorr_second, ax_corr_second, ax_uncorr_second, species, latin_names, correction_table_available, filtered_cluster_of_ks, output, "second") + fcCluster.save_anchor_cluster_plot(fig_corr_second, fig_uncorr_second, ax_corr_second, ax_uncorr_second, species, + latin_names, correction_table_available, filtered_cluster_of_ks, output, "second") logging.info("All done") diff --git a/ksrates/fc_cluster_anchors.py b/ksrates/fc_cluster_anchors.py index 1600bb7..4f120b5 100644 --- a/ksrates/fc_cluster_anchors.py +++ b/ksrates/fc_cluster_anchors.py @@ -405,13 +405,17 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre """ clusters_sorted_by_median, cluster_color_letter_list = assign_cluster_colors(cluster_of_ks) + clusters_sorted_by_peak_height = {} + zorder_ID = 50 for cluster_id in clusters_sorted_by_median: median_of_the_cluster = median(cluster_of_ks[cluster_id]) __, kde_x, kde_y = fcPeak.compute_kde(cluster_of_ks[cluster_id], max_ks_para, bin_width) - mode_of_the_cluster_KDE, __ = fcPeak.get_mode_from_kde(kde_x, kde_y) + mode_of_the_cluster_KDE, peak_height_of_the_cluster_KDE = fcPeak.get_mode_from_kde(kde_x, kde_y) + + clusters_sorted_by_peak_height[cluster_id] = peak_height_of_the_cluster_KDE # transparently color the area under the KDE curve kde_area_xy = [(kde_x[0], 0), *zip(kde_x, kde_y), (kde_x[-1], 0)] @@ -434,7 +438,7 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre cluster_color_letter_list[cluster_id][1], zorder_ID + 2, peak_stats, correction_table_available, plot_correction_arrows) zorder_ID += 10 - return clusters_sorted_by_median, cluster_color_letter_list + return clusters_sorted_by_median, cluster_color_letter_list, clusters_sorted_by_peak_height def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter, zorder_ID, peak_stats, correction_table_available, plot_correction_arrows): @@ -497,7 +501,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter horizontalalignment='center', verticalalignment='center', clip_on=True, zorder=zorder_ID + 5, color='w') -def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list): +def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list, clusters_sorted_by_peak_height): """ Removes the clusters that are likely to be just noise or that are too degenerated for the visualization. Criteria for degenerated clusters are: Ks content less than 10% of the total (poorly populated), @@ -515,22 +519,41 @@ def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluste tot_ks_list_in_clusters.extend(cluster_of_ks[cluster]) clean_clusters_of_ks = {} - for cluster in clusters_sorted_by_median: + for i in range(len(clusters_sorted_by_median)): + cluster = clusters_sorted_by_median[i] ks_list = cluster_of_ks[cluster] - # Get median, first quartile range, third quartile and interquartile range + # Get median, first quartile range, third quartile, interquartile range and height median_ks = percentile(ks_list, 50, interpolation="midpoint") q1 = percentile(ks_list, 25, interpolation="midpoint") q3 = percentile(ks_list, 75, interpolation="midpoint") iqr = q3 - q1 + height = clusters_sorted_by_peak_height[cluster] evaluation = [] percentage_ks_content = len(ks_list) / tot_ks_in_clusters - if percentage_ks_content <= 0.10: # Discard clusters that are poorly populated ( less than 10% of the total Ks) + + # Discard if it looks like a small spurious peak close to 0 Ks + # If the current cluster is NOT the "oldest/last" cluster, compare its height with the successive cluster + j = i+1 + if j < len(clusters_sorted_by_median): + successive_cluster_id = clusters_sorted_by_median[j] + height_successive_peaks = clusters_sorted_by_peak_height[successive_cluster_id] + # If it's lower than the successive (and older) peak, then it's very likely spurious, + # because younger peaks are (always) higher than older ones + if height < height_successive_peaks: + evaluation.append(f"smaller height than the successive WGD peak") + + + if percentage_ks_content <= 0.10: + # Discard clusters that are poorly populated ( less than 10% of the total Ks) evaluation.append(f"neglectable Ks content (< {round(10)}% of data)") - if median_ks >= 3: # Discard clusters whose median is very old - evaluation.append("high median (> 3 Ks)") - if iqr >= 1.1: # Discard clusters that tend to be flat, with no clear peak - evaluation.append("highly spread data (IQR > 1.1)") + if median_ks >= 3: + # Discard clusters whose median is very old (not reliable due to saturation and stochasticity) + evaluation.append("high median (>= 3 Ks)") + if median_ks >= 2.6 and iqr >= 1.1: + # Discard clusters that are widely spread with no clear peak because they make interpretation tricky; + # retain however those with young mode (< 2.6) because their large width could be just due to a long right tail up to e.g. 5 Ks + evaluation.append("highly spread data (IQR >= 1.1)") # If the clusters has no negative features, let's add it in the "cleaned list", otherwise just explain why is discarded cluster_letter = cluster_color_letter_list[cluster][1] @@ -543,6 +566,8 @@ def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluste logging.info(f"- Cluster {cluster_letter}: Ks content {round(percentage_ks_content * 100, 2)}%, median {round(median_ks, 2)}, IQR {round(iqr, 2)} --> Discarded due to {evaluation[0]} and {evaluation[1]}") elif len(evaluation) == 3: logging.info(f"- Cluster {cluster_letter}: Ks content {round(percentage_ks_content * 100, 2)}%, median {round(median_ks, 2)}, IQR {round(iqr, 2)} --> Discarded due to {evaluation[0]}, {evaluation[1]} and {evaluation[2]}") + elif len(evaluation) == 4: + logging.info(f"- Cluster {cluster_letter}: Ks content {round(percentage_ks_content * 100, 2)}%, median {round(median_ks, 2)}, IQR {round(iqr, 2)} --> Discarded due to {evaluation[0]}, {evaluation[1]}, {evaluation[2]} and {evaluation[3]}") return clean_clusters_of_ks -- GitLab From d74dc115d093971493a0139aa05394616e47702b Mon Sep 17 00:00:00 2001 From: cesen Date: Fri, 24 Dec 2021 11:51:48 +0100 Subject: [PATCH 2/2] Rename variable --- ksrates/cluster_anchor_ks.py | 4 ++-- ksrates/fc_cluster_anchors.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index d0e82a1..3cf5eaf 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -300,7 +300,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim) # Plot the clusters of anchor Ks and on top of them their KDEs - clusters_sorted_by_median, cluster_color_letter_list, clusters_sorted_by_peak_height = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, + clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) fcCluster.plot_clusters(ax_uncorr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows) @@ -316,7 +316,7 @@ def cluster_anchor_ks(config_file, correction_table_file, path_anchorpoints_txt, logging.info("") logging.info(f"Filtering away Ks clusters with unclear signal (poor Ks content, old Ks age or flat peak)...") clean_clusters_of_ks = fcCluster.filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, - cluster_color_letter_list, clusters_sorted_by_peak_height) + cluster_color_letter_list, cluster_peak_heights) # ----------------------------------------------------------------------------- diff --git a/ksrates/fc_cluster_anchors.py b/ksrates/fc_cluster_anchors.py index 4f120b5..417bd6a 100644 --- a/ksrates/fc_cluster_anchors.py +++ b/ksrates/fc_cluster_anchors.py @@ -405,7 +405,7 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre """ clusters_sorted_by_median, cluster_color_letter_list = assign_cluster_colors(cluster_of_ks) - clusters_sorted_by_peak_height = {} + cluster_peak_heights = {} zorder_ID = 50 for cluster_id in clusters_sorted_by_median: @@ -415,7 +415,7 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre __, kde_x, kde_y = fcPeak.compute_kde(cluster_of_ks[cluster_id], max_ks_para, bin_width) mode_of_the_cluster_KDE, peak_height_of_the_cluster_KDE = fcPeak.get_mode_from_kde(kde_x, kde_y) - clusters_sorted_by_peak_height[cluster_id] = peak_height_of_the_cluster_KDE + cluster_peak_heights[cluster_id] = peak_height_of_the_cluster_KDE # transparently color the area under the KDE curve kde_area_xy = [(kde_x[0], 0), *zip(kde_x, kde_y), (kde_x[-1], 0)] @@ -438,7 +438,7 @@ def plot_clusters(axis, cluster_of_ks, bin_width, max_ks_para, peak_stats, corre cluster_color_letter_list[cluster_id][1], zorder_ID + 2, peak_stats, correction_table_available, plot_correction_arrows) zorder_ID += 10 - return clusters_sorted_by_median, cluster_color_letter_list, clusters_sorted_by_peak_height + return clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter, zorder_ID, peak_stats, correction_table_available, plot_correction_arrows): @@ -501,7 +501,7 @@ def plot_cluster_marker(axis, median_of_the_cluster, kde_x, kde_y, color, letter horizontalalignment='center', verticalalignment='center', clip_on=True, zorder=zorder_ID + 5, color='w') -def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list, clusters_sorted_by_peak_height): +def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list, cluster_peak_heights): """ Removes the clusters that are likely to be just noise or that are too degenerated for the visualization. Criteria for degenerated clusters are: Ks content less than 10% of the total (poorly populated), @@ -527,7 +527,7 @@ def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluste q1 = percentile(ks_list, 25, interpolation="midpoint") q3 = percentile(ks_list, 75, interpolation="midpoint") iqr = q3 - q1 - height = clusters_sorted_by_peak_height[cluster] + height = cluster_peak_heights[cluster] evaluation = [] percentage_ks_content = len(ks_list) / tot_ks_in_clusters @@ -537,7 +537,7 @@ def filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluste j = i+1 if j < len(clusters_sorted_by_median): successive_cluster_id = clusters_sorted_by_median[j] - height_successive_peaks = clusters_sorted_by_peak_height[successive_cluster_id] + height_successive_peaks = cluster_peak_heights[successive_cluster_id] # If it's lower than the successive (and older) peak, then it's very likely spurious, # because younger peaks are (always) higher than older ones if height < height_successive_peaks: -- GitLab