diff --git a/ksrates/extend_orthogroups.py b/ksrates/extend_orthogroups.py
new file mode 100755
index 0000000000000000000000000000000000000000..dbca999293b95e43f749ada44398874eae8d8af9
--- /dev/null
+++ b/ksrates/extend_orthogroups.py
@@ -0,0 +1,478 @@
+from pandas import read_csv, set_option, DataFrame, Series, read_excel, concat
+import os
+import sys
+import logging
+import tempfile
+from numpy import diag
+from sklearn import metrics
+import ksrates.fc_reciprocal_retention as fc_rec_ret
+
+set_option("display.max_rows", 20, "display.max_columns", 20)
+
+def extend_orthogroups(species, latin_name, fasta_file, top, rank_type, pident_threshold, min_qcov_threshold,
+                       filter_false_predictions, filter_zero_false_predictions, base_dir='.', n_threads=4):
+    """
+    Performs a diamond homology search to identify which genes of the input species
+    belong to the top (e.g. 2000) reciprocally retained gene families.
+
+    :param species: main species used in the mixed Ks distribution plot (informal name)
+    :param latin_name: latin name of the main species, used in logging
+    :param fasta_file: FASTA file for the main species containing its nucleotidic sequences
+    :param top: how many gene families to consider from the top of the list of 9178 core angiosperm gene families. Default 2000
+    :param rank_type: choose between lambda or combined rank for the 9178 gene families. Default is lambda
+    :param base_dir: directory where to output results (paralog_distributions)
+    :param pident_threshold: percentage of identity threshold for the "--id" parameter of "diamond blastx"
+    :param min_qcov_threshold: alignment coverage threshold required for a diamond hit to be considered further
+    :param filter_false_predictions: boolean stating whether to use only a subset of gene families with fairly reliable orthogroup assignment
+    :param filter_zero_false_predictions: boolean stating whether to use only a subset of gene families with highly reliable orthogroup assignment
+    :param n_threads: number of threads for parallelization of the diamond homology search. Default 4
+    :returns: TSV file listing in the first column the detected genes and in the second column the associated gene family ID
+    """
+    # Extend the original GFs with the new sequences from the diamond homology search results.
+
+    # ACTIVATE TO CHOOSE A TEST SPECIES (atha, pdac or vvin) INSTEAD OF THE NEW SPECIES FROM THE CONFIGURATION FILE
+    test_run = False
+
+    if test_run:  # the species is a test species (atha, pdac or vvin)
+        test_species = "atha"
+        basefolder_sequence_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "sequences", "old_genome_versions")
+
+        if test_species == "atha":
+            species, latin_name, fasta_file, parsing_element = "arabidopsis", "A. thaliana", os.path.join(basefolder_sequence_data, "atha.fasta"), "AT"
+        elif test_species == "vvin":
+            species, latin_name, fasta_file, parsing_element = "vitis", "V. vinifera", os.path.join(basefolder_sequence_data, "vvin.fasta"), "VV"
+        elif test_species == "pdac":
+            species, latin_name, fasta_file, parsing_element = "phoenix", "P. dactylifera", os.path.join(basefolder_sequence_data, "pdac.fasta"), "PDK"
dactylifera", os.path.join(basefolder_sequence_data, "pdac.fasta"), "PDK" + + #------------------------------------------------------------------------------ + + logging.info(f"Identification of reciprocally retained gene families in {latin_name}") + logging.info("") + + if not filter_false_predictions and not filter_zero_false_predictions: + logging.info(f"- Using the top {top} GFs ({rank_type} rank)") + elif filter_zero_false_predictions: + logging.info(f"- Using a subset of top {top} GFs ({rank_type} rank) after filtering away those with at least one false prediction") + elif filter_false_predictions: + logging.info(f"- Using a subset of top {top} GFs ({rank_type} rank) after filtering away those with false predictions >= half of the true positives") + + logging.info(f"- Minimum required percentage of alignment identity between {latin_name} genes and their homology hits: pident_threshold = {pident_threshold}%") + logging.info(f"- Minimum required percentage of alignment coverage for {latin_name} genes in their homology hits: min_qcov_threshold = {min_qcov_threshold}%") + logging.info("") + + #------------------------------------------------------------------------------ + + # BASEFOLDERS for input and output + + # Path to database and files to be used in the workflow + # TODO: this is temporarily because we will probably have to let the user first make this files + # themselves in a specific folder; alternatively we could provide this file already ready, but that + # would make the GitHub folder (too) heavy + # THE PATH IS paralog_distribution/reciprocal_retention + basefolder_db_and_files = os.path.join(os.path.dirname(base_dir), "reciprocal_retention") + + # THIS IS THE FUNCTION TO GENERATE THE D + # # The user generates the FASTA database files for the rec.ret. sequences on their own in paralog_distributions + # fc_rec_ret.make_gene_family_database(top, rank_type, filter_false_predictions, filter_zero_false_predictions, base_dir) + + # Path to output files + base_dir_rec_ret = os.path.join(base_dir, "reciprocal_retention", rank_type) + if not os.path.exists(base_dir_rec_ret): + os.makedirs(base_dir_rec_ret) + + #------------------------------------------------------------------------------ + + # INPUT DATABASES AND GENE FAMILY FILES + + # Translate it into the aminoacid alphabet + species_sequences_filename = fc_rec_ret.dna_to_aminoacid(fasta_file, output_folder=base_dir_rec_ret) + + if test_run: + # Nucleotidic and aminoacidi databases of species used to make the orthogroups (i.e. the remaining 35 species) + # Note that this database contains only the top e.g. 
+        # Note that this database contains only the top (e.g. 2000) orthogroups from the complete list of 9178
+        if not filter_zero_false_predictions and not filter_false_predictions:
+            orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_test_without_{species}.fasta")
+            orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_test_without_{species}.fasta")
+        elif filter_zero_false_predictions:
+            orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta")
+            orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{species}.fasta")
+        elif filter_false_predictions:
+            orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta")
+            orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{species}.fasta")
+    else:
+        # Nucleotidic and aminoacid databases of the species used to make the orthogroups (all 36 species)
+        if not filter_zero_false_predictions and not filter_false_predictions:
+            orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta")
+            orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta")
+        elif filter_zero_false_predictions:
+            orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta")
+            orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta")
+        elif filter_false_predictions:
+            orthogroup_database_nucl = os.path.join(basefolder_db_and_files, f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta")
+            orthogroup_database_amin_path = os.path.join(basefolder_db_and_files, f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta")
+
+    if test_run:
+        # Collection of all genes in the orthogroups, including those of the test species
+        # Note: if using the top 2000 orthogroups, this file lists only the genes coming from the top 2000 orthogroups;
+        # if using a subset of the top 2000, there will be only genes coming from that subset
+        if not filter_zero_false_predictions and not filter_false_predictions:
+            gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None)
+        elif filter_zero_false_predictions:
+            gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None)
+        elif filter_false_predictions:
+            gene_ids_in_orthogroups = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None)
+
+    # Get the list of genes of the top 2000 GFs together with their GF
+    if not filter_zero_false_predictions and not filter_false_predictions:
+        gf_genes_with_gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None)
+    elif filter_zero_false_predictions:
+        gf_genes_with_gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None)
+    elif filter_false_predictions:
+        gf_genes_with_gf_list = read_csv(os.path.join(basefolder_db_and_files, f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None)
+
+    # -----------------------------------------------------------------------------
+
+    if not filter_zero_false_predictions and not filter_false_predictions:
+        dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_with_orthogroups.dmd")
+    elif filter_zero_false_predictions:
+        dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions_with_orthogroups.dmd")
+    elif filter_false_predictions:
+        dmd_with_orthogroups_filename = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions_with_orthogroups.dmd")
+
+    column_names = ["qseqid", "sseqid", "pident", "length", "qlen", "mismatch", "gapopen",
+                    "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
+
+    # Generate or open a diamond table-like file (diamond_{species}_top_{top}_{rank_type}_with_orthogroups.dmd)
+    # and append as last column the GF of each matched gene
+    if not os.path.exists(dmd_with_orthogroups_filename):
+
+        # First step is to generate or open the diamond result table
+        if not filter_zero_false_predictions and not filter_false_predictions:
+            dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}.dmd")
+        elif filter_zero_false_predictions:
+            dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_zero_false_predictions.dmd")
+        elif filter_false_predictions:
+            dmd_test_species_to_top_orthogroup = os.path.join(base_dir_rec_ret, f"diamond_{species}_top_{top}_{rank_type}_filtered_false_predictions.dmd")
+
+        if not os.path.exists(dmd_test_species_to_top_orthogroup):
+            # Match the species to the top 2000 orthogroups through diamond
+            logging.info(f"Running homology search between {latin_name} and the gene family database. Will take some time...")
+            with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir:
+                orthogroup_database_amin = fc_rec_ret.make_diamond_db(orthogroup_database_amin_path, output_folder=base_dir_rec_ret)
+                dmd = fc_rec_ret.run_diamond(fasta_file, orthogroup_database_amin, dmd_test_species_to_top_orthogroup,
+                                             dmd_tmp_dir, pident_threshold, column_names=column_names, n_threads=n_threads)
+        else:
+            logging.info(f"Found already existing homology search result table between {latin_name} and the gene family database")
+            dmd = read_csv(dmd_test_species_to_top_orthogroup, sep="\t", header=None, names=column_names)
+        logging.info("")
+
+        # Second step is to generate the diamond table file with an extra column containing the gene family associated to each matched sequence
+        logging.info("Retrieving which gene family is associated to each homology hit in the table. Will take some time...")
+        dmd_with_orthogroups = dmd.copy()
+        gf_column = []
+        for matched_gene in dmd_with_orthogroups.loc[:, "sseqid"]:
+            gf_column.append(gf_genes_with_gf_list[gf_genes_with_gf_list.iloc[:,0] == matched_gene][1].iloc[0])  # append GF
+
+        dmd_with_orthogroups["GF"] = gf_column
+        with open(dmd_with_orthogroups_filename, "w+") as f:
+            f.write(dmd_with_orthogroups.to_csv(sep="\t", header=False, index=False))
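+
+        # Editor's sketch (hypothetical helper, not called by the pipeline): the row-by-row
+        # lookup above is quadratic in the number of hits; a vectorized mapping would build
+        # the same "GF" column, assuming gene IDs in column 0 and GF IDs in column 1 of
+        # gf_genes_with_gf_list.
+        def _map_gf_vectorized(dmd_table, gf_table):
+            gene_to_gf = gf_table.drop_duplicates(0).set_index(0)[1]
+            return dmd_table["sseqid"].map(gene_to_gf)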
Will take some time...") + dmd_with_orthogroups = dmd.copy() + gf_column = [] + for matched_gene in dmd_with_orthogroups.loc[:, "sseqid"]: + gf_column.append(gf_genes_with_gf_list[gf_genes_with_gf_list.iloc[:,0] == matched_gene][1].iloc[0]) # append GF + + dmd_with_orthogroups["GF"] = gf_column + with open(dmd_with_orthogroups_filename, "w+") as f: + f.write(dmd_with_orthogroups.to_csv(sep="\t", header=False, index=False)) + + else: # Else open the already existing diamond table file + logging.info("Found already existing homology result table with gene families associated to matched sequences") + dmd_with_orthogroups = read_csv(dmd_with_orthogroups_filename, sep="\t", header=None, names=column_names + ["GF"]) + logging.info("Done\n") + + # ----------------------------------------------------------------------------- + + logging.info("Overview of main species genes' alignment coverage in the homology result table:") + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 35) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 40) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 45) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 50) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 60) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 70) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 80) + fc_rec_ret.get_query_coverage_in_dmd_table(dmd_with_orthogroups, 90) + logging.info("") + + # Filter away hits with low query alignment coverage + logging.info(f"Filtering away all hits with low query alignment coverage (< {min_qcov_threshold}%):") + dmd_with_orthogroups = dmd_with_orthogroups.loc[(abs(dmd_with_orthogroups["qend"] - dmd_with_orthogroups["qstart"])) / dmd_with_orthogroups["qlen"] * 100 >= min_qcov_threshold] + logging.info("") + + # ----------------------------------------------------------------------------- + + logging.info(f"Assigning {latin_name} sequences to the gene family of their best homology hit if they are reciprocal best hits. 
Will take some time...") + # Get the list of sequences coming from the main species (column 0) in the diamond result table + query_seqs = dmd_with_orthogroups.loc[:, "qseqid"].drop_duplicates() + + with tempfile.TemporaryDirectory(dir=".") as dmd_tmp_dir: + assigned_gfs_file = fc_rec_ret.assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl, + query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, base_dir_rec_ret) + + logging.info("") + + if not test_run: + logging.info("Done") + return assigned_gfs_file + + logging.info("-------------------------------------------------------------------------------") + + # If using a test species, go further with the evaluation through confusion matrix + + logging.info("BINARY CONFUSION MATRIX STATISTICS") + if not filter_zero_false_predictions and not filter_false_predictions: + logging.info(f"CONSIDERING THE TOP {top} ORTHOGROUPS") + elif filter_zero_false_predictions: + logging.info(f"CONSIDERING THE TOP {top} ORTHOGROUPS (after filtering away GFs with at least one false prediction)") + elif filter_false_predictions: + logging.info(f"CONSIDERING THE TOP {top} ORTHOGROUPS (after filtering away GFs with too many false predictions)") + logging.info("") + + # EXAMINE THE QUERIES THAT WERE MATCHED BUT THAT IN THEORY THEY WERE NOT MATCHED ACCORDING TO THE ORIGINAL ORTHOGROUPS + # EXAMING ALSO THE QUERIES THAT SHOULD HAVE BEEN ASSIGNED, BUT THEY WERE NOT! + + logging.info("KNOWN A PRIORI:") + # ALL SPECIES GENES + test_species_fasta = read_csv(fasta_file, sep="\t", header=None) + all_genes_test_species = test_species_fasta.iloc[:,0].loc[test_species_fasta.iloc[:,0].str.contains(">")].str.slice(start=1) + number_all_genes_test_species = len(all_genes_test_species) + logging.info(f"Total number of queries in {latin_name} FASTA file: {number_all_genes_test_species}") + # REAL POSITIVES + logging.info("REAL POSITIVES (RP):") + real_positives = gene_ids_in_orthogroups.iloc[:,0].loc[gene_ids_in_orthogroups.iloc[:,0].str.contains(parsing_element)].drop_duplicates() # known_genes_in_orthogroups + number_known_genes_in_orthogroups = len(real_positives) + logging.info(f"Actual number of {latin_name} genes in top {top} orthogroups (RP): {number_known_genes_in_orthogroups} ({fc_rec_ret.perc(number_known_genes_in_orthogroups, number_all_genes_test_species)})") + # REAL NEGATIVES + logging.info("REAL NEGATIVES (RN):") + real_negatives = all_genes_test_species.loc[~all_genes_test_species.isin(real_positives)] # known_genes_not_in_orthogroups + number_known_genes_not_in_orthogroups = len(real_negatives) + logging.info(f"Actual number of {latin_name} genes NOT in top {top} orthogroups (RN): {number_known_genes_not_in_orthogroups} ({fc_rec_ret.perc(number_known_genes_not_in_orthogroups, number_all_genes_test_species)})") + logging.info("\n") + + # PREDICTIONS + logging.info("PREDICTIONS:") + # Total genes assigned to a gene family (without duplicates, so only to count the genes) + matched_genes = assigned_gfs_file.iloc[:,0].drop_duplicates() + number_of_matched_genes = len(assigned_gfs_file.iloc[:,0].drop_duplicates()) + # Total queries NOT assigned to a gene family + # not_matched_queries = real_positives.loc[~real_positives.isin(assigned_gfs_file.iloc[:,0].drop_duplicates())] + not_matched_genes = all_genes_test_species.loc[~all_genes_test_species.isin(assigned_gfs_file.iloc[:,0].drop_duplicates())] + # number_of_not_matched_genes = 
+    number_of_not_matched_genes = len(not_matched_genes)
+    logging.info(f"PREDICTED POSITIVES: Number of matched queries (without duplicates) (TP+FP): {number_of_matched_genes} ({fc_rec_ret.perc(number_of_matched_genes, number_all_genes_test_species)})")
+    logging.info(f"PREDICTED NEGATIVES: Number of NOT matched queries (without duplicates) (TN+FN): {number_of_not_matched_genes} ({fc_rec_ret.perc(number_of_not_matched_genes, number_all_genes_test_species)})")
+
+    logging.info("")
+    logging.info("Predicted positives (genes matched to orthogroups):")
+    # Test species' genes that have a significant match with one or more GFs and
+    # that are found among the test species' genes actually used to build the orthogroup database
+    test_species_true_positive = assigned_gfs_file.loc[assigned_gfs_file.iloc[:,0].isin(gene_ids_in_orthogroups.loc[:,0])]
+    number_test_species_true_positive = len(test_species_true_positive.iloc[:,0].drop_duplicates())
+    logging.info(f"TRUE POSITIVES: Number of matched genes that are truly part of orthogroups: {number_test_species_true_positive} ({fc_rec_ret.perc(number_test_species_true_positive, number_of_matched_genes)} of PP)")
+    # Test species' genes that have a significant match with one or more GFs but
+    # are actually not among the test species' genes used to build the orthogroup database
+    test_species_false_positive = assigned_gfs_file.loc[~assigned_gfs_file.iloc[:,0].isin(gene_ids_in_orthogroups.loc[:,0])]
+    number_test_species_false_positive = len(test_species_false_positive.iloc[:,0].drop_duplicates())
+    logging.info(f"FALSE POSITIVES: Number of matched genes that are instead actually NOT part of orthogroups: {number_test_species_false_positive} ({fc_rec_ret.perc(number_test_species_false_positive, number_of_matched_genes)} of PP)")
+
+    logging.info("")
+    logging.info("Predicted negatives (genes NOT matched to orthogroups):")
+    # Consider genes not assigned to any GF; those that do not belong to the real negatives are false negatives.
+    false_negatives = not_matched_genes.loc[~not_matched_genes.isin(real_negatives)]
+    number_false_negatives = len(false_negatives)
+    logging.info(f"FALSE NEGATIVES: Number of NOT matched genes that are instead actually matching an orthogroup: {number_false_negatives} ({fc_rec_ret.perc(number_false_negatives, number_of_not_matched_genes)} of PN)")
+    true_negatives = not_matched_genes.loc[not_matched_genes.isin(real_negatives)]
+    number_true_negatives = len(true_negatives)
+    logging.info(f"TRUE NEGATIVES: Number of NOT matched genes that are truly not matching any orthogroup: {number_true_negatives} ({fc_rec_ret.perc(number_true_negatives, number_of_not_matched_genes)} of PN)")
+
+    logging.info("")
+    logging.info("Confusion matrix:\n")
+    fc_rec_ret.conf_matrix(number_test_species_true_positive, number_test_species_false_positive, number_false_negatives, number_true_negatives)
+    logging.info("")
+
+    tpr = fc_rec_ret.tpr(number_test_species_true_positive, number_false_negatives)
+    logging.info(f"True positive rate: {round(tpr, 3)}")
+
+    f1score = fc_rec_ret.f1score(number_test_species_true_positive, number_test_species_false_positive, number_false_negatives)
+    logging.info(f"F1-measure: {round(f1score, 3)}")
+
+    mcc = fc_rec_ret.mcc(number_test_species_true_positive, number_test_species_false_positive, number_false_negatives, number_true_negatives)
+    logging.info(f"Matthews Correlation Coefficient: {round(mcc, 3)}")
+
+    logging.info("\n")
+
+    logging.info("-------------------------------------------------------------------------------")
+
+    logging.info("MULTICLASS CONFUSION MATRIX STATISTICS by sklearn")
+    y_real = []
+    y_pred = []
+
+    # Genes that belong to the top 2000 and that come from the test species (and not from the other species)
+    real_positive_with_gf = gf_genes_with_gf_list.loc[gf_genes_with_gf_list[0].isin(all_genes_test_species)]
+
+    # Consider first the genes that are real positives
+    for __, row in real_positive_with_gf.iterrows():
+        y_real.append(row[1])
+        # Gene was assigned to a top 2000 gene family, get which one
+        if row[0] in list(assigned_gfs_file[0]):
+            y_pred.append(assigned_gfs_file.loc[assigned_gfs_file[0] == row[0]].iloc[0,1])
+        # Gene was not assigned, consider "nontop" as its predicted class
+        else:
+            y_pred.append("nontop")
+    # Consider then the genes that are real negatives
+    for gene in list(real_negatives):
+        y_real.append("nontop")
+        # Gene was predicted as non-top (= not assigned to any of the top 2000)
+        if gene not in list(assigned_gfs_file[0]):
+            y_pred.append("nontop")
+        # Gene was instead mistakenly assigned to one of the top 2000 orthogroups
+        else:
+            y_pred.append(assigned_gfs_file.loc[assigned_gfs_file[0] == gene].iloc[0,1])
+
+    # Get the list of all GFs from the real positives and from the real negatives (the latter all fall under the "nontop" class)
+    gfs_labels = Series(y_real).drop_duplicates().sort_values(ignore_index=True)
+    # Remove the "nontop" class from this list
+    gfs_labels_real_positives = gfs_labels.drop(gfs_labels.index[gfs_labels == 'nontop'])
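+
+    # Worked micro-example (editor's note) for the per-class sums computed below, with
+    # confusion matrix rows = real classes and columns = predicted classes. For the labels
+    # ["GF_A", "GF_B", "nontop"] and
+    #     cm = [[3, 1, 0],
+    #           [0, 2, 1],
+    #           [1, 0, 5]]
+    # the column sums minus the diagonal give the FPs per class ([1, 1, 1]), the row sums
+    # minus the diagonal give the FNs per class ([1, 1, 1]), and the diagonal itself gives
+    # the TPs ([3, 2, 5]).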
+
+    # The classes (i.e. GFs) in the confusion matrix are sorted alphabetically thanks to the "labels" argument.
+    cm_non_transposed = metrics.confusion_matrix(y_real, y_pred, labels=list(gfs_labels))
+    cm_df = DataFrame.from_records(cm_non_transposed)
+
+    FP = cm_non_transposed.sum(axis=0) - diag(cm_non_transposed)  # Sum along a column (PPs) and subtract the TPs => FPs; a 1D array with the FP count per class
+    FN = cm_non_transposed.sum(axis=1) - diag(cm_non_transposed)  # Sum along a row (RPs) and subtract the TPs => FNs; a 1D array with the FN count per class
+    TP = diag(cm_non_transposed)
+
+    # Dataframe of the classification report
+    cr_dict = metrics.classification_report(y_real, y_pred, digits=3, labels=gfs_labels_real_positives, output_dict=True)
+    cr = DataFrame.from_dict(cr_dict, orient="index")
+    # Sort cr according to index (classes) because the cm is also alphabetically sorted
+    cr = cr.sort_index()
+    # Save it as a file
+    cr.to_csv(f"multiclass_confusion_matrix_{test_species}_first_hit_rbh_40pident_50qcov.tsv", sep="\t")
+    # Used here below
+    cr = cr.drop(["micro avg", "macro avg", "weighted avg"])
+
+    logging.info("Get statistics for the multiclass confusion matrix...")
+
+    # Get the excel file with the top 2000 GFs and their associated genes
+    gf_rank = read_excel(open("../wgm-gf-markers/gene_families.xlsx", "rb"), "gene family rankings", header=None)
+    gf_genes_raw = read_excel(open("../wgm-gf-markers/gene_families.xlsx", "rb"), "orthogroup member genes", header=None)
+    if top is not None:
+        # Use the combined rank
+        if rank_type == "combined":
+            top_gf_ids = gf_rank.iloc[:,1].iloc[:top]
+            gf_genes_raw = gf_genes_raw.loc[gf_genes_raw.iloc[:,0].isin(top_gf_ids)]
+        # Use the lambda rank (column 8)
+        elif rank_type == "lambda":
+            top_gf_ids = gf_rank.loc[gf_rank[8] <= top].sort_values(8, ignore_index=True).iloc[:,1]
+            gf_genes_raw = gf_genes_raw.loc[gf_genes_raw.iloc[:,0].isin(top_gf_ids)]
+
+    # Expand each gene in the GFs as an independent column entry
+    gf_genes = gf_genes_raw.iloc[:,:2]  # First two columns (GF ID and number of species)
+    genes_columns = gf_genes_raw.iloc[:,2].str.split(",", expand=True)  # Process the third column (genes)
+    gf_genes = concat([gf_genes, genes_columns], axis=1, ignore_index=True)
+
+    top_gf_ids_combined = gf_rank.iloc[:top, :]
+    top_gf_ids_lambda = gf_rank.sort_values(8, ignore_index=True).iloc[:top, :]
+
+    logging.info(f"Write multiclass confusion matrix statistics related to test species {latin_name}")
+
+    stats = []
+    for i in range(len(cr)):
+        gf, row = cr.iloc[i, :].name, cr.iloc[i, :]
+        # Precision, recall and number of genes for the current test species
+        precision = row["precision"]
+        recall = row["recall"]
+        real_positives_in_gf = row["support"]
+        tp, fp, fn = TP[i], FP[i], FN[i]
+
+        # Rank index of the gene family
+        if rank_type == "combined":
+            index_rank = top_gf_ids_combined.loc[top_gf_ids_combined[1] == gf].index.values[0]+1
+        elif rank_type == "lambda":
+            index_rank = top_gf_ids_lambda.loc[top_gf_ids_lambda[1] == gf].index.values[0]+1
+        # Number of genes in that gene family from all species
+        num_tot_genes_in_gf = gf_genes.loc[gf_genes[0] == gf].iloc[0,2:].count()
+        # Function as TAIR description from the xlsx file
+        function = gf_rank.loc[gf_rank[1] == gf].iloc[0, 2]
+
+        stats.append([gf, precision, recall, tp, fp, fn, real_positives_in_gf, num_tot_genes_in_gf, index_rank, function])
+
+    df_stats = DataFrame.from_records(stats)
+    header = ["gene family ID", f"precision_{test_species}", f"recall_{test_species}", "true positives", "false positives", "false negatives", f"nr of gene family members in {latin_name}", "nr of gene family members across all species", f"gene family {rank_type} rank", "gene family description"]
df_stats.to_csv(f"multiclass_stats_{test_species}_first_hit_rbh_40pdeint_50qcov.tsv", sep="\t", header=header, index=False) + + # logging.info("") + #print(metrics.classification_report(y_real, y_pred, digits=3, labels=list(real_positive_with_gf.iloc[:,1]))) + logging.info("") + if filter_false_predictions: + logging.info(f"Precision and recall are computed by including only the GFs from the top {top} orthogroups after filtering by false predictions and are weighted by the number of true instances for each label:") + else: + logging.info(f"Precision and recall are computed by including only the top {top} orthogroups and are weighted by the number of true instances for each label:") + # Label "nontop" contains all GFs beyond the top 2000 (2001-9178) plus those GFs belonging to the top 2000 that have been filtered away + # This label "nontop" will be excluded when calculating the the multiclass weighted average precision an recall in order to ignore the majority negative class + prec_recall_averaged_weighted = metrics.precision_recall_fscore_support(y_real, y_pred, labels=list(real_positive_with_gf.iloc[:,1]), average="weighted") + mcc_multiclass = metrics.matthews_corrcoef(y_real, y_pred) + logging.info(f"Precision: {round(prec_recall_averaged_weighted[0], 3)}") + logging.info(f"Recall: {round(prec_recall_averaged_weighted[1], 3)}") + logging.info(f"MCC: {round(mcc_multiclass, 3)}") + + logging.info("") + # logging.info("Precision and recall by including the non-top class") + # print(metrics.precision_recall_fscore_support(y_real, y_pred, average="weighted")) + # logging.info("") + + + return + + logging.info("-------------------------------------------------------------------------------") + + + logging.info("HOW CORRECTLY ARE GENE FAMILIES ASSIGNED IN TRUE POSITIVES?") + logging.info("") + + correct_true_positive_single = 0 + correct_true_positive_duplicates = 0 + incorrect_true_positive_single = 0 + incorrect_true_positive_duplicates = 0 + duplicates = {} + for __, row in test_species_true_positive.iterrows(): + # test_species_true_positive.iloc[:,0].loc[(test_species_true_positive.iloc[:,0].isin(gf_genes_with_gf_list.iloc[:,0])) & () ] + # test_species_true_positive.apply(lambda row: row.iloc[0] , axis = 1) + correct = gf_genes_with_gf_list.loc[(row.iloc[0] == gf_genes_with_gf_list.iloc[:,0]) & (row.iloc[1] == gf_genes_with_gf_list.iloc[:,1])] + if len(correct) != 0: # If the assigned orthogroup is correct + + if list(test_species_true_positive.iloc[:,0]).count(row.iloc[0]) < 2: # gene with only one matched gene family + correct_true_positive_single += 1 + else: # gene with two or more matched gene families; is at least one right? + if row.iloc[0] not in duplicates: + duplicates[row.iloc[0]] = [row.iloc[1]] + else: + duplicates[row.iloc[0]].append(row.iloc[1]) + correct_true_positive_duplicates += 1 + else: # Else if the assigned orthogroup is incorrect + if list(test_species_true_positive.iloc[:,0]).count(row.iloc[0]) < 2: # gene with only one matched gene family + incorrect_true_positive_single += 1 + else: # gene with two or more matched gene families; is at least one right? 
+
+    return
+
+    # NOTE: the following exploratory check is unreachable because of the return above; it is kept for manual test runs.
+    logging.info("-------------------------------------------------------------------------------")
+
+    logging.info("HOW CORRECTLY ARE GENE FAMILIES ASSIGNED IN TRUE POSITIVES?")
+    logging.info("")
+
+    correct_true_positive_single = 0
+    correct_true_positive_duplicates = 0
+    incorrect_true_positive_single = 0
+    incorrect_true_positive_duplicates = 0
+    duplicates = {}
+    for __, row in test_species_true_positive.iterrows():
+        correct = gf_genes_with_gf_list.loc[(row.iloc[0] == gf_genes_with_gf_list.iloc[:,0]) & (row.iloc[1] == gf_genes_with_gf_list.iloc[:,1])]
+        if len(correct) != 0:  # If the assigned orthogroup is correct
+            if list(test_species_true_positive.iloc[:,0]).count(row.iloc[0]) < 2:  # gene with only one matched gene family
+                correct_true_positive_single += 1
+            else:  # gene with two or more matched gene families; is at least one right?
+                if row.iloc[0] not in duplicates:
+                    duplicates[row.iloc[0]] = [row.iloc[1]]
+                else:
+                    duplicates[row.iloc[0]].append(row.iloc[1])
+                correct_true_positive_duplicates += 1
+        else:  # Else the assigned orthogroup is incorrect
+            if list(test_species_true_positive.iloc[:,0]).count(row.iloc[0]) < 2:  # gene with only one matched gene family
+                incorrect_true_positive_single += 1
+            else:  # gene with two or more matched gene families; is at least one right?
+                if row.iloc[0] not in duplicates:
+                    duplicates[row.iloc[0]] = [row.iloc[1]]
+                else:
+                    duplicates[row.iloc[0]].append(row.iloc[1])
+                incorrect_true_positive_duplicates += 1
+
+    logging.info("Number of TPs that are correctly matching the original orthogroup classification:")
+    logging.info(f"correct_true_positive_single: {correct_true_positive_single}")
+    logging.info(f"correct_true_positive_duplicates: {correct_true_positive_duplicates}")
+    logging.info(f"TOTAL CORRECT: {correct_true_positive_single+correct_true_positive_duplicates} out of {len(test_species_true_positive)} ({fc_rec_ret.perc(correct_true_positive_single+correct_true_positive_duplicates, len(test_species_true_positive))})")
+    logging.info("")
+    logging.info(f"incorrect_true_positive_single: {incorrect_true_positive_single}")
+    logging.info(f"incorrect_true_positive_duplicates: {incorrect_true_positive_duplicates}")
+    logging.info(f"TOTAL INCORRECT: {incorrect_true_positive_single+incorrect_true_positive_duplicates} out of {len(test_species_true_positive)} ({fc_rec_ret.perc(incorrect_true_positive_single+incorrect_true_positive_duplicates, len(test_species_true_positive))})")
+
+    logging.info("All done.")
\ No newline at end of file
diff --git a/ksrates/fc_configfile.py b/ksrates/fc_configfile.py
index 0aefbfbd8bca3149ac65abbde97912a334a4d4fc..91c2e34284b65a7a4f1056d4b2d106037cb7669c 100644
--- a/ksrates/fc_configfile.py
+++ b/ksrates/fc_configfile.py
@@ -341,6 +341,22 @@ class Configuration:
             logging.error('Unrecognized "collinearity" parameter in configuration file; please choose between "yes" and "no"')
             sys.exit(1)
 
+    def get_reciprocal_retention(self):
+        """
+        Gets the config file field specifying whether the mixed distribution will show the reciprocally retained gene families of the focal species.
+
+        :return: boolean
+        """
+        reciprocal_retention = self.config.get("ANALYSIS SETTING", "reciprocal_retention").lower()
+
+        if reciprocal_retention == "yes":
+            return True
+        elif reciprocal_retention == "no":
+            return False
+        else:
+            logging.error('Unrecognized "reciprocal_retention" parameter in configuration file; please choose between "yes" and "no"')
+            sys.exit(1)
+
     def get_consensus_peak_for_multiple_outgroups(self):
         """
         Gets the config file field of the consensus method for how to deal with multiple adjustments for the same divergence.
@@ -750,3 +766,105 @@ class Configuration:
         else:
             max_size = 200
         return max_size
+
+    def get_reciprocal_retention_top(self):
+        """
+        Gets the number of top reciprocally retained gene families to be considered out of the total 9178 ranked ones.
+
+        :return top: integer
+        """
+        if self.expert_config is not None:
+            top = int(self.expert_config.get("EXPERT PARAMETERS", "top_reciprocally_retained_gfs", fallback="2000"))
+        else:
+            top = 2000
+        return top
+
+    def get_reciprocal_retention_rank_type(self):
+        """
+        Gets the rank type ("lambda" or "combined") for the reciprocally retained gene family list. The lambda rank is based
+        on the lambda parameter (the lower it is, the more the gene family is duplicated mainly through WGM with no gene loss),
+        while the combined rank also takes into account the block duplicate percentage as an independent source of evidence for reciprocal retention.
+
+        :return rank_type: string ("lambda" or "combined"). Default is "lambda".
+ """ + if self.expert_config is not None: + rank_type = self.expert_config.get("EXPERT PARAMETERS", "reciprocal_retention_rank_type", fallback="lambda") + if rank_type not in ["lambda", "combined"]: + logging.warning(f'Unrecognized field in expert configuration file [reciprocal_retention_rank_type = {rank_type}]. \ + Please choose between "lambda" and "combined". Default choice will be applied ["lambda"]') + rank_type = "lambda" + else: + rank_type = "lambda" + return rank_type + + def get_reciprocal_retention_pident_threshold(self): + """ + Gets the minimum identity percentage required in the local alignment between a main species sequence and + the top reciprocally retained sequences from the top orthogroups. This number is given to diamond "blastx" + command as the "-id" argument. Diamond hits with less than e.g. 40 won't be part of the diamond output table. + + :return pident_threshold: integer or float. Default is 40 + """ + if self.expert_config is not None: + pident_threshold = int(self.expert_config.get("EXPERT PARAMETERS", "pident_threshold", fallback="40")) + else: + pident_threshold = 40 + return pident_threshold + + def get_reciprocal_retention_min_qcov_threshold(self): + """ + Gets the minimum coverage percentage in the main species sequence that is required in the alignment between + the main species sequence and the top reciprocally retained sequences from the top orthogroups. Diamond hits + with less than e.g. 50 will be removed from the diamond output table, so to remove the hits with bad coverage. + + :return top: integer or float + """ + if self.expert_config is not None: + min_qcov_threshold = int(self.expert_config.get("EXPERT PARAMETERS", "min_qcov_threshold", fallback="50")) + else: + min_qcov_threshold = 50 + return min_qcov_threshold + + def get_filter_by_false_predictions(self): + """ + Gets whether to use all the top gene families (set to "no") or a subset of gene families (set to "yes") which for the three test species + (A. thaliana, P. dactylifera and V. vinifera) had few false predictions in the confusion matrix (the total amount of false positives + and negatives was less than half of the total amount of true positives). + + :return boolean: boolean + """ + if self.expert_config is not None: + filter_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_false_predictions", fallback=False).lower() + + if filter_false_predictions == "yes": + return True + elif filter_false_predictions == "no": + return False + else: + logging.warning(f'Unrecognized field in expert configuration file [filter_by_false_predictions = {filter_false_predictions}]. \ + Please choose between "yes" and "no". Default choice will be applied ["no"]') + return False + else: + return False + + + def get_filter_by_zero_false_predictions(self): + """ + Gets whether to use all the top gene families (set to "no") or a subset of gene families (set to "yes") which for the three test species + (A. thaliana, P. dactylifera and V. vinifera) had no false positives or negatives. + + :return boolean: boolean + """ + if self.expert_config is not None: + filter_zero_false_predictions = self.expert_config.get("EXPERT PARAMETERS", "filter_by_zero_false_predictions", fallback=False).lower() + + if filter_zero_false_predictions == "yes": + return True + elif filter_zero_false_predictions == "no": + return False + else: + logging.warning(f'Unrecognized field in expert configuration file [filter_by_zero_false_predictions = {filter_zero_false_predictions}]. 
+                logging.warning(f'Unrecognized field in expert configuration file [filter_by_zero_false_predictions = {filter_zero_false_predictions}]. \
+                                Please choose between "yes" and "no". Default choice will be applied ["no"]')
+                return False
+        else:
+            return False
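+
+    # Editor's sketch (hypothetical refactoring, not wired into the class): the yes/no expert
+    # getters above share one pattern that a generic helper could factor out, assuming the
+    # same "EXPERT PARAMETERS" section and configparser fallback semantics.
+    #     def _get_yes_no_expert_option(self, option, default="no"):
+    #         if self.expert_config is None:
+    #             return default == "yes"
+    #         value = self.expert_config.get("EXPERT PARAMETERS", option, fallback=default).lower()
+    #         if value in ("yes", "no"):
+    #             return value == "yes"
+    #         logging.warning(f'Unrecognized field in expert configuration file [{option} = {value}]. '
+    #                         f'Please choose between "yes" and "no". Default choice will be applied ["{default}"]')
+    #         return default == "yes"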
\ No newline at end of file
diff --git a/ksrates/fc_extract_ks_list.py b/ksrates/fc_extract_ks_list.py
index c9d5aa8d4286cf18e65d7f194f07dcca4daa4e3c..9ee9b94ca77e73ebfe4453bafae1eae8d85c3f50 100644
--- a/ksrates/fc_extract_ks_list.py
+++ b/ksrates/fc_extract_ks_list.py
@@ -21,7 +21,7 @@ def ks_list_from_tsv(tsv_file, max_ks, data_type):
         ks_list_filtered = filtered_tsv["Ks"].to_list()
         weight_list_filtered = filtered_tsv["WeightOutliersExcluded"].to_list()
 
-    if data_type == "paralogs" or data_type == "anchor pairs":
+    if data_type == "paralogs" or data_type == "anchor pairs" or data_type == "reciprocally retained":
         return ks_list_filtered, weight_list_filtered
     if data_type == "orthologs":
         return ks_list_filtered
diff --git a/ksrates/fc_plotting.py b/ksrates/fc_plotting.py
index 05ab3c8a530af9d1647ab236b6f96ef60645826a..36a7eedca12efa290b03a46f18b3a0e00caab05e 100755
--- a/ksrates/fc_plotting.py
+++ b/ksrates/fc_plotting.py
@@ -31,15 +31,16 @@ class StringLegendHandler(HandlerBase):
 Legend.update_default_handler_map({str: StringLegendHandler()})
 
 # Some constants to use in plotting
-ALPHA_PARANOME_HISTOGRAM = 0.75
-ALPHA_ANCHOR_HISTOGRAM = 0.75
+ALPHA_HISTOGRAM = 0.75
 ALPHA_DIVERGENCE_RECT = 0.3
 ALPHA_DIVERGENCE_LINE = 0.6
-COLOR_PARANOME_HISTOGRAM = to_rgba("0.79", ALPHA_PARANOME_HISTOGRAM)
-COLOR_ANCHOR_HISTOGRAM = to_rgba("0.64", ALPHA_ANCHOR_HISTOGRAM)
+COLOR_PARANOME_HISTOGRAM = to_rgba("0.79", ALPHA_HISTOGRAM)
+COLOR_ANCHOR_HISTOGRAM = to_rgba("0.64", ALPHA_HISTOGRAM)
+COLOR_REC_RET_HISTOGRAM = to_rgba("0.48", ALPHA_HISTOGRAM)  # TODO: check IF 0.48 corresponds to 0.2 below!
 
 COLOR_PARANOME_KDE = "0.6"
 COLOR_ANCHOR_KDE = "0.4"
+COLOR_REC_RET_KDE = "0.2"
 
 SIZE_CIRCLE_LABEL_WHITE = 220
 LINEWIDTH_CIRCLE_LABEL_BORDER = 1
@@ -53,7 +54,8 @@ _MIXED_UNADJUSTED_PLOT_FILENAME = "mixed_{}_unadjusted.pdf"
 
 def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not, correction_table_available,
-                               plot_correction_arrows, paranome_data=True, colinearity_data=True):
+                               plot_correction_arrows, paranome_data=True, colinearity_data=True,
+                               reciprocal_retention_data=True, top=None, rank_type=None):
     """
     Initializes a figure with a single empty plot for the mixed distribution.
 
@@ -65,6 +67,8 @@ def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not,
     :param plot_correction_arrows: boolean stating whether adjustment arrows will be plotted or not
+    :param reciprocal_retention_data: boolean stating whether reciprocally retained gene family data will be plotted
+    :param top: number of top reciprocally retained gene families considered (used in the plot subtitle)
+    :param rank_type: reciprocal retention rank type, "lambda" or "combined" (used in the plot subtitle)
     :return: figure and axis objects
     """
+    # TODO: set paranome_data, colinearity_data and reciprocal_retention_data to False by default in args
+
     species_escape_whitespaces = species.replace(' ', '\ ')
 
     # Increase figure size in height if extra space for the correction arrows is required and
@@ -88,12 +92,18 @@ def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, corrected_or_not,
     else:
         fig.suptitle("$K_\mathregular{S}$" + f" distribution for ${species_escape_whitespaces}$")
 
+    # TODO: will this subtitle stay in the final version of the Ks plot?
+    if reciprocal_retention_data:
+        ax.set_title(f"Top {top} GFs from {rank_type} ranking")
+
     seaborn.despine(offset=10)
     ax.set_xlabel("$K_\mathregular{S}$")
     if paranome_data:  # If the paranome (with or without anchors) is going to be plotted
         ax.set_ylabel("Number of retained duplicates (weighted)")
     elif colinearity_data:  # If only anchor pairs are going to be plotted
         ax.set_ylabel("Number of retained anchor pairs (weighted)")
+    elif reciprocal_retention_data:  # If only reciprocally retained data are going to be plotted
+        ax.set_ylabel("Number of retained duplicates (weighted)")
 
     ax.set_xlim(0, x_max_lim)
     if isinstance(y_max_lim, float):
@@ -187,6 +197,8 @@ def plot_histogram(legend_label, axis, ks_list, bin_list, bin_width, max_ks, bw_
         color_hist = COLOR_PARANOME_KDE
     elif legend_label == "Anchor pairs":
         color_hist = COLOR_ANCHOR_KDE
+    elif legend_label == "Reciprocally retained paralogs":
+        color_hist = COLOR_REC_RET_KDE
     axis.plot(kde_x, kde_y * len(ks_list_reflected) * bin_width * np.mean(weight_list_reflected), color=color_hist, linewidth=1.3)
 
     return hist
@@ -207,12 +219,16 @@ def plot_histogram_for_anchor_clustering(axis, anchor_ks_list, anchors_weights,
     set_mixed_plot_height(axis, y_max_lim, hist)
 
 
-def set_mixed_plot_height(axis, y_max_lim, hist):
+def set_mixed_plot_height(axis, y_max_lim, hist, second_hist=None):
     """
-    Sets the height of the plot based on the tallest histogram bin (either of the
-    paranome distribution or of the anchor pair distribution).
+    Sets the height of the plot based on the tallest histogram bin (either of the paranome distribution or of the anchor pair distribution).
+    If only one histogram is given, it comes from the whole-paranome distribution and is going to be the tallest; if two are
+    given (anchor pair and reciprocally retained distributions), the height is set on the basis of the taller of the two.
     """
+    # TODO: check if this new strategy is correct
     tallest_bin = max(hist[0])
+    if second_hist is not None and max(second_hist[0]) > tallest_bin:
+        tallest_bin = max(second_hist[0])
     axis.set_ylim(0, tallest_bin * 1.25)
 
 
@@ -449,7 +465,7 @@ def define_legend_size(axis):
     return final_legend_size
 
 
-def create_legend(axis, paranome, colinearity, legend_size):
+def create_legend(axis, paranome, colinearity, reciprocal_retention, legend_size):
     """
     Places the legend elements associated to the histograms at the beginning of the legend,\\
     while by default they are placed at the end.
@@ -464,25 +480,58 @@ def create_legend(axis, paranome, colinearity, legend_size):
     sorted_handles, sorted_labels = handles.copy(), labels.copy()
     paranome_rect = Patch(facecolor=COLOR_PARANOME_HISTOGRAM, edgecolor="w")
     anchors_rect = Patch(facecolor=COLOR_ANCHOR_HISTOGRAM, edgecolor="w")
+    rec_ret_rect = Patch(facecolor=COLOR_REC_RET_HISTOGRAM, edgecolor="w")
     # empty patch used as spacer between histograms and divergence line legend entries
     empty_rect = Patch(fill=False, edgecolor='none', visible=False)
 
-    if paranome and not colinearity:
+    if paranome and not colinearity and not reciprocal_retention:  # only paranome
         sorted_handles = [paranome_rect, empty_rect, "Divergence with:"] + sorted_handles[:-1]
         sorted_labels = [sorted_labels[-1]] + ["", ""] + sorted_labels[:-1]
-    elif not paranome and colinearity:
+    elif not paranome and colinearity and not reciprocal_retention:  # only colinearity
         sorted_handles = [anchors_rect, empty_rect, "Divergence with:"] + sorted_handles[:-1]
         sorted_labels = [sorted_labels[-1]] + ["", ""] + sorted_labels[:-1]
+    elif not paranome and not colinearity and reciprocal_retention:  # only reciprocal retention
+        sorted_handles = [rec_ret_rect, empty_rect, "Divergence with:"] + sorted_handles[:-1]
+        sorted_labels = [sorted_labels[-1]] + ["", ""] + sorted_labels[:-1]
-    elif paranome and colinearity:
+    elif paranome and colinearity and not reciprocal_retention:  # paranome and colinearity
         sorted_handles = [paranome_rect, anchors_rect, empty_rect, "Divergence with:"] + sorted_handles[:-2]
         sorted_labels = sorted_labels[-2:] + ["", ""] + sorted_labels[:-2]
+    elif paranome and not colinearity and reciprocal_retention:  # paranome and reciprocal retention
+        sorted_handles = [paranome_rect, rec_ret_rect, empty_rect, "Divergence with:"] + sorted_handles[:-2]
+        sorted_labels = sorted_labels[-2:] + ["", ""] + sorted_labels[:-2]
+    elif not paranome and colinearity and reciprocal_retention:  # colinearity and reciprocal retention
+        sorted_handles = [anchors_rect, rec_ret_rect, empty_rect, "Divergence with:"] + sorted_handles[:-2]
+        sorted_labels = sorted_labels[-2:] + ["", ""] + sorted_labels[:-2]
+    elif paranome and colinearity and reciprocal_retention:  # all three data types
+        sorted_handles = [paranome_rect, anchors_rect, rec_ret_rect, empty_rect, "Divergence with:"] + sorted_handles[:-3]
+        sorted_labels = sorted_labels[-3:] + ["", ""] + sorted_labels[:-3]
 
     lgd = axis.legend(sorted_handles, sorted_labels, handlelength=1.5, mode="expand", loc="upper left", bbox_to_anchor=legend_size)
     return lgd
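+
+# Editor's note (standalone illustration with made-up entries): for n histogram entries the
+# chain above must prepend n patches plus the two spacer entries and drop the last n items
+# from both lists, e.g. for n = 3:
+#     handles = ["div1", "div2", "hist1", "hist2", "hist3"]   # histogram handles at the end
+#     labels  = ["d1", "d2", "Paranome", "Anchor pairs", "Reciprocally retained paralogs"]
+#     new_handles = [paranome_rect, anchors_rect, rec_ret_rect, empty_rect, "Divergence with:"] + handles[:-3]
+#     new_labels  = labels[-3:] + ["", ""] + labels[:-3]
+#     assert len(new_handles) == len(new_labels)  # 7 == 7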
 
 
-def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available, paranome, colinearity):
+def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available,
+                    paranome, colinearity, reciprocal_retention):
     """
     This function must be called to save the mixed distribution figure in order to adjust
     the figure layout: the plot area is shrunk to the left and some reasonable space is
     left on the right side for the legend.
@@ -500,12 +549,12 @@ def save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correctio
         chart_box = ax_uncorr.get_position()
         ax_uncorr.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height])
-        lgd = create_legend(ax_uncorr, paranome, colinearity, legend_size)
+        lgd = create_legend(ax_uncorr, paranome, colinearity, reciprocal_retention, legend_size)
         fig_uncorr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_UNADJUSTED_PLOT_FILENAME.format(species)),
                            bbox_extra_artists=(lgd, fig_uncorr._suptitle), bbox_inches="tight", transparent=True, format="pdf")
 
         ax_corr.set_position([chart_box.x0, chart_box.y0, chart_box.width*0.65, chart_box.height])
-        lgd = create_legend(ax_corr, paranome, colinearity, legend_size)
+        lgd = create_legend(ax_corr, paranome, colinearity, reciprocal_retention, legend_size)
         fig_corr.savefig(os.path.join("rate_adjustment", f"{species}", _MIXED_ADJUSTED_PLOT_FILENAME.format(species)),
                          bbox_extra_artists=(lgd, fig_corr._suptitle), bbox_inches="tight", transparent=True, format="pdf")
     else:
diff --git a/ksrates/fc_reciprocal_retention.py b/ksrates/fc_reciprocal_retention.py
new file mode 100755
index 0000000000000000000000000000000000000000..3b888bff3911074aa95eb6eeeb91ef6898e335d3
--- /dev/null
+++ b/ksrates/fc_reciprocal_retention.py
@@ -0,0 +1,553 @@
+import os
+import logging
+import datetime
+import shutil
+import subprocess as sp
+from pandas import read_csv, Series, DataFrame, read_excel, concat, errors, set_option
+from numpy import where, median, mean, full
+import sys
+from scipy.stats import mannwhitneyu
+import collections
+from math import sqrt, ceil
+from wgd_ksrates.utils import read_fasta
+from Bio import SeqIO
+from Bio.Data.CodonTable import TranslationError
+import matplotlib.pyplot as plt
+plt.style.use(os.path.join(f"{os.path.dirname(os.path.abspath(__file__))}", "ks.mplstyle"))
+
+ids = {'alyr': 'Arabidopsis lyrata', 'atha': 'Arabidopsis thaliana', 'atri': 'Amborella trichopoda', 'bdis': 'Brachypodium distachyon', 'brap': 'Brassica rapa', 'cari': 'Cicer arietinum', 'ccaj': 'Cajanus cajan', 'clan': 'Citrullus lanatus', 'cpap': 'Carica papaya', 'crub': 'Capsella rubella', 'csat': 'Cucumis sativus', 'fves': 'Fragaria vesca', 'gmax': 'Glycine max', 'grai': 'Gossypium raimondii', 'hvul': 'Hordeum vulgare', 'jcur': 'Jatropha curcas', 'ljap': 'Lotus japonicus', 'lusi': 'Linum usitatissimum', 'macu': 'Musa acuminata', 'mesc': 'Manihot esculenta', 'mtru': 'Medicago truncatula', 'osat': 'Oryza sativa', 'pbre': 'Pyrus bretschneideri', 'pdac': 'Phoenix dactylifera', 'pmum': 'Prunus mume', 'pper': 'Prunus persica', 'ptri': 'Populus trichocarpa', 'rcom': 'Ricinus communis', 'sbic': 'Sorghum bicolor', 'sita': 'Setaria italica', 'slyc': 'Solanum lycopersicum', 'stub': 'Solanum tuberosum', 'tcac': 'Theobroma cacao', 'tpar': 'Thellungiella parvula', 'vvin': 'Vitis vinifera', 'zmay': 'Zea mays'}
+
+# -----------------------------------------------------------------------------
+
+# FUNCTIONS USED TO GENERATE A FASTA DATABASE OF THE RECIPROCALLY RETAINED GENE FAMILIES IN THE 36 SPECIES
+# FROM WHICH THEY WERE ORIGINALLY COMPUTED. THE MAIN FUNCTION IN THIS SECTION IS "make_gene_family_database()".
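+
+# Editor's sketch (hypothetical usage, mirroring the commented-out call in
+# extend_orthogroups.py): building the complete top-2000 lambda-rank database would amount to
+#     make_gene_family_database(top=2000, rank_type="lambda",
+#                               filter_false_predictions=False,
+#                               filter_zero_false_predictions=False,
+#                               base_dir="paralog_distributions")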
+# TODO: TEST AND FIX THE INTEGRATION OF THESE FUNCTIONS INTO THE KSRATES PIPELINE IN "extend_orthogroups.py",
+# where the main function should be called (for now the call is commented out).
+
+def get_all_gf_genes(path_for_top_gf_tsv, filter_zero_false_predictions, filter_false_predictions, rank_type, top=None):
+    """
+    If the top parameter is set to a number, consider only that many top gene families (e.g. the top 1000).
+    This avoids including weakly reciprocally retained gene families. If asked, also filter away those GFs that
+    are known to perform badly because they lead to too many false predictions (FPs+FNs).
+
+    Generate a file (gf_genes_all.tsv or gf_genes_top_XXXX.tsv) with, for each row, a GF followed by
+    all the genes in it, one per column. Then generate a file listing all such genes and
+    a file listing all genes in the first column and their GF in the second (gf_genes_with_gf_list.tsv).
+    """
+    # Generate or open the file listing all the GFs with all their genes
+    if top is None:
+        gf_genes_tsv = "gf_genes_all.tsv"
+    else:
+        gf_genes_tsv = f"gf_genes_top_{top}_{rank_type}.tsv"
+
+    if not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_tsv)):
+        # Get the ranked gene families
+        gf_rank = read_excel(open("gene_families.xlsx", "rb"), "gene family rankings", header=None)
+
+        # Get the genes associated to all gene families
+        gf_genes_raw = read_excel(open("gene_families.xlsx", "rb"), "orthogroup member genes", header=None)
+
+        if top is not None:
+            if rank_type == "combined":
+                top_gf_ids = gf_rank.iloc[:,1].iloc[:top]
+                gf_genes_raw = gf_genes_raw.loc[gf_genes_raw.iloc[:,0].isin(top_gf_ids)]
+            elif rank_type == "lambda":
+                # Reorder according to the lambda rank (column 8)
+                gf_rank = gf_rank.sort_values(by=[8], ignore_index=True)
+                # Slice the first e.g. 2000
+                top_gf_ids = gf_rank.iloc[:,1].iloc[:top]
+                gf_genes_raw = gf_genes_raw.loc[gf_genes_raw.iloc[:,0].isin(top_gf_ids)]
+
+        # Expand each gene in the GFs as an independent column entry
+        gf_genes = gf_genes_raw.iloc[:,:2]  # First two columns (GF ID and number of species)
+        genes_columns = gf_genes_raw.iloc[:,2].str.split(",", expand=True)  # Process the third column (genes)
+        gf_genes = concat([gf_genes, genes_columns], axis=1, ignore_index=True)
+
+        with open(os.path.join(path_for_top_gf_tsv, gf_genes_tsv), "w+") as f:
+            f.write(gf_genes.to_csv(sep="\t", header=False, index=False))
+    else:
+        logging.info(f"- {gf_genes_tsv} already exists")
+        gf_genes = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_tsv), sep="\t", header=None)
+
+    # If asked, remove the GFs from the top 2000 that performed badly (too many false predictions)
+    basefolder_filtering_files = f"../extended_orthogroups_top_{top}_{rank_type}_filtered/"
+    if filter_zero_false_predictions:
+        gf_ids_filtered_zero_false_predictions = read_csv(os.path.join(basefolder_filtering_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None)
+        gf_genes = gf_genes.loc[gf_genes.iloc[:,0].isin(gf_ids_filtered_zero_false_predictions[0])]
+    elif filter_false_predictions:
+        gf_ids_filtered_false_predictions = read_csv(os.path.join(basefolder_filtering_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None)
+        gf_genes = gf_genes.loc[gf_genes.iloc[:,0].isin(gf_ids_filtered_false_predictions[0])]
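+
+    # Editor's note (hypothetical IDs; the "species|gene" format is parsed below): the two
+    # list files generated below differ only in the extra GF column, e.g.
+    #     gf_genes_list_top_2000_lambda.tsv:            AT1G01010
+    #     gf_genes_with_gf_list_top_2000_lambda.tsv:    AT1G01010 <tab> GF_ID
+    # where GF_ID stands for whatever gene family identifier the first column of
+    # gene_families.xlsx uses.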
+
+    # Generate or open the list of genes and the list of genes with their GF
+    if filter_zero_false_predictions:  # remaining from the top 2000 after filtering away GFs with at least one false prediction
+        gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"
+    elif filter_false_predictions:  # remaining GFs from the top 2000 after filtering away GFs with too many false predictions
+        gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}_filtered_false_predictions.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"
+    elif top is None:
+        gf_genes_list_tsv, gf_genes_with_gf_list_tsv = "gf_genes_list.tsv", "gf_genes_with_gf_list.tsv"
+    else:  # e.g. top = 2000
+        gf_genes_list_tsv, gf_genes_with_gf_list_tsv = f"gf_genes_list_top_{top}_{rank_type}.tsv", f"gf_genes_with_gf_list_top_{top}_{rank_type}.tsv"
+
+    if not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv)) or not os.path.exists(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv)):
+        logging.info(f"- Generating {gf_genes_list_tsv} and {gf_genes_with_gf_list_tsv}...")
+
+        # Get the genes associated to the GFs in both rank types
+        gf_genes_list = []
+        gf_genes_with_gf_list = []
+        for __, gf_row in gf_genes.iterrows():
+            for gene_col in gf_row[2:]:
+                gf = gf_row[0]
+                if isinstance(gene_col, str):
+                    gene_id = gene_col.split("|")[1]
+                else:
+                    continue
+                gf_genes_list.append(gene_id)
+                gf_genes_with_gf_list.append([gene_id, gf])
+
+        # Save the GFs together with their genes in TSV files
+        gf_genes_list = Series(gf_genes_list).sort_values(ignore_index=True)
+        gf_genes_list.to_csv(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv), sep="\t", header=False, index=False)
+
+        gf_genes_with_gf_list = DataFrame.from_records(gf_genes_with_gf_list)
+        gf_genes_with_gf_list.columns = ["gene_id", "gf"]
+        gf_genes_with_gf_list = gf_genes_with_gf_list.sort_values(by=["gene_id"], ignore_index=True)
+        gf_genes_with_gf_list.to_csv(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv), sep="\t", header=False, index=False)
+
+        return gf_genes_list, gf_genes_with_gf_list
+    else:
+        logging.info(f"- {gf_genes_list_tsv} and {gf_genes_with_gf_list_tsv} already exist")
+        gf_genes_list = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_list_tsv), sep="\t", header=None)
+        gf_genes_with_gf_list = read_csv(os.path.join(path_for_top_gf_tsv, gf_genes_with_gf_list_tsv), sep="\t", header=None)
+        return gf_genes_list.squeeze().values.tolist(), gf_genes_with_gf_list
+
+
+def flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database):
+    """
+    Generates the database of nucleotidic sequences from the orthogroup genes (FASTA format).
+    """
+    database = {}
+    if not os.path.exists(os.path.join("..", orthorgoups_database)):
+
+        species_fasta = {}
+        for species_id in ids_names:
+            species_fasta.update(read_fasta(os.path.join("..", "sequences", "old_genome_versions", f"{species_id}.fasta")))
+
+        with open(os.path.join("..", orthorgoups_database), "w+") as f:
+            for gene_id in gf_genes_list:
+                if gene_id in species_fasta.keys():
+                    database[gene_id] = species_fasta[gene_id]
+                    f.write(f">{gene_id}\n")
+                    f.write(f"{species_fasta[gene_id]}\n")
+    else:
+        logging.info(f"- {orthorgoups_database} already exists")
+
+
+def dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid):
+    """
+    Translates the database of nucleotidic sequences into aminoacid sequences,
+    to be given to diamond.
+ """ + with open(os.path.join("..", orthorgoups_database), "r") as f: + database_dna = f.readlines() + + if not os.path.exists(os.path.join("..", orthorgoups_database_aminoacid)): + with open (os.path.join("..", orthorgoups_database_aminoacid), "w+") as f: + for i, seq in enumerate(SeqIO.parse(database_dna, 'fasta')): + try: + aa_seq = seq.translate(to_stop=False, cds=False, id=seq.id) + f.write(f">{seq.id}\n") + f.write(f"{aa_seq.seq}\n") + except TranslationError as e: + logging.error("Translation error ({}) in seq {}".format( + e, seq.id)) + continue + else: + logging.info(f"- {orthorgoups_database_aminoacid} already exists") + + +# MAIN FUNCTION THAT USES ALL FUNCTIONS ABOVE +def make_gene_family_database(top, rank_type, filter_false_predictions, filter_zero_false_predictions, base_dir, remove_test_species=False): + """ + Generates a nucleotidic FASTA file and a translated FASTA file that are databases for all sequences + from the 36 species used to make the top e.g. 2000 reciprocally retained gene families out of the total 9178 core angiosperm gene families. + This function can also generate un-complete databases where the sequences coming from a certain test species (belonging to those + 36 species) are removed, to be used during the testing phase of the workflow that detects RR gene families. + + :param top: integer stating the number of top RR gene families to be considered for making the sequence database + :param rank_type: string stating which reciprocal retention rank to be used for the top ("lambda" or "combined") + :param filter_false_predictions: boolean stating whether to use a subset of the top gene families, with fair performance + :param filter_zero_false_predictions: boolean stating whether to use a subset of the top gene families, with perfect performance + :param remove_test_species: boolean stating whether to generate un-complete databases for testing a certain test species + :return: None, but generates database files + """ + ids_names = {'alyr':'Arabidopsis lyrata', 'atha': 'Arabidopsis thaliana', 'atri':'Amborella trichopoda', \ + 'bdis':'Brachypodium distachyon', 'brap':'Brassica rapa', 'cari':'Cicer arietinum', 'ccaj':'Cajanus cajan', \ + 'clan':'Citrullus lanatus', 'cpap':'Carica papaya', 'crub':'Capsella rubella', 'csat':'Cucumis sativus', \ + 'fves': 'Fragaria vesca', 'gmax':'Glycine maxima', 'grai':'Gossypium raimondii', 'jcur':'Jatropha curcas', \ + 'ljap':'Lotus japonicus', 'lusi':'Linum usitatissiumum', 'macu':'Musa acuminata', 'mesc': 'Manihot esculenta', \ + 'mtru':'Medicago truncatula', 'osat': 'Oryza sativa', 'pbre':'Pyrus bretschneideri', 'pdac':'Phoenix dactylifera', \ + 'pmum':'Prunus mume', 'pper':'Prunus persica', 'ptri':'Populus trichocarpa', 'rcom':'Ricinus communis', \ + 'sbic':'Sorghum bicolor', 'sita':'Setaria italica', 'slyc':'Solanum lycopersicum', 'stub':'Solanum tuberosum', \ + 'tcac':'Theobroma cacao', 'tpar':'Thellungiella parvula', 'vvin': 'Vitis vinifera', 'zmay':'Zea mays'} + + logging.info("Get a list of all GF genes") + if filter_zero_false_predictions: + logging.info("Filter away from database those GFs that have at least one false prediction") + elif filter_false_predictions: + logging.info("Filter away from database those GFs that lead to too many false predictions") + + gf_genes_list, gf_genes_with_gf_list = get_all_gf_genes("..", filter_zero_false_predictions, filter_false_predictions, rank_type, top) + + # Default names for databases (will be changed if necessary in this code later on) + if not filter_zero_false_predictions 
and not filter_false_predictions: + orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_COMPLETE.fasta" + orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_COMPLETE.fasta" + elif filter_zero_false_predictions: + orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta" + orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_COMPLETE.fasta" + elif filter_false_predictions: + orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta" + orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_COMPLETE.fasta" + + if remove_test_species: + # Use a test species ID selecting it from a key in ids_names dictionary + test_species, scientific_name = "atha", "arabidopsis" + logging.info(f"Removing species: {scientific_name}") + del ids_names[test_species] + # Overwrite output database names by stating that they don't include a species + orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_test_without_{scientific_name}.fasta" + orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_test_without_{scientific_name}.fasta" + + if filter_zero_false_predictions: + orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{scientific_name}.fasta" + orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_zero_false_predictions_test_without_{scientific_name}.fasta" + if filter_false_predictions: + orthorgoups_database = f"orthogroups_database_top_{top}_{rank_type}_filtered_false_predictions_test_without_{scientific_name}.fasta" + orthorgoups_database_aminoacid = f"orthogroups_database_aminoacid_top_{top}_{rank_type}_filtered_false_predictions_test_without_{scientific_name}.fasta" + + logging.info("Generate the nucleotidic database of GFs") + flesh_ids_out_with_sequences(gf_genes_list, ids_names, orthorgoups_database) + + logging.info("Translate into aminoacidic sequence database (ready for diamond)") + dna_to_aminoacid(orthorgoups_database, orthorgoups_database_aminoacid) + + +#------------------------------------------------------------------------------ + +# FUNCTIONS FOR THE RECIPROCAL RETENTION PIPELINE + +def init_logging(log_heading, logging_level): + """ + Initialize the logging environment and print a header to the log. 
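+
+    Example: init_logging("Reciprocal retention pipeline", "INFO") prints the heading
+    between two dashed lines, together with the current date and time.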
+
+    :param log_heading: heading for the log
+    :param logging_level: minimum severity level of the messages to be logged, e.g. "INFO"
+    :return: nothing
+    """
+    logging.basicConfig(format='%(levelname)s\t%(message)s', level=logging_level, stream=sys.stdout)
+    length = ceil((len(log_heading)/2))
+    logging.info('- ' * length)
+    logging.info(log_heading)
+    logging.info(datetime.datetime.today().ctime())
+    logging.info('- ' * length)
+
+
+def associate_genes_to_species():
+    """
+    Parses the "orthogroup member genes" sheet of gene_families.xlsx and builds a table
+    listing, for each species ID, all genes of that species occurring in the gene families.
+    The table is written to genes_per_species.tsv, or read from it if it already exists.
+    """
+    filename = "genes_per_species.tsv"
+    if not os.path.exists(filename):
+        # Get the ranked gene families
+        gf_rank = read_excel(open(os.path.join("..", "wgm-gf-markers", "gene_families.xlsx"), "rb"), "gene family rankings", header=None)
+        # Get the genes associated to all gene families
+        gf_genes_all_raw = read_excel(open(os.path.join("..", "wgm-gf-markers", "gene_families.xlsx"), "rb"), "orthogroup member genes", header=None)
+
+        # Expand each gene in the GFs as an independent column entry
+        gf_genes_all = gf_genes_all_raw.iloc[:,:2] # First two columns (GF ID and number of species)
+        genes_columns = gf_genes_all_raw.iloc[:,2].str.split(",", expand=True) # Process third column (genes)
+        gf_genes_all = concat([gf_genes_all, genes_columns], axis=1, ignore_index=True)
+
+        missing = {}
+        genes_per_species = {}
+        for id in ids:  # "ids" is expected to be the module-level list of the 36 species IDs
+            genes_per_species[id] = []
+        for i in range(genes_columns.index.size):
+            for j in range(genes_columns.columns.size):
+                # Skip empty cells (None) and NaN cells (floats)
+                if genes_columns.iloc[i,j] is not None and not isinstance(genes_columns.iloc[i,j], float):
+                    species_id, gene = genes_columns.iloc[i,j].split("|")
+                    species_id = species_id.lower()
+                    if species_id in genes_per_species:
+                        genes_per_species[species_id].append(gene)
+                    else:
+                        # Log each unknown species ID only the first time it is encountered
+                        if species_id not in missing:
+                            missing[species_id] = []
+                            logging.info(species_id)
+        df = DataFrame.from_dict(genes_per_species, orient="index")
+
+        with open(filename, "w+") as f:
+            f.write(df.to_csv(sep="\t", header=False, index=True))
+    else:
+        df = read_csv(filename, sep="\t", header=None, index_col=0)
+    return df
+
+
+def make_diamond_db(fasta_aa, output_folder=".", n_threads=8, dmd_tmp_dir=False):
+    """
+    Takes as input the FASTA text file of a database and returns the path to the
+    diamond database (binary file) built from it.
+    """
+    if dmd_tmp_dir: # Make database into the temporary folder
+        fasta_aa_db = f"{fasta_aa}.db"
+    else:
+        fasta_aa_db = f"{os.path.basename(fasta_aa)}.db"
+
+    fasta_aa_db = os.path.join(output_folder, fasta_aa_db)
+    cmd = ["diamond", "makedb", "--threads", f"{n_threads}", "--in", fasta_aa, "-d", fasta_aa_db]
+    # capture_output option requires Python 3.7
+    # out = sp.run(cmd, capture_output=True)
+    # logging.debug(out.stderr.decode())
+    sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
+    # if out.returncode == 1:
+    #     logging.error(out.stderr.decode())
+    return f"{fasta_aa_db}.dmnd"
+
+
+def run_diamond(query_fasta, subject_db, outfile, dmd_tmp_dir, pident_threshold, column_names=[],
+                n_threads=8, max_hits=50, eval_cutoff=1e-10):
+    """
+    Takes as input the path to the query FASTA file and the path to the
+    subject database.
+    Outputs a TSV file with the diamond matches and returns it as a DataFrame,
+    keeping only the hits whose e-value is at most "eval_cutoff".
+    If there are no matches, returns an empty DataFrame.
+    It uses the default output format 6, but the fields can be personalized if argument "column_names" is given.
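+
+    Example of a personalized tabular format (hypothetical paths; the field names must be
+    valid diamond "--outfmt 6" identifiers, and they become the DataFrame column names):
+
+        run_diamond("new_species.fasta", "orthogroups_database.aa.db.dmnd", "hits.dmd",
+                    tmp_dir, 40, column_names=["qseqid", "sseqid", "pident", "length",
+                                               "evalue", "qstart", "qend", "qlen"])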
+ """ + # Multi-thread, max number of hits, sensitive mode, minimum percentage identity, only the best HSP + cmd = ["diamond", "blastx", "--threads", f"{n_threads}", "-k", f"{max_hits}", "-d", subject_db, + "-q", query_fasta, "--sensitive", "--id", f"{pident_threshold}", "--max-hsps", "1", "-o", outfile, + "--tmpdir", dmd_tmp_dir, "--outfmt", "6"] + cmd.extend(column_names) + # capture_output option required Python 3.7 + # out = sp.run(cmd, capture_output=True) + # logging.debug(out.stderr.decode()) + sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) + try: + df = read_csv(outfile, sep="\t", header=None, names=column_names) + # df = df.loc[df["qseqid"] != df["sseqid"]] + dmd_hits = {} + dmd_hits[query_fasta] = df = df.loc[df["evalue"] <= eval] + except errors.EmptyDataError: + df = DataFrame() + + return df + + +def get_query_coverage_in_dmd_table(dmd, percentage, print_log=True): + """ + Compute the percentage of diamond results that have a query coverage of the alignment + of at least some percentage given in the related parameter (e.g. at least 50% included) + E.g. 44% of hits have an alignment coverage for the query > 90%") + """ + if len(dmd) != 0: + perc_qcov = len(dmd.loc[(abs(dmd["qend"] - dmd["qstart"])) / dmd["qlen"] * 100 >= percentage]) / len(dmd) * 100 + if print_log: + logging.info(f"{round(perc_qcov)}% of hits has an alignment coverage for the query > {percentage}%") + return perc_qcov + else: + return -1 + +def print_query_coverage_list_from_dmd_table(dmd): + """ + Computes the query coverage of the alignment for a given diamond table. + Adds to the diamond table an extra last column with such coverage. + """ + from pandas import concat + cov_col = DataFrame(((abs(dmd["qend"] - dmd["qstart"])) / dmd["qlen"]), columns=["qcov"]) + dmd_with_coverage = concat([dmd, cov_col], axis=1) + return dmd_with_coverage + +def compute_qcov(pred_type, test_species_predictions, tot_number_predictions, min_qcov_threshold, + min_perc_hits_with_good_qcov, dmd_with_orthogroups, gf_genes_with_gf_list): + """ + """ + set_option('display.max_rows', None) + set_option('display.max_columns', None) + set_option('display.width', None) + set_option('display.max_colwidth', None) + + fp_with_insufficient_coverage, fp_with_sufficient_coverage, fp_with_no_dmd_matches_and_no_coverage = 0, 0, 0 + for pred in test_species_predictions: + matched_genes_dmd_table_coverage = print_query_coverage_list_from_dmd_table(dmd_with_orthogroups.loc[dmd_with_orthogroups["qseqid"] == pred]) + qcov = get_query_coverage_in_dmd_table(matched_genes_dmd_table_coverage, min_qcov_threshold, print_log=False) + if qcov == -1: # If there is a non-empty dmd table, print it + fp_with_no_dmd_matches_and_no_coverage += 1 + # logging.info(f"{pred_type} {pred}: no diamond hits, can't compute query alignment coverage.") + # If at least 80% included of hits has at least 50% of qcov + elif qcov >= min_perc_hits_with_good_qcov: + fp_with_sufficient_coverage +=1 + # logging.info(f"{pred_type} {pred} has good coverage, will be used. Real GF: {gf_genes_with_gf_list.loc[gf_genes_with_gf_list[0] == pred].values.tolist()}") + # logging.info(f"{matched_genes_dmd_table_coverage[['qseqid', 'sseqid', 'GF']]}") + else: # < 80: If less than 80% of hits has at least 50% qcov + fp_with_insufficient_coverage += 1 + # logging.info(f"{pred_type} {pred} has insufficient coverage, will be discarded. 
Real GF: {gf_genes_with_gf_list.loc[gf_genes_with_gf_list[0] == pred].values.tolist()}") + # logging.info(f"{matched_genes_dmd_table_coverage[['qseqid', 'sseqid', 'GF']]}") + # print() + logging.info(f"{pred_type} with bad qcov: {perc(fp_with_insufficient_coverage, tot_number_predictions)} ({fp_with_insufficient_coverage})") + logging.info(f"{pred_type} with no dmd matches and no qcov: {perc(fp_with_no_dmd_matches_and_no_coverage, tot_number_predictions)} ({fp_with_no_dmd_matches_and_no_coverage})") + logging.info(f"{pred_type} with good qcov: {perc(fp_with_sufficient_coverage, tot_number_predictions)} ({fp_with_sufficient_coverage})") + + +def perc(part, whole): + p = round(100 * float(part)/float(whole), 2) + return str(p) + "%" + +def conf_matrix(tp, fp, fn, tn): + logging.info("PERCENTAGES OVER TOTAL DATASET") + p_tp = (tp / (tp + fp + fn + tn)) * 100 + p_fp = (fp / (tp + fp + fn + tn)) * 100 + p_fn = (fn / (tp + fp + fn + tn)) * 100 + p_tn = (tn / (tp + fp + fn + tn)) * 100 + + logging.info(f" TP | FP ") + logging.info(f"{p_tp:.2f}% | {p_fp:.2f}%") + logging.info(f"---------------") + logging.info(f" FN | TN ") + logging.info(f"{p_fn:.2f}% | {p_tn:.2f}%") + logging.info("") + + logging.info("PRECISION and NEGATIVE PREDICTIVE VALUE (horizontally sums to 100%)") + p_tp = (tp / (tp + fp)) * 100 + p_fp = (fp / (tp + fp)) * 100 + p_fn = (fn / (fn + tn)) * 100 + p_tn = (tn / (fn + tn)) * 100 + + logging.info(f" TP FP ") + logging.info(f"{p_tp:.2f}% {p_fp:.2f}%") + logging.info(f"---------------") + logging.info(f" FN TN ") + logging.info(f"{p_fn:.2f}% {p_tn:.2f}%") + logging.info("") + + logging.info("TRUE POSITIVE RATE and FALSE POSITIVE RATE (vertically sums to 100%)") + p_tp = (tp / (tp + fn)) * 100 + p_fn = (fn / (tp + fn)) * 100 + p_fp = (fp / (fp + tn)) * 100 + p_tn = (tn / (fp + tn)) * 100 + + logging.info(f" TP | FP ") + logging.info(f"{p_tp:.2f}% | {p_fp:.2f}%") + logging.info(f" | ") + logging.info(f" FN | TN ") + logging.info(f"{p_fn:.2f}% | {p_tn:.2f}%") + return + +def tpr(tp, fn): + return round(tp / (tp + fn), 4) + +def f1score(tp, fp, fn): + return (2 * tp) / ( 2 * tp + fp + fn) + +def mcc(tp, fp, fn, tn): + num = (tp * tn) - (fp * fn) + den = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + return num/den + +def dna_to_aminoacid(database_dna_path, output_folder="."): + """ + Translates the database of nucleotidic sequences into aminoacid sequences, + to be given to diamond. + """ + database_aa_path = os.path.join(output_folder, f"{os.path.basename(database_dna_path)}.aa") + if not os.path.exists(database_aa_path): + with open (database_aa_path, "w+") as f: + for i, seq in enumerate(SeqIO.parse(database_dna_path, 'fasta')): + try: + aa_seq = seq.translate(to_stop=False, cds=False, id=seq.id) + f.write(f">{seq.id}\n") + f.write(f"{aa_seq.seq}\n") + except TranslationError as e: + logging.error("Translation error ({}) in seq {}".format( + e, seq.id)) + continue + return database_aa_path + +def assign_first_diamond_hit_with_rbh(species, fasta_file, species_sequences_filename, orthogroup_database_nucl_test, + query_seqs, dmd_with_orthogroups, pident_threshold, column_names, dmd_tmp_dir, basefolder_output_files): + """ + Assign to the query gene the orthogroup of the first (and best) hit, but only + if there is a reciprocal best hit correspondence between the two genes. 
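+    In short: query gene q of the new species was matched to gene m of the orthogroup
+    database; m is then searched with diamond against the new species' sequences, and q
+    is assigned to the gene family of m only if the best reverse hit of m is q itself.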
+ """ + logging.info(f"Temporary diamond directory: {dmd_tmp_dir}\n") + + filename = os.path.join(basefolder_output_files, f"assigned_gfs_{species}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") + + if not os.path.exists(filename) or os.path.getsize(filename) == 0: + + # Get the orthogroup database from which the sequence of the diamond hit is taken + orthogroup_database_test = read_csv(orthogroup_database_nucl_test, sep="\t", header=None).loc[:,0] + + # Get/Make the Atha FASTA diamond database + if not os.path.exists(os.path.join(f"{fasta_file}.aa.db.dmnd")): + test_species_database = make_diamond_db(species_sequences_filename, output_folder=basefolder_output_files) + else: + test_species_database = os.path.join(basefolder_output_files, f"{fasta_file}.aa.db.dmnd") + + assigned_gfs_file = list() + no_rbh = 0 + for query in query_seqs: + # Get the subset of matches for the current gene of the new species + single_gene_matches = dmd_with_orthogroups.loc[dmd_with_orthogroups.loc[:, "qseqid"] == query].iloc[0,:] + + # Get the best matched gene and obtain its best diamond hit against the test species (Atha, Vitis...) + matched_gene = single_gene_matches.loc["sseqid"] + matched_gene_id_index = orthogroup_database_test.loc[orthogroup_database_test.str.contains(matched_gene, regex=False)].index.tolist()[0] + best_hit_sequence_filename = os.path.join(dmd_tmp_dir, f"{query}_best_hit.fasta") + orthogroup_database_test.iloc[matched_gene_id_index:matched_gene_id_index+2].to_csv(best_hit_sequence_filename, sep="\t", header=None, index=False) + reverse_best_hits_dmd_table = os.path.join(dmd_tmp_dir, f"{query}_reverse_best_hit.dmd") + reverse_best_hits = run_diamond(best_hit_sequence_filename, test_species_database, reverse_best_hits_dmd_table, dmd_tmp_dir, + pident_threshold, column_names=column_names) + if reverse_best_hits.empty: + logging.info(f"Query {query} will be ignored: its matched gene '{matched_gene}' didn't find any reverse hit in test species database") + continue + + # If RBH, assign to query the gene family + try: + if reverse_best_hits.iloc[0,1] == query: + gf_match = single_gene_matches.loc["GF"] + assigned_gfs_file.append([query, gf_match]) + # Else ignore the query (assumed to be a false positive) + else: + no_rbh += 1 + + except IndexError: + logging.error("IndexError occurred", exc_info=True) + logging.error(f"{query}, {matched_gene}") + logging.error(f"{reverse_best_hits}:") + logging.error(reverse_best_hits) + except KeyError: # Why KeyError 13 when running Vitis and Phoenix? 
+ logging.error("KeyError occurred", exc_info=True) + logging.error(f"Query: {query}") + logging.error(f"Single gene matches dataframe:") + logging.error(single_gene_matches) + + + logging.warning(f"No RBHs for {no_rbh} queries out of {len(query_seqs)}") + + if len(assigned_gfs_file) == 0: + logging.error("None of the test species' queries had matched genes with reciprocal best hits.") + logging.error("Exiting.") + sys.exit() + + assigned_gfs_file = DataFrame(assigned_gfs_file) + with open(filename, "w+") as f: + f.write(assigned_gfs_file.to_csv(sep="\t", header=False, index=False)) + else: + logging.info(f"Found existing file with assigned orthogroups ({filename}).") + logging.info(f"Will skip") + assigned_gfs_file = read_csv(filename, sep="\t", header=None) + + return assigned_gfs_file diff --git a/ksrates/fc_wgd.py b/ksrates/fc_wgd.py index f7e501a0393fa4275d74afff847e7548fe516ad5..b01fcbe3b44405e2cb37d6928aa44ac963eb0221 100755 --- a/ksrates/fc_wgd.py +++ b/ksrates/fc_wgd.py @@ -9,16 +9,21 @@ from wgd_ksrates.utils import read_fasta from wgd_ksrates.blast_mcl import run_mcl_ava, ava_blast_to_abc, get_one_v_one_orthologs_rbh from wgd_ksrates.ks_distribution import ks_analysis_paranome, ks_analysis_one_vs_one from wgd_ksrates.colinearity import gff_parser +import ksrates.extend_orthogroups as recRet from ksrates.utils import merge_dicts, concat_files, can_i_run_software, translate_cds, write_fasta _OUTPUT_BLAST_FILE_PATTERN = '{}.blast.tsv' _OUTPUT_MCL_FILE_PATTERN = '{}.mcl.tsv' _OUTPUT_KS_FILE_PATTERN = '{}.ks.tsv' +_OUTPUT_MCL_REC_RET_FILE_PATTERN = '{}_rec_ret_top{}_{}_rank.mcl.tsv' +_OUTPUT_KS_REC_RET_FILE_PATTERN = '{}.ks_rec_ret.tsv' + _PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' _ORTHOLOGS_OUTPUT_DIR_PATTERN = 'wgd_{}_{}' _TMP_BLAST = '{}.blast_tmp' _TMP_KS = '{}.ks_tmp' +_TMP_KS_REC_RET = '{}.ks_rec_ret_tmp' def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflation_factor=2.0, aligner='muscle', min_msa_length=100, codeml='codeml', codeml_times=1, pairwise=False, @@ -64,7 +69,6 @@ def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflat do_ks = True if os.path.exists(output_dir): if os.path.exists(os.path.join(output_dir, output_blast_file)): - # Check if blast tmp folder of previous failed run exists (in this case the blast table is incomplete) if os.path.exists(tmp_blast_paralogs): logging.error(f'tmp directory [{tmp_blast_paralogs}] was found: leftover from a failed earlier run?') @@ -80,6 +84,7 @@ def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflat do_blast = False else: logging.info('No paralog blast data, will run wgd all versus all Blastp') + if os.path.exists(os.path.join(output_dir, output_mcl_file)): if overwrite or do_blast: logging.warning(f'Paralog gene families file {output_mcl_file} exists, will overwrite') @@ -191,8 +196,180 @@ def ks_paralogs(species_name, cds_fasta, base_dir='.', eval_cutoff=1e-10, inflat os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir output_mcl_path = os.path.join(output_dir, output_mcl_file) - results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, tmp_ks_paralogs, - output_dir, codeml, times=codeml_times, aligner=aligner, + results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, tmp_dir=tmp_ks_paralogs, + output_dir=output_dir, codeml_path=codeml, times=codeml_times, aligner=aligner, + ignore_prefixes=False, min_length=min_msa_length, 
 pairwise=pairwise,
+                                       max_gene_family_size=max_gene_family_size, method=weighting_method,
+                                       preserve=False, n_threads=n_threads)
+    if os.path.exists(tmp_ks_paralogs):
+        logging.error(f'tmp directory {tmp_ks_paralogs} still exists after analysis, '
+                      f'paralog Ks data may be incomplete and will not be written to paralog Ks file!')
+        logging.error('Please delete the tmp directory and rerun the analysis. Exiting.')
+        sys.exit(1)
+    if results_df is None or results_df.empty:
+        logging.warning('No paralog Ks data computed, will write empty paralog Ks file!')
+    with open(os.path.join(output_dir, output_ks_file), 'w+') as o:
+        o.write(results_df.round(5).to_csv(sep='\t'))
+    # change back to current directory as tmp dir got deleted and subsequent os.getcwd() may fail
+    os.chdir(cw_dir)
+
+    logging.info('---')
+    logging.info('Done')
+
+
+def ks_paralogs_rec_ret(species_name, cds_fasta, latin_name, top=2000, rank_type="lambda", pident_threshold=40,
+                        min_qcov_threshold=50, filter_false_predictions=False, filter_zero_false_predictions=False,
+                        base_dir='.', aligner='muscle', min_msa_length=100, codeml='codeml', codeml_times=1, pairwise=False,
+                        max_gene_family_size=200, weighting_method='fasttree', n_threads=4, overwrite=False):
+    """
+    Modified from wgd_cli.py
+
+    Detection of reciprocally retained gene families in the species and Ks estimation for the paralogs within them
+    :param species_name: name or ID of the species, used in output file names
+    :param cds_fasta: CDS fasta file
+    :param latin_name: Latin name of the species, used in the log messages
+    :param top: number of gene families to consider from the top of the reciprocal retention ranking (default 2000)
+    :param rank_type: type of reciprocal retention ranking, either "lambda" or "combined" (default "lambda")
+    :param pident_threshold: minimum percentage of alignment identity for a diamond hit ("--id" parameter)
+    :param min_qcov_threshold: minimum percentage of query alignment coverage required for a diamond hit
+    :param filter_false_predictions: use only the subset of gene families with fairly reliable orthogroup assignment
+    :param filter_zero_false_predictions: use only the subset of gene families with highly reliable orthogroup assignment
+    :param base_dir: directory where the output directory will be created in
+    :param aligner: aligner to use
+    :param min_msa_length: minimum multiple sequence alignment length
+    :param codeml: path to codeml executable for ML estimation of Ks
+    :param codeml_times: number of times to iteratively perform codeml ML estimation of Ks
+    :param pairwise: run in pairwise mode
+    :param max_gene_family_size: maximum size of a gene family to be included in analysis
+    :param weighting_method: weighting method (fasttree, phyml or alc)
+    :param n_threads: number of threads to use
+    :param overwrite: overwrite existing results
+    :return: nothing
+    """
+    # input checks
+    if not species_name:
+        logging.error('No species name provided. 
Exiting.') + sys.exit(1) + + output_mcl_rec_ret_file = _OUTPUT_MCL_REC_RET_FILE_PATTERN.format(species_name, top, rank_type) + output_ks_file = _OUTPUT_KS_REC_RET_FILE_PATTERN.format(species_name) + + output_dir = os.path.abspath(os.path.join(base_dir, _PARALOGS_OUTPUT_DIR_PATTERN.format(species_name))) + + output_mcl_path = os.path.join(output_dir, output_mcl_rec_ret_file) + + tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS_REC_RET.format(species_name)) + + # BASEFOLDERS for input and output + # Path to database and files to be used in the workflow + # TODO: this is temporarily because we will probably have to let the user first make this files + # themselves in a specific folder; alternatively we could provide this file already ready, but that + # would make the GitHub folder (too) heavy + basefolder_db_and_files = os.path.join("/", "home", "cesen", "ksrates_rec_ret", "ksrates", "reciprocal_retention") + + # determine which parts to run + do_rec_ret = True + do_ks = True + if os.path.exists(output_dir): + + # use reciprocally retained gene families for Ks analysis + if os.path.exists(output_mcl_path): + do_rec_ret = False + else: + logging.info("No reciprocally retained gene family data found, will generate them") + + # Check if Ks tmp folder of previous failed run exists (in this case the Ks data will be incomplete) + if os.path.exists(tmp_ks_paralogs): # if there is a Ks tmp folder from previous failed run + logging.error(f'tmp directory [{tmp_ks_paralogs}] was found: leftover from a failed earlier run?') + logging.error('Paralog Ks data may be incomplete. Please delete the tmp directory and rerun ' + 'the analysis. Exiting.') + sys.exit(1) + + if os.path.exists(os.path.join(output_dir, output_ks_file)): + if overwrite or do_rec_ret: + logging.warning(f'Paralog Ks file {output_ks_file} exists, will overwrite') + else: + logging.info(f'Paralog Ks data {output_ks_file} already exists, will skip wgd Ks analysis') + do_ks = False + else: + logging.info('No paralog Ks data, will run wgd Ks analysis') + + if not do_rec_ret and not do_ks: + logging.info('All reciprocally retained paralog data already exist, nothing to do') + logging.info('Done') + return + + # software checks + logging.info('---') + logging.info('Checking external software...') + software = [] + if do_rec_ret: + software += ['diamond'] + if do_ks: + software += [aligner, codeml] + if weighting_method == 'fasttree': + software += ['FastTree'] + elif weighting_method == 'phyml': + software += ['phyml'] + if can_i_run_software(software) == 1: + logging.error('Could not run all required external software. Exiting.') + sys.exit(1) + + # input checks + if not cds_fasta: + logging.error('No CDS fasta file provided. 
Exiting.') + sys.exit(1) + + if not os.path.exists(output_dir): + logging.info(f'Creating output directory {output_dir}') + os.makedirs(output_dir) + + # get reciprocally retained gene families (MCL-like format) + if do_rec_ret: + logging.info('---') + assigned_rec_ret_gfs_path = os.path.join(output_dir, "reciprocal_retention", rank_type, f"assigned_gfs_{species_name}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv") + if not os.path.exists(assigned_rec_ret_gfs_path): + logging.info(f'Running diamond homology search against reciprocally retained gene family database...') + assigned_rec_ret_gfs = recRet.extend_orthogroups(species_name, latin_name, cds_fasta, top, rank_type, pident_threshold, + min_qcov_threshold, filter_false_predictions, filter_zero_false_predictions, + base_dir=output_dir, n_threads=n_threads) + else: + assigned_rec_ret_gfs = pd.read_csv(assigned_rec_ret_gfs_path, sep="\t", header=None) + + if not os.path.exists(output_mcl_path): + logging.info(f'Organizing reciprocally retained gene families in mcl-like format for wgd Ks pipeline') + if not filter_false_predictions and not filter_zero_false_predictions: + top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv"), sep="\t", header=None).squeeze() + elif filter_false_predictions: + top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv"), sep="\t", header=None).squeeze() + elif filter_zero_false_predictions: + top_rec_ret_gf_list = pd.read_csv(os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv"), sep="\t", header=None).squeeze() + generate_mcl_rec_ret(top_rec_ret_gf_list, assigned_rec_ret_gfs, output_mcl_path) + + # whole paranome Ks analysis + if do_ks: + logging.info('---') + logging.info('Running reciprocally retained gene families Ks analysis...') + + # tmp directory + cw_dir = os.getcwd() + if not os.path.exists(tmp_ks_paralogs): + os.mkdir(tmp_ks_paralogs) + os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir + + logging.info(f'Translating CDS file {cds_fasta}...') + cds_sequences = read_fasta(os.path.abspath(cds_fasta)) + protein_sequences = translate_cds(cds_sequences) + + if not filter_false_predictions and not filter_zero_false_predictions: + top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv") + elif filter_false_predictions: + top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv") + elif filter_zero_false_predictions: + top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv") + + results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, + top=top, top_gfs_filtered_with_rank=top_gfs_filtered_with_rank, + tmp_dir=tmp_ks_paralogs, output_dir=output_dir, codeml_path=codeml, times=codeml_times, aligner=aligner, ignore_prefixes=False, min_length=min_msa_length, pairwise=pairwise, max_gene_family_size=max_gene_family_size, method=weighting_method, preserve=False, n_threads=n_threads) @@ -903,3 +1080,16 @@ def _write_anchor_pairs_ks(anchor_points_file, ks_file, out_file='ks_anchors.tsv if out_file: with open(out_file, "w+") as of: of.write(ks_anchors_weighted.to_csv(sep='\t')) + +def generate_mcl_rec_ret(gf_list, assigned_gfs_file, 
output_filename): + """ + For a new species, generate the mcl-like file for the gene families. + Each line of the file contains the genes belonging to a certain gene family, and the gene families + are sorted according to their reciprocally retention ranking (top 2000 gene families only are used). + """ + if not os.path.exists(output_filename): + mcl_like_output = [] + for gf in list(gf_list): + genes_assigned_to_current_gf = assigned_gfs_file.loc[assigned_gfs_file[1] == gf] + mcl_like_output.append(list(genes_assigned_to_current_gf.iloc[:, 0])) + pd.DataFrame.from_records(mcl_like_output).to_csv(output_filename, sep="\t", header=False, index=False) diff --git a/ksrates/generate_configfile.py b/ksrates/generate_configfile.py index c62f56bf7535351845c5884472387df95bc3348e..45433bac15d6896109edc4649bd529aba2c62a0c 100755 --- a/ksrates/generate_configfile.py +++ b/ksrates/generate_configfile.py @@ -32,6 +32,7 @@ def generate_configfile(configfile_name): Config.set("ANALYSIS SETTING", "paranome", "yes") Config.set("ANALYSIS SETTING", "collinearity", "no") + Config.set("ANALYSIS SETTING", "reciprocal_retention", "no") Config.set("ANALYSIS SETTING", "# analysis type for paralog data; allowed values: 'yes' and 'no'\n") Config.set("ANALYSIS SETTING", "gff_feature", "") diff --git a/ksrates/plot_paralogs.py b/ksrates/plot_paralogs.py index d6c8a82336980972469f78f3b2c5c71cc799c7d2..a57c8df044ce459670d48ae986b79fea72ba71d0 100755 --- a/ksrates/plot_paralogs.py +++ b/ksrates/plot_paralogs.py @@ -1,3 +1,4 @@ +import collections import pandas import sys import os @@ -9,7 +10,7 @@ import ksrates.fc_check_input as fcCheck import ksrates.fc_configfile as fcConf from ksrates.fc_rrt_correction import _ADJUSTMENT_TABLE -def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, anchors_ks_tsv_file): +def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, anchors_ks_tsv_file, rec_ret_tsv_file): # INPUT config = fcConf.Configuration(config_file) init_logging("Generating mixed paralog and ortholog distributions", config.get_logging_level()) @@ -21,8 +22,10 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an # Get analysis type paranome_analysis = config.get_paranome() colinearity_analysis = config.get_colinearity() - if not colinearity_analysis and not paranome_analysis: - logging.error('At least one of the "paranome" or "collinearity" parameters in the configuration file needs to be set to "yes".') + reciprocal_retention_analysis = config.get_reciprocal_retention() + + if not colinearity_analysis and not paranome_analysis and not reciprocal_retention_analysis: + logging.error('At least one of the "paranome" or "collinearity" or "reciprocal_retention" parameters in the configuration file needs to be set to "yes".') logging.error("Exiting.") sys.exit(1) @@ -39,7 +42,13 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an anchors_ks_tsv_file = fcCheck.get_argument_path(anchors_ks_tsv_file, default_path_anchors_tsv_file, "Anchor pair Ks TSV file") if anchors_ks_tsv_file == "": logging.error(f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}].") - if paralog_tsv_file == "" or anchors_ks_tsv_file == "": + if reciprocal_retention_analysis: + default_path_rec_ret_tsv_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_rec_ret.tsv") + rec_ret_tsv_file = fcCheck.get_argument_path(rec_ret_tsv_file, default_path_rec_ret_tsv_file, "Reciprocally retained 
paralog Ks TSV file")
+        if rec_ret_tsv_file == "":
+            logging.error(f"Reciprocally retained paralog Ks TSV file not found at default position [{default_path_rec_ret_tsv_file}].")
+
+    if paralog_tsv_file == "" or anchors_ks_tsv_file == "" or rec_ret_tsv_file == "":
         logging.error("Exiting")
         sys.exit(1)
 
@@ -83,27 +92,44 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an
     plot_correction_arrows = config.plot_correction_arrows()
     peak_stats = config.get_peak_stats() # default is mode (other option, median)
+    top = config.get_reciprocal_retention_top()
+    rank_type = config.get_reciprocal_retention_rank_type()
 
     # -----------------------------------------------------------------------------
 
     # GENERATING MIXED DISTRIBUTION PLOT
     logging.info("")
-    analysis_type = 'both'
-    if not colinearity_analysis:
-        analysis_type = 'paranome'
+    # Note: each two-way branch also excludes the third analysis type, so that the
+    # final "else" is reached exactly when all three analyses are enabled
+    if not colinearity_analysis and not reciprocal_retention_analysis:
         logging.info(f"Plotting paranome Ks distribution for species [{species}]")
-    elif not paranome_analysis:
-        analysis_type = 'colinearity'
+    elif not paranome_analysis and not reciprocal_retention_analysis:
         logging.info(f"Plotting anchor pair Ks distribution for species [{species}]")
-    else:
+    elif not paranome_analysis and not colinearity_analysis:
+        logging.info(f"Plotting reciprocally retained paralog Ks distribution for species [{species}]")
+    elif paranome_analysis and colinearity_analysis and not reciprocal_retention_analysis:
         logging.info(f"Plotting paranome and anchor pairs Ks distributions for species [{species}]")
+    elif paranome_analysis and reciprocal_retention_analysis and not colinearity_analysis:
+        logging.info(f"Plotting paranome and reciprocally retained paralog Ks distributions for species [{species}]")
+    elif colinearity_analysis and reciprocal_retention_analysis and not paranome_analysis:
+        logging.info(f"Plotting anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]")
+    else:
+        logging.info(f"Plotting paranome, anchor pairs and reciprocally retained paralog Ks distributions for species [{species}]")
 
     # PLOTTING THE BACKGROUND PARALOG AND/OR ANCHOR DISTRIBUTION
     fig_uncorr, ax_uncorr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim,
-                                                              "un-corrected", correction_table_available, plot_correction_arrows,
-                                                              paranome_data=paranome_analysis, colinearity_data=colinearity_analysis)
+                                        "un-corrected", correction_table_available, plot_correction_arrows,
+                                        paranome_data=paranome_analysis, colinearity_data=colinearity_analysis,
+                                        reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type)
+
     fig_corr, ax_corr = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim, y_lim,
-                                                          "corrected", correction_table_available, plot_correction_arrows,
-                                                          paranome_data=paranome_analysis, colinearity_data=colinearity_analysis)
+                                        "corrected", correction_table_available, plot_correction_arrows,
+                                        paranome_data=paranome_analysis, colinearity_data=colinearity_analysis,
+                                        reciprocal_retention_data=reciprocal_retention_analysis, top=top, rank_type=rank_type)
 
     if paranome_analysis:
         paranome_list, paranome_weights = fc_extract_ks_list.ks_list_from_tsv(paralog_tsv_file, 
max_ks_para, "paralogs") @@ -123,14 +149,27 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an fcPlot.plot_histogram("Anchor pairs", ax_corr, anchors_list, bin_list, bin_width_para, max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_ANCHOR_HISTOGRAM, weight_list=anchors_weights) + if reciprocal_retention_analysis: + rec_ret_list, rec_ret_weights = fc_extract_ks_list.ks_list_from_tsv(rec_ret_tsv_file, max_ks_para, "reciprocally retained") + hist_rec_ret = fcPlot.plot_histogram("Reciprocally retained paralogs", ax_uncorr, rec_ret_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, weight_list=rec_ret_weights) + fcPlot.plot_histogram("Reciprocally retained paralogs", ax_corr, rec_ret_list, bin_list, bin_width_para, + max_ks_para, kde_bandwidth_modifier, color=fcPlot.COLOR_REC_RET_HISTOGRAM, weight_list=rec_ret_weights) + # Setting plot height based on tallest histogram (if paranome analysis is on, it will come from that distribution) if y_lim is None: if paranome_analysis: fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome) fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome) - else: + elif colinearity_analysis and not reciprocal_retention_analysis: fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors) fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors) + elif not colinearity_analysis and reciprocal_retention_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_rec_ret) + elif colinearity_analysis and reciprocal_retention_analysis: + fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors, hist_rec_ret) + fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors, hist_rec_ret) # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution if correction_table_available: @@ -140,7 +179,7 @@ def plot_paralogs_distr(config_file, correction_table_file, paralog_tsv_file, an logging.info("") logging.info(f"Saving PDF figures of mixed plots [{fcPlot._MIXED_ADJUSTED_PLOT_FILENAME.format(species)}, {fcPlot._MIXED_UNADJUSTED_PLOT_FILENAME.format(species)}]") fcPlot.save_mixed_plot(fig_corr, fig_uncorr, ax_corr, ax_uncorr, species, correction_table_available, paranome=paranome_analysis, - colinearity=colinearity_analysis) + colinearity=colinearity_analysis, reciprocal_retention=reciprocal_retention_analysis) logging.info("") logging.info("All done") diff --git a/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py b/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py new file mode 100755 index 0000000000000000000000000000000000000000..e33474e252c9baaa3f1a31bb40cf3c02f7d82b47 --- /dev/null +++ b/ksrates/reciprocal_retention/compute_ks_from_ortho_gfs.py @@ -0,0 +1,283 @@ +from wgd_ksrates.utils import read_fasta +from wgd_ksrates.ks_distribution import ks_analysis_paranome +import os +import logging +import sys +import subprocess +import tempfile +import math +import datetime + +logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) + +_OUTPUT_KS_FILE_PATTERN = '{}_ortho_gf_{}_{}_rank.ks.tsv' +_PARALOGS_OUTPUT_DIR_PATTERN = 'wgd_{}' +_TMP_KS = '{}_ortho_gf.ks_tmp' + + +def ks_paralogs(species_name, cds_fasta, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, base_dir='.', eval_cutoff=1e-10, + inflation_factor=2.0, aligner='muscle', min_msa_length=100, codeml='codeml', codeml_times=1, pairwise=False, + max_gene_family_size=200, weighting_method='fasttree', n_threads=4, overwrite=False): + 
""" + Among the edits, this function temporarily takes also the path and filename to the output_ks_file, in order to + provide a different name when top 2000 or a subset of it are used. + """ + + # input checks + if not species_name: + logging.error('No species name provided. Exiting.') + sys.exit(1) + + # TMP removed in order to use a custom name dependent on whether a subset of the top2000 is used or all of the top 2000 are used + # output_ks_file = _OUTPUT_KS_FILE_PATTERN.format(f"{species_name.lower()}", f"top{top}", rank_type) + output_ks_file = os.path.basename(ks_tsv_path) + + output_dir = os.path.abspath(os.path.join(base_dir, _PARALOGS_OUTPUT_DIR_PATTERN.format(species_name.lower()))) + + tmp_ks_paralogs = os.path.join(output_dir, _TMP_KS.format(f"{species_name.lower()}_{rank_type}")) + + # determine which parts to run + do_ks = True + if os.path.exists(output_dir): + + # REMOVED THE BLAST AND MCL STEPS + + # Check if Ks tmp folder of previous failed run exists (in this case the Ks data will be incomplete) + if os.path.exists(tmp_ks_paralogs): # if there is a Ks tmp folder from previous failed run + logging.error(f'tmp directory [{tmp_ks_paralogs}] was found: leftover from a failed earlier run?') + logging.error('Paralog Ks data may be incomplete. Please delete the tmp directory and rerun ' + 'the analysis. Exiting.') + sys.exit(1) + + if os.path.exists(os.path.join(output_dir, output_ks_file)): + if overwrite: + logging.warning(f'Paralog Ks file {output_ks_file} exists, will overwrite') + else: + logging.info(f'Paralog Ks data {output_ks_file} already exists, will skip wgd Ks analysis') + do_ks = False + else: + logging.info('No paralog Ks data, will run wgd Ks analysis') + + if not do_ks: + logging.info('All paralog data already exist, nothing to do') + logging.info('Done') + return + + # software checks + logging.info('---') + logging.info('Checking external software...') + software = [] + if do_ks: + software += [aligner, codeml] + if weighting_method == 'fasttree': + software += ['FastTree'] + elif weighting_method == 'phyml': + software += ['phyml'] + if can_i_run_software(software) == 1: + logging.error('Could not run all required external software. Exiting.') + sys.exit(1) + + # input checks + if not cds_fasta: + logging.error('No CDS fasta file provided. 
Exiting.') + sys.exit(1) + + if not os.path.exists(output_dir): + logging.info(f'Creating output directory {output_dir}') + os.makedirs(output_dir) + + # read and translate CDS file + cds_sequences = None + protein_sequences = None + if do_ks: + logging.info(f'Translating CDS file {cds_fasta}...') + cds_sequences = read_fasta(os.path.abspath(cds_fasta)) + protein_sequences = translate_cds(cds_sequences) + + # REMOVED BLAST and MCL STEPS + + # whole paranome Ks analysis + if do_ks: + logging.info('---') + logging.info('Running whole paranome Ks analysis...') + + # tmp directory + cw_dir = os.getcwd() + if not os.path.exists(tmp_ks_paralogs): + os.mkdir(tmp_ks_paralogs) + os.chdir(tmp_ks_paralogs) # change dir to tmp dir, as codeml writes non-unique file names to the working dir + + test_run = False + if test_run: + output_mcl_path = os.path.join(output_dir, f"{species_name.lower()}_ortho_gf_top5000_{rank_type}_rank.mcl.tsv") + else: + output_mcl_path = os.path.join(output_dir, f"{species_name.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv") + + results_df = ks_analysis_paranome(cds_sequences, protein_sequences, output_mcl_path, top, top_gfs_filtered_with_rank, + tmp_ks_paralogs, output_dir, codeml, times=codeml_times, aligner=aligner, + ignore_prefixes=False, min_length=min_msa_length, pairwise=pairwise, + max_gene_family_size=max_gene_family_size, method=weighting_method, + preserve=False, n_threads=n_threads) + if os.path.exists(tmp_ks_paralogs): + logging.error(f'tmp directory {tmp_ks_paralogs} still exists after analysis, ' + f'paralog Ks data may be incomplete and will not be written to paralog Ks file!') + logging.error('Please delete the tmp directory and rerun the analysis. Exiting.') + sys.exit(1) + if isinstance(results_df, type(None)) or results_df.empty: + logging.warning('No paralog Ks data computed, will write empty paralog Ks file!') + with open(os.path.join(output_dir, output_ks_file), 'w+') as o: + o.write(results_df.round(5).to_csv(sep='\t')) + # change back to current directory as tmp dir got deleted and subsequent os.getcwd() may fail + os.chdir(cw_dir) + + logging.info('---') + logging.info('Done') + + + +def can_i_run_software(software): + """ + Copied and modified from wgd.utils + + Test if external software is executable + :param software: list or string of executable(s) + :return: 1 (failure) or 0 (success) + """ + if type(software) == str: + software = [software] + ex = 0 + for s in software: + # codeml needs input otherwise it prompts the user for input, so a dummy + # file is created + if s == 'codeml': + tmp_file = "codeml.ctl" + command = ['codeml', tmp_file] + elif s == 'prank': + command = [s, '--help'] + elif s == 'FastTree': + command = s + elif s in ['blastp', 'makeblastdb', 'blast', 'muscle', 'i-adhore']: + command = [s, '-version'] + else: + command = [s, '--version'] + try: + # logging.info(command) + if s == "codeml": + # Since the Nextflow pipeline processes multiple wgd runs at the same time, + # let's generate for each run the dummy "codeml.ctl" files within a different + # temporary directory named with a unique ID. This way, the several parallel + # wgd runs will not interfere with each other when creating and removing + # the dummy codeml.ctl file. 
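+            # (sketch of the check: codeml parses the dummy control file, fails fast and
+            # exits; reaching this point without a FileNotFoundError proves it can be run)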
+ with tempfile.TemporaryDirectory(dir = ".") as tmp_dir: + with open(os.path.join(tmp_dir, tmp_file), 'w') as o: # write the codeml.ctl file in it + o.write('seqfile = test') + sp = subprocess.run(command, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # tmp_dir is removed both if the subprocess succeeds or fails, thanks to the "with" statement + else: + sp = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + out = sp.stdout.decode("utf-8").strip() + err = sp.stderr.decode("utf-8").strip() + if out: + logging.info(out.splitlines()[0]) + elif err: + logging.info(err.splitlines()[0]) + + except FileNotFoundError: + logging.error('{} executable not found!'.format(s)) + ex = 1 + return ex + + +def translate_cds(sequence_dict, skip_invalid=False): + """ + Copied and modified from wgd.utils + + Just another CDS to protein translater. Will give warnings when in-frame + stop codons are found, invalid codons are found, or when the sequence length + is not a multiple of three. Will translate the full sequence or until an + unspecified or in-frame codon is encountered. + :param sequence_dict: dictionary with gene IDs and CDS sequences + :param skip_invalid: bool, skip invalid CDS? (default translates to first + stop codon or end) + :return: dictionary with gene IDs and proteins sequences + """ + # TODO I should just use the Biopython translator + aa_dict = { + 'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', + 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', + 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', + 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', + 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', + 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', + 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', + 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', + 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', + 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', + 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', + 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', + 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', + 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', + 'TAC': 'Y', 'TAT': 'Y', 'TAA': '', 'TAG': '', + 'TGC': 'C', 'TGT': 'C', 'TGA': '', 'TGG': 'W', + } + protein_dict = {} + + j = 0 + total = 0 + for key, val in sequence_dict.items(): + j += 1 + aa_seq = '' + if len(val) % 3 != 0: + logging.warning('Sequence length != multiple of 3 for {}!'.format(key)) + total += 1 + invalid = False + for i in range(0, len(val), 3): + if val[i:i + 3] not in aa_dict.keys(): + logging.warning('Invalid codon {0:>3} in {1}'.format(val[i:i+3], key)) + invalid = True + total += 1 + break + else: + if aa_dict[val[i:i + 3]] == '' and i+3 != len(val): + logging.warning('In-frame STOP codon in {0} at position {1}:{2}'.format(key, i, i+3)) + invalid = True + total += 1 + break + aa_seq += aa_dict[val[i:i + 3]] + if invalid and skip_invalid: + continue + protein_dict[key] = aa_seq + + if total: + logging.warning("There were {} warnings during translation".format(total)) + return protein_dict + +def init_logging(log_heading, logging_level): + """ + Initialize the logging environment and print a header to the log. 
+ + :param log_heading: heading for the log + :return: nothing + """ + logging.basicConfig(format='%(levelname)s\t%(message)s', level=logging_level, stream=sys.stdout) + length = math.ceil((len(log_heading)/2)) + logging.info('- ' * length) + logging.info(log_heading) + logging.info(datetime.datetime.today().ctime()) + logging.info('- ' * length) + + + + +def compute_ks_from_ortho_gfs(species, species_fasta_file, top, top_gfs_filtered_with_rank, \ + rank_type, ks_tsv_path, max_gene_family_size, n_threads, paralog_dists_dir): + """ + Function to be called in the main Python script. + """ + logging.info(f" Estimate Ks from the top {top} GFs ({rank_type} ranking)") + + if not os.path.exists(ks_tsv_path): + + ks_paralogs(species, species_fasta_file, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, max_gene_family_size=max_gene_family_size, \ + base_dir=paralog_dists_dir, n_threads=n_threads) diff --git a/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py b/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py new file mode 100755 index 0000000000000000000000000000000000000000..70a89f6ee3f339b034fa6ebd11585f738c344b5f --- /dev/null +++ b/ksrates/reciprocal_retention/construct_ks_plot_ortho_gfs.py @@ -0,0 +1,117 @@ +from pandas import read_csv +from numpy import float64, arange +import matplotlib.pyplot as plt +import seaborn +import logging +import os +import sys +from numpy import zeros +import matplotlib +# Use the Agg (Anti-Grain Geometry) backend to avoid needing a graphical user interface (X11 backend) +matplotlib.use('Agg') + +logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) + +plt.style.use(os.path.join(f"{os.path.dirname(os.path.abspath(__file__))}", "ks.mplstyle")) + +def ks_list_from_tsv(tsv_file, max_ks): + """ + """ + with open(tsv_file, "r") as tsv_file: + tsv = read_csv(tsv_file, sep="\t") + + filtered_tsv = tsv.loc[(tsv["Ks"].dtypes == float64) & (tsv["Ks"] <= max_ks)] + + ks_list_filtered = filtered_tsv["Ks"].to_list() + weight_list_filtered = filtered_tsv["WeightOutliersExcluded"].to_list() + + return ks_list_filtered, weight_list_filtered + + +def generate_ks_plot(id, ids_names, top, rank_type, ks_list, weight_list, max_ks, bin_width, y_axis_lim, path_to_output_folder, ks_plot_filename): + """ + :param id: species id + :param ids_names: dictionary with scientific name associated to id + """ + # ks_per_species = read_csv(open(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_{id}_top{top}_{rank_type}.tsv"), "r"), sep="\t", header=None, index_col=0) + # weights_per_species = read_csv(open(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"weights_{id}_top{top}_{rank_type}.tsv"), "r"), sep="\t", header=None, index_col=0) + + # ks_list = ks_per_species.loc[id].tolist() + # weight_list = weights_per_species.loc[id].tolist() + fig, axis = generate_mixed_plot_figure(ids_names[id], max_ks, y_axis_lim, top, "corrected", False, False, top, rank_type, colinearity_data=False) + + bin_list = arange(0.0, max_ks + bin_width, bin_width) + hist = axis.hist(ks_list, bins=bin_list, weights=weight_list, histtype='stepfilled', color="0.4", + label="Reciprocally retained genes", density=False) + axis.legend() + fig.savefig(os.path.join(path_to_output_folder, f"{rank_type}_rank", ks_plot_filename), format="pdf") + plt.close(fig) + + +def generate_mixed_plot_figure(species, x_max_lim, y_max_lim, top, corrected_or_not, correction_table_available, + plot_correction_arrows, top_gf, rank_type, paranome_data=True, 
colinearity_data=True):
+    """
+    Creates the figure and the axis for the Ks distribution plot of the reciprocally
+    retained gene families; the subplot title reports the number of top GFs and the
+    rank type that were used.
+    """
+    species_escape_whitespaces = species.replace(' ', '\\ ')
+
+    # Increase figure size in height if extra space for the correction arrows is required and
+    # any divergence line will be actually plotted (thus, the "and" condition)
+    if correction_table_available:
+        if plot_correction_arrows:
+            # make the figure a bit taller to make room for the arrows
+            fig, ax = plt.subplots(1, 1, figsize=(14.0, 7.6))
+        else:
+            fig, ax = plt.subplots(1, 1, figsize=(14.0, 7.0))
+    else:
+        # if not correction_table_available a simpler, less wide layout with
+        # the legend inside the plot and no right margin is used
+        fig, ax = plt.subplots(1, 1, figsize=(10.0, 7.0))
+
+    fig.suptitle(r"$K_\mathregular{S}$" + f" distribution for ${species_escape_whitespaces}$")
+
+    ax.set_title(f"Top {top} GFs from {rank_type} ranking")
+
+    seaborn.despine(offset=10)
+    ax.set_xlabel(r"$K_\mathregular{S}$")
+
+    ax.set_ylabel("Number of retained duplicates (weighted)")
+
+    ax.set_xlim(0, x_max_lim)
+    # Set a fixed y-axis limit only when a numeric limit was provided
+    if isinstance(y_max_lim, float):
+        ax.set_ylim(0, y_max_lim)
+    plt.setp(ax.yaxis.get_majorticklabels(), rotation=90, verticalalignment='center')
+    if not correction_table_available:
+        # if not correction_table_available tighten the layout
+        # to reduce clipping
+        plt.tight_layout()
+
+    return fig, ax
+
+
+# -----------------------------------------------------------------------------
+
+
+
+def construct_ks_plot_ortho_gfs(id, top, rank_type, ks_plot_filename, max_ks, bin_width, y_axis_lim, ids_names):
+    """
+    Builds the weighted Ks distribution plot for the top reciprocally retained GFs
+    of the given species and saves it as a PDF file, unless it already exists.
+    """
+    logging.info(f"  Construct Ks distribution for the top {top} GFs ({rank_type} ranking)")
+
+    path_to_output_folder = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "ks_distributions", id.lower(), "weighted_with_ortho_GFs")
+    if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank")):
+        os.makedirs(os.path.join(path_to_output_folder, f"{rank_type}_rank"))
+
+    if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", ks_plot_filename)):
+        # Extract the Ks estimates from the TSV file
+        ks_tsv = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", f"wgd_{id.lower()}", f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.ks.tsv")
+        # if not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_{id}_top{top}_{rank_type}.tsv")) or \
+        # not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"weights_{id}_top{top}_{rank_type}.tsv")) or \
+        # not os.path.exists(os.path.join(path_to_output_folder, f"{rank_type}_rank", f"ks_tsv_{id}_top{top}_{rank_type}.tsv")):
+        ks_list, weight_list = ks_list_from_tsv(ks_tsv, max_ks)
+
+        # Build Ks plot
+        logging.info("  Plotting Ks distribution")
+        generate_ks_plot(id, ids_names, top, rank_type, ks_list, weight_list, max_ks, bin_width, y_axis_lim, path_to_output_folder, ks_plot_filename)
+    else:
+        logging.info("  Ks plot already present")
\ No newline at end of file
diff --git a/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py b/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py
new file mode 100755
index 0000000000000000000000000000000000000000..e8242bc22421da0e214c8acd6b6d886260c0dbac
--- /dev/null
+++ b/ksrates/reciprocal_retention/get_top5000_gfs_mcl_format.py
@@ -0,0 +1,118 @@
+import logging
+from pandas import read_excel, read_csv, concat, DataFrame
+import os
+import sys
+import matplotlib
+# Use the Agg (Anti-Grain Geometry) backend to avoid needing a graphical user interface (X11 backend)
+matplotlib.use('Agg') + +logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout) + +def get_top_gf_genes(top, rank_type, path_for_top_gf_tsv): + """ + :param top: number of top GFs considered for building the Ks distribution + """ + + # Get the ranked gene families + gf_rank = read_excel(open("gene_families.xlsx", "rb"), "gene family rankings", header=None) + + # Get the genes associated to all gene families + gf_genes_all_raw = read_excel(open("gene_families.xlsx", "rb"), "orthogroup member genes", header=None) + + # Expand each gene in the GFs as an independent column entry + gf_genes_all = gf_genes_all_raw.iloc[:,:2] # First two columns (GF ID and number of species) + genes_columns = gf_genes_all_raw.iloc[:,2].str.split(",", expand=True) # Process third column (genes) + gf_genes_all = concat([gf_genes_all, genes_columns], axis=1, ignore_index=True) + + if rank_type == "combined": + # Get the top Gfs from the COMBINED RANKING (lambda & blocks percentage) + gf_top = gf_rank.iloc[:top,1] + elif rank_type == "lambda": + # Get the top GFs from the LAMBDA-BASED RANKING + gf_top = gf_rank.loc[gf_rank.iloc[:, 8] <= top,].sort_values(axis=0, by=[8]).iloc[:,1] + + # Get the genes associated to the top GFs in both rank types + gf_genes_top = DataFrame() + for ortho_gf in gf_top.values: + gf_genes_top = gf_genes_top.append(gf_genes_all.loc[gf_genes_all.iloc[:,0] == ortho_gf.strip()], ignore_index=True) + + # Save the top GF together with their genes in a TSV file + with open (os.path.join(path_for_top_gf_tsv, f"gf_top{top}_{rank_type}_rank_conserved_order.tsv"), "w+") as f: + f.write(gf_genes_top.to_csv(sep="\t", header=False, index=False)) + return gf_genes_top + + +def top_gfs_in_mcl_format(top, id, gf_genes_top, rank_type, path_to_output_folder, mcl_like_filename): + """ + """ + ortho_gf_gene_lists = [] + gf_top_per_species = {} + gf_top_per_species[id] = DataFrame() + + for i in range(top): + # Move to next GF if the current GF contains no genes for the current species + if len(gf_genes_top.iloc[i][gf_genes_top.iloc[i].str.contains(id).fillna(False)]) == 0: + ortho_gf_gene_lists.append("") + continue + + # Get the genes associated to the current top GF + gene_list = gf_genes_top.iloc[i][gf_genes_top.iloc[i].str.contains(id).fillna(False)].reset_index(drop=True).str.split("|", expand=True)[1].dropna().tolist() + ortho_gf_gene_lists.append(gene_list) + + # Write the genes in a file, one GF per row (MCL format) + # ortho_gf_gene_lists.sort(key=len, reverse=True) + with open(os.path.join(path_to_output_folder, mcl_like_filename), "w+") as f: + for genes_in_current_family in ortho_gf_gene_lists: + f.write('\t'.join(genes_in_current_family) + "\n") + + + +def get_top5000_gfs_mcl_format(id, top=5000): + """ + Function called by the main Python script + """ + + path_for_top_gf_tsv = ".." 
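+    # NOTE: paths are relative to the working directory: the ranked-GF TSV files are
+    # read from and written to the parent folder (see get_top_gf_genes above)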
+    path_to_output_folder = os.path.join("..", "paralog_distributions", f"wgd_{id.lower()}")
+    if not os.path.exists(path_to_output_folder):
+        os.makedirs(path_to_output_folder)
+
+    logging.info(f"Maximum number of gene families considered: top {top} GFs")
+
+    for rank_type in ["combined", "lambda"]:
+        logging.info(f"- From {rank_type} ranking...")
+
+        # Get the top GFs (ordered as in the current ranking)
+        logging.info(f"  Getting the top {top} GFs")
+        if not os.path.exists(os.path.join("..", f"gf_top{top}_{rank_type}_rank_conserved_order.tsv")):
+            gf_genes_top = get_top_gf_genes(top, rank_type, path_for_top_gf_tsv)
+        else:
+            logging.info(f"- File gf_top{top}_{rank_type}_rank_conserved_order.tsv already exists")
+            gf_genes_top = read_csv(open(os.path.join("..", f"gf_top{top}_{rank_type}_rank_conserved_order.tsv"), "r"), sep="\t", header=None)
+
+        logging.info("")
+
+        # Generate MCL-like file with ORTHO GFs
+        logging.info(f"  Generating the MCL-like file with top {top} ORTHO GFs")
+        mcl_like_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv"
+        if not os.path.exists(os.path.join(path_to_output_folder, mcl_like_filename)):
+            top_gfs_in_mcl_format(top, id, gf_genes_top, rank_type, path_to_output_folder, mcl_like_filename)
+        else:
+            logging.info("  (MCL-like file for ORTHO GFs already exists)")
+
+        logging.info("")
+
+
+def generate_mcl_like_file_for_new_species(id, top, rank_type, gf_list, assigned_gfs_file, mcl_like_filename):
+    """
+    For the current new species, generate the mcl-like file for the gene families.
+    Each line of the file contains the genes belonging to a certain gene family, and the gene families
+    are sorted according to their reciprocal retention ranking (only the top 2000 gene families are used).
+    """
+    if not os.path.exists(mcl_like_filename):
+        mcl_like_output = []
+        for gf in list(gf_list):
+            genes_assigned_to_current_gf = assigned_gfs_file.loc[assigned_gfs_file[1] == gf]
+            mcl_like_output.append(list(genes_assigned_to_current_gf.iloc[:, 0]))
+        DataFrame.from_records(mcl_like_output).to_csv(mcl_like_filename, sep="\t", header=False, index=False)
diff --git a/ksrates/reciprocal_retention/ks.mplstyle b/ksrates/reciprocal_retention/ks.mplstyle
new file mode 100755
index 0000000000000000000000000000000000000000..9613ea1cbf593326293b40aaf4568234f7a3e23f
--- /dev/null
+++ b/ksrates/reciprocal_retention/ks.mplstyle
@@ -0,0 +1,18 @@
+axes.titlesize : 16
+axes.labelsize : 19
+axes.linewidth : 1.2
+axes.labelpad : 14
+xtick.labelsize : 16
+xtick.major.size : 8
+xtick.major.width : 1.2
+xtick.major.pad : 8
+ytick.labelsize : 16
+ytick.major.size : 8
+ytick.major.width : 1.2
+ytick.major.pad : 8
+legend.frameon : False
+legend.fontsize : 14
+figure.subplot.bottom : 0.15
+figure.subplot.left : 0.09
+figure.titlesize : 20
+savefig.pad_inches : 0.15
\ No newline at end of file
diff --git a/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py b/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py
new file mode 100755
index 0000000000000000000000000000000000000000..a77ef9cee3d32ac3f530ebbe79d38c42ea65f2bc
--- /dev/null
+++ b/ksrates/reciprocal_retention/rec_retained_ks_plot_pipeline.py
@@ -0,0 +1,149 @@
+from get_top5000_gfs_mcl_format import get_top5000_gfs_mcl_format, generate_mcl_like_file_for_new_species
+from compute_ks_from_ortho_gfs import compute_ks_from_ortho_gfs
+from construct_ks_plot_ortho_gfs import construct_ks_plot_ortho_gfs
+import sys
+import os
+import logging
+from pandas import read_csv
+
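+# Standalone pipeline script. Assumed usage (based on the sys.argv parsing below,
+# when test_run is False):
+#
+#     python rec_retained_ks_plot_pipeline.py <species_ID>
+#
+# where <species_ID> is one of the IDs listed in all_species_ids (e.g. "CMFF")
+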
+logging.basicConfig(format='%(levelname)s\t%(message)s', level="INFO", stream=sys.stdout)
+
+logging.info("-----------------------------------------------")
+logging.info("Ks distributions with reciprocally retained GFs")
+logging.info("-----------------------------------------------")
+
+test_run = False
+
+if test_run:
+    # Species IDs
+    all_species_ids = ['Alyr', 'Atha', 'Atri', 'Brap', 'Cari', 'Ccaj', 'Clan', 'Cmel', \
+        'Cpap', 'Crub', 'Csat', 'Fves', 'Gmax', 'Grai', 'Hvul', 'Jcur', 'Ljap', 'Lusi', \
+        'Macu', 'Mesc', 'Mtru', 'Osat', 'Pbre', 'Pdac', 'Pmum', 'Pper', 'Ptri', 'Rcom', \
+        'Sbic', 'Sita', 'Slyc', 'Stub', 'Tcac', 'Tpar', 'Vvin', 'Zmay']
+
+    ids_names = {'Alyr':'Arabidopsis lyrata', 'Atha': 'Arabidopsis thaliana', 'Atri':'Amborella trichopoda', \
+        'Bdis':'Brachypodium distachyon', 'Brap':'Brassica rapa', 'Cari':'Cicer arietinum', 'Ccaj':'Cajanus cajan', \
+        'Clan':'Citrullus lanatus', 'Cpap':'Carica papaya', 'Crub':'Capsella rubella', 'Csat':'Cucumis sativus', \
+        'Fves': 'Fragaria vesca', 'Gmax':'Glycine max', 'Grai':'Gossypium raimondii', 'Jcur':'Jatropha curcas', \
+        'Ljap':'Lotus japonicus', 'Lusi':'Linum usitatissimum', 'Macu':'Musa acuminata', 'Mesc': 'Manihot esculenta', \
+        'Mtru':'Medicago truncatula', 'Osat': 'Oryza sativa', 'Pbre':'Pyrus bretschneideri', 'Pdac':'Phoenix dactylifera', \
+        'Pmum':'Prunus mume', 'Pper':'Prunus persica', 'Ptri':'Populus trichocarpa', 'Rcom':'Ricinus communis', \
+        'Sbic':'Sorghum bicolor', 'Sita':'Setaria italica', 'Slyc':'Solanum lycopersicum', 'Stub':'Solanum tuberosum', \
+        'Tcac':'Theobroma cacao', 'Tpar':'Thellungiella parvula', 'Vvin': 'Vitis vinifera', 'Zmay':'Zea mays'}
+
+    id_list = ["Atha", "Pdac", "Vvin"]
+
+else:
+    all_species_ids = ["CMFF", "GEHT", "HJMP", "JETM", "JTQQ", "KEGA", "KNMB", "KZED", "MYMP", \
+        "NXOH", "OHAE", "OQHZ", "PEZP", "PJSX", "QZXQ", "RKFX", "RKLL", "RMWJ", \
+        "SLYR", "SUAK", "TTRG", "TVSH", "VHZV", "VLNB", "XOOE", "ZCDJ"]
+
+    ids_names = {
+        "CMFF" : "Lupinus polyphyllus", "GEHT" : "Gleditsia triacanthos", \
+        "HJMP" : "Astragalus membranaceus", "JETM" : "Bauhinia tomentosa", "JTQQ" : "Glycyrrhiza lepidota", \
+        "KEGA" : "Glycine soja", "KNMB" : "Lathyrus sativus", "KZED" : "Senna hebecarpa", \
+        "MYMP" : "Astragalus propinquus", "NXOH" : "Apios americana", "OHAE" : "Polygala lutea", \
+        "OQHZ" : "Quillaja saponaria", "PEZP" : "Glycyrrhiza glabra", "PJSX" : "Acacia pycnantha", \
+        "QZXQ" : "Gymnocladus dioicus", "RKFX" : "Cercis canadensis", "RKLL" : "Copaifera officinalis", \
+        "RMWJ" : "Wisteria floribunda", "SLYR" : "Cladrastis lutea", "SUAK" : "Codariocalyx motorius", \
+        "TTRG" : "Lupinus angustifolius", "TVSH" : "Bituminaria bituminosa", "VHZV" : "Gleditsia sinensis", \
+        "VLNB" : "Gompholobium polymorphum", "XOOE" : "Desmanthus illinoensis", "ZCDJ" : "Acacia argyrophylla"}
+
+    id_list = [sys.argv[1].upper()]
+
+# ANALYSIS SETTINGS
+
+top = 2000
+rank_type = "lambda"
+
+filter_by_false_predictions = False
+filter_by_zero_false_predictions = False
+
+basefolder_sequence_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "sequences")
+basefolder_db_and_files = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "databases_and_gf_files")
+
+pident_threshold = 40
+
+# PARAMETERS
+y_axis_lim = 300.0
+n_threads = 8
+
+max_ks = 5
+bin_width = 0.1
+max_gene_family_size = 200
+paralog_dists_dir = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", "")
+
+for id in id_list:
+    logging.info(f"Analyzing species: {ids_names[id]}\n")
+
+    basefolder_output_files = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "fabales_1kp", id, rank_type)
+
+    path_to_paralogs_data = os.path.join("/", "home", "cesen", "wgm_predictor_midas", "paralog_distributions", f"wgd_{id.lower()}")
+    if not os.path.exists(path_to_paralogs_data):
+        os.makedirs(path_to_paralogs_data)
+
+    # Table assigning genes (first column) to GFs (second column); read it up front
+    # because all three branches below need it later on
+    assigned_gfs_file = read_csv(os.path.join(basefolder_output_files, f"assigned_gfs_{id}_{pident_threshold}%_best_diamond_hit_with_rbh.tsv"), sep="\t", header=None)
+
+    if not filter_by_false_predictions and not filter_by_zero_false_predictions:
+        logging.info(f"Analyzing the top {top} GFs ({rank_type} rank):")
+        top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}.tsv")
+
+    elif filter_by_false_predictions:
+        logging.info(f"Analyzing the top {top} GFs ({rank_type} rank) after filtering away those with false predictions >= half of the true positives")
+        top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_false_predictions.tsv")
+
+    elif filter_by_zero_false_predictions:
+        logging.info(f"Analyzing the top {top} GFs ({rank_type} rank) after filtering away those with at least one false prediction")
+        top_gfs_filtered_with_rank = os.path.join(basefolder_db_and_files, f"gf_list_top_{top}_{rank_type}_filtered_zero_false_predictions.tsv")
+
+    gf_list = read_csv(top_gfs_filtered_with_rank, sep="\t", header=None).squeeze()
+
+    # GENERATE OR OPEN MCL-LIKE FILE
+    logging.info("Generating MCL-like output per gene family...")
+    if test_run:
+        # Get the top 5000 GFs (maximum number of GFs considered)
+        get_top5000_gfs_mcl_format(id)
+    else:
+        mcl_like_filename = os.path.join(path_to_paralogs_data, f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.mcl.tsv")
+        generate_mcl_like_file_for_new_species(id, top, rank_type, gf_list, assigned_gfs_file, mcl_like_filename)
+
+    logging.info("")
+    logging.info(f"- From {rank_type} ranking...")
+
+    # Estimate Ks from the top GFs (current rank type)
+    if test_run:
+        species_fasta_file = os.path.join(basefolder_sequence_data, "old_genome_versions", f"{id.lower()}.fasta")
+    else:
+        species_fasta_file = os.path.join(basefolder_sequence_data, "new_species_data", f"{id.upper()}.fna")
+
+    if not filter_by_false_predictions and not filter_by_zero_false_predictions:
+        ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank.ks.tsv"
+    elif filter_by_false_predictions:
+        ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank_filtered_false_predictions.ks.tsv"
+    elif filter_by_zero_false_predictions:
+        ks_tsv_filename = f"{id.lower()}_ortho_gf_top{top}_{rank_type}_rank_filtered_zero_false_predictions.ks.tsv"
+
+    # path_to_paralogs_data already points to paralog_distributions/wgd_<id>,
+    # so the filename is joined directly (no second wgd_<id> level)
+    ks_tsv_path = os.path.join(path_to_paralogs_data, ks_tsv_filename)
+    compute_ks_from_ortho_gfs(id, species_fasta_file, top, top_gfs_filtered_with_rank, rank_type, ks_tsv_path, max_gene_family_size, n_threads, paralog_dists_dir)
+    logging.info("")
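For reference, `generate_mcl_like_file_for_new_species` above reduces to grouping a two-column gene-to-GF assignment table by GF while preserving the ranked GF order. A self-contained sketch with toy data (all IDs hypothetical):

```python
# Sketch: turn a two-column gene-to-GF assignment table into MCL-like rows,
# keeping the GF order given by gf_list (toy data, pandas only).
from pandas import DataFrame

# Column 0 = gene ID, column 1 = assigned GF (invented values)
assigned_gfs_file = DataFrame({0: ["g1", "g2", "g3", "g4"],
                               1: ["ORTHO0002", "ORTHO0001", "ORTHO0002", "ORTHO0003"]})
gf_list = ["ORTHO0001", "ORTHO0002"]  # top GFs in rank order

# One row per GF with that GF's genes; shorter rows are padded with None
mcl_like_output = [assigned_gfs_file.loc[assigned_gfs_file[1] == gf, 0].tolist()
                   for gf in gf_list]
print(DataFrame.from_records(mcl_like_output))
```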
+
+    # Construct the Ks distribution for the top GFs (current rank type)
+    if not filter_by_false_predictions and not filter_by_zero_false_predictions:
+        ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}.pdf"
+    elif filter_by_false_predictions:
+        ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}_filtered_false_predictions.pdf"
+    elif filter_by_zero_false_predictions:
+        ks_plot_filename = f"ks_plot_{id}_top{top}_{rank_type}_filtered_zero_false_predictions.pdf"
+
+    construct_ks_plot_ortho_gfs(id, top, rank_type, ks_plot_filename, max_ks, bin_width, y_axis_lim, ids_names)
+    logging.info("")
+
+logging.info("Done")
diff --git a/ksrates/setup_correction.py b/ksrates/setup_correction.py
index 0da88192c44aed0f4ac2e88de9f83f292ca7876b..fd76ba1b21a063ce892a7a6284791e1022b07310 100644
--- a/ksrates/setup_correction.py
+++ b/ksrates/setup_correction.py
@@ -23,8 +23,9 @@ def setup_correction(config_file, nextflow_flag):
     latin_names = config.get_latin_names()
     paranome = config.get_paranome()
     colinearity = config.get_colinearity()
+    reciprocal_retention = config.get_reciprocal_retention()
-    if not paranome and not colinearity:
-        logging.error('At least one of the "paranome" or "collinearity" parameters in the configuration file needs to be set to "yes".')
+    if not paranome and not colinearity and not reciprocal_retention:
+        logging.error('At least one of the "paranome", "collinearity" or "reciprocal_retention" parameters in the configuration file needs to be set to "yes".')
         logging.error("Exiting.")
         sys.exit(1)
@@ -266,10 +267,10 @@ def setup_correction(config_file, nextflow_flag):
     with open(os.path.join(f"wgd_runs_{species_of_interest}.txt"), "w+") as wgd_runs:
         default_threads = 1
         wgd_runs.write(f"# Note: the number of threads can be increased depending on the number of cores [default: {default_threads}]\n\n")
-        wgd_runs.write(f"ksrates paralogs-ks config_{species_of_interest}.txt --n-threads={default_threads}\n")
+        wgd_runs.write(f"ksrates paralogs-ks {config_file} --n-threads={default_threads}\n")
         for pair in species_pairs_unknown:
             sp1, sp2 = pair[0], pair[1]
-            wgd_runs.write(f"ksrates orthologs-ks config_{species_of_interest}.txt {sp1} {sp2} "
+            wgd_runs.write(f"ksrates orthologs-ks {config_file} {sp1} {sp2} "
                            f"--n-threads={default_threads}\n")
     logging.info("All done")
diff --git a/ksrates/wgd_paralogs.py b/ksrates/wgd_paralogs.py
index e7fd889140c5569b447c789883bf241666dfa33d..09f4f8915d4c0a3dd892fc31dd4db2f64d20be94 100644
--- a/ksrates/wgd_paralogs.py
+++ b/ksrates/wgd_paralogs.py
@@ -15,13 +15,22 @@ def wgd_paralogs(config_file, n_threads):
     species = config.get_species()
     init_logging(f"Paralog wgd analysis for species [{species}]", config.get_logging_level())
-    latin_names = config.get_latin_names()
+    latin_name = config.get_latin_names()[species]
     max_gene_family_size = config.get_max_gene_family_size()
     paranome = config.get_paranome()
     colinearity = config.get_colinearity()
+    reciprocal_retention = config.get_reciprocal_retention()
-    if not paranome and not colinearity:
-        logging.error('At least one of the "paranome" or "collinearity" parameters in the configuration file needs to be set to "yes".')
+    top = config.get_reciprocal_retention_top()
+    rank_type = config.get_reciprocal_retention_rank_type()
+    pident_threshold = config.get_reciprocal_retention_pident_threshold()
+    min_qcov_threshold = config.get_reciprocal_retention_min_qcov_threshold()
+    filter_false_predictions = config.get_filter_by_false_predictions()
+    filter_zero_false_predictions = config.get_filter_by_zero_false_predictions()
+
+    if not paranome and not colinearity and not reciprocal_retention:
+        logging.error('At least one of the "paranome", "collinearity" or "reciprocal_retention" parameters '
+                      'in the configuration file needs to be set to "yes".')
         logging.error("Exiting.")
         sys.exit(1)
@@ -50,9 +59,9 @@ def wgd_paralogs(config_file, n_threads):
             trigger_exit = True
     else: # If FASTA file exists, check for ID compatibility
         if colinearity:
-            fcCheck.check_IDs(species_fasta_file, latin_names[species], gff)
+            fcCheck.check_IDs(species_fasta_file, latin_name, gff)
         else:
-            fcCheck.check_IDs(species_fasta_file, latin_names[species])
+            fcCheck.check_IDs(species_fasta_file, latin_name)
     if trigger_exit:
         logging.error("Please add the missing information to the configuration file and rerun the analysis. Exiting.")
@@ -68,13 +77,25 @@ def wgd_paralogs(config_file, n_threads):
 # -----------------------------------------------------------------------------
     # ESTIMATING PARANOME Ks VALUES
-    logging.info("Running wgd paralog Ks pipeline...")
-    fc_wgd.ks_paralogs(species, species_fasta_file, max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads)
+    if paranome or colinearity: # prerequisite for the collinearity pipeline, but not for the reciprocal retention pipeline
+        logging.info("Running wgd whole-paranome Ks pipeline...")
+        fc_wgd.ks_paralogs(species, species_fasta_file, max_gene_family_size=max_gene_family_size,
+                           base_dir=paralog_dists_dir, n_threads=n_threads)
+
+    # ESTIMATING RECIPROCALLY RETAINED GENE FAMILIES Ks VALUES
+    if reciprocal_retention:
+        logging.info('---')
+        logging.info("Running wgd reciprocal retention Ks pipeline...")
+        fc_wgd.ks_paralogs_rec_ret(species, species_fasta_file, latin_name, top=top, rank_type=rank_type,
+                                   pident_threshold=pident_threshold, min_qcov_threshold=min_qcov_threshold,
+                                   filter_false_predictions=filter_false_predictions,
+                                   filter_zero_false_predictions=filter_zero_false_predictions,
+                                   max_gene_family_size=max_gene_family_size, base_dir=paralog_dists_dir, n_threads=n_threads)
-    # EXTRACTING COLINEARITY/SYNTENY ANCHOR PAIRS Ks VALUES
+    # EXTRACTING COLLINEARITY ANCHOR PAIRS Ks VALUES
     if colinearity:
         logging.info('---')
-        logging.info("Running wgd colinearity Ks pipeline...")
+        logging.info("Running wgd collinearity Ks pipeline...")
         fc_wgd.ks_colinearity(species, gff, base_dir=paralog_dists_dir, gff_feature=gff_feature, gff_gene_attribute=gff_gene_attribute, n_threads=n_threads)
diff --git a/ksrates_cli.py b/ksrates_cli.py
index f4def99a9eea268219c5821006af3c01e592c007..11b2b14f17e042c42ebb463cea62009263bf19c2 100644
--- a/ksrates_cli.py
+++ b/ksrates_cli.py
@@ -118,7 +118,8 @@ def orthologs_adjustment(config_file, trios):
 @click.option("--adjustment-table", type=click.Path(exists=True), help="User-defined path to file containing adjustment results (default: rate_adjustment/species/adjustment_table_species.tsv)")
 @click.option("--paranome-table", type=click.Path(exists=True), help="User-defined path to file containing paranome Ks (default: paralog_distributions/wgd_species/species.ks.tsv)")
 @click.option("--anchors-table", type=click.Path(exists=True), help="User-defined path to file containing anchor pair Ks (default: paralog_distribution/wgd_species/species.ks_anchors.tsv)")
-def plot_paralogs(config_file, adjustment_table, paranome_table, anchors_table):
+@click.option("--reciprocal-retention-table", type=click.Path(exists=True), help="User-defined path to file containing reciprocally retained paralog Ks (default: paralog_distributions/wgd_species/species.ks_rec_ret.tsv)")
+def plot_paralogs(config_file, adjustment_table, paranome_table, anchors_table, reciprocal_retention_table):
     """
     Plots rate-adjusted
mixed paralog-ortholog Ks distribution. @@ -134,7 +135,9 @@ def plot_paralogs(config_file, adjustment_table, paranome_table, anchors_table): click.format_filename(paranome_table) if anchors_table: click.format_filename(anchors_table) - plot_paralogs_distr(config_file, adjustment_table, paranome_table, anchors_table) + if reciprocal_retention_table: + click.format_filename(reciprocal_retention_table) + plot_paralogs_distr(config_file, adjustment_table, paranome_table, anchors_table, reciprocal_retention_table) @cli.command(context_settings={'help_option_names': ['-h', '--help']}, short_help="Generates phylogram with Ks-unit branch lengths.") diff --git a/main.nf b/main.nf index 8e65f82ac779db10cda343417b2654e382a20223..6c0d37943c0733c2d51409fe2e3cda0c802cb9e1 100755 --- a/main.nf +++ b/main.nf @@ -15,6 +15,10 @@ version = version_file[0][1] params.preserve = false // giving the configuration file through the "input" process section +// NOTE: +// $params.config is what the user has entered after --config in the command line, for example ./config_files/config_elaeis.txt +// $configfile is the config file absolute path, e.g. /home/config_files/config_elaeis.txt +// $config is the basename of the config file, e.g. config_elaeis.txt configfile = file(params.config) if (configfile.isEmpty()) { newConfigFile = true @@ -160,11 +164,11 @@ process checkConfig { processDir=\$PWD cd $PWD - if [ ! -f ${config} ]; then + if [ ! -f ${configfile} ]; then echo "" echo -n "Configuration file [${config}] not found: it will now be generated... " - ksrates generate-config ${config} >> \$processDir/generate_config.txt + ksrates generate-config ${configfile} >> \$processDir/generate_config.txt RET_CODE=\$? echo "done [\${RET_CODE}]" @@ -218,7 +222,7 @@ process setupAdjustment { processDir=\$PWD cd $PWD - species=`grep "^[[:space:]]*focal_species[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs` + species=`grep "^[[:space:]]*focal_species[[:space:]]*=" ${configfile} | cut -d "=" -f 2 | xargs` echo "Focal species: \$species" # Generating folders for output files, especially to have the log_folder since the very beginning @@ -238,7 +242,7 @@ process setupAdjustment { echo -n "Extracting ortholog species pairs and trios from Newick tree... " echo "NF internal work directory for [setupAdjustment] process:\n\$processDir\n" > \${logs_folder}/${logs_names["setupAdjustment"]} - ksrates init ${config} --nextflow >> \${logs_folder}/${logs_names["setupAdjustment"]} 2>&1 + ksrates init ${configfile} --nextflow >> \${logs_folder}/${logs_names["setupAdjustment"]} 2>&1 RET_CODE=\$? 
echo "done [\${RET_CODE}]" @@ -298,9 +302,11 @@ process setParalogAnalysis { paranome_status="not_required" colinearity_status="not_required" + reciprocal_retention_status="not_required" - paranome=`grep "^[[:space:]]*paranome[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` - colinearity=`grep "^[[:space:]]*collinearity[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` + paranome=`grep "^[[:space:]]*paranome[[:space:]]*=" ${configfile} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` + colinearity=`grep "^[[:space:]]*collinearity[[:space:]]*=" ${configfile} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` + reciprocal_retention=`grep "^[[:space:]]*reciprocal_retention[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` processDir=\$PWD cd $PWD @@ -329,8 +335,18 @@ process setParalogAnalysis { colinearity_status="already_done" fi fi + if [ \${reciprocal_retention} = "yes" ]; then + if [ ! -f paralog_distributions/wgd_${species}/${species}.ks_rec_ret.tsv ]; then + echo "[$species] Will run reciprocal retention analysis" + echo "[$species] Reciprocally retained paralog TSV file not found [${species}.ks_rec_ret.tsv]" >> $logs_folder/${logs_names["setParalogAnalysis"]} + echo "[$species] Reciprocal retention pipeline will be started\n" >> $logs_folder/${logs_names["setParalogAnalysis"]} + reciprocal_retention_status="todo" + else + reciprocal_retention_status="already_done" + fi + fi - if [ \$paranome_status = "todo" ] || [ \$colinearity_status = "todo" ]; then + if [ \$paranome_status = "todo" ] || [ \$colinearity_status = "todo" ] || [ \$reciprocal_retention_status = "todo" ]; then # Trigger wgdParalog process to get the missing tsv file(s) trigger_wgdPara=true else @@ -472,7 +488,7 @@ process estimatePeaks { echo "Updating ortholog peak database" >> $logs_folder/${logs_names["estimatePeaks"]} - ksrates orthologs-analysis ${config} --ortholog-pairs=\$processDir/$species_pairs_for_peak >> $logs_folder/${logs_names["estimatePeaks"]} 2>&1 + ksrates orthologs-analysis ${configfile} --ortholog-pairs=\$processDir/$species_pairs_for_peak >> $logs_folder/${logs_names["estimatePeaks"]} 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" @@ -520,7 +536,7 @@ process wgdParalogs { echo "Using ${task.cpus} thread(s)\n">> $logs_folder/${logs_names["wgdParalogs"]} echo "[$species] Using ${task.cpus} thread(s)" - ksrates paralogs-ks ${config} --n-threads=${task.cpus} >> $logs_folder/${logs_names["wgdParalogs"]} 2>&1 + ksrates paralogs-ks ${configfile} --n-threads=${task.cpus} >> $logs_folder/${logs_names["wgdParalogs"]} 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" @@ -567,14 +583,14 @@ process wgdOrthologs { echo "Using ${task.cpus} thread(s)\n">> $logs_folder/${logs_names["wgdOrthologs"]}${species1}_${species2}.log # echo "[$species1 – $species2] Using ${task.cpus} thread(s)" - ksrates orthologs-ks ${config} $species1 $species2 --n-threads=${task.cpus} >> $logs_folder/${logs_names["wgdOrthologs"]}${species1}_${species2}.log 2>&1 + ksrates orthologs-ks ${configfile} $species1 $species2 --n-threads=${task.cpus} >> $logs_folder/${logs_names["wgdOrthologs"]}${species1}_${species2}.log 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" echo -n "[$species1 – $species2] `date "+%T"` Starting ortholog peak analysis... 
" echo "\n" >> $logs_folder/${logs_names["wgdOrthologs"]}${species1}_${species2}.log echo "Species1\tSpecies2\n$species1\t$species2" > \${processDir}/tmp_${species1}_${species2}.txt - ksrates orthologs-analysis ${config} --ortholog-pairs=\${processDir}/tmp_${species1}_${species2}.txt >> $logs_folder/${logs_names["wgdOrthologs"]}${species1}_${species2}.log 2>&1 + ksrates orthologs-analysis ${configfile} --ortholog-pairs=\${processDir}/tmp_${species1}_${species2}.txt >> $logs_folder/${logs_names["wgdOrthologs"]}${species1}_${species2}.log 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" @@ -633,7 +649,7 @@ process plotOrthologDistrib { cd $PWD echo "NF internal work directory for [plotOrthologDistrib] process:\n\$processDir\n" > $logs_folder/${logs_names["plotOrthologDistrib"]} - ksrates plot-orthologs ${config} >> $logs_folder/${logs_names["plotOrthologDistrib"]} 2>&1 + ksrates plot-orthologs ${configfile} >> $logs_folder/${logs_names["plotOrthologDistrib"]} 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" @@ -686,6 +702,7 @@ process doRateAdjustment { echo "" paranome=`grep "^[[:space:]]*paranome[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` colinearity=`grep "^[[:space:]]*collinearity[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` + reciprocal_retention=`grep "^[[:space:]]*reciprocal_retention[[:space:]]*=" ${config} | cut -d "=" -f 2 | xargs | tr '[:upper:]' '[:lower:]'` missing_paranome=false if [ \${paranome} = "yes" ] && [ ! -s $PWD/paralog_distributions/wgd_${species}/${species}.ks.tsv ]; then @@ -695,6 +712,10 @@ process doRateAdjustment { if [ \${colinearity} = "yes" ] && [ ! -s $PWD/paralog_distributions/wgd_${species}/${species}.ks_anchors.tsv ]; then missing_anchorpairs=true fi + missing_reciprocal_retention=false + if [ \${reciprocal_retention} = "yes" ] && [ ! -s $PWD/paralog_distributions/wgd_${species}/${species}.ks_rec_ret.tsv ]; then + missing_reciprocal_retention=true + fi if [ \${missing_paranome} = "true" ]; then echo "Whole-paranome data not yet available; skipping rate-adjustment" @@ -704,20 +725,25 @@ process doRateAdjustment { echo "Anchor-pair data not yet available; skipping rate-adjustment" exit 0 fi + if [ \${missing_reciprocal_retention} = "true" ]; then + echo "Reciprocally retained paralog data not yet available; skipping rate-adjustment" + exit 0 + fi + processDir=\$PWD cd $PWD echo "NF internal work directory for [doRateAdjustment (${task.index})] process:\n\$processDir\n" >> $logs_folder/${logs_names["doRateAdjustment"]} echo -n "`date "+%T"` Starting rate-adjustment analysis... " - ksrates orthologs-adjustment ${config} >> $logs_folder/${logs_names["doRateAdjustment"]} 2>&1 + ksrates orthologs-adjustment ${configfile} >> $logs_folder/${logs_names["doRateAdjustment"]} 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" echo "\n" >> $logs_folder/${logs_names["doRateAdjustment"]} echo -n "`date "+%T"` Plotting mixed distributions... " - ksrates plot-paralogs ${config} >> $logs_folder/${logs_names["doRateAdjustment"]} 2>&1 + ksrates plot-paralogs ${configfile} >> $logs_folder/${logs_names["doRateAdjustment"]} 2>&1 RET_CODE=\$? 
echo "done [\${RET_CODE}] `date "+%T"`" @@ -762,7 +788,7 @@ process paralogsAnalyses { cd $PWD echo "NF internal work directory for [paralogsAnalyses (${task.index})] process:\n\$processDir\n" >> $logs_folder/${logs_names["paralogsAnalyses"]} - ksrates paralogs-analyses ${config} >> $logs_folder/${logs_names["paralogsAnalyses"]} 2>&1 + ksrates paralogs-analyses ${configfile} >> $logs_folder/${logs_names["paralogsAnalyses"]} 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" @@ -810,7 +836,7 @@ process drawTree { cd $PWD echo "NF internal work directory for [drawTree] process:\n\$processDir\n" >> $logs_folder/${logs_names["drawTree"]} - ksrates plot-tree ${config} --nextflow >> $logs_folder/${logs_names["drawTree"]} 2>&1 + ksrates plot-tree ${configfile} --nextflow >> $logs_folder/${logs_names["drawTree"]} 2>&1 RET_CODE=\$? echo "done [\${RET_CODE}] `date "+%T"`" diff --git a/test_fabales/config_expert.txt b/test_fabales/config_expert.txt new file mode 100755 index 0000000000000000000000000000000000000000..97130cf0ee1c64b6cdef766e804e78ecc74e83b6 --- /dev/null +++ b/test_fabales/config_expert.txt @@ -0,0 +1,21 @@ +[EXPERT PARAMETERS] + +logging_level = info +kde_bandwidth_modifier = 0.4 +plot_adjustment_arrows = yes +max_mixture_model_iterations = 300 +num_mixture_model_initializations = 1 +extra_paralogs_analyses_methods = yes +max_mixture_model_components = 3 +max_mixture_model_ks = 5 +max_gene_family_size = 200 + +top_reciprocally_retained_gfs = 2000 +reciprocal_retention_rank_type = lambda +pident_threshold = 40 +min_qcov_threshold = 50 +filter_by_false_predictions = no +filter_by_zero_false_predictions = no + +db_and_file_dir_path = "/home/cesen/wgm_predictor_midas/databases_and_gf_files" +# TEMPORARILY: path to the directory containing all databases and files needed to work with rec.ret. 
GFs diff --git a/test_fabales/config_rmwj.txt b/test_fabales/config_rmwj.txt new file mode 100755 index 0000000000000000000000000000000000000000..ee5bd9ce0f9ffce45b5cf9e93f80c226254d990f --- /dev/null +++ b/test_fabales/config_rmwj.txt @@ -0,0 +1,82 @@ +[SPECIES] +focal_species = RMWJ +# informal name of the focal species from the input tree + +# newick_tree = ((((((((((((((HJMP_Astragalus_membranaceus,MYMP_Astragalus_propinquus)1:4.73327467262877,KNMB_Lathyrus_sativus)1:0.8752256727723148,(JTQQ_Glycyrrhiza_lepidota,PEZP_Glycyrrhiza_glabra)1:3.9898118408086996)0.77:0.14276888129998114,RMWJ_Wisteria_floribunda)1:1.280422967601251,((((KEGA_Glycine_soja,TVSH_Bituminaria_bituminosa):0.7300879301528512,Phavu_v1.0)1:0.3075503094307106,SUAK_Codariocalyx_motorius)1:0.14159202800535708,NXOH_Apios_americana)0.99:2.886669823807472)1:0.4638537633157248,VLNB_Gompholobium_polymorphum)1:0.6725216381235951,(CMFF_Lupinus_polyphyllus,TTRG_Lupinus_angustifolius)1:4.376235981995783)1:1.3587259388068462,SLYR_Cladrastis_lutea)1:0.41145213906109945,ZSSR_Xanthocercis_zambesiaca)1:3.5518125039998547,((((ZCDJ_Acacia_argyrophylla,PJSX_Acacia_pycnantha)1:2.575878427993727,XOOE_Desmanthus_illinoensis)1:3.903811368091866,KZED_Senna_hebecarpa)1:0.6427948941916014,((VHZV_Gleditsia_sinensis,GEHT_Gleditsia_triacanthos)1:1.508583938878165,QZXQ_Gymnocladus_dioicus)1:2.9347282084025688)1:2.5313646428664294)1:0.6367347318541066,((JETM_Bauhinia_tomentosa,RKFX_Cercis_canadensis)1:3.7674577659256525,RKLL_Copaifera_officinalis)0.98:0.14031164681847688)1:3.3786871360167563,OQHZ_Quillaja_saponaria)1:0.16363583263216094,OHAE_Polygala_lutea)0.98:4.012115008603451); + +newick_tree = (((HJMP, KNMB), RMWJ), SVVG); + +# input phylogenetic tree in newick format; use the informal names + +latin_names = CMFF : Lupinus polyphyllus, GEHT : Gleditsia triacanthos, HJMP : Astragalus membranaceus, + JETM : Bauhinia tomentosa, JTQQ : Glycyrrhiza lepidota, KEGA : Glycine soja, + KNMB : Lathyrus sativus, KZED : Senna hebecarpa, MYMP : Astragalus propinquus, + NXOH : Apios americana, OHAE : Polygala lutea, OQHZ : Quillaja saponaria, + PEZP : Glycyrrhiza glabra, PJSX : Acacia pycnantha, QZXQ : Gymnocladus dioicus, + RKFX : Cercis canadensis, RKLL : Copaifera officinalis, RMWJ : Wisteria floribunda, + SLYR : Cladrastis lutea, SUAK : Codariocalyx motorius, TTRG : Lupinus angustifolius, + TVSH : Bituminaria bituminosa, VHZV : Gleditsia sinensis, VLNB : Gompholobium polymorphum, + XOOE : Desmanthus illinoensis, ZCDJ : Acacia argyrophylla, CWZU : Betula pendula, + SVVG : Fagus sylvatica, XVJB : Morus nigra, ZPKK : Aruncus dioicus + +# informal names associated to their latin name through a colon and separated by comma + +fasta_filenames = HJMP : /home/cesen/wgm_predictor_midas/sequences/new_species_data/HJMP.fna, + KNMB : /home/cesen/wgm_predictor_midas/sequences/new_species_data/KNMB.fna, + RMWJ : /home/cesen/wgm_predictor_midas/sequences/new_species_data/RMWJ.fna, + SVVG : /home/cesen/wgm_predictor_midas/sequences/new_species_data/SVVG.fna +gff_filename = +# informal names associated to their filename/path through a colon and separated by comma + +peak_database_path = ortholog_peak_db.tsv +ks_list_database_path = ortholog_ks_list_db.tsv +# filenames/paths of the ortholog data databases + + +[ANALYSIS SETTING] +paranome = no +collinearity = no +reciprocal_retention = yes +# analysis type for paralog data; allowed values: 'yes' or 'no' + +gff_feature = +# keyword to parse the sequence type from the gff file (column 3); can be 'gene', 'mrna'... 
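For reference, entries such as latin_names and fasta_filenames above are comma-separated `informal : value` pairs spread over indented continuation lines. A minimal sketch of parsing one of them into a dict, assuming the file is configparser-compatible (as ksrates-style configs appear to be):

```python
# Minimal sketch: parse the comma-separated "informal : Latin name" mapping
# from a ksrates-style config file (file name taken from the diff above).
from configparser import ConfigParser

config = ConfigParser()
config.read("config_rmwj.txt")

latin_names = {}
for entry in config.get("SPECIES", "latin_names").split(","):
    informal, latin = entry.split(":", 1)
    latin_names[informal.strip()] = latin.strip()

print(latin_names["RMWJ"])  # Wisteria floribunda
```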
+ +gff_attribute = +# keyword to parse gene id from the gff file (column 9); can be 'id', 'name'... + +max_number_outgroups = 4 +# maximum number of outspecies/trios selected to correct each divergent species pair (default: 4) + +consensus_mode_for_multiple_outgroups = mean among outgroups +# allowed values: 'mean among outgroups' or 'best outgroup' (default: 'mean among outgroups') + + +[PARAMETERS] +x_axis_max_limit_paralogs_plot = 5 +# highest value of the x axis in the mixed distribution plot (default: 5) + +bin_width_paralogs = 0.1 +# bin width in paralog ks histograms (default: 0.1, ten bins per unit) + +y_axis_max_limit_paralogs_plot = None +# highest value of the y axis in the mixed distribution plot (default: none) + +num_bootstrap_iterations = 200 +# number of bootstrap iterations for ortholog peak estimate + +divergence_colors = Red, MediumBlue, Goldenrod, Crimson, ForestGreen, Gray, SaddleBrown, Black +# color of the divergence lines drawn in correspondence of the ortholog peaks +# use color names/codes separated by comma and use at least as many colors as the number of divergence nodes + +x_axis_max_limit_orthologs_plots = 5 +# highest value of the x axis in the ortholog distribution plots (default: 5) + +bin_width_orthologs = 0.1 +# bin width in ortholog ks histograms (default: 0.1, ten bins per unit) + +max_ks_paralogs = 5 +# maximum paralog ks value accepted from ks data table (default: 5) + +max_ks_orthologs = 10 +# maximum ortholog ks value accepted from ks data table (default: 10) diff --git a/test_fabales/nextflow.config b/test_fabales/nextflow.config new file mode 100755 index 0000000000000000000000000000000000000000..8af67c524af659d94f41e20e2f5621ac90644ca8 --- /dev/null +++ b/test_fabales/nextflow.config @@ -0,0 +1,58 @@ +// This is an example file, please adapt it to your resources. +// For more information see tool documentation and Nextflow documentation. + + +// CONTAINER SETTINGS: +singularity { + enabled = true + // cacheDir = '' // to set a directory where to download the container file + autoMounts = true // to automatically mount host paths in the executed container +} +// docker { + // enabled = true +// } + +// EXECUTOR SETTINGS: +executor { + name = 'local' // or e.g. 'sge' when using a computer cluster + cpus = 3 // to limit the CPU resources when using the 'local' executor +} + +process { + container = 'vibpsb/ksrates:latest' // to use the ksrates container (not needed if all dependencies are locally installed) + + // Specify memory and number of threads for processes submitted to the cluster + // It is advised to provide higher computational power to wgdParalogs and wgdOrthologs + withName: 'wgdParalogs' { + cpus = 1 // e.g. 2 + // penv = '' // e.g. 'serial' + // memory = '' // e.g. '16GB' + // clusterOptions = '' // e.g. '-l h_vmem=2G' + } + + withName: 'wgdOrthologs' { + cpus = 1 // e.g. 2 + // penv = '' // e.g. 'serial' + // memory = '' // e.g. '16GB' + // clusterOptions = '' // e.g. '-l h_vmem=2G' + } + + // withName: 'estimatePeaks' { + // memory = '' // e.g. '2GB' + // clusterOptions = '' // e.g. '-l h_vmem=2G' + // } + + // withName: 'paralogsAnalyses' { + // memory = '' // e.g. '2GB' + // clusterOptions = '' // e.g. '-l h_vmem=2G' + // } + + // withName: 'plotOrthologDistrib' { + // memory = '' // e.g. '2GB' + // clusterOptions = '' // e.g. 
'-l h_vmem=2G'
+    // }
+}
+
+
+// OTHER SETTINGS:
+// env.SOME_ENV_VARIABLE = ''
diff --git a/wgd_ksrates/ks_distribution.py b/wgd_ksrates/ks_distribution.py
index df4a9ce790ba5350ab4029033f3b00aa3e2074d9..54ae6a39f016493fd9bdc79a1ec010a0acba57ac 100644
--- a/wgd_ksrates/ks_distribution.py
+++ b/wgd_ksrates/ks_distribution.py
@@ -702,7 +702,7 @@ def ks_analysis_one_vs_one(
 def ks_analysis_paranome(
-        nucleotide_sequences, protein_sequences, paralogs,
+        nucleotide_sequences, protein_sequences, paralogs, top=None, top_gfs_filtered_with_rank=None,
         tmp_dir='./tmp', output_dir='./ks.out', codeml_path='codeml',
         preserve=True, times=1, ignore_prefixes=False, n_threads=4,
         min_length=100, method='alc', aligner='muscle',
@@ -732,7 +732,8 @@
     :return: data frame
     """
     # ignore prefixes in gene families, since only one species -----------------
-    paralogs = process_gene_families(paralogs, ignore_prefix=ignore_prefixes)
+    paralogs = process_gene_families(paralogs, top=top, top_gfs_filtered_with_rank=top_gfs_filtered_with_rank,
+                                     ignore_prefix=ignore_prefixes)
     protein = get_sequences(paralogs, protein_sequences)
     # preserve intermediate data if asked --------------------------------------
diff --git a/wgd_ksrates/utils.py b/wgd_ksrates/utils.py
index 4834028ba8125baeed72e38ae551b24e037d9476..8fa31c70e6fd026ac37ae1168fe8edf9370b8152 100644
--- a/wgd_ksrates/utils.py
+++ b/wgd_ksrates/utils.py
@@ -35,6 +35,7 @@ import subprocess
 from progressbar import ProgressBar
 from numpy import mean, std
 from scipy.spatial.distance import cdist
+from pandas import read_csv
 def can_i_run_software(software):
@@ -138,11 +139,13 @@ def get_sequences(paralog_dict, sequences):
     return paralog_sequence_dict
-def process_gene_families(gene_family_file, ignore_prefix=False):
+def process_gene_families(gene_family_file, top=None, top_gfs_filtered_with_rank=None,
+                          ignore_prefix=False):
     """
     Processes a raw gene family file as e.g. from MCL output into a generic
     dictionary structure. MCL raw file consists of one gene family per line, including
     tab separated gene IDs, (without gene family ID !).
+    If the top parameter is given, only the top-ranked reciprocally retained GFs
+    in the file are considered.
     Example::
@@ -164,7 +167,13 @@
     ID = 1
     with open(gene_family_file, 'r') as f:
-        for line in f:
-            genes = line.strip().split("\t")
-            if ignore_prefix:
-                if '|' in genes[0]:
-                    genes = [gene.split('|')[1] for gene in genes]
-            gene_family_dict["GF_{:06d}".format(ID)] = genes
-            ID += 1
+        genes_per_family = f.readlines()
+
+        if top is None:  # no top reciprocally retained gene families requested
+            for line in genes_per_family:
+                genes = line.strip().split("\t")
+                if ignore_prefix:
+                    if '|' in genes[0]:
+                        genes = [gene.split('|')[1] for gene in genes]
+                gene_family_dict["GF_{:06d}".format(ID)] = genes
+                ID += 1
+        else:
+            # Get a subset of the top 2000 GFs according to a previous filtering step
+            # (i.e. few or zero false predictions). The file lists the GF IDs in the
+            # first column and, if filtered, the ranks in the second column.
+            top_gfs = read_csv(top_gfs_filtered_with_rank, sep="\t", header=None)
+            if len(top_gfs.columns) == 1:  # file with the unfiltered top GFs, no rank column
+                rank_of_filtered_gfs = set(range(1, top + 1))
+            elif len(top_gfs.columns) == 2:  # file with the filtered top GFs, rank as second column
+                rank_of_filtered_gfs = set(top_gfs[1])
+
+            # Guard against files with fewer families than the requested top
+            for rank in range(1, min(top, len(genes_per_family)) + 1):
+                # Take into account only the GFs that belong to the filtered subset
+                if rank not in rank_of_filtered_gfs:
+                    continue
+                line = genes_per_family[rank - 1]  # ranks start at 1, list indices at 0
+                genes = line.strip().split("\t")
+                if len(genes) <= 1:  # either a single member or no members
+                    continue
+                if ignore_prefix:
+                    if '|' in genes[0]:
+                        genes = [gene.split('|')[1] for gene in genes]
+                gene_family_dict["GF_{:06d}".format(ID)] = genes
+                ID += 1
+
     return gene_family_dict
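To make the rank-based selection in process_gene_families concrete, here is a toy end-to-end run (data invented for illustration; it mirrors the logic above without ksrates' file I/O):

```python
# Toy demonstration of the rank-based family selection implemented above:
# families are stored one per line in ranking order, a filter keeps only a
# subset of ranks, and single-member families are skipped.
mcl_lines = [
    "geneA1\tgeneA2\tgeneA3",  # rank 1
    "geneB1",                  # rank 2 (single member: skipped anyway)
    "geneC1\tgeneC2",          # rank 3
]
kept_ranks = {1, 3}  # e.g. ranks surviving the false-prediction filter

gene_family_dict = {}
ID = 1
for rank, line in enumerate(mcl_lines, start=1):
    if rank not in kept_ranks:
        continue
    genes = line.strip().split("\t")
    if len(genes) <= 1:
        continue
    gene_family_dict["GF_{:06d}".format(ID)] = genes
    ID += 1

print(gene_family_dict)
# {'GF_000001': ['geneA1', 'geneA2', 'geneA3'], 'GF_000002': ['geneC1', 'geneC2']}
```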