Commit b77b95b9 authored by timdiels's avatar timdiels
Browse files

BR: various fixes

parent 6e47eca2
......@@ -355,11 +355,10 @@ def _add_waic_ratio(results):
return results
def _parse_count_groups(count_groups_file):
df = pd.read_table(
return pd.read_table(
str(count_groups_file),
usecols=['count_group', 'gene_family']
usecols=['count_group', 'gf']
)
return df.rename(columns={'gene_family': 'gf'})
def _best_model_per_group(results):
'''
......
......@@ -96,6 +96,7 @@ class GoEnrichment:
up_genes = attr.ib()
def _split_genes_into_gene_column(genes):
genes['genes'] = genes['genes'].apply(list) # TODO this is a quick hack, in long term should get rid of using sequences inside data frames
genes = df_.split_array_like(genes, 'genes')
return genes.rename(columns={'genes': 'gene'})
......
......@@ -34,7 +34,7 @@ from cedalion.badirate.various import parse_tree_from_result, parse_nodes
from cedalion.various import join_multiline
@click.command()
@click.argument(
@click.option(
'--badirate-results', 'badirate_results_file',
type=click.Path(exists=True, dir_okay=False),
required=True,
......@@ -146,16 +146,24 @@ def main(badirate_results_file, target_species, gf_file, gene_info_file, species
--functional-annotation func_annot.csv \\
--output output.xlsx
'''
badirate_results_file = Path(badirate_results_file)
gf_file = Path(gf_file)
gene_info_file = Path(gene_info_file)
species_file = Path(species_file)
count_groups_file = Path(count_groups_file)
go_file = Path(go_file)
func_annot_file = Path(func_annot_file)
output_file = Path(output_file)
results = _parse_results(badirate_results_file)
_exit_if_no_results(results)
result_file = results.iloc[results['output_file'].first_valid_index]['output_file']
result_file = results['output_file'].iloc[results['output_file'].first_valid_index()]
tree = parse_tree_from_result(result_file)
all_species = {node.species: node.id for node in parse_nodes(tree)}
all_species = {node.species: node.id for node in parse_nodes(tree) if node.species}
results_analysis = analyse_results(
results, count_groups_file, gene_info_file, gf_file, target_species,
all_species[target_species]
)
go_enrichment = enrich_gos(results, func_annot_file, go_file)
go_enrichment = enrich_gos(results_analysis.results, func_annot_file, go_file)
write_report(
output_file, results_analysis, go_enrichment, all_species, target_species,
species_file, tree
......
......@@ -1014,9 +1014,9 @@ class TestReporting:
assert_xlsx_equals(actual_file, expected_file)
# TODO Add a TestAnalyseResultsScript which tests
# scripts.badirate_analyse_results. Left it out due to dev time constraints,
# most bugs related to that code should be obvious when trying to run it in the
# test pipeline.
# scripts.badirate_analyse_results in isolation. Left it out due to dev time
# constraints, most bugs related to that code should be obvious when trying to
# run it in the test pipeline.
#TODO add to pytil
def assert_xlsx_equals(actual_file, expected_file):
......
......@@ -22,6 +22,11 @@ import groovyx.gpars.dataflow.DataflowReadChannel
import groovy.transform.InheritConstructors
import nextflow.Global
// Note: if running python on the cluster gives you an error involving OpenBLAS
// and pthread_create, it is likely numpy's OMP trying to multithread too hard.
// Setting OMP_NUM_THREADS environment variable to the number of cpus you
// requested usually resolves the issue.
////////////////////////////////////
// Helper functions and classes
......@@ -871,7 +876,7 @@ process badiRateFormatInput {
output:
file "*.size_file.tsv" into badiRateSizeFiles
file "node_ids.tsv" into badiRateNodeIdsFile
file "node_ids.csv" into badiRateNodeIdsFile
file "count_groups.tsv" into badiRateCountGroupsFile
file "ultrametric.newick" into badiRateUltraTree
......@@ -879,7 +884,7 @@ process badiRateFormatInput {
"""
$activate_venv
Rscript $ultrametricTreeScript $tree ultrametric.newick
cedalion-badirate-format-input --gene-info $geneInfo \\
OMP_NUM_THREADS=$task.cpus cedalion-badirate-format-input --gene-info $geneInfo \\
--gene-families $orthoGroups --tree ultrametric.newick --output .
"""
}
......@@ -895,7 +900,7 @@ badiRateSizeFiles
// NS-BR model args
badiRateNodeIdsFile
.splitCsv(header: true)
.filter { it.parent_node_id != null }
.filter { it.parent_node_id != '' }
// Filter out non-badirate species
.phase(tap(badiRateSpeciesNames))
.map { it[0] }
......@@ -939,8 +944,7 @@ Channel
.combine(badiRateSizeFiles)
.map { it.sum() } // merge combination of maps
.map {
[name: "${it.groupName}_${it.modelName}_${it.startVal}_${it.repetition}"]
+ it
[name: "${it.groupName}_${it.modelName}_${it.startVal}_${it.repetition}"] + it
}
.set { badiRateInputSets }
......@@ -1008,15 +1012,17 @@ badiRateResults
// Write BadiRate results CSV file
badiRateInputSets
.join(badiRateResults, remainder: true)
.phase(badiRateResults, remainder: true) { it.name }
.map { it[0] + (it[1] ?: []) }
.map {
"$it.groupName,$it.modelDisplayName,$it.species,$it.outputFile"
"$it.groupName,$it.modelDisplayName,${it.get('species', '')},${it.get('outputFile', '')}"
}
.collectFile(
name: 'badirate_results.csv',
seed: 'count_group,model,species,output_file',
newLine: true
)
.first()
.set { badiRateResultsFile }
// Write a report per badirate=true species
......@@ -1026,7 +1032,8 @@ tap(badiRateSpeciesNames)
.set { badiRateAnalyseResultsInput }
process badiRateAnalyseResults {
publishDir "${params.output}/badirate/reports"
module 'badirate'
module 'python'
tag "$species"
cpus 1
memory '1 GB'
......@@ -1047,16 +1054,16 @@ process badiRateAnalyseResults {
script:
"""
$activate_venv
cedalion-badirate-analyse-results \\
OMP_NUM_THREADS=$task.cpus cedalion-badirate-analyse-results \\
--badirate-results '$badiRateResults' \\
--count-groups '$countGroups' \\
--functional-annotation '$functionalAnnotation' \\
--gene-families '$orthoGroups' \\
--gene-info '$geneInfo' \\
--go '$goObo' \\
--output '${species}.xlsx'
--output '${species}.xlsx' \\
--species '$species' \\
--species-info '$speciesInfo' \\
--species-info '$speciesInfo'
"""
}
......
......@@ -50,6 +50,7 @@ setup_args = dict(
entry_points={'console_scripts': [
'map-fasta = cedalion.scripts.map_fasta:main',
'clean-protein-fasta = cedalion.scripts.clean_protein_fasta:main',
'cedalion-badirate-analyse-results = cedalion.scripts.badirate_analyse_results:main',
'cedalion-badirate-format-input = cedalion.scripts.badirate_format_input:main',
'cedalion-best-hit-defs = cedalion.scripts.best_hit_defs:main',
'cedalion-orthofinder-format-input = cedalion.scripts.orthofinder_format_input:main',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment