Commit b77b95b9 authored by timdiels's avatar timdiels
Browse files

BR: various fixes

parent 6e47eca2
...@@ -355,11 +355,10 @@ def _add_waic_ratio(results): ...@@ -355,11 +355,10 @@ def _add_waic_ratio(results):
return results return results
def _parse_count_groups(count_groups_file): def _parse_count_groups(count_groups_file):
df = pd.read_table( return pd.read_table(
str(count_groups_file), str(count_groups_file),
usecols=['count_group', 'gene_family'] usecols=['count_group', 'gf']
) )
return df.rename(columns={'gene_family': 'gf'})
def _best_model_per_group(results): def _best_model_per_group(results):
''' '''
......
...@@ -96,6 +96,7 @@ class GoEnrichment: ...@@ -96,6 +96,7 @@ class GoEnrichment:
up_genes = attr.ib() up_genes = attr.ib()
def _split_genes_into_gene_column(genes): def _split_genes_into_gene_column(genes):
genes['genes'] = genes['genes'].apply(list) # TODO this is a quick hack, in long term should get rid of using sequences inside data frames
genes = df_.split_array_like(genes, 'genes') genes = df_.split_array_like(genes, 'genes')
return genes.rename(columns={'genes': 'gene'}) return genes.rename(columns={'genes': 'gene'})
......
...@@ -34,7 +34,7 @@ from cedalion.badirate.various import parse_tree_from_result, parse_nodes ...@@ -34,7 +34,7 @@ from cedalion.badirate.various import parse_tree_from_result, parse_nodes
from cedalion.various import join_multiline from cedalion.various import join_multiline
@click.command() @click.command()
@click.argument( @click.option(
'--badirate-results', 'badirate_results_file', '--badirate-results', 'badirate_results_file',
type=click.Path(exists=True, dir_okay=False), type=click.Path(exists=True, dir_okay=False),
required=True, required=True,
...@@ -146,16 +146,24 @@ def main(badirate_results_file, target_species, gf_file, gene_info_file, species ...@@ -146,16 +146,24 @@ def main(badirate_results_file, target_species, gf_file, gene_info_file, species
--functional-annotation func_annot.csv \\ --functional-annotation func_annot.csv \\
--output output.xlsx --output output.xlsx
''' '''
badirate_results_file = Path(badirate_results_file)
gf_file = Path(gf_file)
gene_info_file = Path(gene_info_file)
species_file = Path(species_file)
count_groups_file = Path(count_groups_file)
go_file = Path(go_file)
func_annot_file = Path(func_annot_file)
output_file = Path(output_file)
results = _parse_results(badirate_results_file) results = _parse_results(badirate_results_file)
_exit_if_no_results(results) _exit_if_no_results(results)
result_file = results.iloc[results['output_file'].first_valid_index]['output_file'] result_file = results['output_file'].iloc[results['output_file'].first_valid_index()]
tree = parse_tree_from_result(result_file) tree = parse_tree_from_result(result_file)
all_species = {node.species: node.id for node in parse_nodes(tree)} all_species = {node.species: node.id for node in parse_nodes(tree) if node.species}
results_analysis = analyse_results( results_analysis = analyse_results(
results, count_groups_file, gene_info_file, gf_file, target_species, results, count_groups_file, gene_info_file, gf_file, target_species,
all_species[target_species] all_species[target_species]
) )
go_enrichment = enrich_gos(results, func_annot_file, go_file) go_enrichment = enrich_gos(results_analysis.results, func_annot_file, go_file)
write_report( write_report(
output_file, results_analysis, go_enrichment, all_species, target_species, output_file, results_analysis, go_enrichment, all_species, target_species,
species_file, tree species_file, tree
......
gene_family count_group gf count_group
gf1 g1 gf1 g1
gf2 g1 gf2 g1
gf3 g2 gf3 g2
...@@ -1014,9 +1014,9 @@ class TestReporting: ...@@ -1014,9 +1014,9 @@ class TestReporting:
assert_xlsx_equals(actual_file, expected_file) assert_xlsx_equals(actual_file, expected_file)
# TODO Add a TestAnalyseResultsScript which tests # TODO Add a TestAnalyseResultsScript which tests
# scripts.badirate_analyse_results. Left it out due to dev time constraints, # scripts.badirate_analyse_results in isolation. Left it out due to dev time
# most bugs related to that code should be obvious when trying to run it in the # constraints, most bugs related to that code should be obvious when trying to
# test pipeline. # run it in the test pipeline.
#TODO add to pytil #TODO add to pytil
def assert_xlsx_equals(actual_file, expected_file): def assert_xlsx_equals(actual_file, expected_file):
......
...@@ -22,6 +22,11 @@ import groovyx.gpars.dataflow.DataflowReadChannel ...@@ -22,6 +22,11 @@ import groovyx.gpars.dataflow.DataflowReadChannel
import groovy.transform.InheritConstructors import groovy.transform.InheritConstructors
import nextflow.Global import nextflow.Global
// Note: if running python on the cluster gives you an error involving OpenBLAS
// and pthread_create, it is likely numpy's OMP trying to multithread too hard.
// Setting OMP_NUM_THREADS environment variable to the number of cpus you
// requested usually resolves the issue.
//////////////////////////////////// ////////////////////////////////////
// Helper functions and classes // Helper functions and classes
...@@ -871,7 +876,7 @@ process badiRateFormatInput { ...@@ -871,7 +876,7 @@ process badiRateFormatInput {
output: output:
file "*.size_file.tsv" into badiRateSizeFiles file "*.size_file.tsv" into badiRateSizeFiles
file "node_ids.tsv" into badiRateNodeIdsFile file "node_ids.csv" into badiRateNodeIdsFile
file "count_groups.tsv" into badiRateCountGroupsFile file "count_groups.tsv" into badiRateCountGroupsFile
file "ultrametric.newick" into badiRateUltraTree file "ultrametric.newick" into badiRateUltraTree
...@@ -879,7 +884,7 @@ process badiRateFormatInput { ...@@ -879,7 +884,7 @@ process badiRateFormatInput {
""" """
$activate_venv $activate_venv
Rscript $ultrametricTreeScript $tree ultrametric.newick Rscript $ultrametricTreeScript $tree ultrametric.newick
cedalion-badirate-format-input --gene-info $geneInfo \\ OMP_NUM_THREADS=$task.cpus cedalion-badirate-format-input --gene-info $geneInfo \\
--gene-families $orthoGroups --tree ultrametric.newick --output . --gene-families $orthoGroups --tree ultrametric.newick --output .
""" """
} }
...@@ -895,7 +900,7 @@ badiRateSizeFiles ...@@ -895,7 +900,7 @@ badiRateSizeFiles
// NS-BR model args // NS-BR model args
badiRateNodeIdsFile badiRateNodeIdsFile
.splitCsv(header: true) .splitCsv(header: true)
.filter { it.parent_node_id != null } .filter { it.parent_node_id != '' }
// Filter out non-badirate species // Filter out non-badirate species
.phase(tap(badiRateSpeciesNames)) .phase(tap(badiRateSpeciesNames))
.map { it[0] } .map { it[0] }
...@@ -939,8 +944,7 @@ Channel ...@@ -939,8 +944,7 @@ Channel
.combine(badiRateSizeFiles) .combine(badiRateSizeFiles)
.map { it.sum() } // merge combination of maps .map { it.sum() } // merge combination of maps
.map { .map {
[name: "${it.groupName}_${it.modelName}_${it.startVal}_${it.repetition}"] [name: "${it.groupName}_${it.modelName}_${it.startVal}_${it.repetition}"] + it
+ it
} }
.set { badiRateInputSets } .set { badiRateInputSets }
...@@ -1008,15 +1012,17 @@ badiRateResults ...@@ -1008,15 +1012,17 @@ badiRateResults
// Write BadiRate results CSV file // Write BadiRate results CSV file
badiRateInputSets badiRateInputSets
.join(badiRateResults, remainder: true) .phase(badiRateResults, remainder: true) { it.name }
.map { it[0] + (it[1] ?: []) }
.map { .map {
"$it.groupName,$it.modelDisplayName,$it.species,$it.outputFile" "$it.groupName,$it.modelDisplayName,${it.get('species', '')},${it.get('outputFile', '')}"
} }
.collectFile( .collectFile(
name: 'badirate_results.csv', name: 'badirate_results.csv',
seed: 'count_group,model,species,output_file', seed: 'count_group,model,species,output_file',
newLine: true newLine: true
) )
.first()
.set { badiRateResultsFile } .set { badiRateResultsFile }
// Write a report per badirate=true species // Write a report per badirate=true species
...@@ -1026,7 +1032,8 @@ tap(badiRateSpeciesNames) ...@@ -1026,7 +1032,8 @@ tap(badiRateSpeciesNames)
.set { badiRateAnalyseResultsInput } .set { badiRateAnalyseResultsInput }
process badiRateAnalyseResults { process badiRateAnalyseResults {
publishDir "${params.output}/badirate/reports" publishDir "${params.output}/badirate/reports"
module 'badirate' module 'python'
tag "$species"
cpus 1 cpus 1
memory '1 GB' memory '1 GB'
...@@ -1047,16 +1054,16 @@ process badiRateAnalyseResults { ...@@ -1047,16 +1054,16 @@ process badiRateAnalyseResults {
script: script:
""" """
$activate_venv $activate_venv
cedalion-badirate-analyse-results \\ OMP_NUM_THREADS=$task.cpus cedalion-badirate-analyse-results \\
--badirate-results '$badiRateResults' \\ --badirate-results '$badiRateResults' \\
--count-groups '$countGroups' \\ --count-groups '$countGroups' \\
--functional-annotation '$functionalAnnotation' \\ --functional-annotation '$functionalAnnotation' \\
--gene-families '$orthoGroups' \\ --gene-families '$orthoGroups' \\
--gene-info '$geneInfo' \\ --gene-info '$geneInfo' \\
--go '$goObo' \\ --go '$goObo' \\
--output '${species}.xlsx' --output '${species}.xlsx' \\
--species '$species' \\ --species '$species' \\
--species-info '$speciesInfo' \\ --species-info '$speciesInfo'
""" """
} }
......
...@@ -50,6 +50,7 @@ setup_args = dict( ...@@ -50,6 +50,7 @@ setup_args = dict(
entry_points={'console_scripts': [ entry_points={'console_scripts': [
'map-fasta = cedalion.scripts.map_fasta:main', 'map-fasta = cedalion.scripts.map_fasta:main',
'clean-protein-fasta = cedalion.scripts.clean_protein_fasta:main', 'clean-protein-fasta = cedalion.scripts.clean_protein_fasta:main',
'cedalion-badirate-analyse-results = cedalion.scripts.badirate_analyse_results:main',
'cedalion-badirate-format-input = cedalion.scripts.badirate_format_input:main', 'cedalion-badirate-format-input = cedalion.scripts.badirate_format_input:main',
'cedalion-best-hit-defs = cedalion.scripts.best_hit_defs:main', 'cedalion-best-hit-defs = cedalion.scripts.best_hit_defs:main',
'cedalion-orthofinder-format-input = cedalion.scripts.orthofinder_format_input:main', 'cedalion-orthofinder-format-input = cedalion.scripts.orthofinder_format_input:main',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment