Commit c467e2f4 authored by timdiels's avatar timdiels
Browse files

add CLIME processes to Nextflow script [partial]

Work in progress: like the previous CLIME commit, this is not yet complete.

clime, implement further [partial]

- Use channel variables
- Finish climeMatrix process
- phyml species as columns, clime species as row species
parent bfb97300
......@@ -75,7 +75,9 @@ def main(row_species, col_species, genes_file, gene_families_file, output_file):
Example usage:
cedalion-clime-matrix --species artha --genes genes.tsv --gene-families ortho_groups.tsv --output clime_matrix.txt
cedalion-clime-matrix --row-species artha -c artha -c species1 \
--genes genes.tsv --gene-families ortho_groups.tsv \
--output clime_matrix.txt
where genes_file contains:
......
......@@ -107,6 +107,7 @@ Channel
phyml: it.phyml.toBoolean(),
interproscan: it.interproscan.toBoolean(),
blast2go: it.blast2go.toBoolean(),
clime: it.clime.toBoolean(),
]
// Assert proteome is an absolute path
......@@ -124,6 +125,14 @@ Channel
)
}
// Assert PhyML is set when CLIME is set
if (!specie.phyml && specie.clime) {
error(
"$specie.name has CLIME set, but not PhyML. CLIME requires " +
"PhyML. Please set PhyML as well or unset CLIME."
)
}
//
return specie
}
......@@ -475,11 +484,16 @@ process mergeFunctionalAnnotation {
// - PhyML web site: http://www.atgc-montpellier.fr/phyml/
// - PhyML manual: http://www.atgc-montpellier.fr/download/papers/phyml_manual_2012.pdf
// Create multiple sequence alignment of core genes with gaps removed
tap(species)
.filter { it.phyml }
.set { phymlSpecies }
tap(phymlSpecies)
.filter { it.phyml }
.collectSpeciesFile()
.set { msaSpeciesFile }
.set { phymlSpeciesFile }
// Create multiple sequence alignment of core genes with gaps removed
msa = Channel.value()
process msa {
module 'muscle'
......@@ -491,7 +505,7 @@ process msa {
input:
file orthoGroups from orthoGroupsFile
file speciesInfo from msaSpeciesFile
file speciesInfo from phymlSpeciesFile
output:
file 'msa.phylip' into msa
......@@ -570,41 +584,121 @@ process phyml {
// - Web site: http://gene-clime.org
// - CLI usage manual is included in the download, not online.
// - Other help: http://gene-clime.org/help
//climeParameters = "${workflow.projectDir}/resources/clime_parameters.txt"
//clime {
// module 'CLIME'
// publishDir "${params.output}/clime"
//
// cpus 1 // TODO is it really single core?
// memory '32 GB'
// time '2w' // weeks
// This examines each species with clime=true, comparing it against all species
// with phyml=true: the gene sets (matrix rows) come from the clime=true
// species, and the matrix columns are all species with phyml=true.
// Consequently, any species with clime=true in species.txt must also have
// phyml=true.
//
// input:
// file 'input_tree' from tap(bifurcatingSpeciesTree)
// GENESET_FILE "input_gene_sets" // input gene sets
// We assume the rooted tree generated by PhyML is bifurcating. If you do
// encounter a multifurcating tree, some research has been done on resolving
// polytomies; no need for something ad hoc.
//
// output:
// file 'result*' into climeResult
// //file 'result*' into climePublish TODO rm if unused
// CLIME input cheatsheet:
//
// script:
// """
// # Generate input
// #
// # We assume the rooted tree generated by PhyML is bifurcating. If you do
// # encounter a multifurcating tree, some research has been done on resolving
// # polytomies; no need for something ad hoc.
// cedalion-gene-info --species $species_txt --output gene_info.txt
// cedalion-clime-matrix --genes gene_info.txt --gene-families $gene_families --output input_clime_matrix
//
// # Run clime
// #
// # 0
// # run on all gene sets
// clime $climeParameters 0
// """
//}
// - matrix columns = all species in the newick tree
// - matrix rows = genes of a single species
// - gene sets: must be a subset (needn't be strict) of matrix rows
// Select the species that have clime set; the result is published as the
// `climeSpecies` channel.
tap(species)
.filter { it.clime }
.set { climeSpecies }
// Species file for the CLIME species only, produced with collectSpeciesFile().
// TODO: remove `climeSpeciesFile` if nothing ends up consuming it.
tap(climeSpecies)
.collectSpeciesFile()
.set { climeSpeciesFile }
// Create CLIME gene info file with genes of all species in the tree that will
// be used
climeGeneInfoFile = Channel.value()
// Runs `cedalion-gene-info` on the species file received from
// `phymlSpeciesFile` and sends the resulting `gene_info.txt` to
// `climeGeneInfoFile`.
process climeGeneInfo {
// Resource requests: 1 CPU, 1 GB of memory, 1 hour of run time.
cpus 1
memory '1 GB'
time '1h'
input:
// Species file taken from the `phymlSpeciesFile` channel.
file speciesFile from phymlSpeciesFile
output:
// Gene info file written by the command below.
file "gene_info.txt" into climeGeneInfoFile
script:
"""
cedalion-gene-info --species $speciesFile --output gene_info.txt
"""
}
// Create CLIME matrix for each clime species comparing against all phyml species
tap(climeSpecies)
.map { it.name }
.set { climeMatrixSpecies }
colSpecies = tap(phymlSpecies)
.collect { "--col-species $it" }
.map { it.join(' ') }
.set { climeMatrixColSpecies }
// Builds a CLIME matrix file for each value received on `climeMatrixSpecies`
// by running `cedalion-clime-matrix` with that value as the row species.
// The output file `<species>.clime_matrix` is sent to `climeWGMatrices`.
process climeMatrix {
// Label each task with the row species it is processing.
tag "$species"
// Resource requests: 1 CPU, 1 GB of memory, 1 hour of run time.
cpus 1
memory '1 GB'
time '1h'
input:
// Row species name, one per task.
val species from climeMatrixSpecies
// String of `--col-species ...` options assembled upstream; it is inserted
// verbatim into the command line below.
val colSpecies from climeMatrixColSpecies
// Passed to --gene-families.
file orthoGroups from orthoGroupsFile
// Passed to --genes; comes from the climeGeneInfo process output channel.
file geneInfoFile from climeGeneInfoFile
output:
file "${species}.clime_matrix" into climeWGMatrices
script:
"""
cedalion-clime-matrix --row-species $species $colSpecies --genes $geneInfoFile --gene-families $orthoGroups --output ${species}.clime_matrix
"""
}
// Create gene sets with: gene set = whole genome
// TODO
// Create input sets with: gene set = genes having a certain GO term, for each GO term
// TODO: be careful about simply reusing the functional annotation output; it
// has to be fixed first, because it currently omits any species that has only
// one of Blast2GO (B2G) or InterProScan (IPS) enabled.
// Run CLIME on input sets
// TODO implement this. Input: various matrices from above. What's in the params again?
climeParameters = "${workflow.projectDir}/resources/clime_parameters.txt"
climeGeneSets = Channel.empty() // TODO replace with real non-empty channel
// Runs the `clime` executable (loaded via the 'CLIME' module) with the
// parameter file path held in `climeParameters`, and publishes every
// `result*` file under "${params.output}/clime".
process clime {
module 'CLIME'
publishDir "${params.output}/clime"
cpus 1 // TODO is it really single core?
memory '32 GB'
// Run time limit of 14 days.
time '14d'
input:
// NOTE(review): geneInfoFile is staged but never referenced in the script
// below; presumably the CLIME parameter file refers to it — confirm.
file geneInfoFile from climeGeneInfoFile
// Staged under the fixed name 'input_tree'.
file 'input_tree' from bifurcatingSpeciesTree
// Staged under the fixed name 'input_gene_sets'.
file 'input_gene_sets' from tap(climeGeneSets)
output:
file 'result*' into climeResult
//file 'result*' into climePublish TODO rm if unused
script:
"""
# Run clime
#
# 0
# run on all gene sets
clime $climeParameters 0
"""
}
////////////////////////////////
// Cleanup script
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment