gene_sets.py 5.95 KB
Newer Older
timdiels's avatar
timdiels committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Copyright (C) 2017 VIB/BEG/UGent - Tim Diels <timdiels.m@gmail.com>
#
# This file is part of Cedalion.
#
# Cedalion is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Cedalion is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cedalion.  If not, see <http://www.gnu.org/licenses/>.

timdiels's avatar
timdiels committed
18
from cedalion.various import join_multiline
timdiels's avatar
timdiels committed
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from toolz.itertoolz import groupby
from collections import defaultdict
from cedalion import parse, write
from pathlib import Path
import click
import attr

@click.command()
@click.option(
    '--output', 'output_file',
    type=click.Path(exists=False, dir_okay=False),
    required=True,
    help='File to write gene sets to.'
)
@click.option(
    '--species', '-s',
    multiple=True,
    help='Species to generate gene sets for. Defaults to all species in the input.'
)
@click.option(
    '--whole-genome', '--wg', 'whole_genome',
    is_flag=True,
timdiels's avatar
timdiels committed
41
    help=join_multiline('''\
timdiels's avatar
timdiels committed
42
        Output a gene set per species containing all genes of the species. The
timdiels's avatar
timdiels committed
43
        sets are named `{species}_wg`.
timdiels's avatar
timdiels committed
44
45
        '''
    )
timdiels's avatar
timdiels committed
46
47
48
49
50
)
@click.option(
    '--genes', 'genes_file',
    required=True,
    type=click.Path(exists=True, dir_okay=False),
timdiels's avatar
timdiels committed
51
    help=join_multiline('''\
timdiels's avatar
timdiels committed
52
        A TSV file with header describing all genes of each species. It must
timdiels's avatar
timdiels committed
53
54
55
        have at least a 'gene' and 'species' column.
        '''
    )
timdiels's avatar
timdiels committed
56
57
58
59
)
@click.option(
    '--go', 'go_file',
    type=click.Path(exists=True, dir_okay=False),
timdiels's avatar
timdiels committed
60
    help=join_multiline('''\
timdiels's avatar
timdiels committed
61
        Output a gene set per species per GO term containing all genes of the
timdiels's avatar
timdiels committed
62
        species with that GO term. The sets are named `{species}_{go_term}`. The
timdiels's avatar
timdiels committed
63
64
65
66
67
        argument should be a CSV file with header annotating all genes with GO
        terms. It must have at least a 'gene', 'species' and
        'blast2go_go_terms', 'interproscan_go_terms' column. Multiple GO terms
        should be separated by '|'.
        '''
timdiels's avatar
timdiels committed
68
    )
timdiels's avatar
timdiels committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
)
@click.option(
    '--min-size',
    default=5,
    type=click.IntRange(min=2),
    help='Minimum gene set size. Sets smaller than this size will not be outputted.'
)
def main(output_file, species, genes_file, go_file, whole_genome, min_size):
    '''
    Create gene set files for one or more species (compatible with CLIME).

    Example usage:

    \b
        cedalion-gene-sets -s artha -s species1 --wg --genes genes.tsv
            --go go.csv --output-dir .

    where genes_file contains:

    \b
        gene    species
        AT2G03340    artha
        AT2G03341    artha
        random1    species1
        random3    species1

    and go_file contains:

    \b
        gene,species,blast2go_go_terms,interproscan_go_terms
        AT2G03340,artha,GO:0016041|GO:0016042,
        AT2G03341,artha,GO:0016041,GO:0016041
        random1,species1,GO:0016041,GO:0016041|GO:0016042
        random3,species1,GO:0016043|GO:0016044,
    '''
    # Use pathlib.Path
    output_file = Path(str(output_file))
    genes_file = Path(str(genes_file))
    if go_file:
        go_file = Path(str(go_file))

    # Parse gene info file
    genes = {
        row['gene']: row['species']
        for row in parse.tsv(genes_file)
    }

    # Default to all species if none given
    # and convert to a set
    species = set(species or genes.values())

    # Write gene sets
    with write.tsv(output_file, ('Symbol', 'Gene set name')) as writer:
        writer.writeheader()
        for gene_set in _gene_sets(genes, species, go_file, whole_genome):
            # Skip gene sets which are too small
            if len(gene_set.genes) < min_size:
                continue

            # Write gene set
            for gene in gene_set.genes:
                writer.writerow({
                    'Symbol': gene,
                    'Gene set name': gene_set.name
                })

def _gene_sets(genes, desired_species, go_file, whole_genome):
    '''
    List gene sets of species

    Parameters
    ----------
    genes : Map[str,str]
        Map of genes to species.
    desired_species : Collection[str]
        Species to get gene sets of.
    go_file : Path or None
    whole_genome : boolean

    Returns
    -------
    Iterable[GeneSet]
    '''
    # Whole genome gene sets
    if whole_genome:
        yield from whole_genome_sets(genes, desired_species)

    # GO gene sets
    if go_file:
        yield from go_gene_sets(genes, desired_species, go_file)

def whole_genome_sets(genes, desired_species):
    def species(item):
        return item[1]
    gene_sets = groupby(species, genes.items())
    for species_, items in gene_sets.items():
timdiels's avatar
timdiels committed
165
        name = '{}_wg'.format(species_)
timdiels's avatar
timdiels committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
        if species_ in desired_species:
            genes_ = tuple(item[0] for item in items)
            yield GeneSet(name, genes_)

def go_gene_sets(genes, desired_species, go_file):
    # Parse go_file into go_groups
    # Map[species, Map[go, List[gene]]]
    go_groups = defaultdict(lambda: defaultdict(list))
    for row in parse.csv(go_file):
        go_terms = set(filter(None, (
            row['blast2go_go_terms'].split('|') +
            row['interproscan_go_terms'].split('|')
        )))
        gene = row['gene']
        species = genes[gene]
        if species in desired_species:
            groups = go_groups[species]
            for go_term in go_terms:
                groups[go_term].append(gene)

    # Yield GeneSet for each group
    for species, groups in go_groups.items():
        for go_term, genes_ in groups.items():
timdiels's avatar
timdiels committed
189
            name = '{}_{}'.format(species, go_term.replace(':', '_'))
timdiels's avatar
timdiels committed
190
191
192
193
194
195
            yield GeneSet(name, genes_)

@attr.s(slots=True, frozen=True)
class GeneSet:
    name = attr.ib()
    genes = attr.ib()