Commit 959d302d authored by timdiels
Browse files

Warn on invalid sequences instead of stopping the pipeline

parent 255e8925
......@@ -14,6 +14,9 @@ from chicken_turtle_util.hashlib import base85_digest
from deep_genome.core.pipeline import persisted, format_call
from Bio import SeqIO
import pandas as pd
import logging
logger = logging.getLogger(__name__)
async def files(context):
'''
......@@ -122,18 +125,22 @@ async def _clean(context, fasta_file, job_directory):
sequences['sequence'] = sequences['sequence'].apply(lambda seq: seq if seq[0] != '*' else seq[1:])
# If more '*' characters are left, warn and drop those sequences
odd_sequences = sequences[sequences['sequence'].apply(lambda x: '*' in x)].copy()
mask = sequences['sequence'].apply(lambda x: '*' in x)
odd_sequences = sequences[mask].copy()
if not odd_sequences.empty:
# Warn
odd_sequences['id'] = odd_sequences['record'].apply(lambda record: record.id)
del odd_sequences['record']
odd_sequences['sequence'] += '*'
raise Exception(
logger.warning(
'Some protein sequences in {!r} contain "*" characters (other than at the start/end of the sequence). '
'This is impossible, a translation stop character can only appear at the end of a protein. ' # a biologist implied this
'Please fix your protein inference process and ultimately the corresponding FASTA input file. '
'Sequences:\n{}'
.format(fasta_file.name, odd_sequences.to_string())
'Those sequences are invalid and are dropped. ' # a bioinformatician implied this
'Dropping sequences of: {}'
.format(fasta_file.name, ', '.join(map(repr, odd_sequences['id'])))
)
# Drop invalid sequences
sequences = sequences[~mask]
# Assign clean sequence
for record, sequence in sequences.itertuples(False, None):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment