Skip to content

Commit

Permalink
Solved EI-CoreBioinformatics#132, with one unit-testing confirming
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Oct 5, 2018
1 parent 81b550e commit 0379c91
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Bugfixes and improvements:
- [#129](https://github.com/lucventurini/mikado/issues/129): Mikado is now capable of correctly padding the transcripts so to uniform their ends in a single locus. This will also have the effect of trying to enlarge the ORF of a transcript if it is truncated to begin with.
- [#130](https://github.com/lucventurini/mikado/issues/130): it is now possible to specify a different metric inside the "filter" section of scoring.
- [#131](https://github.com/lucventurini/mikado/issues/131): in rare instances, Mikado could have missed loci if they were lost between the sublocus and monosublocus stages. Now Mikado implements a basic backtracking recursive algorithm that should ensure no locus is missed.
- [#132](https://github.com/lucventurini/mikado/issues/132)
- [#132](https://github.com/lucventurini/mikado/issues/132): Mikado will now evaluate the CDS of transcripts during Mikado prepare.

# Version 1.2.4

Expand Down
1 change: 1 addition & 0 deletions Mikado/preparation/checking.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def create_transcript(lines,
logger.debug("Finished adding exon lines to %s", lines["tid"])
transcript_object.finalize()
transcript_object.check_strand()
transcript_object.check_orf()
except exceptions.IncorrectStrandError:
logger.info("Discarded %s because of incorrect fusions of splice junctions",
lines["tid"])
Expand Down
28 changes: 28 additions & 0 deletions Mikado/tests/test_system_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,13 @@ def test_cdna_redundant_cds_not(self):
args.json_conf = self.conf
args.keep_redundant = b
args.json_conf["prepare"]["keep_redundant"] = b
self.logger.setLevel("DEBUG")
prepare.prepare(args, self.logger)
self.assertTrue(os.path.exists(os.path.join(self.conf["prepare"]["files"]["output_dir"],
"mikado_prepared.fasta")))
fa = pyfaidx.Fasta(os.path.join(self.conf["prepare"]["files"]["output_dir"],
"mikado_prepared.fasta"))

if b is True:
self.assertEqual(len(fa.keys()), 5)
self.assertEqual(sorted(fa.keys()), sorted(["A", "A1", "A2", "A3", "A4"]))
Expand All @@ -277,6 +279,32 @@ def test_cdna_redundant_cds_not(self):
self.assertIn("A1", fa.keys())
self.assertTrue("A2" in fa.keys() or "A3" in fa.keys())
self.assertIn("A4", fa.keys())
gtf_file = os.path.join(self.conf["prepare"]["files"]["output_dir"], "mikado_prepared.gtf")

coding_count = 0
with to_gff(gtf_file) as gtf:
lines = [line for line in gtf]
transcripts = dict()
for line in lines:
if line.is_transcript:
transcript = Transcript(line)
transcripts[transcript.id] = transcript
elif line.is_exon:
transcripts[line.transcript].add_exon(line)
[transcripts[_].finalize() for _ in transcripts]
for transcript in transcripts.values():
if transcript.is_coding:
coding_count += 1
self.assertIn("has_start_codon", transcript.attributes, str(transcript.format("gtf")))
self.assertIn("has_stop_codon", transcript.attributes, str(transcript.format("gtf")))
self.assertEqual(bool(transcript.attributes["has_start_codon"]),
transcript.has_start_codon)
self.assertEqual(bool(transcript.attributes["has_stop_codon"]),
transcript.has_stop_codon)
self.assertEqual(transcript.is_complete,
transcript.has_start_codon and transcript.has_stop_codon)

self.assertGreater(coding_count, 0)

def test_negative_cdna_redundant_cds_not(self):
"""This test will verify whether the new behaviour of not considering redundant two models with same
Expand Down
57 changes: 46 additions & 11 deletions Mikado/transcripts/transcriptchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ..exceptions import IncorrectStrandError, InvalidTranscript
from collections import Counter
from itertools import zip_longest
from ..parsers.bed12 import BED12


# pylint: disable=too-many-instance-attributes
Expand Down Expand Up @@ -134,6 +135,7 @@ def strand_specific(self, value):
def __str__(self, print_cds=True, to_gtf=False, with_introns=False):

self.check_strand()
self.check_orf()
if self.mixed_splices is True:
self.attributes["mixed_splices"] = self.mixed_attribute

Expand All @@ -144,6 +146,7 @@ def format(self, format_name, with_introns=False, with_cds=True,
transcriptomic=False):

self.check_strand()
self.check_orf()
if self.mixed_splices is True:
self.attributes["mixed_splices"] = self.mixed_attribute

Expand Down Expand Up @@ -289,18 +292,35 @@ def _check_intron(self, intron):
strand = None
return strand

def check_orf(self):

if self.is_coding is False:
return
else:
orfs = self.find_overlapping_cds(self.get_internal_orf_beds())

if len(orfs) > 1:
self.logger.warning("Multiple ORFs found for %s. Only considering the primary.", self.id)

orf = orfs[0]
assert isinstance(orf, BED12)
orf.sequence = self.cdna
if orf.invalid:
self.logger.warning("Invalid ORF for %s (reason: %s)", self.id, orf.invalid_reason)
self.strip_cds(self.strand_specific)
else:
self.has_start_codon = self.has_start_codon or orf.has_start_codon
self.has_stop_codon = self.has_stop_codon or orf.has_stop_codon
self.attributes["has_start_codon"] = self.has_stop_codon
self.attributes["has_stop_codon"] = self.has_stop_codon

return

@property
def fasta(self):
"""
This property calculates and returns the FASTA sequence associated with
the transcript instance.
The FASTA sequence itself will be formatted to be in lines with 60 characters
(the standard).
:return:
"""
def cdna(self):

"""This property calculates the cDNA sequence of the transcript."""

self.check_strand()
fasta = [">{0}".format(self.id)]
sequence = ''

for exon in self.exons:
Expand All @@ -322,11 +342,26 @@ def fasta(self):

if self.strand == "-":
sequence = self.rev_complement(sequence)
return sequence

@property
def fasta(self):
"""
This property calculates and returns the FASTA sequence associated with
the transcript instance.
The FASTA sequence itself will be formatted to be in lines with 60 characters
(the standard).
:return:
"""

self.check_strand()
fasta = [">{0}".format(self.id)]

# assert len(sequence) == sum(exon[1] + 1 - exon[0] for exon in self.exons), (
# len(sequence), sum(exon[1] + 1 - exon[0] for exon in self.exons)
# )

sequence = self.grouper(sequence, 60)
sequence = self.grouper(self.cdna, 60)
fasta.extend(sequence)
fasta = "\n".join(fasta)
return fasta
Expand Down

0 comments on commit 0379c91

Please sign in to comment.