Solved EI-CoreBioinformatics#132, with one unit-testing confirming

lucventurini · Oct 5, 2018 · 0379c91 · 0379c91
1 parent 81b550e
commit 0379c91
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,7 +13,7 @@ Bugfixes and improvements:
 - [#129](https://github.com/lucventurini/mikado/issues/129): Mikado is now capable of correctly padding the transcripts so to uniform their ends in a single locus. This will also have the effect of trying to enlarge the ORF of a transcript if it is truncated to begin with.
 - [#130](https://github.com/lucventurini/mikado/issues/130): it is now possible to specify a different metric inside the "filter" section of scoring.
 - [#131](https://github.com/lucventurini/mikado/issues/131): in rare instances, Mikado could have missed loci if they were lost between the sublocus and monosublocus stages. Now Mikado implements a basic backtracking recursive algorithm that should ensure no locus is missed.
-- [#132](https://github.com/lucventurini/mikado/issues/132)
+- [#132](https://github.com/lucventurini/mikado/issues/132): Mikado will now evaluate the CDS of transcripts during Mikado prepare.
 
 # Version 1.2.4
 

diff --git a/Mikado/preparation/checking.py b/Mikado/preparation/checking.py
@@ -82,6 +82,7 @@ def create_transcript(lines,
         logger.debug("Finished adding exon lines to %s", lines["tid"])
         transcript_object.finalize()
         transcript_object.check_strand()
+        transcript_object.check_orf()
     except exceptions.IncorrectStrandError:
         logger.info("Discarded %s because of incorrect fusions of splice junctions",
                     lines["tid"])

diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py
@@ -263,11 +263,13 @@ def test_cdna_redundant_cds_not(self):
                 args.json_conf = self.conf
                 args.keep_redundant = b
                 args.json_conf["prepare"]["keep_redundant"] = b
+                self.logger.setLevel("DEBUG")
                 prepare.prepare(args, self.logger)
                 self.assertTrue(os.path.exists(os.path.join(self.conf["prepare"]["files"]["output_dir"],
                                                             "mikado_prepared.fasta")))
                 fa = pyfaidx.Fasta(os.path.join(self.conf["prepare"]["files"]["output_dir"],
                                                 "mikado_prepared.fasta"))
+
                 if b is True:
                     self.assertEqual(len(fa.keys()), 5)
                     self.assertEqual(sorted(fa.keys()), sorted(["A", "A1", "A2", "A3", "A4"]))
@@ -277,6 +279,32 @@ def test_cdna_redundant_cds_not(self):
                     self.assertIn("A1", fa.keys())
                     self.assertTrue("A2" in fa.keys() or "A3" in fa.keys())
                     self.assertIn("A4", fa.keys())
+                gtf_file = os.path.join(self.conf["prepare"]["files"]["output_dir"], "mikado_prepared.gtf")
+
+                coding_count = 0
+                with to_gff(gtf_file) as gtf:
+                    lines = [line for line in gtf]
+                    transcripts = dict()
+                    for line in lines:
+                        if line.is_transcript:
+                            transcript = Transcript(line)
+                            transcripts[transcript.id] = transcript
+                        elif line.is_exon:
+                            transcripts[line.transcript].add_exon(line)
+                    [transcripts[_].finalize() for _ in transcripts]
+                    for transcript in transcripts.values():
+                        if transcript.is_coding:
+                            coding_count += 1
+                            self.assertIn("has_start_codon", transcript.attributes, str(transcript.format("gtf")))
+                            self.assertIn("has_stop_codon", transcript.attributes, str(transcript.format("gtf")))
+                            self.assertEqual(bool(transcript.attributes["has_start_codon"]),
+                                             transcript.has_start_codon)
+                            self.assertEqual(bool(transcript.attributes["has_stop_codon"]),
+                                             transcript.has_stop_codon)
+                            self.assertEqual(transcript.is_complete,
+                                             transcript.has_start_codon and transcript.has_stop_codon)
+
+                self.assertGreater(coding_count, 0)
 
     def test_negative_cdna_redundant_cds_not(self):
         """This test will verify whether the new behaviour of not considering redundant two models with same

diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py
@@ -9,6 +9,7 @@
 from ..exceptions import IncorrectStrandError, InvalidTranscript
 from collections import Counter
 from itertools import zip_longest
+from ..parsers.bed12 import BED12
 
 
 # pylint: disable=too-many-instance-attributes
@@ -134,6 +135,7 @@ def strand_specific(self, value):
     def __str__(self, print_cds=True, to_gtf=False, with_introns=False):
 
         self.check_strand()
+        self.check_orf()
         if self.mixed_splices is True:
             self.attributes["mixed_splices"] = self.mixed_attribute
 
@@ -144,6 +146,7 @@ def format(self, format_name, with_introns=False, with_cds=True,
                transcriptomic=False):
 
         self.check_strand()
+        self.check_orf()
         if self.mixed_splices is True:
             self.attributes["mixed_splices"] = self.mixed_attribute
 
@@ -289,18 +292,35 @@ def _check_intron(self, intron):
                 strand = None
         return strand
 
+    def check_orf(self):
+
+        if self.is_coding is False:
+            return
+        else:
+            orfs = self.find_overlapping_cds(self.get_internal_orf_beds())
+
+            if len(orfs) > 1:
+                self.logger.warning("Multiple ORFs found for %s. Only considering the primary.", self.id)
+
+            orf = orfs[0]
+            assert isinstance(orf, BED12)
+            orf.sequence = self.cdna
+            if orf.invalid:
+                self.logger.warning("Invalid ORF for %s (reason: %s)", self.id, orf.invalid_reason)
+                self.strip_cds(self.strand_specific)
+            else:
+                self.has_start_codon = self.has_start_codon or orf.has_start_codon
+                self.has_stop_codon = self.has_stop_codon or orf.has_stop_codon
+                self.attributes["has_start_codon"] = self.has_stop_codon
+                self.attributes["has_stop_codon"] = self.has_stop_codon
+
+            return
+
     @property
-    def fasta(self):
-        """
-        This property calculates and returns the FASTA sequence associated with
-        the transcript instance.
-        The FASTA sequence itself will be formatted to be in lines with 60 characters
-        (the standard).
-        :return:
-        """
+    def cdna(self):
+
+        """This property calculates the cDNA sequence of the transcript."""
 
-        self.check_strand()
-        fasta = [">{0}".format(self.id)]
         sequence = ''
 
         for exon in self.exons:
@@ -322,11 +342,26 @@ def fasta(self):
 
         if self.strand == "-":
             sequence = self.rev_complement(sequence)
+        return sequence
+
+    @property
+    def fasta(self):
+        """
+        This property calculates and returns the FASTA sequence associated with
+        the transcript instance.
+        The FASTA sequence itself will be formatted to be in lines with 60 characters
+        (the standard).
+        :return:
+        """
+
+        self.check_strand()
+        fasta = [">{0}".format(self.id)]
+
         # assert len(sequence) == sum(exon[1] + 1 - exon[0] for exon in self.exons), (
         #     len(sequence), sum(exon[1] + 1 - exon[0] for exon in self.exons)
         # )
 
-        sequence = self.grouper(sequence, 60)
+        sequence = self.grouper(self.cdna, 60)
         fasta.extend(sequence)
         fasta = "\n".join(fasta)
         return fasta