-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpartitioner_v2.py
83 lines (77 loc) · 3.32 KB
/
partitioner_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
##################################
# Project: Maesa Phylogeny
# script: partitioner_v2.py
# --- Action: generate partition files and remove exon sequences from the alignments
# --- Input: already manually edited alignments
# --- Output: clean alignments & partition files
# USAGE: partitioner.py --smoother 10
# Author: Wolf Eiserhardt
# Added line 48-49 to make script report the error due to exon lengths by Pirada Sumanon
# Last edited: 01/06/2021
##################################
import os, subprocess, argparse
from Bio import SeqIO
# parameter for ignoring mini-partitions <smoother
parser = argparse.ArgumentParser()
parser.add_argument("--smoother", default=5, help='minimum length of a partition to be accepted (default 10bp)')
args = parser.parse_args()
smoother = int(args.smoother)
# loop through all alignments in directory
for fn in os.listdir():
if fn.split(".")[-1] == "fasta":
print(fn)
sequences = [] # gather sequences to keep in the final alignment (all but the exons)
# extract aligned exon sequences
for record in SeqIO.parse(fn, "fasta"):
if record.id == "exon1":
exon1 = record.seq
elif record.id == "exon2":
exon2 = record.seq
else:
sequences.append(record)
# create binary partition (1 = exon, 0 = intron)
binpart = []
for i in range(len(exon1)):
# assumes that any alignment pos. where ANY of the two exons has a base is exon
if len(exon1) == len(exon2):
if exon1[i] == "-" and exon2[i] == "-":
binpart.append(0)
else:
binpart.append(1)
else:
print("Error", fn , len(exon1),len(exon2))
#print(binpart)
# define partitions as ranges
partitions_intron = []
partitions_exon = []
current_part = [1, "NA"]
current_state = binpart[0]
for i in range(len(binpart)):
if binpart[i] != current_state: # when a shift occurs...
# define search window for smoothing over mini-partitions
if (i+smoother) < len(binpart): # to avoid index errors
wndw = binpart[i:(i+smoother)]
else:
wndw = binpart[i:]
if len(set(wndw)) == 1: # only invoke a partition shift if there is no further partition shift within n=smoother basepairs upstream
current_part[1] = i # set end position of current partition
if current_state == 0: # append concluded partition to partitions_intron or partitions_exon, depending on current_state
partitions_intron.append(current_part)
else:
partitions_exon.append(current_part)
current_part = [i+1, "NA"] # initialise new current position
current_state = binpart[i] # initialise new current state
if i == len(binpart)-1: # conclude and append last partition
current_part[1] = i+1
if current_state == 0: # append concluded partition to partitions_intron or partitions_exon, depending on current_state
partitions_intron.append(current_part)
else:
partitions_exon.append(current_part)
# write RAxML style partition file
with open(".".join(fn.split(".")[:-1])+"_part.txt", "w") as partfile:
print("DNA, intron = " + ", ".join(["-".join([str(j) for j in i]) for i in partitions_intron]), file=partfile)
print("DNA, exon = " + ", ".join(["-".join([str(j) for j in i]) for i in partitions_exon]), file=partfile)
# write alignment without exon sequences
with open(".".join(fn.split(".")[:-1])+"_clean.fasta", "w") as al:
SeqIO.write(sequences, al, "fasta")