-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
225 lines (188 loc) · 10.2 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import os
import numpy as np
import pandas
import pickle
from flask import Flask, request, url_for, redirect, render_template, send_from_directory
import joblib
import pandas as pd
from werkzeug.utils import secure_filename
# Flask application object; the @app.route handlers below are registered on it.
app = Flask(__name__)
# embeddings function
def esm_embeddings(peptide_sequence_list: list):
    """Embed peptide sequences with the ESM-2 ``esm2_t6_8M_UR50D`` model.

    Each sequence is represented by the mean of its per-residue vectors taken
    from the model's last (6th) layer.

    NOTE: ESM embedding uses a lot of RAM. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the process.
    NOTE: if the pretrained model is not available locally it is downloaded
    automatically on first use.

    Args:
        peptide_sequence_list: list of ``(name, sequence)`` tuples, which is
            the input format of the ESM batch converter.

    Returns:
        A ``pandas.DataFrame`` with one row per input sequence (in input
        order) and one column per embedding dimension.
    """
    # Imported lazily: these packages are heavy and only needed here.
    import torch
    import esm

    model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    model.eval()  # disables dropout for deterministic results

    _labels, _strs, batch_tokens = batch_converter(peptide_sequence_list)
    # Count of non-padding tokens for every sequence in the batch.
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    # Extract per-residue representations (on CPU).
    with torch.no_grad():
        # 'esm2_t6_8M_UR50D' has 6 layers, so layer 6 is the final one.
        # Contact maps are never read below, so they are not requested;
        # computing them only adds memory and time to an already heavy call.
        results = model(batch_tokens, repr_layers=[6])
    token_representations = results["representations"][6]

    # Average over residues to get one vector per sequence.
    # Token 0 is always a beginning-of-sequence token, so the first residue is
    # token 1; the slice also drops the last non-padding token (presumably the
    # end-of-sequence token).
    rows = [
        token_representations[i, 1:tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)
# collect the output
def assign_activity(predicted_class):
    """Translate numeric class indices into activity labels.

    Class ``0`` maps to ``'active'``; any other value maps to
    ``'non-active'``.

    Args:
        predicted_class: sequence of predicted class indices.

    Returns:
        A list of label strings, one per input element, in the same order.
    """
    return [
        'active' if class_index == 0 else 'non-active'
        for class_index in predicted_class
    ]
def get_filetype(filename):
    """Return the lower-cased extension of *filename*, without the dot.

    Only the text after the last dot counts, so ``"a.tar.GZ"`` gives ``"gz"``.
    A name with no dot at all (including the empty string) yields ``''``
    instead of raising ``IndexError``, so callers can treat it as an
    unsupported type.

    Args:
        filename: file name to inspect.

    Returns:
        The extension in lower case, or ``''`` when there is none.
    """
    _stem, dot, extension = filename.rpartition('.')
    return extension.lower() if dot else ''
def model_selection(num: str):
    """Return the pickled-model file name for a model selector.

    Args:
        num: selector string; ``'1'``, ``'2'`` and ``'3'`` are recognised.

    Returns:
        ``'LR.pkl'``, ``'SVM.pkl'`` or ``'MLP.pkl'`` respectively, or ``''``
        for any other selector.
    """
    model_files = {
        '1': 'LR.pkl',
        '2': 'SVM.pkl',
        '3': 'MLP.pkl',
    }
    return model_files.get(num, '')
def text_fasta_reading(file_name):
    """Read the sequences from a FASTA-formatted ``.txt`` / ``.fasta`` file.

    A record starts at a line beginning with ``>`` (the header) and its
    sequence is every following non-blank line up to the next header, joined
    together, so sequences wrapped over several lines are read in full.
    Lines before the first header are ignored, and headers that are not
    followed by any sequence text are skipped.

    Args:
        file_name: path of the file to read.

    Returns:
        A ``pandas.Series`` (named ``0``) of sequence strings indexed by
        their header lines (``>`` included), in file order. The Series is
        empty when the file holds no usable record.
    """
    headers = []
    sequence_parts = []  # one list of sequence lines per header
    current_parts = None  # stays None until the first header is seen
    with open(file_name, mode='r') as file_read:
        for raw_line in file_read:
            line = raw_line.strip()  # drop the trailing newline and padding
            if not line:
                continue
            if line.startswith('>'):
                current_parts = []
                headers.append(line)
                sequence_parts.append(current_parts)
            elif current_parts is not None:
                current_parts.append(line)

    records = [
        (header, ''.join(parts))
        for header, parts in zip(headers, sequence_parts)
        if parts
    ]
    # Same shape as the ``dataset['sequence']`` column of an xlsx upload:
    # a Series of sequence strings.
    return pd.Series(
        [sequence for _header, sequence in records],
        index=[header for header, _sequence in records],
        dtype=object,
        name=0,
    )
# Route handlers (the Flask app object itself is created near the top of the file)
@app.route('/')
def home():
    """Serve the landing page by rendering ``index.html``."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/predict', methods=['POST'])
def predict():
    """Predict the activity of peptide sequences typed into the web form.

    The first form value selects the model and the second holds the peptide
    sequences separated by commas (for example ``AAA,CCC,SAS``); a comma is
    the only supported separator. The sequences are embedded with ESM,
    scaled with the scaler stored next to the chosen model, and classified.

    Returns:
        The rendered ``index.html``. On success it receives a
        ``prediction_text`` listing ``sequence: label;`` for every input
        sequence. If the form is incomplete, the model selector is not
        recognised, or no sequence was entered, the plain page is rendered
        instead.
    """
    # Every input box on the page is one form value; collect them all as
    # strings, in the order the form delivers them.
    # NOTE(review): this relies on the model selector being the first form
    # field and the sequences the second - confirm against index.html.
    int_features = [str(x) for x in request.form.values()]
    if len(int_features) < 2:
        return render_template('index.html')

    # model_selection() returns '' for anything it does not know, which also
    # covers non-numeric and out-of-range selectors.
    model_name = model_selection(int_features[0])
    if not model_name:
        return render_template('index.html')

    # NOTE: unpickling executes arbitrary code. The file names come from the
    # fixed list in model_selection(), never from user input; keep it so.
    with open(model_name, 'rb') as model_file:
        model = pickle.load(model_file)
    scaler = joblib.load(model_name + '.joblib')

    # Split on commas; drop surrounding whitespace and empty entries (for
    # instance the one produced by a trailing comma).
    sequence_list = [seq.strip() for seq in int_features[1].split(',')]
    sequence_list = [seq for seq in sequence_list if seq]
    if not sequence_list:
        return render_template('index.html')

    # ESM expects (name, sequence) tuples; the sequence doubles as its name.
    peptide_sequence_list = [(seq, seq) for seq in sequence_list]
    embeddings_results = esm_embeddings(peptide_sequence_list)
    normalized_embeddings_results = scaler.transform(embeddings_results)

    # prediction
    # NOTE(review): ``batch_size`` suggests a Keras-style model whose
    # predict() returns per-class scores - confirm for the pickled models.
    predicted_protability = model.predict(normalized_embeddings_results, batch_size=1)
    # The predicted class is the index of the highest score in each row.
    predicted_class = assign_activity([np.argmax(row) for row in predicted_protability])

    final_output = [
        seq + ': ' + label + ';'
        for seq, label in zip(sequence_list, predicted_class)
    ]
    return render_template('index.html',
                           prediction_text="Prediction results of input sequences {}".format(final_output))
@app.route('/pred_with_file', methods=['POST'])
def pred_with_file():
    """Predict the activity of peptide sequences from an uploaded file.

    The form field ``Model_selection`` selects the model and the file field
    ``Peptide_sequences`` carries the upload. ``.xls`` / ``.xlsx`` files are
    read from their ``sequence`` column; ``.txt`` / ``.fasta`` files are
    parsed with ``text_fasta_reading``. The predictions are written to
    ``input/report.xlsx`` and that file is sent back.

    Returns:
        The generated ``report.xlsx`` as the response, or the rendered
        ``index.html`` when the model selector is not recognised, the upload
        is missing or has no extension, or no sequence could be read.
    """
    input_dir = 'input'
    # Start from a clean working folder: create it if needed and remove the
    # files left by earlier requests (sub-directories are left alone).
    os.makedirs(input_dir, exist_ok=True)
    for entry in os.listdir(input_dir):
        entry_path = os.path.join(input_dir, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)

    # The page sends two inputs: the model type and the peptide sequences.
    features = request.form
    model_name = model_selection(features.get("Model_selection"))
    if not model_name:
        return render_template("index.html")

    upload = request.files.get("Peptide_sequences")
    if upload is None:
        return render_template("index.html")
    filename = secure_filename(upload.filename or '')
    if '.' not in filename:
        # No extension means the file type cannot be determined.
        return render_template("index.html")
    filetype = get_filetype(filename)
    save_location = os.path.join(input_dir, filename)
    upload.save(save_location)

    sequence_list = []
    if filetype in ('xls', 'xlsx'):
        df = pandas.read_excel(save_location, header=0)
        if "sequence" in df.columns:
            sequence_list = df["sequence"].tolist()
    elif filetype in ('txt', 'fasta'):
        try:
            sequence_list = text_fasta_reading(save_location)
        except (KeyError, IndexError, ValueError):
            # Malformed or undecodable file: treat it as holding no sequences.
            sequence_list = []
    # len() rather than truthiness: the FASTA reader returns a pandas Series.
    if len(sequence_list) == 0:
        return render_template("index.html")

    # NOTE: unpickling executes arbitrary code. The file names come from the
    # fixed list in model_selection(), never from user input; keep it so.
    with open(model_name, 'rb') as model_file:
        model = pickle.load(model_file)
    scaler = joblib.load(model_name + '.joblib')

    # ESM expects (name, sequence) tuples; the sequence doubles as its name.
    peptide_sequence_list = [(seq, seq) for seq in sequence_list]
    embeddings_results = esm_embeddings(peptide_sequence_list)
    normalized_embeddings_results = scaler.transform(embeddings_results)

    # prediction
    # NOTE(review): ``batch_size`` suggests a Keras-style model whose
    # predict() returns per-class scores - confirm for the pickled models.
    predicted_protability = model.predict(normalized_embeddings_results, batch_size=1)
    # The predicted class is the index of the highest score in each row.
    predicted_class = assign_activity([np.argmax(row) for row in predicted_protability])

    report = {"sequence": sequence_list, "activity": predicted_class}
    report_df = pandas.DataFrame(report)
    report_name = "report.xlsx"
    report_df.to_excel(os.path.join(input_dir, report_name))
    # Flask resolves a relative directory against the application root, not
    # the working directory the report was written to, so pass it absolute.
    return send_from_directory(os.path.abspath(input_dir), report_name)
if __name__ == "__main__":
    # Start Flask's built-in server only when this file is run as a script.
    app.run()