# NLPGPT2024.py
# /* Copyright (C) Kannan Sekar Annu Radha - All Rights Reserved
# * Unauthorized copying of this file, via any medium is strictly prohibited
# * Proprietary and confidential
# * Written by Kannan Sekar Annu Radha <kannansekara@gmail.com>, November 2019
# */ NHS DIGITAL MRS PRIYA BASKER AND MR JOHNATHAN HOPE
# Innovative uses of Data team NHS DIGITAL
import multiprocessing
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential, load_model
from keras.utils import to_categorical
from nltk.corpus import stopwords
from scipy import spatial
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Load the ICD diagnosis descriptions
print("Loading data from 'd_icd_diagnoses.csv'...")
# ICD-9 codes are identifiers, not quantities: read the column as text so
# pandas never converts a code to a number (which would drop leading zeros).
icdf2 = pd.read_csv('d_icd_diagnoses.csv', dtype={'icd9_code': str})
print("Data loaded. First 5 rows:")
print(icdf2.head())

# Fetch the NLTK English stopword list; clean_text() filters tokens against it.
print("Downloading NLTK stopwords...")
nltk.download('stopwords')
# A set gives constant-time membership tests during token filtering.
stop_words = set(stopwords.words('english'))
print("Stopwords downloaded.")
# Function to clean and tokenize text
def clean_text(text, stopword_set=None):
    """Split *text* into lowercase alphabetic tokens with stopwords removed.

    Every character that is not an ASCII letter is treated as a separator,
    so digits and punctuation never appear in the result.

    Args:
        text: The string to tokenize. Any non-string value (for example the
            NaN pandas uses for an empty CSV cell) yields an empty list
            instead of raising inside ``re.sub``.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        A list of lowercase tokens in their original order.
    """
    if not isinstance(text, str):
        return []
    if stopword_set is None:
        stopword_set = stop_words
    # Replace non-letters with spaces first so "sat-on" splits into two tokens.
    tokens = re.sub(r"[^a-zA-Z]", " ", text).lower().split()
    return [token for token in tokens if token not in stopword_set]
# Tokenize every 'long_title' with clean_text and store the token lists
print("Cleaning text in 'long_title' column...")
title_tokens = icdf2['long_title'].map(clean_text)
icdf2['cleaned_long_title'] = title_tokens
print("Text cleaned. First 5 rows of cleaned text:")
print(icdf2.head()[['long_title', 'cleaned_long_title']])
# The Word2Vec corpus is the list of token lists, one entry per description
raw_corpus = title_tokens.tolist()
# Word2Vec hyperparameters
num_features = 100  # length of each word embedding
min_word_count = 1  # passed as min_count: words seen once are still kept
num_workers = multiprocessing.cpu_count()  # one training thread per reported CPU
context_size = 7  # passed as window
downsampling = 1e-3  # passed as sample (frequent-word downsampling)
seed = 1  # RNG seed
# NOTE(review): gensim documents that a fixed seed is only fully
# reproducible with a single worker thread - confirm whether that matters here.

# Create the (still untrained) Word2Vec model
print("Building Word2Vec model with vector size:", num_features)
word2vec_settings = {
    "vector_size": num_features,  # gensim 4.x name (formerly `size`)
    "window": context_size,
    "min_count": min_word_count,
    "sample": downsampling,
    "workers": num_workers,
    "seed": seed,
    "sg": 1,
}
icd2vec = Word2Vec(**word2vec_settings)

# Scan the corpus for the vocabulary, then run the training passes
print("Building vocabulary...")
icd2vec.build_vocab(raw_corpus)
print("Vocabulary size:", len(icd2vec.wv))
print("Training Word2Vec model...")
icd2vec.train(
    raw_corpus,
    epochs=10,
    total_examples=icd2vec.corpus_count,
)
print("Word2Vec model trained.")
# Save the model
model_path = os.path.join("trained", "icd2vec.model")
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("trained", exist_ok=True)
icd2vec.save(model_path)
print("Word2Vec model saved.")
# Load the model back from the file that was just written
icd2vec = Word2Vec.load(model_path)
print("Word2Vec model loaded.")
# Optional: project the word embeddings to two dimensions with t-SNE and plot them
print("Preparing to visualize word embeddings using t-SNE...")
all_word_vectors_matrix = icd2vec.wv.vectors  # gensim 4.x attribute
# Fixed random_state so the projection is seeded the same way on each run
tsne = TSNE(random_state=0, n_components=2)
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
# One row per vocabulary word: the word and its two projected coordinates
points = pd.DataFrame(
    [
        (word, x_coord, y_coord)
        for word, (x_coord, y_coord) in zip(
            icd2vec.wv.index_to_key, all_word_vectors_matrix_2d
        )
    ],
    columns=["word", "x", "y"],
)
# Scatter plot of the projected embeddings
sns.set_context("poster")
print("Plotting word embeddings...")
points.plot.scatter(x="x", y="y", s=10, figsize=(20, 12))
plt.show()
# Function to get sentence vector by averaging word vectors
def get_sentence_vector(sentence, model):
    """Return the mean embedding of the in-vocabulary tokens of *sentence*.

    Args:
        sentence: Iterable of token strings.
        model: Word2Vec-style model exposing ``wv`` (with ``key_to_index``,
            ``vectors`` and list indexing) and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``. When none of the tokens
        is in the vocabulary (including an empty sentence) the result is an
        all-zero vector with the same dtype as the stored embeddings, so
        stacking results never silently upcasts the whole matrix.
    """
    keyed_vectors = model.wv
    # Tokens missing from the vocabulary would raise on lookup, so drop them.
    known_tokens = [
        token for token in sentence if token in keyed_vectors.key_to_index
    ]
    if not known_tokens:
        return np.zeros(model.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(keyed_vectors[known_tokens], axis=0)
# Apply get_sentence_vector to create one vector per ICD code description
print("Generating sentence vectors for each cleaned long title...")
icdf2['vector'] = icdf2['cleaned_long_title'].apply(lambda x: get_sentence_vector(x, icd2vec))
print("First 5 vectors generated:")
print(icdf2['vector'].head())
# Set 'icd9_code' as the index for easy mapping
icdf2.set_index('icd9_code', inplace=True)
# Normalise the lookup keys to text so they compare equal to the diagnosis
# codes, which are read as strings below.
icdf2.index = icdf2.index.astype(str)
# Load diagnosis data
print("Loading diagnosis data from 'diagnoses_icd.csv'...")
# Force the code column to text: with inferred types pandas may parse some
# codes as integers (dropping leading zeros), and those rows would then fail
# the lookup and be dropped as "missing".
diagdf = pd.read_csv("diagnoses_icd.csv", dtype={'icd9_code': str})
print("Diagnosis data loaded. First 5 rows:")
print(diagdf.head())
# Map vectors to diagdf based on 'icd9_code'
print("Mapping vectors to diagnosis dataframe based on 'icd9_code'...")
diagdf['vector'] = diagdf['icd9_code'].map(icdf2['vector'])
# Drop rows whose code has no entry in the ICD dictionary
diagdf.dropna(subset=['vector'], inplace=True)
print("Rows with missing vectors dropped. Remaining data size:", diagdf.shape)
# Encode each distinct ICD-9 code as an integer class id
print("Mapping ICD-9 codes to integer labels...")
label_encoder = LabelEncoder()
diagdf['icd9_code_label'] = label_encoder.fit_transform(diagdf['icd9_code'])
# Assemble the model inputs and targets
print("Preparing feature matrix X and target vector y...")
num_classes = len(label_encoder.classes_)
# Targets: one-hot rows, one column per encoded class
y = to_categorical(diagdf['icd9_code_label'].values, num_classes=num_classes)
# Features: stack the per-row vectors, then insert a length-1 timestep axis
# so the array is (samples, timesteps, features) as the LSTM layer expects
X = np.expand_dims(np.vstack(diagdf['vector'].values), axis=1)
# Hold out 20% of the rows for testing (fixed random_state)
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print("Data split complete. Training size:", X_train.shape, ", Test size:", X_test.shape)
# LSTM classifier: one LSTM layer, dropout, then a softmax over all classes
print("Building the LSTM model...")
model = Sequential()
lstm_input_shape = (X_train.shape[1], X_train.shape[2])  # (timesteps, features)
for layer in (
    LSTM(100, input_shape=lstm_input_shape),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),  # one output unit per class
):
    model.add(layer)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()
# Fit the model; the held-out split is passed as validation data
print("Training the LSTM model...")
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test, y_test),
)
print("Model training complete.")
# Predict class probabilities for the test set and take the most likely class
print("Making predictions on the test set...")
y_pred_prob = model.predict(X_test)
y_pred_classes = y_pred_prob.argmax(axis=1)
# Recover the true class ids from the one-hot test targets
y_test_classes = y_test.argmax(axis=1)
# Translate both sets of class ids back to ICD codes
predicted_icd_codes = label_encoder.inverse_transform(y_pred_classes)
actual_icd_codes = label_encoder.inverse_transform(y_test_classes)
# Put actual and predicted codes side by side
print("Comparing predicted and actual ICD-9 codes...")
results = pd.DataFrame(
    dict(
        Actual_ICD9_Code=actual_icd_codes,
        Predicted_ICD9_Code=predicted_icd_codes,
    )
)
print("First 5 predictions vs actual results:")
print(results.head())
# Optionally, print the descriptions for each prediction
print("Printing the actual and predicted ICD-9 codes with their descriptions...")
# icdf2 is indexed by ICD-9 code here. Join each code's tokens once up front
# so the loop does plain dict lookups instead of two DataFrame row lookups
# per prediction.
description_by_code = {
    code: ' '.join(tokens)
    for code, tokens in icdf2['cleaned_long_title'].items()
}
for actual_code, predicted_code in zip(
    results['Actual_ICD9_Code'], results['Predicted_ICD9_Code']
):
    actual_desc = description_by_code[actual_code]
    predicted_desc = description_by_code[predicted_code]
    print(f"Actual ICD-9 Code: {actual_code}, Description: {actual_desc}")
    print(f"Predicted ICD-9 Code: {predicted_code}, Description: {predicted_desc}\n")