Skip to content

Commit 07aed42

Browse files
Update 09-wordEmbed_train-word2vec.md
1 parent 2370608 commit 07aed42

File tree

1 file changed

+6
-177
lines changed

1 file changed

+6
-177
lines changed

_episodes/09-wordEmbed_train-word2vec.md

+6-177
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ print(listdir(wksp_dir))
3131
# Add folder to colab's path so we can import the helper functions
3232
import sys
3333
sys.path.insert(0, wksp_dir)
34-
35-
# Read the data back in.
36-
from pandas import read_csv
37-
data = read_csv("/content/drive/My Drive/Colab Notebooks/text-analysis/data/data.csv")
3834
```
3935

4036
~~~
@@ -57,193 +53,26 @@ Mounted at /content/drive
5753
```
5854

5955
### Load in the data
60-
Create list of files we'll use for our analysis. We'll start by fitting a word2vec model to just one of the books in our list — Moby Dick.
61-
62-
Get list of files available to analyze
63-
64-
```python
65-
from helpers import create_file_list
66-
data_dir = wksp_dir + '/data/books/'
67-
corpus_file_list = create_file_list(data_dir, "*.txt")
68-
corpus_file_list[0:5]
69-
```
70-
71-
~~~
72-
['/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dickens-bleakhouse.txt',
73-
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dumas-blacktulip.txt',
74-
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/austen-northanger.txt',
75-
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dickens-christmascarol.txt',
76-
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/austen-persuasion.txt']
77-
~~~
78-
{: .output}
79-
80-
Parse filelist into a dataframe. Make sure you don't have any extra forward slashes in the pattern — this will cause an error in the helper function.
81-
82-
```python
83-
pattern = data_dir + "{Author}-{Title}.txt"
84-
pattern
85-
```
86-
87-
~~~
88-
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/{Author}-{Title}.txt'
89-
~~~
90-
{: .output}
9156

9257
```python
93-
from helpers import parse_into_dataframe
94-
data = parse_into_dataframe(data_dir + "{Author}-{Title}.txt", corpus_file_list)
95-
data.head()
58+
# Read the data back in.
59+
from pandas import read_csv
60+
data = read_csv("/content/drive/My Drive/Colab Notebooks/text-analysis/data/data.csv")
9661
```
9762

98-
99-
100-
<div id="df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c">
101-
<div class="colab-df-container">
102-
<div>
103-
<style scoped>
104-
.dataframe tbody tr th:only-of-type {
105-
vertical-align: middle;
106-
}
107-
108-
.dataframe tbody tr th {
109-
vertical-align: top;
110-
}
111-
112-
.dataframe thead th {
113-
text-align: right;
114-
}
115-
</style>
116-
<table border="1" class="dataframe">
117-
<thead>
118-
<tr style="text-align: right;">
119-
<th></th>
120-
<th>Author</th>
121-
<th>Title</th>
122-
<th>File</th>
123-
</tr>
124-
</thead>
125-
<tbody>
126-
<tr>
127-
<th>0</th>
128-
<td>dickens</td>
129-
<td>bleakhouse</td>
130-
<td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
131-
</tr>
132-
<tr>
133-
<th>1</th>
134-
<td>dumas</td>
135-
<td>blacktulip</td>
136-
<td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
137-
</tr>
138-
<tr>
139-
<th>2</th>
140-
<td>austen</td>
141-
<td>northanger</td>
142-
<td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
143-
</tr>
144-
<tr>
145-
<th>3</th>
146-
<td>dickens</td>
147-
<td>christmascarol</td>
148-
<td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
149-
</tr>
150-
<tr>
151-
<th>4</th>
152-
<td>austen</td>
153-
<td>persuasion</td>
154-
<td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
155-
</tr>
156-
</tbody>
157-
</table>
158-
</div>
159-
<button class="colab-df-convert" onclick="convertToInteractive('df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c')"
160-
title="Convert this dataframe to an interactive table."
161-
style="display:none;">
162-
163-
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
164-
width="24px">
165-
<path d="M0 0h24v24H0V0z" fill="none"/>
166-
<path d="M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z"/><path d="M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z"/>
167-
</svg>
168-
</button>
169-
170-
<style>
171-
.colab-df-container {
172-
display:flex;
173-
flex-wrap:wrap;
174-
gap: 12px;
175-
}
176-
177-
.colab-df-convert {
178-
background-color: #E8F0FE;
179-
border: none;
180-
border-radius: 50%;
181-
cursor: pointer;
182-
display: none;
183-
fill: #1967D2;
184-
height: 32px;
185-
padding: 0 0 0 0;
186-
width: 32px;
187-
}
188-
189-
.colab-df-convert:hover {
190-
background-color: #E2EBFA;
191-
box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
192-
fill: #174EA6;
193-
}
194-
195-
[theme=dark] .colab-df-convert {
196-
background-color: #3B4455;
197-
fill: #D2E3FC;
198-
}
199-
200-
[theme=dark] .colab-df-convert:hover {
201-
background-color: #434B5C;
202-
box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
203-
filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
204-
fill: #FFFFFF;
205-
}
206-
</style>
207-
208-
<script>
209-
const buttonEl =
210-
document.querySelector('#df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c button.colab-df-convert');
211-
buttonEl.style.display =
212-
google.colab.kernel.accessAllowed ? 'block' : 'none';
213-
214-
async function convertToInteractive(key) {
215-
const element = document.querySelector('#df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c');
216-
const dataTable =
217-
await google.colab.kernel.invokeFunction('convertToInteractive',
218-
[key], {});
219-
if (!dataTable) return;
220-
221-
const docLinkHtml = 'Like what you see? Visit the ' +
222-
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
223-
+ ' to learn more about interactive tables.';
224-
element.innerHTML = '';
225-
dataTable['output_type'] = 'display_data';
226-
await google.colab.output.renderOutput(dataTable, element);
227-
const docLink = document.createElement('div');
228-
docLink.innerHTML = docLinkHtml;
229-
element.appendChild(docLink);
230-
}
231-
</script>
232-
</div>
233-
</div>
234-
63+
Create a list of the files we'll use for our analysis. We'll start by fitting a word2vec model to just one of the books in our list — Moby Dick.
23564

23665
```python
23766
single_file = data.loc[data['Title'] == 'moby_dick','File'].item()
23867
single_file
239-
```
24068

69+
```
24170
~~~
24271
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/melville-moby_dick.txt'
24372
~~~
24473
{: .output}
24574

246-
Let's preview the file contents to make sure our code so far is working correctly.
75+
Let's preview the file contents to make sure our code and directory setup are working correctly.
24776

24877
```python
24978
# open and read file

0 commit comments

Comments
 (0)