@@ -31,10 +31,6 @@ print(listdir(wksp_dir))
31
31
# Add folder to colab's path so we can import the helper functions
32
32
import sys
33
33
sys.path.insert(0 , wksp_dir)
34
-
35
- # Read the data back in.
36
- from pandas import read_csv
37
- data = read_csv(" /content/drive/My Drive/Colab Notebooks/text-analysis/data/data.csv" )
38
34
```
39
35
40
36
~~~
@@ -57,193 +53,26 @@ Mounted at /content/drive
57
53
```
58
54
59
55
### Load in the data
60
- Create list of files we'll use for our analysis. We'll start by fitting a word2vec model to just one of the books in our list — Moby Dick.
61
-
62
- Get list of files available to analyze
63
-
64
- ``` python
65
- from helpers import create_file_list
66
- data_dir = wksp_dir + ' /data/books/'
67
- corpus_file_list = create_file_list(data_dir, " *.txt" )
68
- corpus_file_list[0 :5 ]
69
- ```
70
-
71
- ~~~
72
- ['/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dickens-bleakhouse.txt',
73
- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dumas-blacktulip.txt',
74
- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/austen-northanger.txt',
75
- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dickens-christmascarol.txt',
76
- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/austen-persuasion.txt']
77
- ~~~
78
- {: .output}
79
-
80
- Parse filelist into a dataframe. Make sure you don't have any extra forward slashes in the pattern — this will cause an error in the helper function.
81
-
82
- ``` python
83
- pattern = data_dir + " {Author} -{Title} .txt"
84
- pattern
85
- ```
86
-
87
- ~~~
88
- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/{Author}-{Title}.txt'
89
- ~~~
90
- {: .output}
91
56
92
57
``` python
93
- from helpers import parse_into_dataframe
94
- data = parse_into_dataframe(data_dir + " {Author} - {Title} .txt " , corpus_file_list)
95
- data.head( )
58
+ # Read the data back in.
59
+ from pandas import read_csv
60
+ data = read_csv( " /content/drive/My Drive/Colab Notebooks/text-analysis/data/data.csv " )
96
61
```
97
62
98
-
99
-
100
- <div id =" df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c " >
101
- <div class="colab-df-container">
102
- <div>
103
- <style scoped >
104
- .dataframe tbody tr th :only-of-type {
105
- vertical-align : middle ;
106
- }
107
-
108
- .dataframe tbody tr th {
109
- vertical-align : top ;
110
- }
111
-
112
- .dataframe thead th {
113
- text-align : right ;
114
- }
115
- </style >
116
- <table border =" 1 " class =" dataframe " >
117
- <thead >
118
- <tr style="text-align: right;">
119
- <th></th>
120
- <th>Author</th>
121
- <th>Title</th>
122
- <th>File</th>
123
- </tr>
124
- </thead >
125
- <tbody >
126
- <tr>
127
- <th>0</th>
128
- <td>dickens</td>
129
- <td>bleakhouse</td>
130
- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
131
- </tr>
132
- <tr>
133
- <th>1</th>
134
- <td>dumas</td>
135
- <td>blacktulip</td>
136
- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
137
- </tr>
138
- <tr>
139
- <th>2</th>
140
- <td>austen</td>
141
- <td>northanger</td>
142
- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
143
- </tr>
144
- <tr>
145
- <th>3</th>
146
- <td>dickens</td>
147
- <td>christmascarol</td>
148
- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
149
- </tr>
150
- <tr>
151
- <th>4</th>
152
- <td>austen</td>
153
- <td>persuasion</td>
154
- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
155
- </tr>
156
- </tbody >
157
- </table >
158
- </div >
159
- <button class="colab-df-convert" onclick="convertToInteractive('df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c')"
160
- title="Convert this dataframe to an interactive table."
161
- style="display:none;">
162
-
163
- <svg xmlns="http://www.w3.org/2000/svg " height="24px"viewBox="0 0 24 24"
164
- width="24px">
165
- <path d =" M0 0h24v24H0V0z " fill =" none " />
166
- <path d =" M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z " /><path d =" M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z " />
167
- </svg >
168
- </button>
169
-
170
- <style >
171
- .colab-df-container {
172
- display :flex ;
173
- flex-wrap :wrap ;
174
- gap : 12px ;
175
- }
176
-
177
- .colab-df-convert {
178
- background-color : #E8F0FE ;
179
- border : none ;
180
- border-radius : 50% ;
181
- cursor : pointer ;
182
- display : none ;
183
- fill : #1967D2 ;
184
- height : 32px ;
185
- padding : 0 0 0 0 ;
186
- width : 32px ;
187
- }
188
-
189
- .colab-df-convert :hover {
190
- background-color : #E2EBFA ;
191
- box-shadow : 0px 1px 2px rgba (60 , 64 , 67 , 0.3 ), 0px 1px 3px 1px rgba (60 , 64 , 67 , 0.15 );
192
- fill : #174EA6 ;
193
- }
194
-
195
- [theme = dark ] .colab-df-convert {
196
- background-color : #3B4455 ;
197
- fill : #D2E3FC ;
198
- }
199
-
200
- [theme = dark ] .colab-df-convert :hover {
201
- background-color : #434B5C ;
202
- box-shadow : 0px 1px 3px 1px rgba (0 , 0 , 0 , 0.15 );
203
- filter : drop-shadow (0px 1px 2px rgba (0 , 0 , 0 , 0.3 ));
204
- fill : #FFFFFF ;
205
- }
206
- </style >
207
-
208
- <script>
209
- const buttonEl =
210
- document.querySelector('#df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c button.colab-df-convert');
211
- buttonEl.style.display =
212
- google.colab.kernel.accessAllowed ? 'block' : 'none';
213
-
214
- async function convertToInteractive(key) {
215
- const element = document.querySelector('#df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c');
216
- const dataTable =
217
- await google.colab.kernel.invokeFunction('convertToInteractive',
218
- [key], {});
219
- if (!dataTable) return;
220
-
221
- const docLinkHtml = 'Like what you see? Visit the ' +
222
- '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
223
- + ' to learn more about interactive tables.';
224
- element.innerHTML = '';
225
- dataTable['output_type'] = 'display_data';
226
- await google.colab.output.renderOutput(dataTable, element);
227
- const docLink = document.createElement('div');
228
- docLink.innerHTML = docLinkHtml;
229
- element.appendChild(docLink);
230
- }
231
- </script>
232
- </div>
233
- </div >
234
-
63
+ Create a list of files we'll use for our analysis. We'll start by fitting a word2vec model to just one of the books in our list — Moby Dick.
235
64
236
65
``` python
237
66
single_file = data.loc[data[' Title' ] == ' moby_dick' ,' File' ].item()
238
67
single_file
239
- ```
240
68
69
+ ```
241
70
~~~
242
71
'/content/drive/My Drive/Colab Notebooks/text-analysis/data/melville-moby_dick.txt'
243
72
~~~
244
73
{: .output}
245
74
246
- Let's preview the file contents to make sure our code so far is working correctly.
75
+ Let's preview the file contents to make sure our code and directory setup are working correctly.
247
76
248
77
``` python
249
78
# open and read file
0 commit comments