1
1
import asyncio
2
2
import itertools
3
3
import os
4
- import statistics
5
4
from copy import deepcopy
6
5
from typing import Any , List , Optional
7
6
30
29
load_dotenv ()
31
30
32
31
DEFAULT_OPENAI_MODEL = "gpt-3.5-turbo"
32
+ REQUEST_DELAY = 0.1
33
+ MIN_MIDRANGE_THRESHOLD = 0.01
33
34
34
35
35
36
class OpenAIAttributor (BaseAsyncLLMAttributor ):
@@ -156,11 +157,14 @@ async def hierarchical_perturbation(self, input_text: str, init_chunksize: int,
156
157
157
158
final_scores = np .zeros (token_count )
158
159
total_llm_calls = 1
160
+ stage = 0
159
161
160
162
while masks :
163
+ print (f"Stage { stage } " )
161
164
new_masks = []
162
165
perturbation_scores = []
163
166
perturbations = []
167
+ masked_out = []
164
168
for mask in masks :
165
169
perturbed_units = [token if not mask [i ] else perturbation_strategy .replacement_token for i , token in enumerate (tokens )]
166
170
# TODO: Check this is correct unit > token conversion
@@ -173,7 +177,9 @@ async def hierarchical_perturbation(self, input_text: str, init_chunksize: int,
173
177
"token_idx" : np .where (mask )[0 ].tolist (),
174
178
}
175
179
)
180
+ masked_out .append ([self .tokenizer .convert_tokens_to_string (list (itertools .chain .from_iterable (itertools .compress (tokens , mask )))).strip ()])
176
181
182
+ print ("Masked out tokens/words:" , * masked_out , sep = "\n " )
177
183
outputs = await self .compute_attribution_chunks (perturbations )
178
184
chunk_scores = self .get_scores (outputs , original_output , ** kwargs )
179
185
total_llm_calls += len (outputs )
@@ -217,7 +223,7 @@ async def hierarchical_perturbation(self, input_text: str, init_chunksize: int,
217
223
final_scores [mask ] = attr_score
218
224
219
225
midrange_score = (np .max (perturbation_scores ) + np .min (perturbation_scores )) / 2
220
- if midrange_score < 0.01 :
226
+ if midrange_score < MIN_MIDRANGE_THRESHOLD :
221
227
break
222
228
223
229
for mask , score in zip (masks , perturbation_scores ):
@@ -236,6 +242,7 @@ async def hierarchical_perturbation(self, input_text: str, init_chunksize: int,
236
242
new_masks .append (mask2 )
237
243
238
244
masks = new_masks
245
+ stage += 1
239
246
240
247
if logger :
241
248
logger .df_token_attribution_matrix = logger .df_token_attribution_matrix .drop_duplicates (subset = ["exp_id" , "input_token_pos" , "output_token" ], keep = "last" ).sort_values (by = ["input_token_pos" , "output_token_pos" ]).reset_index (drop = True )
@@ -301,7 +308,7 @@ async def compute_attribution_chunks(self, perturbations: list[dict[str, Any]])
301
308
tasks [i ] for i in range (idx , min (idx + self .request_chunksize , len (tasks )))
302
309
]
303
310
outputs .extend (await asyncio .gather (* batch ))
304
- await asyncio .sleep (0.1 )
311
+ await asyncio .sleep (REQUEST_DELAY )
305
312
else :
306
313
outputs = await asyncio .gather (* tasks )
307
314
0 commit comments