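Addressed PR comments (extracted make_output_location_invariant, added DEFAULT_OPENAI_MODEL, moved NEAR_ZERO_PROB to module level, renamed unclear variables) - thanks guys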

jessicarumbelow · jessicarumbelow · commit dda365b05455 · 2024-07-04T14:38:18.000+01:00
diff --git a/attribution/api_attribution.py b/attribution/api_attribution.py
@@ -12,6 +12,7 @@
 )
 
 from .attribution_metrics import (
+    NEAR_ZERO_PROB,
     cosine_similarity_attribution,
     token_prob_attribution,
 )
@@ -24,6 +25,8 @@
 
 load_dotenv()
 
+DEFAULT_OPENAI_MODEL = "gpt-3.5-turbo"
+
 
 class OpenAIAttributor(BaseLLMAttributor):
     def __init__(
@@ -35,7 +38,7 @@ def __init__(
     ):
         openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
         self.openai_client = openai.OpenAI(api_key=openai_api_key)
-        self.openai_model = openai_model or "gpt-3.5-turbo"
+        self.openai_model = openai_model or DEFAULT_OPENAI_MODEL
 
         self.tokenizer = tokenizer or GPT2Tokenizer.from_pretrained("gpt2")
         self.token_embeddings = token_embeddings or GPT2LMHeadModel.from_pretrained("gpt2").transformer.wte.weight.detach().numpy()
@@ -51,6 +54,46 @@ def get_chat_completion(self, input: str) -> openai.types.chat.chat_completion.C
             top_logprobs=20,
         )
         return response.choices[0]
+    
+    def make_output_location_invariant(self, original_output, perturbed_output):
+        # Work on a deep copy of the original output so it can be updated with the perturbed output's logprobs wherever a token from the unperturbed output is found in the perturbed output; original_output itself is not modified.
+        location_invariant_output = deepcopy(original_output)
+
+        # Flatten the top_logprobs entries of every perturbed output position into two parallel lists (tokens and their logprobs); position information is discarded.
+        all_top_logprobs = []
+        all_tokens = []
+        for perturbed_token in perturbed_output.logprobs.content:
+            all_top_logprobs.extend([token_logprob.logprob for token_logprob in perturbed_token.top_logprobs])
+            all_tokens.extend([token_logprob.token for token_logprob in perturbed_token.top_logprobs])
+
+        # Sort the tokens and logprobs by logprob in descending order. list.index returns the first occurrence of a token, so after sorting the lookup below yields the highest logprob collected for that token.
+        sorted_indexes = sorted(range(len(all_top_logprobs)), key=all_top_logprobs.__getitem__, reverse=True)
+        all_tokens_sorted = [all_tokens[s] for s in sorted_indexes]
+        all_top_logprobs_sorted = [all_top_logprobs[s] for s in sorted_indexes]
+
+        # Now, for each token in the copied original output: if it is found among the perturbed output's tokens, replace its logprob with the logprob from the perturbed output.
+        # Otherwise, set the logprob to NEAR_ZERO_PROB (a logprob standing in for near-zero probability).
+        
+        for unperturbed_token in location_invariant_output.logprobs.content:
+            if unperturbed_token.token in all_tokens_sorted:
+                perturbed_logprob = all_top_logprobs_sorted[all_tokens_sorted.index(unperturbed_token.token)]
+            else:
+                perturbed_logprob = NEAR_ZERO_PROB
+            
+            # Update the main token logprob
+            unperturbed_token.logprob = perturbed_logprob
+
+            # Update the entry for the same token in this position's top_logprobs (duplicate information, but kept for consistency with the original output structure / OpenAI format)
+            for top_logprob in unperturbed_token.top_logprobs:
+                if top_logprob.token == unperturbed_token.token:
+                    top_logprob.logprob = perturbed_logprob
+
+        # Replace the copy's message content with the perturbed output's message content
+        location_invariant_output.message.content = perturbed_output.message.content
+
+        # The returned copy keeps the original output's tokens, but carries the logprobs and message content taken from the perturbed output.
+        return location_invariant_output
+
 
     def compute_attributions(self, input_text: str, **kwargs):
         perturbation_strategy: PerturbationStrategy = kwargs.get(
@@ -64,7 +107,6 @@ def compute_attributions(self, input_text: str, **kwargs):
         ignore_output_token_location: bool = kwargs.get("ignore_output_token_location", True)
 
         original_output = self.get_chat_completion(input_text)
-        remaining_output = deepcopy(original_output)
 
         if logger:
             logger.start_experiment(
@@ -114,34 +156,9 @@ def compute_attributions(self, input_text: str, **kwargs):
             # Get the output logprobs for the perturbed input
             perturbed_output = self.get_chat_completion(perturbed_input)
 
-
             if ignore_output_token_location:
-
-                all_top_logprobs = []
-                all_toks = []
-                for ptl in perturbed_output.logprobs.content:
-                    all_top_logprobs.extend([tl.logprob for tl in ptl.top_logprobs])
-                    all_toks.extend([tl.token for tl in ptl.top_logprobs])
-
-                sorted_indexes = sorted(range(len(all_top_logprobs)), key=all_top_logprobs.__getitem__, reverse=True)
-                all_toks = [all_toks[s] for s in sorted_indexes]
-                all_top_logprobs = [all_top_logprobs[s] for s in sorted_indexes]
-
-                for otl in remaining_output.logprobs.content:
-                    if otl.token in all_toks:
-                        new_lp = all_top_logprobs[all_toks.index(otl.token)]
-                        
-                    else:
-                        new_lp = -100
-                    
-                    otl.logprob = new_lp
-                    for tl in otl.top_logprobs:
-                        if tl.token == otl.token:
-                            tl.logprob = new_lp
-
-                remaining_output.message.content = perturbed_output.message.content
-                perturbed_output = remaining_output
-
+                perturbed_output = self.make_output_location_invariant(original_output, perturbed_output)
+                
             for attribution_strategy in attribution_strategies:
                 if attribution_strategy == "cosine":
                     sentence_attr, attributed_tokens, token_attributions = cosine_similarity_attribution(
diff --git a/attribution/attribution_metrics.py b/attribution/attribution_metrics.py
@@ -6,6 +6,7 @@
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import PreTrainedTokenizer
 
+NEAR_ZERO_PROB = -100  # Logprob constant for near zero probability
 
 def token_prob_attribution(
     initial_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
@@ -25,7 +26,6 @@ def token_prob_attribution(
 
     # Probability change for each input token
     prob_difference_per_token = np.zeros(len(initial_tokens))
-    NEAR_ZERO_PROB = -100  # Logprob constant for near zero probability
 
     # Calculate the absolute difference in probabilities for each token
     for i, initial_token in enumerate(initial_token_logprobs):
@@ -44,18 +44,18 @@ def cosine_similarity_attribution(
     perturbed_output_str: str,
     token_embeddings: np.ndarray,
     tokenizer: PreTrainedTokenizer,
-) -> Tuple[float, np.ndarray]:
+) -> Tuple[float, list[str], np.ndarray]:
     # Extract embeddings
 
-    original_token_ix = tokenizer.encode(original_output_str, return_tensors="pt", add_special_tokens=False)
-    perturbed_token_ix = tokenizer.encode(perturbed_output_str, return_tensors="pt", add_special_tokens=False)
-    initial_tokens = [tokenizer.decode(t) for t in original_token_ix.squeeze(axis=0)]
+    original_token_id = tokenizer.encode(original_output_str, return_tensors="pt", add_special_tokens=False)
+    perturbed_token_id = tokenizer.encode(perturbed_output_str, return_tensors="pt", add_special_tokens=False)
+    initial_tokens = [tokenizer.decode(t) for t in original_token_id.squeeze(axis=0)]
 
-    original_output_emb = token_embeddings[original_token_ix].reshape(-1, token_embeddings.shape[-1])
-    perturbed_output_emb = token_embeddings[perturbed_token_ix].reshape(-1, token_embeddings.shape[-1])
+    original_output_emb = token_embeddings[original_token_id].reshape(-1, token_embeddings.shape[-1])
+    perturbed_output_emb = token_embeddings[perturbed_token_id].reshape(-1, token_embeddings.shape[-1])
 
-    cd = 1-cosine_similarity(original_output_emb, perturbed_output_emb)
-    token_distance = cd.min(axis=-1)    
+    cosine_distance = 1-cosine_similarity(original_output_emb, perturbed_output_emb)
+    token_distance = cosine_distance.min(axis=-1)    
     return token_distance.mean(), initial_tokens, token_distance
 
 
diff --git a/examples/example_PIZZA.ipynb b/examples/example_PIZZA.ipynb