
Commit 571bffa

Attribution methods for API-accessible LLMs
Merge pull request #2 from leap-laboratories/attribution-by-api
2 parents 7247341 + f379193

6 files changed: +2025 -0 lines changed

.gitignore
+3

@@ -10,3 +10,6 @@ __pycache__
 
 # pytest
 .pytest_cache
+
+# environments
+.env

attribution/attribution_metrics.py
+177

@@ -0,0 +1,177 @@
import math
from typing import List, Tuple

import numpy as np
import openai
from sklearn.metrics.pairwise import cosine_similarity
from transformers import PreTrainedModel, PreTrainedTokenizer
def token_prob_difference(
    initial_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
    perturbed_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
) -> Tuple[float, List[str], np.ndarray]:
    # Extract token and logprob from initial_logprobs
    initial_token_logprobs = [
        (logprob.token, logprob.logprob) for logprob in initial_logprobs.content
    ]
    initial_tokens = [content.token for content in initial_logprobs.content]

    # Create a list of dictionaries with token and top logprobs from perturbed_logprobs
    perturbed_token_logprobs_list = [
        {
            top_logprob.token: top_logprob.logprob
            for top_logprob in token_content.top_logprobs
        }
        for token_content in perturbed_logprobs.content
    ]

    # Probability change for each input token
    prob_difference_per_token = np.zeros(len(initial_tokens))
    NEAR_ZERO_PROB = -100  # Logprob constant for near zero probability

    # Calculate the absolute difference in probabilities for each token
    for i, initial_token in enumerate(initial_token_logprobs):
        perturbed_token_logprobs = (
            perturbed_token_logprobs_list[i]
            if i < len(perturbed_token_logprobs_list)
            else {}
        )
        perturbed_logprob = perturbed_token_logprobs.get(
            initial_token[0], NEAR_ZERO_PROB
        )
        prob_difference_per_token[i] = abs(
            math.exp(initial_token[1]) - math.exp(perturbed_logprob)
        )

    # Note: Different length outputs shift the mean upwards. This may or may not be desired behaviour.
    return prob_difference_per_token.mean(), initial_tokens, prob_difference_per_token
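A minimal usage sketch (not part of the commit): the two ChoiceLogprobs arguments can come from the chat completions endpoint with logprobs enabled. The model name, prompts, and perturbation below are illustrative assumptions.

import openai

client = openai.OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def get_logprobs(prompt: str):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",  # illustrative model choice
        messages=[{"role": "user", "content": prompt}],
        logprobs=True,
        top_logprobs=20,  # matches the top-20 lists these metrics iterate over
        temperature=0,
    )
    return response.choices[0].logprobs

initial = get_logprobs("The capital of France is")
perturbed = get_logprobs("The capital of zq France is")  # hypothetical perturbed input

mean_diff, tokens, per_token = token_prob_difference(initial, perturbed)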
def token_displacement(
    initial_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
    perturbed_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
) -> Tuple[float, List[str], np.ndarray]:
    initial_tokens = [content.token for content in initial_logprobs.content]
    perturbed_top_tokens = [
        [top_logprob.token for top_logprob in token_content.top_logprobs]
        for token_content in perturbed_logprobs.content
    ]

    # Token displacement for each initially predicted token
    displacement_per_token = np.zeros(len(initial_tokens))
    MAX_TOKEN_DISPLACEMENT = 20
    for i, token in enumerate(initial_tokens):
        if i < len(perturbed_top_tokens) and token in perturbed_top_tokens[i]:
            displacement_per_token[i] = perturbed_top_tokens[i].index(token)
        else:
            displacement_per_token[i] = MAX_TOKEN_DISPLACEMENT  # TODO: Revise

    return displacement_per_token.mean(), initial_tokens, displacement_per_token
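A worked example with hand-built logprob objects (values are illustrative, and the constructor fields follow the openai Python types as I understand them): "Paris" drops from rank 0 to rank 2 under perturbation, so its displacement is 2; a token missing from the perturbed top-20 would get the cap of 20.

from openai.types.chat.chat_completion import ChoiceLogprobs
from openai.types.chat.chat_completion_token_logprob import (
    ChatCompletionTokenLogprob,
    TopLogprob,
)

def make_logprobs(positions):
    # Build a minimal ChoiceLogprobs from (token, logprob, [(top_token, top_logprob), ...]) tuples
    return ChoiceLogprobs(
        content=[
            ChatCompletionTokenLogprob(
                token=token,
                logprob=logprob,
                bytes=None,
                top_logprobs=[TopLogprob(token=t, logprob=lp, bytes=None) for t, lp in tops],
            )
            for token, logprob, tops in positions
        ]
    )

initial = make_logprobs([("Paris", -0.1, [("Paris", -0.1), ("Lyon", -3.0)])])
perturbed = make_logprobs([("Lyon", -0.5, [("Lyon", -0.5), ("Nice", -2.0), ("Paris", -2.5)])])

mean_disp, tokens, per_token = token_displacement(initial, perturbed)
# mean_disp == 2.0: "Paris" moved from rank 0 to rank 2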
# NOTE: this metric does not work. It is left here as a point of discussion
def deprecated_max_logprob_difference(
    initial_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
    perturbed_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
):
    # Get the logprobs of the top 20 tokens for the initial and perturbed outputs
    # Warning: this should probably be a list with the top logprobs at each token position instead
    initial_top_logprobs = {
        logprob.token: logprob.logprob for logprob in initial_logprobs.content
    }
    perturbed_top_logprobs = {
        logprob.token: logprob.logprob for logprob in perturbed_logprobs.content
    }

    # Calculate the maximum difference in logprobs
    max_difference = 0
    for token, initial_logprob in initial_top_logprobs.items():
        perturbed_logprob = perturbed_top_logprobs.get(token, 0)
        max_difference = max(max_difference, abs(initial_logprob - perturbed_logprob))

    return max_difference
def get_sentence_embeddings(
    sentence: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer
) -> Tuple[np.ndarray, np.ndarray]:
    inputs = tokenizer(sentence, return_tensors="pt")
    embeddings = model.transformer.wte(inputs["input_ids"])  # Get the embeddings
    embeddings = embeddings.detach().numpy().squeeze()
    return embeddings.mean(axis=0), embeddings
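Note that model.transformer.wte is the token-embedding table of a GPT-2-style checkpoint, so the model/tokenizer pair presumably comes from something like the following (a sketch, assuming the standard gpt2 checkpoint):

from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")  # exposes model.transformer.wte

sentence_emb, token_embs = get_sentence_embeddings(
    "Paris is the capital of France.", model, tokenizer
)
# sentence_emb: (768,) mean embedding; token_embs: (num_tokens, 768)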
def cosine_similarity_attribution(
    original_output_choice: openai.types.chat.chat_completion.Choice,
    perturbed_output_choice: openai.types.chat.chat_completion.Choice,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
) -> Tuple[float, np.ndarray]:
    # Extract embeddings
    initial_output_sentence_emb, initial_output_token_embs = get_sentence_embeddings(
        original_output_choice.message.content, model, tokenizer
    )
    perturbed_output_sentence_emb, perturbed_output_token_embs = (
        get_sentence_embeddings(
            perturbed_output_choice.message.content, model, tokenizer
        )
    )

    # Reshape embeddings
    initial_output_sentence_emb = initial_output_sentence_emb.reshape(1, -1)
    perturbed_output_sentence_emb = perturbed_output_sentence_emb.reshape(1, -1)

    # Calculate similarities
    self_similarity = float(
        cosine_similarity(initial_output_sentence_emb, initial_output_sentence_emb)
    )
    sentence_similarity = float(
        cosine_similarity(initial_output_sentence_emb, perturbed_output_sentence_emb)
    )

    # Calculate token similarities for shared length
    shared_length = min(
        initial_output_token_embs.shape[0], perturbed_output_token_embs.shape[0]
    )
    token_similarities_shared = cosine_similarity(
        initial_output_token_embs[:shared_length],
        perturbed_output_token_embs[:shared_length],
    ).diagonal()

    # Pad token similarities to match initial token embeddings shape
    token_similarities = np.pad(
        token_similarities_shared,
        (0, initial_output_token_embs.shape[0] - shared_length),
    )

    # Return difference in sentence similarity and token similarities
    return self_similarity - sentence_similarity, 1 - token_similarities
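An end-to-end sketch tying the pieces together (assumes the client from the first sketch and the GPT-2 model/tokenizer above; prompts are again illustrative):

original = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Name the capital of France."}],
)
perturbed = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Name the capital of zq France."}],
)

sentence_attr, token_attrs = cosine_similarity_attribution(
    original.choices[0], perturbed.choices[0], model, tokenizer
)
# sentence_attr grows as the perturbed output drifts from the original;
# token_attrs gives a per-token dissimilarity for the original output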
def _is_token_in_top_20(
    token: str,
    top_logprobs: List[openai.types.chat.chat_completion_token_logprob.TopLogprob],
):
    top_20_tokens = set(logprob.token for logprob in top_logprobs)
    return token in top_20_tokens
def any_tokens_in_top_20(
    initial_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
    new_logprobs: openai.types.chat.chat_completion.ChoiceLogprobs,
) -> bool:
    if (
        initial_logprobs is None
        or new_logprobs is None
        or initial_logprobs.content is None
        or new_logprobs.content is None
    ):
        return False

    return any(
        _is_token_in_top_20(initial_token.token, new_token.top_logprobs)
        for initial_token, new_token in zip(
            initial_logprobs.content, new_logprobs.content
        )
    )
