@@ -109,34 +109,40 @@ def cosine_similarity_attribution(
     tokenizer: PreTrainedTokenizer,
 ) -> Tuple[float, np.ndarray]:
     # Extract embeddings
-    initial_sentence_emb, initial_token_embs = get_sentence_embeddings(
+    initial_output_sentence_emb, initial_output_token_embs = get_sentence_embeddings(
         original_output_choice.message.content, model, tokenizer
     )
-    perturbed_sentence_emb, perturbed_token_embs = get_sentence_embeddings(
-        perturbed_output_choice.message.content, model, tokenizer
+    perturbed_output_sentence_emb, perturbed_output_token_embs = (
+        get_sentence_embeddings(
+            perturbed_output_choice.message.content, model, tokenizer
+        )
     )
 
     # Reshape embeddings
-    initial_sentence_emb = initial_sentence_emb.reshape(1, -1)
-    perturbed_sentence_emb = perturbed_sentence_emb.reshape(1, -1)
+    initial_output_sentence_emb = initial_output_sentence_emb.reshape(1, -1)
+    perturbed_output_sentence_emb = perturbed_output_sentence_emb.reshape(1, -1)
 
     # Calculate similarities
     self_similarity = float(
-        cosine_similarity(initial_sentence_emb, initial_sentence_emb)
+        cosine_similarity(initial_output_sentence_emb, initial_output_sentence_emb)
     )
     sentence_similarity = float(
-        cosine_similarity(initial_sentence_emb, perturbed_sentence_emb)
+        cosine_similarity(initial_output_sentence_emb, perturbed_output_sentence_emb)
     )
 
     # Calculate token similarities for shared length
-    shared_length = min(initial_token_embs.shape[0], perturbed_token_embs.shape[0])
+    shared_length = min(
+        initial_output_token_embs.shape[0], perturbed_output_token_embs.shape[0]
+    )
     token_similarities_shared = cosine_similarity(
-        initial_token_embs[:shared_length], perturbed_token_embs[:shared_length]
+        initial_output_token_embs[:shared_length],
+        perturbed_output_token_embs[:shared_length],
     ).diagonal()
 
     # Pad token similarities to match initial token embeddings shape
     token_similarities = np.pad(
-        token_similarities_shared, (0, initial_token_embs.shape[0] - shared_length)
+        token_similarities_shared,
+        (0, initial_output_token_embs.shape[0] - shared_length),
     )
 
     # Return difference in sentence similarity and token similarities
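
For context, a minimal self-contained sketch of the similarity math this hunk renames, runnable with random stand-ins for the real embeddings. The 768-dimensional shapes, the rng inputs, and the final attribution value are illustrative assumptions; only the variable names and the computation steps come from the diff:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)

# Stand-ins for get_sentence_embeddings() output: one pooled sentence
# embedding plus per-token embeddings (hidden size 768 is an assumption).
initial_output_sentence_emb = rng.normal(size=(1, 768))
initial_output_token_embs = rng.normal(size=(12, 768))
perturbed_output_sentence_emb = rng.normal(size=(1, 768))
perturbed_output_token_embs = rng.normal(size=(9, 768))

# Sentence-level similarity: self-similarity of a non-zero vector is 1.0,
# so the "difference" below reduces to 1 - sentence_similarity.
self_similarity = float(
    cosine_similarity(initial_output_sentence_emb, initial_output_sentence_emb)
)
sentence_similarity = float(
    cosine_similarity(initial_output_sentence_emb, perturbed_output_sentence_emb)
)

# Token-level similarities exist only for the overlapping prefix; the tail
# of the longer original output is zero-padded.
shared_length = min(
    initial_output_token_embs.shape[0], perturbed_output_token_embs.shape[0]
)
token_similarities = np.pad(
    cosine_similarity(
        initial_output_token_embs[:shared_length],
        perturbed_output_token_embs[:shared_length],
    ).diagonal(),
    (0, initial_output_token_embs.shape[0] - shared_length),
)

# The trailing comment in the hunk suggests the function returns the drop in
# sentence similarity together with the padded token similarities (assumed):
attribution = self_similarity - sentence_similarity
print(attribution, token_similarities.shape)  # unrelated random vectors: ~1.0, (12,)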