diff --git a/dianna/dashboard/_movie_model.py b/dianna/dashboard/_movie_model.py
index 2cd48496..15377873 100644
--- a/dianna/dashboard/_movie_model.py
+++ b/dianna/dashboard/_movie_model.py
@@ -26,25 +26,26 @@ def __call__(self, sentences):
         if isinstance(sentences, str):
             sentences = [sentences]
 
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence.lower())
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
 
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
 
-        # run the model, applying a sigmoid because the model outputs logits
-        logits = self.run_model(tokenized_sentences)
-        pred = np.apply_along_axis(sigmoid, 1, logits)
+            # run the model, applying a sigmoid because the model outputs logits; remove any remaining batch axis
+            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            output.append(pred)
 
-        # output pos/neg
-        positivity = pred[:, 0]
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])
 
diff --git a/tests/methods/test_lime_text.py b/tests/methods/test_lime_text.py
index 5cec5cab..034c90fd 100644
--- a/tests/methods/test_lime_text.py
+++ b/tests/methods/test_lime_text.py
@@ -31,11 +31,11 @@ def test_lime_text(self):
     def test_lime_text_special_chars(self):
         """Tests exact expected output given a text with special characters and model for Lime."""
         review = 'such a bad movie "!?\'"'
-        expected_words = ['bad', '?', '!', 'movie', 'such', 'a', "'", '"', '"']
-        expected_word_indices = [2, 6, 5, 3, 0, 1, 7, 4, 8]
+        expected_words = ['bad', 'movie', '?', 'such', '!', "'", '"', 'a', '"']
+        expected_word_indices = [2, 3, 6, 0, 5, 7, 8, 1, 4]
         expected_scores = [
-            0.50032869, 0.06458735, -0.05793979, 0.01413776, -0.01246357,
-            -0.00528022, 0.00305347, 0.00185159, -0.00165128
+            0.51140699, 0.02827488, 0.02657974, -0.02208464, -0.02140743,
+            0.00962419, 0.00746798, -0.00743376, -0.0012061
         ]
 
         explanation = dianna.explain_text(self.runner,
diff --git a/tests/utils.py b/tests/utils.py
index ea7e731b..c5ec7bb0 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -80,7 +80,7 @@ def __init__(self, model_path, word_vector_file, max_filter_size):
         self.max_filter_size = max_filter_size
 
     def __call__(self, sentences):
-        """Call function."""
+        """Run the model on the given sentences and return class scores."""
         # ensure the input has a batch axis
         if isinstance(sentences, str):
             sentences = [sentences]
@@ -89,26 +89,28 @@ def __call__(self, sentences):
         input_name = sess.get_inputs()[0].name
         output_name = sess.get_outputs()[0].name
 
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
-
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
-
-        # run the model, applying a sigmoid because the model outputs logits
-        onnx_input = {input_name: tokenized_sentences}
-        logits = sess.run([output_name], onnx_input)[0]
-        pred = np.apply_along_axis(sigmoid, 1, logits)
-
-        # output pos/neg
-        positivity = pred[:, 0]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence.lower())
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
+
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
+
+            # run the model, applying a sigmoid because the model outputs logits; remove any remaining batch axis
+            onnx_input = {input_name: [tokens_numerical]}
+            logits = sess.run([output_name], onnx_input)[0]
+            pred = float(sigmoid(logits))
+            output.append(pred)
+
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])
 
diff --git a/tutorials/explainers/LIME/lime_text.ipynb b/tutorials/explainers/LIME/lime_text.ipynb
index 74846fbd..a5506f88 100644
--- a/tutorials/explainers/LIME/lime_text.ipynb
+++ b/tutorials/explainers/LIME/lime_text.ipynb
@@ -187,18 +187,18 @@
    "source": [
     "class MovieReviewsModelRunner:\n",
     "    def __init__(self, model, word_vectors, max_filter_size):\n",
-    "        self.run_model = utils.get_function(str(model))\n",
+    "        self.run_model = utils.get_function(model)\n",
     "        self.vocab = Vectors(word_vectors, cache=os.path.dirname(word_vectors))\n",
     "        self.max_filter_size = max_filter_size\n",
     "        \n",
-    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
+    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
     "\n",
     "    def __call__(self, sentences):\n",
     "        # ensure the input has a batch axis\n",
     "        if isinstance(sentences, str):\n",
     "            sentences = [sentences]\n",
     "\n",
-    "        tokenized_sentences = []\n",
+    "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
     "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
@@ -208,17 +208,15 @@
     "            # numericalize the tokens\n",
     "            tokens_numerical = [self.vocab.stoi[token] if token in self.vocab.stoi else self.vocab.stoi['<unk>']\n",
     "                                for token in tokens]\n",
-    "            tokenized_sentences.append(tokens_numerical)\n",
-    "        \n",
-    "        # run the model, applying a sigmoid because the model outputs logits\n",
-    "        logits = self.run_model(tokenized_sentences)\n",
-    "        pred = np.apply_along_axis(sigmoid, 1, logits)\n",
-    "        \n",
+    "\n",
+    "            # run the model, applying a sigmoid because the model outputs logits; remove any remaining batch axis\n",
+    "            pred = float(sigmoid(self.run_model([tokens_numerical])))\n",
+    "            output.append(pred)\n",
+    "\n",
     "        # output two classes\n",
-    "        positivity = pred[:, 0]\n",
+    "        positivity = np.array(output)\n",
     "        negativity = 1 - positivity\n",
-    "        return np.transpose([negativity, positivity])\n",
-    "        "
+    "        return np.transpose([negativity, positivity])"
    ]
   },
   {
diff --git a/tutorials/explainers/RISE/rise_text.ipynb b/tutorials/explainers/RISE/rise_text.ipynb
index d564194f..6ebfb9a0 100644
--- a/tutorials/explainers/RISE/rise_text.ipynb
+++ b/tutorials/explainers/RISE/rise_text.ipynb
@@ -169,7 +169,7 @@
     "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence)\n",
+    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "        \n",
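
For reviewers: all four runners now share the same per-sentence flow (tokenize, pad to `max_filter_size`, numericalize with an `<unk>` fallback, run the model on a singleton batch, apply a sigmoid). Below is a minimal, self-contained sketch of that flow; `vocab_stoi` and `fake_model` are hypothetical stand-ins for the torchtext `Vectors` vocabulary and the ONNX movie-review model, not code from this PR.

```python
import numpy as np


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


# hypothetical stand-ins for the torchtext vocabulary and the ONNX model
vocab_stoi = {'<pad>': 0, '<unk>': 1, 'bad': 2, 'movie': 3}
max_filter_size = 5


def fake_model(batch):
    # one logit per sentence, shape (batch_size, 1), like the real model
    return np.array([[0.1 * sum(tokens) - 1.0] for tokens in batch])


def run(sentences):
    # ensure the input has a batch axis
    if isinstance(sentences, str):
        sentences = [sentences]

    output = []
    for sentence in sentences:
        # tokenize and pad to the minimum length the CNN filters require
        tokens = sentence.lower().split()
        if len(tokens) < max_filter_size:
            tokens += ['<pad>'] * (max_filter_size - len(tokens))

        # numericalize, mapping out-of-vocabulary tokens to '<unk>'
        tokens_numerical = [vocab_stoi.get(token, vocab_stoi['<unk>'])
                            for token in tokens]

        # the model outputs a logit; sigmoid maps it to a positivity score
        logits = fake_model([tokens_numerical])
        pred = float(sigmoid(logits.squeeze()))
        output.append(pred)

    # two columns per sentence: [negativity, positivity]
    positivity = np.array(output)
    negativity = 1 - positivity
    return np.transpose([negativity, positivity])


print(run('bad movie'))  # [[0.622..., 0.377...]]; the two columns sum to 1
```

Handling each sentence independently is what lets the runners accept the variable-length perturbed inputs that LIME and RISE generate, which is why the old equal-length check (and its pointer to https://github.com/dianna-ai/dianna/issues/531) could be dropped.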