@@ -53,9 +53,7 @@ To enable GPU, click "Edit > Notebook settings" and select GPU. If enabled, this
These installation commands will take time to run. Begin them now.

```python
- !pip install -U accelerate
- !pip install -U transformers
- !pip install seqeval
+ !pip install transformers datasets evaluate seqeval
```

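If you want to confirm the installs completed before moving on (a small addition, not part of this commit), importing the packages and printing their versions is enough:

```python
# Quick sanity check (not in the original notebook): confirm the imports resolve.
import transformers, datasets, evaluate, seqeval
print(transformers.__version__, datasets.__version__, evaluate.__version__)
```
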
@@ -174,7 +172,7 @@ Now, let's take a look at the example data from the dataset used in the example.
```python
from datasets import load_dataset, load_metric

- ds = load_dataset("conll2003")
+ ds = load_dataset("conll2003", trust_remote_code=True)
print(ds)
```

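To see what a single example looks like (an addition here, assuming the standard CONLL 2003 columns `tokens` and `ner_tags`), you can index into a split:

```python
# Peek at one training example; conll2003 stores tokens plus per-token tag ids.
example = ds["train"][0]
print(example["tokens"])
print(example["ner_tags"])
```
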
@@ -339,7 +337,7 @@ Finally, let's make our own tweaks to the HuggingFace colab notebook. We'll start

```python
import datasets
- from datasets import load_dataset, load_metric, Features
+ from datasets import load_dataset, Features
```

The HuggingFace example uses the [CONLL 2003 dataset](https://www.aclweb.org/anthology/W03-0419.pdf).
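The `Features` import is kept because a custom loading script declares its own schema. As a hedged illustration (the label names below are placeholders, not the actual MIT restaurants tag set), a token-classification schema can be declared like this:

```python
from datasets import Features, Sequence, Value, ClassLabel

# Hypothetical schema for a token-classification dataset; the label names
# are illustrative placeholders, not the real MIT restaurants tags.
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=["O", "B-Location", "I-Location"])),
})
```
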
@@ -364,7 +362,7 @@ Now that we have a modified huggingface script, let's load our data.


```python
- ds = load_dataset("/content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py")
+ ds = load_dataset("/content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py", trust_remote_code=True)
```

/usr/local/lib/python3.10/dist-packages/datasets/load.py:926: FutureWarning: The repository for mit_restaurants contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at /content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py
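As a quick follow-up check (an addition, not from the lesson; it assumes the script exposes its NER tags as a `ClassLabel` feature named `ner_tags`), you can list the tag set the custom script registered:

```python
# List the NER label names the loading script declared (assumes ClassLabel features).
print(ds["train"].features["ner_tags"].feature.names)
```
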
@@ -601,13 +599,16 @@ model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    # f"{model_name}-finetuned-{task}",
    f"{model_name}-carpentries-restaurant-ner",
-   evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
-   # push_to_hub=True, # You can have your model automatically pushed to HF if you uncomment this and log in.
+   report_to="none",
+   eval_strategy="epoch",
+   save_strategy="epoch",
+   load_best_model_at_end=True,
+   push_to_hub=False,  # Set this to True and log in to have your model automatically pushed to HF.
)
```

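Not part of this commit, but for orientation: these arguments are later handed to a `Trainer`. A minimal sketch of that wiring follows, where `model`, `tokenized_datasets`, `data_collator`, `tokenizer`, and `compute_metrics` are assumed to be defined by the surrounding notebook cells:

```python
from transformers import Trainer

# Minimal sketch: wire the TrainingArguments into a Trainer.
# model, tokenized_datasets, data_collator, tokenizer, and compute_metrics
# are assumed from earlier cells in the notebook.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
```
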
@@ -628,32 +629,10 @@ The last thing we want to define is the metric by which we evaluate how our mode


```python
- metric = load_metric("seqeval")
- labels = [label_list[i] for i in example[f"{task}_tags"]]
- metric.compute(predictions=[labels], references=[labels])
- ```
-
- <ipython-input-25-d0b6118e6d86>:1: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
-   metric = load_metric("seqeval")
- /usr/local/lib/python3.10/dist-packages/datasets/load.py:756: FutureWarning: The repository for seqeval contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/seqeval/seqeval.py
- You can avoid this message in future by passing the argument `trust_remote_code=True`.
- Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
-   warnings.warn(
-
-
-
- Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]
+ import evaluate

-
-
-
-
- {'Hours': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
-  'Restaurant_Name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
-  'overall_precision': 1.0,
-  'overall_recall': 1.0,
-  'overall_f1': 1.0,
-  'overall_accuracy': 1.0}
+ seqeval = evaluate.load("seqeval")
+ ```

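As a sanity check (an addition, not from the notebook), you can call the loaded metric on a toy example; since the predictions match the references exactly, every score should come back as 1.0:

```python
# Toy example: identical predictions and references give perfect scores.
toy = [["B-Location", "I-Location", "O"]]
print(seqeval.compute(predictions=toy, references=toy))
```
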
@@ -808,72 +787,42 @@ Now let's see how our model did. We'll run a more detailed evaluation step from


```python
- trainer.evaluate()
-
- predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
- predictions = np.argmax(predictions, axis=2)
+ from evaluate import evaluator

- # Remove ignored index (special tokens)
- true_predictions = [
-     [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
-     for prediction, label in zip(predictions, labels)
- ]
- true_labels = [
-     [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
-     for prediction, label in zip(predictions, labels)
- ]
+ task_evaluator = evaluator("ner")
+ data = load_dataset("/content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py", split="test", trust_remote_code=True)

- results = metric.compute(predictions=true_predictions, references=true_labels)
- results
+ eval_results = task_evaluator.compute(
+     model_or_pipeline="/content/drive/MyDrive/Colab Notebooks/text-analysis/ft-model",
+     data=data,
+ )
```

+ ```python
+ for r in eval_results:
+     print(r, eval_results[r])
+ ```

+ ```
+ Amenity {'precision': 0.6354515050167224, 'recall': 0.7011070110701108, 'f1': 0.6666666666666667, 'number': 271}
+ Cuisine {'precision': 0.8378378378378378, 'recall': 0.8641114982578397, 'f1': 0.8507718696397942, 'number': 287}
+ Dish {'precision': 0.6935483870967742, 'recall': 0.6991869918699187, 'f1': 0.6963562753036437, 'number': 123}
+ Hours {'precision': 0.5675675675675675, 'recall': 0.7078651685393258, 'f1': 0.6299999999999999, 'number': 89}
+ Location {'precision': 0.8277777777777777, 'recall': 0.8713450292397661, 'f1': 0.849002849002849, 'number': 342}
+ Price {'precision': 0.7875, 'recall': 0.863013698630137, 'f1': 0.8235294117647058, 'number': 73}
+ Rating {'precision': 0.7311827956989247, 'recall': 0.8395061728395061, 'f1': 0.7816091954022988, 'number': 81}
+ Restaurant_Name {'precision': 0.8323699421965318, 'recall': 0.8323699421965318, 'f1': 0.8323699421965318, 'number': 173}
+ overall_precision 0.7552083333333334
+ overall_recall 0.8061153578874218
+ overall_f1 0.7798319327731092
+ overall_accuracy 0.9171441163508154
+ total_time_in_seconds 4.749443094000526
+ samples_per_second 148.64900705765186
+ latency_in_seconds 0.006727256507082897
+ ```


-
-
-
-
- {'Amenity': {'precision': 0.6298701298701299,
-   'recall': 0.6689655172413793,
-   'f1': 0.6488294314381271,
-   'number': 290},
-  'Cuisine': {'precision': 0.8291814946619217,
-   'recall': 0.8175438596491228,
-   'f1': 0.8233215547703181,
-   'number': 285},
-  'Dish': {'precision': 0.8,
-   'recall': 0.8715953307392996,
-   'f1': 0.8342644320297952,
-   'number': 257},
-  'Hours': {'precision': 0.7132352941176471,
-   'recall': 0.776,
-   'f1': 0.7432950191570882,
-   'number': 125},
-  'Location': {'precision': 0.8140900195694716,
-   'recall': 0.8253968253968254,
-   'f1': 0.8197044334975369,
-   'number': 504},
-  'Price': {'precision': 0.7723577235772358,
-   'recall': 0.8482142857142857,
-   'f1': 0.8085106382978723,
-   'number': 112},
-  'Rating': {'precision': 0.6896551724137931,
-   'recall': 0.8130081300813008,
-   'f1': 0.746268656716418,
-   'number': 123},
-  'Restaurant_Name': {'precision': 0.8666666666666667,
-   'recall': 0.8802083333333334,
-   'f1': 0.8733850129198966,
-   'number': 384},
-  'overall_precision': 0.7805887764489421,
-  'overall_recall': 0.8158653846153846,
-  'overall_f1': 0.7978373295721672,
-  'overall_accuracy': 0.9095345345345346}
-
-
-
- Whether an F1 score of .795 is 'good enough' depends on the performance of other models, how difficult the task is, and so on. It may be good enough for our needs, or we may want to collect more data, train on a bigger model, or adjust our parameters. For the purposes of the workshop, we will say that this is fine.
+ Whether an F1 score of .779 is 'good enough' depends on the performance of other models, how difficult the task is, and so on. It may be good enough for our needs, or we may want to collect more data, train on a bigger model, or adjust our parameters. For the purposes of the workshop, we will say that this is fine.

## Using our Model