From 509d5f445d48de713b4226af4b93974a22c7e687 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 29 Sep 2023 20:32:30 +0200 Subject: [PATCH] Heavily improve automatic model card generation + Patch XLM-R (#28) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Uncomment pushing to the Hub * Initial version to improve automatic model card generation * Simplify label normalization * Automatically select some eval sentences for the widget * Improve language card * Add automatic evaluation results * Use dash instead of underscore in model name * Add extra TODOs * model.predict text as the first example * Automatically set model name based on encoder & dataset * Remove accidental Dataset import * Rename examples to widget examples * Add table with label examples Also use fields instead of __dict__ * Ensure complete metadata * Add tokenizer warning if punct must be split from words * Remove dead code * Rename poor variable names * Fix incorrect warning * Add " in the model labels * Set model_id based on args if possible * Add training set metrics * Randomly select 100 samples for the widget examples Instead of taking the first 100 * Prevent duplicate widget examples * Remove completed TODO * Use title case throughout model card * Add useful comments if values not provided Also prevent crash if dataset_id is not provided * Add environmental impact with codecarbon * Ensure that the model card template is included in the install * Add training hardware section * Add Python version * Make everything title case * Add missing docstring * Add docstring for SpanMarkerModelCardData * Update CHANGELOG * Add SpanMarkerModelCardData to dunder init * Add SpanMarkerModelCardData to snippets * Resolve breaking error if hub_model_id is set * gpu_model -> hardware_used To better match what HF expects * Add "base_model" to metadata * Increment datasets min version to 2.14.0 Required for sorting on multiple columns at once * Update trainer evaluate tests * Skip old model card test for now * Fix edge case: less than 5 examples * pytest.skip -> pytest.mark.skip * Try to infer the language from the dataset * Add citations and hidden sections * Refactor inferring language * Remove unused import * Add comment explaining version * Override default Trainer create_model_card * Update model card template slightly * Add newline to model card template * Remove incorrect space * Add model card tests * Improve Trainer tests regarding model card * Remove commented out breakpoint * Add codecarbon to CI * Rename integration extra to codecarbon * Make hardware_used optional (if no GPU present) * Apply suggestions to model_card_template Co-authored-by: Daniel van Strien * Update model card test pattern alongside template changes * Don't include hardware_used when no GPU present * Set "No GPU used" for GPU Model if hardware_used is None * Don't store None in yaml * Ensure that emissions is a regular float * kgs to g * support e-05 notation * Add small test case for model cards * Update model tables in docs * Link to the spaCy integration in the tokenizer warning * Update README snippet * Update outdated docs: entity_max_length default is 8 * Remove /models from URL, caused 404s * Fix outdated type hint * 🎉 Apply XLM-R patch * Remove /models from test * Remove tokenizer warning after patch * Update training docs with model card data etc. 
* Pad token embeddings to multiple of 8 Removes a warning since transformers 4.32.0 * Always attach list directly to header * Tackle edge case where dataset card has no metadata * Allow installing nltk for detokenizing model card examples * Add model card docs * Mention codecarbon install in docstring * overwrite the default codecarbon log level to "error" * Update CHANGELOG * Fix issue with inference example containing full quotes * Update CHANGELOG * Never print a model when printing SpanMarkerModelCardData * Try to infer the dataset_id from the training set Thanks @cakiki * Update the main docs landing page --------- Co-authored-by: Daniel van Strien --- .github/workflows/tests.yaml | 2 +- CHANGELOG.md | 17 + MANIFEST.in | 1 + README.md | 30 +- docs/api/span_marker.model_card.rst | 17 + docs/api/span_marker.rst | 1 + docs/index.rst | 120 +++--- notebooks/getting_started.ipynb | 465 +++++++++++----------- notebooks/model_training.ipynb | 380 +++++++++--------- pyproject.toml | 5 +- span_marker/__init__.py | 8 + span_marker/evaluation.py | 12 +- span_marker/label_normalizer.py | 19 +- span_marker/model_card.py | 525 ++++++++++++++++++++++--- span_marker/model_card_template.md | 167 ++++++++ span_marker/modeling.py | 42 +- span_marker/tokenizer.py | 8 +- span_marker/trainer.py | 54 ++- tests/conftest.py | 5 +- tests/constants.py | 2 +- tests/model_card_pattern.py | 217 ++++++++++ tests/test_model_card.py | 144 +++++-- tests/test_trainer.py | 48 ++- training_scripts/conll03_context.py | 19 +- training_scripts/conll03_no_context.py | 19 +- training_scripts/conllpp_context.py | 19 +- training_scripts/fewnerd_base.py | 19 +- training_scripts/fewnerd_large.py | 19 +- training_scripts/ontonotesv5.py | 19 +- 29 files changed, 1777 insertions(+), 626 deletions(-) create mode 100644 MANIFEST.in create mode 100644 docs/api/span_marker.model_card.rst create mode 100644 span_marker/model_card_template.md create mode 100644 tests/model_card_pattern.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 69dbab51..5a349e13 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -38,7 +38,7 @@ jobs: - name: Install external dependencies on cache miss run: | python -m pip install --no-cache-dir --upgrade pip - python -m pip install --no-cache-dir ".[dev]" + python -m pip install --no-cache-dir ".[dev, codecarbon]" python -m spacy download en_core_web_sm if: steps.restore-cache.outputs.cache-hit != 'true' diff --git a/CHANGELOG.md b/CHANGELOG.md index 67bcef1e..7f8ad8e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,8 +19,25 @@ Types of changes ### Added +- Added `SpanMarkerModel.generate_model_card()` method to get a model card string. +- Added `SpanMarkerModelCardData` that should be passed to `SpanMarkerModel.from_pretrained` with additional information like + - `language`, `license`, `model_name`, `model_id`, `encoder_name`, `encoder_id`, `dataset_name`, `dataset_id`, `dataset_revision`. - Added `transformers` `pipeline` support, e.g. `pipeline(task="span-marker", model="tomaarsen/span-marker-mbert-base-multinerd")`. +### Changed + +- Heavily improved automatic model card generation. +- Evaluating outside of training now returns per-label outputs instead of only "overall" F1, precision and recall. +- Warn if the tokenizer in use distinguishes between punctuation directly attached to a word and punctuation separated from a word by a space. + - If so, inference with that model will require the punctuation to be split from the words. 
+- Improved label normalization speed. +- Allowed calling `SpanMarkerModel.from_pretrained` with a pre-initialized `SpanMarkerConfig`. + +### Fixed + +- Fixed tokenization mismatch between training and inference for XLM-RoBERTa models: allows for normal inference of those models. +- Resolved a niche bug when `TrainingArguments` are not provided. + ## [1.3.0] ### Added diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..ae726279 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include span_marker/model_card_template.md \ No newline at end of file diff --git a/README.md b/README.md index 90b8f3b6..218e30d3 100644 --- a/README.md +++ b/README.md @@ -44,32 +44,47 @@ Please have a look at our [Getting Started](notebooks/getting_started.ipynb) not | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | ```python +from pathlib import Path from datasets import load_dataset from transformers import TrainingArguments -from span_marker import SpanMarkerModel, Trainer +from span_marker import SpanMarkerModel, Trainer, SpanMarkerModelCardData def main() -> None: # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels - dataset = load_dataset("DFKI-SLT/few-nerd", "supervised") + dataset_id = "DFKI-SLT/few-nerd" + dataset_name = "FewNERD" + dataset = load_dataset(dataset_id, "supervised") dataset = dataset.remove_columns("ner_tags") dataset = dataset.rename_column("fine_ner_tags", "ner_tags") labels = dataset["train"].features["ner_tags"].feature.names + # ['O', 'art-broadcastprogram', 'art-film', 'art-music', 'art-other', ... 
# Initialize a SpanMarker model using a pretrained BERT-style encoder - model_name = "bert-base-cased" + encoder_id = "bert-base-cased" + model_id = f"tomaarsen/span-marker-{encoder_id}-fewnerd-fine-super" model = SpanMarkerModel.from_pretrained( - model_name, + encoder_id, labels=labels, # SpanMarker hyperparameters: model_max_length=256, marker_max_length=128, entity_max_length=8, + # Model card arguments + model_card_data=SpanMarkerModelCardData( + model_id=model_id, + encoder_id=encoder_id, + dataset_name=dataset_name, + dataset_id=dataset_id, + license="cc-by-sa-4.0", + language="en", + ), ) # Prepare the 🤗 transformers training arguments + output_dir = Path("models") / model_id args = TrainingArguments( - output_dir="models/span_marker_bert_base_cased_fewnerd_fine_super", + output_dir=output_dir, # Training Hyperparameters: learning_rate=5e-5, per_device_train_batch_size=32, @@ -96,12 +111,13 @@ def main() -> None: eval_dataset=dataset["validation"], ) trainer.train() - trainer.save_model("models/span_marker_bert_base_cased_fewnerd_fine_super/checkpoint-final") # Compute & save the metrics on the test set metrics = trainer.evaluate(dataset["test"], metric_key_prefix="test") trainer.save_metrics("test", metrics) + # Save the final checkpoint + trainer.save_model(output_dir / "checkpoint-final") if __name__ == "__main__": main() @@ -121,8 +137,6 @@ entities = model.predict("Amelia Earhart flew her single engine Lockheed Vega 5B {'span': 'Paris', 'label': 'location-GPE', 'score': 0.9892390966415405, 'char_start_index': 78, 'char_end_index': 83}] ``` - - ## Pretrained Models All models in this list contain `train.py` files that show the training scripts used to generate them. Additionally, all training scripts used are stored in the [training_scripts](training_scripts) directory. diff --git a/docs/api/span_marker.model_card.rst b/docs/api/span_marker.model_card.rst new file mode 100644 index 00000000..4ccd08a1 --- /dev/null +++ b/docs/api/span_marker.model_card.rst @@ -0,0 +1,17 @@ + +:autogenerated: + +.. + This file is autogenerated by `sphinx-api`. + +span_marker.model_card module +============================= + +.. currentmodule:: span_marker.model_card + +.. automodule:: span_marker.model_card + :members: + :exclude-members: hyperparameters, eval_results_dict, eval_lines_list, metric_lines, widget, predict_example, label_example_list, tokenizer_warning, train_set_metrics_list, code_carbon_callback, pipeline_tag, library_name, version, metrics, model, set_widget_examples, set_train_set_metrics, set_label_examples, register_model, is_on_huggingface, generate_model_card + :undoc-members: + :show-inheritance: + :member-order: bysource diff --git a/docs/api/span_marker.rst b/docs/api/span_marker.rst index a2650031..28131ad1 100644 --- a/docs/api/span_marker.rst +++ b/docs/api/span_marker.rst @@ -19,6 +19,7 @@ span_marker package span_marker.modeling span_marker.trainer span_marker.configuration + span_marker.model_card span_marker.pipeline_component span_marker.data_collator span_marker.tokenizer diff --git a/docs/index.rst b/docs/index.rst index fba2de2f..55d092d6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,43 +19,48 @@ or no label annotation scheme. Check out all publicly available SpanMarker models on the Hugging Face Hub `here `_. 
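To complement the new model card functionality documented above, here is a minimal usage sketch. This is an illustration rather than code from the patch itself: it assumes the zero-argument `generate_model_card()` call described in the CHANGELOG, and the model ID is only an example.

```python
from span_marker import SpanMarkerModel

# Any SpanMarker checkpoint works here; this ID is only an example
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-bert-base-fewnerd-fine-super")

# New in this patch: render the automatically generated model card as a string
card = model.generate_model_card()
print(card[:500])  # preview the start of the README.md content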
Alternatively, check out any model from this list of particularly useful models: -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| Model ID | Domain | Label Count | Language | -+=====================================================================================================================================================+==========+=============+==============+ -| `tomaarsen/span-marker-mbert-base-multinerd `_ | General | 15 | Multilingual | -| | | | | -| `lxyuan/span-marker-bert-base-multilingual-uncased-multinerd `_ | | | | -| | | | | -| `lxyuan/span-marker-bert-base-multilingual-cased-multinerd `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-fewnerd-fine-super `_ | General | 66 | English | -| | | | | -| `tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super `_ | | | Multilingual | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-cross-ner `_ | General | 39 | English | -| | | | | -| `tomaarsen/span-marker-bert-base-uncased-cross-ner `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-roberta-large-ontonotes5 `_ | General | 18 | English | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-acronyms `_ | Acronyms | 2 | English | -| | | | | -| `tomaarsen/span-marker-bert-base-uncased-acronyms `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-ncbi-disease `_ | Diseases | 1 | English | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `stefan-it/span-marker-gelectra-large-germeval14 `_ | General | 12 | German | -| | | | | -| `gwlms/span-marker-teams-germeval14 `_ | | | | -| | | | | -| `gwlms/span-marker-token-dropping-bert-germeval14 `_ | | | | -| | | | | -| `gwlms/span-marker-bert-germeval14 `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ - - ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| Model ID | Domain | Label Count | Language | ++=====================================================================================================================================================+============+=============+==============+ +| `tomaarsen/span-marker-mbert-base-multinerd `_ | General | 15 | 
Multilingual | +| | | | | +| `lxyuan/span-marker-bert-base-multilingual-uncased-multinerd `_ | | | | +| | | | | +| `lxyuan/span-marker-bert-base-multilingual-cased-multinerd `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-fewnerd-fine-super `_ | General | 66 | English | +| | | | | +| `tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super `_ | | | Multilingual | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-cross-ner `_ | General | 39 | English | +| | | | | +| `tomaarsen/span-marker-bert-base-uncased-cross-ner `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-roberta-large-ontonotes5 `_ | General | 18 | English | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-uncased-keyphrase-inspec `_ | Keyphrases | 1 | English | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-acronyms `_ | Acronyms | 2 | English | +| | | | | +| `tomaarsen/span-marker-bert-base-uncased-acronyms `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-ncbi-disease `_ | Biomedical | 1 | English | +| | | | | +| `tomaarsen/span-marker-bert-base-uncased-bionlp `_ | | 5 | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `stefan-it/span-marker-gelectra-large-germeval14 `_ | General | 12 | German | +| | | | | +| `gwlms/span-marker-teams-germeval14 `_ | | | | +| | | | | +| `gwlms/span-marker-token-dropping-bert-germeval14 `_ | | | | +| | | | | +| `gwlms/span-marker-bert-germeval14 `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ + + +******* Context -======= +******* .. raw:: html @@ -68,12 +73,13 @@ Context I have developed this library as a part of my thesis work at `Argilla `_. Feel free to ⭐ star or watch the `SpanMarker repository `_ to get notified when my thesis is published. -*************** +############### Quick Reference -*************** +############### +************ How to Train -============ +************ :: @@ -86,9 +92,9 @@ How to Train dataset = load_dataset("DFKI-SLT/few-nerd", "supervised") labels = ["O", "art", "building", "event", "location", "organization", "other", "person", "product"] - # Initialize a SpanMarkerModel using an encoder, e.g. 
BERT: - model_name = "bert-base-cased" - model = SpanMarkerModel.from_pretrained(model_name, labels=labels) + # Initialize a SpanMarkerModel using an encoder, e.g. BERT, and the labels: + encoder_id = "bert-base-cased" + model = SpanMarkerModel.from_pretrained(encoder_id, labels=labels) # See the 🤗 TrainingArguments documentation for details here args = TrainingArguments( @@ -114,14 +120,21 @@ How to Train # Training is really simple using our Trainer! trainer.train() - trainer.save_model("my_span_marker_model/checkpoint-final") # ... and so is evaluating! metrics = trainer.evaluate() print(metrics) + # Save the model locally or on the Hugging Face Hub + trainer.save_model("my_span_marker_model/checkpoint-final") + trainer.push_to_hub("my_span_marker_model/checkpoint-final") + +See :doc:`Initializing & Training ` for more details, or check out the documentation for +:class:`~span_marker.modeling.SpanMarkerModel`, :class:`~span_marker.trainer.Trainer`, :func:`~datasets.load_dataset`, or :class:`~transformers.TrainingArguments`. + +************** How to predict -============== +************** :: @@ -130,7 +143,7 @@ How to predict # Load a finetuned SpanMarkerModel from the 🤗 Hub model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-bert-base-fewnerd-fine-super") - # It is recommended to explicitly move the model to CUDA for faster inference + # It is recommended to explicitly move the model to CUDA for faster inference, if possible model.cuda() model.predict("A prototype was fitted in the mid-'60s in a one-off DB5 extended 4'' after the doors and driven by Marek personally, and a normally 6-cylinder Aston Martin DB7 was equipped with a V8 unit in 1998.") @@ -142,26 +155,33 @@ How to predict You can also load a locally saved model through ``SpanMarkerModel.from_pretrained("path/to/model")``, much like in 🤗 Transformers. +See :doc:`Loading & Inferencing ` for more details, or check out the documentation for +:class:`~span_marker.modeling.SpanMarkerModel` or :meth:`~span_marker.modeling.SpanMarkerModel.predict`. + +******************* How to save a model -=================== +******************* Locally -------- +======= :: model.save_pretrained("my_model_dir") +See the documentation for :meth:`~span_marker.modeling.SpanMarkerModel.save_pretrained` for more details. + To the 🤗 Hub -------------- +============= :: - model_name = "span-marker-bert-base-fewnerd-fine-super" - model.push_to_hub(model_name) + model_id = "span-marker-bert-base-fewnerd-fine-super" + model.push_to_hub(model_id) +See the documentation for :meth:`~span_marker.modeling.SpanMarkerModel.push_to_hub` for more details. .. toctree:: :maxdepth: 2 diff --git a/notebooks/getting_started.ipynb b/notebooks/getting_started.ipynb index 227add6f..e5c569c8 100644 --- a/notebooks/getting_started.ipynb +++ b/notebooks/getting_started.ipynb @@ -45,37 +45,8 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "100%|██████████| 3359329/3359329 [00:09<00:00, 342056.99it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100%|██████████| 482037/482037 [00:01<00:00, 346172.32it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100%|██████████| 958765/958765 [00:02<00:00, 346564.24it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset few-nerd downloaded and prepared to .... 
Subsequent calls will reuse this data.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "data": { + "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],\n", @@ -90,13 +61,18 @@ " num_rows: 37648\n", " })\n", "})" - ] + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", - "dataset = load_dataset(\"DFKI-SLT/few-nerd\", \"supervised\")\n", + "dataset_id = \"DFKI-SLT/few-nerd\"\n", + "dataset = load_dataset(dataset_id, \"supervised\")\n", "dataset" ] }, @@ -190,31 +166,52 @@ "\n", "Importantly, the model can *either* be an encoder or an already trained and saved SpanMarker model. As we haven't trained anything yet, we will use an encoder. To learn how to load and use a saved SpanMarker model, please have a look at the [Loading & Inferencing](model_loading.ipynb) notebook.\n", "\n", - "Reasonable options for encoders include BERT, RoBERTa, etc., which means that the following are all good options: `\"bert-base-cased\"`, `\"bert-large-cased\"`, `\"roberta-base\"`, `\"roberta-large\"`. Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. Furthermore, using uncased models is generally not recommended, as the capitalisation can be very useful to find named entities.\n", + "Reasonable options for encoders include BERT, RoBERTa, mBERT, XLM-RoBERTa, etc., which means that the following are all good options:\n", + "\n", + "* [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny)\n", + "* [prajjwal1/bert-mini](https://huggingface.co/prajjwal1/bert-mini)\n", + "* [prajjwal1/bert-small](https://huggingface.co/prajjwal1/bert-small)\n", + "* [prajjwal1/bert-medium](https://huggingface.co/prajjwal1/bert-medium)\n", + "* [bert-base-cased](https://huggingface.co/bert-base-cased)\n", + "* [bert-large-cased](https://huggingface.co/bert-large-cased)\n", + "* [bert-base-multilingual-cased](https://huggingface.co/bert-base-multilingual-cased)\n", + "* [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased)\n", + "* [roberta-base](https://huggingface.co/roberta-base)\n", + "* [roberta-large](https://huggingface.co/roberta-large)\n", + "* [xlm-roberta-base](https://huggingface.co/xlm-roberta-base)\n", + "* [xlm-roberta-large](https://huggingface.co/xlm-roberta-large)\n", + "\n", + "Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. \n", + "\n", + "Additionally, it's important to consider that cased models typically demand consistent capitalization in the inference data, aligning with how the training data is formatted. In simpler terms, if your training data consistently uses correct capitalization, but your inference data does not, it may lead to suboptimal performance. In such cases, you might find an uncased model more suitable. Although it may exhibit slightly lower F1 scores on the testing set, it remains functional regardless of capitalization, making it potentially more effective in real-world scenarios.\n", "\n", "We'll use `\"bert-base-cased\"` for this notebook. If you're running this on Google Colab, be sure to set hardware accelerator to \"GPU\" in `Runtime` > `Change runtime type`." 
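As an aside, the `position_ids` requirement above can be checked before committing to an encoder. A heuristic sketch (not part of SpanMarker; it merely inspects the encoder's `forward` signature):

```python
import inspect

from transformers import AutoModel

# Load a candidate encoder and check whether its forward() accepts position_ids
encoder = AutoModel.from_pretrained("bert-base-cased")
print("position_ids" in inspect.signature(encoder.forward).parameters)
# True for BERT/RoBERTa-style models; False for e.g. DistilBERT
```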
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n", - "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], + "outputs": [], "source": [ - "from span_marker import SpanMarkerModel\n", + "from span_marker import SpanMarkerModel, SpanMarkerModelCardData\n", "\n", - "model_name = \"bert-base-cased\"\n", - "model = SpanMarkerModel.from_pretrained(model_name, labels=labels, model_max_length=256)" + "encoder_id = \"bert-base-cased\"\n", + "model = SpanMarkerModel.from_pretrained(\n", + " # Required arguments\n", + " encoder_id,\n", + " labels=labels,\n", + " # Optional arguments\n", + " model_max_length=256,\n", + " entity_max_length=8,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"cc-by-sa-4.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", + ")" ] }, { @@ -227,7 +224,9 @@ "Note that we provided `SpanMarkerModel.from_pretrained` with a list of our labels. This is required when training a new model using an encoder. Furthermore, we can specify some useful configuration parameters from `SpanMarkerConfig`, such as:\n", "\n", "* `model_max_length`: The maximum number of tokens that the model will process. If you only use short sentences for your model, reducing this number may help training and inference speeds with no loss in performance. Defaults to the encoder maximum, or 512 if the encoder doesn't have a maximum.\n", - "* `entity_max_length`: The total number of words that one entity can be. Defaults to 16." + "* `entity_max_length`: The total number of words that one entity can be. Defaults to 8.\n", + "* `model_card_data`: A [SpanMarkerModelCardData](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.model_card.html#span_marker.model_card.SpanMarkerModelCardData) instance where you can provide a lot of useful data about your model. This data will be automatically included in a generated model card whenever a model is saved or pushed to the Hugging Face Hub.\n", + " * Consider adding `language`, `license`, `model_id`, `encoder_id` and `dataset_id` to improve the generated model card README.md file." ] }, { @@ -261,6 +260,7 @@ " eval_steps=200,\n", " push_to_hub=False,\n", " logging_steps=50,\n", + " fp16=True,\n", " warmup_ratio=0.1,\n", " dataloader_num_workers=2,\n", ")" @@ -271,12 +271,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can create a SpanMarker `Trainer` in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. 
Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." + "Now we can create a SpanMarker [Trainer](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.trainer.html#span_marker.trainer.Trainer) in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -307,163 +307,48 @@ "name": "stdout", "output_type": "stream", "text": [ - "This SpanMarker model will ignore 0.339050% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words.\n", - "These are the frequencies of the missed entities due to maximum entity length out of 20351 total entities:\n", - "- 24 missed entities with 9 words (0.117930%)\n", - "- 15 missed entities with 10 words (0.073706%)\n", - "- 14 missed entities with 11 words (0.068793%)\n", - "- 7 missed entities with 12 words (0.034396%)\n", - "- 5 missed entities with 13 words (0.024569%)\n", - "- 2 missed entities with 15 words (0.009828%)\n", - "- 1 missed entities with 17 words (0.004914%)\n", - "- 1 missed entities with 19 words (0.004914%)\n", - "Tracking run with wandb version 0.14.0\n", - "Run data is saved locally in ...\n", - "Syncing run colorful-leaf-761 to Weights & Biases\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.9012, 'learning_rate': 2.032520325203252e-05, 'epoch': 0.04}\n", - "{'loss': 0.0813, 'learning_rate': 4.065040650406504e-05, 'epoch': 0.08}\n", - "{'loss': 0.0514, 'learning_rate': 4.8777173913043476e-05, 'epoch': 0.12}\n", - "{'loss': 0.0385, 'learning_rate': 4.651268115942029e-05, 'epoch': 0.16}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This SpanMarker model won't be able to predict 0.307515% of all annotated entities in the evaluation dataset. 
This is caused by the SpanMarkerModel maximum entity length of 8 words.\n", - "These are the frequencies of the missed entities due to maximum entity length out of 5203 total entities:\n", - "- 5 missed entities with 9 words (0.096098%)\n", - "- 5 missed entities with 10 words (0.096098%)\n", - "- 2 missed entities with 11 words (0.038439%)\n", - "- 1 missed entities with 12 words (0.019220%)\n", - "- 3 missed entities with 13 words (0.057659%)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.03596973791718483, 'eval_overall_precision': 0.6802749427202666, 'eval_overall_recall': 0.6297724643270344, 'eval_overall_f1': 0.6540502653449485, 'eval_overall_accuracy': 0.9053643208390295, 'eval_runtime': 28.0718, 'eval_samples_per_second': 87.241, 'eval_steps_per_second': 21.837, 'epoch': 0.16}\n", - "{'loss': 0.0334, 'learning_rate': 4.42481884057971e-05, 'epoch': 0.2}\n", - "{'loss': 0.0306, 'learning_rate': 4.1983695652173914e-05, 'epoch': 0.24}\n", - "{'loss': 0.0278, 'learning_rate': 3.971920289855073e-05, 'epoch': 0.29}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" + "{'loss': 0.6974, 'learning_rate': 1.991869918699187e-05, 'epoch': 0.04}\n", + "{'loss': 0.0896, 'learning_rate': 4.0243902439024395e-05, 'epoch': 0.08}\n", + "{'loss': 0.0584, 'learning_rate': 4.8822463768115946e-05, 'epoch': 0.12}\n", + "{'loss': 0.0382, 'learning_rate': 4.655797101449276e-05, 'epoch': 0.16}\n", + "{'eval_loss': 0.03181104362010956, 'eval_overall_precision': 0.6967930029154519, 'eval_overall_recall': 0.5989974937343359, 'eval_overall_f1': 0.6442048517520216, 'eval_overall_accuracy': 0.8993717106605198, 'eval_runtime': 29.16, 'eval_samples_per_second': 83.985, 'eval_steps_per_second': 21.022, 'epoch': 0.16}\n", + "{'loss': 0.0333, 'learning_rate': 4.429347826086957e-05, 'epoch': 0.2}\n", + "{'loss': 0.0303, 'learning_rate': 4.202898550724638e-05, 'epoch': 0.24}\n", + "{'loss': 0.032, 'learning_rate': 3.976449275362319e-05, 'epoch': 0.29}\n", + "{'loss': 0.0304, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.33}\n", + "{'eval_loss': 0.02394717186689377, 'eval_overall_precision': 0.7350157728706624, 'eval_overall_recall': 0.7187198766146135, 'eval_overall_f1': 0.7267764889365436, 'eval_overall_accuracy': 0.9227489698502713, 'eval_runtime': 29.481, 'eval_samples_per_second': 83.07, 'eval_steps_per_second': 20.793, 'epoch': 0.33}\n", + "{'loss': 0.0265, 'learning_rate': 3.5235507246376816e-05, 'epoch': 0.37}\n", + "{'loss': 0.0254, 'learning_rate': 3.297101449275363e-05, 'epoch': 0.41}\n", + "{'loss': 0.0249, 'learning_rate': 3.0706521739130435e-05, 'epoch': 0.45}\n", + "{'loss': 0.0242, 'learning_rate': 2.8442028985507245e-05, 'epoch': 0.49}\n", + "{'eval_loss': 0.02163967303931713, 'eval_overall_precision': 0.762808736476832, 'eval_overall_recall': 0.7204549836128783, 'eval_overall_f1': 0.7410271663692247, 'eval_overall_accuracy': 0.9293582473175309, 'eval_runtime': 29.0261, 'eval_samples_per_second': 84.372, 'eval_steps_per_second': 21.119, 'epoch': 0.49}\n", + "{'loss': 0.0224, 'learning_rate': 2.6177536231884058e-05, 'epoch': 0.53}\n", + "{'loss': 0.0242, 'learning_rate': 2.391304347826087e-05, 'epoch': 0.57}\n", + "{'loss': 0.0226, 'learning_rate': 2.164855072463768e-05, 'epoch': 0.61}\n", + "{'loss': 0.0245, 'learning_rate': 1.9384057971014493e-05, 'epoch': 0.65}\n", + "{'eval_loss': 0.020556513220071793, 'eval_overall_precision': 
0.7680876026593665, 'eval_overall_recall': 0.7572778099093889, 'eval_overall_f1': 0.7626444034559751, 'eval_overall_accuracy': 0.9338052303047611, 'eval_runtime': 29.7545, 'eval_samples_per_second': 82.307, 'eval_steps_per_second': 20.602, 'epoch': 0.65}\n", + "{'loss': 0.0231, 'learning_rate': 1.7119565217391306e-05, 'epoch': 0.69}\n", + "{'loss': 0.0209, 'learning_rate': 1.4855072463768116e-05, 'epoch': 0.73}\n", + "{'loss': 0.0202, 'learning_rate': 1.2590579710144929e-05, 'epoch': 0.77}\n", + "{'loss': 0.0212, 'learning_rate': 1.032608695652174e-05, 'epoch': 0.81}\n", + "{'eval_loss': 0.01960749179124832, 'eval_overall_precision': 0.7743021183923976, 'eval_overall_recall': 0.7540003855793329, 'eval_overall_f1': 0.7640164094549716, 'eval_overall_accuracy': 0.9358247317530904, 'eval_runtime': 29.6794, 'eval_samples_per_second': 82.515, 'eval_steps_per_second': 20.654, 'epoch': 0.81}\n", + "{'loss': 0.0202, 'learning_rate': 8.061594202898551e-06, 'epoch': 0.86}\n", + "{'loss': 0.0196, 'learning_rate': 5.797101449275362e-06, 'epoch': 0.9}\n", + "{'loss': 0.0232, 'learning_rate': 3.5326086956521736e-06, 'epoch': 0.94}\n", + "{'loss': 0.0183, 'learning_rate': 1.2681159420289857e-06, 'epoch': 0.98}\n", + "{'eval_loss': 0.019303549081087112, 'eval_overall_precision': 0.7719162141194724, 'eval_overall_recall': 0.7673028725660305, 'eval_overall_f1': 0.769602629797931, 'eval_overall_accuracy': 0.9378442332014197, 'eval_runtime': 29.1715, 'eval_samples_per_second': 83.952, 'eval_steps_per_second': 21.014, 'epoch': 0.98}\n", + "{'train_runtime': 450.609, 'train_samples_per_second': 21.788, 'train_steps_per_second': 2.723, 'train_loss': 0.056268237500824186, 'epoch': 1.0}\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0245, 'learning_rate': 3.745471014492754e-05, 'epoch': 0.33}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.023754317313432693, 'eval_overall_precision': 0.7612159329140461, 'eval_overall_recall': 0.700154261473197, 'eval_overall_f1': 0.7294094013660104, 'eval_overall_accuracy': 0.9214634046807729, 'eval_runtime': 28.2374, 'eval_samples_per_second': 86.729, 'eval_steps_per_second': 21.709, 'epoch': 0.33}\n", - "{'loss': 0.0257, 'learning_rate': 3.5190217391304346e-05, 'epoch': 0.37}\n", - "{'loss': 0.0237, 'learning_rate': 3.292572463768116e-05, 'epoch': 0.41}\n", - "{'loss': 0.0234, 'learning_rate': 3.066123188405797e-05, 'epoch': 0.45}\n", - "{'loss': 0.0241, 'learning_rate': 2.8396739130434785e-05, 'epoch': 0.49}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.02093053236603737, 'eval_overall_precision': 0.7934713036057179, 'eval_overall_recall': 0.7171230235248747, 'eval_overall_f1': 0.7533677706877343, 'eval_overall_accuracy': 0.9292782958232162, 'eval_runtime': 28.1912, 'eval_samples_per_second': 86.871, 'eval_steps_per_second': 21.744, 'epoch': 0.49}\n", - "{'loss': 0.021, 'learning_rate': 2.6132246376811598e-05, 'epoch': 0.53}\n", - "{'loss': 0.02, 'learning_rate': 2.3867753623188408e-05, 'epoch': 0.57}\n", - "{'loss': 0.022, 'learning_rate': 2.1603260869565217e-05, 'epoch': 0.61}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0237, 'learning_rate': 1.933876811594203e-05, 'epoch': 0.65}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "{'eval_loss': 0.020754070952534676, 'eval_overall_precision': 0.7628806742003448, 'eval_overall_recall': 0.7680293096799075, 'eval_overall_f1': 0.7654463341981359, 'eval_overall_accuracy': 0.9358077087881818, 'eval_runtime': 28.0953, 'eval_samples_per_second': 87.168, 'eval_steps_per_second': 21.819, 'epoch': 0.65}\n", - "{'loss': 0.0226, 'learning_rate': 1.7074275362318843e-05, 'epoch': 0.69}\n", - "{'loss': 0.0218, 'learning_rate': 1.4809782608695653e-05, 'epoch': 0.73}\n", - "{'loss': 0.0242, 'learning_rate': 1.2545289855072464e-05, 'epoch': 0.77}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0197, 'learning_rate': 1.0280797101449275e-05, 'epoch': 0.81}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.019617434591054916, 'eval_overall_precision': 0.7771473292897672, 'eval_overall_recall': 0.7659082144234477, 'eval_overall_f1': 0.7714868408274256, 'eval_overall_accuracy': 0.937746128262156, 'eval_runtime': 28.2921, 'eval_samples_per_second': 86.561, 'eval_steps_per_second': 21.667, 'epoch': 0.81}\n", - "{'loss': 0.0191, 'learning_rate': 8.016304347826086e-06, 'epoch': 0.86}\n", - "{'loss': 0.0187, 'learning_rate': 5.751811594202898e-06, 'epoch': 0.9}\n", - "{'loss': 0.0202, 'learning_rate': 3.4873188405797104e-06, 'epoch': 0.94}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0221, 'learning_rate': 1.2228260869565218e-06, 'epoch': 0.98}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.019159900024533272, 'eval_overall_precision': 0.7773279352226721, 'eval_overall_recall': 0.7774778249132279, 'eval_overall_f1': 0.7774028728429576, 'eval_overall_accuracy': 0.9399702095533473, 'eval_runtime': 28.0225, 'eval_samples_per_second': 87.394, 'eval_steps_per_second': 21.875, 'epoch': 0.98}\n", - "{'train_runtime': 453.1296, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.708, 'train_loss': 0.06319850289734186, 'epoch': 1.0}\n", - "TrainOutput(global_step=1227, training_loss=0.06319850289734186, metrics={'train_runtime': 453.1296, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.708, 'train_loss': 0.06319850289734186, 'epoch': 1.0})" - ] + "data": { + "text/plain": [ + "TrainOutput(global_step=1227, training_loss=0.056268237500824186, metrics={'train_runtime': 450.609, 'train_samples_per_second': 21.788, 'train_steps_per_second': 2.723, 'train_loss': 0.056268237500824186, 'epoch': 1.0})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -484,21 +369,54 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n", - "{'eval_loss': 0.019206691533327103,\n", - " 'eval_overall_precision': 0.7758985200845666,\n", - " 'eval_overall_recall': 0.7784419591207096,\n", - " 'eval_overall_f1': 0.7771681586293194,\n", - " 'eval_overall_accuracy': 0.9398477830602543,\n", - " 'eval_runtime': 28.0849,\n", - " 'eval_samples_per_second': 87.2,\n", - " 'eval_steps_per_second': 21.827,\n", + "data": { + 
"text/plain": [ + "{'eval_loss': 0.019375888630747795,\n", + " 'eval_art': {'precision': 0.7661290322580645,\n", + " 'recall': 0.7723577235772358,\n", + " 'f1': 0.7692307692307692,\n", + " 'number': 246},\n", + " 'eval_building': {'precision': 0.5842293906810035,\n", + " 'recall': 0.6127819548872181,\n", + " 'f1': 0.5981651376146789,\n", + " 'number': 266},\n", + " 'eval_event': {'precision': 0.5497382198952879,\n", + " 'recall': 0.5965909090909091,\n", + " 'f1': 0.5722070844686648,\n", + " 'number': 176},\n", + " 'eval_location': {'precision': 0.8036732108929703,\n", + " 'recall': 0.8409542743538767,\n", + " 'f1': 0.8218911917098446,\n", + " 'number': 1509},\n", + " 'eval_organization': {'precision': 0.7474226804123711,\n", + " 'recall': 0.6998069498069498,\n", + " 'f1': 0.7228315054835494,\n", + " 'number': 1036},\n", + " 'eval_other': {'precision': 0.6775818639798489,\n", + " 'recall': 0.5604166666666667,\n", + " 'f1': 0.61345496009122,\n", + " 'number': 480},\n", + " 'eval_person': {'precision': 0.8636363636363636,\n", + " 'recall': 0.9063313096270599,\n", + " 'f1': 0.8844688954718578,\n", + " 'number': 1153},\n", + " 'eval_product': {'precision': 0.7366666666666667,\n", + " 'recall': 0.6884735202492211,\n", + " 'f1': 0.7117552334943639,\n", + " 'number': 321},\n", + " 'eval_overall_precision': 0.7705836876691148,\n", + " 'eval_overall_recall': 0.7686524002313476,\n", + " 'eval_overall_f1': 0.7696168323520897,\n", + " 'eval_overall_accuracy': 0.9381502182693484,\n", + " 'eval_runtime': 28.5583,\n", + " 'eval_samples_per_second': 85.754,\n", + " 'eval_steps_per_second': 21.465,\n", " 'epoch': 1.0}" - ] + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -520,7 +438,7 @@ "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ "This SpanMarker model won't be able to predict 0.285605% of all annotated entities in the evaluation dataset. 
This is caused by the SpanMarkerModel maximum entity length of 8 words.\n", @@ -532,17 +450,58 @@ "- 1 missed entities with 13 words (0.019040%)\n", "- 1 missed entities with 17 words (0.019040%)\n", "- 1 missed entities with 19 words (0.019040%)\n", - "- 1 missed entities with 40 words (0.019040%)\n", - "{'test_loss': 0.019189156591892242,\n", - " 'test_overall_precision': 0.769879287219774,\n", - " 'test_overall_recall': 0.7679663608562691,\n", - " 'test_overall_f1': 0.7689216342933691,\n", - " 'test_overall_accuracy': 0.938544749464231,\n", - " 'test_runtime': 28.0932,\n", - " 'test_samples_per_second': 86.854,\n", - " 'test_steps_per_second': 21.713,\n", - " 'epoch': 1.0}" + "- 1 missed entities with 40 words (0.019040%)\n" ] + }, + { + "data": { + "text/plain": [ + "{'test_loss': 0.01918497122824192,\n", + " 'test_art': {'precision': 0.7419354838709677,\n", + " 'recall': 0.7488372093023256,\n", + " 'f1': 0.7453703703703703,\n", + " 'number': 215},\n", + " 'test_building': {'precision': 0.6236559139784946,\n", + " 'recall': 0.710204081632653,\n", + " 'f1': 0.6641221374045801,\n", + " 'number': 245},\n", + " 'test_event': {'precision': 0.6153846153846154,\n", + " 'recall': 0.5529953917050692,\n", + " 'f1': 0.5825242718446603,\n", + " 'number': 217},\n", + " 'test_location': {'precision': 0.812192118226601,\n", + " 'recall': 0.8515171078114913,\n", + " 'f1': 0.8313898518751971,\n", + " 'number': 1549},\n", + " 'test_organization': {'precision': 0.7320754716981132,\n", + " 'recall': 0.6897777777777778,\n", + " 'f1': 0.7102974828375286,\n", + " 'number': 1125},\n", + " 'test_other': {'precision': 0.7375886524822695,\n", + " 'recall': 0.6328600405679513,\n", + " 'f1': 0.6812227074235807,\n", + " 'number': 493},\n", + " 'test_person': {'precision': 0.8805309734513275,\n", + " 'recall': 0.9061930783242259,\n", + " 'f1': 0.8931777378815081,\n", + " 'number': 1098},\n", + " 'test_product': {'precision': 0.6641221374045801,\n", + " 'recall': 0.5898305084745763,\n", + " 'f1': 0.6247755834829445,\n", + " 'number': 295},\n", + " 'test_overall_precision': 0.7766859344894027,\n", + " 'test_overall_recall': 0.7697154859652473,\n", + " 'test_overall_f1': 0.7731850004795243,\n", + " 'test_overall_accuracy': 0.938954021816699,\n", + " 'test_runtime': 29.8808,\n", + " 'test_samples_per_second': 81.658,\n", + " 'test_steps_per_second': 20.414,\n", + " 'epoch': 1.0}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -581,7 +540,9 @@ "text": [ "Battle of Camulodunum => event\n", "Quintus Petillius Cerialis => person\n", - "Boudica => person\n", + "Boudica => location\n", + "Camulodunum => location\n", + "Colchester => location\n", "\n", "Wellingborough => location\n", "Northamptonshire => location\n", @@ -599,9 +560,11 @@ "Bachelor of Music in Composition => other\n", "California State University => organization\n", "Northridge => location\n", + "Master of Music in Harpsichord Performance => other\n", "Cal State Northridge => organization\n", - "Ann Arbor => organization\n", - "\n" + "Doctor of Musical Arts => other\n", + "University of Michigan => organization\n", + "Ann Arbor => location" ] } ], @@ -634,7 +597,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Once trained, we can save our new model locally." + "Once trained, we can save our new model locally. The saved model also comes with a flashy `README.md` such as [this one](https://huggingface.co/tomaarsen/span-marker-bert-base-uncased-bionlp)." 
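As a quick illustration, a sketch of inspecting that card (assuming, per this patch, that saving writes the generated card as `README.md` next to the weights; the path is only an example):

```python
# Saving the model also writes the automatically generated model card
model.save_pretrained("models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final")

# Preview the generated README.md
with open("models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final/README.md", encoding="utf-8") as f:
    print(f.read()[:300])
```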
] }, { @@ -651,16 +614,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Or we can push it to the 🤗 Hub like so. I've commented it away for now to prevent people from accidentally pushing models." + "Or we can push it to the 🤗 Hub like so." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# trainer.push_to_hub()" + "trainer.push_to_hub(repo_id=\"span-marker-bert-base-fewnerd-coarse-super\")" ] }, { @@ -692,11 +655,26 @@ "from transformers import TrainingArguments\n", "\n", "def main():\n", - " dataset = load_dataset(\"DFKI-SLT/few-nerd\", \"supervised\")\n", + " dataset_id = \"DFKI-SLT/few-nerd\"\n", + " dataset = load_dataset(dataset_id, \"supervised\")\n", " labels = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", "\n", - " model_name = \"bert-base-cased\"\n", - " model = SpanMarkerModel.from_pretrained(model_name, labels=labels)\n", + " encoder_id = \"bert-base-cased\"\n", + " model = SpanMarkerModel.from_pretrained(\n", + " # Required arguments\n", + " encoder_id,\n", + " labels=labels,\n", + " # Optional arguments\n", + " model_max_length=256,\n", + " entity_max_length=8,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"cc-by-sa-4.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", + " )\n", "\n", " args = TrainingArguments(\n", " output_dir=\"models/span-marker-bert-base-fewnerd-coarse-super\",\n", @@ -710,6 +688,7 @@ " eval_steps=200,\n", " push_to_hub=False,\n", " logging_steps=50,\n", + " fp16=True,\n", " warmup_ratio=0.1,\n", " dataloader_num_workers=2,\n", " )\n", @@ -720,13 +699,13 @@ " train_dataset=dataset[\"train\"].select(range(8000)),\n", " eval_dataset=dataset[\"validation\"].select(range(2000)),\n", " )\n", - "\n", " trainer.train()\n", - " trainer.save_model(\"models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final\")\n", "\n", " metrics = trainer.evaluate()\n", " print(metrics)\n", "\n", + " trainer.save_model(\"models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final\")\n", + "\n", "if __name__ == \"__main__\":\n", " main()\n", "```" diff --git a/notebooks/model_training.ipynb b/notebooks/model_training.ipynb index 11ae82c9..f13e11b9 100644 --- a/notebooks/model_training.ipynb +++ b/notebooks/model_training.ipynb @@ -70,7 +70,8 @@ "source": [ "from datasets import load_dataset\n", "\n", - "dataset = load_dataset(\"conll2003\")\n", + "dataset_id = \"conll2003\"\n", + "dataset = load_dataset(dataset_id)\n", "dataset" ] }, @@ -121,38 +122,53 @@ "* [prajjwal1/bert-medium](https://huggingface.co/prajjwal1/bert-medium)\n", "* [bert-base-cased](https://huggingface.co/bert-base-cased)\n", "* [bert-large-cased](https://huggingface.co/bert-large-cased)\n", + "* [bert-base-multilingual-cased](https://huggingface.co/bert-base-multilingual-cased)\n", + "* [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased)\n", "* [roberta-base](https://huggingface.co/roberta-base)\n", "* [roberta-large](https://huggingface.co/roberta-large)\n", + "* [xlm-roberta-base](https://huggingface.co/xlm-roberta-base)\n", + "* [xlm-roberta-large](https://huggingface.co/xlm-roberta-large)\n", "\n", - "Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. 
Furthermore, using uncased models is generally not recommended, as the capitalisation can be very useful to find named entities.\n", + "\n", + "Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. \n", + "\n", + "Additionally, it's important to consider that cased models typically demand consistent capitalization in the inference data, aligning with how the training data is formatted. In simpler terms, if your training data consistently uses correct capitalization, but your inference data does not, it may lead to suboptimal performance. In such cases, you might find an uncased model more suitable. Although it may exhibit slightly lower F1 scores on the testing set, it remains functional regardless of capitalization, making it potentially more effective in real-world scenarios.\n", "\n", "We'll use `\"roberta-base\"` for this notebook. If you're running this on Google Colab, be sure to set hardware accelerator to \"GPU\" in `Runtime` > `Change runtime type`." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']\n", - "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ - "from span_marker import SpanMarkerModel\n", + "from span_marker import SpanMarkerModel, SpanMarkerModelCardData\n", "\n", - "model_name = \"roberta-base\"\n", + "encoder_id = \"roberta-base\"\n", "model = SpanMarkerModel.from_pretrained(\n", - " model_name,\n", + " # Required arguments\n", + " encoder_id,\n", " labels=labels,\n", + " # Optional arguments\n", " model_max_length=256,\n", " entity_max_length=6,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"apache-2.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", ")" ] }, @@ -161,9 +177,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For us, these warnings are expected, as we are initializing `BertModel` for a new task.\n", + "For us, these warnings are expected, as we are initializing `RobertaModel` for a new task.\n", + "\n", + "Note that we provided `SpanMarkerModel.from_pretrained` with a list of our labels. This is required when training a new model using an encoder. 
Furthermore, we can specify some useful configuration parameters from `SpanMarkerConfig`, such as:\n", "\n", - "Note that we provided [SpanMarkerModel.from_pretrained](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.modeling.html#span_marker.modeling.SpanMarkerModel.from_pretrained) with a list of our labels. This is required when training a new model. See [Configuring](model_configuration.ipynb) for more details and recommendations on configuration options." + "* `model_max_length`: The maximum number of tokens that the model will process. If you only use short sentences for your model, reducing this number may help training and inference speeds with no loss in performance. Defaults to the encoder maximum, or 512 if the encoder doesn't have a maximum.\n", + "* `entity_max_length`: The total number of words that one entity can be. Defaults to 8.\n", + "* `model_card_data`: A [SpanMarkerModelCardData](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.model_card.html#span_marker.model_card.SpanMarkerModelCardData) instance where you can provide a lot of useful data about your model. This data will be automatically included in a generated model card whenever a model is saved or pushed to the Hugging Face Hub.\n", + " * Consider adding `language`, `license`, `model_id`, `encoder_id` and `dataset_id` to improve the generated model card README.md file." ] }, { @@ -179,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -207,12 +228,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can create a SpanMarker `Trainer` in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." + "Now we can create a SpanMarker [Trainer](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.trainer.html#span_marker.trainer.Trainer) in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -226,83 +247,20 @@ "- 3 missed entities with 10 words (0.012767%)\n" ] }, - { - "data": { - "text/html": [ - "wandb version 0.15.0 is available! To upgrade, please run:\n", - " $ pip install wandb --upgrade" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Tracking run with wandb version 0.14.0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Run data is saved locally in wandb\\run-20230428_160736-klxbldeq" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Syncing run woven-plasma-757 to Weights & Biases (docs)
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "be3fbeb39544469eba6382d146d521fa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1802 [00:00 None:\n", + " dataset_id = \"conll2003\"\n", + " dataset = load_dataset(dataset_id)\n", + " labels = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", "\n", - "model_name = \"roberta-base\"\n", - "model = SpanMarkerModel.from_pretrained(model_name, labels=labels, model_max_length=256)\n", + " encoder_id = \"roberta-base\"\n", + " model = SpanMarkerModel.from_pretrained(\n", + " # Required arguments\n", + " encoder_id,\n", + " labels=labels,\n", + " # Optional arguments\n", + " model_max_length=256,\n", + " entity_max_length=6,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"apache-2.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", + " )\n", "\n", - "args = TrainingArguments(\n", - " output_dir=\"models/span-marker-roberta-base-conll03\",\n", - " learning_rate=1e-5,\n", - " gradient_accumulation_steps=2,\n", - " per_device_train_batch_size=4,\n", - " per_device_eval_batch_size=4,\n", - " num_train_epochs=1,\n", - " evaluation_strategy=\"steps\",\n", - " save_strategy=\"steps\",\n", - " eval_steps=500,\n", - " push_to_hub=False,\n", - " logging_steps=50,\n", - " warmup_ratio=0.1,\n", - ")\n", + " args = TrainingArguments(\n", + " output_dir=\"models/span-marker-roberta-base-conll03\",\n", + " learning_rate=1e-5,\n", + " gradient_accumulation_steps=2,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " num_train_epochs=1,\n", + " evaluation_strategy=\"steps\",\n", + " save_strategy=\"steps\",\n", + " eval_steps=500,\n", + " push_to_hub=False,\n", + " logging_steps=50,\n", + " fp16=True,\n", + " warmup_ratio=0.1,\n", + " )\n", "\n", - "trainer = Trainer(\n", - " model=model,\n", - " args=args,\n", - " train_dataset=dataset[\"train\"].select(range(8000)),\n", - " eval_dataset=dataset[\"validation\"].select(range(2000)),\n", - ")\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=dataset[\"train\"].select(range(8000)),\n", + " eval_dataset=dataset[\"validation\"].select(range(2000)),\n", + " )\n", + " trainer.train()\n", "\n", - "trainer.train()\n", - "trainer.save_model(\"models/span-marker-roberta-base-conll03/checkpoint-final\")\n", - "trainer.push_to_hub()\n", + " metrics = trainer.evaluate()\n", + " print(metrics)\n", "\n", - "metrics = trainer.evaluate()\n", - "print(metrics)\n", + " trainer.save_model(\"models/span-marker-roberta-base-conll03/checkpoint-final\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", "```" ] }, diff --git a/pyproject.toml b/pyproject.toml index 5c944a25..fde38413 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "torch", "accelerate", "transformers>=4.19.0", # required for EvalPrediction.inputs - "datasets>=2.0.0", + "datasets>=2.14.0", # required for sorting with multiple columns "packaging>=20.0", "evaluate", "seqeval", @@ -59,6 +59,9 @@ docs = [ wandb = [ "wandb" ] +codecarbon = [ + "codecarbon" +] [project.urls] Documentation = "https://tomaarsen.github.io/SpanMarkerNER" diff --git a/span_marker/__init__.py b/span_marker/__init__.py index 0586fd92..11e42b38 100644 --- a/span_marker/__init__.py +++ 
b/span_marker/__init__.py @@ -1,6 +1,8 @@ __version__ = "1.3.1.dev" +import importlib import logging +import os from typing import Optional, Union import torch @@ -8,6 +10,7 @@ from transformers.pipelines import PIPELINE_REGISTRY, pipeline from span_marker.configuration import SpanMarkerConfig +from span_marker.model_card import SpanMarkerModelCardData from span_marker.modeling import SpanMarkerModel from span_marker.pipeline_component import SpanMarkerPipeline from span_marker.trainer import Trainer @@ -62,5 +65,10 @@ def _spacy_span_marker_factory( return SpacySpanMarkerWrapper(model, batch_size=batch_size, device=device) +# If codecarbon is installed and the log level is not defined, +# automatically overwrite the default to "error" +if importlib.util.find_spec("codecarbon") and "CODECARBON_LOG_LEVEL" not in os.environ: + os.environ["CODECARBON_LOG_LEVEL"] = "error" + logger = logging.getLogger("span_marker") logger.setLevel(logging.INFO) diff --git a/span_marker/evaluation.py b/span_marker/evaluation.py index cf9df137..53aa2afa 100644 --- a/span_marker/evaluation.py +++ b/span_marker/evaluation.py @@ -9,7 +9,9 @@ from span_marker.tokenizer import SpanMarkerTokenizer -def compute_f1_via_seqeval(tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction) -> Dict[str, float]: +def compute_f1_via_seqeval( + tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction, is_in_train: bool +) -> Dict[str, float]: """Compute micro-F1, recall, precision and accuracy scores using ``seqeval`` for the evaluation predictions. Note: @@ -98,7 +100,7 @@ def compute_f1_via_seqeval(tokenizer: SpanMarkerTokenizer, eval_prediction: Eval with warnings.catch_warnings(): warnings.simplefilter("ignore", UndefinedMetricWarning) results = seqeval.compute() - # `results` also contains e.g. "person-athlete": {'precision': 0.5982658959537572, 'recall': 0.9, 'f1': 0.71875, 'number': 230} - # logging this all is overkill. Tensorboard doesn't even support it, WandB does, but it's not very useful generally. 
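Since `compute_f1_via_seqeval` now receives `is_in_train`, an evaluation performed outside of a training run also returns the per-label dictionaries from `seqeval` next to the overall scores (see the updated trainer tests later in this patch). A hedged sketch of consuming that output, where the `"eval_person"` key is illustrative and depends on your label scheme:

```python
# Sketch: evaluating outside of training now yields per-label results
# alongside the overall scores. Assumes `trainer` has an eval_dataset.
metrics = trainer.evaluate()

print(metrics["eval_overall_f1"])  # overall micro-F1, a plain float as before
print(metrics["eval_person"])      # illustrative label key, e.g.
# {"precision": 0.92, "recall": 0.89, "f1": 0.90, "number": 230}
```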
- # I'd like to revisit this to expose this information somehow still - return {key: value for key, value in results.items() if isinstance(value, float)} + + if is_in_train: + return {key: value for key, value in results.items() if isinstance(value, float)} + return results diff --git a/span_marker/label_normalizer.py b/span_marker/label_normalizer.py index 87862278..9843ff19 100644 --- a/span_marker/label_normalizer.py +++ b/span_marker/label_normalizer.py @@ -27,8 +27,17 @@ def __init__(self, config: SpanMarkerConfig) -> None: self.config = config @abstractmethod - def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]: - raise NotImplementedError + def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]: + pass + + def __call__(self, tokens: List[List[str]], ner_tags: List[List[int]]) -> Dict[str, List[Any]]: + output = {"ner_tags": [], "entity_count": [], "word_count": []} + for tokens, ner_tags in zip(tokens, ner_tags): + ner_tags = list(self.ner_tags_to_entities(ner_tags)) + output["ner_tags"].append(ner_tags) + output["entity_count"].append(len(ner_tags)) + output["word_count"].append(len(tokens)) + return output class LabelNormalizerScheme(LabelNormalizer): @@ -57,9 +66,6 @@ def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]: if start_idx is not None: yield (reduced_label_id, start_idx, idx + 1) - def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]: - return {"tokens": tokens, "ner_tags": list(self.ner_tags_to_entities(ner_tags))} - class LabelNormalizerIOB(LabelNormalizerScheme): def __init__(self, config: SpanMarkerConfig) -> None: @@ -108,9 +114,6 @@ def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]: if start_idx is not None: yield (entity_label_id, start_idx, idx + 1) - def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]: - return {"tokens": tokens, "ner_tags": list(self.ner_tags_to_entities(ner_tags))} - class AutoLabelNormalizer: """Factory class to return the correct LabelNormalizer subclass.""" diff --git a/span_marker/model_card.py b/span_marker/model_card.py index 399cc6f4..e9c3a507 100644 --- a/span_marker/model_card.py +++ b/span_marker/model_card.py @@ -1,79 +1,494 @@ +import logging import os +import random +from dataclasses import dataclass, field, fields from pathlib import Path -from typing import Union +from platform import python_version +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -import jinja2 -from huggingface_hub import model_info -from huggingface_hub.utils import RepositoryNotFoundError +import datasets +import tokenizers +import torch +import transformers +from datasets import Dataset +from huggingface_hub import ( + CardData, + DatasetFilter, + ModelCard, + dataset_info, + list_datasets, + model_info, +) +from huggingface_hub.repocard_data import EvalResult, eval_results_to_model_index +from huggingface_hub.utils import yaml_dump +from transformers import TrainerCallback +from transformers.integrations import CodeCarbonCallback +from transformers.modelcard import ( + extract_hyperparameters_from_trainer, + make_markdown_table, +) +from transformers.trainer_callback import TrainerControl, TrainerState +from transformers.training_args import TrainingArguments -from span_marker.configuration import SpanMarkerConfig +import span_marker -MODEL_CARD_TEMPLATE = """ ---- -license: apache-2.0 -library_name: span-marker -tags: -- span-marker -- token-classification -- ner -- 
named-entity-recognition -pipeline_tag: token-classification ---- +logger = logging.getLogger(__name__) -# SpanMarker for Named Entity Recognition +if TYPE_CHECKING: + from span_marker.modeling import SpanMarkerModel + from span_marker.trainer import Trainer -This is a [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) model that can be used \ -for Named Entity Recognition. {% if encoder_name_or_path %}In particular, this SpanMarker model uses \ -{% if is_public_model %}\ -[{{ encoder_name_or_path }}](https://huggingface.co/{{ encoder_name_or_path }})\ -{% else %}\ -"{{ encoder_name_or_path }}"\ -{% endif %} as the underlying encoder. {% endif %} -## Usage +class ModelCardCallback(TrainerCallback): + def __init__(self, trainer: "Trainer") -> None: + super().__init__() + self.trainer = trainer -To use this model for inference, first install the `span_marker` library: + callbacks = [ + callback for callback in self.trainer.callback_handler.callbacks if isinstance(callback, CodeCarbonCallback) + ] + if callbacks: + trainer.model.model_card_data.code_carbon_callback = callbacks[0] -```bash -pip install span_marker -``` + def on_train_begin( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: "SpanMarkerModel", **kwargs + ): + model.model_card_data.hyperparameters = extract_hyperparameters_from_trainer(self.trainer) -You can then run inference with this model like so: + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: "SpanMarkerModel", + metrics: Dict[str, float], + **kwargs, + ): + # Set the most recent evaluation scores for the metadata + model.model_card_data.eval_results_dict = metrics -```python -from span_marker import SpanMarkerModel + if self.trainer.is_in_train: + # Either set mid-training evaluation metrics + if "eval_loss" in metrics: + model.model_card_data.eval_lines_list.append( + { + # "Training Loss": self.state.log_history[-1]["loss"] if "loss" in self.state.log_history[-1] else "-", + "Epoch": state.epoch, + "Step": state.global_step, + "Validation Loss": metrics["eval_loss"], + "Validation Precision": metrics["eval_overall_precision"], + "Validation Recall": metrics["eval_overall_recall"], + "Validation F1": metrics["eval_overall_f1"], + "Validation Accuracy": metrics["eval_overall_accuracy"], + } + ) + else: + # Or set the post-training metrics + # Determine the dataset split + runtime_key = [key for key in metrics.keys() if key.endswith("_runtime")] + if not runtime_key: + return + dataset_split = runtime_key[0][: -len("_runtime")] -# Download from the 🤗 Hub -model = SpanMarkerModel.from_pretrained({% if model_name_or_path %}"{{ model_name_or_path }}"{% else %}"span_marker_model_name"{% endif %}) -# Run inference -entities = model.predict("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.") -``` + metric_lines = [] + for key, value in metrics.items(): + if not isinstance(value, float): + metric_lines.append( + { + "Label": key[len(dataset_split) + 1 :], + "Precision": value["precision"], + "Recall": value["recall"], + "F1": value["f1"], + } + ) + metric_lines.insert( + 0, + { + "Label": "**all**", + "Precision": metrics[f"{dataset_split}_overall_precision"], + "Recall": metrics[f"{dataset_split}_overall_recall"], + "F1": metrics[f"{dataset_split}_overall_f1"], + }, + ) + model.model_card_data.metric_lines = metric_lines -See the [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) repository for documentation and additional 
information on this library. -""" +YAML_FIELDS = [ + "language", + "license", + "library_name", + "tags", + "datasets", + "metrics", + "pipeline_tag", + "widget", + "model-index", + "co2_eq_emissions", + "base_model", +] +IGNORED_FIELDS = ["model"] -def is_public_model(encoder_name_or_path: str) -> bool: + + +@dataclass +class SpanMarkerModelCardData(CardData): + """A dataclass storing data used in the model card. + + Args: + language (Optional[Union[str, List[str]]]): The model language, either a string or a list, + e.g. "en" or ["en", "de", "nl"]. + license (Optional[str]): The license of the model, e.g. "apache-2.0", "mit" + or "cc-by-nc-sa-4.0". + model_name (Optional[str]): The pretty name of the model, e.g. "SpanMarker with mBERT-base on CoNLL03". + If not defined, uses encoder_name/encoder_id and dataset_name/dataset_id to generate a model name. + model_id (Optional[str]): The model ID when pushing the model to the Hub, + e.g. "tomaarsen/span-marker-mbert-base-multinerd". + encoder_name (Optional[str]): The pretty name of the encoder, e.g. "mBERT-base". + encoder_id (Optional[str]): The model ID of the encoder, e.g. "bert-base-multilingual-cased". + dataset_name (Optional[str]): The pretty name of the dataset, e.g. "CoNLL03". + dataset_id (Optional[str]): The dataset ID of the dataset, e.g. "tner/bionlp2004". + dataset_revision (Optional[str]): The dataset revision/commit that was used for training/evaluation. + + Note: + + Install ``nltk`` to detokenize the examples used in the model card, i.e. attach punctuation and brackets. + Additionally, ``codecarbon`` can be installed to automatically track carbon emission usage. + + Example:: + + >>> model = SpanMarkerModel.from_pretrained( + ... "bert-base-uncased", + ... labels=["O", "B-DNA", "I-DNA", "B-protein", ...], + ... # SpanMarker hyperparameters: + ... model_max_length=256, + ... marker_max_length=128, + ... entity_max_length=8, + ... # Model card variables + ... model_card_data=SpanMarkerModelCardData( + ... model_id="tomaarsen/span-marker-bbu-bionlp", + ... encoder_id="bert-base-uncased", + ... dataset_name="BioNLP2004", + ... dataset_id="tner/bionlp2004", + ... license="apache-2.0", + ... language="en", + ... ), + ... 
) + """ + + # Potentially provided by the user + language: Optional[Union[str, List[str]]] = None + license: Optional[str] = None + tags: Optional[List[str]] = field( + default_factory=lambda: [ + "span-marker", + "token-classification", + "ner", + "named-entity-recognition", + "generated_from_span_marker_trainer", + ] + ) + model_name: Optional[str] = None + model_id: Optional[str] = None + encoder_name: Optional[str] = None + encoder_id: Optional[str] = None + dataset_name: Optional[str] = None + dataset_id: Optional[str] = None + dataset_revision: Optional[str] = None + task_name: str = "Named Entity Recognition" + + # Automatically filled by `ModelCardCallback` and the Trainer directly + hyperparameters: Dict[str, Any] = field(default_factory=dict, init=False) + eval_results_dict: Optional[Dict[str, Any]] = field(default_factory=dict, init=False) + eval_lines_list: List[Dict[str, float]] = field(default_factory=list, init=False) + metric_lines: List[Dict[str, float]] = field(default_factory=list, init=False) + widget: List[Dict[str, str]] = field(default_factory=list, init=False) + predict_example: Optional[str] = field(default=None, init=False) + label_example_list: List[Dict[str, str]] = field(default_factory=list, init=False) + tokenizer_warning: bool = field(default=False, init=False) + train_set_metrics_list: List[Dict[str, str]] = field(default_factory=list, init=False) + code_carbon_callback: Optional[CodeCarbonCallback] = field(default=None, init=False) + + # Computed once, always unchanged + pipeline_tag: str = field(default="token-classification", init=False) + library_name: str = field(default="span-marker", init=False) + version: Dict[str, str] = field( + default_factory=lambda: { + "python": python_version(), + "span_marker": span_marker.__version__, + "transformers": transformers.__version__, + "torch": torch.__version__, + "datasets": datasets.__version__, + "tokenizers": tokenizers.__version__, + }, + init=False, + ) + metrics: List[str] = field(default_factory=lambda: ["precision", "recall", "f1"], init=False) + + # Passed via `register_model` only + model: Optional["SpanMarkerModel"] = field(default=None, init=False, repr=False) + + def __post_init__(self): + # We don't want to save "ignore_metadata_errors" in our Model Card + if self.dataset_id: + if is_on_huggingface(self.dataset_id, is_model=False): + # if languages are not set, try to determine the language from the dataset on the Hub + try: + info = dataset_info(self.dataset_id) + except: + pass + else: + if info.cardData: + self.language = info.cardData.get("language", self.language) + else: + logger.warning( + f"The provided {self.dataset_id!r} dataset could not be found on the Hugging Face Hub." + " Setting `dataset_id` to None." + ) + self.dataset_id = None + + if self.encoder_id and not is_on_huggingface(self.encoder_id): + logger.warning( + f"The provided {self.encoder_id!r} model could not be found on the Hugging Face Hub." + " Setting `encoder_id` to None." + ) + self.encoder_id = None + + if self.model_id and self.model_id.count("/") != 1: + logger.warning( + f"The provided {self.model_id!r} model ID should include the organization or user," + ' such as "tomaarsen/span-marker-mbert-base-multinerd". Setting `model_id` to None.' + ) + self.model_id = None + + def set_widget_examples(self, dataset: Dataset) -> None: + # If NLTK is installed, use its detokenization. Otherwise, join by spaces. 
+ try: + from nltk.tokenize.treebank import TreebankWordDetokenizer + + detokenize = TreebankWordDetokenizer().detokenize + + def map_detokenize(tokens) -> Dict[str, str]: + return {"text": detokenize(tokens)} + + except ImportError: + + def map_detokenize(tokens) -> Dict[str, str]: + return {"text": " ".join(tokens)} + + # Out of `sample_subset_size=100` random samples, select `example_count=5` good examples + # based on the number of unique entity classes. + # The shortest example is used in the inference example + sample_subset_size = 100 + example_count = 5 + if len(dataset) > sample_subset_size: + example_dataset = dataset.select(random.sample(range(len(dataset)), k=sample_subset_size)) + else: + example_dataset = dataset + + def count_entities(sample: Dict[str, Any]) -> Dict[str, int]: + unique_count = {reduced_label_id for reduced_label_id, _, _ in sample["ner_tags"]} + return {"unique_entity_count": len(unique_count)} + + example_dataset = ( + example_dataset.map(count_entities) + .sort(("unique_entity_count", "entity_count"), reverse=True) + .select(range(min(len(example_dataset), example_count))) + .map(map_detokenize, input_columns="tokens") + ) + self.widget = [{"text": sample["text"]} for sample in example_dataset] + + shortest_example = example_dataset.sort("word_count")[0]["text"] + self.predict_example = shortest_example + + def set_train_set_metrics(self, dataset: Dataset) -> None: + self.train_set_metrics_list = [ + { + "Training set": "Sentence length", + "Min": min(dataset["word_count"]), + "Median": sum(dataset["word_count"]) / len(dataset), + "Max": max(dataset["word_count"]), + }, + { + "Training set": "Entities per sentence", + "Min": min(dataset["entity_count"]), + "Median": sum(dataset["entity_count"]) / len(dataset), + "Max": max(dataset["entity_count"]), + }, + ] + + def set_label_examples(self, dataset: Dataset, id2label: Dict[int, str], outside_id: int) -> None: + num_examples_per_label = 3 + examples = {label: set() for label_id, label in id2label.items() if label_id != outside_id} + unfinished_entity_ids = set(id2label.keys()) - {outside_id} + for sample in dataset: + for entity_id, start, end in sample["ner_tags"]: + if entity_id in unfinished_entity_ids: + entity = id2label[entity_id] + example = " ".join(sample["tokens"][start:end]) + examples[entity].add(f'"{example}"') + if len(examples[entity]) >= num_examples_per_label: + unfinished_entity_ids.remove(entity_id) + if not unfinished_entity_ids: + break + self.label_example_list = [ + {"Label": label, "Examples": ", ".join(example_set)} for label, example_set in examples.items() + ] + + def infer_dataset_id(self, dataset: Dataset) -> None: + def subtuple_finder(tuple: Tuple[str], subtuple: Tuple[str]) -> int: + for i, element in enumerate(tuple): + if element == subtuple[0] and tuple[i : i + len(subtuple)] == subtuple: + return i + return -1 + + def normalize(dataset_id: str) -> str: + for token in "/\\_-": + dataset_id = dataset_id.replace(token, "") + return dataset_id.lower() + + if (cache_files := dataset.cache_files) and "filename" in cache_files[0]: + cache_path_parts = Path(cache_files[0]["filename"]).parts + # Check if the cachefile is under "huggingface/datasets" + subtuple = ("huggingface", "datasets") + index = subtuple_finder(cache_path_parts, subtuple) + if index == -1: + return + + # Get the folder after "huggingface/datasets" + cache_dataset_name = cache_path_parts[index + len(subtuple)] + # If the dataset has an author: + if "___" in cache_dataset_name: + author, dataset_name = 
cache_dataset_name.split("___") + else: + author = None + dataset_name = cache_dataset_name + + # Make sure the normalized dataset IDs match + dataset_list = [ + dataset + for dataset in list_datasets(filter=DatasetFilter(author=author, dataset_name=dataset_name)) + if normalize(dataset.id) == normalize(cache_dataset_name) + ] + # If there's only one match, get the ID from it + if len(dataset_list) == 1: + self.dataset_id = dataset_list[0].id + + def register_model(self, model: "SpanMarkerModel") -> None: + self.model = model + + if self.encoder_id is None: + encoder_id_or_path = self.model.config.get("_name_or_path") + if not os.path.exists(encoder_id_or_path): + self.encoder_id = encoder_id_or_path + + if not self.model_name: + if self.encoder_id: + self.model_name = f"SpanMarker with {self.encoder_name or self.encoder_id}" + if self.dataset_name or self.dataset_id: + self.model_name += f" on {self.dataset_name or self.dataset_id}" + else: + self.model_name = "SpanMarker" + + def to_dict(self) -> Dict[str, Any]: + super_dict = {field.name: getattr(self, field.name) for field in fields(self)} + + # Compute required formats from the raw data + if self.eval_results_dict: + dataset_split = list(self.eval_results_dict.keys())[0].split("_")[0] + dataset_id = self.dataset_id or "unknown" + dataset_name = self.dataset_name or "Unknown" + eval_results = [ + EvalResult( + task_type="token-classification", + dataset_type=dataset_id, + dataset_name=dataset_name, + metric_type="f1", + metric_value=self.eval_results_dict[f"{dataset_split}_overall_f1"], + task_name="Named Entity Recognition", + dataset_split=dataset_split, + dataset_revision=self.dataset_revision, + metric_name="F1", + ), + EvalResult( + task_type="token-classification", + dataset_type=dataset_id, + dataset_name=dataset_name, + metric_type="precision", + metric_value=self.eval_results_dict[f"{dataset_split}_overall_precision"], + task_name="Named Entity Recognition", + dataset_split=dataset_split, + dataset_revision=self.dataset_revision, + metric_name="Precision", + ), + EvalResult( + task_type="token-classification", + dataset_type=dataset_id, + dataset_name=dataset_name, + metric_type="recall", + metric_value=self.eval_results_dict[f"{dataset_split}_overall_recall"], + task_name="Named Entity Recognition", + dataset_split=dataset_split, + dataset_revision=self.dataset_revision, + metric_name="Recall", + ), + ] + super_dict["model-index"] = eval_results_to_model_index(self.model_name, eval_results) + super_dict["eval_lines"] = make_markdown_table(self.eval_lines_list) + # Replace |:---:| with |:---| for left alignment + super_dict["label_examples"] = make_markdown_table(self.label_example_list).replace("-:|", "--|") + super_dict["train_set_metrics"] = make_markdown_table(self.train_set_metrics_list).replace("-:|", "--|") + super_dict["metrics_table"] = make_markdown_table(self.metric_lines).replace("-:|", "--|") + if self.code_carbon_callback and self.code_carbon_callback.tracker: + emissions_data = self.code_carbon_callback.tracker._prepare_emissions_data() + super_dict["co2_eq_emissions"] = { + # * 1000 to convert kg to g + "emissions": float(emissions_data.emissions) * 1000, + "source": "codecarbon", + "training_type": "fine-tuning", + "on_cloud": emissions_data.on_cloud == "Y", + "cpu_model": emissions_data.cpu_model, + "ram_total_size": emissions_data.ram_total_size, + "hours_used": round(emissions_data.duration / 3600, 3), + } + if emissions_data.gpu_model: + super_dict["co2_eq_emissions"]["hardware_used"] = 
emissions_data.gpu_model + if self.dataset_id: + super_dict["datasets"] = [self.dataset_id] + if self.encoder_id: + super_dict["base_model"] = self.encoder_id + super_dict["model_max_length"] = self.model.tokenizer.model_max_length + + for key in IGNORED_FIELDS: + super_dict.pop(key, None) + return { + **self.model.config.to_dict(), + **super_dict, + } + + def to_yaml(self, line_break=None) -> str: + return yaml_dump( + {key: value for key, value in self.to_dict().items() if key in YAML_FIELDS and value is not None}, + sort_keys=False, + line_break=line_break, + ).strip() + + +def is_on_huggingface(repo_id: str, is_model: bool = True) -> bool: # Models with more than two 'sections' certainly are not public models - if len(encoder_name_or_path.split("/")) > 2: + if len(repo_id.split("/")) > 2: return False try: - model_info(encoder_name_or_path) + if is_model: + model_info(repo_id) + else: + dataset_info(repo_id) return True - except RepositoryNotFoundError: + except: + # Fetching models can fail for many reasons: Repository not existing, no internet access, HF down, etc. return False -def generate_model_card(save_directory: Union[str, os.PathLike], config: SpanMarkerConfig) -> str: - template = jinja2.Environment().from_string(MODEL_CARD_TEMPLATE) - save_directory = Path(save_directory) - context = {} - - context["model_name_or_path"] = "span_marker_model_name" - - if "_name_or_path" in config.encoder: - context["encoder_name_or_path"] = config.encoder["_name_or_path"] - context["is_public_model"] = is_public_model(context["encoder_name_or_path"]) - - return template.render(context) +def generate_model_card(model: "SpanMarkerModel") -> str: + template_path = Path(__file__).parent / "model_card_template.md" + model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗") + return model_card.content diff --git a/span_marker/model_card_template.md b/span_marker/model_card_template.md new file mode 100644 index 00000000..e8430a7e --- /dev/null +++ b/span_marker/model_card_template.md @@ -0,0 +1,167 @@ +--- +# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1 +# Doc / guide: https://huggingface.co/docs/hub/model-cards +{{ card_data }} +--- + +# {{ model_name | default("SpanMarker for Named Entity Recognition", true) }} + +This is a [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) model{% if dataset_id %} trained on the [{{ dataset_name if dataset_name else dataset_id }}](https://huggingface.co/datasets/{{ dataset_id }}) dataset{% endif %} that can be used for {{ task_name | default("Named Entity Recognition", true) }}.{% if encoder_id %} This SpanMarker model uses [{{ encoder_name if encoder_name else encoder_id }}](https://huggingface.co/{{ encoder_id }}) as the underlying encoder.{% endif %} + +## Model Details + +### Model Description +- **Model Type:** SpanMarker +{% if encoder_id -%} + - **Encoder:** [{{ encoder_name if encoder_name else encoder_id }}](https://huggingface.co/{{ encoder_id }}) +{%- else -%} + +{%- endif %} +- **Maximum Sequence Length:** {{ model_max_length }} tokens +- **Maximum Entity Length:** {{ entity_max_length }} words +{% if dataset_id -%} + - **Training Dataset:** [{{ dataset_name if dataset_name else dataset_id }}](https://huggingface.co/datasets/{{ dataset_id }}) +{%- else -%} + +{%- endif %} +{% if language -%} + - **Language{{"s" if language is not string and language | length > 1 else ""}}:** + {%- if language is string %} {{ 
language }} + {%- else %} {% for lang in language -%} + {{ lang }}{{ ", " if not loop.last else "" }} + {%- endfor %} + {%- endif %} +{%- else -%} + +{%- endif %} +{% if license -%} + - **License:** {{ license }} +{%- else -%} + +{%- endif %} + +### Model Sources + +- **Repository:** [SpanMarker on GitHub](https://github.com/tomaarsen/SpanMarkerNER) +- **Thesis:** [SpanMarker For Named Entity Recognition](https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf) +{% if label_examples %} +### Model Labels +{{ label_examples }}{% endif -%} +{% if metrics_table %} +## Evaluation + +### Metrics +{{ metrics_table }}{% endif %} +## Uses + +### Direct Use for Inference + +```python +from span_marker import SpanMarkerModel + +# Download from the {{ hf_emoji }} Hub +model = SpanMarkerModel.from_pretrained("{{ model_id | default('span_marker_model_id', true) }}") +# Run inference +entities = model.predict("{{ predict_example | replace('"', '\\"') | default("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.", true)}}") +``` + +### Downstream Use +You can finetune this model on your own dataset. + +
Click to expand + +```python +from span_marker import SpanMarkerModel, Trainer + +# Download from the {{ hf_emoji }} Hub +model = SpanMarkerModel.from_pretrained("{{ model_id | default('span_marker_model_id', true) }}") + +# Specify a Dataset with "tokens" and "ner_tag" columns +dataset = load_dataset("conll2003") # For example CoNLL2003 + +# Initialize a Trainer using the pretrained model & dataset +trainer = Trainer( + model=model, + train_dataset=dataset["train"], + eval_dataset=dataset["validation"], +) +trainer.train() +trainer.save_model("{{ model_id | default('span_marker_model_id', true) }}-finetuned") +``` +
+ + + + + + + +## Training Details +{% if train_set_metrics %} +### Training Set Metrics +{{ train_set_metrics }}{% endif %}{% if hyperparameters %} +### Training Hyperparameters +{% for name, value in hyperparameters.items() %}- {{ name }}: {{ value }} +{% endfor %}{% endif %}{% if eval_lines %} +### Training Results +{{ eval_lines }}{% endif %}{% if co2_eq_emissions %} +### Environmental Impact +Carbon emissions were measured using [CodeCarbon](https://github.com/mlco2/codecarbon). +- **Carbon Emitted**: {{ "%.3f"|format(co2_eq_emissions["emissions"] / 1000) }} kg of CO2 +- **Hours Used**: {{ co2_eq_emissions["hours_used"] }} hours + +### Training Hardware +- **On Cloud**: {{ "Yes" if co2_eq_emissions["on_cloud"] else "No" }} +- **GPU Model**: {{ co2_eq_emissions["hardware_used"] or "No GPU used" }} +- **CPU Model**: {{ co2_eq_emissions["cpu_model"] }} +- **RAM Size**: {{ "%.2f"|format(co2_eq_emissions["ram_total_size"]) }} GB +{% endif %} +### Framework Versions +- Python: {{ version["python"] }} +- SpanMarker: {{ version["span_marker"] }} +- Transformers: {{ version["transformers"] }} +- PyTorch: {{ version["torch"] }} +- Datasets: {{ version["datasets"] }} +- Tokenizers: {{ version["tokenizers"] }} + +## Citation + +### BibTeX +``` +@software{Aarsen_SpanMarker, + author = {Aarsen, Tom}, + license = {Apache-2.0}, + title = {{"{{SpanMarker for Named Entity Recognition}}"}}, + url = {https://github.com/tomaarsen/SpanMarkerNER} +} +``` + + + + + + \ No newline at end of file diff --git a/span_marker/modeling.py b/span_marker/modeling.py index 631aea7f..8f223458 100644 --- a/span_marker/modeling.py +++ b/span_marker/modeling.py @@ -15,7 +15,7 @@ from span_marker import __version__ as span_marker_version from span_marker.configuration import SpanMarkerConfig from span_marker.data_collator import SpanMarkerDataCollator -from span_marker.model_card import generate_model_card +from span_marker.model_card import SpanMarkerModelCardData, generate_model_card from span_marker.output import SpanMarkerOutput from span_marker.tokenizer import SpanMarkerTokenizer @@ -50,7 +50,13 @@ class SpanMarkerModel(PreTrainedModel): base_model_prefix = "encoder" _no_split_modules = [] # To support `load_in_8bit=True`` and `device_map="auto"` - def __init__(self, config: SpanMarkerConfig, encoder: Optional[PreTrainedModel] = None, **kwargs) -> None: + def __init__( + self, + config: SpanMarkerConfig, + encoder: Optional[PreTrainedModel] = None, + model_card_data: Optional[SpanMarkerModelCardData] = None, + **kwargs, + ) -> None: """Initialize a SpanMarkerModel using configuration. Do not manually initialize a SpanMarkerModel this way! Use :meth:`~SpanMarkerModel.from_pretrained` instead. @@ -89,6 +95,9 @@ def __init__(self, config: SpanMarkerConfig, encoder: Optional[PreTrainedModel] self.tokenizer = None self.data_collator = None + self.model_card_data = model_card_data or SpanMarkerModelCardData() + self.model_card_data.register_model(self) + # Initialize weights and apply final processing self.post_init() @@ -199,6 +208,8 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, os.PathLike], *model_args, labels: Optional[List[str]] = None, + config: Optional[SpanMarkerConfig] = None, + model_card_data: Optional[SpanMarkerModelCardData] = None, **kwargs, ) -> T: """Instantiate a pretrained pytorch model from a pre-trained model configuration. 
@@ -240,7 +251,9 @@ def from_pretrained( """ # If loading a SpanMarkerConfig, then we don't want to override id2label and label2id # Create an encoder or SpanMarker config - config: PretrainedConfig = AutoConfig.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + config: PretrainedConfig = config or AutoConfig.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) # if 'pretrained_model_name_or_path' refers to a SpanMarkerModel instance, initialize it directly loading_span_marker = isinstance(config, cls.config_class) @@ -253,7 +266,9 @@ def from_pretrained( " introduced in v1.0.0, this is not recommended. Either retrain your model for" f" v{span_marker_version}, or install `span_marker < 1.0.0`." ) - model = super().from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + model = super().from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **kwargs, model_card_data=model_card_data + ) # If 'pretrained_model_name_or_path' refers to an encoder (roberta, bert, distilbert, electra, etc.), # then initialize it and create the SpanMarker config and model using the encoder and its config. @@ -280,7 +295,7 @@ def from_pretrained( config = cls.config_class( encoder_config=config.to_dict(), span_marker_version=span_marker_version, **kwargs ) - model = cls(config, encoder, *model_args, **kwargs) + model = cls(config, encoder, *model_args, **kwargs, model_card_data=model_card_data) # Pass the tokenizer directly to the model for convenience, this way the user doesn't have to # make it themselves. @@ -288,7 +303,12 @@ def from_pretrained( config.encoder.get("_name_or_path", pretrained_model_name_or_path), config=config, **kwargs ) model.set_tokenizer(tokenizer) - model.resize_token_embeddings(len(tokenizer)) + # Since transformers 4.32.0 we should use `pad_to_multiple_of=8`. + # That'll fail for earlier versions, so we try-except it. + try: + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) + except TypeError: + model.resize_token_embeddings(len(tokenizer)) return model @classmethod @@ -585,7 +605,15 @@ def save_pretrained( **kwargs, ) with open(os.path.join(save_directory, "README.md"), "w", encoding="utf-8") as f: - f.write(generate_model_card(save_directory, self.config)) + f.write(self.generate_model_card()) + + def generate_model_card(self) -> str: + """Generate and return a model card string based on the model card data. + + Returns: + str: The model card string. + """ + return generate_model_card(self) def try_cuda(self, device: Optional[Union[int, device]] = None) -> Self: """Try to moves all model parameters and buffers to the GPU, do nothing if failed. diff --git a/span_marker/tokenizer.py b/span_marker/tokenizer.py index 59a85b84..89135f2a 100644 --- a/span_marker/tokenizer.py +++ b/span_marker/tokenizer.py @@ -5,7 +5,8 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import numpy as np -from transformers import AutoTokenizer, PreTrainedTokenizer +from tokenizers.pre_tokenizers import Punctuation, Sequence +from transformers import AutoTokenizer, PreTrainedTokenizer, XLMRobertaTokenizerFast from span_marker.configuration import SpanMarkerConfig @@ -269,4 +270,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, *inputs, **kwargs, add_prefix_space=True ) + # XLM-R is known to have some tokenization issues, so be sure to also split on punctuation. 
+ # Strictly required for inference, shouldn't affect training. + if isinstance(tokenizer, XLMRobertaTokenizerFast): + tokenizer._tokenizer.pre_tokenizer = Sequence([Punctuation(), tokenizer._tokenizer.pre_tokenizer]) + return cls(tokenizer, config=config, **kwargs) diff --git a/span_marker/trainer.py b/span_marker/trainer.py index bc731fba..9ae4370f 100644 --- a/span_marker/trainer.py +++ b/span_marker/trainer.py @@ -1,6 +1,7 @@ import dataclasses import logging import math +import os from typing import Any, Callable, Dict, List, Optional, Tuple import torch @@ -17,6 +18,7 @@ from span_marker.evaluation import compute_f1_via_seqeval from span_marker.label_normalizer import AutoLabelNormalizer, LabelNormalizer +from span_marker.model_card import ModelCardCallback from span_marker.modeling import SpanMarkerModel from span_marker.tokenizer import SpanMarkerTokenizer @@ -107,7 +109,7 @@ def __init__( # Set some Training arguments that must be set for SpanMarker if args is None: args = TrainingArguments( - output_dir="models/my_span_marker_model", include_inputs_for_metrics=True, remove_unused_columns=True + output_dir="models/my_span_marker_model", include_inputs_for_metrics=True, remove_unused_columns=False ) else: args = dataclasses.replace(args, include_inputs_for_metrics=True, remove_unused_columns=False) @@ -115,11 +117,25 @@ def __init__( # Always compute `compute_f1_via_seqeval` - optionally compute user-provided metrics if compute_metrics is not None: compute_metrics_func = lambda eval_prediction: { - **compute_f1_via_seqeval(model.tokenizer, eval_prediction), + **compute_f1_via_seqeval(model.tokenizer, eval_prediction, self.is_in_train), **compute_metrics(eval_prediction), } else: - compute_metrics_func = lambda eval_prediction: compute_f1_via_seqeval(model.tokenizer, eval_prediction) + compute_metrics_func = lambda eval_prediction: compute_f1_via_seqeval( + model.tokenizer, eval_prediction, self.is_in_train + ) + + # If the model ID is set via the TrainingArguments, but not via the SpanMarkerModelCardData, + # then we can set it here for the model card regardless + if args.hub_model_id and not model.model_card_data.model_id: + model.model_card_data.model_id = args.hub_model_id + + if not model.model_card_data.dataset_id: + # Inferring is hacky - it may break in the future, so let's be safe + try: + model.model_card_data.infer_dataset_id(train_dataset) + except Exception: + pass super().__init__( model=model, @@ -143,6 +159,10 @@ def __init__( # Override the type hint self.model: SpanMarkerModel + # Add the callback for filling the model card data with hyperparameters + # and evaluation results + self.add_callback(ModelCardCallback(self)) + def preprocess_dataset( self, dataset: Dataset, @@ -177,11 +197,31 @@ def preprocess_dataset( set(dataset.column_names) - set(self.OPTIONAL_COLUMNS) - set(self.REQUIRED_COLUMNS) ) # Normalize the labels to a common format (list of label-start-end tuples) + # Also add "entity_count" and "word_count" labels dataset = dataset.map( label_normalizer, input_columns=("tokens", "ner_tags"), desc=f"Label normalizing the {dataset_name} dataset", + batched=True, ) + + # Setting model card data based on training data + if not is_evaluate: + # Pick some example entities from each entity class for the model card. 
+ if not self.model.model_card_data.label_example_list: + self.model.model_card_data.set_label_examples( + dataset, self.model.config.id2label, self.model.config.outside_id + ) + if not self.model.model_card_data.train_set_metrics_list: + self.model.model_card_data.set_train_set_metrics(dataset) + + # Set some example sentences for the model card widget + if is_evaluate and not self.model.model_card_data.widget: + self.model.model_card_data.set_widget_examples(dataset) + + # Remove dataset columns that are only used for the model card + dataset = dataset.remove_columns(("entity_count", "word_count")) + # Tokenize and add start/end markers with tokenizer.entity_tracker(split=dataset_name): dataset = dataset.map( @@ -393,3 +433,11 @@ def predict( f"Consider using `{self.model.__class__.__name__}.predict` instead." ) return super().predict(test_dataset, ignore_keys, metric_key_prefix) + + def create_model_card(self, *_args, **_kwargs) -> None: + """ + Creates a draft of a model card using the information available to the `Trainer`, + the `SpanMarkerModel` and the `SpanMarkerModelCardData`. + """ + with open(os.path.join(self.args.output_dir, "README.md"), "w", encoding="utf8") as f: + f.write(self.model.generate_model_card()) diff --git a/tests/conftest.py b/tests/conftest.py index 73148b79..6f1f8746 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,10 +30,7 @@ def randomize_seed() -> None: yield - from datasets.fingerprint import ( - _TEMP_DIR_FOR_TEMP_CACHE_FILES, - get_temporary_cache_files_directory, - ) + from datasets.fingerprint import _TEMP_DIR_FOR_TEMP_CACHE_FILES if _TEMP_DIR_FOR_TEMP_CACHE_FILES: _TEMP_DIR_FOR_TEMP_CACHE_FILES._cleanup() diff --git a/tests/constants.py b/tests/constants.py index fe31f1b4..f4c70aaa 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -124,4 +124,4 @@ ] TINY_BERT = "prajjwal1/bert-tiny" -DEFAULT_ARGS = TrainingArguments(output_dir="models/my_span_marker_model", report_to="none") +DEFAULT_ARGS = TrainingArguments(output_dir="models/my_span_marker_model", report_to="none", num_train_epochs=1) diff --git a/tests/model_card_pattern.py b/tests/model_card_pattern.py new file mode 100644 index 00000000..72be8318 --- /dev/null +++ b/tests/model_card_pattern.py @@ -0,0 +1,217 @@ +import re + +MODEL_CARD_PATTERN = re.compile( + """\
+---
+language:
+- en
+license: apache-2\.0
+library_name: span-marker
+tags:
+- span-marker
+- token-classification
+- ner
+- named-entity-recognition
+- generated_from_span_marker_trainer
+datasets:
+- conll2003
+metrics:
+- precision
+- recall
+- f1
+widget:
+- text: .*
+pipeline_tag: token-classification
+co2_eq_emissions:
+  emissions: [\d\.\-e]+
+  source: codecarbon
+  training_type: fine-tuning
+  on_cloud: (false|true)
+  cpu_model: .+
+  ram_total_size: [\d\.]+
+  hours_used: [\d\.]+
+( hardware_used: .+
+)?base_model: prajjwal1/bert-tiny
+model-index:
+- name: SpanMarker with prajjwal1/bert-tiny on CoNLL 2003
+  results:
+  - task:
+      type: token-classification
+      name: Named Entity Recognition
+    dataset:
+      name: CoNLL 2003
+      type: conll2003
+      split: eval
+    metrics:
+    - type: f1
+      value: [\d\.]+
+      name: F1
+    - type: precision
+      value: [\d\.]+
+      name: Precision
+    - type: recall
+      value: [\d\.]+
+      name: Recall
+---
+
+# SpanMarker with prajjwal1/bert-tiny on CoNLL 2003
+
+This is a \[SpanMarker\]\(https://github.com/tomaarsen/SpanMarkerNER\) model trained on the \[CoNLL 2003\]\(https://huggingface.co/datasets/conll2003\) dataset that can be used for Named Entity Recognition. 
This SpanMarker model uses \[prajjwal1/bert-tiny\]\(https://huggingface.co/prajjwal1/bert-tiny\) as the underlying encoder. + +## Model Details + +### Model Description +- \*\*Model Type:\*\* SpanMarker +- \*\*Encoder:\*\* \[prajjwal1/bert-tiny\]\(https://huggingface.co/prajjwal1/bert-tiny\) +- \*\*Maximum Sequence Length:\*\* 512 tokens +- \*\*Maximum Entity Length:\*\* 8 words +- \*\*Training Dataset:\*\* \[CoNLL 2003\]\(https://huggingface.co/datasets/conll2003\) +- \*\*Language:\*\* en +- \*\*License:\*\* apache-2.0 + +### Model Sources + +- \*\*Repository:\*\* \[SpanMarker on GitHub\]\(https://github.com/tomaarsen/SpanMarkerNER\) +- \*\*Thesis:\*\* \[SpanMarker For Named Entity Recognition\]\(https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf\) + +### Model Labels +\| Label \| Examples \| +\|:-------------\|:--------------------------------------------\| +\| art \| \| +\| building \| \| +\| event \| \| +\| location \| \| +\| organization \| \| +\| other \| \| +\| person \| [^\|]+ \| +\| product \| \| + +## Uses + +### Direct Use for Inference + +```python +from span_marker import SpanMarkerModel + +# Download from the [^H]+ Hub +model = SpanMarkerModel.from_pretrained\("tomaarsen/span-marker-test-model-card"\) +# Run inference +entities = model.predict\(".+"\) +``` + +### Downstream Use +You can finetune this model on your own dataset. + +
Click to expand + +```python +from span_marker import SpanMarkerModel, Trainer + +# Download from the [^H]+ Hub +model = SpanMarkerModel.from_pretrained\("tomaarsen/span-marker-test-model-card"\) + +# Specify a Dataset with "tokens" and "ner_tag" columns +dataset = load_dataset\("conll2003"\) # For example CoNLL2003 + +# Initialize a Trainer using the pretrained model & dataset +trainer = Trainer\( + model=model, + train_dataset=dataset\["train"\], + eval_dataset=dataset\["validation"\], +\) +trainer.train\(\) +trainer.save_model\("tomaarsen/span-marker-test-model-card-finetuned"\) +``` +
+ + + + + + + +## Training Details + +### Training Set Metrics +\| Training set \| Min \| Median \| Max \| +\|:----------------------\|:----\|:-------\|:----\| +\| Sentence length \| 4 \| 8.0 \| 12 \| +\| Entities per sentence \| 0 \| 1.5 \| 3 \| + +### Training Hyperparameters +- learning_rate: 5e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Adam with betas=\(0.9,0.999\) and epsilon=1e-08 +- lr_scheduler_type: linear +- num_epochs: 1 + +### Training Results +\| Epoch \| Step \| Validation Loss \| Validation Precision \| Validation Recall \| Validation F1 \| Validation Accuracy \| +\|:-----:\|:----:\|:---------------:\|:--------------------:\|:-----------------:\|:-------------:\|:-------------------:\| +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| + +### Environmental Impact +Carbon emissions were measured using \[CodeCarbon\]\(https://github.com/mlco2/codecarbon\)\. +- \*\*Carbon Emitted\*\*: [\d\.]+ kg of CO2 +- \*\*Hours Used\*\*: [\d\.]+ hours + +### Training Hardware +- \*\*On Cloud\*\*: (Yes|No) +- \*\*GPU Model\*\*: [^\n]+ +- \*\*CPU Model\*\*: [^\n]+ +- \*\*RAM Size\*\*: [\d\.]+ GB + +### Framework Versions +- Python: [^\n]+ +- SpanMarker: [^\n]+ +- Transformers: [^\n]+ +- PyTorch: [^\n]+ +- Datasets: [^\n]+ +- Tokenizers: [^\n]+ + +## Citation + +### BibTeX +``` +@software{Aarsen_SpanMarker, + author = {Aarsen, Tom}, + license = {Apache-2.0}, + title = {{SpanMarker for Named Entity Recognition}}, + url = {https://github.com/tomaarsen/SpanMarkerNER} +} +``` + + + + + +""", + flags=re.DOTALL, +) diff --git a/tests/test_model_card.py b/tests/test_model_card.py index f9fb39a0..2ee1f3a6 100644 --- a/tests/test_model_card.py +++ b/tests/test_model_card.py @@ -1,32 +1,122 @@ +import logging from pathlib import Path -from span_marker.model_card import generate_model_card -from span_marker.modeling import SpanMarkerModel +import pytest +from datasets import DatasetDict, load_dataset +from span_marker import ( + SpanMarkerModel, + SpanMarkerModelCardData, + Trainer, + TrainingArguments, +) +from span_marker.model_card import generate_model_card, is_on_huggingface -def test_model_card(finetuned_fewnerd_span_marker_model: SpanMarkerModel, tmp_path: Path) -> None: - config = finetuned_fewnerd_span_marker_model.config - model_card = generate_model_card(tmp_path, config) - assert ( - "uses [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny) as the underlying encoder" in model_card +from .constants import CONLL_LABELS, FEWNERD_COARSE_LABELS, TINY_BERT +from .model_card_pattern import MODEL_CARD_PATTERN + + +def test_model_card(fewnwerd_coarse_dataset_dict: DatasetDict, tmp_path: Path) -> None: + base_encoder_id = TINY_BERT + model = SpanMarkerModel.from_pretrained( + base_encoder_id, + labels=FEWNERD_COARSE_LABELS, + model_card_data=SpanMarkerModelCardData( + model_id="tomaarsen/span-marker-test-model-card", + dataset_id="conll2003", + dataset_name="CoNLL 2003", + encoder_id=base_encoder_id, + language="en", + license="apache-2.0", + ), + ) + train_dataset = fewnwerd_coarse_dataset_dict["train"] + eval_dataset = fewnwerd_coarse_dataset_dict["test"].select(range(1)) + + args = TrainingArguments( + str(tmp_path), + report_to="codecarbon", + eval_steps=1, + per_device_train_batch_size=1, + evaluation_strategy="steps", + num_train_epochs=1, + ) + trainer = Trainer( + model=model, + args=args, + train_dataset=train_dataset, + 
eval_dataset=eval_dataset, + ) + trainer.train() + model_card = generate_model_card(trainer.model) + assert MODEL_CARD_PATTERN.fullmatch(model_card) + + +def test_model_card_languages() -> None: + model = SpanMarkerModel.from_pretrained( + TINY_BERT, + labels=FEWNERD_COARSE_LABELS, + model_card_data=SpanMarkerModelCardData( + language=["en", "nl", "de"], + ), + ) + model_card = model.generate_model_card() + assert "**Languages:** en, nl, de" in model_card + + +def test_model_card_warnings(caplog: pytest.LogCaptureFixture): + SpanMarkerModelCardData(dataset_id="test_value") + assert any( + [ + level == logging.WARNING + and text == "The provided 'test_value' dataset could not be found on the Hugging Face Hub." + " Setting `dataset_id` to None." + for (_, level, text) in caplog.record_tuples + ] ) - assert f'SpanMarkerModel.from_pretrained("span_marker_model_name")' in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card - - config.encoder["_name_or_path"] = "does_not_exist" - model_card = generate_model_card(tmp_path, config) - assert 'uses "does_not_exist" as the underlying encoder' in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card - - del config.encoder["_name_or_path"] - model_card = generate_model_card(tmp_path, config) - assert "as the underlying encoder" not in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card - - model_card = generate_model_card("tomaarsen/my_test_model", config) - assert f'SpanMarkerModel.from_pretrained("span_marker_model_name")' in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card + + caplog.clear() + SpanMarkerModelCardData(encoder_id="test_value") + assert any( + [ + level == logging.WARNING + and text == "The provided 'test_value' model could not be found on the Hugging Face Hub." + " Setting `encoder_id` to None." + for (_, level, text) in caplog.record_tuples + ] + ) + + caplog.clear() + SpanMarkerModelCardData(model_id="test_value") + assert any( + [ + level == logging.WARNING + and text == "The provided 'test_value' model ID should include the organization or user," + ' such as "tomaarsen/span-marker-mbert-base-multinerd". Setting `model_id` to None.' 
+ for (_, level, text) in caplog.record_tuples + ] + ) + + +def test_is_on_huggingface_edge_case() -> None: + assert not is_on_huggingface("test_value") + assert not is_on_huggingface("a/test/value") + + +@pytest.mark.parametrize("dataset_id", ("conll2003", "tomaarsen/conll2003")) +def test_infer_dataset_id(dataset_id: str) -> None: + model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS) + train_dataset = load_dataset(dataset_id, split="train") + + # This triggers inferring the dataset_id from train_dataset + Trainer(model=model, train_dataset=train_dataset) + assert model.model_card_data.dataset_id == dataset_id + + +def test_cant_infer_dataset_id(conll_dataset_dict: DatasetDict): + model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS) + train_dataset = conll_dataset_dict["train"] + + # This triggers inferring the dataset_id from train_dataset + Trainer(model=model, train_dataset=train_dataset) + assert model.model_card_data.dataset_id == None diff --git a/tests/test_trainer.py b/tests/test_trainer.py index b39db86e..253d7eea 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -5,9 +5,11 @@ import pytest from datasets import Dataset, DatasetDict -from transformers import EvalPrediction +from pytest import LogCaptureFixture +from transformers import AutoTokenizer, EvalPrediction, TrainingArguments from span_marker.modeling import SpanMarkerModel +from span_marker.tokenizer import SpanMarkerTokenizer from span_marker.trainer import Trainer from tests.constants import CONLL_LABELS, DEFAULT_ARGS, TINY_BERT @@ -41,7 +43,9 @@ def test_trainer_standard( assert model.config.trained_with_document_context metrics = trainer.evaluate() assert isinstance(metrics, dict) - assert set(metrics.keys()) == { + labels = {label for label, _id in model.config.label2id.items() if _id != model.config.outside_id} + keys = {f"eval_{label}" for label in labels} + assert set(metrics.keys()) <= { "eval_loss", "eval_overall_f1", "eval_overall_recall", @@ -51,7 +55,11 @@ def test_trainer_standard( "eval_samples_per_second", "eval_steps_per_second", "epoch", + *keys, } + for key in keys: + if key in metrics: + assert metrics[key].keys() == {"f1", "number", "precision", "recall"} # Try saving and loading the model model_path = tmp_path / model_fixture / dataset_fixture @@ -144,7 +152,9 @@ def test_trainer_incorrect_columns(finetuned_conll_span_marker_model: SpanMarker trainer.evaluate() -def test_trainer_entity_tracker_warning_entity_length(conll_dataset_dict: DatasetDict, caplog) -> None: +def test_trainer_entity_tracker_warning_entity_length( + conll_dataset_dict: DatasetDict, caplog: LogCaptureFixture +) -> None: model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS, entity_max_length=1) trainer = Trainer( model, args=DEFAULT_ARGS, train_dataset=conll_dataset_dict["train"], eval_dataset=conll_dataset_dict["train"] @@ -166,7 +176,9 @@ def test_trainer_entity_tracker_warning_entity_length(conll_dataset_dict: Datase assert any([eval_pattern.search(record.msg) for record in caplog.records]) -def test_trainer_entity_tracker_warning_model_length(conll_dataset_dict: DatasetDict, caplog) -> None: +def test_trainer_entity_tracker_warning_model_length( + conll_dataset_dict: DatasetDict, caplog: LogCaptureFixture +) -> None: model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS, model_max_length=5) trainer = Trainer( model, args=DEFAULT_ARGS, train_dataset=conll_dataset_dict["train"], eval_dataset=conll_dataset_dict["train"] @@ -188,7 +200,9 @@ 
def test_trainer_entity_tracker_warning_model_length(conll_dataset_dict: Dataset assert any([eval_pattern.match(record.msg) for record in caplog.records]) -def test_trainer_entity_tracker_warning_entity_and_model_length(conll_dataset_dict: DatasetDict, caplog) -> None: +def test_trainer_entity_tracker_warning_entity_and_model_length( + conll_dataset_dict: DatasetDict, caplog: LogCaptureFixture +) -> None: model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS, model_max_length=5, entity_max_length=1) trainer = Trainer( model, args=DEFAULT_ARGS, train_dataset=conll_dataset_dict["train"], eval_dataset=conll_dataset_dict["train"] @@ -212,3 +226,27 @@ def test_trainer_entity_tracker_warning_entity_and_model_length(conll_dataset_di r".*\nAdditionally, a total of \d+ \([\d\.]+%\) entities were missed due to the maximum input length\." ) assert any([eval_pattern.match(record.msg) for record in caplog.records]) + + +def test_trainer_no_args(finetuned_conll_span_marker_model: SpanMarkerModel) -> None: + trainer = Trainer(model=finetuned_conll_span_marker_model) + assert trainer.args.output_dir == "models/my_span_marker_model" + assert trainer.args.include_inputs_for_metrics == True + assert trainer.args.remove_unused_columns == False + + +def test_trainer_set_model_id_via_hub(finetuned_conll_span_marker_model: SpanMarkerModel, tmp_path: Path) -> None: + model = finetuned_conll_span_marker_model + model_id = "test_value" + args = TrainingArguments(output_dir=str(tmp_path), hub_model_id=model_id, report_to="none") + Trainer(model=model, args=args) + # Ensure that the model card data is set via the Trainer init + assert model.model_card_data.model_id == model_id + + +def test_trainer_create_model_card(finetuned_conll_span_marker_model: SpanMarkerModel, tmp_path: Path) -> None: + model = finetuned_conll_span_marker_model + args = TrainingArguments(output_dir=str(tmp_path), report_to="none") + trainer = Trainer(model=model, args=args) + trainer.create_model_card() + assert (tmp_path / "README.md").exists() diff --git a/training_scripts/conll03_context.py b/training_scripts/conll03_context.py index b2249371..7c4e18a4 100644 --- a/training_scripts/conll03_context.py +++ b/training_scripts/conll03_context.py @@ -1,24 +1,35 @@ from datasets import load_dataset from transformers import TrainingArguments -from span_marker import SpanMarkerModel, Trainer +from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer def main() -> None: # Load the dataset, ensure "tokens", "ner_tags", "document_id" and "sentence_id" columns, # and get a list of labels - dataset = load_dataset("tomaarsen/conll2003") + dataset_id = "conll2003" + dataset_name = "CoNLL 2003" + dataset = load_dataset(dataset_id) labels = dataset["train"].features["ner_tags"].feature.names # Initialize a SpanMarker model using a pretrained BERT-style encoder - model_name = "xlm-roberta-large" + encoder_id = "xlm-roberta-large" model = SpanMarkerModel.from_pretrained( - model_name, + encoder_id, labels=labels, # SpanMarker hyperparameters: model_max_length=512, marker_max_length=128, entity_max_length=8, + # Model card arguments + model_card_data=SpanMarkerModelCardData( + model_id="tomaarsen/span-marker-xlm-roberta-large-conll03-doc-context", + encoder_id=encoder_id, + dataset_name=dataset_name, + dataset_id=dataset_id, + license="other", + language="en", + ), ) # Prepare the 🤗 transformers training arguments diff --git a/training_scripts/conll03_no_context.py b/training_scripts/conll03_no_context.py index 
--- a/training_scripts/conll03_no_context.py
+++ b/training_scripts/conll03_no_context.py
@@ -1,23 +1,34 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("conll2003")
+    dataset_id = "conll2003"
+    dataset_name = "CoNLL 2003"
+    dataset = load_dataset(dataset_id)
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "xlm-roberta-large"
+    encoder_id = "xlm-roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=128,
         marker_max_length=64,
         entity_max_length=6,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id="tomaarsen/span-marker-xlm-roberta-large-conll03",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="other",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/conllpp_context.py b/training_scripts/conllpp_context.py
index 3be5cc63..2f67a46a 100644
--- a/training_scripts/conllpp_context.py
+++ b/training_scripts/conllpp_context.py
@@ -1,24 +1,35 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens", "ner_tags", "document_id" and "sentence_id" columns,
     # and get a list of labels
-    dataset = load_dataset("tomaarsen/conllpp")
+    dataset_id = "tomaarsen/conllpp"
+    dataset_name = "CoNLL++"
+    dataset = load_dataset(dataset_id)
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "xlm-roberta-large"
+    encoder_id = "xlm-roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=512,
         marker_max_length=128,
         entity_max_length=8,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id="tomaarsen/span-marker-xlm-roberta-large-conllpp-doc-context",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="unknown",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/fewnerd_base.py b/training_scripts/fewnerd_base.py
index 7f49768d..e7582fb0 100644
--- a/training_scripts/fewnerd_base.py
+++ b/training_scripts/fewnerd_base.py
@@ -1,25 +1,36 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
+    dataset_id = "DFKI-SLT/few-nerd"
+    dataset_name = "FewNERD"
+    dataset = load_dataset(dataset_id, "supervised")
     dataset = dataset.remove_columns("ner_tags")
     dataset = dataset.rename_column("fine_ner_tags", "ner_tags")
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "bert-base-cased"
+    encoder_id = "bert-base-cased"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=256,
         marker_max_length=128,
         entity_max_length=8,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id="tomaarsen/span-marker-bert-base-fewnerd-fine-super",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="cc-by-nc-sa-4.0",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/fewnerd_large.py b/training_scripts/fewnerd_large.py
index f7d35875..307098f4 100644
--- a/training_scripts/fewnerd_large.py
+++ b/training_scripts/fewnerd_large.py
@@ -1,25 +1,36 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
+    dataset_id = "DFKI-SLT/few-nerd"
+    dataset_name = "FewNERD"
+    dataset = load_dataset(dataset_id, "supervised")
     dataset = dataset.remove_columns("ner_tags")
     dataset = dataset.rename_column("fine_ner_tags", "ner_tags")
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "roberta-large"
+    encoder_id = "roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=256,
         marker_max_length=128,
         entity_max_length=8,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id=f"tomaarsen/span-marker-{encoder_id}-fewnerd-fine-super",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="cc-by-nc-sa-4.0",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/ontonotesv5.py b/training_scripts/ontonotesv5.py
index d9a60d68..b368a681 100644
--- a/training_scripts/ontonotesv5.py
+++ b/training_scripts/ontonotesv5.py
@@ -1,12 +1,14 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("tner/ontonotes5")
+    dataset_id = "tner/ontonotes5"
+    dataset_name = "OntoNotes v5"
+    dataset = load_dataset(dataset_id)
     dataset = dataset.rename_column("tags", "ner_tags")
     labels = [
         "O",
@@ -49,14 +51,23 @@ def main() -> None:
     ]
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "roberta-large"
+    encoder_id = "roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=256,
         marker_max_length=128,
         entity_max_length=10,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id=f"tomaarsen/span-marker-{encoder_id}-ontonotes5",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="other",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
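
Usage note (illustrative, not part of the patch): the training scripts above always pass dataset_id to SpanMarkerModelCardData explicitly, but the new test_infer_dataset_id shows that this is optional; when no dataset_id is set, the Trainer init infers it from the training set. A minimal sketch of that behaviour, assuming the public "conll2003" dataset and a small placeholder encoder (the tests use a similar tiny model):

    from datasets import load_dataset

    from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer

    dataset = load_dataset("conll2003")
    labels = dataset["train"].features["ner_tags"].feature.names

    # No dataset_id is passed to SpanMarkerModelCardData; the Trainer init below
    # fills it in from train_dataset (see test_infer_dataset_id above)
    model = SpanMarkerModel.from_pretrained(
        "prajjwal1/bert-tiny",  # placeholder encoder id, not taken from this patch
        labels=labels,
        model_card_data=SpanMarkerModelCardData(language="en", license="other"),
    )
    trainer = Trainer(model=model, train_dataset=dataset["train"], eval_dataset=dataset["validation"])
    assert model.model_card_data.dataset_id == "conll2003"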
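Likewise, test_trainer_standard and test_trainer_create_model_card pin down the two behaviours this patch adds to the Trainer: trainer.evaluate() now returns per-label dicts alongside the "overall" scores, and trainer.create_model_card() writes a README.md to the output directory. A rough sketch of how that looks downstream, reusing the trainer from the sketch above (the exact per-label keys depend on the model's labels; "eval_PER" is only an example):

    metrics = trainer.evaluate()
    print(metrics["eval_overall_f1"])  # a single float, as before
    # Per-label entries such as metrics["eval_PER"] are dicts with
    # "precision", "recall", "f1" and "number" keys
    trainer.create_model_card()  # writes README.md into trainer.args.output_dir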