From 509d5f445d48de713b4226af4b93974a22c7e687 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 29 Sep 2023 20:32:30 +0200 Subject: [PATCH] Heavily improve automatic model card generation + Patch XLM-R (#28) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Uncomment pushing to the Hub * Initial version to improve automatic model card generation * Simplify label normalization * Automatically select some eval sentences for the widget * Improve language card * Add automatic evaluation results * Use dash instead of underscore in model name * Add extra TODOs * model.predict text as the first example * Automatically set model name based on encoder & dataset * Remove accidental Dataset import * Rename examples to widget examples * Add table with label examples Also use fields instead of __dict__ * Ensure complete metadata * Add tokenizer warning if punct must be split from words * Remove dead code * Rename poor variable names * Fix incorrect warning * Add " in the model labels * Set model_id based on args if possible * Add training set metrics * Randomly select 100 samples for the widget examples Instead of taking the first 100 * Prevent duplicate widget examples * Remove completed TODO * Use title case throughout model card * Add useful comments if values not provided Also prevent crash if dataset_id is not provided * Add environmental impact with codecarbon * Ensure that the model card template is included in the install * Add training hardware section * Add Python version * Make everything title case * Add missing docstring * Add docstring for SpanMarkerModelCardData * Update CHANGELOG * Add SpanMarkerModelCardData to dunder init * Add SpanMarkerModelCardData to snippets * Resolve breaking error if hub_model_id is set * gpu_model -> hardware_used To better match what HF expects * Add "base_model" to metadata * Increment datasets min version to 2.14.0 Required for sorting on multiple columns at once * Update trainer evaluate tests * Skip old model card test for now * Fix edge case: less than 5 examples * pytest.skip -> pytest.mark.skip * Try to infer the language from the dataset * Add citations and hidden sections * Refactor inferring language * Remove unused import * Add comment explaining version * Override default Trainer create_model_card * Update model card template slightly * Add newline to model card template * Remove incorrect space * Add model card tests * Improve Trainer tests regarding model card * Remove commented out breakpoint * Add codecarbon to CI * Rename integration extra to codecarbon * Make hardware_used optional (if no GPU present) * Apply suggestions to model_card_template Co-authored-by: Daniel van Strien * Update model card test pattern alongside template changes * Don't include hardware_used when no GPU present * Set "No GPU used" for GPU Model if hardware_used is None * Don't store None in yaml * Ensure that emissions is a regular float * kgs to g * support e-05 notation * Add small test case for model cards * Update model tables in docs * Link to the spaCy integration in the tokenizer warning * Update README snippet * Update outdated docs: entity_max_length default is 8 * Remove /models from URL, caused 404s * Fix outdated type hint * 🎉 Apply XLM-R patch * Remove /models from test * Remove tokenizer warning after patch * Update training docs with model card data etc. 
* Pad token embeddings to multiple of 8 Removes a warning since transformers 4.32.0 * Always attach list directly to header * Tackle edge case where dataset card has no metadata * Allow installing nltk for detokenizing model card examples * Add model card docs * Mention codecarbon install in docstring * overwrite the default codecarbon log level to "error" * Update CHANGELOG * Fix issue with inference example containing full quotes * Update CHANGELOG * Never print a model when printing SpanMarkerModelCardData * Try to infer the dataset_id from the training set Thanks @cakiki * Update the main docs landing page --------- Co-authored-by: Daniel van Strien --- .github/workflows/tests.yaml | 2 +- CHANGELOG.md | 17 + MANIFEST.in | 1 + README.md | 30 +- docs/api/span_marker.model_card.rst | 17 + docs/api/span_marker.rst | 1 + docs/index.rst | 120 +++--- notebooks/getting_started.ipynb | 465 +++++++++++----------- notebooks/model_training.ipynb | 380 +++++++++--------- pyproject.toml | 5 +- span_marker/__init__.py | 8 + span_marker/evaluation.py | 12 +- span_marker/label_normalizer.py | 19 +- span_marker/model_card.py | 525 ++++++++++++++++++++++--- span_marker/model_card_template.md | 167 ++++++++ span_marker/modeling.py | 42 +- span_marker/tokenizer.py | 8 +- span_marker/trainer.py | 54 ++- tests/conftest.py | 5 +- tests/constants.py | 2 +- tests/model_card_pattern.py | 217 ++++++++++ tests/test_model_card.py | 144 +++++-- tests/test_trainer.py | 48 ++- training_scripts/conll03_context.py | 19 +- training_scripts/conll03_no_context.py | 19 +- training_scripts/conllpp_context.py | 19 +- training_scripts/fewnerd_base.py | 19 +- training_scripts/fewnerd_large.py | 19 +- training_scripts/ontonotesv5.py | 19 +- 29 files changed, 1777 insertions(+), 626 deletions(-) create mode 100644 MANIFEST.in create mode 100644 docs/api/span_marker.model_card.rst create mode 100644 span_marker/model_card_template.md create mode 100644 tests/model_card_pattern.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 69dbab51..5a349e13 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -38,7 +38,7 @@ jobs: - name: Install external dependencies on cache miss run: | python -m pip install --no-cache-dir --upgrade pip - python -m pip install --no-cache-dir ".[dev]" + python -m pip install --no-cache-dir ".[dev, codecarbon]" python -m spacy download en_core_web_sm if: steps.restore-cache.outputs.cache-hit != 'true' diff --git a/CHANGELOG.md b/CHANGELOG.md index 67bcef1e..7f8ad8e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,8 +19,25 @@ Types of changes ### Added +- Added `SpanMarkerModel.generate_model_card()` method to get a model card string. +- Added `SpanMarkerModelCardData` that should be passed to `SpanMarkerModel.from_pretrained` with additional information like + - `language`, `license`, `model_name`, `model_id`, `encoder_name`, `encoder_id`, `dataset_name`, `dataset_id`, `dataset_revision`. - Added `transformers` `pipeline` support, e.g. `pipeline(task="span-marker", model="tomaarsen/span-marker-mbert-base-multinerd")`. +### Changed + +- Heavily improved automatic model card generation. +- Evaluating outside of training now returns per-label outputs instead of only "overall" F1, precision and recall. +- Warn if the tokenizer in use distinguishes between punctuation directly attached to a word and punctuation separated from a word by a space. + - If so, inference with that model will require the punctuation to be split from the words. 
+- Improved label normalization speed. +- Allowed calling `SpanMarkerModel.from_pretrained` with a pre-initialized `SpanMarkerConfig`. + +### Fixed + +- Fixed tokenization mismatch between training and inference for XLM-RoBERTa models: allows for normal inference of those models. +- Resolved a niche bug when `TrainingArguments` are not provided. + ## [1.3.0] ### Added diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..ae726279 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include span_marker/model_card_template.md \ No newline at end of file diff --git a/README.md b/README.md index 90b8f3b6..218e30d3 100644 --- a/README.md +++ b/README.md @@ -44,32 +44,47 @@ Please have a look at our [Getting Started](notebooks/getting_started.ipynb) not | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/tomaarsen/SpanMarkerNER/blob/main/notebooks/getting_started.ipynb) | ```python +from pathlib import Path from datasets import load_dataset from transformers import TrainingArguments -from span_marker import SpanMarkerModel, Trainer +from span_marker import SpanMarkerModel, Trainer, SpanMarkerModelCardData def main() -> None: # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels - dataset = load_dataset("DFKI-SLT/few-nerd", "supervised") + dataset_id = "DFKI-SLT/few-nerd" + dataset_name = "FewNERD" + dataset = load_dataset(dataset_id, "supervised") dataset = dataset.remove_columns("ner_tags") dataset = dataset.rename_column("fine_ner_tags", "ner_tags") labels = dataset["train"].features["ner_tags"].feature.names + # ['O', 'art-broadcastprogram', 'art-film', 'art-music', 'art-other', ... 
# Initialize a SpanMarker model using a pretrained BERT-style encoder - model_name = "bert-base-cased" + encoder_id = "bert-base-cased" + model_id = f"tomaarsen/span-marker-{encoder_id}-fewnerd-fine-super" model = SpanMarkerModel.from_pretrained( - model_name, + encoder_id, labels=labels, # SpanMarker hyperparameters: model_max_length=256, marker_max_length=128, entity_max_length=8, + # Model card arguments + model_card_data=SpanMarkerModelCardData( + model_id=model_id, + encoder_id=encoder_id, + dataset_name=dataset_name, + dataset_id=dataset_id, + license="cc-by-sa-4.0", + language="en", + ), ) # Prepare the 🤗 transformers training arguments + output_dir = Path("models") / model_id args = TrainingArguments( - output_dir="models/span_marker_bert_base_cased_fewnerd_fine_super", + output_dir=output_dir, # Training Hyperparameters: learning_rate=5e-5, per_device_train_batch_size=32, @@ -96,12 +111,13 @@ def main() -> None: eval_dataset=dataset["validation"], ) trainer.train() - trainer.save_model("models/span_marker_bert_base_cased_fewnerd_fine_super/checkpoint-final") # Compute & save the metrics on the test set metrics = trainer.evaluate(dataset["test"], metric_key_prefix="test") trainer.save_metrics("test", metrics) + # Save the final checkpoint + trainer.save_model(output_dir / "checkpoint-final") if __name__ == "__main__": main() @@ -121,8 +137,6 @@ entities = model.predict("Amelia Earhart flew her single engine Lockheed Vega 5B {'span': 'Paris', 'label': 'location-GPE', 'score': 0.9892390966415405, 'char_start_index': 78, 'char_end_index': 83}] ``` - - ## Pretrained Models All models in this list contain `train.py` files that show the training scripts used to generate them. Additionally, all training scripts used are stored in the [training_scripts](training_scripts) directory. diff --git a/docs/api/span_marker.model_card.rst b/docs/api/span_marker.model_card.rst new file mode 100644 index 00000000..4ccd08a1 --- /dev/null +++ b/docs/api/span_marker.model_card.rst @@ -0,0 +1,17 @@ + +:autogenerated: + +.. + This file is autogenerated by `sphinx-api`. + +span_marker.model_card module +============================= + +.. currentmodule:: span_marker.model_card + +.. automodule:: span_marker.model_card + :members: + :exclude-members: hyperparameters, eval_results_dict, eval_lines_list, metric_lines, widget, predict_example, label_example_list, tokenizer_warning, train_set_metrics_list, code_carbon_callback, pipeline_tag, library_name, version, metrics, model, set_widget_examples, set_train_set_metrics, set_label_examples, register_model, is_on_huggingface, generate_model_card + :undoc-members: + :show-inheritance: + :member-order: bysource diff --git a/docs/api/span_marker.rst b/docs/api/span_marker.rst index a2650031..28131ad1 100644 --- a/docs/api/span_marker.rst +++ b/docs/api/span_marker.rst @@ -19,6 +19,7 @@ span_marker package span_marker.modeling span_marker.trainer span_marker.configuration + span_marker.model_card span_marker.pipeline_component span_marker.data_collator span_marker.tokenizer diff --git a/docs/index.rst b/docs/index.rst index fba2de2f..55d092d6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,43 +19,48 @@ or no label annotation scheme. Check out all publicly available SpanMarker models on the Hugging Face Hub `here `_. 
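To complement the new model card functionality documented above, here is a minimal usage sketch. This is an illustration rather than code from the patch itself: it assumes the zero-argument `generate_model_card()` call described in the CHANGELOG, and the model ID is only an example.

```python
from span_marker import SpanMarkerModel

# Any SpanMarker checkpoint works here; this ID is only an example
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-bert-base-fewnerd-fine-super")

# New in this patch: render the automatically generated model card as a string
card = model.generate_model_card()
print(card[:500])  # preview the start of the README.md content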
Alternatively, check out any model from this list of particularly useful models: -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| Model ID | Domain | Label Count | Language | -+=====================================================================================================================================================+==========+=============+==============+ -| `tomaarsen/span-marker-mbert-base-multinerd `_ | General | 15 | Multilingual | -| | | | | -| `lxyuan/span-marker-bert-base-multilingual-uncased-multinerd `_ | | | | -| | | | | -| `lxyuan/span-marker-bert-base-multilingual-cased-multinerd `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-fewnerd-fine-super `_ | General | 66 | English | -| | | | | -| `tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super `_ | | | Multilingual | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-cross-ner `_ | General | 39 | English | -| | | | | -| `tomaarsen/span-marker-bert-base-uncased-cross-ner `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-roberta-large-ontonotes5 `_ | General | 18 | English | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-acronyms `_ | Acronyms | 2 | English | -| | | | | -| `tomaarsen/span-marker-bert-base-uncased-acronyms `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `tomaarsen/span-marker-bert-base-ncbi-disease `_ | Diseases | 1 | English | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ -| `stefan-it/span-marker-gelectra-large-germeval14 `_ | General | 12 | German | -| | | | | -| `gwlms/span-marker-teams-germeval14 `_ | | | | -| | | | | -| `gwlms/span-marker-token-dropping-bert-germeval14 `_ | | | | -| | | | | -| `gwlms/span-marker-bert-germeval14 `_ | | | | -+-----------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+--------------+ - - ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| Model ID | Domain | Label Count | Language | ++=====================================================================================================================================================+============+=============+==============+ +| `tomaarsen/span-marker-mbert-base-multinerd `_ | General | 15 | 
Multilingual | +| | | | | +| `lxyuan/span-marker-bert-base-multilingual-uncased-multinerd `_ | | | | +| | | | | +| `lxyuan/span-marker-bert-base-multilingual-cased-multinerd `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-fewnerd-fine-super `_ | General | 66 | English | +| | | | | +| `tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super `_ | | | Multilingual | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-cross-ner `_ | General | 39 | English | +| | | | | +| `tomaarsen/span-marker-bert-base-uncased-cross-ner `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-roberta-large-ontonotes5 `_ | General | 18 | English | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-uncased-keyphrase-inspec `_ | Keyphrases | 1 | English | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-acronyms `_ | Acronyms | 2 | English | +| | | | | +| `tomaarsen/span-marker-bert-base-uncased-acronyms `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `tomaarsen/span-marker-bert-base-ncbi-disease `_ | Biomedical | 1 | English | +| | | | | +| `tomaarsen/span-marker-bert-base-uncased-bionlp `_ | | 5 | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ +| `stefan-it/span-marker-gelectra-large-germeval14 `_ | General | 12 | German | +| | | | | +| `gwlms/span-marker-teams-germeval14 `_ | | | | +| | | | | +| `gwlms/span-marker-token-dropping-bert-germeval14 `_ | | | | +| | | | | +| `gwlms/span-marker-bert-germeval14 `_ | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+--------------+ + + +******* Context -======= +******* .. raw:: html @@ -68,12 +73,13 @@ Context I have developed this library as a part of my thesis work at `Argilla `_. Feel free to ⭐ star or watch the `SpanMarker repository `_ to get notified when my thesis is published. -*************** +############### Quick Reference -*************** +############### +************ How to Train -============ +************ :: @@ -86,9 +92,9 @@ How to Train dataset = load_dataset("DFKI-SLT/few-nerd", "supervised") labels = ["O", "art", "building", "event", "location", "organization", "other", "person", "product"] - # Initialize a SpanMarkerModel using an encoder, e.g. 
BERT: - model_name = "bert-base-cased" - model = SpanMarkerModel.from_pretrained(model_name, labels=labels) + # Initialize a SpanMarkerModel using an encoder, e.g. BERT, and the labels: + encoder_id = "bert-base-cased" + model = SpanMarkerModel.from_pretrained(encoder_id, labels=labels) # See the 🤗 TrainingArguments documentation for details here args = TrainingArguments( @@ -114,14 +120,21 @@ How to Train # Training is really simple using our Trainer! trainer.train() - trainer.save_model("my_span_marker_model/checkpoint-final") # ... and so is evaluating! metrics = trainer.evaluate() print(metrics) + # Save the model locally or on the Hugging Face Hub + trainer.save_model("my_span_marker_model/checkpoint-final") + trainer.push_to_hub("my_span_marker_model/checkpoint-final") + +See :doc:`Initializing & Training ` for more details, or check out the documentation for +:class:`~span_marker.modeling.SpanMarkerModel`, :class:`~span_marker.trainer.Trainer`, :func:`~datasets.load_dataset`, or :class:`~transformers.TrainingArguments`. + +************** How to predict -============== +************** :: @@ -130,7 +143,7 @@ How to predict # Load a finetuned SpanMarkerModel from the 🤗 Hub model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-bert-base-fewnerd-fine-super") - # It is recommended to explicitly move the model to CUDA for faster inference + # It is recommended to explicitly move the model to CUDA for faster inference, if possible model.cuda() model.predict("A prototype was fitted in the mid-'60s in a one-off DB5 extended 4'' after the doors and driven by Marek personally, and a normally 6-cylinder Aston Martin DB7 was equipped with a V8 unit in 1998.") @@ -142,26 +155,33 @@ How to predict You can also load a locally saved model through ``SpanMarkerModel.from_pretrained("path/to/model")``, much like in 🤗 Transformers. +See :doc:`Loading & Inferencing ` for more details, or check out the documentation for +:class:`~span_marker.modeling.SpanMarkerModel` or :meth:`~span_marker.modeling.SpanMarkerModel.predict`. + +******************* How to save a model -=================== +******************* Locally -------- +======= :: model.save_pretrained("my_model_dir") +See the documentation for :meth:`~span_marker.modeling.SpanMarkerModel.save_pretrained` for more details. + To the 🤗 Hub -------------- +============= :: - model_name = "span-marker-bert-base-fewnerd-fine-super" - model.push_to_hub(model_name) + model_id = "span-marker-bert-base-fewnerd-fine-super" + model.push_to_hub(model_id) +See the documentation for :meth:`~span_marker.modeling.SpanMarkerModel.push_to_hub` for more details. .. toctree:: :maxdepth: 2 diff --git a/notebooks/getting_started.ipynb b/notebooks/getting_started.ipynb index 227add6f..e5c569c8 100644 --- a/notebooks/getting_started.ipynb +++ b/notebooks/getting_started.ipynb @@ -45,37 +45,8 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "100%|██████████| 3359329/3359329 [00:09<00:00, 342056.99it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100%|██████████| 482037/482037 [00:01<00:00, 346172.32it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100%|██████████| 958765/958765 [00:02<00:00, 346564.24it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset few-nerd downloaded and prepared to .... 
Subsequent calls will reuse this data.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "data": { + "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],\n", @@ -90,13 +61,18 @@ " num_rows: 37648\n", " })\n", "})" - ] + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", - "dataset = load_dataset(\"DFKI-SLT/few-nerd\", \"supervised\")\n", + "dataset_id = \"DFKI-SLT/few-nerd\"\n", + "dataset = load_dataset(dataset_id, \"supervised\")\n", "dataset" ] }, @@ -190,31 +166,52 @@ "\n", "Importantly, the model can *either* be an encoder or an already trained and saved SpanMarker model. As we haven't trained anything yet, we will use an encoder. To learn how to load and use a saved SpanMarker model, please have a look at the [Loading & Inferencing](model_loading.ipynb) notebook.\n", "\n", - "Reasonable options for encoders include BERT, RoBERTa, etc., which means that the following are all good options: `\"bert-base-cased\"`, `\"bert-large-cased\"`, `\"roberta-base\"`, `\"roberta-large\"`. Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. Furthermore, using uncased models is generally not recommended, as the capitalisation can be very useful to find named entities.\n", + "Reasonable options for encoders include BERT, RoBERTa, mBERT, XLM-RoBERTa, etc., which means that the following are all good options:\n", + "\n", + "* [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny)\n", + "* [prajjwal1/bert-mini](https://huggingface.co/prajjwal1/bert-mini)\n", + "* [prajjwal1/bert-small](https://huggingface.co/prajjwal1/bert-small)\n", + "* [prajjwal1/bert-medium](https://huggingface.co/prajjwal1/bert-medium)\n", + "* [bert-base-cased](https://huggingface.co/bert-base-cased)\n", + "* [bert-large-cased](https://huggingface.co/bert-large-cased)\n", + "* [bert-base-multilingual-cased](https://huggingface.co/bert-base-multilingual-cased)\n", + "* [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased)\n", + "* [roberta-base](https://huggingface.co/roberta-base)\n", + "* [roberta-large](https://huggingface.co/roberta-large)\n", + "* [xlm-roberta-base](https://huggingface.co/xlm-roberta-base)\n", + "* [xlm-roberta-large](https://huggingface.co/xlm-roberta-large)\n", + "\n", + "Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. \n", + "\n", + "Additionally, it's important to consider that cased models typically demand consistent capitalization in the inference data, aligning with how the training data is formatted. In simpler terms, if your training data consistently uses correct capitalization, but your inference data does not, it may lead to suboptimal performance. In such cases, you might find an uncased model more suitable. Although it may exhibit slightly lower F1 scores on the testing set, it remains functional regardless of capitalization, making it potentially more effective in real-world scenarios.\n", "\n", "We'll use `\"bert-base-cased\"` for this notebook. If you're running this on Google Colab, be sure to set hardware accelerator to \"GPU\" in `Runtime` > `Change runtime type`." 
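As an aside, the `position_ids` requirement above can be checked before committing to an encoder. A heuristic sketch (not part of SpanMarker; it merely inspects the encoder's `forward` signature):

```python
import inspect

from transformers import AutoModel

# Load a candidate encoder and check whether its forward() accepts position_ids
encoder = AutoModel.from_pretrained("bert-base-cased")
print("position_ids" in inspect.signature(encoder.forward).parameters)
# True for BERT/RoBERTa-style models; False for e.g. DistilBERT
```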
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n", - "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], + "outputs": [], "source": [ - "from span_marker import SpanMarkerModel\n", + "from span_marker import SpanMarkerModel, SpanMarkerModelCardData\n", "\n", - "model_name = \"bert-base-cased\"\n", - "model = SpanMarkerModel.from_pretrained(model_name, labels=labels, model_max_length=256)" + "encoder_id = \"bert-base-cased\"\n", + "model = SpanMarkerModel.from_pretrained(\n", + " # Required arguments\n", + " encoder_id,\n", + " labels=labels,\n", + " # Optional arguments\n", + " model_max_length=256,\n", + " entity_max_length=8,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"cc-by-sa-4.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", + ")" ] }, { @@ -227,7 +224,9 @@ "Note that we provided `SpanMarkerModel.from_pretrained` with a list of our labels. This is required when training a new model using an encoder. Furthermore, we can specify some useful configuration parameters from `SpanMarkerConfig`, such as:\n", "\n", "* `model_max_length`: The maximum number of tokens that the model will process. If you only use short sentences for your model, reducing this number may help training and inference speeds with no loss in performance. Defaults to the encoder maximum, or 512 if the encoder doesn't have a maximum.\n", - "* `entity_max_length`: The total number of words that one entity can be. Defaults to 16." + "* `entity_max_length`: The total number of words that one entity can be. Defaults to 8.\n", + "* `model_card_data`: A [SpanMarkerModelCardData](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.model_card.html#span_marker.model_card.SpanMarkerModelCardData) instance where you can provide a lot of useful data about your model. This data will be automatically included in a generated model card whenever a model is saved or pushed to the Hugging Face Hub.\n", + " * Consider adding `language`, `license`, `model_id`, `encoder_id` and `dataset_id` to improve the generated model card README.md file." ] }, { @@ -261,6 +260,7 @@ " eval_steps=200,\n", " push_to_hub=False,\n", " logging_steps=50,\n", + " fp16=True,\n", " warmup_ratio=0.1,\n", " dataloader_num_workers=2,\n", ")" @@ -271,12 +271,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can create a SpanMarker `Trainer` in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. 
Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." + "Now we can create a SpanMarker [Trainer](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.trainer.html#span_marker.trainer.Trainer) in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -307,163 +307,48 @@ "name": "stdout", "output_type": "stream", "text": [ - "This SpanMarker model will ignore 0.339050% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words.\n", - "These are the frequencies of the missed entities due to maximum entity length out of 20351 total entities:\n", - "- 24 missed entities with 9 words (0.117930%)\n", - "- 15 missed entities with 10 words (0.073706%)\n", - "- 14 missed entities with 11 words (0.068793%)\n", - "- 7 missed entities with 12 words (0.034396%)\n", - "- 5 missed entities with 13 words (0.024569%)\n", - "- 2 missed entities with 15 words (0.009828%)\n", - "- 1 missed entities with 17 words (0.004914%)\n", - "- 1 missed entities with 19 words (0.004914%)\n", - "Tracking run with wandb version 0.14.0\n", - "Run data is saved locally in ...\n", - "Syncing run colorful-leaf-761 to Weights & Biases\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.9012, 'learning_rate': 2.032520325203252e-05, 'epoch': 0.04}\n", - "{'loss': 0.0813, 'learning_rate': 4.065040650406504e-05, 'epoch': 0.08}\n", - "{'loss': 0.0514, 'learning_rate': 4.8777173913043476e-05, 'epoch': 0.12}\n", - "{'loss': 0.0385, 'learning_rate': 4.651268115942029e-05, 'epoch': 0.16}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This SpanMarker model won't be able to predict 0.307515% of all annotated entities in the evaluation dataset. 
This is caused by the SpanMarkerModel maximum entity length of 8 words.\n", - "These are the frequencies of the missed entities due to maximum entity length out of 5203 total entities:\n", - "- 5 missed entities with 9 words (0.096098%)\n", - "- 5 missed entities with 10 words (0.096098%)\n", - "- 2 missed entities with 11 words (0.038439%)\n", - "- 1 missed entities with 12 words (0.019220%)\n", - "- 3 missed entities with 13 words (0.057659%)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.03596973791718483, 'eval_overall_precision': 0.6802749427202666, 'eval_overall_recall': 0.6297724643270344, 'eval_overall_f1': 0.6540502653449485, 'eval_overall_accuracy': 0.9053643208390295, 'eval_runtime': 28.0718, 'eval_samples_per_second': 87.241, 'eval_steps_per_second': 21.837, 'epoch': 0.16}\n", - "{'loss': 0.0334, 'learning_rate': 4.42481884057971e-05, 'epoch': 0.2}\n", - "{'loss': 0.0306, 'learning_rate': 4.1983695652173914e-05, 'epoch': 0.24}\n", - "{'loss': 0.0278, 'learning_rate': 3.971920289855073e-05, 'epoch': 0.29}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" + "{'loss': 0.6974, 'learning_rate': 1.991869918699187e-05, 'epoch': 0.04}\n", + "{'loss': 0.0896, 'learning_rate': 4.0243902439024395e-05, 'epoch': 0.08}\n", + "{'loss': 0.0584, 'learning_rate': 4.8822463768115946e-05, 'epoch': 0.12}\n", + "{'loss': 0.0382, 'learning_rate': 4.655797101449276e-05, 'epoch': 0.16}\n", + "{'eval_loss': 0.03181104362010956, 'eval_overall_precision': 0.6967930029154519, 'eval_overall_recall': 0.5989974937343359, 'eval_overall_f1': 0.6442048517520216, 'eval_overall_accuracy': 0.8993717106605198, 'eval_runtime': 29.16, 'eval_samples_per_second': 83.985, 'eval_steps_per_second': 21.022, 'epoch': 0.16}\n", + "{'loss': 0.0333, 'learning_rate': 4.429347826086957e-05, 'epoch': 0.2}\n", + "{'loss': 0.0303, 'learning_rate': 4.202898550724638e-05, 'epoch': 0.24}\n", + "{'loss': 0.032, 'learning_rate': 3.976449275362319e-05, 'epoch': 0.29}\n", + "{'loss': 0.0304, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.33}\n", + "{'eval_loss': 0.02394717186689377, 'eval_overall_precision': 0.7350157728706624, 'eval_overall_recall': 0.7187198766146135, 'eval_overall_f1': 0.7267764889365436, 'eval_overall_accuracy': 0.9227489698502713, 'eval_runtime': 29.481, 'eval_samples_per_second': 83.07, 'eval_steps_per_second': 20.793, 'epoch': 0.33}\n", + "{'loss': 0.0265, 'learning_rate': 3.5235507246376816e-05, 'epoch': 0.37}\n", + "{'loss': 0.0254, 'learning_rate': 3.297101449275363e-05, 'epoch': 0.41}\n", + "{'loss': 0.0249, 'learning_rate': 3.0706521739130435e-05, 'epoch': 0.45}\n", + "{'loss': 0.0242, 'learning_rate': 2.8442028985507245e-05, 'epoch': 0.49}\n", + "{'eval_loss': 0.02163967303931713, 'eval_overall_precision': 0.762808736476832, 'eval_overall_recall': 0.7204549836128783, 'eval_overall_f1': 0.7410271663692247, 'eval_overall_accuracy': 0.9293582473175309, 'eval_runtime': 29.0261, 'eval_samples_per_second': 84.372, 'eval_steps_per_second': 21.119, 'epoch': 0.49}\n", + "{'loss': 0.0224, 'learning_rate': 2.6177536231884058e-05, 'epoch': 0.53}\n", + "{'loss': 0.0242, 'learning_rate': 2.391304347826087e-05, 'epoch': 0.57}\n", + "{'loss': 0.0226, 'learning_rate': 2.164855072463768e-05, 'epoch': 0.61}\n", + "{'loss': 0.0245, 'learning_rate': 1.9384057971014493e-05, 'epoch': 0.65}\n", + "{'eval_loss': 0.020556513220071793, 'eval_overall_precision': 
0.7680876026593665, 'eval_overall_recall': 0.7572778099093889, 'eval_overall_f1': 0.7626444034559751, 'eval_overall_accuracy': 0.9338052303047611, 'eval_runtime': 29.7545, 'eval_samples_per_second': 82.307, 'eval_steps_per_second': 20.602, 'epoch': 0.65}\n", + "{'loss': 0.0231, 'learning_rate': 1.7119565217391306e-05, 'epoch': 0.69}\n", + "{'loss': 0.0209, 'learning_rate': 1.4855072463768116e-05, 'epoch': 0.73}\n", + "{'loss': 0.0202, 'learning_rate': 1.2590579710144929e-05, 'epoch': 0.77}\n", + "{'loss': 0.0212, 'learning_rate': 1.032608695652174e-05, 'epoch': 0.81}\n", + "{'eval_loss': 0.01960749179124832, 'eval_overall_precision': 0.7743021183923976, 'eval_overall_recall': 0.7540003855793329, 'eval_overall_f1': 0.7640164094549716, 'eval_overall_accuracy': 0.9358247317530904, 'eval_runtime': 29.6794, 'eval_samples_per_second': 82.515, 'eval_steps_per_second': 20.654, 'epoch': 0.81}\n", + "{'loss': 0.0202, 'learning_rate': 8.061594202898551e-06, 'epoch': 0.86}\n", + "{'loss': 0.0196, 'learning_rate': 5.797101449275362e-06, 'epoch': 0.9}\n", + "{'loss': 0.0232, 'learning_rate': 3.5326086956521736e-06, 'epoch': 0.94}\n", + "{'loss': 0.0183, 'learning_rate': 1.2681159420289857e-06, 'epoch': 0.98}\n", + "{'eval_loss': 0.019303549081087112, 'eval_overall_precision': 0.7719162141194724, 'eval_overall_recall': 0.7673028725660305, 'eval_overall_f1': 0.769602629797931, 'eval_overall_accuracy': 0.9378442332014197, 'eval_runtime': 29.1715, 'eval_samples_per_second': 83.952, 'eval_steps_per_second': 21.014, 'epoch': 0.98}\n", + "{'train_runtime': 450.609, 'train_samples_per_second': 21.788, 'train_steps_per_second': 2.723, 'train_loss': 0.056268237500824186, 'epoch': 1.0}\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0245, 'learning_rate': 3.745471014492754e-05, 'epoch': 0.33}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.023754317313432693, 'eval_overall_precision': 0.7612159329140461, 'eval_overall_recall': 0.700154261473197, 'eval_overall_f1': 0.7294094013660104, 'eval_overall_accuracy': 0.9214634046807729, 'eval_runtime': 28.2374, 'eval_samples_per_second': 86.729, 'eval_steps_per_second': 21.709, 'epoch': 0.33}\n", - "{'loss': 0.0257, 'learning_rate': 3.5190217391304346e-05, 'epoch': 0.37}\n", - "{'loss': 0.0237, 'learning_rate': 3.292572463768116e-05, 'epoch': 0.41}\n", - "{'loss': 0.0234, 'learning_rate': 3.066123188405797e-05, 'epoch': 0.45}\n", - "{'loss': 0.0241, 'learning_rate': 2.8396739130434785e-05, 'epoch': 0.49}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.02093053236603737, 'eval_overall_precision': 0.7934713036057179, 'eval_overall_recall': 0.7171230235248747, 'eval_overall_f1': 0.7533677706877343, 'eval_overall_accuracy': 0.9292782958232162, 'eval_runtime': 28.1912, 'eval_samples_per_second': 86.871, 'eval_steps_per_second': 21.744, 'epoch': 0.49}\n", - "{'loss': 0.021, 'learning_rate': 2.6132246376811598e-05, 'epoch': 0.53}\n", - "{'loss': 0.02, 'learning_rate': 2.3867753623188408e-05, 'epoch': 0.57}\n", - "{'loss': 0.022, 'learning_rate': 2.1603260869565217e-05, 'epoch': 0.61}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0237, 'learning_rate': 1.933876811594203e-05, 'epoch': 0.65}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "{'eval_loss': 0.020754070952534676, 'eval_overall_precision': 0.7628806742003448, 'eval_overall_recall': 0.7680293096799075, 'eval_overall_f1': 0.7654463341981359, 'eval_overall_accuracy': 0.9358077087881818, 'eval_runtime': 28.0953, 'eval_samples_per_second': 87.168, 'eval_steps_per_second': 21.819, 'epoch': 0.65}\n", - "{'loss': 0.0226, 'learning_rate': 1.7074275362318843e-05, 'epoch': 0.69}\n", - "{'loss': 0.0218, 'learning_rate': 1.4809782608695653e-05, 'epoch': 0.73}\n", - "{'loss': 0.0242, 'learning_rate': 1.2545289855072464e-05, 'epoch': 0.77}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0197, 'learning_rate': 1.0280797101449275e-05, 'epoch': 0.81}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.019617434591054916, 'eval_overall_precision': 0.7771473292897672, 'eval_overall_recall': 0.7659082144234477, 'eval_overall_f1': 0.7714868408274256, 'eval_overall_accuracy': 0.937746128262156, 'eval_runtime': 28.2921, 'eval_samples_per_second': 86.561, 'eval_steps_per_second': 21.667, 'epoch': 0.81}\n", - "{'loss': 0.0191, 'learning_rate': 8.016304347826086e-06, 'epoch': 0.86}\n", - "{'loss': 0.0187, 'learning_rate': 5.751811594202898e-06, 'epoch': 0.9}\n", - "{'loss': 0.0202, 'learning_rate': 3.4873188405797104e-06, 'epoch': 0.94}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.0221, 'learning_rate': 1.2228260869565218e-06, 'epoch': 0.98}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.019159900024533272, 'eval_overall_precision': 0.7773279352226721, 'eval_overall_recall': 0.7774778249132279, 'eval_overall_f1': 0.7774028728429576, 'eval_overall_accuracy': 0.9399702095533473, 'eval_runtime': 28.0225, 'eval_samples_per_second': 87.394, 'eval_steps_per_second': 21.875, 'epoch': 0.98}\n", - "{'train_runtime': 453.1296, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.708, 'train_loss': 0.06319850289734186, 'epoch': 1.0}\n", - "TrainOutput(global_step=1227, training_loss=0.06319850289734186, metrics={'train_runtime': 453.1296, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.708, 'train_loss': 0.06319850289734186, 'epoch': 1.0})" - ] + "data": { + "text/plain": [ + "TrainOutput(global_step=1227, training_loss=0.056268237500824186, metrics={'train_runtime': 450.609, 'train_samples_per_second': 21.788, 'train_steps_per_second': 2.723, 'train_loss': 0.056268237500824186, 'epoch': 1.0})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -484,21 +369,54 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at ...\n", - "Loading cached processed dataset at ...\n", - "{'eval_loss': 0.019206691533327103,\n", - " 'eval_overall_precision': 0.7758985200845666,\n", - " 'eval_overall_recall': 0.7784419591207096,\n", - " 'eval_overall_f1': 0.7771681586293194,\n", - " 'eval_overall_accuracy': 0.9398477830602543,\n", - " 'eval_runtime': 28.0849,\n", - " 'eval_samples_per_second': 87.2,\n", - " 'eval_steps_per_second': 21.827,\n", + "data": { + 
"text/plain": [ + "{'eval_loss': 0.019375888630747795,\n", + " 'eval_art': {'precision': 0.7661290322580645,\n", + " 'recall': 0.7723577235772358,\n", + " 'f1': 0.7692307692307692,\n", + " 'number': 246},\n", + " 'eval_building': {'precision': 0.5842293906810035,\n", + " 'recall': 0.6127819548872181,\n", + " 'f1': 0.5981651376146789,\n", + " 'number': 266},\n", + " 'eval_event': {'precision': 0.5497382198952879,\n", + " 'recall': 0.5965909090909091,\n", + " 'f1': 0.5722070844686648,\n", + " 'number': 176},\n", + " 'eval_location': {'precision': 0.8036732108929703,\n", + " 'recall': 0.8409542743538767,\n", + " 'f1': 0.8218911917098446,\n", + " 'number': 1509},\n", + " 'eval_organization': {'precision': 0.7474226804123711,\n", + " 'recall': 0.6998069498069498,\n", + " 'f1': 0.7228315054835494,\n", + " 'number': 1036},\n", + " 'eval_other': {'precision': 0.6775818639798489,\n", + " 'recall': 0.5604166666666667,\n", + " 'f1': 0.61345496009122,\n", + " 'number': 480},\n", + " 'eval_person': {'precision': 0.8636363636363636,\n", + " 'recall': 0.9063313096270599,\n", + " 'f1': 0.8844688954718578,\n", + " 'number': 1153},\n", + " 'eval_product': {'precision': 0.7366666666666667,\n", + " 'recall': 0.6884735202492211,\n", + " 'f1': 0.7117552334943639,\n", + " 'number': 321},\n", + " 'eval_overall_precision': 0.7705836876691148,\n", + " 'eval_overall_recall': 0.7686524002313476,\n", + " 'eval_overall_f1': 0.7696168323520897,\n", + " 'eval_overall_accuracy': 0.9381502182693484,\n", + " 'eval_runtime': 28.5583,\n", + " 'eval_samples_per_second': 85.754,\n", + " 'eval_steps_per_second': 21.465,\n", " 'epoch': 1.0}" - ] + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -520,7 +438,7 @@ "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ "This SpanMarker model won't be able to predict 0.285605% of all annotated entities in the evaluation dataset. 
This is caused by the SpanMarkerModel maximum entity length of 8 words.\n", @@ -532,17 +450,58 @@ "- 1 missed entities with 13 words (0.019040%)\n", "- 1 missed entities with 17 words (0.019040%)\n", "- 1 missed entities with 19 words (0.019040%)\n", - "- 1 missed entities with 40 words (0.019040%)\n", - "{'test_loss': 0.019189156591892242,\n", - " 'test_overall_precision': 0.769879287219774,\n", - " 'test_overall_recall': 0.7679663608562691,\n", - " 'test_overall_f1': 0.7689216342933691,\n", - " 'test_overall_accuracy': 0.938544749464231,\n", - " 'test_runtime': 28.0932,\n", - " 'test_samples_per_second': 86.854,\n", - " 'test_steps_per_second': 21.713,\n", - " 'epoch': 1.0}" + "- 1 missed entities with 40 words (0.019040%)\n" ] + }, + { + "data": { + "text/plain": [ + "{'test_loss': 0.01918497122824192,\n", + " 'test_art': {'precision': 0.7419354838709677,\n", + " 'recall': 0.7488372093023256,\n", + " 'f1': 0.7453703703703703,\n", + " 'number': 215},\n", + " 'test_building': {'precision': 0.6236559139784946,\n", + " 'recall': 0.710204081632653,\n", + " 'f1': 0.6641221374045801,\n", + " 'number': 245},\n", + " 'test_event': {'precision': 0.6153846153846154,\n", + " 'recall': 0.5529953917050692,\n", + " 'f1': 0.5825242718446603,\n", + " 'number': 217},\n", + " 'test_location': {'precision': 0.812192118226601,\n", + " 'recall': 0.8515171078114913,\n", + " 'f1': 0.8313898518751971,\n", + " 'number': 1549},\n", + " 'test_organization': {'precision': 0.7320754716981132,\n", + " 'recall': 0.6897777777777778,\n", + " 'f1': 0.7102974828375286,\n", + " 'number': 1125},\n", + " 'test_other': {'precision': 0.7375886524822695,\n", + " 'recall': 0.6328600405679513,\n", + " 'f1': 0.6812227074235807,\n", + " 'number': 493},\n", + " 'test_person': {'precision': 0.8805309734513275,\n", + " 'recall': 0.9061930783242259,\n", + " 'f1': 0.8931777378815081,\n", + " 'number': 1098},\n", + " 'test_product': {'precision': 0.6641221374045801,\n", + " 'recall': 0.5898305084745763,\n", + " 'f1': 0.6247755834829445,\n", + " 'number': 295},\n", + " 'test_overall_precision': 0.7766859344894027,\n", + " 'test_overall_recall': 0.7697154859652473,\n", + " 'test_overall_f1': 0.7731850004795243,\n", + " 'test_overall_accuracy': 0.938954021816699,\n", + " 'test_runtime': 29.8808,\n", + " 'test_samples_per_second': 81.658,\n", + " 'test_steps_per_second': 20.414,\n", + " 'epoch': 1.0}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -581,7 +540,9 @@ "text": [ "Battle of Camulodunum => event\n", "Quintus Petillius Cerialis => person\n", - "Boudica => person\n", + "Boudica => location\n", + "Camulodunum => location\n", + "Colchester => location\n", "\n", "Wellingborough => location\n", "Northamptonshire => location\n", @@ -599,9 +560,11 @@ "Bachelor of Music in Composition => other\n", "California State University => organization\n", "Northridge => location\n", + "Master of Music in Harpsichord Performance => other\n", "Cal State Northridge => organization\n", - "Ann Arbor => organization\n", - "\n" + "Doctor of Musical Arts => other\n", + "University of Michigan => organization\n", + "Ann Arbor => location" ] } ], @@ -634,7 +597,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Once trained, we can save our new model locally." + "Once trained, we can save our new model locally. The saved model also comes with a flashy `README.md` such as [this one](https://huggingface.co/tomaarsen/span-marker-bert-base-uncased-bionlp)." 
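As a quick illustration, a sketch of inspecting that card (assuming, per this patch, that saving writes the generated card as `README.md` next to the weights; the path is only an example):

```python
# Saving the model also writes the automatically generated model card
model.save_pretrained("models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final")

# Preview the generated README.md
with open("models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final/README.md", encoding="utf-8") as f:
    print(f.read()[:300])
```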
] }, { @@ -651,16 +614,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Or we can push it to the 🤗 Hub like so. I've commented it away for now to prevent people from accidentally pushing models." + "Or we can push it to the 🤗 Hub like so." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# trainer.push_to_hub()" + "trainer.push_to_hub(repo_id=\"span-marker-bert-base-fewnerd-coarse-super\")" ] }, { @@ -692,11 +655,26 @@ "from transformers import TrainingArguments\n", "\n", "def main():\n", - " dataset = load_dataset(\"DFKI-SLT/few-nerd\", \"supervised\")\n", + " dataset_id = \"DFKI-SLT/few-nerd\"\n", + " dataset = load_dataset(dataset_id, \"supervised\")\n", " labels = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", "\n", - " model_name = \"bert-base-cased\"\n", - " model = SpanMarkerModel.from_pretrained(model_name, labels=labels)\n", + " encoder_id = \"bert-base-cased\"\n", + " model = SpanMarkerModel.from_pretrained(\n", + " # Required arguments\n", + " encoder_id,\n", + " labels=labels,\n", + " # Optional arguments\n", + " model_max_length=256,\n", + " entity_max_length=8,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"cc-by-sa-4.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", + " )\n", "\n", " args = TrainingArguments(\n", " output_dir=\"models/span-marker-bert-base-fewnerd-coarse-super\",\n", @@ -710,6 +688,7 @@ " eval_steps=200,\n", " push_to_hub=False,\n", " logging_steps=50,\n", + " fp16=True,\n", " warmup_ratio=0.1,\n", " dataloader_num_workers=2,\n", " )\n", @@ -720,13 +699,13 @@ " train_dataset=dataset[\"train\"].select(range(8000)),\n", " eval_dataset=dataset[\"validation\"].select(range(2000)),\n", " )\n", - "\n", " trainer.train()\n", - " trainer.save_model(\"models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final\")\n", "\n", " metrics = trainer.evaluate()\n", " print(metrics)\n", "\n", + " trainer.save_model(\"models/span-marker-bert-base-fewnerd-coarse-super/checkpoint-final\")\n", + "\n", "if __name__ == \"__main__\":\n", " main()\n", "```" diff --git a/notebooks/model_training.ipynb b/notebooks/model_training.ipynb index 11ae82c9..f13e11b9 100644 --- a/notebooks/model_training.ipynb +++ b/notebooks/model_training.ipynb @@ -70,7 +70,8 @@ "source": [ "from datasets import load_dataset\n", "\n", - "dataset = load_dataset(\"conll2003\")\n", + "dataset_id = \"conll2003\"\n", + "dataset = load_dataset(dataset_id)\n", "dataset" ] }, @@ -121,38 +122,53 @@ "* [prajjwal1/bert-medium](https://huggingface.co/prajjwal1/bert-medium)\n", "* [bert-base-cased](https://huggingface.co/bert-base-cased)\n", "* [bert-large-cased](https://huggingface.co/bert-large-cased)\n", + "* [bert-base-multilingual-cased](https://huggingface.co/bert-base-multilingual-cased)\n", + "* [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased)\n", "* [roberta-base](https://huggingface.co/roberta-base)\n", "* [roberta-large](https://huggingface.co/roberta-large)\n", + "* [xlm-roberta-base](https://huggingface.co/xlm-roberta-base)\n", + "* [xlm-roberta-large](https://huggingface.co/xlm-roberta-large)\n", "\n", - "Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. 
Furthermore, using uncased models is generally not recommended, as the capitalisation can be very useful to find named entities.\n", + "\n", + "Not all encoders work though, they **must** allow for `position_ids` as an input argument, which disqualifies DistilBERT, T5, DistilRoBERTa, ALBERT & BART. \n", + "\n", + "Additionally, it's important to consider that cased models typically demand consistent capitalization in the inference data, aligning with how the training data is formatted. In simpler terms, if your training data consistently uses correct capitalization, but your inference data does not, it may lead to suboptimal performance. In such cases, you might find an uncased model more suitable. Although it may exhibit slightly lower F1 scores on the testing set, it remains functional regardless of capitalization, making it potentially more effective in real-world scenarios.\n", "\n", "We'll use `\"roberta-base\"` for this notebook. If you're running this on Google Colab, be sure to set hardware accelerator to \"GPU\" in `Runtime` > `Change runtime type`." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']\n", - "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ - "from span_marker import SpanMarkerModel\n", + "from span_marker import SpanMarkerModel, SpanMarkerModelCardData\n", "\n", - "model_name = \"roberta-base\"\n", + "encoder_id = \"roberta-base\"\n", "model = SpanMarkerModel.from_pretrained(\n", - " model_name,\n", + " # Required arguments\n", + " encoder_id,\n", " labels=labels,\n", + " # Optional arguments\n", " model_max_length=256,\n", " entity_max_length=6,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"apache-2.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", ")" ] }, @@ -161,9 +177,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For us, these warnings are expected, as we are initializing `BertModel` for a new task.\n", + "For us, these warnings are expected, as we are initializing `RobertaModel` for a new task.\n", + "\n", + "Note that we provided `SpanMarkerModel.from_pretrained` with a list of our labels. This is required when training a new model using an encoder. 
Furthermore, we can specify some useful configuration parameters from `SpanMarkerConfig`, such as:\n", "\n", - "Note that we provided [SpanMarkerModel.from_pretrained](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.modeling.html#span_marker.modeling.SpanMarkerModel.from_pretrained) with a list of our labels. This is required when training a new model. See [Configuring](model_configuration.ipynb) for more details and recommendations on configuration options." + "* `model_max_length`: The maximum number of tokens that the model will process. If you only use short sentences for your model, reducing this number may help training and inference speeds with no loss in performance. Defaults to the encoder maximum, or 512 if the encoder doesn't have a maximum.\n", + "* `entity_max_length`: The total number of words that one entity can be. Defaults to 8.\n", + "* `model_card_data`: A [SpanMarkerModelCardData](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.model_card.html#span_marker.model_card.SpanMarkerModelCardData) instance where you can provide a lot of useful data about your model. This data will be automatically included in a generated model card whenever a model is saved or pushed to the Hugging Face Hub.\n", + " * Consider adding `language`, `license`, `model_id`, `encoder_id` and `dataset_id` to improve the generated model card README.md file." ] }, { @@ -179,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -207,12 +228,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can create a SpanMarker `Trainer` in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." + "Now we can create a SpanMarker [Trainer](https://tomaarsen.github.io/SpanMarkerNER/api/span_marker.trainer.html#span_marker.trainer.Trainer) in the same way that you would initialize a 🤗 Transformers `Trainer`. We'll train on a subsection of the data to save us some time. Amazingly, this `Trainer` will automatically create logs using exactly the logging tools that you have installed. With other words, if you prefer logging with [Tensorboard](https://www.tensorflow.org/tensorboard), all that you have to do is install it." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -226,83 +247,20 @@ "- 3 missed entities with 10 words (0.012767%)\n" ] }, - { - "data": { - "text/html": [ - "wandb version 0.15.0 is available! To upgrade, please run:\n", - " $ pip install wandb --upgrade" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Tracking run with wandb version 0.14.0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Run data is saved locally in wandb\\run-20230428_160736-klxbldeq" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Syncing run woven-plasma-757 to Weights & Biases (docs)
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "be3fbeb39544469eba6382d146d521fa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1802 [00:00 None:\n", + " dataset_id = \"conll2003\"\n", + " dataset = load_dataset(dataset_id)\n", + " labels = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", "\n", - "model_name = \"roberta-base\"\n", - "model = SpanMarkerModel.from_pretrained(model_name, labels=labels, model_max_length=256)\n", + " encoder_id = \"roberta-base\"\n", + " model = SpanMarkerModel.from_pretrained(\n", + " # Required arguments\n", + " encoder_id,\n", + " labels=labels,\n", + " # Optional arguments\n", + " model_max_length=256,\n", + " entity_max_length=6,\n", + " # To improve the generated model card\n", + " model_card_data=SpanMarkerModelCardData(\n", + " language=[\"en\"],\n", + " license=\"apache-2.0\",\n", + " encoder_id=encoder_id,\n", + " dataset_id=dataset_id,\n", + " )\n", + " )\n", "\n", - "args = TrainingArguments(\n", - " output_dir=\"models/span-marker-roberta-base-conll03\",\n", - " learning_rate=1e-5,\n", - " gradient_accumulation_steps=2,\n", - " per_device_train_batch_size=4,\n", - " per_device_eval_batch_size=4,\n", - " num_train_epochs=1,\n", - " evaluation_strategy=\"steps\",\n", - " save_strategy=\"steps\",\n", - " eval_steps=500,\n", - " push_to_hub=False,\n", - " logging_steps=50,\n", - " warmup_ratio=0.1,\n", - ")\n", + " args = TrainingArguments(\n", + " output_dir=\"models/span-marker-roberta-base-conll03\",\n", + " learning_rate=1e-5,\n", + " gradient_accumulation_steps=2,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " num_train_epochs=1,\n", + " evaluation_strategy=\"steps\",\n", + " save_strategy=\"steps\",\n", + " eval_steps=500,\n", + " push_to_hub=False,\n", + " logging_steps=50,\n", + " fp16=True,\n", + " warmup_ratio=0.1,\n", + " )\n", "\n", - "trainer = Trainer(\n", - " model=model,\n", - " args=args,\n", - " train_dataset=dataset[\"train\"].select(range(8000)),\n", - " eval_dataset=dataset[\"validation\"].select(range(2000)),\n", - ")\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=dataset[\"train\"].select(range(8000)),\n", + " eval_dataset=dataset[\"validation\"].select(range(2000)),\n", + " )\n", + " trainer.train()\n", "\n", - "trainer.train()\n", - "trainer.save_model(\"models/span-marker-roberta-base-conll03/checkpoint-final\")\n", - "trainer.push_to_hub()\n", + " metrics = trainer.evaluate()\n", + " print(metrics)\n", "\n", - "metrics = trainer.evaluate()\n", - "print(metrics)\n", + " trainer.save_model(\"models/span-marker-roberta-base-conll03/checkpoint-final\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", "```" ] }, diff --git a/pyproject.toml b/pyproject.toml index 5c944a25..fde38413 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "torch", "accelerate", "transformers>=4.19.0", # required for EvalPrediction.inputs - "datasets>=2.0.0", + "datasets>=2.14.0", # required for sorting with multiple columns "packaging>=20.0", "evaluate", "seqeval", @@ -59,6 +59,9 @@ docs = [ wandb = [ "wandb" ] +codecarbon = [ + "codecarbon" +] [project.urls] Documentation = "https://tomaarsen.github.io/SpanMarkerNER" diff --git a/span_marker/__init__.py b/span_marker/__init__.py index 0586fd92..11e42b38 100644 --- a/span_marker/__init__.py +++ 
b/span_marker/__init__.py @@ -1,6 +1,8 @@ __version__ = "1.3.1.dev" +import importlib import logging +import os from typing import Optional, Union import torch @@ -8,6 +10,7 @@ from transformers.pipelines import PIPELINE_REGISTRY, pipeline from span_marker.configuration import SpanMarkerConfig +from span_marker.model_card import SpanMarkerModelCardData from span_marker.modeling import SpanMarkerModel from span_marker.pipeline_component import SpanMarkerPipeline from span_marker.trainer import Trainer @@ -62,5 +65,10 @@ def _spacy_span_marker_factory( return SpacySpanMarkerWrapper(model, batch_size=batch_size, device=device) +# If codecarbon is installed and the log level is not defined, +# automatically overwrite the default to "error" +if importlib.util.find_spec("codecarbon") and "CODECARBON_LOG_LEVEL" not in os.environ: + os.environ["CODECARBON_LOG_LEVEL"] = "error" + logger = logging.getLogger("span_marker") logger.setLevel(logging.INFO) diff --git a/span_marker/evaluation.py b/span_marker/evaluation.py index cf9df137..53aa2afa 100644 --- a/span_marker/evaluation.py +++ b/span_marker/evaluation.py @@ -9,7 +9,9 @@ from span_marker.tokenizer import SpanMarkerTokenizer -def compute_f1_via_seqeval(tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction) -> Dict[str, float]: +def compute_f1_via_seqeval( + tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction, is_in_train: bool +) -> Dict[str, float]: """Compute micro-F1, recall, precision and accuracy scores using ``seqeval`` for the evaluation predictions. Note: @@ -98,7 +100,7 @@ def compute_f1_via_seqeval(tokenizer: SpanMarkerTokenizer, eval_prediction: Eval with warnings.catch_warnings(): warnings.simplefilter("ignore", UndefinedMetricWarning) results = seqeval.compute() - # `results` also contains e.g. "person-athlete": {'precision': 0.5982658959537572, 'recall': 0.9, 'f1': 0.71875, 'number': 230} - # logging this all is overkill. Tensorboard doesn't even support it, WandB does, but it's not very useful generally. 
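Since `compute_f1_via_seqeval` now receives `is_in_train`, an evaluation performed outside of a training run also returns the per-label dictionaries from `seqeval` next to the overall scores (see the updated trainer tests later in this patch). A hedged sketch of consuming that output, where the `"eval_person"` key is illustrative and depends on your label scheme:

```python
# Sketch: evaluating outside of training now yields per-label results
# alongside the overall scores. Assumes `trainer` has an eval_dataset.
metrics = trainer.evaluate()

print(metrics["eval_overall_f1"])  # overall micro-F1, a plain float as before
print(metrics["eval_person"])      # illustrative label key, e.g.
# {"precision": 0.92, "recall": 0.89, "f1": 0.90, "number": 230}
```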
- # I'd like to revisit this to expose this information somehow still - return {key: value for key, value in results.items() if isinstance(value, float)} + + if is_in_train: + return {key: value for key, value in results.items() if isinstance(value, float)} + return results diff --git a/span_marker/label_normalizer.py b/span_marker/label_normalizer.py index 87862278..9843ff19 100644 --- a/span_marker/label_normalizer.py +++ b/span_marker/label_normalizer.py @@ -27,8 +27,17 @@ def __init__(self, config: SpanMarkerConfig) -> None: self.config = config @abstractmethod - def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]: - raise NotImplementedError + def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]: + pass + + def __call__(self, tokens: List[List[str]], ner_tags: List[List[int]]) -> Dict[str, List[Any]]: + output = {"ner_tags": [], "entity_count": [], "word_count": []} + for tokens, ner_tags in zip(tokens, ner_tags): + ner_tags = list(self.ner_tags_to_entities(ner_tags)) + output["ner_tags"].append(ner_tags) + output["entity_count"].append(len(ner_tags)) + output["word_count"].append(len(tokens)) + return output class LabelNormalizerScheme(LabelNormalizer): @@ -57,9 +66,6 @@ def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]: if start_idx is not None: yield (reduced_label_id, start_idx, idx + 1) - def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]: - return {"tokens": tokens, "ner_tags": list(self.ner_tags_to_entities(ner_tags))} - class LabelNormalizerIOB(LabelNormalizerScheme): def __init__(self, config: SpanMarkerConfig) -> None: @@ -108,9 +114,6 @@ def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]: if start_idx is not None: yield (entity_label_id, start_idx, idx + 1) - def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]: - return {"tokens": tokens, "ner_tags": list(self.ner_tags_to_entities(ner_tags))} - class AutoLabelNormalizer: """Factory class to return the correct LabelNormalizer subclass.""" diff --git a/span_marker/model_card.py b/span_marker/model_card.py index 399cc6f4..e9c3a507 100644 --- a/span_marker/model_card.py +++ b/span_marker/model_card.py @@ -1,79 +1,494 @@ +import logging import os +import random +from dataclasses import dataclass, field, fields from pathlib import Path -from typing import Union +from platform import python_version +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -import jinja2 -from huggingface_hub import model_info -from huggingface_hub.utils import RepositoryNotFoundError +import datasets +import tokenizers +import torch +import transformers +from datasets import Dataset +from huggingface_hub import ( + CardData, + DatasetFilter, + ModelCard, + dataset_info, + list_datasets, + model_info, +) +from huggingface_hub.repocard_data import EvalResult, eval_results_to_model_index +from huggingface_hub.utils import yaml_dump +from transformers import TrainerCallback +from transformers.integrations import CodeCarbonCallback +from transformers.modelcard import ( + extract_hyperparameters_from_trainer, + make_markdown_table, +) +from transformers.trainer_callback import TrainerControl, TrainerState +from transformers.training_args import TrainingArguments -from span_marker.configuration import SpanMarkerConfig +import span_marker -MODEL_CARD_TEMPLATE = """ ---- -license: apache-2.0 -library_name: span-marker -tags: -- span-marker -- token-classification -- ner -- 
named-entity-recognition -pipeline_tag: token-classification ---- +logger = logging.getLogger(__name__) -# SpanMarker for Named Entity Recognition +if TYPE_CHECKING: + from span_marker.modeling import SpanMarkerModel + from span_marker.trainer import Trainer -This is a [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) model that can be used \ -for Named Entity Recognition. {% if encoder_name_or_path %}In particular, this SpanMarker model uses \ -{% if is_public_model %}\ -[{{ encoder_name_or_path }}](https://huggingface.co/{{ encoder_name_or_path }})\ -{% else %}\ -"{{ encoder_name_or_path }}"\ -{% endif %} as the underlying encoder. {% endif %} -## Usage +class ModelCardCallback(TrainerCallback): + def __init__(self, trainer: "Trainer") -> None: + super().__init__() + self.trainer = trainer -To use this model for inference, first install the `span_marker` library: + callbacks = [ + callback for callback in self.trainer.callback_handler.callbacks if isinstance(callback, CodeCarbonCallback) + ] + if callbacks: + trainer.model.model_card_data.code_carbon_callback = callbacks[0] -```bash -pip install span_marker -``` + def on_train_begin( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: "SpanMarkerModel", **kwargs + ): + model.model_card_data.hyperparameters = extract_hyperparameters_from_trainer(self.trainer) -You can then run inference with this model like so: + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: "SpanMarkerModel", + metrics: Dict[str, float], + **kwargs, + ): + # Set the most recent evaluation scores for the metadata + model.model_card_data.eval_results_dict = metrics -```python -from span_marker import SpanMarkerModel + if self.trainer.is_in_train: + # Either set mid-training evaluation metrics + if "eval_loss" in metrics: + model.model_card_data.eval_lines_list.append( + { + # "Training Loss": self.state.log_history[-1]["loss"] if "loss" in self.state.log_history[-1] else "-", + "Epoch": state.epoch, + "Step": state.global_step, + "Validation Loss": metrics["eval_loss"], + "Validation Precision": metrics["eval_overall_precision"], + "Validation Recall": metrics["eval_overall_recall"], + "Validation F1": metrics["eval_overall_f1"], + "Validation Accuracy": metrics["eval_overall_accuracy"], + } + ) + else: + # Or set the post-training metrics + # Determine the dataset split + runtime_key = [key for key in metrics.keys() if key.endswith("_runtime")] + if not runtime_key: + return + dataset_split = runtime_key[0][: -len("_runtime")] -# Download from the 🤗 Hub -model = SpanMarkerModel.from_pretrained({% if model_name_or_path %}"{{ model_name_or_path }}"{% else %}"span_marker_model_name"{% endif %}) -# Run inference -entities = model.predict("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.") -``` + metric_lines = [] + for key, value in metrics.items(): + if not isinstance(value, float): + metric_lines.append( + { + "Label": key[len(dataset_split) + 1 :], + "Precision": value["precision"], + "Recall": value["recall"], + "F1": value["f1"], + } + ) + metric_lines.insert( + 0, + { + "Label": "**all**", + "Precision": metrics[f"{dataset_split}_overall_precision"], + "Recall": metrics[f"{dataset_split}_overall_recall"], + "F1": metrics[f"{dataset_split}_overall_f1"], + }, + ) + model.model_card_data.metric_lines = metric_lines -See the [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) repository for documentation and additional 
information on this library. -""" +YAML_FIELDS = [ + "language", + "license", + "library_name", + "tags", + "datasets", + "metrics", + "pipeline_tag", + "widget", + "model-index", + "co2_eq_emissions", + "base_model", +] +IGNORED_FIELDS = ["model"] -def is_public_model(encoder_name_or_path: str) -> bool: + + +@dataclass +class SpanMarkerModelCardData(CardData): + """A dataclass storing data used in the model card. + + Args: + language (Optional[Union[str, List[str]]]): The model language, either a string or a list, + e.g. "en" or ["en", "de", "nl"]. + license (Optional[str]): The license of the model, e.g. "apache-2.0", "mit" + or "cc-by-nc-sa-4.0". + model_name (Optional[str]): The pretty name of the model, e.g. "SpanMarker with mBERT-base on CoNLL03". + If not defined, uses encoder_name/encoder_id and dataset_name/dataset_id to generate a model name. + model_id (Optional[str]): The model ID when pushing the model to the Hub, + e.g. "tomaarsen/span-marker-mbert-base-multinerd". + encoder_name (Optional[str]): The pretty name of the encoder, e.g. "mBERT-base". + encoder_id (Optional[str]): The model ID of the encoder, e.g. "bert-base-multilingual-cased". + dataset_name (Optional[str]): The pretty name of the dataset, e.g. "CoNLL03". + dataset_id (Optional[str]): The dataset ID of the dataset, e.g. "tner/bionlp2004". + dataset_revision (Optional[str]): The dataset revision/commit that was used for training/evaluation. + + Note: + + Install ``nltk`` to detokenize the examples used in the model card, i.e. attach punctuation and brackets. + Additionally, ``codecarbon`` can be installed to automatically track carbon emission usage. + + Example:: + + >>> model = SpanMarkerModel.from_pretrained( + ... "bert-base-uncased", + ... labels=["O", "B-DNA", "I-DNA", "B-protein", ...], + ... # SpanMarker hyperparameters: + ... model_max_length=256, + ... marker_max_length=128, + ... entity_max_length=8, + ... # Model card variables + ... model_card_data=SpanMarkerModelCardData( + ... model_id="tomaarsen/span-marker-bbu-bionlp", + ... encoder_id="bert-base-uncased", + ... dataset_name="BioNLP2004", + ... dataset_id="tner/bionlp2004", + ... license="apache-2.0", + ... language="en", + ... ), + ... 
) + """ + + # Potentially provided by the user + language: Optional[Union[str, List[str]]] = None + license: Optional[str] = None + tags: Optional[List[str]] = field( + default_factory=lambda: [ + "span-marker", + "token-classification", + "ner", + "named-entity-recognition", + "generated_from_span_marker_trainer", + ] + ) + model_name: Optional[str] = None + model_id: Optional[str] = None + encoder_name: Optional[str] = None + encoder_id: Optional[str] = None + dataset_name: Optional[str] = None + dataset_id: Optional[str] = None + dataset_revision: Optional[str] = None + task_name: str = "Named Entity Recognition" + + # Automatically filled by `ModelCardCallback` and the Trainer directly + hyperparameters: Dict[str, Any] = field(default_factory=dict, init=False) + eval_results_dict: Optional[Dict[str, Any]] = field(default_factory=dict, init=False) + eval_lines_list: List[Dict[str, float]] = field(default_factory=list, init=False) + metric_lines: List[Dict[str, float]] = field(default_factory=list, init=False) + widget: List[Dict[str, str]] = field(default_factory=list, init=False) + predict_example: Optional[str] = field(default=None, init=False) + label_example_list: List[Dict[str, str]] = field(default_factory=list, init=False) + tokenizer_warning: bool = field(default=False, init=False) + train_set_metrics_list: List[Dict[str, str]] = field(default_factory=list, init=False) + code_carbon_callback: Optional[CodeCarbonCallback] = field(default=None, init=False) + + # Computed once, always unchanged + pipeline_tag: str = field(default="token-classification", init=False) + library_name: str = field(default="span-marker", init=False) + version: Dict[str, str] = field( + default_factory=lambda: { + "python": python_version(), + "span_marker": span_marker.__version__, + "transformers": transformers.__version__, + "torch": torch.__version__, + "datasets": datasets.__version__, + "tokenizers": tokenizers.__version__, + }, + init=False, + ) + metrics: List[str] = field(default_factory=lambda: ["precision", "recall", "f1"], init=False) + + # Passed via `register_model` only + model: Optional["SpanMarkerModel"] = field(default=None, init=False, repr=False) + + def __post_init__(self): + # We don't want to save "ignore_metadata_errors" in our Model Card + if self.dataset_id: + if is_on_huggingface(self.dataset_id, is_model=False): + # if languages are not set, try to determine the language from the dataset on the Hub + try: + info = dataset_info(self.dataset_id) + except: + pass + else: + if info.cardData: + self.language = info.cardData.get("language", self.language) + else: + logger.warning( + f"The provided {self.dataset_id!r} dataset could not be found on the Hugging Face Hub." + " Setting `dataset_id` to None." + ) + self.dataset_id = None + + if self.encoder_id and not is_on_huggingface(self.encoder_id): + logger.warning( + f"The provided {self.encoder_id!r} model could not be found on the Hugging Face Hub." + " Setting `encoder_id` to None." + ) + self.encoder_id = None + + if self.model_id and self.model_id.count("/") != 1: + logger.warning( + f"The provided {self.model_id!r} model ID should include the organization or user," + ' such as "tomaarsen/span-marker-mbert-base-multinerd". Setting `model_id` to None.' + ) + self.model_id = None + + def set_widget_examples(self, dataset: Dataset) -> None: + # If NLTK is installed, use its detokenization. Otherwise, join by spaces. 
+ try: + from nltk.tokenize.treebank import TreebankWordDetokenizer + + detokenize = TreebankWordDetokenizer().detokenize + + def map_detokenize(tokens) -> Dict[str, str]: + return {"text": detokenize(tokens)} + + except ImportError: + + def map_detokenize(tokens) -> Dict[str, str]: + return {"text": " ".join(tokens)} + + # Out of `sample_subset_size=100` random samples, select `example_count=5` good examples + # based on the number of unique entity classes. + # The shortest example is used in the inference example + sample_subset_size = 100 + example_count = 5 + if len(dataset) > sample_subset_size: + example_dataset = dataset.select(random.sample(range(len(dataset)), k=sample_subset_size)) + else: + example_dataset = dataset + + def count_entities(sample: Dict[str, Any]) -> Dict[str, int]: + unique_count = {reduced_label_id for reduced_label_id, _, _ in sample["ner_tags"]} + return {"unique_entity_count": len(unique_count)} + + example_dataset = ( + example_dataset.map(count_entities) + .sort(("unique_entity_count", "entity_count"), reverse=True) + .select(range(min(len(example_dataset), example_count))) + .map(map_detokenize, input_columns="tokens") + ) + self.widget = [{"text": sample["text"]} for sample in example_dataset] + + shortest_example = example_dataset.sort("word_count")[0]["text"] + self.predict_example = shortest_example + + def set_train_set_metrics(self, dataset: Dataset) -> None: + self.train_set_metrics_list = [ + { + "Training set": "Sentence length", + "Min": min(dataset["word_count"]), + "Median": sum(dataset["word_count"]) / len(dataset), + "Max": max(dataset["word_count"]), + }, + { + "Training set": "Entities per sentence", + "Min": min(dataset["entity_count"]), + "Median": sum(dataset["entity_count"]) / len(dataset), + "Max": max(dataset["entity_count"]), + }, + ] + + def set_label_examples(self, dataset: Dataset, id2label: Dict[int, str], outside_id: int) -> None: + num_examples_per_label = 3 + examples = {label: set() for label_id, label in id2label.items() if label_id != outside_id} + unfinished_entity_ids = set(id2label.keys()) - {outside_id} + for sample in dataset: + for entity_id, start, end in sample["ner_tags"]: + if entity_id in unfinished_entity_ids: + entity = id2label[entity_id] + example = " ".join(sample["tokens"][start:end]) + examples[entity].add(f'"{example}"') + if len(examples[entity]) >= num_examples_per_label: + unfinished_entity_ids.remove(entity_id) + if not unfinished_entity_ids: + break + self.label_example_list = [ + {"Label": label, "Examples": ", ".join(example_set)} for label, example_set in examples.items() + ] + + def infer_dataset_id(self, dataset: Dataset) -> None: + def subtuple_finder(tuple: Tuple[str], subtuple: Tuple[str]) -> int: + for i, element in enumerate(tuple): + if element == subtuple[0] and tuple[i : i + len(subtuple)] == subtuple: + return i + return -1 + + def normalize(dataset_id: str) -> str: + for token in "/\\_-": + dataset_id = dataset_id.replace(token, "") + return dataset_id.lower() + + if (cache_files := dataset.cache_files) and "filename" in cache_files[0]: + cache_path_parts = Path(cache_files[0]["filename"]).parts + # Check if the cachefile is under "huggingface/datasets" + subtuple = ("huggingface", "datasets") + index = subtuple_finder(cache_path_parts, subtuple) + if index == -1: + return + + # Get the folder after "huggingface/datasets" + cache_dataset_name = cache_path_parts[index + len(subtuple)] + # If the dataset has an author: + if "___" in cache_dataset_name: + author, dataset_name = 
cache_dataset_name.split("___") + else: + author = None + dataset_name = cache_dataset_name + + # Make sure the normalized dataset IDs match + dataset_list = [ + dataset + for dataset in list_datasets(filter=DatasetFilter(author=author, dataset_name=dataset_name)) + if normalize(dataset.id) == normalize(cache_dataset_name) + ] + # If there's only one match, get the ID from it + if len(dataset_list) == 1: + self.dataset_id = dataset_list[0].id + + def register_model(self, model: "SpanMarkerModel") -> None: + self.model = model + + if self.encoder_id is None: + encoder_id_or_path = self.model.config.get("_name_or_path") + if not os.path.exists(encoder_id_or_path): + self.encoder_id = encoder_id_or_path + + if not self.model_name: + if self.encoder_id: + self.model_name = f"SpanMarker with {self.encoder_name or self.encoder_id}" + if self.dataset_name or self.dataset_id: + self.model_name += f" on {self.dataset_name or self.dataset_id}" + else: + self.model_name = "SpanMarker" + + def to_dict(self) -> Dict[str, Any]: + super_dict = {field.name: getattr(self, field.name) for field in fields(self)} + + # Compute required formats from the raw data + if self.eval_results_dict: + dataset_split = list(self.eval_results_dict.keys())[0].split("_")[0] + dataset_id = self.dataset_id or "unknown" + dataset_name = self.dataset_name or "Unknown" + eval_results = [ + EvalResult( + task_type="token-classification", + dataset_type=dataset_id, + dataset_name=dataset_name, + metric_type="f1", + metric_value=self.eval_results_dict[f"{dataset_split}_overall_f1"], + task_name="Named Entity Recognition", + dataset_split=dataset_split, + dataset_revision=self.dataset_revision, + metric_name="F1", + ), + EvalResult( + task_type="token-classification", + dataset_type=dataset_id, + dataset_name=dataset_name, + metric_type="precision", + metric_value=self.eval_results_dict[f"{dataset_split}_overall_precision"], + task_name="Named Entity Recognition", + dataset_split=dataset_split, + dataset_revision=self.dataset_revision, + metric_name="Precision", + ), + EvalResult( + task_type="token-classification", + dataset_type=dataset_id, + dataset_name=dataset_name, + metric_type="recall", + metric_value=self.eval_results_dict[f"{dataset_split}_overall_recall"], + task_name="Named Entity Recognition", + dataset_split=dataset_split, + dataset_revision=self.dataset_revision, + metric_name="Recall", + ), + ] + super_dict["model-index"] = eval_results_to_model_index(self.model_name, eval_results) + super_dict["eval_lines"] = make_markdown_table(self.eval_lines_list) + # Replace |:---:| with |:---| for left alignment + super_dict["label_examples"] = make_markdown_table(self.label_example_list).replace("-:|", "--|") + super_dict["train_set_metrics"] = make_markdown_table(self.train_set_metrics_list).replace("-:|", "--|") + super_dict["metrics_table"] = make_markdown_table(self.metric_lines).replace("-:|", "--|") + if self.code_carbon_callback and self.code_carbon_callback.tracker: + emissions_data = self.code_carbon_callback.tracker._prepare_emissions_data() + super_dict["co2_eq_emissions"] = { + # * 1000 to convert kg to g + "emissions": float(emissions_data.emissions) * 1000, + "source": "codecarbon", + "training_type": "fine-tuning", + "on_cloud": emissions_data.on_cloud == "Y", + "cpu_model": emissions_data.cpu_model, + "ram_total_size": emissions_data.ram_total_size, + "hours_used": round(emissions_data.duration / 3600, 3), + } + if emissions_data.gpu_model: + super_dict["co2_eq_emissions"]["hardware_used"] = 
emissions_data.gpu_model + if self.dataset_id: + super_dict["datasets"] = [self.dataset_id] + if self.encoder_id: + super_dict["base_model"] = self.encoder_id + super_dict["model_max_length"] = self.model.tokenizer.model_max_length + + for key in IGNORED_FIELDS: + super_dict.pop(key, None) + return { + **self.model.config.to_dict(), + **super_dict, + } + + def to_yaml(self, line_break=None) -> str: + return yaml_dump( + {key: value for key, value in self.to_dict().items() if key in YAML_FIELDS and value is not None}, + sort_keys=False, + line_break=line_break, + ).strip() + + +def is_on_huggingface(repo_id: str, is_model: bool = True) -> bool: # Models with more than two 'sections' certainly are not public models - if len(encoder_name_or_path.split("/")) > 2: + if len(repo_id.split("/")) > 2: return False try: - model_info(encoder_name_or_path) + if is_model: + model_info(repo_id) + else: + dataset_info(repo_id) return True - except RepositoryNotFoundError: + except: + # Fetching models can fail for many reasons: Repository not existing, no internet access, HF down, etc. return False -def generate_model_card(save_directory: Union[str, os.PathLike], config: SpanMarkerConfig) -> str: - template = jinja2.Environment().from_string(MODEL_CARD_TEMPLATE) - save_directory = Path(save_directory) - context = {} - - context["model_name_or_path"] = "span_marker_model_name" - - if "_name_or_path" in config.encoder: - context["encoder_name_or_path"] = config.encoder["_name_or_path"] - context["is_public_model"] = is_public_model(context["encoder_name_or_path"]) - - return template.render(context) +def generate_model_card(model: "SpanMarkerModel") -> str: + template_path = Path(__file__).parent / "model_card_template.md" + model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗") + return model_card.content diff --git a/span_marker/model_card_template.md b/span_marker/model_card_template.md new file mode 100644 index 00000000..e8430a7e --- /dev/null +++ b/span_marker/model_card_template.md @@ -0,0 +1,167 @@ +--- +# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1 +# Doc / guide: https://huggingface.co/docs/hub/model-cards +{{ card_data }} +--- + +# {{ model_name | default("SpanMarker for Named Entity Recognition", true) }} + +This is a [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) model{% if dataset_id %} trained on the [{{ dataset_name if dataset_name else dataset_id }}](https://huggingface.co/datasets/{{ dataset_id }}) dataset{% endif %} that can be used for {{ task_name | default("Named Entity Recognition", true) }}.{% if encoder_id %} This SpanMarker model uses [{{ encoder_name if encoder_name else encoder_id }}](https://huggingface.co/{{ encoder_id }}) as the underlying encoder.{% endif %} + +## Model Details + +### Model Description +- **Model Type:** SpanMarker +{% if encoder_id -%} + - **Encoder:** [{{ encoder_name if encoder_name else encoder_id }}](https://huggingface.co/{{ encoder_id }}) +{%- else -%} + +{%- endif %} +- **Maximum Sequence Length:** {{ model_max_length }} tokens +- **Maximum Entity Length:** {{ entity_max_length }} words +{% if dataset_id -%} + - **Training Dataset:** [{{ dataset_name if dataset_name else dataset_id }}](https://huggingface.co/datasets/{{ dataset_id }}) +{%- else -%} + +{%- endif %} +{% if language -%} + - **Language{{"s" if language is not string and language | length > 1 else ""}}:** + {%- if language is string %} {{ 
language }} + {%- else %} {% for lang in language -%} + {{ lang }}{{ ", " if not loop.last else "" }} + {%- endfor %} + {%- endif %} +{%- else -%} + +{%- endif %} +{% if license -%} + - **License:** {{ license }} +{%- else -%} + +{%- endif %} + +### Model Sources + +- **Repository:** [SpanMarker on GitHub](https://github.com/tomaarsen/SpanMarkerNER) +- **Thesis:** [SpanMarker For Named Entity Recognition](https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf) +{% if label_examples %} +### Model Labels +{{ label_examples }}{% endif -%} +{% if metrics_table %} +## Evaluation + +### Metrics +{{ metrics_table }}{% endif %} +## Uses + +### Direct Use for Inference + +```python +from span_marker import SpanMarkerModel + +# Download from the {{ hf_emoji }} Hub +model = SpanMarkerModel.from_pretrained("{{ model_id | default('span_marker_model_id', true) }}") +# Run inference +entities = model.predict("{{ predict_example | replace('"', '\\"') | default("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.", true)}}") +``` + +### Downstream Use +You can finetune this model on your own dataset. + +
Click to expand + +```python +from span_marker import SpanMarkerModel, Trainer + +# Download from the {{ hf_emoji }} Hub +model = SpanMarkerModel.from_pretrained("{{ model_id | default('span_marker_model_id', true) }}") + +# Specify a Dataset with "tokens" and "ner_tag" columns +dataset = load_dataset("conll2003") # For example CoNLL2003 + +# Initialize a Trainer using the pretrained model & dataset +trainer = Trainer( + model=model, + train_dataset=dataset["train"], + eval_dataset=dataset["validation"], +) +trainer.train() +trainer.save_model("{{ model_id | default('span_marker_model_id', true) }}-finetuned") +``` +
+ + + + + + + +## Training Details +{% if train_set_metrics %} +### Training Set Metrics +{{ train_set_metrics }}{% endif %}{% if hyperparameters %} +### Training Hyperparameters +{% for name, value in hyperparameters.items() %}- {{ name }}: {{ value }} +{% endfor %}{% endif %}{% if eval_lines %} +### Training Results +{{ eval_lines }}{% endif %}{% if co2_eq_emissions %} +### Environmental Impact +Carbon emissions were measured using [CodeCarbon](https://github.com/mlco2/codecarbon). +- **Carbon Emitted**: {{ "%.3f"|format(co2_eq_emissions["emissions"] / 1000) }} kg of CO2 +- **Hours Used**: {{ co2_eq_emissions["hours_used"] }} hours + +### Training Hardware +- **On Cloud**: {{ "Yes" if co2_eq_emissions["on_cloud"] else "No" }} +- **GPU Model**: {{ co2_eq_emissions["hardware_used"] or "No GPU used" }} +- **CPU Model**: {{ co2_eq_emissions["cpu_model"] }} +- **RAM Size**: {{ "%.2f"|format(co2_eq_emissions["ram_total_size"]) }} GB +{% endif %} +### Framework Versions +- Python: {{ version["python"] }} +- SpanMarker: {{ version["span_marker"] }} +- Transformers: {{ version["transformers"] }} +- PyTorch: {{ version["torch"] }} +- Datasets: {{ version["datasets"] }} +- Tokenizers: {{ version["tokenizers"] }} + +## Citation + +### BibTeX +``` +@software{Aarsen_SpanMarker, + author = {Aarsen, Tom}, + license = {Apache-2.0}, + title = {{"{{SpanMarker for Named Entity Recognition}}"}}, + url = {https://github.com/tomaarsen/SpanMarkerNER} +} +``` + + + + + + \ No newline at end of file diff --git a/span_marker/modeling.py b/span_marker/modeling.py index 631aea7f..8f223458 100644 --- a/span_marker/modeling.py +++ b/span_marker/modeling.py @@ -15,7 +15,7 @@ from span_marker import __version__ as span_marker_version from span_marker.configuration import SpanMarkerConfig from span_marker.data_collator import SpanMarkerDataCollator -from span_marker.model_card import generate_model_card +from span_marker.model_card import SpanMarkerModelCardData, generate_model_card from span_marker.output import SpanMarkerOutput from span_marker.tokenizer import SpanMarkerTokenizer @@ -50,7 +50,13 @@ class SpanMarkerModel(PreTrainedModel): base_model_prefix = "encoder" _no_split_modules = [] # To support `load_in_8bit=True`` and `device_map="auto"` - def __init__(self, config: SpanMarkerConfig, encoder: Optional[PreTrainedModel] = None, **kwargs) -> None: + def __init__( + self, + config: SpanMarkerConfig, + encoder: Optional[PreTrainedModel] = None, + model_card_data: Optional[SpanMarkerModelCardData] = None, + **kwargs, + ) -> None: """Initialize a SpanMarkerModel using configuration. Do not manually initialize a SpanMarkerModel this way! Use :meth:`~SpanMarkerModel.from_pretrained` instead. @@ -89,6 +95,9 @@ def __init__(self, config: SpanMarkerConfig, encoder: Optional[PreTrainedModel] self.tokenizer = None self.data_collator = None + self.model_card_data = model_card_data or SpanMarkerModelCardData() + self.model_card_data.register_model(self) + # Initialize weights and apply final processing self.post_init() @@ -199,6 +208,8 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, os.PathLike], *model_args, labels: Optional[List[str]] = None, + config: Optional[SpanMarkerConfig] = None, + model_card_data: Optional[SpanMarkerModelCardData] = None, **kwargs, ) -> T: """Instantiate a pretrained pytorch model from a pre-trained model configuration. 
@@ -240,7 +251,9 @@ def from_pretrained( """ # If loading a SpanMarkerConfig, then we don't want to override id2label and label2id # Create an encoder or SpanMarker config - config: PretrainedConfig = AutoConfig.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + config: PretrainedConfig = config or AutoConfig.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) # if 'pretrained_model_name_or_path' refers to a SpanMarkerModel instance, initialize it directly loading_span_marker = isinstance(config, cls.config_class) @@ -253,7 +266,9 @@ def from_pretrained( " introduced in v1.0.0, this is not recommended. Either retrain your model for" f" v{span_marker_version}, or install `span_marker < 1.0.0`." ) - model = super().from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + model = super().from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **kwargs, model_card_data=model_card_data + ) # If 'pretrained_model_name_or_path' refers to an encoder (roberta, bert, distilbert, electra, etc.), # then initialize it and create the SpanMarker config and model using the encoder and its config. @@ -280,7 +295,7 @@ def from_pretrained( config = cls.config_class( encoder_config=config.to_dict(), span_marker_version=span_marker_version, **kwargs ) - model = cls(config, encoder, *model_args, **kwargs) + model = cls(config, encoder, *model_args, **kwargs, model_card_data=model_card_data) # Pass the tokenizer directly to the model for convenience, this way the user doesn't have to # make it themselves. @@ -288,7 +303,12 @@ def from_pretrained( config.encoder.get("_name_or_path", pretrained_model_name_or_path), config=config, **kwargs ) model.set_tokenizer(tokenizer) - model.resize_token_embeddings(len(tokenizer)) + # Since transformers 4.32.0 we should use `pad_to_multiple_of=8`. + # That'll fail for earlier versions, so we try-except it. + try: + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) + except TypeError: + model.resize_token_embeddings(len(tokenizer)) return model @classmethod @@ -585,7 +605,15 @@ def save_pretrained( **kwargs, ) with open(os.path.join(save_directory, "README.md"), "w", encoding="utf-8") as f: - f.write(generate_model_card(save_directory, self.config)) + f.write(self.generate_model_card()) + + def generate_model_card(self) -> str: + """Generate and return a model card string based on the model card data. + + Returns: + str: The model card string. + """ + return generate_model_card(self) def try_cuda(self, device: Optional[Union[int, device]] = None) -> Self: """Try to moves all model parameters and buffers to the GPU, do nothing if failed. diff --git a/span_marker/tokenizer.py b/span_marker/tokenizer.py index 59a85b84..89135f2a 100644 --- a/span_marker/tokenizer.py +++ b/span_marker/tokenizer.py @@ -5,7 +5,8 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import numpy as np -from transformers import AutoTokenizer, PreTrainedTokenizer +from tokenizers.pre_tokenizers import Punctuation, Sequence +from transformers import AutoTokenizer, PreTrainedTokenizer, XLMRobertaTokenizerFast from span_marker.configuration import SpanMarkerConfig @@ -269,4 +270,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, *inputs, **kwargs, add_prefix_space=True ) + # XLM-R is known to have some tokenization issues, so be sure to also split on punctuation. 
+ # Strictly required for inference, shouldn't affect training. + if isinstance(tokenizer, XLMRobertaTokenizerFast): + tokenizer._tokenizer.pre_tokenizer = Sequence([Punctuation(), tokenizer._tokenizer.pre_tokenizer]) + return cls(tokenizer, config=config, **kwargs) diff --git a/span_marker/trainer.py b/span_marker/trainer.py index bc731fba..9ae4370f 100644 --- a/span_marker/trainer.py +++ b/span_marker/trainer.py @@ -1,6 +1,7 @@ import dataclasses import logging import math +import os from typing import Any, Callable, Dict, List, Optional, Tuple import torch @@ -17,6 +18,7 @@ from span_marker.evaluation import compute_f1_via_seqeval from span_marker.label_normalizer import AutoLabelNormalizer, LabelNormalizer +from span_marker.model_card import ModelCardCallback from span_marker.modeling import SpanMarkerModel from span_marker.tokenizer import SpanMarkerTokenizer @@ -107,7 +109,7 @@ def __init__( # Set some Training arguments that must be set for SpanMarker if args is None: args = TrainingArguments( - output_dir="models/my_span_marker_model", include_inputs_for_metrics=True, remove_unused_columns=True + output_dir="models/my_span_marker_model", include_inputs_for_metrics=True, remove_unused_columns=False ) else: args = dataclasses.replace(args, include_inputs_for_metrics=True, remove_unused_columns=False) @@ -115,11 +117,25 @@ def __init__( # Always compute `compute_f1_via_seqeval` - optionally compute user-provided metrics if compute_metrics is not None: compute_metrics_func = lambda eval_prediction: { - **compute_f1_via_seqeval(model.tokenizer, eval_prediction), + **compute_f1_via_seqeval(model.tokenizer, eval_prediction, self.is_in_train), **compute_metrics(eval_prediction), } else: - compute_metrics_func = lambda eval_prediction: compute_f1_via_seqeval(model.tokenizer, eval_prediction) + compute_metrics_func = lambda eval_prediction: compute_f1_via_seqeval( + model.tokenizer, eval_prediction, self.is_in_train + ) + + # If the model ID is set via the TrainingArguments, but not via the SpanMarkerModelCardData, + # then we can set it here for the model card regardless + if args.hub_model_id and not model.model_card_data.model_id: + model.model_card_data.model_id = args.hub_model_id + + if not model.model_card_data.dataset_id: + # Inferring is hacky - it may break in the future, so let's be safe + try: + model.model_card_data.infer_dataset_id(train_dataset) + except Exception: + pass super().__init__( model=model, @@ -143,6 +159,10 @@ def __init__( # Override the type hint self.model: SpanMarkerModel + # Add the callback for filling the model card data with hyperparameters + # and evaluation results + self.add_callback(ModelCardCallback(self)) + def preprocess_dataset( self, dataset: Dataset, @@ -177,11 +197,31 @@ def preprocess_dataset( set(dataset.column_names) - set(self.OPTIONAL_COLUMNS) - set(self.REQUIRED_COLUMNS) ) # Normalize the labels to a common format (list of label-start-end tuples) + # Also add "entity_count" and "word_count" labels dataset = dataset.map( label_normalizer, input_columns=("tokens", "ner_tags"), desc=f"Label normalizing the {dataset_name} dataset", + batched=True, ) + + # Setting model card data based on training data + if not is_evaluate: + # Pick some example entities from each entity class for the model card. 
+ if not self.model.model_card_data.label_example_list: + self.model.model_card_data.set_label_examples( + dataset, self.model.config.id2label, self.model.config.outside_id + ) + if not self.model.model_card_data.train_set_metrics_list: + self.model.model_card_data.set_train_set_metrics(dataset) + + # Set some example sentences for the model card widget + if is_evaluate and not self.model.model_card_data.widget: + self.model.model_card_data.set_widget_examples(dataset) + + # Remove dataset columns that are only used for the model card + dataset = dataset.remove_columns(("entity_count", "word_count")) + # Tokenize and add start/end markers with tokenizer.entity_tracker(split=dataset_name): dataset = dataset.map( @@ -393,3 +433,11 @@ def predict( f"Consider using `{self.model.__class__.__name__}.predict` instead." ) return super().predict(test_dataset, ignore_keys, metric_key_prefix) + + def create_model_card(self, *_args, **_kwargs) -> None: + """ + Creates a draft of a model card using the information available to the `Trainer`, + the `SpanMarkerModel` and the `SpanMarkerModelCardData`. + """ + with open(os.path.join(self.args.output_dir, "README.md"), "w", encoding="utf8") as f: + f.write(self.model.generate_model_card()) diff --git a/tests/conftest.py b/tests/conftest.py index 73148b79..6f1f8746 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,10 +30,7 @@ def randomize_seed() -> None: yield - from datasets.fingerprint import ( - _TEMP_DIR_FOR_TEMP_CACHE_FILES, - get_temporary_cache_files_directory, - ) + from datasets.fingerprint import _TEMP_DIR_FOR_TEMP_CACHE_FILES if _TEMP_DIR_FOR_TEMP_CACHE_FILES: _TEMP_DIR_FOR_TEMP_CACHE_FILES._cleanup() diff --git a/tests/constants.py b/tests/constants.py index fe31f1b4..f4c70aaa 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -124,4 +124,4 @@ ] TINY_BERT = "prajjwal1/bert-tiny" -DEFAULT_ARGS = TrainingArguments(output_dir="models/my_span_marker_model", report_to="none") +DEFAULT_ARGS = TrainingArguments(output_dir="models/my_span_marker_model", report_to="none", num_train_epochs=1) diff --git a/tests/model_card_pattern.py b/tests/model_card_pattern.py new file mode 100644 index 00000000..72be8318 --- /dev/null +++ b/tests/model_card_pattern.py @@ -0,0 +1,217 @@ +import re + +MODEL_CARD_PATTERN = re.compile( + """\
+---
+language:
+- en
+license: apache-2\.0
+library_name: span-marker
+tags:
+- span-marker
+- token-classification
+- ner
+- named-entity-recognition
+- generated_from_span_marker_trainer
+datasets:
+- conll2003
+metrics:
+- precision
+- recall
+- f1
+widget:
+- text: .*
+pipeline_tag: token-classification
+co2_eq_emissions:
+  emissions: [\d\.\-e]+
+  source: codecarbon
+  training_type: fine-tuning
+  on_cloud: (false|true)
+  cpu_model: .+
+  ram_total_size: [\d\.]+
+  hours_used: [\d\.]+
+( hardware_used: .+
+)?base_model: prajjwal1/bert-tiny
+model-index:
+- name: SpanMarker with prajjwal1/bert-tiny on CoNLL 2003
+  results:
+  - task:
+      type: token-classification
+      name: Named Entity Recognition
+    dataset:
+      name: CoNLL 2003
+      type: conll2003
+      split: eval
+    metrics:
+    - type: f1
+      value: [\d\.]+
+      name: F1
+    - type: precision
+      value: [\d\.]+
+      name: Precision
+    - type: recall
+      value: [\d\.]+
+      name: Recall
+---
+
+# SpanMarker with prajjwal1/bert-tiny on CoNLL 2003
+
+This is a \[SpanMarker\]\(https://github.com/tomaarsen/SpanMarkerNER\) model trained on the \[CoNLL 2003\]\(https://huggingface.co/datasets/conll2003\) dataset that can be used for Named Entity Recognition. 
This SpanMarker model uses \[prajjwal1/bert-tiny\]\(https://huggingface.co/prajjwal1/bert-tiny\) as the underlying encoder. + +## Model Details + +### Model Description +- \*\*Model Type:\*\* SpanMarker +- \*\*Encoder:\*\* \[prajjwal1/bert-tiny\]\(https://huggingface.co/prajjwal1/bert-tiny\) +- \*\*Maximum Sequence Length:\*\* 512 tokens +- \*\*Maximum Entity Length:\*\* 8 words +- \*\*Training Dataset:\*\* \[CoNLL 2003\]\(https://huggingface.co/datasets/conll2003\) +- \*\*Language:\*\* en +- \*\*License:\*\* apache-2.0 + +### Model Sources + +- \*\*Repository:\*\* \[SpanMarker on GitHub\]\(https://github.com/tomaarsen/SpanMarkerNER\) +- \*\*Thesis:\*\* \[SpanMarker For Named Entity Recognition\]\(https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf\) + +### Model Labels +\| Label \| Examples \| +\|:-------------\|:--------------------------------------------\| +\| art \| \| +\| building \| \| +\| event \| \| +\| location \| \| +\| organization \| \| +\| other \| \| +\| person \| [^\|]+ \| +\| product \| \| + +## Uses + +### Direct Use for Inference + +```python +from span_marker import SpanMarkerModel + +# Download from the [^H]+ Hub +model = SpanMarkerModel.from_pretrained\("tomaarsen/span-marker-test-model-card"\) +# Run inference +entities = model.predict\(".+"\) +``` + +### Downstream Use +You can finetune this model on your own dataset. + +
Click to expand + +```python +from span_marker import SpanMarkerModel, Trainer + +# Download from the [^H]+ Hub +model = SpanMarkerModel.from_pretrained\("tomaarsen/span-marker-test-model-card"\) + +# Specify a Dataset with "tokens" and "ner_tag" columns +dataset = load_dataset\("conll2003"\) # For example CoNLL2003 + +# Initialize a Trainer using the pretrained model & dataset +trainer = Trainer\( + model=model, + train_dataset=dataset\["train"\], + eval_dataset=dataset\["validation"\], +\) +trainer.train\(\) +trainer.save_model\("tomaarsen/span-marker-test-model-card-finetuned"\) +``` +
+ + + + + + + +## Training Details + +### Training Set Metrics +\| Training set \| Min \| Median \| Max \| +\|:----------------------\|:----\|:-------\|:----\| +\| Sentence length \| 4 \| 8.0 \| 12 \| +\| Entities per sentence \| 0 \| 1.5 \| 3 \| + +### Training Hyperparameters +- learning_rate: 5e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Adam with betas=\(0.9,0.999\) and epsilon=1e-08 +- lr_scheduler_type: linear +- num_epochs: 1 + +### Training Results +\| Epoch \| Step \| Validation Loss \| Validation Precision \| Validation Recall \| Validation F1 \| Validation Accuracy \| +\|:-----:\|:----:\|:---------------:\|:--------------------:\|:-----------------:\|:-------------:\|:-------------------:\| +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| [\d\.]+ +\| + +### Environmental Impact +Carbon emissions were measured using \[CodeCarbon\]\(https://github.com/mlco2/codecarbon\)\. +- \*\*Carbon Emitted\*\*: [\d\.]+ kg of CO2 +- \*\*Hours Used\*\*: [\d\.]+ hours + +### Training Hardware +- \*\*On Cloud\*\*: (Yes|No) +- \*\*GPU Model\*\*: [^\n]+ +- \*\*CPU Model\*\*: [^\n]+ +- \*\*RAM Size\*\*: [\d\.]+ GB + +### Framework Versions +- Python: [^\n]+ +- SpanMarker: [^\n]+ +- Transformers: [^\n]+ +- PyTorch: [^\n]+ +- Datasets: [^\n]+ +- Tokenizers: [^\n]+ + +## Citation + +### BibTeX +``` +@software{Aarsen_SpanMarker, + author = {Aarsen, Tom}, + license = {Apache-2.0}, + title = {{SpanMarker for Named Entity Recognition}}, + url = {https://github.com/tomaarsen/SpanMarkerNER} +} +``` + + + + + +""", + flags=re.DOTALL, +) diff --git a/tests/test_model_card.py b/tests/test_model_card.py index f9fb39a0..2ee1f3a6 100644 --- a/tests/test_model_card.py +++ b/tests/test_model_card.py @@ -1,32 +1,122 @@ +import logging from pathlib import Path -from span_marker.model_card import generate_model_card -from span_marker.modeling import SpanMarkerModel +import pytest +from datasets import DatasetDict, load_dataset +from span_marker import ( + SpanMarkerModel, + SpanMarkerModelCardData, + Trainer, + TrainingArguments, +) +from span_marker.model_card import generate_model_card, is_on_huggingface -def test_model_card(finetuned_fewnerd_span_marker_model: SpanMarkerModel, tmp_path: Path) -> None: - config = finetuned_fewnerd_span_marker_model.config - model_card = generate_model_card(tmp_path, config) - assert ( - "uses [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny) as the underlying encoder" in model_card +from .constants import CONLL_LABELS, FEWNERD_COARSE_LABELS, TINY_BERT +from .model_card_pattern import MODEL_CARD_PATTERN + + +def test_model_card(fewnwerd_coarse_dataset_dict: DatasetDict, tmp_path: Path) -> None: + base_encoder_id = TINY_BERT + model = SpanMarkerModel.from_pretrained( + base_encoder_id, + labels=FEWNERD_COARSE_LABELS, + model_card_data=SpanMarkerModelCardData( + model_id="tomaarsen/span-marker-test-model-card", + dataset_id="conll2003", + dataset_name="CoNLL 2003", + encoder_id=base_encoder_id, + language="en", + license="apache-2.0", + ), + ) + train_dataset = fewnwerd_coarse_dataset_dict["train"] + eval_dataset = fewnwerd_coarse_dataset_dict["test"].select(range(1)) + + args = TrainingArguments( + str(tmp_path), + report_to="codecarbon", + eval_steps=1, + per_device_train_batch_size=1, + evaluation_strategy="steps", + num_train_epochs=1, + ) + trainer = Trainer( + model=model, + args=args, + train_dataset=train_dataset, + 
eval_dataset=eval_dataset, + ) + trainer.train() + model_card = generate_model_card(trainer.model) + assert MODEL_CARD_PATTERN.fullmatch(model_card) + + +def test_model_card_languages() -> None: + model = SpanMarkerModel.from_pretrained( + TINY_BERT, + labels=FEWNERD_COARSE_LABELS, + model_card_data=SpanMarkerModelCardData( + language=["en", "nl", "de"], + ), + ) + model_card = model.generate_model_card() + assert "**Languages:** en, nl, de" in model_card + + +def test_model_card_warnings(caplog: pytest.LogCaptureFixture): + SpanMarkerModelCardData(dataset_id="test_value") + assert any( + [ + level == logging.WARNING + and text == "The provided 'test_value' dataset could not be found on the Hugging Face Hub." + " Setting `dataset_id` to None." + for (_, level, text) in caplog.record_tuples + ] ) - assert f'SpanMarkerModel.from_pretrained("span_marker_model_name")' in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card - - config.encoder["_name_or_path"] = "does_not_exist" - model_card = generate_model_card(tmp_path, config) - assert 'uses "does_not_exist" as the underlying encoder' in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card - - del config.encoder["_name_or_path"] - model_card = generate_model_card(tmp_path, config) - assert "as the underlying encoder" not in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card - - model_card = generate_model_card("tomaarsen/my_test_model", config) - assert f'SpanMarkerModel.from_pretrained("span_marker_model_name")' in model_card - assert "\n\n\n" not in model_card - assert "\n\n## Usage" in model_card + + caplog.clear() + SpanMarkerModelCardData(encoder_id="test_value") + assert any( + [ + level == logging.WARNING + and text == "The provided 'test_value' model could not be found on the Hugging Face Hub." + " Setting `encoder_id` to None." + for (_, level, text) in caplog.record_tuples + ] + ) + + caplog.clear() + SpanMarkerModelCardData(model_id="test_value") + assert any( + [ + level == logging.WARNING + and text == "The provided 'test_value' model ID should include the organization or user," + ' such as "tomaarsen/span-marker-mbert-base-multinerd". Setting `model_id` to None.' 
+ for (_, level, text) in caplog.record_tuples + ] + ) + + +def test_is_on_huggingface_edge_case() -> None: + assert not is_on_huggingface("test_value") + assert not is_on_huggingface("a/test/value") + + +@pytest.mark.parametrize("dataset_id", ("conll2003", "tomaarsen/conll2003")) +def test_infer_dataset_id(dataset_id: str) -> None: + model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS) + train_dataset = load_dataset(dataset_id, split="train") + + # This triggers inferring the dataset_id from train_dataset + Trainer(model=model, train_dataset=train_dataset) + assert model.model_card_data.dataset_id == dataset_id + + +def test_cant_infer_dataset_id(conll_dataset_dict: DatasetDict): + model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS) + train_dataset = conll_dataset_dict["train"] + + # This triggers inferring the dataset_id from train_dataset + Trainer(model=model, train_dataset=train_dataset) + assert model.model_card_data.dataset_id == None diff --git a/tests/test_trainer.py b/tests/test_trainer.py index b39db86e..253d7eea 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -5,9 +5,11 @@ import pytest from datasets import Dataset, DatasetDict -from transformers import EvalPrediction +from pytest import LogCaptureFixture +from transformers import AutoTokenizer, EvalPrediction, TrainingArguments from span_marker.modeling import SpanMarkerModel +from span_marker.tokenizer import SpanMarkerTokenizer from span_marker.trainer import Trainer from tests.constants import CONLL_LABELS, DEFAULT_ARGS, TINY_BERT @@ -41,7 +43,9 @@ def test_trainer_standard( assert model.config.trained_with_document_context metrics = trainer.evaluate() assert isinstance(metrics, dict) - assert set(metrics.keys()) == { + labels = {label for label, _id in model.config.label2id.items() if _id != model.config.outside_id} + keys = {f"eval_{label}" for label in labels} + assert set(metrics.keys()) <= { "eval_loss", "eval_overall_f1", "eval_overall_recall", @@ -51,7 +55,11 @@ def test_trainer_standard( "eval_samples_per_second", "eval_steps_per_second", "epoch", + *keys, } + for key in keys: + if key in metrics: + assert metrics[key].keys() == {"f1", "number", "precision", "recall"} # Try saving and loading the model model_path = tmp_path / model_fixture / dataset_fixture @@ -144,7 +152,9 @@ def test_trainer_incorrect_columns(finetuned_conll_span_marker_model: SpanMarker trainer.evaluate() -def test_trainer_entity_tracker_warning_entity_length(conll_dataset_dict: DatasetDict, caplog) -> None: +def test_trainer_entity_tracker_warning_entity_length( + conll_dataset_dict: DatasetDict, caplog: LogCaptureFixture +) -> None: model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS, entity_max_length=1) trainer = Trainer( model, args=DEFAULT_ARGS, train_dataset=conll_dataset_dict["train"], eval_dataset=conll_dataset_dict["train"] @@ -166,7 +176,9 @@ def test_trainer_entity_tracker_warning_entity_length(conll_dataset_dict: Datase assert any([eval_pattern.search(record.msg) for record in caplog.records]) -def test_trainer_entity_tracker_warning_model_length(conll_dataset_dict: DatasetDict, caplog) -> None: +def test_trainer_entity_tracker_warning_model_length( + conll_dataset_dict: DatasetDict, caplog: LogCaptureFixture +) -> None: model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS, model_max_length=5) trainer = Trainer( model, args=DEFAULT_ARGS, train_dataset=conll_dataset_dict["train"], eval_dataset=conll_dataset_dict["train"] @@ -188,7 +200,9 @@ 
def test_trainer_entity_tracker_warning_model_length(conll_dataset_dict: Dataset assert any([eval_pattern.match(record.msg) for record in caplog.records]) -def test_trainer_entity_tracker_warning_entity_and_model_length(conll_dataset_dict: DatasetDict, caplog) -> None: +def test_trainer_entity_tracker_warning_entity_and_model_length( + conll_dataset_dict: DatasetDict, caplog: LogCaptureFixture +) -> None: model = SpanMarkerModel.from_pretrained(TINY_BERT, labels=CONLL_LABELS, model_max_length=5, entity_max_length=1) trainer = Trainer( model, args=DEFAULT_ARGS, train_dataset=conll_dataset_dict["train"], eval_dataset=conll_dataset_dict["train"] @@ -212,3 +226,27 @@ def test_trainer_entity_tracker_warning_entity_and_model_length(conll_dataset_di r".*\nAdditionally, a total of \d+ \([\d\.]+%\) entities were missed due to the maximum input length\." ) assert any([eval_pattern.match(record.msg) for record in caplog.records]) + + +def test_trainer_no_args(finetuned_conll_span_marker_model: SpanMarkerModel) -> None: + trainer = Trainer(model=finetuned_conll_span_marker_model) + assert trainer.args.output_dir == "models/my_span_marker_model" + assert trainer.args.include_inputs_for_metrics == True + assert trainer.args.remove_unused_columns == False + + +def test_trainer_set_model_id_via_hub(finetuned_conll_span_marker_model: SpanMarkerModel, tmp_path: Path) -> None: + model = finetuned_conll_span_marker_model + model_id = "test_value" + args = TrainingArguments(output_dir=str(tmp_path), hub_model_id=model_id, report_to="none") + Trainer(model=model, args=args) + # Ensure that the model card data is set via the Trainer init + assert model.model_card_data.model_id == model_id + + +def test_trainer_create_model_card(finetuned_conll_span_marker_model: SpanMarkerModel, tmp_path: Path) -> None: + model = finetuned_conll_span_marker_model + args = TrainingArguments(output_dir=str(tmp_path), report_to="none") + trainer = Trainer(model=model, args=args) + trainer.create_model_card() + assert (tmp_path / "README.md").exists() diff --git a/training_scripts/conll03_context.py b/training_scripts/conll03_context.py index b2249371..7c4e18a4 100644 --- a/training_scripts/conll03_context.py +++ b/training_scripts/conll03_context.py @@ -1,24 +1,35 @@ from datasets import load_dataset from transformers import TrainingArguments -from span_marker import SpanMarkerModel, Trainer +from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer def main() -> None: # Load the dataset, ensure "tokens", "ner_tags", "document_id" and "sentence_id" columns, # and get a list of labels - dataset = load_dataset("tomaarsen/conll2003") + dataset_id = "conll2003" + dataset_name = "CoNLL 2003" + dataset = load_dataset(dataset_id) labels = dataset["train"].features["ner_tags"].feature.names # Initialize a SpanMarker model using a pretrained BERT-style encoder - model_name = "xlm-roberta-large" + encoder_id = "xlm-roberta-large" model = SpanMarkerModel.from_pretrained( - model_name, + encoder_id, labels=labels, # SpanMarker hyperparameters: model_max_length=512, marker_max_length=128, entity_max_length=8, + # Model card arguments + model_card_data=SpanMarkerModelCardData( + model_id="tomaarsen/span-marker-xlm-roberta-large-conll03-doc-context", + encoder_id=encoder_id, + dataset_name=dataset_name, + dataset_id=dataset_id, + license="other", + language="en", + ), ) # Prepare the 🤗 transformers training arguments diff --git a/training_scripts/conll03_no_context.py b/training_scripts/conll03_no_context.py index 
--- a/training_scripts/conll03_no_context.py
+++ b/training_scripts/conll03_no_context.py
@@ -1,23 +1,34 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("conll2003")
+    dataset_id = "conll2003"
+    dataset_name = "CoNLL 2003"
+    dataset = load_dataset(dataset_id)
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "xlm-roberta-large"
+    encoder_id = "xlm-roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=128,
         marker_max_length=64,
         entity_max_length=6,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id="tomaarsen/span-marker-xlm-roberta-large-conll03",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="other",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/conllpp_context.py b/training_scripts/conllpp_context.py
index 3be5cc63..2f67a46a 100644
--- a/training_scripts/conllpp_context.py
+++ b/training_scripts/conllpp_context.py
@@ -1,24 +1,35 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens", "ner_tags", "document_id" and "sentence_id" columns,
     # and get a list of labels
-    dataset = load_dataset("tomaarsen/conllpp")
+    dataset_id = "tomaarsen/conllpp"
+    dataset_name = "CoNLL++"
+    dataset = load_dataset(dataset_id)
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "xlm-roberta-large"
+    encoder_id = "xlm-roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=512,
         marker_max_length=128,
         entity_max_length=8,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id="tomaarsen/span-marker-xlm-roberta-large-conllpp-doc-context",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="unknown",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/fewnerd_base.py b/training_scripts/fewnerd_base.py
index 7f49768d..e7582fb0 100644
--- a/training_scripts/fewnerd_base.py
+++ b/training_scripts/fewnerd_base.py
@@ -1,25 +1,36 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
+    dataset_id = "DFKI-SLT/few-nerd"
+    dataset_name = "FewNERD"
+    dataset = load_dataset(dataset_id, "supervised")
     dataset = dataset.remove_columns("ner_tags")
     dataset = dataset.rename_column("fine_ner_tags", "ner_tags")
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "bert-base-cased"
+    encoder_id = "bert-base-cased"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=256,
         marker_max_length=128,
         entity_max_length=8,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id="tomaarsen/span-marker-bert-base-fewnerd-fine-super",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="cc-by-nc-sa-4.0",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/fewnerd_large.py b/training_scripts/fewnerd_large.py
index f7d35875..307098f4 100644
--- a/training_scripts/fewnerd_large.py
+++ b/training_scripts/fewnerd_large.py
@@ -1,25 +1,36 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
+    dataset_id = "DFKI-SLT/few-nerd"
+    dataset_name = "FewNERD"
+    dataset = load_dataset(dataset_id, "supervised")
     dataset = dataset.remove_columns("ner_tags")
     dataset = dataset.rename_column("fine_ner_tags", "ner_tags")
     labels = dataset["train"].features["ner_tags"].feature.names
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "roberta-large"
+    encoder_id = "roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=256,
         marker_max_length=128,
         entity_max_length=8,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id=f"tomaarsen/span-marker-{encoder_id}-fewnerd-fine-super",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="cc-by-nc-sa-4.0",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
diff --git a/training_scripts/ontonotesv5.py b/training_scripts/ontonotesv5.py
index d9a60d68..b368a681 100644
--- a/training_scripts/ontonotesv5.py
+++ b/training_scripts/ontonotesv5.py
@@ -1,12 +1,14 @@
 from datasets import load_dataset
 from transformers import TrainingArguments
 
-from span_marker import SpanMarkerModel, Trainer
+from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer
 
 
 def main() -> None:
     # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
-    dataset = load_dataset("tner/ontonotes5")
+    dataset_id = "tner/ontonotes5"
+    dataset_name = "OntoNotes v5"
+    dataset = load_dataset(dataset_id)
     dataset = dataset.rename_column("tags", "ner_tags")
     labels = [
         "O",
@@ -49,14 +51,23 @@ def main() -> None:
     ]
 
     # Initialize a SpanMarker model using a pretrained BERT-style encoder
-    model_name = "roberta-large"
+    encoder_id = "roberta-large"
     model = SpanMarkerModel.from_pretrained(
-        model_name,
+        encoder_id,
         labels=labels,
         # SpanMarker hyperparameters:
         model_max_length=256,
         marker_max_length=128,
         entity_max_length=10,
+        # Model card arguments
+        model_card_data=SpanMarkerModelCardData(
+            model_id=f"tomaarsen/span-marker-{encoder_id}-ontonotes5",
+            encoder_id=encoder_id,
+            dataset_name=dataset_name,
+            dataset_id=dataset_id,
+            license="other",
+            language="en",
+        ),
     )
 
     # Prepare the 🤗 transformers training arguments
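
Usage note (illustrative, not part of the patch): the training scripts above always pass dataset_id to SpanMarkerModelCardData explicitly, but the new test_infer_dataset_id shows that this is optional; when no dataset_id is set, the Trainer init infers it from the training set. A minimal sketch of that behaviour, assuming the public "conll2003" dataset and a small placeholder encoder (the tests use a similar tiny model):

    from datasets import load_dataset

    from span_marker import SpanMarkerModel, SpanMarkerModelCardData, Trainer

    dataset = load_dataset("conll2003")
    labels = dataset["train"].features["ner_tags"].feature.names

    # No dataset_id is passed to SpanMarkerModelCardData; the Trainer init below
    # fills it in from train_dataset (see test_infer_dataset_id above)
    model = SpanMarkerModel.from_pretrained(
        "prajjwal1/bert-tiny",  # placeholder encoder id, not taken from this patch
        labels=labels,
        model_card_data=SpanMarkerModelCardData(language="en", license="other"),
    )
    trainer = Trainer(model=model, train_dataset=dataset["train"], eval_dataset=dataset["validation"])
    assert model.model_card_data.dataset_id == "conll2003"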
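Likewise, test_trainer_standard and test_trainer_create_model_card pin down the two behaviours this patch adds to the Trainer: trainer.evaluate() now returns per-label dicts alongside the "overall" scores, and trainer.create_model_card() writes a README.md to the output directory. A rough sketch of how that looks downstream, reusing the trainer from the sketch above (the exact per-label keys depend on the model's labels; "eval_PER" is only an example):

    metrics = trainer.evaluate()
    print(metrics["eval_overall_f1"])  # a single float, as before
    # Per-label entries such as metrics["eval_PER"] are dicts with
    # "precision", "recall", "f1" and "number" keys
    trainer.create_model_card()  # writes README.md into trainer.args.output_dir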