Skip to content

Commit

Permalink
Merge branch 'main' into feat/improved_model_cards
Browse files Browse the repository at this point in the history
  • Loading branch information
tomaarsen committed Sep 29, 2023
2 parents 7d1fa8b + 506c25b commit 9321361
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Types of changes
- Added `SpanMarkerModel.generate_model_card()` method to get a model card string.
- Added `SpanMarkerModelCardData` that should be passed to `SpanMarkerModel.from_pretrained` with additional information like
- `language`, `license`, `model_name`, `model_id`, `encoder_name`, `encoder_id`, `dataset_name`, `dataset_id`, `dataset_revision`.
- Added `transformers` `pipeline` support, e.g. `pipeline(task="span-marker", model="tomaarsen/span-marker-mbert-base-multinerd")`.

### Changed

Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ These models are further elaborated on in my [thesis](https://raw.githubusercont

### FewNERD
* [`tomaarsen/span-marker-bert-base-fewnerd-fine-super`](https://huggingface.co/tomaarsen/span-marker-bert-base-fewnerd-fine-super) is a model that I have trained in 2 hours on the finegrained, supervised [Few-NERD dataset](https://huggingface.co/datasets/DFKI-SLT/few-nerd). It reached a 70.53 Test F1, competitive in the all-time [Few-NERD leaderboard](https://paperswithcode.com/sota/named-entity-recognition-on-few-nerd-sup) using `bert-base`. My training script resembles the one that you can see above.
* Try the model out online using this [🤗 Space](https://tomaarsen-span-marker-bert-base-fewnerd-fine-super.hf.space/).

* [`tomaarsen/span-marker-roberta-large-fewnerd-fine-super`](https://huggingface.co/tomaarsen/span-marker-roberta-large-fewnerd-fine-super) was trained in 6 hours on the finegrained, supervised [Few-NERD dataset](https://huggingface.co/datasets/DFKI-SLT/few-nerd) using `roberta-large`. It reached a 71.03 Test F1, reaching a new state of the art in the all-time [Few-NERD leaderboard](https://paperswithcode.com/sota/named-entity-recognition-on-few-nerd-sup).
* [`tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super`](https://huggingface.co/tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super) is a multilingual model that I have trained in 1.5 hours on the finegrained, supervised [Few-NERD dataset](https://huggingface.co/datasets/DFKI-SLT/few-nerd). It reached a 68.6 Test F1 on English, and works well on other languages like Spanish, French, German, Russian, Dutch, Polish, Icelandic, Greek and many more.
Expand Down
14 changes: 14 additions & 0 deletions docs/api/span_marker.pipeline_component.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

:autogenerated:

..
    This file is autogenerated by `sphinx-api`.

span_marker.pipeline_component module
=====================================

.. currentmodule:: span_marker.pipeline_component

.. automodule:: span_marker.pipeline_component
    :members:
    :show-inheritance:
1 change: 1 addition & 0 deletions docs/api/span_marker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ span_marker package
span_marker.trainer
span_marker.configuration
span_marker.model_card
span_marker.pipeline_component
span_marker.data_collator
span_marker.tokenizer
span_marker.evaluation
Expand Down
9 changes: 9 additions & 0 deletions span_marker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,24 @@

import torch
from transformers import AutoConfig, AutoModel, TrainingArguments
from transformers.pipelines import PIPELINE_REGISTRY, pipeline

from span_marker.configuration import SpanMarkerConfig
from span_marker.model_card import SpanMarkerModelCardData
from span_marker.modeling import SpanMarkerModel
from span_marker.pipeline_component import SpanMarkerPipeline
from span_marker.trainer import Trainer

# Set up for Transformers
# Associate the "span-marker" model type string with SpanMarkerConfig in AutoConfig.
AutoConfig.register("span-marker", SpanMarkerConfig)
# Associate SpanMarkerConfig with SpanMarkerModel in AutoModel.
AutoModel.register(SpanMarkerConfig, SpanMarkerModel)
# Register the "span-marker" task in the Transformers pipeline registry, backed by
# SpanMarkerPipeline, so that `pipeline(task="span-marker", ...)` can be used once
# this package has been imported.
PIPELINE_REGISTRY.register_pipeline(
"span-marker",
pipeline_class=SpanMarkerPipeline,
pt_model=SpanMarkerModel,
type="text",
# (checkpoint, revision) pair for the PyTorch backend; presumably the model that is
# loaded when no `model` is passed to `pipeline` -- confirm against the Transformers docs.
default={"pt": ("tomaarsen/span-marker-bert-base-fewnerd-fine-super", "main")},
)

# Set up for spaCy
try:
Expand Down
38 changes: 38 additions & 0 deletions span_marker/pipeline_component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Any, Dict, List, Tuple, Union

from transformers import Pipeline

# Accepted pipeline inputs: a single string, a list of strings, or a list of lists of strings.
# NOTE(review): presumably these are a sentence, a batch of sentences (or one pre-tokenized
# sentence), and a batch of pre-tokenized sentences -- confirm against `SpanMarkerModel.predict`.
INPUT_TYPES = Union[str, List[str], List[List[str]]]
# Pipeline outputs: a list of dictionaries with string keys and str/int/float values,
# or a list of such lists.
OUTPUT_TYPES = Union[List[Dict[str, Union[str, int, float]]], List[List[Dict[str, Union[str, int, float]]]]]


class SpanMarkerPipeline(Pipeline):
    """Thin :class:`~transformers.Pipeline` wrapper around a SpanMarker model.

    Every stage except the forward pass is a pass-through: the raw inputs are handed
    unchanged to the model's ``predict`` method and its return value is returned as-is.

    The `pipeline` function is :func:`~transformers.pipeline`, which you can also import with
    ``from transformers import pipeline``, but you must also import ``span_marker`` to register the
    ``"span-marker"`` pipeline task.

    Example::

        >>> from span_marker import pipeline
        >>> pipe = pipeline(task="span-marker", model="tomaarsen/span-marker-mbert-base-multinerd", device_map="auto")
        >>> pipe("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.")
        [{'span': 'Amelia Earhart', 'label': 'PER', 'score': 0.9999709129333496, 'char_start_index': 0, 'char_end_index': 14},
         {'span': 'Lockheed Vega 5B', 'label': 'VEHI', 'score': 0.9050095677375793, 'char_start_index': 38, 'char_end_index': 54},
         {'span': 'Atlantic', 'label': 'LOC', 'score': 0.9991973042488098, 'char_start_index': 66, 'char_end_index': 74},
         {'span': 'Paris', 'label': 'LOC', 'score': 0.9999232292175293, 'char_start_index': 78, 'char_end_index': 83}]
    """

    def _sanitize_parameters(self, **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
        """Return the keyword arguments for the three pipeline stages.

        All supplied ``kwargs`` are ignored; each stage receives an empty dictionary.

        Returns:
            A 3-tuple of empty dictionaries, in the order (preprocess, forward, postprocess).
        """
        preprocess_params: Dict[str, Any] = {}
        forward_params: Dict[str, Any] = {}
        postprocess_params: Dict[str, Any] = {}
        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, inputs: INPUT_TYPES) -> INPUT_TYPES:
        """Return ``inputs`` untouched; no preprocessing is done at this stage."""
        return inputs

    def _forward(self, inputs: INPUT_TYPES) -> OUTPUT_TYPES:
        """Run ``self.model.predict`` on ``inputs`` and return its result."""
        predictions = self.model.predict(inputs)
        return predictions

    def postprocess(self, outputs: OUTPUT_TYPES) -> OUTPUT_TYPES:
        """Return ``outputs`` untouched; no postprocessing is done at this stage."""
        return outputs
11 changes: 11 additions & 0 deletions tests/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from transformers import pipeline

import span_marker


def test_pipeline() -> None:
    """Check that the ``"span-marker"`` pipeline task finds both entities in a short sentence."""
    ner = pipeline(task="span-marker", model="tomaarsen/span-marker-bert-tiny-fewnerd-coarse-super")
    entities = ner("Tom lives in the Netherlands.")
    # Exactly two entities are expected, in order of appearance in the sentence.
    assert len(entities) == 2
    first_entity, second_entity = entities
    assert first_entity["span"] == "Tom"
    assert second_entity["span"] == "Netherlands"

0 comments on commit 9321361

Please sign in to comment.