From ce22087f2525f826850d222445cec4fd5e476ff0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 19 Sep 2023 12:03:22 +0200 Subject: [PATCH 1/2] Remove link to now-removed space --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 9abc3fd3..90b8f3b6 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,6 @@ These models are further elaborated on in my [thesis](https://raw.githubusercont ### FewNERD * [`tomaarsen/span-marker-bert-base-fewnerd-fine-super`](https://huggingface.co/tomaarsen/span-marker-bert-base-fewnerd-fine-super) is a model that I have trained in 2 hours on the finegrained, supervised [Few-NERD dataset](https://huggingface.co/datasets/DFKI-SLT/few-nerd). It reached a 70.53 Test F1, competitive in the all-time [Few-NERD leaderboard](https://paperswithcode.com/sota/named-entity-recognition-on-few-nerd-sup) using `bert-base`. My training script resembles the one that you can see above. - * Try the model out online using this [🤗 Space](https://tomaarsen-span-marker-bert-base-fewnerd-fine-super.hf.space/). * [`tomaarsen/span-marker-roberta-large-fewnerd-fine-super`](https://huggingface.co/tomaarsen/span-marker-roberta-large-fewnerd-fine-super) was trained in 6 hours on the finegrained, supervised [Few-NERD dataset](https://huggingface.co/datasets/DFKI-SLT/few-nerd) using `roberta-large`. It reached a 71.03 Test F1, reaching a new state of the art in the all-time [Few-NERD leaderboard](https://paperswithcode.com/sota/named-entity-recognition-on-few-nerd-sup). * [`tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super`](https://huggingface.co/tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super) is a multilingual model that I have trained in 1.5 hours on the finegrained, supervised [Few-NERD dataset](https://huggingface.co/datasets/DFKI-SLT/few-nerd). It reached a 68.6 Test F1 on English, and works well on other languages like Spanish, French, German, Russian, Dutch, Polish, Icelandic, Greek and many more. From 506c25b64eb2a8796fa6ff0a9621a9838f44d690 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 29 Sep 2023 18:59:06 +0200 Subject: [PATCH 2/2] Add pipeline support for SpanMarker (#34) * Add pipeline support for SpanMarker * Add general-purpose default model * Mention that pipeline is from transformers * Add to docs --- CHANGELOG.md | 6 ++++ docs/api/span_marker.pipeline_component.rst | 14 ++++++++ docs/api/span_marker.rst | 1 + span_marker/__init__.py | 9 +++++ span_marker/pipeline_component.py | 38 +++++++++++++++++++++ tests/test_pipeline.py | 11 ++++++ 6 files changed, 79 insertions(+) create mode 100644 docs/api/span_marker.pipeline_component.rst create mode 100644 span_marker/pipeline_component.py create mode 100644 tests/test_pipeline.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0532f5cf..67bcef1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,12 @@ Types of changes * "Security" in case of vulnerabilities. --> +## [Unreleased] + +### Added + +- Added `transformers` `pipeline` support, e.g. `pipeline(task="span-marker", model="tomaarsen/span-marker-mbert-base-multinerd")`. + ## [1.3.0] ### Added diff --git a/docs/api/span_marker.pipeline_component.rst b/docs/api/span_marker.pipeline_component.rst new file mode 100644 index 00000000..8dc3a0cc --- /dev/null +++ b/docs/api/span_marker.pipeline_component.rst @@ -0,0 +1,14 @@ + +:autogenerated: + +.. + This file is autogenerated by `sphinx-api`. + +span_marker.pipeline_component module +===================================== + +.. currentmodule:: span_marker.pipeline_component + +.. automodule:: span_marker.pipeline_component + :members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/span_marker.rst b/docs/api/span_marker.rst index b20446b7..a2650031 100644 --- a/docs/api/span_marker.rst +++ b/docs/api/span_marker.rst @@ -19,6 +19,7 @@ span_marker package span_marker.modeling span_marker.trainer span_marker.configuration + span_marker.pipeline_component span_marker.data_collator span_marker.tokenizer span_marker.evaluation diff --git a/span_marker/__init__.py b/span_marker/__init__.py index 2c2cad1d..0586fd92 100644 --- a/span_marker/__init__.py +++ b/span_marker/__init__.py @@ -5,14 +5,23 @@ import torch from transformers import AutoConfig, AutoModel, TrainingArguments +from transformers.pipelines import PIPELINE_REGISTRY, pipeline from span_marker.configuration import SpanMarkerConfig from span_marker.modeling import SpanMarkerModel +from span_marker.pipeline_component import SpanMarkerPipeline from span_marker.trainer import Trainer # Set up for Transformers AutoConfig.register("span-marker", SpanMarkerConfig) AutoModel.register(SpanMarkerConfig, SpanMarkerModel) +PIPELINE_REGISTRY.register_pipeline( + "span-marker", + pipeline_class=SpanMarkerPipeline, + pt_model=SpanMarkerModel, + type="text", + default={"pt": ("tomaarsen/span-marker-bert-base-fewnerd-fine-super", "main")}, +) # Set up for spaCy try: diff --git a/span_marker/pipeline_component.py b/span_marker/pipeline_component.py new file mode 100644 index 00000000..be80a734 --- /dev/null +++ b/span_marker/pipeline_component.py @@ -0,0 +1,38 @@ +from typing import Any, Dict, List, Tuple, Union + +from transformers import Pipeline + +INPUT_TYPES = Union[str, List[str], List[List[str]]] +OUTPUT_TYPES = Union[List[Dict[str, Union[str, int, float]]], List[List[Dict[str, Union[str, int, float]]]]] + + +class SpanMarkerPipeline(Pipeline): + """A Pipeline component for SpanMarker. + + The `pipeline` function is :func:`~transformers.pipeline`, which you can also import with + ``from transformers import pipeline``, but you must also import ``span_marker`` to register the + ``"span-marker"`` pipeline task. + + Example:: + + >>> from span_marker import pipeline + >>> pipe = pipeline(task="span-marker", model="tomaarsen/span-marker-mbert-base-multinerd", device_map="auto") + >>> pipe("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.") + [{'span': 'Amelia Earhart', 'label': 'PER', 'score': 0.9999709129333496, 'char_start_index': 0, 'char_end_index': 14}, + {'span': 'Lockheed Vega 5B', 'label': 'VEHI', 'score': 0.9050095677375793, 'char_start_index': 38, 'char_end_index': 54}, + {'span': 'Atlantic', 'label': 'LOC', 'score': 0.9991973042488098, 'char_start_index': 66, 'char_end_index': 74}, + {'span': 'Paris', 'label': 'LOC', 'score': 0.9999232292175293, 'char_start_index': 78, 'char_end_index': 83}] + + """ + + def _sanitize_parameters(self, **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]: + return {}, {}, {} + + def preprocess(self, inputs: INPUT_TYPES) -> INPUT_TYPES: + return inputs + + def _forward(self, inputs: INPUT_TYPES) -> OUTPUT_TYPES: + return self.model.predict(inputs) + + def postprocess(self, outputs: OUTPUT_TYPES) -> OUTPUT_TYPES: + return outputs diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 00000000..c85a9d02 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,11 @@ +from transformers import pipeline + +import span_marker + + +def test_pipeline() -> None: + pipe = pipeline(task="span-marker", model="tomaarsen/span-marker-bert-tiny-fewnerd-coarse-super") + outputs = pipe("Tom lives in the Netherlands.") + assert len(outputs) == 2 + assert outputs[0]["span"] == "Tom" + assert outputs[1]["span"] == "Netherlands"