Hi everyone,
I hope you are staying safe wherever you are.
Recently I wrote a custom component in order to use a Huggingface NER model for Swedish language as Entity Extractor in Rasa. It works well, however, when I use Rasa X, the PERSON entity does not show in the front-end, even though I can still see the entities printed out in my Terminal. The other entities still show up.
This is what Rasa X shows:
This is what’s in Terminal:
Below I’m attaching my script for the component. As you can see, I already mentioned all the entity in dimensions
. If you know where I might make mistake, please help me pointing it out.
import typing
from typing import Any, Dict, List, Text, Optional, Type
from transformers import pipeline
from rasa.nlu.constants import ENTITIES
from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.training_data import Message, TrainingData
from rasa.nlu.extractors.extractor import EntityExtractor
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
if typing.TYPE_CHECKING:
from rasa.nlu.model import Metadata
nlp = pipeline('ner', model='KB/bert-base-swedish-cased-ner', tokenizer='KB/bert-base-swedish-cased-ner')
class BertEntityExtractor(EntityExtractor):
@classmethod
def required_components(cls) -> List[Type[Component]]:
return [HFTransformersNLP]
defaults = {
# by default all dimensions recognized by spacy are returned
# dimensions can be configured to contain an array of strings
# with the names of the dimensions to filter for
"dimensions": ["PER", "LOC", "ORG", "OBJ", "EVN", "TME"]
}
def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
super().__init__(component_config)
def process(self, message: Message, **kwargs: Any) -> None:
# can't use the existing doc here (spacy_doc on the message)
# because tokens are lower cased which is bad for NER
#spacy_nlp = kwargs.get("spacy_nlp", None)
#doc = spacy_nlp(message.text)
doc = nlp(message.text)
all_extracted = self.add_extractor_name(self.extract_entities(doc))
dimensions = self.component_config["dimensions"]
extracted = BertEntityExtractor.filter_irrelevant_entities(
all_extracted, dimensions
)
message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
@staticmethod
def extract_entities(doc: "Doc") -> List[Dict[Text, Any]]:
l = []
for token in doc:
if token['word'].startswith('##'):
l[-1]['word'] += token['word'][2:]
else:
l += [ token ]
print(l)
entities = [
{
"entity": token['entity'],
"value": token['word'],
"confidence": token['score'],
}
for token in l
]
return entities