Hello,
I have been trying to implement a custom preprocessor that replaces certain words in the received message using fuzzy matching. I tried importing a function that I wrote, but when I set message_preprocessor to that function, the attribute does not change and stays None.
Here is the code for the fuzzy matching function:
from fuzzywuzzy import process
FUZZY_SEARCH_DATA = ['krzysztof', 'laura', 'agnieszka', 'milena']
def fuzzy_match_preprocessor(message):
    """Replace each word of *message* with its closest known name, if any.

    The message is split on whitespace and every word is fuzzy-matched
    against ``FUZZY_SEARCH_DATA``. A word whose best match scores at least
    65 is replaced by that match (lower-cased); every other word is kept
    unchanged. The words are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Args:
        message: The raw message text.

    Returns:
        The message text with fuzzy-matched words substituted.
    """
    new_words = []
    for word in message.split():
        # extractOne yields None when no candidate reaches score_cutoff.
        best_match = process.extractOne(
            word, FUZZY_SEARCH_DATA, score_cutoff=65
        )
        if best_match is not None:
            new_words.append(best_match[0].lower())
        else:
            new_words.append(word)
    return ' '.join(new_words)
I import it from a file called fuzzy_matcher.py with this line:
from rasa.core.fuzzy_matcher import fuzzy_match_preprocessor
And this is how it is assigned to the message_preprocessor:
In case anyone is interested in the component, I'm posting the code here. This solution is optimised for fuzzy matching full names or pairs of words. You can change this behavior by changing the ngram_range.
from rasa.nlu.components import Component
import typing
from typing import Any, Optional, Text, Dict
if typing.TYPE_CHECKING:
from rasa.nlu.model import Metadata
class SpellChecker(Component):
    """Component that corrects a misspelled full name inside a message.

    The message text is cut into word n-grams (pairs by default), each
    n-gram is fuzzy-matched against a fixed list of full names, and the
    single best-scoring n-gram is replaced by the name it matched, provided
    the score reaches the cutoff threshold.
    """

    provides = ['text']
    requires = []
    defaults = {}
    language_list = None

    def __init__(self, component_config=None):
        super().__init__(component_config)

    def train(self, training_data, cfg, **kwargs):
        """Do nothing: the matcher works from a fixed name list."""
        pass

    def process(self, message, **kwargs):
        """Rewrite ``message.text`` with the best fuzzy-matched full name.

        Only the highest-scoring n-gram is corrected, and only when its
        score is at least ``cutoff_threshold``. Messages with no text, or
        with fewer than ``ngram_range`` words, are left untouched.

        Args:
            message: Object exposing a mutable ``text`` attribute.
            **kwargs: Ignored.
        """
        import re

        # Aliased so the import does not shadow this method's own name.
        from fuzzywuzzy import process as fuzzy_process

        FULLNAMES = ['john kowalski', 'andy michaels']
        ngram_range = 2
        cutoff_threshold = 75

        text = message.text
        if not text:
            return

        words = text.split()
        ngrams = zip(*[words[i:] for i in range(ngram_range)])

        best_ngram = None
        best_name = None
        best_score = -1
        for ngram in ngrams:
            # extractOne yields None when nothing reaches score_cutoff.
            match = fuzzy_process.extractOne(
                ' '.join(ngram), FULLNAMES, score_cutoff=cutoff_threshold
            )
            # Strict '>' keeps the first n-gram when scores tie.
            if match is not None and match[1] > best_score:
                best_ngram, best_name, best_score = ngram, match[0], match[1]

        if best_ngram is None:
            return

        # split() collapsed the whitespace between the words, so search for
        # the words separated by *any* whitespace run; a plain str.replace
        # on the single-space n-gram misses "jon  kowalsky" (two spaces).
        pattern = r'\s+'.join(re.escape(word) for word in best_ngram)
        # A callable replacement keeps backslashes in the name literal.
        message.text = re.sub(pattern, lambda _match: best_name, text)

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist this component to disk for future loading.

        Nothing is written and ``None`` is returned.
        """
        pass

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional["Component"] = None,
        **kwargs: Any
    ) -> "Component":
        """Load this component from file.

        Returns ``cached_component`` when one is supplied; otherwise builds
        a new instance from ``meta``.
        """
        if cached_component:
            return cached_component
        return cls(meta)