Hey, sorry, I stopped working on Rasa a while ago, but here is a quick example (I can’t give you the full code):
from symspellpy.symspellpy import SymSpell
# Absolute path of the directory that holds this module.
ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
# Directory in which the spell-check dictionary files are looked up.
DICTIONARIES_PATH = os.path.join(ROOT_DIR, 'dictionaries/')
class SpellCheck(Component):
    """Spell-check the user message before the rest of the pipeline runs.

    The component is meant to be placed at the start of the pipeline.  It
    rewrites ``message.text`` in place using SymSpell's compound lookup.
    Numbers are cut out of the text before the lookup and put back
    afterwards so that they survive the correction.
    """

    name = "SpellChecking"

    # Defines what language(s) this component can handle.
    # Kept as a list so that a membership test matches whole language codes
    # rather than substrings of a single string.
    language_list = ["fr"]

    # Defines the default configuration parameters of a component
    # these values can be overwritten in the pipeline configuration
    # of the model.
    defaults = {
        "initial_capacity": 83000,
        # maximum edit distance per dictionary precalculation
        # (max_edit_distance_lookup <= max_edit_distance_dictionary)
        "max_edit_distance_dictionary": 2,
        "prefix_length": 7,
        # max edit distance per lookup
        # (per single word, not per whole input string)
        "max_edit_distance_lookup": 2,
        "dictionary": "fr_dictionary.txt",
    }

    # Matches decimals ("3.5", ",75") and plain integers.  The capturing
    # group makes ``split`` return the matched numbers between the text
    # segments instead of discarding them.
    _NUMBER_PATTERN = re.compile(r"(\d*[.,]\d+|\d+)")

    def __init__(self, component_config=None):
        """Create the SymSpell instance and load the configured dictionary.

        :param component_config: optional configuration passed on to the
            base ``Component``; the keys read here are the ones listed in
            ``defaults``.
        """
        super(SpellCheck, self).__init__(component_config)
        # NOTE(review): this configures the root logger from inside a
        # component constructor; kept for backward compatibility, but the
        # application entry point is a better place for it.
        logging.basicConfig(level='DEBUG')
        self.sym_spell = SymSpell(
            self.component_config["initial_capacity"],
            self.component_config["max_edit_distance_dictionary"],
            self.component_config["prefix_length"])
        self.load_sym_spell(self.component_config["dictionary"])

    def process(self, message, **kwargs):
        """Replace ``message.text`` with its spell-checked version.

        Does nothing when the dictionary could not be loaded or when the
        message has no text.

        :param message: object exposing a mutable ``text`` attribute.
        :param kwargs: ignored.
        :return: ``None``; the message is modified in place.
        """
        # For the moment, if the dictionary is not loaded, we skip the
        # spell checking
        if self.sym_spell is None:
            logger.info("Skip spell check because dictionary failed to load")
            return

        text = message.text
        if not text:
            return

        max_distance = self.component_config["max_edit_distance_lookup"]

        # Splitting on a capturing pattern yields alternating pieces:
        # even indices are free text, odd indices are the matched numbers.
        pieces = self._NUMBER_PATTERN.split(text)
        corrected = []
        for index, piece in enumerate(pieces):
            if index % 2:
                # Numbers are passed through untouched.
                corrected.append(piece)
            elif piece.strip():
                suggestions = self.sym_spell.lookup_compound(piece,
                                                             max_distance)
                # Fall back to the raw text if no suggestion comes back.
                corrected.append(suggestions[0].term if suggestions
                                 else piece)

        # Pieces are joined with a space because the corrected terms are not
        # guaranteed to keep the whitespace that surrounded the numbers; the
        # split/join then collapses any duplicated whitespace.
        correction = " ".join(" ".join(corrected).split())
        logger.info("Correction from %s to %s", message.text, correction)
        message.text = correction

    def load_sym_spell(self, dictionary):
        """Load a frequency dictionary into ``self.sym_spell``.

        On failure an error is logged and ``self.sym_spell`` is set to
        ``None``, which makes ``process`` skip the spell checking.

        :param dictionary: file name of the dictionary, resolved relative
            to ``DICTIONARIES_PATH``.
        """
        dictionary_path = os.path.join(DICTIONARIES_PATH, dictionary)
        # column of the term in the dictionary text file
        term_index = 0
        # column of the term frequency in the dictionary text file
        count_index = 1
        loaded = self.sym_spell.load_dictionary(dictionary_path,
                                                term_index, count_index)
        if not loaded:
            logger.error("Unable to load spell dictionary")
            self.sym_spell = None
So as you can see, I used a library named “symspellpy” (GitHub - mammothb/symspellpy: Python port of SymSpell).
You just have to import the dictionary (which is a list of words with their frequencies, for example “a 155105”); it is easy to find for English, harder for other languages.
Sorry, I do not understand all the params, and the corrector is not perfect, but it does its job. There is a small part of the code where I split the message on numbers, because the corrector deletes them.
Once you have created your component, you just have to add it to your pipeline like this:
language: "fr"
pipeline:
- name: "spell_check_component.SpellCheck"
- name: "nlp_spacy"
- name: "tokenizer_whitespace"
- name: "ner_crf"
- name: "ner_synonyms"
- name: "intent_featurizer_count_vectors"
- name: "intent_classifier_tensorflow_embedding"
  "epochs": 3000
(The component SpellCheck is in the spell_check_component folder)
Hope it helps !