Hi everyone,
I’m creating a custom component for SpellChecking. We have an archive in which we will compare if the token exists and substitute the wrong word with the correct from the archive.
The component code is below:
import typing
from typing import Any, Optional, Text, Dict
import re
from collections import Counter
from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu import utils
from rasa.nlu.model import Metadata
from rasa.nlu.training_data import Message, TrainingData
if typing.TYPE_CHECKING:
from rasa.nlu.model import Metadata
class SpellChecker(Component):
"""A custom spell checker component"""
name = "spell_checker"
provides = ["entities"]
requires = ["tokens"]
defaults = {}
language_list = "pt"
print("Initialised the class")
dataset_dictionary = {}
f = ''
l = ''
lines = ''
values= []
with open(r'/home/user/anaconda3/envs/NewVersion/lib/python3.6/site-packages/rasa/nlu/checkers/formas-cetenfolha.txt', 'r', encoding='utf8') as f:
lines = f.read().split('\n')
for l in lines:
values = l.split('\t')
if len(values)== 2:
if int(values[0].strip()) >= 3:
dataset_dictionary.setdefault(values[1].strip(), int(values[0].strip()))
self.WORDS = Counter(dataset_dictionary)
def words(self, text):
return re.findall(r'\w+', text.lower())
def P(self, word, N=sum(self.WORDS.values())):
"Probability of `word`."
return self.WORDS[word] / N
def edits1(self, word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edits2(self, word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
def known(self, words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in self.WORDS)
def candidates(self, word):
"Generate possible spelling corrections for word."
return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])
def correction(self, word):
"Most probable spelling correction for word."
return max(self.candidates(word), key=P), P
##Rasa component methods
def train(self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any)-> None:
for example in training_data.training_examples:
example.set("tokens", self.correction(example))
def convert_to_rasa(self, value, confidence):
"""Convert model output into the Rasa NLU compatible output format."""
entity = {"value": value,
"confidence": confidence,
"entity": "spell",
"extractor": "spell_checker"}
return entity
def process(self, message, **kwargs):
"""Retrieve the tokens of the new message, pass it to the classifier
and append prediction results to the message class."""
if not self.clf:
# component is either not trained or didn't
# receive enough training data
entity = None
else:
correction = ''
tokens = [t.text for t in message.get("tokens")]
tb = self.preprocessing(tokens)
pred = self.clf.prob_classify(tb)
for token in tokens:
correction = self.correction(token)
corrected_word = correction[0]
corrected_prediction = correction[1]
rasa_format = self.convert_to_rasa(corrected_word, corrected_prediction)
message.set("entities", [rasa_format], add_to_output=True)
The error that I get when running rasa train is:
File "/home/user/anaconda3/envs/NewVersion/lib/python3.6/site-packages/rasa/nlu/utils/__init__.py", line 48, in ordered
return sorted(ordered(x) for x in obj)
TypeError: '<' not supported between instances of 'Token' and 'Token'
I’ve already done the registry.py
step.
Does anyone know how to overcome this? It’s urgent.