[HELP WANTED] Error in Custom Components pipeline

Hi everyone,

I’m creating a custom component for SpellChecking. We have an archive in which we will compare if the token exists and substitute the wrong word with the correct from the archive.

The component code is below:

    import typing
    from typing import Any, Optional, Text, Dict
    import re
    from collections import Counter
    from rasa.nlu.components import Component
    from rasa.nlu.config import RasaNLUModelConfig
    from rasa.nlu import utils
    from rasa.nlu.model import Metadata
    from rasa.nlu.training_data import Message, TrainingData

    if typing.TYPE_CHECKING:
        from rasa.nlu.model import Metadata

    class SpellChecker(Component):
        """A custom spell checker component"""
        name = "spell_checker"
        provides = ["entities"]
        requires = ["tokens"]
        defaults = {}
        language_list = "pt"
        print("Initialised the class")
        dataset_dictionary = {}
        f = ''
        l = ''
        lines = ''
        values= []
        with open(r'/home/user/anaconda3/envs/NewVersion/lib/python3.6/site-packages/rasa/nlu/checkers/formas-cetenfolha.txt', 'r', encoding='utf8') as f:
            lines = f.read().split('\n')
            for l in lines:
                values = l.split('\t')
                if len(values)== 2:
                    if int(values[0].strip()) >= 3:
                        dataset_dictionary.setdefault(values[1].strip(), int(values[0].strip()))

        self.WORDS = Counter(dataset_dictionary)

        def words(self, text):
            return re.findall(r'\w+', text.lower())

        def P(self, word, N=sum(self.WORDS.values())):
            "Probability of `word`."
            return self.WORDS[word] / N

        def edits1(self, word):
            "All edits that are one edit away from `word`."
            letters    = 'abcdefghijklmnopqrstuvwxyz'
            splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
            deletes    = [L + R[1:]               for L, R in splits if R]
            transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
            replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
            inserts    = [L + c + R               for L, R in splits for c in letters]
            return set(deletes + transposes + replaces + inserts)

        def edits2(self, word):
            "All edits that are two edits away from `word`."
            return (e2 for e1 in edits1(word) for e2 in edits1(e1))

        def known(self, words):
            "The subset of `words` that appear in the dictionary of WORDS."
            return set(w for w in words if w in self.WORDS)

        def candidates(self, word):
            "Generate possible spelling corrections for word."
            return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])

        def correction(self, word):
            "Most probable spelling correction for word."
            return max(self.candidates(word), key=P), P    

        ##Rasa component methods

        def train(self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any)-> None:        
            for example in training_data.training_examples:
                example.set("tokens", self.correction(example))

        def convert_to_rasa(self, value, confidence):
            """Convert model output into the Rasa NLU compatible output format."""

            entity = {"value": value,
                      "confidence": confidence,
                      "entity": "spell",
                      "extractor": "spell_checker"}

            return entity

        def process(self, message, **kwargs):
            """Retrieve the tokens of the new message, pass it to the classifier
                and append prediction results to the message class."""
            if not self.clf:
                # component is either not trained or didn't
                # receive enough training data
                entity = None 
                correction = ''
                tokens = [t.text for t in message.get("tokens")]
                tb = self.preprocessing(tokens)
                pred = self.clf.prob_classify(tb)

                for token in tokens:
                    correction = self.correction(token)
                    corrected_word = correction[0]
                    corrected_prediction = correction[1]

                rasa_format = self.convert_to_rasa(corrected_word, corrected_prediction)

                message.set("entities", [rasa_format], add_to_output=True)

The error that I get when running rasa train is:

  File "/home/user/anaconda3/envs/NewVersion/lib/python3.6/site-packages/rasa/nlu/utils/__init__.py", line 48, in ordered
    return sorted(ordered(x) for x in obj)
TypeError: '<' not supported between instances of 'Token' and 'Token'

I’ve already done the registry.py step.

Does anyone know how to overcome this? It’s urgent.

1 Like

@miohana In your train method, you are passing example to self.correction() method. example is of type Message and I think your functions expect it to be a string. The Message object will contain a property tokens which can be used to get the string for each token. I see you have done that in process method. I would say, replicate your process method for spell correction to your train function too.