I'm on Python 3.9.6 with Rasa 3.6.16. Whenever I run rasa train nlu with the custom tokenizer below, training fails with:

rasa.engine.exceptions.GraphSchemaValidationException: Your model uses a component 'SudachiTokenizer' whose method 'train' does not return a fingerprintable output. This is required for proper caching between model trainings. Please make sure you're using a return type which implements the 'Fingerprintable' protocol.

Here is the code (after it I've sketched what I think the API expects):
import os
import re
import typing
from typing import Any, Dict, List, Optional, Text, Type

import nltk

from rasa.engine.graph import GraphComponent
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.nlu import utils
#from rasa.nlu.model import Metadata
#from rasa.nlu.config import RasaNLUConfig
from rasa.nlu.constants import TOKENS_NAMES, MESSAGE_ATTRIBUTES
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData
@DefaultV1Recipe.register(
    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=True
)
class SudachiTokenizer(Tokenizer):
    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]

    defaults = {
        "intent_tokenization_flag": False,
        "intent_split_symbol": "_",
    }
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        from sudachipy import dictionary
        from sudachipy import tokenizer

        self.tokenizer_obj = dictionary.Dictionary().create()
        self.mode = tokenizer.Tokenizer.SplitMode.A

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["sudachipy"]
    def train(self, training_data: TrainingData, **kwargs) -> Dict[Text, Any]:
        # This is the method the exception complains about.
        if not isinstance(training_data, TrainingData):
            raise ValueError("Expected training_data to be of type TrainingData.")
        messages = training_data.training_examples
        for message in messages:
            text = message.get("text")
            # Tokenization logic
        # Return a simple, fingerprintable output (the error still occurs with this)
        return {"trained": True, "version": 1, "component_name": "SudachiTokenizer"}
    #def fingerprintable(self) -> Dict[Text, Any]:
    #    return {"provides": self.provides, "defaults": self.defaults}
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)
        words = [m.surface() for m in self.tokenizer_obj.tokenize(text, self.mode)]
        return self._convert_words_to_tokens(words, text)
    def _convert_words_to_tokens(self, words: List[str], text: str) -> List[Token]:
        # Recover each word's character offset by scanning the original text.
        tokens = []
        start = 0
        for word in words:
            start = text.find(word, start)
            end = start + len(word)
            tokens.append(Token(word, start))
            start = end
        return tokens
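From what I can tell, the schema validation runs before training starts, so it must be going off the return type annotation of train rather than the value actually returned: Dict[Text, Any] doesn't satisfy the Fingerprintable protocol, which is why returning a plain dict didn't help. The custom-component docs have trainable components return a Resource instead. Below is a minimal sketch of the shape I think is expected; the class name and the model_storage/resource plumbing are my reading of the GraphComponent docs, not verified code:

from typing import Any, Dict, Text

from rasa.engine.graph import ExecutionContext, GraphComponent
from rasa.engine.storage.model_storage import ModelStorage
from rasa.engine.storage.resource import Resource
from rasa.shared.nlu.training_data.training_data import TrainingData


class TrainableSketch(GraphComponent):
    """Hypothetical component, for illustration only."""

    def __init__(
        self,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
    ) -> None:
        self._config = config
        self._model_storage = model_storage
        self._resource = resource

    @classmethod
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> "TrainableSketch":
        # Keep the Resource the graph hands in so train() can return it.
        return cls(config, model_storage, resource)

    def train(self, training_data: TrainingData) -> Resource:
        # Resource implements the Fingerprintable protocol, so this
        # annotation passes the schema validation that Dict[Text, Any] fails.
        return self._resource

Alternatively, since the tokenizer doesn't actually learn anything from the training data, I could register it the way the built-in WhitespaceTokenizer is registered and drop train entirely:

@DefaultV1Recipe.register(
    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False
)
class SudachiTokenizer(Tokenizer):
    ...

Is returning a Resource the right fix here, or is is_trainable=False the intended approach for a tokenizer?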