Hello , can you please help me I tried to create a new tokenizer in Rasa v3 but it’s not working I don’t know what went wrong . I followed the migration instructions but still it gives me this error when I start training , the tokenizer runs before the countvectorsfeaturizer:
The following components require a Tokenizer: run_CountVectorsFeaturizer1, train_CountVectorsFeaturizer1.
Here is the component code :
from __future__ import annotations
from typing import Any, Dict, List, Optional, Text
import regex
import rasa.shared.utils.io
import rasa.utils.io
from itertools import *
import jellyfish
from rasa.engine.graph import ExecutionContext
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.constants import DOCS_URL_COMPONENTS
from rasa.shared.nlu.training_data.message import Message
from custom_func.clean_data_fr import clean_data_fr
from custom_func.clean_data_fr_training import clean_data_fr_training
import abc
import logging
import re
from typing import Text, List, Dict, Any, Optional
from rasa.engine.graph import ExecutionContext, GraphComponent
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.message import Message
from custom_func.generateur_oncf_fr import generateur_oncf_fr
from rasa.nlu.constants import TOKENS_NAMES, MESSAGE_ATTRIBUTES
from rasa.shared.nlu.constants import (
INTENT,
INTENT_RESPONSE_KEY,
RESPONSE_IDENTIFIER_DELIMITER,
ACTION_NAME,
)
import rasa.shared.utils.io
logger = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
@DefaultV1Recipe.register(
DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False
)
class WhitespaceTokenizer_ar(Tokenizer, GraphComponent):
@staticmethod
def required_packages() -> List[Text]:
"""Any extra python dependencies required for this component to run."""
return []
@staticmethod
def get_default_config() -> Dict[Text, Any]:
"""Returns the component's default config."""
return {
**Tokenizer.get_default_config(),
# specifies the language of the subword segmentation model
"lang": None,
# specifies the dimension of the subword embeddings
"dim": None,
# specifies the vocabulary size of the segmentation model
"vs": None,
# if set to True and the given vocabulary size can't be loaded for the given
# model, the closest size is chosen
"vs_fallback": True,
}
def __init__(self, config: Dict[Text, Any]) -> None:
"""Initialize the tokenizer."""
super().__init__(config)
self.emoji_pattern = rasa.utils.io.get_emoji_regex()
if "case_sensitive" in self._config:
rasa.shared.utils.io.raise_warning(
"The option 'case_sensitive' was moved from the tokenizers to the "
"featurizers.",
docs=DOCS_URL_COMPONENTS,
)
@classmethod
def create(
cls,
config: Dict[Text, Any],
model_storage: ModelStorage,
resource: Resource,
execution_context: ExecutionContext,
) -> GraphComponent:
"""Creates a new component (see parent class for full docstring)."""
return cls(config, execution_context.node_name)
def process(self, messages: List[Message]) -> List[Message]:
"""Processes incoming messages and computes and sets features."""
print(messages)
return messages
def process_training_data(self, training_data: TrainingData) -> TrainingData:
"""Processes the training examples in the given training data in-place."""
self.process(training_data.training_examples)
return training_data