Custom component in Rasa version 3

Hello, can you please help me? I tried to create a new tokenizer in Rasa v3, but it is not working and I don't know what went wrong. I followed the migration instructions, but I still get this error when I start training (the tokenizer is supposed to run before the CountVectorsFeaturizer):

The following components require a Tokenizer: run_CountVectorsFeaturizer1, train_CountVectorsFeaturizer1.

Here is the component code:

from __future__ import annotations

import abc
import logging
import re

# NOTE(review): star import kept — code below (or later in the file) may rely
# on itertools names being in the module namespace.
from itertools import *
from typing import Any, Dict, List, Optional, Text

import jellyfish
import regex

from rasa.engine.graph import ExecutionContext, GraphComponent

# The pasted file had bare "from import ..." lines; these are the canonical
# Rasa 3 module paths for DefaultV1Recipe, Resource, and ModelStorage.
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.constants import MESSAGE_ATTRIBUTES, TOKENS_NAMES
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.constants import DOCS_URL_COMPONENTS

# NOTE(review): the original "from rasa.shared.nlu.constants import (" was cut
# off mid-statement; TEXT is what a tokenizer normally needs — confirm against
# the full original file.
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData

from custom_func.clean_data_fr import clean_data_fr
from custom_func.clean_data_fr_training import clean_data_fr_training
from custom_func.generateur_oncf_fr import generateur_oncf_fr

# Module-level logger; the original assigned it twice — once is enough.
logger = logging.getLogger(__name__)


    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False


class WhitespaceTokenizer_ar(Tokenizer, GraphComponent):



    def required_packages() -> List[Text]:

        """Any extra python dependencies required for this component to run."""

        return []


    def get_default_config() -> Dict[Text, Any]:

        """Returns the component's default config."""

        return {


            # specifies the language of the subword segmentation model

            "lang": None,

            # specifies the dimension of the subword embeddings

            "dim": None,

            # specifies the vocabulary size of the segmentation model

            "vs": None,

            # if set to True and the given vocabulary size can't be loaded for the given

            # model, the closest size is chosen

            "vs_fallback": True,


    def __init__(self, config: Dict[Text, Any]) -> None:

        """Initialize the tokenizer."""


        self.emoji_pattern =

        if "case_sensitive" in self._config:


                "The option 'case_sensitive' was moved from the tokenizers to the "





    def create(


        config: Dict[Text, Any],

        model_storage: ModelStorage,

        resource: Resource,

        execution_context: ExecutionContext,

    ) -> GraphComponent:

        """Creates a new component (see parent class for full docstring)."""

        return cls(config, execution_context.node_name)

    def process(self, messages: List[Message]) -> List[Message]:

        """Processes incoming messages and computes and sets features."""


        return messages

    def process_training_data(self, training_data: TrainingData) -> TrainingData:

        """Processes the training examples in the given training data in-place."""


        return training_data
1 Like