Custom Component Rasa version 3

Hello , can you please help me I tried to create a new tokenizer in Rasa v3 but it’s not working I don’t know what went wrong . I followed the migration instructions but still it gives me this error when I start training , the tokenizer runs before the countvectorsfeaturizer:

The following components require a Tokenizer: run_CountVectorsFeaturizer1, train_CountVectorsFeaturizer1.

Here is the component code :

from __future__ import annotations

from typing import Any, Dict, List, Optional, Text

import regex

import rasa.shared.utils.io

import rasa.utils.io

from itertools import *

import jellyfish

from rasa.engine.graph import ExecutionContext

from rasa.engine.recipes.default_recipe import DefaultV1Recipe

from rasa.engine.storage.resource import Resource

from rasa.engine.storage.storage import ModelStorage

from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer

from rasa.shared.constants import DOCS_URL_COMPONENTS

from rasa.shared.nlu.training_data.message import Message

from custom_func.clean_data_fr import clean_data_fr

from custom_func.clean_data_fr_training import clean_data_fr_training

import abc

import logging

import re

from typing import Text, List, Dict, Any, Optional

from rasa.engine.graph import ExecutionContext, GraphComponent

from rasa.engine.storage.resource import Resource

from rasa.engine.storage.storage import ModelStorage

from rasa.shared.nlu.training_data.training_data import TrainingData

from rasa.shared.nlu.training_data.message import Message

from custom_func.generateur_oncf_fr import generateur_oncf_fr

from rasa.nlu.constants import TOKENS_NAMES, MESSAGE_ATTRIBUTES

from rasa.shared.nlu.constants import (

    INTENT,

    INTENT_RESPONSE_KEY,

    RESPONSE_IDENTIFIER_DELIMITER,

    ACTION_NAME,

)

import rasa.shared.utils.io

logger = logging.getLogger(__name__)

logger = logging.getLogger(__name__)

@DefaultV1Recipe.register(

    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False

)

class WhitespaceTokenizer_ar(Tokenizer, GraphComponent):

   

    @staticmethod

    def required_packages() -> List[Text]:

        """Any extra python dependencies required for this component to run."""

        return []

    @staticmethod

    def get_default_config() -> Dict[Text, Any]:

        """Returns the component's default config."""

        return {

            **Tokenizer.get_default_config(),

            # specifies the language of the subword segmentation model

            "lang": None,

            # specifies the dimension of the subword embeddings

            "dim": None,

            # specifies the vocabulary size of the segmentation model

            "vs": None,

            # if set to True and the given vocabulary size can't be loaded for the given

            # model, the closest size is chosen

            "vs_fallback": True,

        }

    def __init__(self, config: Dict[Text, Any]) -> None:

        """Initialize the tokenizer."""

        super().__init__(config)

        self.emoji_pattern = rasa.utils.io.get_emoji_regex()

        if "case_sensitive" in self._config:

            rasa.shared.utils.io.raise_warning(

                "The option 'case_sensitive' was moved from the tokenizers to the "

                "featurizers.",

                docs=DOCS_URL_COMPONENTS,

            )

    @classmethod

    def create(

        cls,

        config: Dict[Text, Any],

        model_storage: ModelStorage,

        resource: Resource,

        execution_context: ExecutionContext,

    ) -> GraphComponent:

        """Creates a new component (see parent class for full docstring)."""

        return cls(config, execution_context.node_name)

    def process(self, messages: List[Message]) -> List[Message]:

        """Processes incoming messages and computes and sets features."""

        print(messages)

        return messages

    def process_training_data(self, training_data: TrainingData) -> TrainingData:

        """Processes the training examples in the given training data in-place."""

        self.process(training_data.training_examples)

        return training_data
1 Like