Hello, I need to build a chatbot for the Japanese language, and based on previous related posts on the Rasa forum, I tried using mecab-python3.
The code shared in the mahbubcseju GitHub repo gave errors with Rasa 2.0, and the custom tokenizer needed editing. Here is the custom tokenizer code that worked for me for intents with no entities:
import typing
from typing import Any, Optional, Text, Dict, List, Type
import re

from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.message import Message
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.constants import TOKENS_NAMES, MESSAGE_ATTRIBUTES
from rasa.shared.nlu.constants import (
    INTENT, INTENT_RESPONSE_KEY, RESPONSE_IDENTIFIER_DELIMITER, ACTION_NAME,
)
import MeCab


class JapaneseTokenizer(Tokenizer, Component):
    provides = ["tokens"]

    def train(
        self, training_data: TrainingData, config: RasaNLUModelConfig,
        **kwargs: Any,
    ) -> None:
        for example in training_data.training_examples:
            try:
                text_string = example.data["text"]
            except KeyError:
                text_string = ""
            example.set("tokens", self.tokenize(text_string))

    def process(self, message: Message, **kwargs: Any) -> None:
        pass

    @staticmethod
    def tokenize(text: Text) -> List[Token]:
        # MeCab in wakati mode returns the words separated by spaces
        mt = MeCab.Tagger("-Owakati")
        parsed = mt.parse(text)
        print("\nParsed - ", parsed)
        words = parsed.split()
        # Recover each word's character offset in the original text
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
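For context, this is roughly what MeCab's wakati mode returns (and what tokenize splits on whitespace). The exact segmentation depends on the dictionary installed alongside mecab-python3 (e.g. unidic-lite), so the output below is only illustrative:

import MeCab

# Wakati mode outputs the surface forms separated by single spaces
mt = MeCab.Tagger("-Owakati")
print(mt.parse("今日はいい天気ですね"))
# e.g. "今日 は いい 天気 です ね"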
Config.yml used:

language: jp

pipeline:
  - name: JapaneseTokenizer
  - name: RegexFeaturizer
  - name: LexicalSyntacticFeaturizer
  - name: CountVectorsFeaturizer
  - name: CountVectorsFeaturizer
    analyzer: word
    min_ngram: 1
    max_ngram: 4
  - name: CRFEntityExtractor
  - name: KeywordIntentClassifier
  - name: EntitySynonymMapper
  - name: FallbackClassifier
    threshold: 0.3
    ambiguity_threshold: 0.1

policies:
  - name: MemoizationPolicy
  - name: TEDPolicy
    max_history: 5
    epochs: 100
    constrain_similarities: true
  - name: RulePolicy
    core_fallback_threshold: 0.4
    core_fallback_action_name: "action_default_fallback"
    enable_fallback_prediction: True
    constrain_similarities: True
But on adding entities to my training data, it starts giving errors.
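For reference, the entities are annotated in the standard Rasa 2.0 YAML training-data format, roughly like this placeholder sketch (the intent, entity, and example values are illustrative, not my actual data):

nlu:
- intent: order_food
  examples: |
    - [ラーメン](food)をください
    - [寿司](food)をお願いします

The error: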
File "/Users/mac/.../lib/python3.7/site-packages/rasa/nlu/extractors/extractor.py", line 451, in check_correct_entity_annotations
token_start_positions = [t.start for t in example.get(TOKENS_NAMES[TEXT])]
TypeError: 'NoneType' object is not iterable
I would appreciate any ideas or suggestions on what needs to be corrected. Or are there any blogs or documentation for a Japanese tokenizer with Rasa 2.0?