I have a problem when using a spell checker: the token boundaries change and the accuracy of crf_entity_extractor decreases. Can I adjust the entity boundaries after replacing the training text with the spell-checked text?
(problem example): Token boundary error for token (المسافرين ) (start:61, end: 71) and entity {‘start’: 64, ‘end’: 75, ‘value’: ‘المسافرين’, ‘entity’: ‘departure’}
After running the spell checker, the training example became: token boundary for token (المسافرين ) (start: 61, end: 71), and the entity can no longer be matched to the correct token.
How can I solve this problem?
Our class for the checker:
class TextCleaning(Component):
    """Pipeline component that normalises Arabic text before tokenisation.

    Cleaning happens in three stages, always in this order:

    1. strip diacritics and the tatweel (kashida) elongation character,
    2. fold letter variants onto a canonical letter,
    3. collapse every run of one repeated character into a single character.

    Stages 1 and 3 shorten the text, so character offsets of annotated
    entities computed on the raw text are no longer valid afterwards.  To keep
    annotations usable, ``train`` cleans every training example *and* remaps
    the ``start``/``end`` offsets of its entities onto the cleaned text.
    """

    provides = ['text']
    language_list = None

    # Characters deleted by stage 1.  Written as escapes so the set cannot be
    # silently altered by copy/paste or editor normalisation of combining marks.
    _DIACRITICS = frozenset(
        "\u0651"  # SHADDA (tashdid)
        "\u064e"  # FATHA
        "\u064b"  # FATHATAN (tanwin fath)
        "\u064f"  # DAMMA
        "\u064c"  # DAMMATAN (tanwin damm)
        "\u0650"  # KASRA
        "\u064d"  # KASRATAN (tanwin kasr)
        "\u0652"  # SUKUN
        "\u0640"  # TATWEEL (kashida)
    )

    # Stage 2: one-to-one letter folding.  No replacement letter is itself a
    # key, so a single pass is equivalent to applying the rules one by one.
    _NORMALIZATION = {
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0622": "\u0627",  # alef with madda       -> alef
        "\u0649": "\u064a",  # alef maksura          -> yeh
        "\u0624": "\u0621",  # waw with hamza        -> hamza
        "\u0626": "\u0621",  # yeh with hamza        -> hamza
        "\u0629": "\u0647",  # teh marbuta           -> heh
        "\u06af": "\u0643",  # gaf                   -> kaf
    }
    _TRANSLATION = str.maketrans(_NORMALIZATION)

    def __init__(self, component_config=None):
        """Create the component and compile the diacritics pattern.

        Args:
            component_config: Configuration passed through to ``Component``.
        """
        super(TextCleaning, self).__init__(component_config)
        # Kept as an instance attribute because external code may read it.
        self.arabic_diacritics = re.compile(
            "[" + "".join(sorted(self._DIACRITICS)) + "]"
        )

    def train(self, training_data, cfg, **kwargs):
        """Clean every training example and realign its entity offsets.

        Without this step the model is trained on raw text but queried with
        cleaned text, and entity offsets stop matching token boundaries.

        NOTE(review): assumes ``training_data.training_examples`` is the list
        of messages, as in Rasa NLU 1.x - confirm for the installed version.
        If the attribute is missing, nothing is done.
        """
        for example in getattr(training_data, "training_examples", None) or []:
            self._clean_message(example)

    def process(self, message, **kwargs):
        """Replace ``message.text`` with its cleaned form at inference time."""
        if not message.text:
            # Nothing to clean; also avoids a TypeError on ``None``.
            return
        message.text, _, _ = self._clean_with_offsets(message.text)
        logging.debug("messag.text in clean_text {}".format(message.text))

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist nothing: the component has no trained state."""
        return None

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional["Component"] = None,
        **kwargs: Any
    ) -> "Component":
        """Return the cached component if given, else build one from ``meta``."""
        if cached_component:
            return cached_component
        return cls(meta)

    def normalize_arabic(self, text):
        """Fold Arabic letter variants onto their canonical letter."""
        return text.translate(self._TRANSLATION)

    def remove_diacritics(self, text):
        """Delete Arabic diacritics and tatweel characters from ``text``."""
        return self.arabic_diacritics.sub('', text)

    def remove_repeating_char(self, text):
        """Collapse each run of one repeated character into a single one.

        ``.`` does not match a newline, so repeated newlines are preserved.
        NOTE(review): this also collapses digits ("100" -> "10") and
        legitimately doubled letters - confirm that is intended.
        """
        return re.sub(r'(.)\1+', r'\1', text)

    def _clean_with_offsets(self, text):
        """Run all three cleaning stages in one pass and track offsets.

        Returns:
            A ``(cleaned, starts, ends)`` tuple.  ``starts`` and ``ends`` each
            have ``len(text) + 1`` items and translate an original entity
            ``start`` / exclusive ``end`` offset into the cleaned text.
        """
        kept = []
        starts = []
        ends = []
        previous = None
        for char in text:
            # An exclusive end at this index covers exactly the kept prefix.
            ends.append(len(kept))
            if char in self._DIACRITICS:
                starts.append(len(kept))
                continue
            char = self._NORMALIZATION.get(char, char)
            if char == previous and char != "\n":
                # Dropped repeat: it merges into the character kept just before.
                starts.append(len(kept) - 1)
                continue
            starts.append(len(kept))
            kept.append(char)
            previous = char
        starts.append(len(kept))
        ends.append(len(kept))
        return "".join(kept), starts, ends

    def _clean_message(self, message):
        """Clean ``message.text`` in place and remap its entity offsets.

        NOTE(review): assumes entities are available via
        ``message.get("entities")`` as dicts with ``start``/``end``/``value``
        keys - confirm against the installed Rasa version.
        """
        original = message.text
        if not original:
            return
        cleaned, starts, ends = self._clean_with_offsets(original)
        message.text = cleaned

        getter = getattr(message, "get", None)
        entities = getter("entities") if callable(getter) else None
        for entity in entities or []:
            start = entity.get("start")
            end = entity.get("end")
            if not isinstance(start, int) or not isinstance(end, int):
                continue
            if not 0 <= start <= end <= len(original):
                # Annotation was already inconsistent with the raw text;
                # leave it untouched rather than guessing.
                continue
            new_start = starts[start]
            new_end = ends[end]
            # Only rewrite the value when it was the literal text span;
            # a differing value may be a deliberate synonym.
            if entity.get("value") == original[start:end]:
                entity["value"] = cleaned[new_start:new_end]
            entity["start"] = new_start
            entity["end"] = new_end