Self.clf is not accessible

prithvini04 · June 25, 2019, 3:30pm

import logging logging.basicConfig(format=’%(asctime)s : %(levelname)s : %(message)s’, level=logging.INFO) import gensim import spacy nlp = spacy.load(‘en_core_web_sm’) from spacy.lang.en.stop_words import STOP_WORDS #regex import glob #regular expression import re from string import digits from gensim import models import numpy as np import nltk from nltk.stem.porter import *
from gensim import corpora, models, similarities from pprint import pprint
from collections import defaultdict import nltk from nltk.corpus import stopwords import numpy as np from time import time import pandas as pd import datetime import pickle from gensim.models import Word2Vec import typing from typing import Any, Optional, Text, Dict import os

import pickle

from rasa_nlu.components import Component from rasa_nlu import utils from rasa_nlu.model import Metadata

# File name (inside the model directory) of the pickled TextSimilarity component;
# written by persist() and looked up again by load().
SIMILARITY_MODEL_FILE_NAME = "text_similarity.pkl"

class TextSimilarity(Component):
    """Custom Rasa NLU component that attaches FAQ similarity scores to a message.

    The attributes below are class-level defaults. The list-valued ones are
    mutable objects shared by every instance until an instance rebinds them.
    """

    # Component registration data read by the Rasa NLU pipeline.
    name = "TextSimilarity"
    provides = ["Similarity"]
    requires = []
    defaults = {}
    language_list = ["en"]

    user_ip = ""
    w2v_corpus = []  # Documents to train word2vec on
    wmd_corpus = []  # Documents to run queries against
    documents = []  # wmd_corpus, with no pre-processing (so we can see the original documents).
    corpus_texts = []
    model_word2vec = ""
    questions = []

def __init__(self, component_config=None, model_word2vec=None):
    """Create the component.

    Args:
        component_config: Configuration passed through to ``Component.__init__``.
        model_word2vec: Optional already-trained Word2Vec model; ``train``
            replaces it with a freshly trained one.
    """
    super(TextSimilarity, self).__init__(component_config)
    self.model_word2vec = model_word2vec
    # Give every instance its own list. With only the class-level default,
    # appended questions would be shared between instances and would not be
    # part of the instance state pickled by persist().
    self.questions = []

def getTrainingData(self, training_data):
    """Group the training example texts by intent.

    All example texts of one intent are joined with single spaces into one
    string. The text of the *first* example seen for each intent is recorded
    in ``self.questions``, in the same order as the keys of the returned
    dict, so a position in the dict maps to a representative question.

    Args:
        training_data: Object exposing ``training_examples``; each example
            must provide ``get('intent')`` and a ``text`` attribute.

    Returns:
        dict mapping intent -> concatenated example texts.
    """
    texts_by_intent = {}
    first_questions = []
    for example in training_data.training_examples:
        intent = example.get('intent')
        if intent in texts_by_intent:
            texts_by_intent[intent] = texts_by_intent[intent] + " " + example.text
        else:
            texts_by_intent[intent] = example.text
            first_questions.append(example.text)

    # Rebind instead of appending: questions left over from an earlier call
    # (or from the shared class-level list) would shift the indices out of
    # step with the corpus built from the returned dict.
    self.questions = first_questions
    print("printing all the questions", len(self.questions))
    print(self.questions)
    return texts_by_intent

def removeNuminStr(self, string1):
    """Strip ASCII digits from ``string1`` and return its last non-empty '.'-separated part.

    Returns an empty string when nothing but digits and dots was supplied.
    """
    digit_free = string1.translate(str.maketrans('', '', digits))
    # Only the final non-empty segment is kept; earlier segments are dropped.
    segments = [segment for segment in digit_free.split('.') if segment]
    return segments[-1] if segments else ""

def pre_Preocess(self, content_FAQ):
    """Lower-case, tokenise, filter and lemmatise every FAQ text.

    Newline/tab tokens, punctuation, number-like tokens, stop words and
    single-character tokens are dropped. A lemma that is not purely
    alphabetic has its digits stripped via ``removeNuminStr`` and is parsed
    again before its non-stop-word lemmas are kept.

    Note: the custom stop words are flagged on the shared ``nlp`` vocabulary,
    so they stay flagged for every later use of ``nlp`` in this process.

    Args:
        content_FAQ: Iterable of raw text strings.

    Returns:
        list of token lists, one per input text, in input order.
    """
    print("I am in pre_Preocess" )
    #customized stop words, we can add more to this
    my_stop_words=[u'please', u'be', u'a' , u'A', u'what', u'who' , u'What', u'aaa', u'where', u'\'s',u'\n ']
    for stopword in my_stop_words:
        nlp.vocab[stopword].is_stop = True

    texts = []
    for query in content_FAQ:
        article = []
        for w in nlp(query.lower()):
            if w.text!='\n' and w.text!='\t ' and not w.is_punct and not w.like_num and not w.is_stop and len(w)>1:
                if w.lemma_.isalpha():
                    article.append(w.lemma_)
                else:
                    # Mixed token (e.g. letters plus digits): strip the digits
                    # and re-parse so the remainder is lemmatised again.
                    for word in nlp(self.removeNuminStr(w.lemma_)):
                        if not word.is_stop and len(word) > 0:
                            article.append(word.lemma_)
        print(article)
        print("_________________________________________________________________________________________")
        texts.append(article)
    return texts


def train(self, training_data, cfg, **kwargs):
    """Train a Word2Vec model on the pre-processed training examples.

    Side effects:
        * writes the tokenised corpus to the file ``corpus`` in the current
          working directory (``process`` reads it back from there);
        * saves the model under ``./word2vec_model/`` with a timestamped name;
        * stores the corpus, raw documents and model on the instance, and also
          in module-level globals.

    Args:
        training_data: Training data passed to ``getTrainingData``.
        cfg: Unused here.

    Returns:
        The trained Word2Vec model (also stored in ``self.model_word2vec``).
    """
    # The module-level globals are still written so that code reading them
    # keeps working; the instance attributes below are the copies that
    # travel with the component when it is pickled.
    global corpus_texts
    global w2v_corpus
    global wmd_corpus
    global documents
    print("I am in train")

    corpus = self.getTrainingData(training_data)
    df_ExpectedVar = list(corpus.values())
    #calling the pre_process
    corpus_texts = self.pre_Preocess(df_ExpectedVar)
    print(len(corpus_texts))

    with open('corpus', 'wb') as fp:
        pickle.dump(corpus_texts, fp)

    # Add to corpus for training Word2Vec.
    w2v_corpus = corpus_texts
    print("w2v_corpus",w2v_corpus)
    # Add to corpus for similarity queries.
    wmd_corpus = corpus_texts
    documents = df_ExpectedVar

    self.corpus_texts = corpus_texts
    self.w2v_corpus = w2v_corpus
    self.wmd_corpus = wmd_corpus
    self.documents = documents

    #Training the Word2Vec model
    # NOTE(review): `size` is the gensim 3.x keyword (later versions call it
    # `vector_size`) - confirm against the installed gensim version.
    model = gensim.models.Word2Vec (w2v_corpus, size=150, workers=3)

    # model.save() fails when the target directory is missing, so create it.
    os.makedirs("./word2vec_model", exist_ok=True)
    # The original format "%Y%m%d%H%S" skipped the minutes, which made the
    # file names ambiguous and allowed collisions within the same hour.
    tt = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    model.save("./word2vec_model/model_word2vec"+ tt +".model")
    self.model_word2vec = model
    return self.model_word2vec

def pre_process_userinput(self, query):
    """Tokenise the user's query and return the lemmas of the tokens worth keeping.

    Newline/tab tokens, punctuation, number-like tokens, stop words and
    single-character tokens are discarded.
    """
    print("I am in pre_process_userinput" )
    #user input should also be pre preocessed same like dataset
    return [
        token.lemma_
        for token in nlp(query)
        if token.text!='\n' and token.text!='\t '
        and not token.is_punct and not token.like_num and not token.is_stop
        and len(token)>1
    ]

def process(self, message, **kwargs):
    """Attach the most similar stored questions to ``message``.

    The lower-cased message text is pre-processed, compared against the
    pickled corpus with Word Mover's Distance using ``self.model_word2vec``,
    and the best matches are stored on the message under ``"Similarity"`` as
    ``{rank: {"Similarity": score, "FAQ": question}}``.

    Args:
        message: Message object exposing ``text`` and ``set``.
    """
    print("I am in process")
    userinput = message.text.lower()
    print("userinput",userinput)
    query_tokens = self.pre_process_userinput(userinput)

    # Corpus written by train(); the path is relative to the current working
    # directory, so training and serving must run from the same place.
    with open ('corpus', 'rb') as fp:
        wmd_corpus = pickle.load(fp)
    print("wmd_corpus",len(wmd_corpus))

    from gensim.similarities import WmdSimilarity
    num_best = 5
    instance = WmdSimilarity(wmd_corpus, self.model_word2vec, num_best=num_best)
    sims = instance[query_tokens]  # A query is simply a "look-up" in the similarity class.

    # Print the query and the retrieved documents, together with their similarities.
    print("printing all the questions",len(self.questions))
    # Read from the instance: the module-level `documents` only exists when
    # train() ran in this very process, so reading it raised NameError when
    # the component was loaded into a fresh process.
    print("this is length of the questions :::::",len(self.documents))
    print("this is length of the questions :::::",len(wmd_corpus))
    print("Displaying the query" )
    print('Query: ', message.text)
    print("\n")

    jsonData = {}
    # Iterate over what was actually returned: there can be fewer than
    # num_best hits, and indexing sims[i] for a missing hit raised IndexError.
    for rank, (doc_index, score) in enumerate(sims):
        question = self.questions[doc_index]
        print('similarity score =', score)
        print(question)
        print("\n")
        jsonData[rank] = {"Similarity": score, "FAQ": question}

    print("jsonData",jsonData)
    message.set("Similarity",jsonData,add_to_output=True)

def persist(self, model_dir):
    """Persist this model into the passed directory.

    The whole component instance is pickled into
    ``SIMILARITY_MODEL_FILE_NAME`` inside ``model_dir``.

    Args:
        model_dir: Directory the pickle file is written to.

    Returns:
        dict with the key ``"similarity_file"`` holding the pickle file name;
        ``load`` looks the name up under that key.
    """
    # The docstring must be the first statement; placed after the print it
    # was only a discarded string expression.
    print("I am in persist")
    similarity_file = os.path.join(model_dir, SIMILARITY_MODEL_FILE_NAME)
    print(similarity_file)
    with open(similarity_file, 'wb') as f:
        pickle.dump(self, f)
    return {"similarity_file": SIMILARITY_MODEL_FILE_NAME}

@classmethod
def load(cls,
         model_dir=None,
         model_metadata=None,
         cached_component=None,
         **kwargs):
    """Restore a persisted component, or build a new one when no pickle exists.

    The pickle file name is taken from this component's metadata entry
    (key ``"similarity_file"``), falling back to ``SIMILARITY_MODEL_FILE_NAME``.
    """
    component_meta = model_metadata.for_component(cls.name)
    pickle_name = component_meta.get("similarity_file", SIMILARITY_MODEL_FILE_NAME)
    pickle_path = os.path.join(model_dir, pickle_name)

    if not os.path.exists(pickle_path):
        # Nothing persisted: fall back to a fresh instance configured from the metadata.
        return cls(component_meta)
    # NOTE(review): this unpickles a file from model_dir - presumably only
    # trusted model directories should be loaded; confirm.
    return utils.pycloud_unpickle(pickle_path)

prithvini04 · June 25, 2019, 3:33pm

This is a custom component for text similarity.

I want to access self.model_word2vec in process.

Can anyone please tell me how to access it?

Tanja · July 3, 2019, 3:40pm

It is really hard to read the code you posted. Can you please reformat it using ``` around your code? Thanks.

Topic		Replies	Views
ImportError: Cannot retrieve class from path nlp_spacy Rasa Open Source	1	1515	February 12, 2021
Able to lemmatize by modifying spacy_tokenizer, but the output confidence is differing for the same stem word Rasa Open Source	1	928	September 26, 2019
Rasa train nlu error Rasa Open Source	10	2221	October 19, 2021
Error in loading fastText word embedding in rasa_nlu_example Rasa Open Source	1	1682	July 28, 2020
Getting an error while i run nlu_model.py Rasa Open Source	1	1824	August 27, 2019

Self.clf is not accessible

print(article)

print("_________________________________________________________________________________________")

Related Topics