import logging
# Logging is configured right after `import logging`, ahead of the remaining
# imports, exactly as in the original statement order.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import gensim
import spacy
# spaCy English pipeline; loaded once at import time and shared by every
# preprocessing step in this module.
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS
# filename pattern matching
import glob
# regular expressions
import re
from string import digits
import numpy as np
import nltk
from nltk.stem.porter import *
from gensim import corpora, models, similarities
from pprint import pprint
from collections import defaultdict
from nltk.corpus import stopwords
from time import time
import pandas as pd
import datetime
import pickle
from gensim.models import Word2Vec
import typing
from typing import Any, Optional, Text, Dict
import os
from rasa_nlu.components import Component
from rasa_nlu import utils
from rasa_nlu.model import Metadata

# File name (inside the model directory) of the pickled similarity component.
SIMILARITY_MODEL_FILE_NAME = "text_similarity.pkl"
class TextSimilarity(Component):
    """Rasa NLU component ranking training documents against a user message.

    Training merges the example texts of every intent into one document,
    lemmatises the documents with spaCy and trains a gensim Word2Vec model
    on them.  At inference time the (lemmatised) user message is compared
    with every document through gensim's ``WmdSimilarity`` and the best
    matches are attached to the message under the ``"Similarity"`` key.
    """

    name = "TextSimilarity"
    provides = ["Similarity"]
    requires = []
    defaults = {}
    language_list = ["en"]

    # Class-level defaults kept for backward compatibility.  They are never
    # mutated in place: every instance rebinds its own copies in __init__ and
    # train(), so the state is per-instance and is pickled by persist().
    user_ip = ""
    w2v_corpus = []  # Documents to train word2vec on
    wmd_corpus = []  # Documents to run queries against
    documents = []  # wmd_corpus, with no pre-processing (so we can see the original documents).
    corpus_texts = []
    model_word2vec = ""
    questions = []

    # Pickle of the pre-processed corpus, written to the working directory.
    _CORPUS_FILE = "corpus"
    # Directory receiving the time-stamped Word2Vec model files.
    _W2V_MODEL_DIR = "./word2vec_model"
    # Maximum number of matches reported per message.
    _NUM_BEST = 5
    # customized stop words, we can add more to this
    _CUSTOM_STOP_WORDS = (u'please', u'be', u'a', u'A', u'what', u'who',
                          u'What', u'aaa', u'where', u'\'s', u'\n ')

    def __init__(self, component_config=None, model_word2vec=None):
        """Create the component.

        Args:
            component_config: configuration forwarded to ``Component``.
            model_word2vec: optional already-trained Word2Vec model.
        """
        super(TextSimilarity, self).__init__(component_config)
        self.model_word2vec = model_word2vec
        # Per-instance state (shadows the shared class-level defaults).
        self.w2v_corpus = []
        self.wmd_corpus = []
        self.documents = []
        self.corpus_texts = []
        self.questions = []

    def getTrainingData(self, training_data):
        """Group the training example texts by intent.

        Also stores the raw text of every example in ``self.questions``; the
        list is rebuilt on each call so repeated training does not pile up
        duplicates.

        Args:
            training_data: object exposing ``training_examples``; each
                example provides ``get('intent')`` and ``.text``.

        Returns:
            dict mapping each intent to the space-joined texts of its
            examples, in first-seen order.
        """
        intent_texts = {}
        questions = []
        for example in training_data.training_examples:
            intent = example.get('intent')
            if intent in intent_texts:
                intent_texts[intent] = intent_texts[intent] + " " + example.text
            else:
                intent_texts[intent] = example.text
            questions.append(example.text)
        self.questions = questions
        print("printing all the questions", len(self.questions))
        print(self.questions)
        return intent_texts

    def removeNuminStr(self, string1):
        """Strip all digits from ``string1`` and return its last non-empty
        '.'-separated fragment (``""`` when nothing is left)."""
        without_digits = string1.translate(str.maketrans('', '', digits))
        last_fragment = ""
        for fragment in without_digits.split('.'):
            if fragment:
                last_fragment = fragment
        return last_fragment

    def _register_stop_words(self):
        """Flag the custom stop words in the shared spaCy vocabulary.

        Called from every preprocessing path so that a process which only
        loads a persisted model filters tokens the same way training did.
        Setting the flag repeatedly is harmless.
        """
        for stopword in self._CUSTOM_STOP_WORDS:
            nlp.vocab[stopword].is_stop = True

    @staticmethod
    def _keep_token(token):
        """Return True for tokens that are not whitespace markers,
        punctuation, number-like, stop words or single characters."""
        return (token.text != '\n' and token.text != '\t '
                and not token.is_punct and not token.like_num
                and not token.is_stop and len(token) > 1)

    def _lemmatize(self, text):
        """Return the list of lemmas kept from ``text``.

        Lemmas that are not purely alphabetic are cleaned with
        ``removeNuminStr`` and re-parsed; the lemma of the last non-stop
        token of that re-parse is kept.  When the clean-up leaves nothing
        usable the token is skipped (the original code appended a stale or
        invalid value in that case).
        """
        self._register_stop_words()
        lemmas = []
        for token in nlp(text):
            if not self._keep_token(token):
                continue
            if token.lemma_.isalpha():
                lemmas.append(token.lemma_)
                continue
            last_word = None
            for word in nlp(self.removeNuminStr(token.lemma_)):
                if not word.is_stop and len(word) > 0:
                    last_word = word
            if last_word is not None:
                lemmas.append(last_word.lemma_)
        return lemmas

    def pre_Preocess(self, content_FAQ):
        """Pre-process the FAQ documents: lower-case them and remove stop
        words, punctuation and numbers, keeping the lemmas.

        Args:
            content_FAQ: iterable of raw document strings.

        Returns:
            list with one list of lemmas per input document (possibly empty
            lists), in input order.
        """
        print("I am in pre_Preocess")
        texts = []
        for query in content_FAQ:
            article = self._lemmatize(query.lower())
            print(article)
            print("_________________________________________________________________________________________")
            texts.append(article)
        return texts

    def train(self, training_data, cfg, **kwargs):
        """Build the corpus from ``training_data`` and train Word2Vec on it.

        Side effects: writes the pre-processed corpus to the ``corpus``
        pickle file in the working directory and saves the model under
        ``./word2vec_model/`` with a time-stamped file name.

        Returns:
            the trained Word2Vec model (also stored on the instance).
        """
        print("I am in train")
        corpus = self.getTrainingData(training_data)
        # One raw document per intent; index i matches corpus_texts[i].
        documents = list(corpus.values())
        # calling the pre_process
        corpus_texts = self.pre_Preocess(documents)
        print(len(corpus_texts))
        with open(self._CORPUS_FILE, 'wb') as fp:
            pickle.dump(corpus_texts, fp)
        self.corpus_texts = corpus_texts
        # Add to corpus for training Word2Vec.
        self.w2v_corpus = corpus_texts
        print("w2v_corpus", self.w2v_corpus)
        # Add to corpus for similarity queries.
        self.wmd_corpus = corpus_texts
        self.documents = documents
        # Training the Word2Vec model
        # NOTE(review): `size=` is the keyword used by the original code;
        # newer gensim releases are believed to call it `vector_size` -
        # confirm against the installed gensim version.
        model = gensim.models.Word2Vec(self.w2v_corpus, size=150, workers=3)
        # The target directory is not guaranteed to exist.
        os.makedirs(self._W2V_MODEL_DIR, exist_ok=True)
        # Year, month, day, hour, minute, second (minutes were missing).
        tt = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        model.save(os.path.join(self._W2V_MODEL_DIR, "model_word2vec" + tt + ".model"))
        self.model_word2vec = model
        return self.model_word2vec

    def pre_process_userinput(self, query):
        """Lemmatise the user input with the same filtering as the training
        documents and return the list of lemmas.  Lower-casing is left to
        the caller."""
        print("I am in pre_process_userinput")
        # user input should also be pre-processed same like dataset
        return self._lemmatize(query)

    def _load_corpus(self):
        """Return the corpus to query: the instance corpus when present,
        otherwise the ``corpus`` pickle file if it exists, else ``[]``."""
        if self.wmd_corpus:
            return self.wmd_corpus
        if os.path.exists(self._CORPUS_FILE):
            # NOTE(security): pickle executes arbitrary code on load; this
            # file must only ever come from a trusted training run.
            with open(self._CORPUS_FILE, 'rb') as fp:
                return pickle.load(fp)
        return []

    def process(self, message, **kwargs):
        """Attach the most similar training documents to ``message``.

        Sets ``"Similarity"`` on the message to a dict mapping the rank
        (0-based) to ``{"Similarity": score, "FAQ": text}``.  The dict holds
        at most ``_NUM_BEST`` entries and is empty when the component is
        untrained or nothing matched.
        """
        print("I am in process")
        userinput = message.text.lower()
        print("userinput", userinput)
        user_ip = self.pre_process_userinput(userinput)
        wmd_corpus = self._load_corpus()
        print("wmd_corpus", len(wmd_corpus))
        model = self.model_word2vec
        # "" is the untrained class-level default, None the __init__ default.
        if not wmd_corpus or model is None or isinstance(model, str):
            logging.getLogger(__name__).warning(
                "TextSimilarity has no trained model or corpus; "
                "returning no similarities.")
            message.set("Similarity", {}, add_to_output=True)
            return
        from gensim.similarities import WmdSimilarity
        instance = WmdSimilarity(wmd_corpus, model, num_best=self._NUM_BEST)
        sims = instance[user_ip]  # A query is simply a "look-up" in the similarity class.
        # Print the query and the retrieved documents, together with their similarities.
        documents = self.documents
        print("printing all the questions", len(self.questions))
        print("number of documents :::::", len(documents))
        print("number of corpus entries :::::", len(wmd_corpus))
        print("Displaying the query")
        print('Query: ', message.text)
        print("\n")
        jsonData = {}
        # Iterate over what was actually returned: fewer than _NUM_BEST
        # (index, score) pairs may come back.
        for rank, (doc_index, score) in enumerate(sims):
            # Corpus indices address the per-intent documents, not the
            # per-example questions list.
            text = documents[doc_index] if doc_index < len(documents) else ""
            print('similarity score =', score)
            print(text)
            print("\n")
            jsonData[rank] = {"Similarity": score, "FAQ": text}
        print("jsonData", jsonData)
        message.set("Similarity", jsonData, add_to_output=True)

    def persist(self, model_dir):
        """Persist this model into the passed directory.

        Returns:
            dict with the ``"similarity_file"`` name stored in the metadata.
        """
        print("I am in persist")
        similarity_file = os.path.join(model_dir, SIMILARITY_MODEL_FILE_NAME)
        print(similarity_file)
        with open(similarity_file, 'wb') as f:
            pickle.dump(self, f)
        return {"similarity_file": SIMILARITY_MODEL_FILE_NAME}

    @classmethod
    def load(cls,
             model_dir=None,
             model_metadata=None,
             cached_component=None,
             **kwargs):
        """Load the persisted component from ``model_dir``.

        Falls back to a fresh, untrained component when the directory or
        the pickled file is missing.
        """
        # Both arguments default to None; tolerate that instead of crashing.
        meta = model_metadata.for_component(cls.name) if model_metadata is not None else {}
        file_name = meta.get("similarity_file", SIMILARITY_MODEL_FILE_NAME)
        if model_dir:
            similarity_file = os.path.join(model_dir, file_name)
            if os.path.exists(similarity_file):
                # NOTE(security): unpickling runs arbitrary code; only load
                # model directories from trusted sources.
                return utils.pycloud_unpickle(similarity_file)
        return cls(meta)