IDDRS/App/similarity.py

64 lines
2.2 KiB
Python
Raw Normal View History

2022-06-17 12:06:51 +00:00
import json
import pandas as pd
#library that contains punctuation
import string
#importing nlp library
import nltk
import re
import spacy
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer, reused by phrase_preprocessing for every token.
wordnet_lemmatizer = WordNetLemmatizer()
# NLTK's English stop-word list, used by phrase_preprocessing to filter tokens.
# NOTE: this lookup runs at import time and needs the NLTK 'stopwords' corpus
# to be available locally.
stopwords = nltk.corpus.stopwords.words('english')
# spaCy pipeline, loaded once at import time and shared by both functions below.
# NOTE: the "en_core_web_lg" model package must be installed for this to load.
nlp = spacy.load("en_core_web_lg")
def phrase_preprocessing(phrase):
    """Normalise a query phrase and parse it with the module's spaCy pipeline.

    The phrase is stripped of the characters in ``string.punctuation``,
    lower-cased, split into word tokens, filtered against the module-level
    ``stopwords`` list and lemmatised with ``wordnet_lemmatizer``. The
    remaining lemmas are joined with single spaces and passed to ``nlp``.

    Args:
        phrase: Raw query text (``str``).

    Returns:
        The object returned by ``nlp`` for the cleaned phrase. It is also
        printed to stdout before being returned.
    """
    # One pass removes every character listed in string.punctuation.
    without_punctuation = phrase.translate(
        str.maketrans('', '', string.punctuation)
    )
    lowered = without_punctuation.lower()

    # The pattern is a raw string: '\W' in a plain literal is an invalid
    # escape sequence. Splitting on single non-word characters yields ''
    # for leading, trailing or consecutive separators; those empty tokens
    # are dropped so they do not turn into stray spaces in the joined phrase.
    tokens = [token for token in re.split(r'\W', lowered) if token]

    content_tokens = [token for token in tokens if token not in stopwords]
    lemmas = [wordnet_lemmatizer.lemmatize(token) for token in content_tokens]

    new_nlp_phrase = nlp(' '.join(lemmas))
    # Debug trace of the cleaned phrase; kept so existing console output
    # is unchanged.
    print(new_nlp_phrase)
    return new_nlp_phrase
def similarityCheck(data: pd.DataFrame, phrase, top_n: int = 20) -> list:
    """Rank the rows of ``data`` by similarity to ``phrase``.

    Rows are visited in order. Only the first row for each distinct value of
    the ``'Index'`` column is scored; later rows with the same value are
    skipped. The score is ``phrase.similarity(nlp(row['new_sentence']))``.

    Args:
        data: Frame providing the columns ``'Index'``, ``'new_sentence'``,
            ``'Sentences'``, ``'Paragraph'``, ``'Color'``, ``'Module'``,
            ``'Level'``, ``'Heading1'`` .. ``'Heading4'`` and ``'PageNum'``.
        phrase: Object exposing ``similarity(doc)``, e.g. the value returned
            by ``phrase_preprocessing``.
        top_n: Number of best-scoring rows to keep (passed to
            ``DataFrame.head``). Defaults to 20, the previously fixed limit.

    Returns:
        A list of dicts sorted by descending ``'score'``. Each dict has the
        keys ``'index'`` (the row's label in ``data``), ``'score'``,
        ``'sentence'``, ``'paragraph'``, ``'color'``, ``'module'``,
        ``'level'``, ``'heading1'`` .. ``'heading4'``, ``'pageNum'`` and
        ``'level_0'``. ``'level_0'`` is the 1-based position of the row among
        the kept rows before sorting; ``reset_index()`` uses that name
        because a column called ``'index'`` already exists.
    """
    columns = ['index', 'score', 'sentence', 'paragraph', 'color', 'module',
               'level', 'heading1', 'heading2', 'heading3', 'heading4',
               'pageNum']
    # (output key, source column) pairs copied verbatim from each kept row.
    copied_fields = (
        ('sentence', 'Sentences'),
        ('paragraph', 'Paragraph'),
        ('color', 'Color'),
        ('module', 'Module'),
        ('level', 'Level'),
        ('heading1', 'Heading1'),
        ('heading2', 'Heading2'),
        ('heading3', 'Heading3'),
        ('heading4', 'Heading4'),
        ('pageNum', 'PageNum'),
    )

    seen_indices = set()  # set gives O(1) duplicate checks
    records = []
    for index, row in data.iterrows():
        row_key = row['Index']
        if row_key in seen_indices:
            # Duplicate 'Index': skip before doing the costly parse/score.
            continue
        seen_indices.add(row_key)

        record = {
            'index': index,
            'score': phrase.similarity(nlp(row['new_sentence'])),
        }
        record.update((out_key, row[src]) for out_key, src in copied_fields)
        records.append(record)

    # Build the frame once instead of enlarging it cell by cell. Object dtype
    # keeps each value as it was read from the row, without per-column
    # type inference.
    results = pd.DataFrame(records, columns=columns, dtype=object)
    # 1-based labels: these become the 'level_0' key of every output record.
    results.index = range(1, len(records) + 1)

    results = results.sort_values(by='score', ascending=False).head(top_n)

    # Round-trip through pandas' JSON encoder so the records contain only
    # plain JSON-compatible Python values.
    return json.loads(results.reset_index().to_json(orient='records'))