import json
import re
import string  # library that contains the punctuation characters stripped below

import pandas as pd

# NLP libraries
import nltk
import spacy
from nltk.stem import WordNetLemmatizer

# Shared NLP resources: WordNet lemmatizer, English stopword list,
# and the large English spaCy model (its word vectors drive the similarity scores).
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_lg")
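# Note (not in the original file): the lines above expect the NLTK corpora
# 'stopwords' and 'wordnet' to be available (e.g. via nltk.download('stopwords')
# and nltk.download('wordnet')) and the spaCy model to be installed
# (python -m spacy download en_core_web_lg); otherwise they raise lookup errors.
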
def phrase_preprocessing(phrase):
    # Clean the query phrase: strip punctuation, lowercase, tokenize,
    # drop stopwords, lemmatize, and return the result as a spaCy Doc.
    phrase_punctuationfree = "".join(i for i in phrase if i not in string.punctuation)
    phrase_lower = phrase_punctuationfree.lower()
    phrase_tokens = re.split(r'\W+', phrase_lower)
    phrase_nostopwords = [i for i in phrase_tokens if i and i not in stopwords]
    phrase_lemma = [wordnet_lemmatizer.lemmatize(word) for word in phrase_nostopwords]
    new_phrase = ' '.join(phrase_lemma)
    new_nlp_phrase = nlp(new_phrase)
    print(new_nlp_phrase)
    return new_nlp_phrase
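
# Rough illustration (not from the original file): for an input such as
# "What are the safety requirements?", the steps above would typically yield
# the Doc "safety requirement" (punctuation and stopwords removed, tokens lemmatized).
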
def similarityCheck(data, phrase):
    # Compare the preprocessed query `phrase` (a spaCy Doc) against every
    # sentence in `data` and return the 20 most similar ones as JSON records.
    results = pd.DataFrame(columns=['index', 'score', 'sentence', 'paragraph', 'color',
                                    'module', 'level', 'heading1', 'heading2',
                                    'heading3', 'heading4', 'pageNum'])

    i = 1
    list_of_indices = []
    for index, row in data.iterrows():
        doc = nlp(row['new_sentence'])
        sim = phrase.similarity(doc)
        # Record only the first row seen for each source index; skip duplicates.
        if row['Index'] not in list_of_indices:
            results.loc[i, 'index'] = index
            results.loc[i, 'score'] = sim
            results.loc[i, 'sentence'] = row['Sentences']
            results.loc[i, 'paragraph'] = row['Paragraph']
            results.loc[i, 'color'] = row['Color']
            results.loc[i, 'module'] = row['Module']
            results.loc[i, 'level'] = row['Level']
            results.loc[i, 'heading1'] = row['Heading1']
            results.loc[i, 'heading2'] = row['Heading2']
            results.loc[i, 'heading3'] = row['Heading3']
            results.loc[i, 'heading4'] = row['Heading4']
            results.loc[i, 'pageNum'] = row['PageNum']
            i += 1
            list_of_indices.append(row['Index'])

    # Keep the 20 highest-scoring sentences and serialise them as a list of dicts.
    results.sort_values(by=['score'], ascending=False, inplace=True)
    results = results[0:20]
    results_records = results.reset_index().to_json(orient='records')
    results_records = json.loads(results_records)
    return results_records
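
# Minimal usage sketch (an assumption, not part of the original module): it presumes
# a DataFrame with the columns referenced above ('Index', 'new_sentence', 'Sentences',
# 'Paragraph', 'Color', 'Module', 'Level', 'Heading1'..'Heading4', 'PageNum');
# the file name "document_sentences.csv" and the query string are purely illustrative.
if __name__ == "__main__":
    data = pd.read_csv("document_sentences.csv")  # hypothetical input file
    query = phrase_preprocessing("safety requirements for pressure vessels")
    top_matches = similarityCheck(data, query)
    print(json.dumps(top_matches, indent=2))  # 20 best-matching sentences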