"""Phrase normalisation and spaCy-based sentence similarity ranking."""

import json
import logging
import re
import string

import nltk
import pandas as pd
import spacy
from nltk.stem import WordNetLemmatizer

logger = logging.getLogger(__name__)

wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_lg")

# Lookup helpers built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_NON_WORD = re.compile(r'\W')
_STOPWORD_SET = frozenset(stopwords)

# Column order of the result table produced by similarityCheck.
_RESULT_COLUMNS = [
    'index', 'score', 'sentence', 'paragraph', 'color', 'module', 'level',
    'heading1', 'heading2', 'heading3', 'heading4', 'pageNum',
]

# (result column, source column in ``data``) pairs copied verbatim per row.
_COPIED_FIELDS = (
    ('sentence', 'Sentences'),
    ('paragraph', 'Paragraph'),
    ('color', 'Color'),
    ('module', 'Module'),
    ('level', 'Level'),
    ('heading1', 'Heading1'),
    ('heading2', 'Heading2'),
    ('heading3', 'Heading3'),
    ('heading4', 'Heading4'),
    ('pageNum', 'PageNum'),
)


def phrase_preprocessing(phrase):
    """Normalise *phrase* and return it as a spaCy ``Doc``.

    Steps: strip ASCII punctuation, lower-case, split on non-word
    characters, drop English stop words, lemmatise each remaining token
    with WordNet, re-join with single spaces and run the result through
    the module-level ``nlp`` pipeline.

    Args:
        phrase: The raw query string.

    Returns:
        The spaCy ``Doc`` built from the normalised text.
    """
    cleaned = phrase.translate(_PUNCTUATION_TABLE).lower()
    # Splitting on a single non-word character yields '' for consecutive
    # separators (e.g. double spaces); skip those so the joined text does
    # not contain stray whitespace runs.
    tokens = [
        token
        for token in _NON_WORD.split(cleaned)
        if token and token not in _STOPWORD_SET
    ]
    normalised = ' '.join(wordnet_lemmatizer.lemmatize(token) for token in tokens)
    parsed = nlp(normalised)
    logger.debug("preprocessed phrase: %s", parsed)
    return parsed


def similarityCheck(data, phrase, top_n=20):
    """Rank the sentences in *data* by similarity to *phrase*.

    Only the first row seen for each distinct ``row['Index']`` value is
    scored; later rows carrying the same value are skipped.

    Args:
        data: DataFrame providing the columns ``new_sentence``, ``Index``,
            ``Sentences``, ``Paragraph``, ``Color``, ``Module``, ``Level``,
            ``Heading1``..``Heading4`` and ``PageNum``.
        phrase: Object exposing ``similarity(doc)``, e.g. the ``Doc``
            returned by ``phrase_preprocessing``.
        top_n: Maximum number of records to return (default 20).

    Returns:
        A list of dicts, best score first. Each dict holds the keys in
        ``_RESULT_COLUMNS`` plus ``level_0``, the 1-based position in which
        the row was accepted (``reset_index`` uses that name because an
        ``index`` column already exists).
    """
    seen_keys = set()
    records = []
    for label, row in data.iterrows():
        key = row['Index']
        if key in seen_keys:
            continue
        seen_keys.add(key)
        # Parse and score only rows that are kept; the spaCy call is the
        # expensive part of the loop.
        record = {
            'index': label,
            'score': phrase.similarity(nlp(row['new_sentence'])),
        }
        for target, source in _COPIED_FIELDS:
            record[target] = row[source]
        records.append(record)

    # dtype=object keeps values unconverted, as cell-by-cell assignment into
    # an empty frame would, so the JSON rendering of each value is unchanged.
    results = pd.DataFrame(records, columns=_RESULT_COLUMNS, dtype=object)
    results.index = range(1, len(results) + 1)
    results = results.sort_values(by=['score'], ascending=False).iloc[:top_n]

    # Round-trip through pandas' JSON writer so NaN/NumPy values come back
    # as plain JSON-compatible Python objects.
    return json.loads(results.reset_index().to_json(orient='records'))