import json
import re
import string  # library that contains the punctuation characters stripped below

import pandas as pd

# NLP libraries
import nltk
import spacy
from nltk.stem import WordNetLemmatizer

# Shared NLP resources: WordNet lemmatizer, English stopword list,
# and the large English spaCy model (its word vectors drive the similarity scores).
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_lg")
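# Note (not in the original file): the lines above expect the NLTK corpora
# 'stopwords' and 'wordnet' to be available (e.g. via nltk.download('stopwords')
# and nltk.download('wordnet')) and the spaCy model to be installed
# (python -m spacy download en_core_web_lg); otherwise they raise lookup errors.
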
def phrase_preprocessing(phrase):
    # Clean the query phrase: strip punctuation, lowercase, tokenize,
    # drop stopwords, lemmatize, and return the result as a spaCy Doc.
    phrase_punctuationfree = "".join(i for i in phrase if i not in string.punctuation)
    phrase_lower = phrase_punctuationfree.lower()
    phrase_tokens = re.split(r'\W+', phrase_lower)
    phrase_nostopwords = [i for i in phrase_tokens if i and i not in stopwords]
    phrase_lemma = [wordnet_lemmatizer.lemmatize(word) for word in phrase_nostopwords]
    new_phrase = ' '.join(phrase_lemma)
    new_nlp_phrase = nlp(new_phrase)
    print(new_nlp_phrase)
    return new_nlp_phrase
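
# Rough illustration (not from the original file): for an input such as
# "What are the safety requirements?", the steps above would typically yield
# the Doc "safety requirement" (punctuation and stopwords removed, tokens lemmatized).
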
def similarityCheck(data, phrase):
    # Compare the preprocessed query `phrase` (a spaCy Doc) against every
    # sentence in `data` and return the 20 most similar ones as JSON records.
    results = pd.DataFrame(columns=['index', 'score', 'sentence', 'paragraph', 'color',
                                    'module', 'level', 'heading1', 'heading2',
                                    'heading3', 'heading4', 'pageNum'])

    i = 1
    list_of_indices = []
    for index, row in data.iterrows():
        doc = nlp(row['new_sentence'])
        sim = phrase.similarity(doc)
        # Record only the first row seen for each source index; skip duplicates.
        if row['Index'] not in list_of_indices:
            results.loc[i, 'index'] = index
            results.loc[i, 'score'] = sim
            results.loc[i, 'sentence'] = row['Sentences']
            results.loc[i, 'paragraph'] = row['Paragraph']
            results.loc[i, 'color'] = row['Color']
            results.loc[i, 'module'] = row['Module']
            results.loc[i, 'level'] = row['Level']
            results.loc[i, 'heading1'] = row['Heading1']
            results.loc[i, 'heading2'] = row['Heading2']
            results.loc[i, 'heading3'] = row['Heading3']
            results.loc[i, 'heading4'] = row['Heading4']
            results.loc[i, 'pageNum'] = row['PageNum']
            i += 1
            list_of_indices.append(row['Index'])

    # Keep the 20 highest-scoring sentences and serialise them as a list of dicts.
    results.sort_values(by=['score'], ascending=False, inplace=True)
    results = results[0:20]
    results_records = results.reset_index().to_json(orient='records')
    results_records = json.loads(results_records)
    return results_records
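
# Minimal usage sketch (an assumption, not part of the original module): it presumes
# a DataFrame with the columns referenced above ('Index', 'new_sentence', 'Sentences',
# 'Paragraph', 'Color', 'Module', 'Level', 'Heading1'..'Heading4', 'PageNum');
# the file name "document_sentences.csv" and the query string are purely illustrative.
if __name__ == "__main__":
    data = pd.read_csv("document_sentences.csv")  # hypothetical input file
    query = phrase_preprocessing("safety requirements for pressure vessels")
    top_matches = similarityCheck(data, query)
    print(json.dumps(top_matches, indent=2))  # 20 best-matching sentences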