# IDDRS/App/similarity.py

import json
import pandas as pd
#library that contains punctuation
import string
#importing nlp library
import nltk
import re
import spacy
from nltk.stem import WordNetLemmatizer


# Shared NLP resources, created once when this module is imported and reused
# by the functions below.
# NOTE(review): these calls presumably require the NLTK 'stopwords'/'wordnet'
# data and the spaCy 'en_core_web_lg' model to be installed already - confirm
# against the deployment setup.

# Lemmatizer applied to each token of a query phrase.
wordnet_lemmatizer = WordNetLemmatizer()
# English stop words from the NLTK corpus; tokens found here are dropped.
stopwords = nltk.corpus.stopwords.words('english')
# spaCy pipeline used to parse text into documents for similarity scoring.
nlp = spacy.load("en_core_web_lg")


def phrase_preprocessing(phrase):
    """Normalise a raw query phrase and parse it with the spaCy pipeline.

    The phrase is stripped of ASCII punctuation, lower-cased, split into word
    tokens, filtered against the module-level ``stopwords`` list, lemmatized
    with ``wordnet_lemmatizer`` and finally re-joined with single spaces
    before being passed to ``nlp``.

    Args:
        phrase: The query text as a ``str``.

    Returns:
        The object returned by ``nlp`` for the normalised phrase (a spaCy
        document), suitable as the ``phrase`` argument of ``similarityCheck``.
    """
    # Delete every ASCII punctuation character in a single pass.
    without_punctuation = phrase.translate(
        str.maketrans('', '', string.punctuation)
    )
    lowered = without_punctuation.lower()

    # Collect runs of word characters. Unlike re.split on '\W', this never
    # produces empty tokens for leading, trailing or repeated separators, so
    # no stray spaces end up in the text handed to spaCy. The raw string also
    # avoids the invalid-escape warning that a plain '\W' literal triggers.
    tokens = re.findall(r'\w+', lowered)

    lemmas = [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopwords
    ]

    new_nlp_phrase = nlp(' '.join(lemmas))
    # Echo the normalised phrase to stdout, as the function has always done.
    print(new_nlp_phrase)
    return new_nlp_phrase

def similarityCheck(data, phrase):
    """Score each distinct sentence in ``data`` against ``phrase``; return the top 20.

    Rows are visited in order. Only the first row for each value of the
    ``'Index'`` column is scored; later rows with the same value are skipped.
    The score is ``phrase.similarity(nlp(row['new_sentence']))``.

    Args:
        data: A pandas DataFrame providing the columns ``'new_sentence'``,
            ``'Index'``, ``'Sentences'``, ``'Paragraph'``, ``'Color'``,
            ``'Module'``, ``'Level'``, ``'Heading1'`` to ``'Heading4'`` and
            ``'PageNum'``.
        phrase: An object exposing ``similarity(doc)``, such as the value
            returned by ``phrase_preprocessing``.

    Returns:
        A list of at most 20 dicts, ordered by descending score. Each dict
        has the keys ``'index'`` (the row label in ``data``), ``'score'``,
        ``'sentence'``, ``'paragraph'``, ``'color'``, ``'module'``,
        ``'level'``, ``'heading1'`` to ``'heading4'`` and ``'pageNum'``, plus
        ``'level_0'``: the 1-based position of the row among the distinct
        rows kept.
    """
    columns = ['index', 'score', 'sentence', 'paragraph', 'color', 'module',
               'level', 'heading1', 'heading2', 'heading3', 'heading4',
               'pageNum']
    # Output key -> column of ``data`` it is copied from.
    copied_columns = {
        'sentence': 'Sentences',
        'paragraph': 'Paragraph',
        'color': 'Color',
        'module': 'Module',
        'level': 'Level',
        'heading1': 'Heading1',
        'heading2': 'Heading2',
        'heading3': 'Heading3',
        'heading4': 'Heading4',
        'pageNum': 'PageNum',
    }

    seen_indices = set()
    records = []
    for index, row in data.iterrows():
        sentence_id = row['Index']
        # Check for duplicates first so that no spaCy parse or similarity
        # computation is spent on rows that would be discarded anyway.
        if sentence_id in seen_indices:
            continue
        seen_indices.add(sentence_id)

        record = {
            'index': index,
            'score': phrase.similarity(nlp(row['new_sentence'])),
        }
        for out_key, source_column in copied_columns.items():
            record[out_key] = row[source_column]
        records.append(record)

    # Build the frame in one step instead of enlarging it cell by cell.
    # dtype=object and the 1-based row labels mirror the frame the cell-wise
    # construction used to produce, so sorting and JSON encoding are unchanged.
    results = pd.DataFrame(records, columns=columns, dtype=object)
    results.index = range(1, len(records) + 1)

    results = results.sort_values(by=['score'], ascending=False).head(20)

    # The frame already has an 'index' column, so reset_index() stores the
    # row labels under 'level_0'. Round-tripping through to_json converts
    # NaN and numpy scalars into plain JSON-compatible Python values.
    return json.loads(results.reset_index().to_json(orient='records'))
Upload files 2022-06-17 12:06:51 +00:00			`import json`
			`import pandas as pd`
			`#library that contains punctuation`
			`import string`
			`#importing nlp library`
			`import nltk`
			`import re`
			`import spacy`
			`from nltk.stem import WordNetLemmatizer`


			`wordnet_lemmatizer = WordNetLemmatizer()`
			`stopwords = nltk.corpus.stopwords.words('english')`
			`nlp = spacy.load("en_core_web_lg")`



			`def phrase_preprocessing(phrase):`
			`phrase_punctuationfree="".join([i for i in phrase if i not in string.punctuation])`
			`phrase_lower= phrase_punctuationfree.lower()`
			`phrase_tokens = re.split('\W',phrase_lower)`
			`phrase_nostopwords= [i for i in phrase_tokens if i not in stopwords]`
			`phrase_lemma = [wordnet_lemmatizer.lemmatize(word) for word in phrase_nostopwords]`
			`new_phrase = ' '.join(phrase_lemma)`
			`new_nlp_phrase = nlp(new_phrase)`
			`print(new_nlp_phrase)`
			`return new_nlp_phrase`

			`def similarityCheck(data, phrase):`
			`results = pd.DataFrame(columns=['index','score','sentence', 'paragraph','color','module','level', 'heading1', 'heading2', 'heading3', 'heading4', 'pageNum'])`

			`i = 1`
			`list_of_indices = []`
			`for index, row in data.iterrows():`
			`doc = nlp(row['new_sentence'])`
			`sim = phrase.similarity(doc)`
			`if row['Index'] not in list_of_indices:`
			`results.loc[i, 'index'] = index`
			`results.loc[i, 'score'] = sim`
			`results.loc[i, 'sentence'] = row['Sentences']`
			`results.loc[i, 'paragraph'] = row['Paragraph']`
			`results.loc[i, 'color'] = row['Color']`
			`results.loc[i, 'module'] = row['Module']`
			`results.loc[i, 'level'] = row['Level']`
			`results.loc[i, 'heading1'] =row['Heading1']`
			`results.loc[i, 'heading2'] =row['Heading2']`
			`results.loc[i, 'heading3'] =row['Heading3']`
			`results.loc[i, 'heading4'] =row['Heading4']`
			`results.loc[i, 'pageNum'] =row['PageNum']`
			`i+=1`

			`list_of_indices.append(row['Index'])`
			`else:`
			`pass`

			`results.sort_values(by=['score'],ascending=False,inplace=True)`
			`results = results[0:20]`
			`results_records = results.reset_index().to_json(orient ='records')`
			`results_records = json.loads(results_records)`


			`return results_records`