IDDRS/App/tfidfSimilarity.py

import string
import os
import json
import math
import re
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np

# NLP libraries
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from .models import SearchResults

wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_lg")
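# This module builds a TF-IDF document-term matrix over the pre-processed
# IDDRS paragraphs loaded from static/json and ranks them against a user
# query by cosine similarity (see cosine_similarity() below).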
def preprocessQuery(phrase):
    # Strip punctuation, lower-case, split on non-word characters,
    # drop stopwords/empty tokens and lemmatize what remains.
    phrase = "".join([i for i in phrase if i not in string.punctuation])
    phrase = phrase.lower()
    phrase = re.split(r'\W+', phrase)
    phrase = [i for i in phrase if i and i not in stopwords]
    processedPhrase = [wordnet_lemmatizer.lemmatize(word) for word in phrase]
    return processedPhrase
def doc_freq(DataF, word):
    # Document frequency of a word; 0 if the word is not in the vocabulary.
    try:
        return DataF[word]
    except KeyError:
        return 0
# TF-IDF Cosine Similarity Ranking
def cosine_sim(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|); identical directions score 1, orthogonal 0.
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cos_sim
def readfile():
    # Load the pre-processed paragraph data shipped with the app.
    BASE_DIR = Path(__file__).resolve().parent.parent
    processedData = pd.read_json(os.path.join(BASE_DIR, 'static/json/prcoessedData.json'))
    df = pd.DataFrame(processedData)
    return df


df = readfile()
paragraphs = df['ProcessedSent'].tolist()
N = len(paragraphs)
# Extracting Data
def extractingData(paragraphs):
    processed_text = []
    for i in paragraphs[:N]:
        processed_text.append(word_tokenize(i))
    return processed_text


processed_text = extractingData(paragraphs)
# Calculating DF for all words
def calculateDF():
    DF = {}
    for i in range(N):
        tokens = processed_text[i]
        for w in tokens:
            try:
                DF[w].add(i)
            except KeyError:
                DF[w] = {i}
    # Replace each posting set with its size (the document frequency).
    for i in DF:
        DF[i] = len(DF[i])
    return DF


DF = calculateDF()
total_vocab_size = len(DF)
total_vocab = list(DF)
# TF-IDF weight for every (document, token) pair, with smoothed IDF.
doc = 0
tf_idf = {}
for i in range(N):
    tokens = processed_text[i]
    counter = Counter(tokens)
    words_count = len(tokens)
    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df = doc_freq(DF, token)
        idf = np.log((N + 1) / (df + 1))
        tf_idf[doc, token] = tf * idf
    doc += 1
# Vectorising tf-idf into a dense document-term matrix D (N x vocabulary size)
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except ValueError:
        pass
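# Each row of D is one paragraph's TF-IDF vector; gen_vector() below builds a
# query vector over the same vocabulary ordering, so the two can be compared
# directly with cosine similarity.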
def gen_vector(tokens):
    # Build the TF-IDF vector for a pre-processed query in the corpus vocabulary.
    Q = np.zeros(len(total_vocab))
    counter = Counter(tokens)
    words_count = len(tokens)
    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df = doc_freq(DF, token)
        idf = math.log((N + 1) / (df + 1))
        try:
            ind = total_vocab.index(token)
            Q[ind] = tf * idf
        except ValueError:
            # Query term not in the corpus vocabulary; leave its weight at 0.
            pass
    return Q
def cosine_similarity(k, query):
    # Rank all paragraphs against the query and return the top-k matches
    # with their metadata as a list of JSON records.
    df2 = readfile()
    preprocessed_query = preprocessQuery(query)
    d_cosines = []
    query_vector = gen_vector(preprocessed_query)
    for d in D:
        score = cosine_sim(query_vector, d)
        if math.isnan(score):
            score = 0
        d_cosines.append(score)
    # Indices of the k highest-scoring paragraphs, best first.
    out = np.array(d_cosines).argsort()[-k:][::-1]
    dff = pd.DataFrame(columns=['Index', 'Paragraph', 'Color', 'Level', 'LevelName',
                                'Title', 'Module', 'PageNum', 'Heading1', 'Heading2',
                                'Heading3', 'Heading4', 'Sentence'])
    for i in out:
        dff.loc[i, 'Index'] = i
        dff.loc[i, 'Level'] = df2['Level'].iloc[i]
        dff.loc[i, 'LevelName'] = df2['LevelName'].iloc[i]
        dff.loc[i, 'Title'] = df2['Title'].iloc[i]
        dff.loc[i, 'Paragraph'] = df2['Paragraph'].iloc[i]
        dff.loc[i, 'Color'] = df2['Color'].iloc[i]
        dff.loc[i, 'Module'] = df2['Module'].iloc[i]
        dff.loc[i, 'Heading1'] = df2['Heading1'].iloc[i]
        dff.loc[i, 'Heading2'] = df2['Heading2'].iloc[i]
        dff.loc[i, 'Heading3'] = df2['Heading3'].iloc[i]
        dff.loc[i, 'Heading4'] = df2['Heading4'].iloc[i]
        dff.loc[i, 'PageNum'] = df2['PageNum'].iloc[i]
        dff.loc[i, 'Sentence'] = df2['Sentence'].iloc[i]
    results = dff.reset_index().to_json(orient='records')
    results = json.loads(results)
    return results
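

# Example usage (illustrative; assumes this module is imported inside the Django
# app so that the relative import and the static JSON file both resolve):
#
#     from App.tfidfSimilarity import cosine_similarity
#     results = cosine_similarity(10, "community violence reduction")
#     # `results` is a list of dicts, one per ranked paragraph, carrying the
#     # metadata columns (Module, Title, PageNum, Headings, Sentence, ...).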