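"""TF-IDF search helpers for the App application.

Assumed context (not stated in this file): a Django-style app layout, given the
relative ``.models`` import and the ``App/static`` path. The module loads a
preprocessed corpus from JSON, builds a dense document-term TF-IDF matrix at
import time, and exposes ``cosine_similarity(k, query)``, which returns the k
best-matching paragraphs as JSON-style records.
"""
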
import string
from string import punctuation
from pathlib import Path
import os
import json
import pandas as pd
from urllib3 import Retry
import numpy as np
from collections import Counter
import math

# NLP libraries
import nltk
import re
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from .models import SearchResults

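# Shared NLP resources. Assumes the NLTK stopword list, Punkt tokenizer data and
# WordNet corpus, plus the spaCy 'en_core_web_lg' model, are already installed.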
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_lg")


def preprocessQuery(phrase):
    # Strip punctuation, lowercase, split on non-word characters, drop English
    # stopwords and lemmatise what is left.
    phrase = "".join([i for i in phrase if i not in string.punctuation])
    phrase = phrase.lower()
    phrase = re.split(r'\W+', phrase)
    phrase = [i for i in phrase if i not in stopwords]
    processedPhrase = [wordnet_lemmatizer.lemmatize(word) for word in phrase]
    return processedPhrase

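# Illustrative example (the exact tokens depend on the installed NLTK data):
# preprocessQuery("What is machine learning?") -> ['machine', 'learning']
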
def doc_freq(DataF, word):
    # Document frequency lookup; returns 0 for words missing from the DF table.
    c = 0
    try:
        c = DataF[word]
    except KeyError:
        pass
    return c


# TF-IDF Cosine Similarity Ranking
def cosine_sim(a, b):
    # Cosine of the angle between the two vectors: dot(a, b) / (|a| * |b|).
    # Note: returns nan when either vector has zero norm.
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cos_sim

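# Illustrative: cosine_sim(np.array([1.0, 0.0]), np.array([1.0, 1.0])) is about 0.707,
# parallel vectors give 1.0, and an all-zero query vector yields nan (replaced by 0
# in cosine_similarity below).
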
def readfile():
    # Load the preprocessed corpus shipped with the app; the path is resolved
    # relative to the project base directory.
    BASE_DIR = Path(__file__).resolve().parent.parent

    processedData = pd.read_json(os.path.join(BASE_DIR, 'App/static/json/prcessedData.json'))
    df = pd.DataFrame(processedData)
    return df


df = readfile()
paragraphs = df['ProcessedSent'].tolist()
N = len(paragraphs)


# Extracting Data
def extractingData(paragraphs):
    # Tokenise each preprocessed paragraph into a list of words.
    processed_text = []
    for i in paragraphs[:N]:
        processed_text.append(word_tokenize(i))
    return processed_text


processed_text = extractingData(paragraphs)

# Calculating DF for all words
def calculateDF():
    # DF[word] = number of documents (paragraphs) the word appears in.
    DF = {}
    for i in range(N):
        tokens = processed_text[i]
        for w in tokens:
            try:
                DF[w].add(i)
            except KeyError:
                DF[w] = {i}
    for i in DF:
        DF[i] = len(DF[i])

    return DF


DF = calculateDF()

total_vocab_size = len(DF)
total_vocab = [x for x in DF]

doc = 0
tf_idf = {}

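# TF-IDF weights for every (document, token) pair, keyed as tf_idf[doc, token]:
#   tf  = occurrences of the token in the document / tokens in the document
#   idf = log((N + 1) / (df + 1))    # the +1 smoothing avoids division by zero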
for i in range(N):
    tokens = processed_text[i]

    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df = doc_freq(DF, token)
        idf = np.log((N + 1) / (df + 1))
        tf_idf[doc, token] = tf * idf
    doc += 1

# Vectorising tf-idf: D is the dense N x vocabulary-size document-term matrix.
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except ValueError:
        pass


def gen_vector(tokens):
    # Build a TF-IDF vector for the query in the same vocabulary space as D.
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df = doc_freq(DF, token)
        idf = math.log((N + 1) / (df + 1))
        try:
            ind = total_vocab.index(token)
            Q[ind] = tf * idf
        except ValueError:
            pass

    return Q

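# Query tokens that never occur in the corpus are simply skipped, so a query made
# entirely of unseen words produces a zero vector (and hence nan similarities,
# which cosine_similarity below maps to 0).
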
def cosine_similarity(k, query):
    # Rank every paragraph against the query and return the top k as records.
    df2 = readfile()
    preprocessed_query = preprocessQuery(query)
    d_cosines = []
    query_vector = gen_vector(preprocessed_query)

    for d in D:
        score = cosine_sim(query_vector, d)
        if math.isnan(score):
            score = 0
        d_cosines.append(score)

    # Indices of the k highest-scoring paragraphs, best match first.
    out = np.array(d_cosines).argsort()[-k:][::-1]

    dff = pd.DataFrame(columns=['Index', 'Paragraph', 'Color', 'Level', 'LevelName',
                                'Title', 'Module', 'PageNum', 'Heading1', 'Heading2',
                                'Heading3', 'Heading4', 'Sentence'])
    for i in out:
        dff.loc[i, 'Index'] = i
        dff.loc[i, 'Level'] = df2['Level'].iloc[i]
        dff.loc[i, 'LevelName'] = df2['LevelName'].iloc[i]
        dff.loc[i, 'Title'] = df2['Title'].iloc[i]
        dff.loc[i, 'Paragraph'] = df2['Paragraph'].iloc[i]
        dff.loc[i, 'Color'] = df2['Color'].iloc[i]
        dff.loc[i, 'Module'] = df2['Module'].iloc[i]
        dff.loc[i, 'Heading1'] = df2['Heading1'].iloc[i]
        dff.loc[i, 'Heading2'] = df2['Heading2'].iloc[i]
        dff.loc[i, 'Heading3'] = df2['Heading3'].iloc[i]
        dff.loc[i, 'Heading4'] = df2['Heading4'].iloc[i]
        dff.loc[i, 'PageNum'] = df2['PageNum'].iloc[i]
        dff.loc[i, 'Sentence'] = df2['Sentence'].iloc[i]

    results = dff.reset_index().to_json(orient='records')
    results = json.loads(results)

    return results
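# Typical use from calling code, e.g. a view (the query string and k here are made
# up for illustration): cosine_similarity(10, "pump maintenance checklist") returns
# a list of dicts, one per matching paragraph.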