import json
import math
import os
import re
import string
from collections import Counter
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from .models import SearchResults

# NLTK lemmatizer and English stop-word list.
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

# spaCy large English model, loaded once at import time.
nlp = spacy.load("en_core_web_lg")

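# One-time setup (illustrative): the NLTK data packages used below can be
# fetched with
#   nltk.download('punkt')
#   nltk.download('stopwords')
#   nltk.download('wordnet')
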
def preprocessQuery(phrase):
    """Lower-case, strip punctuation, tokenize, drop stop words and lemmatize a query."""
    phrase = "".join(ch for ch in phrase if ch not in string.punctuation)
    phrase = phrase.lower()
    tokens = re.split(r'\W+', phrase)
    tokens = [t for t in tokens if t and t not in stopwords]
    return [wordnet_lemmatizer.lemmatize(t) for t in tokens]

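# Illustrative example (exact output depends on the installed NLTK data):
#   preprocessQuery("What are the engine's oil limits?")
#   -> ['engine', 'oil', 'limit']
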
def doc_freq(DataF, word):
    """Document frequency of `word`, or 0 if it never occurs in the corpus."""
    try:
        return DataF[word]
    except KeyError:
        return 0

# TF-IDF cosine-similarity ranking
def cosine_sim(a, b):
    """Cosine similarity of two vectors (NaN if either has zero norm)."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

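# Quick sanity check (illustrative values):
#   cosine_sim(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> 1.0
#   cosine_sim(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # -> 0.0
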
def readfile():
    """Load the preprocessed corpus shipped as a static JSON asset."""
    BASE_DIR = Path(__file__).resolve().parent.parent
    processedData = pd.read_json(os.path.join(BASE_DIR, 'static/json/prcoessedData.json'))
    return pd.DataFrame(processedData)

# Load the corpus once at import time.
df = readfile()
paragraphs = df['ProcessedSent'].tolist()
N = len(paragraphs)

# Tokenize every paragraph in the corpus.
def extractingData(paragraphs):
    return [word_tokenize(p) for p in paragraphs]


processed_text = extractingData(paragraphs)

# Document frequency (DF): for each token, the set of paragraphs containing it,
# collapsed to a count.
def calculateDF():
    DF = {}
    for i, tokens in enumerate(processed_text):
        for w in tokens:
            try:
                DF[w].add(i)
            except KeyError:
                DF[w] = {i}
    for w in DF:
        DF[w] = len(DF[w])
    return DF

DF = calculateDF()
total_vocab_size = len(DF)
total_vocab = list(DF)

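# DF maps each token to a paragraph count, e.g. (illustrative values):
#   DF = {'engine': 42, 'oil': 17, ...}
#   doc_freq(DF, 'engine')   # -> 42
#   doc_freq(DF, 'missing')  # -> 0
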
# TF-IDF score for every (paragraph, token) pair, using a smoothed idf.
tf_idf = {}
for doc, tokens in enumerate(processed_text):
    counter = Counter(tokens)
    words_count = len(tokens)
    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df_t = doc_freq(DF, token)  # renamed from `df` to avoid shadowing the global DataFrame
        idf = np.log((N + 1) / (df_t + 1))
        tf_idf[doc, token] = tf * idf

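# The score computed above is the standard smoothed variant:
#   tf-idf(t, d) = (count(t, d) / |d|) * ln((N + 1) / (df(t) + 1))
# so a token occurring in every paragraph (df = N) gets idf = ln(1) = 0
# and contributes nothing to the ranking.
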
# Materialize the scores as a dense document-term matrix.
D = np.zeros((N, total_vocab_size))
for (doc_id, token), score in tf_idf.items():
    try:
        D[doc_id][total_vocab.index(token)] = score
    except ValueError:
        pass

def gen_vector(tokens):
    """Project a preprocessed query into the same tf-idf space as the matrix D."""
    Q = np.zeros(len(total_vocab))
    counter = Counter(tokens)
    words_count = len(tokens)
    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df_t = doc_freq(DF, token)
        idf = math.log((N + 1) / (df_t + 1))
        try:
            Q[total_vocab.index(token)] = tf * idf
        except ValueError:
            # Token absent from the corpus vocabulary; its weight stays zero.
            pass
    return Q

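# Example (illustrative): vectorize a query and score it against one paragraph.
#   q = gen_vector(preprocessQuery("engine oil limits"))
#   cosine_sim(q, D[0])  # -> cosine score (NaN if q is all zeros)
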
def cosine_similarity(k, query):
    """Return the top-k corpus rows for `query`, ranked by tf-idf cosine similarity."""
    df2 = readfile()
    preprocessed_query = preprocessQuery(query)
    query_vector = gen_vector(preprocessed_query)

    # Score the query against every document vector; a NaN score
    # (from an all-zero vector) counts as no match.
    d_cosines = []
    for d in D:
        score = cosine_sim(query_vector, d)
        d_cosines.append(0 if math.isnan(score) else score)

    # Indices of the k best-scoring paragraphs, best first.
    out = np.array(d_cosines).argsort()[-k:][::-1]

    columns = ['Index', 'Paragraph', 'Color', 'Level', 'LevelName', 'Title', 'Module',
               'PageNum', 'Heading1', 'Heading2', 'Heading3', 'Heading4', 'Sentence']
    dff = pd.DataFrame(columns=columns)
    for i in out:
        dff.loc[i, 'Index'] = i
        for col in columns[1:]:
            dff.loc[i, col] = df2[col].iloc[i]

    # Serialize to a list of plain dicts for the caller.
    results = dff.reset_index().to_json(orient='records')
    return json.loads(results)
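
if __name__ == "__main__":
    # Minimal smoke test (illustrative; run via `python -m <package>.<module>` so the
    # relative import of SearchResults resolves, and the query string is made up).
    for hit in cosine_similarity(3, "sample search query"):
        print(hit['Index'], hit['Title'], hit['PageNum'])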