Upload Project

This commit is contained in:
louai98 2023-11-20 15:31:13 +01:00
parent 49eb16d0ac
commit 5592f5e7f0
168 changed files with 40547 additions and 363488 deletions

View File

@ -1,24 +0,0 @@
# Generated by Django 4.1.3 on 2023-07-07 07:17
from django.db import migrations, models
class Migration(migrations.Migration):
    initial = True
    dependencies = [
    ]
    operations = [
        migrations.CreateModel(
            name='NewContentTracker',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('paragraphID', models.CharField(max_length=10)),
                ('levelNumber', models.CharField(max_length=200)),
                ('standardTitle', models.CharField(max_length=200)),
                ('paragraph', models.TextField()),
            ],
        ),
    ]

View File

@ -1,12 +0,0 @@
from django.db import models
# Create your models here.
class NewContentTracker(models.Model):
    paragraphID = models.CharField(max_length=10)
    levelNumber = models.CharField(max_length=200)
    standardTitle = models.CharField(max_length=200)
    paragraph = models.TextField()
    def __str__(self):
        return str(self.id)

View File

@ -1,137 +0,0 @@
import json
import nltk
from spacy.matcher import Matcher
import spacy
import os
from os import walk
from pathlib import Path
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, MWETokenizer
from string import punctuation
#from App.models import Level
class PreprocessData:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_lg')
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.BASE_DIR = Path(__file__).resolve().parent.parent
    def remove_punctuation(self, text):
        my_punctuation = punctuation.replace(".", "")
        my_punctuation = my_punctuation.replace("/", "")
        punctuationfree = "".join([i for i in text if i not in my_punctuation])
        return punctuationfree
    def remove_stopwords(self, text):
        output = [i for i in text if i not in self.stopwords]
        return output
    def tokenization(self, text):
        tokens = re.split(r'\W+', text)
        return tokens
    def lemmatizer(self, text):
        lemm_text = [self.wordnet_lemmatizer.lemmatize(word) for word in text]
        return lemm_text
    def merge_files(self, levels):
        levels.sort()
        allData = list()
        for level in levels:
            filenames = next(walk(os.path.join(self.BASE_DIR, 'static/data/'+level)), (None, None, []))[2]
            for file in filenames:
                with open(os.path.join(self.BASE_DIR, 'static/data/'+level+"/"+file), 'r') as f:
                    data = json.load(f)
                    for dd in data:
                        dd['LevelName'] = level
                        dd['Module'] = file[11:-5].replace('-', ' ')
                        dd['Title'] = file[:-5]
                    allData.extend(data)
        for idx, d in enumerate(allData):
            d['ParagraphID'] = idx
        return allData
    def add_compliance(self, paragraphs):
        df = pd.DataFrame(paragraphs)
        matcher = Matcher(self.nlp.vocab)
        pattern1 = [{'LOWER': 'shall'}]
        pattern2 = [{'LOWER': 'should'}]
        pattern3 = [{'LOWER': 'may'}]
        pattern4 = [{'LOWER': 'must'}]
        pattern5 = [{'LOWER': 'can'}]
        matcher.add('Shall', [pattern1])
        matcher.add('Should', [pattern2])
        matcher.add('May', [pattern3])
        matcher.add('Must', [pattern4])
        matcher.add('Can', [pattern5])
        for index, row in df.iterrows():
            doc = self.nlp(row['Paragraph'])
            found_matches = matcher(doc)
            if found_matches:
                for match_id, start, end in found_matches:
                    string_id = self.nlp.vocab.strings[match_id]
                    span = doc[start:end]
                    if string_id == 'Shall':
                        df.loc[index, 'Shall'] = 1
                    if string_id == 'Should':
                        df.loc[index, 'Should'] = 1
                    if string_id == 'May':
                        df.loc[index, 'May'] = 1
                    if string_id == 'Must':
                        df.loc[index, 'Must'] = 1
                    if string_id == 'Can':
                        df.loc[index, 'Can'] = 1
        return df
    def title_sent(self, title, sent):
        new_sent = title+': '+sent
        return new_sent
    def split_into_sentences(self, data):
        df = data
        df['Sentence'] = df['Paragraph'].apply(lambda xx: nltk.tokenize.sent_tokenize(xx))
        df = df.explode("Sentence").reset_index(drop=True)
        df['ProcessedSent'] = df['Sentence'].apply(lambda xx: self.remove_punctuation(xx))
        df['ProcessedSent'] = df['ProcessedSent'].apply(lambda xx: xx.lower())
        tokenizer = MWETokenizer()
        df['ProcessedSent'] = df['ProcessedSent'].apply(lambda xx: tokenizer.tokenize(word_tokenize(xx)))
        df['ProcessedSent'] = df['ProcessedSent'].apply(lambda xx: self.remove_stopwords(xx))
        df['ProcessedSent'] = df['ProcessedSent'].apply(lambda xx: self.lemmatizer(xx))
        df['ProcessedSent'] = df['ProcessedSent'].apply(lambda xx: ' '.join(xx))
        df['ProcessedSent'] = df.Module+' '+df.ProcessedSent
        return df
    def pre_process_files(self):
        print('Pre-processing started')
        levels = next(walk(os.path.join(self.BASE_DIR, 'static/data')), (None, None, []))[1]
        # levels = ['1 General IDDRS']
        paragraphs = self.merge_files(levels)
        df = self.add_compliance(paragraphs)
        processed_data = self.split_into_sentences(df)
        processed_data.to_json(os.path.join(self.BASE_DIR, 'static/searchable/data2.json'), orient='records', indent=4)
        print('Pre-processing finished')

View File

@ -1,7 +0,0 @@
from rest_framework import serializers
from .models import NewContentTracker
class NewContentTrackerSerializer(serializers.ModelSerializer):
    class Meta:
        model = NewContentTracker
        fields = '__all__'

View File

@ -1,32 +0,0 @@
from django.urls import path, include
from rest_framework import routers
from .views import LevelViewSet, StandardsViewSet, NewContentTrackerViewSet
from . import views
router = routers.DefaultRouter()
router.register(r'levels', LevelViewSet)
router.register(r'standards', StandardsViewSet)
router.register(r'NewContentTracker', NewContentTrackerViewSet)
urlpatterns = [
    path('', include(router.urls)),
    path('content-list/', views.contentList, name='content-list'),
    path('content-create/', views.contentCreate, name='content-create'),
    path('content-detail/', views.contentDetail, name='content-detail'),
    path('content-update/', views.contentUpdate, name='content-update'),
    path('content-delete/', views.contentDelete, name='content-delete'),
    path('level-submit/', views.levelSubmit, name='level-submit'),
    path('level-delete/', views.levelDelete, name='level-delete'),
    path('standard-submit/', views.standardSubmit, name='standard-submit'),
    path('standard-delete/', views.standardDelete, name='standard-delete'),
    path('pre-process/', views.preprocess, name='pre-process'),
    path('api/', include('admin_api.api.urls')),
    path('login/', views.loginPage, name='login'),
    path('logout/', views.logoutPage, name='logout'),
]

View File

@ -1,314 +0,0 @@
from http.client import HTTPResponse
from django.shortcuts import render, redirect
from rest_framework import viewsets
from search_tfidf.models import Level, Standards
from .models import NewContentTracker
from .serializer import NewContentTrackerSerializer
from search_tfidf.serializer import LevelSerializer, StandardsSerializer
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
import json
import os
from pathlib import Path
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from django.db.models import Max
import shutil
from django.contrib.auth import authenticate, login, logout
from django.contrib import messages
from .preprocessData import PreprocessData
# Create your views here.
@permission_classes([IsAuthenticated])
class LevelViewSet(viewsets.ModelViewSet):
    queryset = Level.objects.all()
    serializer_class = LevelSerializer
class StandardsViewSet(viewsets.ModelViewSet):
    queryset = Standards.objects.all()
    serializer_class = StandardsSerializer
class NewContentTrackerViewSet(viewsets.ModelViewSet):
    queryset = NewContentTracker.objects.all()
    serializer_class = NewContentTrackerSerializer
BASE_DIR = Path(__file__).resolve().parent.parent
# ################################################################
# #######################Contents#################################
# ################################################################
@csrf_exempt
@api_view(['GET'])
def contentList(request):
    # Get the values from the request parameters
    level = request.GET.get('level')
    standard = request.GET.get('standard')
    data = ""
    module_path = filePath(level, standard)
    # Read the JSON file
    with open(module_path) as f:
        data = json.load(f)
    # Return the filtered data as a JSON response
    return JsonResponse({'contents': data})
@csrf_exempt
@api_view(['POST'])
def contentCreate(request):
    level = request.data['params']['level']
    standard = request.data['params']['standard']
    #print(request.data['params']['formData']['Heading1'])
    module_path = filePath(level, standard)
    levelInfo = Level.objects.get(levelNumber=level)
    standardInfo = Standards.objects.get(standardTitle=standard)
    with open(module_path) as json_file:
        data = json.load(json_file)
    new_id = 0
    ids = []
    for obj in data:
        ids.append(obj['ID'])
    if len(ids) > 0:
        new_id = max(ids)+1
    new_obj = request.data['params']['formData']
    new_obj['ID'] = new_id
    new_obj['Color'] = levelInfo.levelColor
    new_obj['Level'] = levelInfo.levelNumber
    new_obj['LevelName'] = levelInfo.levelName
    new_obj['Title'] = standardInfo.standardTitle
    new_obj['Module'] = standardInfo.standardTitle
    data.append(new_obj)
    with open(module_path, 'w') as f:
        json.dump(data, f, indent=4)
    haveToPreProcess(new_id, levelInfo.levelNumber, standardInfo.standardTitle, new_obj['Paragraph'])
    return Response('')
@csrf_exempt
@api_view(['GET'])
def contentDetail(request):
    level = request.GET.get('level')
    standard = request.GET.get('standard')
    id = request.GET.get('id')
    module_path = filePath(level, standard)
    with open(module_path) as f:
        data = json.load(f)
    for obj in data:
        if obj['ID'] == int(id):
            data = obj
    return JsonResponse({'paragraph': data})
@csrf_exempt
@api_view(['POST'])
def contentUpdate(request):
    level = request.data['params']['level']
    standard = request.data['params']['standard']
    id = request.data['params']['id']
    updated_content = request.data['params']['formData']
    module_path = filePath(level, standard)
    with open(module_path) as f:
        data = json.load(f)
    for obj in data:
        if obj['ID'] == int(id):
            obj['Heading1'] = updated_content['Heading1']
            obj['Heading2'] = updated_content['Heading2']
            obj['Heading3'] = updated_content['Heading3']
            obj['Heading4'] = updated_content['Heading4']
            obj['Paragraph'] = updated_content['Paragraph']
            obj['PageNum'] = updated_content['PageNum']
    with open(module_path, 'w') as f:
        json.dump(data, f)
    haveToPreProcess(obj['ID'], level, standard, updated_content['Paragraph'])
    return Response('')
@csrf_exempt
@api_view(['POST'])
def contentDelete(request):
    level = request.data['params']['level']
    standard = request.data['params']['standard']
    id = request.data['params']['id']
    print(level, standard, id)
    module_path = filePath(level, standard)
    with open(module_path) as f:
        data = json.load(f)
    for i in range(len(data)):
        if data[i]['ID'] == int(id):
            data.pop(i)
            break
    with open(module_path, 'w') as f:
        json.dump(data, f)
    haveToPreProcess(id, level, standard, 'Deleted')
    return Response('')
# ################################################################
# #######################Levels###################################
# ################################################################
@csrf_exempt
@api_view(['POST'])
def levelSubmit(request):
    data = request.data['params']['editingRow']
    if 'id' in data:
        level = Level.objects.get(id=data['id'])
        serializer = LevelSerializer(instance=level, data=data)
        if serializer.is_valid():
            serializer.save()
    else:
        newLevelNumber = Level.objects.aggregate(Max('levelNumber'))['levelNumber__max']+1
        data['levelNumber'] = newLevelNumber
        serializer = LevelSerializer(data=data)
        if serializer.is_valid():
            serializer.save()
        level_new_dir = os.path.join(BASE_DIR, 'static/data/'+str(newLevelNumber)+'/')
        os.makedirs(level_new_dir, exist_ok=True)
    return Response('')
@csrf_exempt
@api_view(['POST'])
def levelDelete(request):
    data = request.data['params']['rowData']
    level = Level.objects.get(id=data['id'])
    level.delete()
    level_del_dir = os.path.join(BASE_DIR, 'static/data/'+str(data['levelNumber'])+'/')
    shutil.rmtree(level_del_dir)
    haveToPreProcess(data['id'], data['levelNumber'], 'No', 'LevelDeleted')
    return Response('')
# ################################################################
# #######################Standards################################
# ################################################################
@csrf_exempt
@api_view(['POST'])
def standardSubmit(request):
    data = request.data['params']['editingRow']
    if 'id' in data:
        standard = Standards.objects.get(id=data['id'])
        current_path = os.path.join(BASE_DIR, 'static/data/'+str(standard.standardLevel)+'/'+standard.standardTitle+'.json')
        new_path = os.path.join(BASE_DIR, 'static/data/'+str(standard.standardLevel)+'/'+data['standardTitle']+'.json')
        serializer = StandardsSerializer(instance=standard, data=data)
        if serializer.is_valid():
            serializer.save()
            os.rename(current_path, new_path)
    else:
        levelRow = Level.objects.get(id=data['levelID'])
        data['levelID'] = int(data['levelID'])
        data['standardLevel'] = levelRow.levelNumber
        serializer = StandardsSerializer(data=data)
        if serializer.is_valid():
            serializer.save()
            newFile = []
            standard_new = os.path.join(BASE_DIR, 'static/data/'+str(levelRow.levelNumber)+'/'+data['standardTitle']+'.json')
            with open(standard_new, 'w') as file:
                # Write the JSON data to the file
                json.dump(newFile, file)
        else:
            print(serializer.errors)
    return Response('')
@csrf_exempt
@api_view(['POST'])
def standardDelete(request):
    data = request.data['params']['rowData']
    standard = Standards.objects.get(id=data['id'])
    standard.delete()
    haveToPreProcess(data['id'], data['standardLevel'], data['standardTitle'], 'StandardDeleted')
    return Response('Item successfully deleted!')
# ################################################################
# #######################pre-process##############################
# ################################################################
@csrf_exempt
@api_view(['POST'])
def preprocess(request):
    process_files = PreprocessData()
    process_files.pre_process_files()
    NewContentTracker.objects.all().delete()
    return Response('Processed successfully')
# ################################################################
# #######################Authentication###########################
# ################################################################
def loginPage(request):
    pass
def logoutPage(request):
    pass
# ################################################################
# ################################################################
# ################################################################
def filePath(level_input, standard_input):
    standards_dir = os.path.join(BASE_DIR, 'static/data/')
    file_path = ''
    levels = next(os.walk(os.path.join(BASE_DIR, 'static/data')), (None, None, []))[1]
    if str(level_input) in levels:
        filenames = next(os.walk(standards_dir+level_input), (None, None, []))[2]
        for file in filenames:
            if str(standard_input) in file:
                file_path = standards_dir+str(level_input)+'/'+file
    return file_path
def haveToPreProcess(id, levelNumber, standardTitle, paragraph):
    ######################################################
    ############NewContentTracker#########################
    newContent = {}
    newContent['paragraphID'] = id
    newContent['levelNumber'] = levelNumber
    newContent['standardTitle'] = standardTitle
    newContent['paragraph'] = paragraph
    serializer = NewContentTrackerSerializer(data=newContent)
    if serializer.is_valid():
        serializer.save()
    return 'Added'

data_api/CreateIndexES.py (new file, 60 lines)
View File

@ -0,0 +1,60 @@
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import os
from os import walk
from pathlib import Path
class CreateIndexES:
    def __init__(self):
        self.ELASTIC_PASSWORD = "p-P7luUvrPggWrS4UQsy"
        self.BASE_DIR = Path(__file__).resolve().parent.parent
    def createIndex(self):
        # Create the client instance
        es = Elasticsearch(
            "https://localhost:9200",
            ca_certs="/etc/elasticsearch/certs/http_ca.crt",
            basic_auth=("elastic", self.ELASTIC_PASSWORD)
        )
        index_name = "iddrs"
        mapping = {
            "mappings": {
                "properties": {
                    "Title": {"type": "text"},
                    "Paragraph": {"type": "text"},
                }
            }
        }
        if not es.indices.exists(index=index_name):
            print("Index does not exist. Creating...")
            es.indices.create(index=index_name, body=mapping)
        else:
            print("Index already exists. Deleting and recreating...")
            # Delete the index (including all documents)
            es.indices.delete(index=index_name, ignore=[400, 404])
            es.indices.create(index=index_name, body=mapping)
        #es.indices.create(index=index_name, body=mapping, ignore=400)  # Ignore if the index already exists
        filenames = next(walk(os.path.join(self.BASE_DIR, 'static/data/json/')), (None, None, []))[2]
        for file in filenames:
            with open(os.path.join(self.BASE_DIR, 'static/data/json/')+file, 'r') as f:
                data = json.load(f)
                actions = [
                    {
                        "_op_type": "index",
                        "_index": index_name,
                        #"_id": i + 1,
                        "_source": document,
                    }
                    for i, document in enumerate(data)
                ]
                success, failed = bulk(es, actions, index=index_name, raise_on_error=False)
                print(success)
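For reference, a minimal sketch of driving this class by hand — it assumes the same local Elasticsearch node, CA certificate, and hard-coded password used above, and that static/data/json/ has already been populated by the preprocessing step:

# Sketch only: requires a reachable Elasticsearch instance and existing JSON files.
from data_api.CreateIndexES import CreateIndexES

indexer = CreateIndexES()
indexer.createIndex()  # recreates the "iddrs" index, then bulk-loads every file in static/data/json/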

data_api/PreprocessFile.py (new file, 243 lines)
View File

@ -0,0 +1,243 @@
from docx import Document
import os
import fitz
import re
import uuid
import shutil
import json
import logging
from pathlib import Path
import spacy
from spacy.matcher import Matcher
from .models import Levels, StandardsList
from .CreateIndexES import CreateIndexES
class PreprocessFile:
    def __init__(self):
        self.BASE_DIR = Path(__file__).resolve().parent.parent
    def find_summary_page(self, pdf_path, summary_text):
        doc = fitz.open(pdf_path)
        summary_count = 0
        for page_num in range(len(doc)):
            page = doc[page_num]
            text_instances = page.get_text("text")
            # Counting the number of occurrences of the summary text on the page
            summary_count += text_instances.count(summary_text)
            if summary_count >= 2:
                return page_num
        else:
            page_num = 0
        return page_num
    def find_text_in_pdf_from_summary(self, pdf_path, search_text, summary_text):
        summary_page = self.find_summary_page(pdf_path, summary_text)
        if summary_page is None:
            # print("Summary not found in the PDF.")
            return None
        doc = fitz.open(pdf_path)
        for page_num in range(
            summary_page + 1, len(doc)
        ):  # Start searching after the 2nd summary
            page = doc[page_num]
            text_instances = page.get_text("text")
            # Use regex to find instances of search_text without anything following it on the same line
            regex_pattern = re.compile(rf"\b{search_text}\b(?![^\n]*\S)", re.IGNORECASE)
            match = regex_pattern.search(text_instances)
            if match:
                # print(f"Text found on page {page_num + 1}, after the 2nd summary.")
                return page_num + 1
        # print("Text not found in the PDF.")
        return None
    # Custom serialization function
    def custom_json_serialization(self, text):
        # Replace newline characters with spaces
        return text.replace("\n", " ")
    def process_standards(self):
        # BASE_DIR = Path(__file__).resolve().parent.parent
        nlp = spacy.load("en_core_web_sm")
        matcher = Matcher(nlp.vocab)
        pattern1 = [{"LOWER": "shall"}]
        pattern2 = [{"LOWER": "should"}]
        pattern3 = [{"LOWER": "may"}]
        pattern4 = [{"LOWER": "must"}]
        pattern5 = [{"LOWER": "can"}]
        matcher.add("Shall", [pattern1])
        matcher.add("Should", [pattern2])
        matcher.add("May", [pattern3])
        matcher.add("Must", [pattern4])
        matcher.add("Can", [pattern5])
        root_json_dir = os.path.join(
            self.BASE_DIR, "static/data/json/"
        )  # "../Standards/json"
        root_data_json_dir = os.path.join(
            self.BASE_DIR, "static/data/"
        )  # "../Standards/json"
        summary_text = "Summary"
        data = []
        # for logging
        number_of_successed_files = 0
        number_of_sections = 0
        number_of_sections_after_cleaning = 0
        number_of_cleaned_sections = 0
        # Check if the json directory exists
        if os.path.exists(root_json_dir):
            # Delete the directory and its contents
            shutil.rmtree(root_json_dir)
        # Create a new directory
        os.makedirs(root_json_dir)
        # Configure logging settings
        log_file = os.path.join(
            self.BASE_DIR, "static/data/json_log.log"
        )  # Specify the path and filename for the log file
        logging.basicConfig(
            filename=log_file,  # Set the log file
            level=logging.DEBUG,  # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
            format="%(asctime)s - %(levelname)s - %(message)s",  # Define log message format
            datefmt="%Y-%m-%d %H:%M:%S",  # Define date/time format
        )
        for standard in StandardsList.objects.all():
            standard_file_pdf = standard.standardFilePDF
            standard_file_word = standard.standardFileWord
            json_path = os.path.join(root_json_dir, standard.standardTitle + ".json")
            print(json_path)
            sections = []
            current_section = ""
            data_per_file = []
            # logging has to come here
            number_of_sections_after_cleaning_per_file = 0
            try:
                word_doc = Document(standard_file_word.path)
                for paragraph in word_doc.paragraphs:
                    if (
                        paragraph.style.name.startswith("Heading")
                        or "Section" in paragraph.style.name
                        or "Sub-section" in paragraph.style.name
                    ):
                        # If we're starting a new section, save off the old one
                        if current_section:
                            sections.append(current_section.strip())
                            current_section = ""
                        current_section += paragraph.text + "\n"
                    else:
                        # Otherwise, append text to current section
                        current_section += paragraph.text + "\n"
                # Append the last section to the list of sections if it exists
                if current_section.strip():
                    sections.append(current_section.strip())
                # print all sections
                for index, section in enumerate(sections):
                    # for logging
                    number_of_sections += 1
                    if section != "" and len(section.split()) > 25:
                        # for logging
                        number_of_sections_after_cleaning += 1
                        number_of_sections_after_cleaning_per_file += 1
                        first_line = section.strip().splitlines()[0]
                        text_to_search = first_line
                        page_num = self.find_text_in_pdf_from_summary(
                            standard_file_pdf.path, text_to_search, summary_text
                        )
                        doc = nlp(section)
                        found_matches = matcher(doc)
                        shall = should = may = must = can = False
                        if found_matches:
                            for match_id, start, end in found_matches:
                                string_id = nlp.vocab.strings[match_id]
                                span = doc[start:end]
                                if string_id == "Shall":
                                    shall = True
                                if string_id == "Should":
                                    should = True
                                if string_id == "May":
                                    may = True
                                if string_id == "Must":
                                    must = True
                                if string_id == "Can":
                                    can = True
                        section_obj = {
                            "ID": str(uuid.uuid4()),
                            "Color": standard.levelID.levelColor,
                            "Level": str(standard.levelNumber),
                            "LevelName": "",
                            "Title": standard.standardTitle,
                            "Heading1": "",
                            "Heading2": "",
                            "Heading3": "",
                            "Heading4": "",
                            "Module": standard.standardTitle,
                            "PageNum": page_num,
                            "Paragraph": self.custom_json_serialization(section),
                            "Can": can,
                            "May": may,
                            "Shall": shall,
                            "Should": should,
                            "Must": must,
                        }
                        data_per_file.append(section_obj)
                        data.append(section_obj)
                    else:
                        # for logging
                        number_of_cleaned_sections += 1
            except Exception as e:
                print(
                    f"An error occurred while processing {standard.standardTitle}: {str(e)}"
                )
            try:
                with open(json_path, "w") as json_file:
                    json.dump(data_per_file, json_file, indent=4)
                logging.info(
                    f"{number_of_sections_after_cleaning_per_file} sections have been saved to: {json_path}"
                )
                number_of_successed_files += 1
            except Exception as e:
                logging.error(
                    f"Failed to save {json_path}: {str(e)}"
                )
        logging.info(f"Number of successfully saved files: {number_of_successed_files}")
        logging.info(f"Number of sections: {number_of_sections}")
        logging.info(
            f"Number of sections after cleaning: {number_of_sections_after_cleaning}"
        )
        logging.info(f"Number of cleaned sections: {number_of_cleaned_sections}")
        with open(root_data_json_dir + "/data.json", "w") as json_file:
            # Use json.dump to write the data to the file
            json.dump(
                data, json_file, indent=4
            )  # Use indent for pretty formatting (optional)
        # Close the log file (optional, usually done automatically)
        logging.shutdown()
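The processFiles view further down is the real entry point for this class; as a hedged standalone sketch, it could also be driven from a script, assuming DJANGO_SETTINGS_MODULE resolves (the settings module name here is an assumption) and the StandardsList rows point at valid Word/PDF files:

# Hypothetical driver script; run with Django configured.
import os
import django
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "iddrs_api.settings")  # assumed settings module
django.setup()
from data_api.PreprocessFile import PreprocessFile
from data_api.CreateIndexES import CreateIndexES
PreprocessFile().process_standards()  # Word headings -> sections -> per-standard JSON + data.json
CreateIndexES().createIndex()  # rebuild the "iddrs" Elasticsearch index from that JSON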

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,7 +1,5 @@
-from django.http import JsonResponse
 from rest_framework.response import Response
 from rest_framework.decorators import api_view
 from rest_framework_simplejwt.serializers import TokenObtainPairSerializer
 from rest_framework_simplejwt.views import TokenObtainPairView

View File

@ -1,6 +1,6 @@
 from django.apps import AppConfig
-class AdminApiConfig(AppConfig):
+class DataApiConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
-    name = 'admin_api'
+    name = 'data_api'

data_api/forms.py (new file, 7 lines)
View File

@ -0,0 +1,7 @@
from django import forms
from .models import StandardsList
class StandardUploadForm(forms.ModelForm):
    class Meta:
        model = StandardsList
        fields = '__all__'

View File

@ -0,0 +1,34 @@
# Generated by Django 4.1.3 on 2023-11-03 09:07
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    initial = True
    dependencies = [
    ]
    operations = [
        migrations.CreateModel(
            name='Levels',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('levelTitle', models.CharField(max_length=200)),
                ('levelColor', models.CharField(max_length=200)),
                ('levelNumber', models.IntegerField()),
            ],
        ),
        migrations.CreateModel(
            name='Standards',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('standardFile', models.FileField(blank=True, null=True, upload_to='')),
                ('standardTitle', models.CharField(max_length=200)),
                ('standardPath', models.CharField(blank=True, max_length=200, null=True)),
                ('levelID', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='data_api.levels')),
            ],
        ),
    ]

View File

@ -0,0 +1,17 @@
# Generated by Django 4.1.3 on 2023-11-03 09:08
from django.db import migrations
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0001_initial'),
    ]
    operations = [
        migrations.RenameModel(
            old_name='Standards',
            new_name='StandardsList',
        ),
    ]

View File

@ -0,0 +1,33 @@
# Generated by Django 4.1.3 on 2023-11-03 21:15
from django.db import migrations, models
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0002_rename_standards_standardslist'),
    ]
    operations = [
        migrations.RenameField(
            model_name='standardslist',
            old_name='standardFile',
            new_name='standardFilePDF',
        ),
        migrations.RenameField(
            model_name='standardslist',
            old_name='standardPath',
            new_name='standardPathPDF',
        ),
        migrations.AddField(
            model_name='standardslist',
            name='standardFileWord',
            field=models.FileField(blank=True, null=True, upload_to=''),
        ),
        migrations.AddField(
            model_name='standardslist',
            name='standardPathWord',
            field=models.CharField(blank=True, max_length=200, null=True),
        ),
    ]

View File

@ -0,0 +1,31 @@
# Generated by Django 4.1.3 on 2023-11-06 08:50
import data_api.models
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0003_rename_standardfile_standardslist_standardfilepdf_and_more'),
    ]
    operations = [
        migrations.AddField(
            model_name='standardslist',
            name='uploaded_at',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.AlterField(
            model_name='standardslist',
            name='standardFilePDF',
            field=models.FileField(blank=True, null=True, upload_to=data_api.models.dynamic_upload_to),
        ),
        migrations.AlterField(
            model_name='standardslist',
            name='standardFileWord',
            field=models.FileField(blank=True, null=True, upload_to=data_api.models.dynamic_upload_to),
        ),
    ]

View File

@ -0,0 +1,29 @@
# Generated by Django 4.1.3 on 2023-11-06 12:41
import data_api.models
from django.db import migrations, models
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0004_standardslist_uploaded_at_and_more'),
    ]
    operations = [
        migrations.AddField(
            model_name='standardslist',
            name='levelNumber',
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name='standardslist',
            name='standardFilePDF',
            field=models.FileField(upload_to=data_api.models.dynamic_upload_to),
        ),
        migrations.AlterField(
            model_name='standardslist',
            name='standardFileWord',
            field=models.FileField(upload_to=data_api.models.dynamic_upload_to),
        ),
    ]

View File

@ -0,0 +1,21 @@
# Generated by Django 4.1.3 on 2023-11-06 12:49
from django.db import migrations
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0005_standardslist_levelnumber_and_more'),
    ]
    operations = [
        migrations.RemoveField(
            model_name='standardslist',
            name='standardPathPDF',
        ),
        migrations.RemoveField(
            model_name='standardslist',
            name='standardPathWord',
        ),
    ]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2023-11-20 10:14
from django.db import migrations, models
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0006_remove_standardslist_standardpathpdf_and_more'),
    ]
    operations = [
        migrations.AddField(
            model_name='standardslist',
            name='standardColor',
            field=models.CharField(blank=True, max_length=200),
        ),
    ]

View File

@ -0,0 +1,22 @@
# Generated by Django 4.1.3 on 2023-11-20 10:47
from django.db import migrations, models
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0007_standardslist_standardcolor'),
    ]
    operations = [
        migrations.CreateModel(
            name='FileEvent',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('event_type', models.CharField(choices=[('UPLOAD', 'File Uploaded'), ('DELETE', 'File Deleted')], max_length=10)),
                ('file_name', models.CharField(max_length=255)),
                ('timestamp', models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2023-11-20 13:03
from django.db import migrations, models
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0008_fileevent'),
    ]
    operations = [
        migrations.AddField(
            model_name='fileevent',
            name='indexed',
            field=models.BooleanField(default=False),
        ),
    ]

View File

@ -0,0 +1,22 @@
# Generated by Django 4.1.3 on 2023-11-20 13:39
from django.db import migrations, models
class Migration(migrations.Migration):
    dependencies = [
        ('data_api', '0009_fileevent_indexed'),
    ]
    operations = [
        migrations.RemoveField(
            model_name='fileevent',
            name='indexed',
        ),
        migrations.AddField(
            model_name='fileevent',
            name='fileStatus',
            field=models.CharField(default='Pending', max_length=255),
        ),
    ]

data_api/models.py (new file, 60 lines)
View File

@ -0,0 +1,60 @@
from django.db import models
import os
def dynamic_upload_to(instance, filename):
    # Generate a dynamic folder name based on the owning row's level number
    return os.path.join(
        "iddrs_api/static/data/Standards", str(instance.levelNumber), filename
    )
# Create your models here.
class Levels(models.Model):
    # The Levels model defines the schema for levels in the database.
    # It has fields for the level's title, color, and number.
    levelTitle = models.CharField(max_length=200)
    levelColor = models.CharField(max_length=200)
    levelNumber = models.IntegerField()
    def __str__(self):
        return str(self.levelNumber)
class StandardsList(models.Model):
    # The StandardsList model defines the schema for standards in the database.
    # It has a foreign key to Levels, fields for the standard's files, title, and color,
    # and a __str__ method that represents the standard by its title.
    levelID = models.ForeignKey(Levels, on_delete=models.CASCADE, blank=True, null=True)
    levelNumber = models.IntegerField(blank=True, null=True)
    standardFilePDF = models.FileField(upload_to=dynamic_upload_to)
    standardFileWord = models.FileField(upload_to=dynamic_upload_to)
    standardTitle = models.CharField(max_length=200)
    standardColor = models.CharField(max_length=200, blank=True)
    uploaded_at = models.DateTimeField(auto_now_add=True)
    def save(self, *args, **kwargs):
        # Automatically set standardColor based on the associated Levels model's color
        if self.levelID:
            self.standardColor = self.levelID.levelColor
        super().save(*args, **kwargs)
    def __str__(self):
        return self.standardTitle
class FileEvent(models.Model):
    EVENT_CHOICES = (
        ("UPLOAD", "File Uploaded"),
        ("DELETE", "File Deleted"),
    )
    event_type = models.CharField(max_length=10, choices=EVENT_CHOICES)
    file_name = models.CharField(max_length=255)
    timestamp = models.DateTimeField(auto_now_add=True)
    fileStatus = models.CharField(default='Pending', max_length=255)
    def __str__(self):
        return f"{self.get_event_type_display()}: {self.file_name}"

data_api/serializers.py (new file, 17 lines)
View File

@ -0,0 +1,17 @@
from rest_framework import serializers
from .models import Levels, StandardsList, FileEvent
class LevelSerializer(serializers.ModelSerializer):
    class Meta:
        model = Levels
        fields = '__all__'
class StandardsSerializer(serializers.ModelSerializer):
    class Meta:
        model = StandardsList
        fields = '__all__'
class FileEventSerializer(serializers.ModelSerializer):
    class Meta:
        model = FileEvent
        fields = '__all__'

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,35 @@
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
2023-11-09 16:17:19 - INFO - Number of successed saved files: 0
2023-11-09 16:17:19 - INFO - Number of seactions: 0
2023-11-09 16:17:19 - INFO - Number of seactions after cleaning: 0
2023-11-09 16:17:19 - INFO - Number of cleaned seactions: 0
2023-11-09 16:21:55 - INFO - /home/louai/Documents/BICC/IDDRS/iddrs_tool/iddrs_api/data_api/PreprocessFile.py changed, reloading.

data_api/urls.py (new file, 25 lines)
View File

@ -0,0 +1,25 @@
from django.urls import path, include
from rest_framework import routers
from .views import LevelViewSet, StandardsViewSet, FileEventsViewSet
from . import views
router = routers.DefaultRouter()
router.register(r'levels', LevelViewSet)
router.register(r'standards', StandardsViewSet)
router.register(r'fileEvents', FileEventsViewSet)
urlpatterns = [
    path('', include(router.urls)),
    path('level-submit/', views.levelSubmit, name='level-submit'),
    path('level-delete/', views.levelDelete, name='level-delete'),
    path('upload-standard/', views.upload_standard, name='upload-standard'),
    path('standard-delete/', views.standardDelete, name='standard-delete'),
    path('process-files/', views.processFiles, name='process-files'),
    #path('api/', include('admin_api.api.urls')),
]
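The upload-standard route expects a multipart POST carrying a selectedLevel field plus pdfFile and wordFile uploads (see upload_standard in views.py below). A hedged client sketch — host, port, and file names are assumptions:

# Hypothetical client call; adjust host/port and paths to your deployment.
import requests

response = requests.post(
    "http://localhost:8000/data_api/upload-standard/",
    data={"selectedLevel": 2},
    files={
        "pdfFile": open("IDDRS-2.10.pdf", "rb"),
        "wordFile": open("IDDRS-2.10.docx", "rb"),
    },
)
print(response.text)  # the view answers "Done!"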

data_api/views.py (new file, 159 lines)
View File

@ -0,0 +1,159 @@
from rest_framework import viewsets
from .models import Levels, StandardsList, FileEvent
from .serializers import LevelSerializer, StandardsSerializer, FileEventSerializer
from django.views.decorators.csrf import csrf_exempt
import os
from pathlib import Path
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from django.db.models import Max
import shutil
from .PreprocessFile import PreprocessFile
from .CreateIndexES import CreateIndexES
import logging
logger = logging.getLogger(__name__)
# Create your views here.
#@permission_classes([IsAuthenticated])
class LevelViewSet(viewsets.ModelViewSet):
    queryset = Levels.objects.all()
    serializer_class = LevelSerializer
#@permission_classes([IsAuthenticated])
class StandardsViewSet(viewsets.ModelViewSet):
    queryset = StandardsList.objects.all().order_by('levelNumber')
    serializer_class = StandardsSerializer
class FileEventsViewSet(viewsets.ModelViewSet):
    queryset = FileEvent.objects.all().filter(fileStatus='Pending')
    serializer_class = FileEventSerializer
BASE_DIR = Path(__file__).resolve().parent.parent
# ################################################################
# #######################Levels###################################
# ################################################################
def check_level_folder():
    level_numbers = Levels.objects.values_list('levelNumber', flat=True)
    for level_number in level_numbers:
        level_folder = os.path.join(BASE_DIR, 'static/data/Standards/'+str(level_number)+'/')
        if not os.path.exists(level_folder):
            os.makedirs(level_folder)
@csrf_exempt
@api_view(['POST'])
def levelSubmit(request):
    data = request.data['params']['editingRow']
    if 'id' in data:
        level = Levels.objects.get(id=data['id'])
        serializer = LevelSerializer(instance=level, data=data)
        if serializer.is_valid():
            serializer.save()
            check_level_folder()
    else:
        newLevelNumber = Levels.objects.aggregate(Max('levelNumber'))['levelNumber__max']+1
        data['levelNumber'] = newLevelNumber
        serializer = LevelSerializer(data=data)
        if serializer.is_valid():
            serializer.save()
            check_level_folder()
    return Response('')
@csrf_exempt
@api_view(['POST'])
def levelDelete(request):
    data = request.data['params']['rowData']
    level = Levels.objects.get(id=data['id'])
    level.delete()
    level_del_dir = os.path.join(BASE_DIR, 'static/data/Standards/'+str(data['levelNumber'])+'/')
    shutil.rmtree(level_del_dir)
    return Response('')
# ################################################################
# #######################Standards################################
# ################################################################
@csrf_exempt
@api_view(['POST'])
def upload_standard(request):
    if request.method == 'POST':
        level_number = request.data['selectedLevel']
        pdfFile = request.FILES['pdfFile']
        wordFile = request.FILES['wordFile']
        standard_level_id = Levels.objects.filter(levelNumber=level_number).values_list('id', flat=True)[0]
        standard = {
            'levelID': standard_level_id,
            'levelNumber': level_number,
            'standardFilePDF': pdfFile,
            'standardFileWord': wordFile,
            'standardTitle': pdfFile.name.split('.pdf')[0]
        }
        serializer = StandardsSerializer(data=standard)
        if serializer.is_valid():
            serializer.save()
            # Create a FileEvent for the upload
            FileEvent.objects.create(event_type='UPLOAD', file_name=pdfFile.name.split('.pdf')[0])
        else:
            print('Invalid')
    return Response('Done!')
@csrf_exempt
@api_view(['POST'])
def standardDelete(request):
    data = request.data['params']['stdData']
    standard = StandardsList.objects.get(id=data['id'])
    delete_file(standard.standardFilePDF.path)
    delete_file(standard.standardFileWord.path)
    standard.delete()
    # Create a FileEvent for the delete
    FileEvent.objects.create(event_type='DELETE', file_name=standard.standardFilePDF.name.split('.pdf')[0])
    return Response('')
def delete_file(file_path):
    try:
        os.remove(file_path)
        print(f"File {file_path} deleted successfully.")
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except PermissionError:
        print(f"Permission error: Unable to delete {file_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")
@csrf_exempt
@api_view(['POST'])
def processFiles(request):
    logger.info('Starting file Processing ...')
    try:
        process_files = PreprocessFile()
        process_files.process_standards()
        create_index = CreateIndexES()
        create_index.createIndex()
    except Exception as e:
        logger.error(e)
        raise
    # Mark every pending FileEvent as indexed now that processing succeeded
    FileEvent.objects.all().update(fileStatus='Indexed')
    logger.info('File Processing completed')
    return Response('Processed successfully')

Binary file not shown.

View File

@ -50,11 +50,11 @@ INSTALLED_APPS = [
     'django.contrib.messages',
     'django.contrib.staticfiles',
     'rest_framework',
+    'rest_framework.authtoken',
     'rest_framework_simplejwt.token_blacklist',
     'django_filters',
     'search_tfidf',
-    'admin_api',
-    'user_auth',
+    'data_api',
     'corsheaders',
 ]

View File

@ -19,6 +19,5 @@ from django.urls import path, include
 urlpatterns = [
     path('admin/', admin.site.urls),
     path('client_api/', include('search_tfidf.urls')),
-    path('admin_api/', include('admin_api.urls')),
-    path('user_auth/', include('user_auth.urls')),
+    path('data_api/', include('data_api.urls')),
 ]

Binary file not shown.

View File

@ -0,0 +1,71 @@
from django.shortcuts import render
from elasticsearch import Elasticsearch
import os
def build_search_query(phrase, min_score):
    search_query = {
        "size": 100,
        "query": {
            "multi_match": {
                "query": phrase,
                "fields": ["Paragraph", "Title"]
            }
        },
        "highlight": {
            "fields": {
                "Paragraph": {}
            },
            "pre_tags": [""],
            "post_tags": [""],
        },
        "min_score": min_score
    }
    return search_query
def eSearch(phrase):
    # Set the password for connecting to Elasticsearch
    ELASTIC_PASSWORD = "p-P7luUvrPggWrS4UQsy"
    ca_certs = "/etc/elasticsearch/certs/http_ca.crt"
    # Get the Elasticsearch password from environment variable
    # ELASTIC_PASSWORD = os.environ.get('ELASTIC_PASSWORD')
    # Get the CA certificates path from environment variable
    # ca_certs = os.environ.get('CA_CERTS')
    # Create an Elasticsearch client instance to use for searching
    # Connect to the local Elasticsearch instance on port 9200
    # Use certificate authentication with the provided certificate
    # Authenticate with the elastic user and the password set above
    es = Elasticsearch(
        "https://localhost:9200",
        ca_certs=ca_certs,
        basic_auth=("elastic", ELASTIC_PASSWORD)
    )
    # Relax the score threshold one point at a time until something matches;
    # the floor of 0 prevents an endless loop when nothing matches at all
    number_of_hits = 0
    min_score = 8
    final_results = []
    search_results = []
    while number_of_hits == 0 and min_score >= 0:
        search_query = build_search_query(phrase, min_score)
        search_results = es.search(index="iddrs", body=search_query)
        number_of_hits = len(search_results["hits"]["hits"])
        min_score = min_score - 1
    # Process and display search results
    for hit in search_results["hits"]["hits"]:
        highlighted_texts = hit.get('highlight', {}).get('Paragraph', [])  # Use get() to avoid KeyError
        original_paragraph = hit.get('_source', {}).get('Paragraph', [])
        #print(highlighted_texts)
        if highlighted_texts:  # Check if highlight is not None
            for highlighted_text in highlighted_texts:
                original_paragraph = original_paragraph.replace(highlighted_text, f"<span style='background-color:#ffff00'>{highlighted_text}</span>")
            hit["_source"]["Highlight"] = original_paragraph
        else:
            hit["_source"]["Highlight"] = []
        final_results.append(hit["_source"])
    return final_results, min_score + 1
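eSearch starts at min_score 8 and relaxes the threshold one point per retry until the query matches (or the floor of 0 is reached), returning the hits together with the threshold that finally matched. A usage sketch — the phrase is illustrative and a populated "iddrs" index is assumed:

# Sketch only: requires the "iddrs" index built by CreateIndexES.
results, score_used = eSearch("community reintegration")
for hit in results[:3]:
    print(hit["Title"], "p.", hit["PageNum"])
print("matched with min_score =", score_used)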

View File

@ -1,3 +0,0 @@
from django.test import TestCase
# Create your tests here.

View File

@ -6,6 +6,7 @@ from django.http import JsonResponse
 from django.views.decorators.csrf import csrf_exempt
 import json
 from .tfidfSearch import cosine_similarity
+from .elasticSearch import eSearch
 from rest_framework.decorators import api_view
 from pathlib import Path
 import os
@ -45,7 +46,8 @@ def get_input(request):
         return JsonResponse({"message": "Data received", "results":searchResults})
     else:
-        searchResults = cosine_similarity(phrase, title=False)
+        #searchResults = cosine_similarity(phrase, title=False)
+        searchResults = eSearch(phrase)
         return JsonResponse({"message": "Data received", "results":searchResults})

File diff suppressed because it is too large

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff