Upload Project
This commit is contained in:
parent
49eb16d0ac
commit
5592f5e7f0
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,24 +0,0 @@
|
||||
# Generated by Django 4.1.3 on 2023-07-07 07:17
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial schema migration: creates the ``NewContentTracker`` table."""

    initial = True

    # No prerequisites are declared for this migration.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="NewContentTracker",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("paragraphID", models.CharField(max_length=10)),
                ("levelNumber", models.CharField(max_length=200)),
                ("standardTitle", models.CharField(max_length=200)),
                ("paragraph", models.TextField()),
            ],
        ),
    ]
|
Binary file not shown.
Binary file not shown.
@ -1,12 +0,0 @@
|
||||
from django.db import models
|
||||
|
||||
# Create your models here.
|
||||
class NewContentTracker(models.Model):
    """One tracked content entry: paragraph id, level, standard title and text."""

    # Identifier of the paragraph, stored as text (at most 10 characters).
    paragraphID = models.CharField(max_length=10)
    # Level number, stored as text.
    levelNumber = models.CharField(max_length=200)
    # Title of the standard the paragraph belongs to.
    standardTitle = models.CharField(max_length=200)
    # Free-form paragraph text.
    paragraph = models.TextField()

    def __str__(self):
        """Return the primary key rendered as a string."""
        return f"{self.id}"
|
@ -1,137 +0,0 @@
|
||||
import json
|
||||
import nltk
|
||||
from spacy.matcher import Matcher
|
||||
import spacy
|
||||
|
||||
import os
|
||||
from os import walk
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import re
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from nltk.tokenize import word_tokenize, MWETokenizer
|
||||
from string import punctuation
|
||||
#from App.models import Level
|
||||
|
||||
class PreprocessData:
    """Turn the per-level JSON files under ``static/data`` into a searchable dataset.

    The pipeline (see ``pre_process_files``) merges every JSON file of every
    level directory, flags which modal verbs each paragraph contains, splits
    paragraphs into sentences, normalises each sentence and writes the result
    to ``static/searchable/data2.json``.
    """

    # Modal verbs flagged per paragraph; each name is both the spaCy matcher
    # key and the DataFrame column that receives the flag.
    COMPLIANCE_TERMS = ('Shall', 'Should', 'May', 'Must', 'Can')

    def __init__(self):
        """Load the spaCy model, NLTK stop words and lemmatizer; resolve BASE_DIR."""
        self.nlp = spacy.load('en_core_web_lg')
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.BASE_DIR = Path(__file__).resolve().parent.parent

    def remove_punctuation(self, text):
        """Return ``text`` without ASCII punctuation, keeping ``.`` and ``/``."""
        my_punctuation = punctuation.replace(".", "").replace("/", "")
        return "".join(ch for ch in text if ch not in my_punctuation)

    def remove_stopwords(self, text):
        """Return the tokens of ``text`` (a list of strings) that are not stop words."""
        # Build the set once per call: membership on the raw list is a linear
        # scan for every token.
        stopwords = set(self.stopwords)
        return [token for token in text if token not in stopwords]

    def tokenization(self, text):
        """Split ``text`` on every single non-word character."""
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        return re.split(r'\W', text)

    def lemmatizer(self, text):
        """Return the WordNet lemma of every token in ``text`` (a list of strings)."""
        return [self.wordnet_lemmatizer.lemmatize(word) for word in text]

    def merge_files(self, levels):
        """Load every JSON file of every level directory into one flat list.

        Each record is tagged with ``LevelName`` (the directory name),
        ``Module`` (file name without its first 11 characters and the 5-char
        extension, dashes replaced by spaces) and ``Title`` (file name without
        the extension). Records are then numbered consecutively in
        ``ParagraphID``.

        ``levels`` is processed in sorted order and is NOT modified.
        """
        allData = []
        for level in sorted(levels):
            level_dir = os.path.join(self.BASE_DIR, 'static/data/' + level)
            filenames = next(walk(level_dir), (None, None, []))[2]
            for file in filenames:
                with open(os.path.join(level_dir, file), 'r', encoding='utf-8') as f:
                    data = json.load(f)
                for dd in data:
                    dd['LevelName'] = level
                    dd['Module'] = file[11:-5].replace('-', ' ')
                    dd['Title'] = file[:-5]
                allData.extend(data)

        for idx, d in enumerate(allData):
            d['ParagraphID'] = idx
        return allData

    def add_compliance(self, paragraphs):
        """Return a DataFrame of ``paragraphs`` with modal-verb flag columns.

        For every term in ``COMPLIANCE_TERMS`` that occurs (case-insensitively)
        in a row's ``Paragraph``, the column of that name is set to 1 for that
        row. A column only exists if at least one paragraph matched the term.
        """
        df = pd.DataFrame(paragraphs)
        matcher = Matcher(self.nlp.vocab)
        for term in self.COMPLIANCE_TERMS:
            matcher.add(term, [[{'LOWER': term.lower()}]])

        for index, row in df.iterrows():
            doc = self.nlp(row['Paragraph'])
            for match_id, _start, _end in matcher(doc):
                # The matcher key is the column name, so no per-term branch is needed.
                df.loc[index, self.nlp.vocab.strings[match_id]] = 1
        return df

    def title_sent(self, title, sent):
        """Return ``sent`` prefixed with ``title`` and a colon."""
        return title + ': ' + sent

    def split_into_sentneces(self, data):
        """Explode ``data`` into one row per sentence and add ``ProcessedSent``.

        ``ProcessedSent`` is the sentence with punctuation removed, lower-cased,
        tokenised, stop words removed and lemmatised, prefixed with the row's
        ``Module``. Note that a ``Sentence`` column is added to ``data`` itself
        before the exploded copy is made.
        """
        df = data
        df['Sentence'] = df['Paragraph'].apply(nltk.tokenize.sent_tokenize)
        df = df.explode("Sentence").reset_index(drop=True)

        tokenizer = MWETokenizer()

        def normalise(sentence):
            # Same steps, in the same order, as the original chain of applies.
            cleaned = self.remove_punctuation(sentence).lower()
            tokens = tokenizer.tokenize(word_tokenize(cleaned))
            tokens = self.lemmatizer(self.remove_stopwords(tokens))
            return ' '.join(tokens)

        df['ProcessedSent'] = df['Sentence'].apply(normalise)
        df['ProcessedSent'] = df.Module + ' ' + df.ProcessedSent
        return df

    def pre_process_files(self):
        """Run the whole pipeline and write ``static/searchable/data2.json``."""
        print('Pre-processing started')
        # Every sub-directory of static/data is treated as one level.
        levels = next(walk(os.path.join(self.BASE_DIR, 'static/data')), (None, None, []))[1]
        paragraphs = self.merge_files(levels)
        df = self.add_compliance(paragraphs)
        processed_data = self.split_into_sentneces(df)
        processed_data.to_json(
            os.path.join(self.BASE_DIR, 'static/searchable/data2.json'),
            orient='records',
            indent=4,
        )
        print('Pre-processing finished')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,7 +0,0 @@
|
||||
from rest_framework import serializers
|
||||
from .models import NewContentTracker
|
||||
|
||||
class NewContentTrackerSerializer(serializers.ModelSerializer):
    """Model serializer exposing every field of ``NewContentTracker``."""

    class Meta:
        # '__all__' makes the serializer follow the model's field list.
        fields = '__all__'
        model = NewContentTracker
|
@ -1,32 +0,0 @@
|
||||
from django.urls import path, include
|
||||
from rest_framework import routers
|
||||
from .views import LevelViewSet, StandardsViewSet, NewContentTrackerViewSet
|
||||
from . import views
|
||||
|
||||
# Router-generated CRUD endpoints for the three model view sets.
router = routers.DefaultRouter()
router.register(r'levels', LevelViewSet)
router.register(r'standards', StandardsViewSet)
router.register(r'NewContentTracker', NewContentTrackerViewSet)

urlpatterns = [
    # View-set routes.
    path('', include(router.urls)),

    # Paragraph content stored in the per-standard JSON files.
    path('content-list/', views.contentList, name='content-list'),
    path('content-create/', views.contentCreate, name='content-create'),
    path('content-detail/', views.contentDetail, name='content-detail'),
    path('content-update/', views.contentUpdate, name='content-update'),
    path('content-delete/', views.contentDelete, name='content-delete'),

    # Levels.
    path('level-submit/', views.levelSubmit, name='level-submit'),
    path('level-delete/', views.levelDelete, name='level-delete'),

    # Standards.
    path('standard-submit/', views.standardSubmit, name='standard-submit'),
    path('standard-delete/', views.standardDelete, name='standard-delete'),

    # Rebuild of the searchable dataset.
    path('pre-process/', views.preprocess, name='pre-process'),

    # Nested API URL configuration.
    path('api/', include('admin_api.api.urls')),

    # Authentication pages.
    path('login/', views.loginPage, name='login'),
    path('logout/', views.logoutPage, name='logout'),
]
|
@ -1,314 +0,0 @@
|
||||
from http.client import HTTPResponse
|
||||
from django.shortcuts import render, redirect
|
||||
from rest_framework import viewsets
|
||||
from search_tfidf.models import Level, Standards
|
||||
from .models import NewContentTracker
|
||||
from .serializer import NewContentTrackerSerializer
|
||||
from search_tfidf.serializer import LevelSerializer, StandardsSerializer
|
||||
from django.http import JsonResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from rest_framework.decorators import api_view, permission_classes
|
||||
from rest_framework.permissions import IsAuthenticated
|
||||
from rest_framework.response import Response
|
||||
from django.db.models import Max
|
||||
import shutil
|
||||
from django.contrib.auth import authenticate, login, logout
|
||||
from django.contrib import messages
|
||||
|
||||
|
||||
from .preprocessData import PreprocessData
|
||||
# Create your views here.
|
||||
class LevelViewSet(viewsets.ModelViewSet):
    """CRUD endpoints for ``Level`` records, restricted to authenticated users."""

    queryset = Level.objects.all()
    serializer_class = LevelSerializer
    # Declared as a class attribute: the ``@permission_classes`` decorator is
    # meant for function-based views, and class-based views are configured
    # through this attribute.
    permission_classes = [IsAuthenticated]
|
||||
|
||||
class StandardsViewSet(viewsets.ModelViewSet):
    """CRUD endpoints for ``Standards`` records."""

    # NOTE(review): no permission_classes are set here, unlike LevelViewSet —
    # confirm that this is intentional.
    serializer_class = StandardsSerializer
    queryset = Standards.objects.all()
|
||||
|
||||
class NewContentTrackerViewSet(viewsets.ModelViewSet):
    """CRUD endpoints for ``NewContentTracker`` records."""

    # NOTE(review): no permission_classes are set here, unlike LevelViewSet —
    # confirm that this is intentional.
    serializer_class = NewContentTrackerSerializer
    queryset = NewContentTracker.objects.all()
|
||||
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
|
||||
# ################################################################
|
||||
# #######################Contents#################################
|
||||
# ################################################################
|
||||
|
||||
@csrf_exempt
@api_view(['GET'])
def contentList(request):
    """Return every paragraph of one standard as ``{'contents': [...]}``.

    Query parameters: ``level`` and ``standard``. Responds with 404 when no
    JSON file matches the pair (previously this crashed on ``open('')``).
    """
    level = request.GET.get('level')
    standard = request.GET.get('standard')

    module_path = filePath(level, standard)
    if not module_path:
        # filePath() returns '' when the level directory or the file is missing.
        return JsonResponse({'contents': [], 'detail': 'Standard not found'}, status=404)

    with open(module_path) as f:
        data = json.load(f)

    return JsonResponse({'contents': data})
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def contentCreate(request):
    """Append a new paragraph to a standard's JSON file.

    Expects ``params.level``, ``params.standard`` and ``params.formData`` in
    the request body. The new entry gets the next free ``ID`` plus colour,
    level and title metadata from the database, and the change is recorded
    through ``haveToPreProcess``. Responds with 404 when no JSON file matches
    the level/standard pair.
    """
    params = request.data['params']
    level = params['level']
    standard = params['standard']

    module_path = filePath(level, standard)
    if not module_path:
        # filePath() returns '' when nothing matches; open('') would raise.
        return Response('Standard not found', status=404)

    levelInfo = Level.objects.get(levelNumber=level)
    standardInfo = Standards.objects.get(standardTitle=standard)

    with open(module_path) as json_file:
        data = json.load(json_file)

    # Next id is one past the current maximum; an empty file starts at 0.
    new_id = max((obj['ID'] for obj in data), default=-1) + 1

    new_obj = params['formData']
    new_obj['ID'] = new_id
    new_obj['Color'] = levelInfo.levelColor
    new_obj['Level'] = levelInfo.levelNumber
    new_obj['LevelName'] = levelInfo.levelName
    new_obj['Title'] = standardInfo.standardTitle
    new_obj['Module'] = standardInfo.standardTitle
    data.append(new_obj)

    with open(module_path, 'w') as f:
        json.dump(data, f, indent=4)

    haveToPreProcess(new_id, levelInfo.levelNumber, standardInfo.standardTitle, new_obj['Paragraph'])

    return Response('')
|
||||
|
||||
@csrf_exempt
@api_view(['GET'])
def contentDetail(request):
    """Return one paragraph of a standard as ``{'paragraph': {...}}``.

    Query parameters: ``level``, ``standard`` and ``id``. Responds with 400
    when ``id`` is not an integer and 404 when the standard or the paragraph
    does not exist (previously an unknown id returned the whole list).
    """
    level = request.GET.get('level')
    standard = request.GET.get('standard')
    try:
        wanted_id = int(request.GET.get('id'))
    except (TypeError, ValueError):
        return JsonResponse({'detail': 'Invalid id'}, status=400)

    module_path = filePath(level, standard)
    if not module_path:
        return JsonResponse({'detail': 'Standard not found'}, status=404)

    with open(module_path) as f:
        data = json.load(f)

    # The original loop had no break, so the last entry with this ID won;
    # keep that by scanning from the end.
    for obj in reversed(data):
        if obj['ID'] == wanted_id:
            return JsonResponse({'paragraph': obj})

    return JsonResponse({'detail': 'Paragraph not found'}, status=404)
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def contentUpdate(request):
    """Overwrite the editable fields of one paragraph in a standard's JSON file.

    Expects ``params.level``, ``params.standard``, ``params.id`` and
    ``params.formData``. Responds with 404 when the standard or the paragraph
    does not exist; in that case nothing is written or tracked.
    """
    params = request.data['params']
    level = params['level']
    standard = params['standard']
    target_id = int(params['id'])
    updated_content = params['formData']

    module_path = filePath(level, standard)
    if not module_path:
        return Response('Standard not found', status=404)

    with open(module_path) as f:
        data = json.load(f)

    editable_fields = ('Heading1', 'Heading2', 'Heading3', 'Heading4', 'Paragraph', 'PageNum')
    matched = False
    for obj in data:
        if obj['ID'] == target_id:
            for field in editable_fields:
                obj[field] = updated_content[field]
            matched = True

    if not matched:
        return Response('Paragraph not found', status=404)

    with open(module_path, 'w') as f:
        json.dump(data, f)

    # Record the id that was actually edited. The loop variable cannot be used
    # here: after the loop it refers to the LAST entry of the file.
    haveToPreProcess(target_id, level, standard, updated_content['Paragraph'])

    return Response('')
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def contentDelete(request):
    """Remove one paragraph from a standard's JSON file.

    Expects ``params.level``, ``params.standard`` and ``params.id``. Only the
    first entry with a matching ``ID`` is removed. The deletion is recorded
    through ``haveToPreProcess`` with the marker text ``'Deleted'``. Responds
    with 404 when no JSON file matches the level/standard pair.
    """
    params = request.data['params']
    level = params['level']
    standard = params['standard']
    id = params['id']

    module_path = filePath(level, standard)
    if not module_path:
        # filePath() returns '' when nothing matches; open('') would raise.
        return Response('Standard not found', status=404)

    with open(module_path) as f:
        data = json.load(f)

    target_id = int(id)
    position = next((i for i, obj in enumerate(data) if obj['ID'] == target_id), None)
    if position is not None:
        data.pop(position)

    with open(module_path, 'w') as f:
        json.dump(data, f)

    haveToPreProcess(id, level, standard, 'Deleted')

    return Response('')
|
||||
|
||||
# ################################################################
|
||||
# #######################Levels###################################
|
||||
# ################################################################
|
||||
@csrf_exempt
@api_view(['POST'])
def levelSubmit(request):
    """Create or update a ``Level`` from ``params.editingRow``.

    A row that carries an ``id`` updates the existing level. Otherwise a new
    level is created with the next free ``levelNumber`` and a matching data
    directory under ``static/data``. Validation errors are printed; the
    response body is always empty.
    """
    data = request.data['params']['editingRow']

    if 'id' in data:
        level = Level.objects.get(id=data['id'])
        serializer = LevelSerializer(instance=level, data=data)
        if serializer.is_valid():
            serializer.save()
        else:
            print(serializer.errors)
    else:
        # Max() yields None when the table is empty; start numbering at 1 then.
        current_max = Level.objects.aggregate(Max('levelNumber'))['levelNumber__max']
        newLevelNumber = (current_max or 0) + 1
        data['levelNumber'] = newLevelNumber
        serializer = LevelSerializer(data=data)
        if serializer.is_valid():
            serializer.save()
            level_new_dir = os.path.join(BASE_DIR, 'static/data/' + str(newLevelNumber) + '/')
            os.makedirs(level_new_dir, exist_ok=True)
        else:
            print(serializer.errors)

    return Response('')
|
||||
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def levelDelete(request):
    """Delete a ``Level`` row and its data directory.

    Expects ``params.rowData`` with ``id`` and ``levelNumber``. The deletion
    is recorded through ``haveToPreProcess`` with the marker ``'LevelDeleted'``.
    """
    data = request.data['params']['rowData']
    level = Level.objects.get(id=data['id'])
    level.delete()

    level_del_dir = os.path.join(BASE_DIR, 'static/data/' + str(data['levelNumber']) + '/')
    # The row is already gone at this point, so a missing directory must not
    # abort the request before the deletion has been tracked.
    shutil.rmtree(level_del_dir, ignore_errors=True)

    haveToPreProcess(data['id'], data['levelNumber'], 'No', 'LevelDeleted')

    return Response('')
|
||||
|
||||
|
||||
# ################################################################
|
||||
# #######################Standards################################
|
||||
# ################################################################
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def standardSubmit(request):
    """Create or update a ``Standards`` row from ``params.editingRow``.

    Update (row has ``id``): saves the row and renames the standard's JSON
    file to follow the new title. Create: links the row to its level, saves
    it and writes an empty JSON list as the standard's data file. Validation
    errors are printed; the response body is always empty.
    """
    data = request.data['params']['editingRow']

    if 'id' in data:
        standard = Standards.objects.get(id=data['id'])
        # Both paths are computed before save(), while the instance still
        # holds the old title.
        level_dir = os.path.join(BASE_DIR, 'static/data/' + str(standard.standardLevel) + '/')
        current_path = level_dir + standard.standardTitle + '.json'
        new_path = level_dir + data['standardTitle'] + '.json'
        serializer = StandardsSerializer(instance=standard, data=data)
        if serializer.is_valid():
            serializer.save()
            # Skip the rename when the title is unchanged or the file is absent.
            if current_path != new_path and os.path.exists(current_path):
                os.rename(current_path, new_path)
        else:
            print(serializer.errors)
    else:
        levelRow = Level.objects.get(id=data['levelID'])
        data['levelID'] = int(data['levelID'])
        data['standardLevel'] = levelRow.levelNumber
        serializer = StandardsSerializer(data=data)
        if serializer.is_valid():
            serializer.save()
            standard_new = os.path.join(
                BASE_DIR,
                'static/data/' + str(levelRow.levelNumber) + '/' + data['standardTitle'] + '.json',
            )
            # A new standard starts with an empty paragraph list; the ``with``
            # block closes the file.
            with open(standard_new, 'w') as file:
                json.dump([], file)
        else:
            print(serializer.errors)

    return Response('')
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def standardDelete(request):
    """Delete the ``Standards`` row named in ``params.rowData`` and track it.

    The deletion is recorded through ``haveToPreProcess`` with the marker
    ``'StandardDeleted'``.
    """
    row = request.data['params']['rowData']
    Standards.objects.get(id=row['id']).delete()
    haveToPreProcess(row['id'], row['standardLevel'], row['standardTitle'], 'StandardDeleted')
    return Response('Item successfully deleted!')
|
||||
|
||||
# ################################################################
|
||||
# #######################pre-process##############################
|
||||
# ################################################################
|
||||
@csrf_exempt
@api_view(['POST'])
def preprocess(request):
    """Rebuild the searchable dataset, then clear all ``NewContentTracker`` rows."""
    PreprocessData().pre_process_files()
    # Runs only if the rebuild above did not raise.
    NewContentTracker.objects.all().delete()
    return Response('Procssed successfully')
|
||||
|
||||
# ################################################################
|
||||
# #######################Authentication###########################
|
||||
# ################################################################
|
||||
|
||||
def loginPage(request):
    """Placeholder for the login view; it has no implementation yet."""
    # TODO: implement. As written the view returns None.
    return None
|
||||
|
||||
|
||||
def logoutPage(request):
    """Placeholder for the logout view; it has no implementation yet."""
    # TODO: implement. As written the view returns None.
    return None
|
||||
|
||||
# ################################################################
|
||||
# ################################################################
|
||||
# ################################################################
|
||||
|
||||
def filePath(level_input, standard_input):
    """Return the path of the JSON file for a level/standard pair, or ``''``.

    ``level_input`` must name a sub-directory of ``static/data``. Within it,
    a file matches when ``standard_input`` occurs anywhere in its name; if
    several files match, the last one listed wins. Both arguments are
    converted with ``str()`` so that numeric levels from JSON bodies work
    (the directory listing previously raised ``TypeError`` for them).
    """
    standards_dir = os.path.join(BASE_DIR, 'static/data/')
    level_name = str(level_input)
    standard_name = str(standard_input)
    file_path = ''

    levels = next(os.walk(os.path.join(BASE_DIR, 'static/data')), (None, None, []))[1]
    if level_name in levels:
        filenames = next(os.walk(standards_dir + level_name), (None, None, []))[2]
        for file in filenames:
            if standard_name in file:
                file_path = standards_dir + level_name + '/' + file

    return file_path
|
||||
|
||||
def haveToPreProcess(id, levelNumber, standardTitle, paragraph):
    """Store a ``NewContentTracker`` entry describing a content change.

    The entry is saved only if the serializer accepts it; the function
    returns ``'Added'`` either way.
    """
    serializer = NewContentTrackerSerializer(data={
        'paragraphID': id,
        'levelNumber': levelNumber,
        'standardTitle': standardTitle,
        'paragraph': paragraph,
    })
    if serializer.is_valid():
        serializer.save()

    return 'Added'
|
60
data_api/CreateIndexES.py
Normal file
60
data_api/CreateIndexES.py
Normal file
@ -0,0 +1,60 @@
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.helpers import bulk
|
||||
import json
|
||||
import os
|
||||
from os import walk
|
||||
from pathlib import Path
|
||||
|
||||
class CreateIndexES:
    """(Re)build the ``iddrs`` Elasticsearch index from ``static/data/json``."""

    def __init__(self):
        """Resolve the Elasticsearch password and the project base directory."""
        # SECURITY: the password used to be hard-coded only. It is now read
        # from the ELASTIC_PASSWORD environment variable; the literal remains
        # as a fallback so existing deployments keep working.
        # TODO: rotate this credential and remove the fallback.
        self.ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "p-P7luUvrPggWrS4UQsy")
        self.BASE_DIR = Path(__file__).resolve().parent.parent

    def createIndex(self):
        """Drop and recreate the index, then bulk-index every JSON file.

        Each file under ``static/data/json/`` is expected to hold a JSON list
        of documents. The number of successfully indexed documents is printed
        per file, followed by any per-document failures.
        """
        es = Elasticsearch(
            "https://localhost:9200",
            ca_certs="/etc/elasticsearch/certs/http_ca.crt",
            basic_auth=("elastic", self.ELASTIC_PASSWORD),
        )

        index_name = "iddrs"
        mapping = {
            "mappings": {
                "properties": {
                    "Title": {"type": "text"},
                    "Paragraph": {"type": "text"},
                }
            }
        }

        # One existence check is enough: the index either has to be dropped
        # first or not, and it is created in both cases.
        if es.indices.exists(index=index_name):
            print("Index already exists. Deleting and recreating...")
            es.indices.delete(index=index_name, ignore=[400, 404])
        else:
            print("Index does not exist. Creating...")
        es.indices.create(index=index_name, body=mapping)

        json_dir = os.path.join(self.BASE_DIR, 'static/data/json/')
        filenames = next(walk(json_dir), (None, None, []))[2]

        for file in filenames:
            with open(json_dir + file, 'r') as f:
                data = json.load(f)

            actions = [
                {
                    "_op_type": "index",
                    "_index": index_name,
                    "_source": document,
                }
                for document in data
            ]
            success, failed = bulk(es, actions, index=index_name, raise_on_error=False)
            print(success)
            if failed:
                # raise_on_error=False hands failures back instead of raising;
                # surface them rather than dropping them.
                print(failed)
|
||||
|
243
data_api/PreprocessFile.py
Normal file
243
data_api/PreprocessFile.py
Normal file
@ -0,0 +1,243 @@
|
||||
from docx import Document
|
||||
import os
|
||||
import fitz
|
||||
import re
|
||||
import uuid
|
||||
import shutil
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import spacy
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
from .models import Levels, StandardsList
|
||||
|
||||
from .CreateIndexES import CreateIndexES
|
||||
|
||||
|
||||
class PreprocessFile:
    """Convert every uploaded standard (Word + PDF) into section-level JSON.

    The Word document supplies the section text, the PDF supplies the page
    number of each section heading, and spaCy flags which modal verbs each
    section contains.
    """

    # Modal verbs flagged per section; each name is the spaCy matcher key and
    # also the key written to the output JSON.
    MODAL_TERMS = ("Shall", "Should", "May", "Must", "Can")

    def __init__(self):
        """Resolve the project base directory (two levels above this file)."""
        self.BASE_DIR = Path(__file__).resolve().parent.parent

    @staticmethod
    def _heading_regex(search_text):
        """Compile a case-insensitive regex matching ``search_text`` at the end of a line.

        The text is escaped so that headings containing regex metacharacters
        (parentheses, ``+``, ``?`` ...) neither raise ``re.error`` nor change
        the meaning of the pattern. Look-arounds replace ``\\b`` because a
        word boundary can never match after a heading that ends in
        punctuation at the end of a line.
        """
        return re.compile(
            rf"(?<!\w){re.escape(search_text)}(?!\w)(?![^\n]*\S)",
            re.IGNORECASE,
        )

    def find_summary_page(self, pdf_path, summary_text):
        """Return the index of the page on which ``summary_text`` occurs for the 2nd time.

        Occurrences are counted cumulatively across pages. Returns 0 when the
        text occurs fewer than two times in the whole document.
        """
        summary_count = 0
        with fitz.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                summary_count += doc[page_num].get_text("text").count(summary_text)
                if summary_count >= 2:
                    return page_num
        return 0

    def find_text_in_pdf_from_summary(self, pdf_path, search_text, summary_text):
        """Return the 1-based page number where ``search_text`` ends a line.

        Only pages after the one returned by ``find_summary_page`` are
        searched. Returns ``None`` when the text is not found.
        """
        summary_page = self.find_summary_page(pdf_path, summary_text)
        if summary_page is None:
            # Kept as a guard; find_summary_page currently always returns an int.
            return None

        # Compile once instead of once per page.
        regex_pattern = self._heading_regex(search_text)
        with fitz.open(pdf_path) as doc:
            for page_num in range(summary_page + 1, len(doc)):
                if regex_pattern.search(doc[page_num].get_text("text")):
                    return page_num + 1
        return None

    def custom_json_serialization(self, text):
        """Return ``text`` with every newline replaced by a space."""
        return text.replace("\n", " ")

    @staticmethod
    def _split_into_sections(word_doc):
        """Group the paragraphs of a Word document into heading-led sections."""
        sections = []
        current_section = ""
        for paragraph in word_doc.paragraphs:
            style_name = paragraph.style.name
            starts_section = (
                style_name.startswith("Heading")
                or "Section" in style_name
                or "Sub-section" in style_name
            )
            # A heading closes the section collected so far.
            if starts_section and current_section:
                sections.append(current_section.strip())
                current_section = ""
            current_section += paragraph.text + "\n"

        if current_section.strip():
            sections.append(current_section.strip())
        return sections

    def process_standards(self):
        """Rebuild ``static/data/json/*.json`` and ``static/data/data.json``.

        For every ``StandardsList`` row the Word file is split into sections;
        sections of 25 words or fewer are dropped. Each kept section becomes
        one JSON object with a fresh UUID, level metadata, the PDF page number
        of its first line and one boolean per modal verb. Progress and errors
        are written to ``static/data/json_log.log``.
        """
        nlp = spacy.load("en_core_web_sm")
        matcher = Matcher(nlp.vocab)
        for term in self.MODAL_TERMS:
            matcher.add(term, [[{"LOWER": term.lower()}]])

        root_json_dir = os.path.join(self.BASE_DIR, "static/data/json/")
        root_data_json_dir = os.path.join(self.BASE_DIR, "static/data/")
        summary_text = "Summary"
        data = []

        # Counters reported in the log at the end of the run.
        number_of_successed_files = 0
        number_of_sections = 0
        number_of_sections_after_cleaning = 0
        number_of_cleaned_sections = 0

        # Start from an empty output directory.
        if os.path.exists(root_json_dir):
            shutil.rmtree(root_json_dir)
        os.makedirs(root_json_dir)

        log_file = os.path.join(self.BASE_DIR, "static/data/json_log.log")
        logging.basicConfig(
            filename=log_file,
            level=logging.DEBUG,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )

        for standard in StandardsList.objects.all():
            standard_file_pdf = standard.standardFilePDF
            standard_file_word = standard.standardFileWord
            json_path = os.path.join(root_json_dir, standard.standardTitle + ".json")
            print(json_path)
            data_per_file = []
            number_of_sections_after_cleaning_per_file = 0

            try:
                sections = self._split_into_sections(Document(standard_file_word.path))

                for section in sections:
                    number_of_sections += 1
                    if section == "" or len(section.split()) <= 25:
                        # Was ``number_of_cleaned_sections + 1``: a no-op that
                        # left the counter at 0 in the log.
                        number_of_cleaned_sections += 1
                        continue

                    number_of_sections_after_cleaning += 1
                    number_of_sections_after_cleaning_per_file += 1

                    first_line = section.strip().splitlines()[0]
                    # NOTE(review): the PDF field object is passed as-is while
                    # the Word file uses ``.path`` — confirm fitz accepts it.
                    page_num = self.find_text_in_pdf_from_summary(
                        standard_file_pdf, first_line, summary_text
                    )

                    doc = nlp(section)
                    flags = dict.fromkeys(self.MODAL_TERMS, False)
                    for match_id, _start, _end in matcher(doc):
                        flags[nlp.vocab.strings[match_id]] = True

                    section_boj = {
                        "ID": str(uuid.uuid4()),
                        "Color": standard.levelID.levelColor,
                        "Level": str(standard.levelNumber),
                        "LevelName": "",
                        "Title": standard.standardTitle,
                        "Heading1": "",
                        "Heading2": "",
                        "Heading3": "",
                        "Heading4": "",
                        "Module": standard.standardTitle,
                        "PageNum": page_num,
                        "Paragraph": self.custom_json_serialization(section),
                        "Can": flags["Can"],
                        "May": flags["May"],
                        "Shall": flags["Shall"],
                        "Should": flags["Should"],
                        "Must": flags["Must"],
                    }
                    data_per_file.append(section_boj)
                    data.append(section_boj)
            except Exception as e:
                # Best effort per standard: report it and carry on with what
                # was collected so far.
                print(
                    f"An error occurred while processing {standard.standardTitle}: {str(e)}"
                )
                logging.exception(
                    f"An error occurred while processing {standard.standardTitle}"
                )

            try:
                with open(json_path, "w") as json_file:
                    json.dump(data_per_file, json_file, indent=4)
                logging.info(
                    f"{number_of_sections_after_cleaning_per_file} seactions has been saved to: {json_path}"
                )
                number_of_successed_files += 1
            except Exception:
                # Previously logged the success counter here, hiding the error.
                logging.exception(f"Failed to save: {json_path}")

        logging.info(f"Number of successed saved files: {number_of_successed_files}")
        logging.info(f"Number of seactions: {number_of_sections}")
        logging.info(
            f"Number of seactions after cleaning: {number_of_sections_after_cleaning}"
        )
        logging.info(f"Number of cleaned seactions: {number_of_cleaned_sections}")

        with open(os.path.join(root_data_json_dir, "data.json"), "w") as json_file:
            json.dump(data, json_file, indent=4)

        logging.shutdown()
|
BIN
data_api/__pycache__/CreateIndexES.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/CreateIndexES.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/PreprocessFile.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/PreprocessFile.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/admin.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/admin.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/apps.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/apps.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/forms.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/forms.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/models.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/models.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/serializers.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/serializers.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/urls.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/urls.cpython-310.pyc
Normal file
Binary file not shown.
BIN
data_api/__pycache__/views.cpython-310.pyc
Normal file
BIN
data_api/__pycache__/views.cpython-310.pyc
Normal file
Binary file not shown.
@ -1,7 +1,5 @@
|
||||
from django.http import JsonResponse
|
||||
from rest_framework.response import Response
|
||||
from rest_framework.decorators import api_view
|
||||
|
||||
from rest_framework_simplejwt.serializers import TokenObtainPairSerializer
|
||||
from rest_framework_simplejwt.views import TokenObtainPairView
|
||||
|
@ -1,6 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class DataApiConfig(AppConfig):
    """Django app configuration for the ``data_api`` application."""

    # The diff hunk shows AdminApiConfig/'admin_api' and
    # DataApiConfig/'data_api' interleaved; this is the post-commit state.
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'data_api'
|
7
data_api/forms.py
Normal file
7
data_api/forms.py
Normal file
@ -0,0 +1,7 @@
|
||||
from django import forms
|
||||
from .models import StandardsList
|
||||
|
||||
class StandardUploadForm(forms.ModelForm):
    """Model form exposing every field of ``StandardsList``."""

    class Meta:
        # '__all__' makes the form follow the model's field list.
        fields = '__all__'
        model = StandardsList
|
34
data_api/migrations/0001_initial.py
Normal file
34
data_api/migrations/0001_initial.py
Normal file
@ -0,0 +1,34 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-03 09:07
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial ``data_api`` schema: creates the ``Levels`` and ``Standards`` models."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Levels",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("levelTitle", models.CharField(max_length=200)),
                ("levelColor", models.CharField(max_length=200)),
                ("levelNumber", models.IntegerField()),
            ],
        ),
        migrations.CreateModel(
            name="Standards",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "standardFile",
                    models.FileField(blank=True, null=True, upload_to=""),
                ),
                ("standardTitle", models.CharField(max_length=200)),
                (
                    "standardPath",
                    models.CharField(blank=True, max_length=200, null=True),
                ),
                (
                    "levelID",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        to="data_api.levels",
                    ),
                ),
            ],
        ),
    ]
|
17
data_api/migrations/0002_rename_standards_standardslist.py
Normal file
17
data_api/migrations/0002_rename_standards_standardslist.py
Normal file
@ -0,0 +1,17 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-03 09:08
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Rename the ``Standards`` model to ``StandardsList``."""

    dependencies = [
        ("data_api", "0001_initial"),
    ]

    operations = [
        migrations.RenameModel(old_name="Standards", new_name="StandardsList"),
    ]
|
@ -0,0 +1,33 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-03 21:15
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Split the single file/path pair into PDF and Word variants.

    The existing ``standardFile`` / ``standardPath`` columns become the PDF
    ones, and matching ``standardFileWord`` / ``standardPathWord`` columns are
    added.
    """

    dependencies = [
        ("data_api", "0002_rename_standards_standardslist"),
    ]

    operations = [
        migrations.RenameField(
            model_name="standardslist",
            old_name="standardFile",
            new_name="standardFilePDF",
        ),
        migrations.RenameField(
            model_name="standardslist",
            old_name="standardPath",
            new_name="standardPathPDF",
        ),
        migrations.AddField(
            model_name="standardslist",
            name="standardFileWord",
            field=models.FileField(blank=True, null=True, upload_to=""),
        ),
        migrations.AddField(
            model_name="standardslist",
            name="standardPathWord",
            field=models.CharField(blank=True, max_length=200, null=True),
        ),
    ]
|
@ -0,0 +1,31 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-06 08:50
|
||||
|
||||
import data_api.models
|
||||
from django.db import migrations, models
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add ``uploaded_at`` and point both file fields at ``dynamic_upload_to``."""

    dependencies = [
        (
            "data_api",
            "0003_rename_standardfile_standardslist_standardfilepdf_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="standardslist",
            name="uploaded_at",
            # The default only back-fills existing rows; it is not kept on the field.
            field=models.DateTimeField(
                auto_now_add=True,
                default=django.utils.timezone.now,
            ),
            preserve_default=False,
        ),
        migrations.AlterField(
            model_name="standardslist",
            name="standardFilePDF",
            field=models.FileField(
                blank=True,
                null=True,
                upload_to=data_api.models.dynamic_upload_to,
            ),
        ),
        migrations.AlterField(
            model_name="standardslist",
            name="standardFileWord",
            field=models.FileField(
                blank=True,
                null=True,
                upload_to=data_api.models.dynamic_upload_to,
            ),
        ),
    ]
|
@ -0,0 +1,29 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-06 12:41
|
||||
|
||||
import data_api.models
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add ``levelNumber`` to standards and make both file fields required."""

    dependencies = [
        ("data_api", "0004_standardslist_uploaded_at_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="standardslist",
            name="levelNumber",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="standardslist",
            name="standardFilePDF",
            field=models.FileField(
                upload_to=data_api.models.dynamic_upload_to,
            ),
        ),
        migrations.AlterField(
            model_name="standardslist",
            name="standardFileWord",
            field=models.FileField(
                upload_to=data_api.models.dynamic_upload_to,
            ),
        ),
    ]
|
@ -0,0 +1,21 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-06 12:49
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Drop the ``standardPathPDF`` and ``standardPathWord`` columns."""

    dependencies = [
        ("data_api", "0005_standardslist_levelnumber_and_more"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="standardslist",
            name="standardPathPDF",
        ),
        migrations.RemoveField(
            model_name="standardslist",
            name="standardPathWord",
        ),
    ]
|
18
data_api/migrations/0007_standardslist_standardcolor.py
Normal file
18
data_api/migrations/0007_standardslist_standardcolor.py
Normal file
@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-20 10:14
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the optional ``standardColor`` column to standards."""

    dependencies = [
        ("data_api", "0006_remove_standardslist_standardpathpdf_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="standardslist",
            name="standardColor",
            field=models.CharField(blank=True, max_length=200),
        ),
    ]
|
22
data_api/migrations/0008_fileevent.py
Normal file
22
data_api/migrations/0008_fileevent.py
Normal file
@ -0,0 +1,22 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-20 10:47
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the ``FileEvent`` model recording file uploads and deletions."""

    dependencies = [
        ("data_api", "0007_standardslist_standardcolor"),
    ]

    operations = [
        migrations.CreateModel(
            name="FileEvent",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "event_type",
                    models.CharField(
                        choices=[
                            ("UPLOAD", "File Uploaded"),
                            ("DELETE", "File Deleted"),
                        ],
                        max_length=10,
                    ),
                ),
                ("file_name", models.CharField(max_length=255)),
                ("timestamp", models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
|
18
data_api/migrations/0009_fileevent_indexed.py
Normal file
18
data_api/migrations/0009_fileevent_indexed.py
Normal file
@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-20 13:03
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the boolean ``indexed`` flag (default ``False``) to ``FileEvent``."""

    dependencies = [
        ("data_api", "0008_fileevent"),
    ]

    operations = [
        migrations.AddField(
            model_name="fileevent",
            name="indexed",
            field=models.BooleanField(default=False),
        ),
    ]
|
@ -0,0 +1,22 @@
|
||||
# Generated by Django 4.1.3 on 2023-11-20 13:39
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Replace ``FileEvent.indexed`` with a textual ``fileStatus`` column."""

    dependencies = [
        ("data_api", "0009_fileevent_indexed"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="fileevent",
            name="indexed",
        ),
        migrations.AddField(
            model_name="fileevent",
            name="fileStatus",
            field=models.CharField(default="Pending", max_length=255),
        ),
    ]
|
BIN
data_api/migrations/__pycache__/0001_initial.cpython-310.pyc
Normal file
BIN
data_api/migrations/__pycache__/0001_initial.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
data_api/migrations/__pycache__/0008_fileevent.cpython-310.pyc
Normal file
BIN
data_api/migrations/__pycache__/0008_fileevent.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
data_api/migrations/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
data_api/migrations/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
60
data_api/models.py
Normal file
60
data_api/models.py
Normal file
@ -0,0 +1,60 @@
|
||||
from django.db import models
|
||||
import os
|
||||
|
||||
|
||||
def dynamic_upload_to(instance, filename):
    """Return the upload path for one of a standard's files.

    Files are grouped into one folder per level, so the result is
    ``iddrs_api/static/data/Standards/<instance.levelNumber>/<filename>``.

    Args:
        instance: Object carrying a ``levelNumber`` attribute.
        filename: Name of the uploaded file.
    """
    level_folder = str(instance.levelNumber)
    return os.path.join("iddrs_api/static/data/Standards", level_folder, filename)
|
||||
|
||||
|
||||
# Create your models here.
|
||||
|
||||
|
||||
class Levels(models.Model):
    """A level that standards are attached to: title, colour and number."""

    levelTitle = models.CharField(max_length=200)
    levelColor = models.CharField(max_length=200)
    levelNumber = models.IntegerField()

    def __str__(self):
        """Represent the level by its number."""
        return f"{self.levelNumber}"
|
||||
|
||||
|
||||
class StandardsList(models.Model):
    """A standard stored as a PDF/Word file pair and optionally linked to a level.

    ``levelNumber`` is stored on the row itself (in addition to the ``levelID``
    foreign key) and is what ``dynamic_upload_to`` reads to pick the upload
    folder. ``standardColor`` is overwritten with the linked level's colour on
    every save.
    """

    levelID = models.ForeignKey(
        Levels,
        on_delete=models.CASCADE,
        blank=True,
        null=True,
    )
    levelNumber = models.IntegerField(blank=True, null=True)
    standardFilePDF = models.FileField(upload_to=dynamic_upload_to)
    standardFileWord = models.FileField(upload_to=dynamic_upload_to)
    standardTitle = models.CharField(max_length=200)
    standardColor = models.CharField(max_length=200, blank=True)
    uploaded_at = models.DateTimeField(auto_now_add=True)

    def save(self, *args, **kwargs):
        """Copy the linked level's colour onto the standard, then save."""
        linked_level = self.levelID
        if linked_level:
            self.standardColor = linked_level.levelColor
        super().save(*args, **kwargs)

    def __str__(self):
        """Represent the standard by its title."""
        return self.standardTitle
|
||||
|
||||
|
||||
class FileEvent(models.Model):
    """Record of a file being uploaded or deleted, with a processing status."""

    # Allowed values for ``event_type`` and their display labels.
    EVENT_CHOICES = (
        ("UPLOAD", "File Uploaded"),
        ("DELETE", "File Deleted"),
    )

    event_type = models.CharField(max_length=10, choices=EVENT_CHOICES)
    file_name = models.CharField(max_length=255)
    timestamp = models.DateTimeField(auto_now_add=True)
    # New events start as 'Pending'.
    fileStatus = models.CharField(default='Pending', max_length=255)

    def __str__(self):
        """Return ``"<event label>: <file name>"``."""
        label = self.get_event_type_display()
        return f"{label}: {self.file_name}"
|
17
data_api/serializers.py
Normal file
17
data_api/serializers.py
Normal file
@ -0,0 +1,17 @@
|
||||
from rest_framework import serializers
|
||||
from .models import Levels, StandardsList, FileEvent
|
||||
|
||||
class LevelSerializer(serializers.ModelSerializer):
    """Serialize every field of the ``Levels`` model."""

    class Meta:
        fields = "__all__"
        model = Levels
|
||||
|
||||
class StandardsSerializer(serializers.ModelSerializer):
    """Serialize every field of the ``StandardsList`` model."""

    class Meta:
        fields = "__all__"
        model = StandardsList
|
||||
|
||||
class FileEventSerializer(serializers.ModelSerializer):
    """Serialize every field of the ``FileEvent`` model."""

    class Meta:
        fields = "__all__"
        model = FileEvent
|
1
data_api/static/data/json/data.json
Normal file
1
data_api/static/data/json/data.json
Normal file
@ -0,0 +1 @@
|
||||
[]
|
35
data_api/static/data/json_log.log
Normal file
35
data_api/static/data/json_log.log
Normal file
@ -0,0 +1,35 @@
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - ERROR - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - INFO - Number of successed saved files: 0
|
||||
2023-11-09 16:17:19 - INFO - Number of seactions: 0
|
||||
2023-11-09 16:17:19 - INFO - Number of seactions after cleaning: 0
|
||||
2023-11-09 16:17:19 - INFO - Number of cleaned seactions: 0
|
||||
2023-11-09 16:21:55 - INFO - /home/louai/Documents/BICC/IDDRS/iddrs_tool/iddrs_api/data_api/PreprocessFile.py changed, reloading.
|
25
data_api/urls.py
Normal file
25
data_api/urls.py
Normal file
@ -0,0 +1,25 @@
|
||||
from django.urls import path, include
|
||||
from rest_framework import routers
|
||||
from .views import LevelViewSet, StandardsViewSet, FileEventsViewSet
|
||||
from . import views
|
||||
|
||||
# Router-generated CRUD endpoints for the three viewsets.
router = routers.DefaultRouter()
router.register(r"levels", LevelViewSet)
router.register(r"standards", StandardsViewSet)
router.register(r"fileEvents", FileEventsViewSet)

urlpatterns = [
    # Viewset routes are mounted at the root of this URLconf.
    path("", include(router.urls)),
    # Level management.
    path("level-submit/", views.levelSubmit, name="level-submit"),
    path("level-delete/", views.levelDelete, name="level-delete"),
    # Standard management.
    path("upload-standard/", views.upload_standard, name="upload-standard"),
    path("standard-delete/", views.standardDelete, name="standard-delete"),
    # Preprocessing and indexing of the uploaded files.
    path("process-files/", views.processFiles, name="process-files"),
]
|
159
data_api/views.py
Normal file
159
data_api/views.py
Normal file
@ -0,0 +1,159 @@
|
||||
from rest_framework import viewsets
|
||||
from .models import Levels, StandardsList, FileEvent
|
||||
from .serializers import LevelSerializer, StandardsSerializer, FileEventSerializer
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
import os
|
||||
from pathlib import Path
|
||||
from rest_framework.decorators import api_view, permission_classes
|
||||
from rest_framework.permissions import IsAuthenticated
|
||||
from rest_framework.response import Response
|
||||
from django.db.models import Max
|
||||
import shutil
|
||||
from .PreprocessFile import PreprocessFile
|
||||
from .CreateIndexES import CreateIndexES
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Create your views here.
|
||||
# NOTE(review): no permission classes are set on this viewset, so the
# project-wide DRF defaults apply. To require authentication here, set
# ``permission_classes = [IsAuthenticated]`` as a class attribute; the
# ``@permission_classes`` decorator is intended for function-based views.
class LevelViewSet(viewsets.ModelViewSet):
    """Model viewset over all ``Levels`` rows."""

    serializer_class = LevelSerializer
    queryset = Levels.objects.all()
|
||||
|
||||
# NOTE(review): no permission classes are set on this viewset, so the
# project-wide DRF defaults apply. To require authentication here, set
# ``permission_classes = [IsAuthenticated]`` as a class attribute; the
# ``@permission_classes`` decorator is intended for function-based views.
class StandardsViewSet(viewsets.ModelViewSet):
    """Model viewset over all standards, ordered by ``levelNumber``."""

    serializer_class = StandardsSerializer
    queryset = StandardsList.objects.order_by('levelNumber')
|
||||
|
||||
class FileEventsViewSet(viewsets.ModelViewSet):
    """Model viewset restricted to file events whose status is 'Pending'."""

    serializer_class = FileEventSerializer
    queryset = FileEvent.objects.filter(fileStatus='Pending')
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
|
||||
# ################################################################
|
||||
# #######################Levels###################################
|
||||
# ################################################################
|
||||
def check_level_folder():
    """Ensure a standards folder exists on disk for every level in the database.

    For each ``Levels.levelNumber`` the directory
    ``<BASE_DIR>/static/data/Standards/<levelNumber>`` is created if missing.
    """
    for level_number in Levels.objects.values_list('levelNumber', flat=True):
        level_folder = os.path.join(
            BASE_DIR, 'static/data/Standards', str(level_number)
        )
        # exist_ok replaces the exists()-then-makedirs() pair, which could
        # raise FileExistsError when two requests create the folder at once.
        os.makedirs(level_folder, exist_ok=True)
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def levelSubmit(request):
    """Create a new level or update an existing one.

    The level fields are read from ``request.data['params']['editingRow']``.
    When that row contains an ``id`` the matching level is updated; otherwise
    a new level is created with ``levelNumber`` set to one more than the
    current maximum. After a successful save the per-level folders are
    checked via ``check_level_folder``.

    Returns:
        An empty 200 response on success, a 404 response when ``id`` matches
        no level, or a 400 response carrying the serializer errors when the
        submitted data is invalid.
    """
    data = request.data['params']['editingRow']

    if 'id' in data:
        try:
            level = Levels.objects.get(id=data['id'])
        except Levels.DoesNotExist:
            return Response({'detail': 'Level not found.'}, status=404)
        serializer = LevelSerializer(instance=level, data=data)
    else:
        # Max() yields None when the table is empty; numbering then starts at 1
        # instead of failing on ``None + 1``.
        highest = Levels.objects.aggregate(Max('levelNumber'))['levelNumber__max']
        data['levelNumber'] = (highest or 0) + 1
        serializer = LevelSerializer(data=data)

    # Report validation problems instead of silently answering 200.
    if not serializer.is_valid():
        return Response(serializer.errors, status=400)

    serializer.save()
    check_level_folder()
    return Response('')
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def levelDelete(request):
    """Delete a level and remove its standards folder from disk.

    The level is identified by ``request.data['params']['rowData']['id']``.

    Returns:
        An empty 200 response on success, or a 404 response when the id
        matches no level.
    """
    data = request.data['params']['rowData']
    try:
        level = Levels.objects.get(id=data['id'])
    except Levels.DoesNotExist:
        return Response({'detail': 'Level not found.'}, status=404)

    # Build the folder path from the stored level number, never from the
    # request payload: a crafted ``levelNumber`` such as "../.." would
    # otherwise make rmtree delete a directory outside the standards tree.
    level_del_dir = os.path.join(
        BASE_DIR, 'static/data/Standards', str(level.levelNumber)
    )
    level.delete()

    # The folder may never have been created; that is not an error.
    if os.path.isdir(level_del_dir):
        shutil.rmtree(level_del_dir)

    return Response('')
|
||||
|
||||
# ################################################################
|
||||
# #######################Standards################################
|
||||
# ################################################################
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def upload_standard(request):
    """Store an uploaded standard (PDF + Word file) under the selected level.

    Reads ``selectedLevel`` from the request data and the ``pdfFile`` and
    ``wordFile`` uploads from ``request.FILES``. The standard's title is the
    PDF file name up to its first ``.pdf``. On success a ``FileEvent`` of
    type ``UPLOAD`` is recorded under that title.

    Returns:
        ``'Done!'`` (200) on success, a 400 response when a required part is
        missing or the data fails validation, or a 404 response when no level
        has the selected number.
    """
    level_number = request.data.get('selectedLevel')
    pdf_file = request.FILES.get('pdfFile')
    word_file = request.FILES.get('wordFile')
    if level_number is None or pdf_file is None or word_file is None:
        return Response(
            {'detail': 'selectedLevel, pdfFile and wordFile are required.'},
            status=400,
        )

    level_id = (
        Levels.objects.filter(levelNumber=level_number)
        .values_list('id', flat=True)
        .first()
    )
    if level_id is None:
        return Response({'detail': 'Level not found.'}, status=404)

    title = pdf_file.name.split('.pdf')[0]
    serializer = StandardsSerializer(
        data={
            'levelID': level_id,
            'levelNumber': level_number,
            'standardFilePDF': pdf_file,
            'standardFileWord': word_file,
            'standardTitle': title,
        }
    )
    # Report validation failures to the caller instead of answering 'Done!'.
    if not serializer.is_valid():
        logger.warning('Rejected standard upload: %s', serializer.errors)
        return Response(serializer.errors, status=400)

    serializer.save()
    # Create a FileEvent for the upload.
    FileEvent.objects.create(event_type='UPLOAD', file_name=title)

    return Response('Done!')
|
||||
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def standardDelete(request):
    """Delete a standard, its two files on disk, and record a DELETE event.

    The standard is identified by ``request.data['params']['stdData']['id']``.

    Returns:
        An empty 200 response on success, or a 404 response when the id
        matches no standard.
    """
    data = request.data['params']['stdData']
    try:
        standard = StandardsList.objects.get(id=data['id'])
    except StandardsList.DoesNotExist:
        return Response({'detail': 'Standard not found.'}, status=404)

    # File removal is best-effort: delete_file reports failures but does not raise.
    delete_file(standard.standardFilePDF.path)
    delete_file(standard.standardFileWord.path)
    standard.delete()

    # Create a FileEvent for the delete.
    # NOTE(review): this records the stored file name (including its upload
    # folder) up to the first '.pdf', whereas upload_standard records the
    # bare uploaded name. Confirm which form consumers of FileEvent expect.
    FileEvent.objects.create(
        event_type='DELETE',
        file_name=standard.standardFilePDF.name.split('.pdf')[0],
    )

    return Response('')
|
||||
|
||||
|
||||
|
||||
def delete_file(file_path):
    """Remove *file_path* from disk on a best-effort basis.

    Failures are reported through the module logger and never raised, so a
    missing or locked file does not abort the caller.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        logger.warning("File %s not found.", file_path)
    except PermissionError:
        logger.error("Permission error: Unable to delete %s.", file_path)
    except Exception:
        # Deliberately broad: deletion must never break the request, but the
        # traceback is kept so unexpected failures remain diagnosable.
        logger.exception("An error occurred while deleting %s", file_path)
    else:
        logger.info("File %s deleted successfully.", file_path)
|
||||
|
||||
@csrf_exempt
@api_view(['POST'])
def processFiles(request):
    """Preprocess the standards, rebuild the search index, mark events indexed.

    Runs ``PreprocessFile().process_standards()`` followed by
    ``CreateIndexES().createIndex()``. If both succeed, every ``FileEvent``
    row gets ``fileStatus='Indexed'``.

    Raises:
        Exception: any error from preprocessing or indexing is logged with
            its traceback and re-raised; file events are then left untouched.
    """
    logger.info('Starting file Processing ...')
    try:
        PreprocessFile().process_standards()
        CreateIndexES().createIndex()
    except Exception:
        # logger.exception keeps the traceback, which logger.error(e) dropped.
        logger.exception('File processing failed')
        raise

    # Mark every recorded file event as indexed.
    FileEvent.objects.all().update(fileStatus='Indexed')

    logger.info('File Processing completed')
    return Response('Procssed successfully')
|
BIN
db.sqlite3
BIN
db.sqlite3
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -50,11 +50,11 @@ INSTALLED_APPS = [
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'rest_framework',
|
||||
'rest_framework.authtoken',
|
||||
'rest_framework_simplejwt.token_blacklist',
|
||||
'django_filters',
|
||||
'search_tfidf',
|
||||
'admin_api',
|
||||
'user_auth',
|
||||
'data_api',
|
||||
'corsheaders',
|
||||
]
|
||||
|
||||
|
@ -19,6 +19,5 @@ from django.urls import path, include
|
||||
urlpatterns = [
|
||||
path('admin/', admin.site.urls),
|
||||
path('client_api/', include('search_tfidf.urls')),
|
||||
path('admin_api/', include('admin_api.urls')),
|
||||
path('user_auth/', include('user_auth.urls')),
|
||||
path('data_api/', include('data_api.urls')),
|
||||
]
|
||||
|
BIN
search_tfidf/__pycache__/PreprocessFile.cpython-310.pyc
Normal file
BIN
search_tfidf/__pycache__/PreprocessFile.cpython-310.pyc
Normal file
Binary file not shown.
BIN
search_tfidf/__pycache__/elasticSearch.cpython-310.pyc
Normal file
BIN
search_tfidf/__pycache__/elasticSearch.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
71
search_tfidf/elasticSearch.py
Normal file
71
search_tfidf/elasticSearch.py
Normal file
@ -0,0 +1,71 @@
|
||||
from django.shortcuts import render
|
||||
from elasticsearch import Elasticsearch
|
||||
import os
|
||||
|
||||
def build_search_query(phrase, min_score):
    """Build the Elasticsearch request body for a phrase search.

    The query is a ``multi_match`` of *phrase* against the ``Paragraph`` and
    ``Title`` fields, limited to 100 hits scoring at least *min_score*.
    Highlighting is requested for ``Paragraph`` with empty pre/post tags, so
    the highlight fragments come back as plain text.

    Args:
        phrase: Text to search for.
        min_score: Minimum score a hit must reach.

    Returns:
        dict: The request body.
    """
    match_clause = {
        "multi_match": {
            "query": phrase,
            "fields": ["Paragraph", "Title"],
        }
    }
    highlight_clause = {
        "fields": {"Paragraph": {}},
        "pre_tags": [""],
        "post_tags": [""],
    }
    return {
        "size": 100,
        "query": match_clause,
        "highlight": highlight_clause,
        "min_score": min_score,
    }
|
||||
|
||||
def eSearch(phrase):
    """Search the ``iddrs`` Elasticsearch index for *phrase*.

    The query is first run with a minimum score of 8. While it returns no
    hits the threshold is lowered by one and the query repeated, stopping at
    0 so that a phrase matching nothing cannot loop forever.

    Each returned document is the hit's ``_source`` with an added
    ``Highlight`` key: the paragraph with every highlighted fragment wrapped
    in a yellow ``<span>``, or an empty list when the hit has no highlight.

    Connection settings come from the environment: ``ELASTIC_PASSWORD``
    (required) and ``CA_CERTS`` (optional, defaults to
    ``/etc/elasticsearch/certs/http_ca.crt``).

    Args:
        phrase: Text to search for.

    Returns:
        tuple: ``(results, min_score)`` where ``results`` is the list of
        documents and ``min_score`` is the threshold used by the last query.

    Raises:
        RuntimeError: if ``ELASTIC_PASSWORD`` is not set.
    """
    # Credentials must not live in source control; read them from the environment.
    elastic_password = os.environ.get('ELASTIC_PASSWORD')
    if not elastic_password:
        raise RuntimeError(
            "ELASTIC_PASSWORD environment variable is not set; "
            "cannot authenticate against Elasticsearch."
        )
    ca_certs = os.environ.get('CA_CERTS', '/etc/elasticsearch/certs/http_ca.crt')

    # Connect to the local Elasticsearch instance over TLS as the elastic user.
    es = Elasticsearch(
        "https://localhost:9200",
        ca_certs=ca_certs,
        basic_auth=("elastic", elastic_password),
    )

    # Relax the score threshold step by step until something matches.
    hits = []
    used_score = 0
    for used_score in range(8, -1, -1):
        search_results = es.search(
            index="iddrs", body=build_search_query(phrase, used_score)
        )
        hits = search_results["hits"]["hits"]
        if hits:
            break

    final_results = []
    for hit in hits:
        source = hit["_source"]
        fragments = hit.get('highlight', {}).get('Paragraph', [])
        if fragments:
            # Default to '' so a document without a Paragraph cannot crash
            # on ``list.replace``.
            paragraph = source.get('Paragraph', '')
            for fragment in fragments:
                paragraph = paragraph.replace(
                    fragment,
                    f"<span style='background-color:#ffff00'>{fragment}</span>",
                )
            source["Highlight"] = paragraph
        else:
            source["Highlight"] = []
        final_results.append(source)

    return final_results, used_score
|
@ -1,3 +0,0 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
@ -6,6 +6,7 @@ from django.http import JsonResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
import json
|
||||
from .tfidfSearch import cosine_similarity
|
||||
from .elasticSearch import eSearch
|
||||
from rest_framework.decorators import api_view
|
||||
from pathlib import Path
|
||||
import os
|
||||
@ -45,7 +46,8 @@ def get_input(request):
|
||||
return JsonResponse({"message": "Data received", "results":searchResults})
|
||||
|
||||
else:
|
||||
searchResults = cosine_similarity(phrase, title=False)
|
||||
#searchResults = cosine_similarity(phrase, title=False)
|
||||
searchResults = eSearch(phrase)
|
||||
return JsonResponse({"message": "Data received", "results":searchResults})
|
||||
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user