# IDDRS_API/data_api/PreprocessFile.py
from docx import Document
import os
import fitz
import re
import uuid
import shutil
import json
import logging
from pathlib import Path
import spacy
from spacy.matcher import Matcher
from .models import StandardsList
# from .CreateIndexES import CreateIndexES
class PreprocessFile:
def __init__(self):
self.BASE_DIR = Path(__file__).resolve().parent.parent
def find_summary_page(self, pdf_path, summary_text):
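        """Scan the PDF at pdf_path and return the 0-based index of the page on
        which the cumulative number of occurrences of summary_text reaches two,
        or None if that never happens."""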
doc = fitz.open(pdf_path)
summary_count = 0
for page_num in range(len(doc)):
page = doc[page_num]
text_instances = page.get_text("text")
# Counting the number of occurrences of the summary text on the page
summary_count += text_instances.count(summary_text)
            if summary_count >= 2:
                return page_num
        # The summary text never appeared twice; signal "not found" to the caller
        return None
def find_text_in_pdf_from_summary(self, pdf_path, search_text, summary_text):
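        """Search for search_text standing alone on a line, starting on the page
        after the summary page located by find_summary_page. Return the 1-based
        page number of the first match, or None if the summary page or the text
        cannot be found."""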
summary_page = self.find_summary_page(pdf_path, summary_text)
if summary_page is None:
# print("Summary not found in the PDF.")
return None
doc = fitz.open(pdf_path)
for page_num in range(
summary_page + 1, len(doc)
): # Start searching after the 2nd summary
page = doc[page_num]
text_instances = page.get_text("text")
# Use regex to find instances of search_text without anything following it on the same line
            regex_pattern = re.compile(rf"\b{re.escape(search_text)}\b(?![^\n]*\S)", re.IGNORECASE)
match = regex_pattern.search(text_instances)
if match:
# print(f"Text found on page {page_num + 1}, after the 2nd summary.")
return page_num + 1
# print("Text not found in the PDF.")
return None
# Custom serialization function
def custom_json_serialization(self, text):
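        """Return text with newline characters replaced by spaces so that each
        section serializes as a single-line JSON string."""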
# Replace newline characters with spaces
return text.replace("\n", " ")
def process_standards(self):
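        """Rebuild the JSON exports for every standard in StandardsList: split each
        Word document into sections at heading paragraphs, drop sections of 25 words
        or fewer, tag each remaining section with the PDF page it starts on and with
        flags for the modal verbs shall/should/may/must/can, then write one JSON file
        per standard under static/data/json/ plus a combined data.json under
        static/data/. Counts and progress are logged to static/data/json_log.log."""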
# BASE_DIR = Path(__file__).resolve().parent.parent
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "shall"}]
pattern2 = [{"LOWER": "should"}]
pattern3 = [{"LOWER": "may"}]
pattern4 = [{"LOWER": "must"}]
pattern5 = [{"LOWER": "can"}]
matcher.add("Shall", [pattern1])
matcher.add("Should", [pattern2])
matcher.add("May", [pattern3])
matcher.add("Must", [pattern4])
matcher.add("Can", [pattern5])
root_json_dir = os.path.join(
self.BASE_DIR, "static/data/json/"
) # "../Standards/json"
root_data_json_dir = os.path.join(
self.BASE_DIR, "static/data/"
) # "../Standards/json"
summary_text = "Summary"
data = []
# for logging
number_of_successed_files = 0
number_of_sections = 0
number_of_sections_after_cleaning = 0
number_of_cleaned_sections = 0
# Check if the json directory exists
if os.path.exists(root_json_dir):
# Delete the directory and its contents
shutil.rmtree(root_json_dir)
# Create a new directory
os.makedirs(root_json_dir)
# Configure logging settings
log_file = os.path.join(
self.BASE_DIR, "static/data/json_log.log"
) # "../Standards/json/json_log.log" # Specify the path and filename for the log file
logging.basicConfig(
filename=log_file, # Set the log file
level=logging.DEBUG, # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
format="%(asctime)s - %(levelname)s - %(message)s", # Define log message format
datefmt="%Y-%m-%d %H:%M:%S", # Define date/time format
)
for standard in StandardsList.objects.all():
standard_file_pdf = standard.standardFilePDF
standard_file_word = standard.standardFileWord
json_path = os.path.join(root_json_dir, standard.standardTitle + ".json")
print(json_path)
sections = []
current_section = ""
data_per_file = []
# logging has to come here
number_of_sections_after_cleaning_per_file = 0
try:
word_doc = Document(standard_file_word.path)
for paragraph in word_doc.paragraphs:
if (
paragraph.style.name.startswith("Heading")
or "Section" in paragraph.style.name
or "Sub-section" in paragraph.style.name
):
# If we're starting a new section, save off the old one
if current_section:
sections.append(current_section.strip())
current_section = ""
current_section += paragraph.text + "\n"
else:
# Otherwise, append text to current section
current_section += paragraph.text + "\n"
# Append the last section to the list of sections if it exists
if current_section.strip():
sections.append(current_section.strip())
# print all sections
for index, section in enumerate(sections):
# for logging
number_of_sections += 1
if section != "" and len(section.split()) > 25:
# for logging
number_of_sections_after_cleaning += 1
number_of_sections_after_cleaning_per_file += 1
first_line = section.strip().splitlines()[0]
text_to_search = first_line
                        page_num = self.find_text_in_pdf_from_summary(
                            standard_file_pdf.path, text_to_search, summary_text
                        )
doc = nlp(section)
found_matches = matcher(doc)
shall = should = may = must = can = False
if found_matches:
for match_id, start, end in found_matches:
string_id = nlp.vocab.strings[match_id]
                                # span = doc[start:end]
if string_id == "Shall":
shall = True
if string_id == "Should":
should = True
if string_id == "May":
may = True
if string_id == "Must":
must = True
if string_id == "Can":
can = True
                        section_obj = {
"ID": str(uuid.uuid4()),
"Color": standard.levelID.levelColor,
"Level": str(standard.levelNumber),
"LevelName": "",
"Title": standard.standardTitle,
"Heading1": "",
"Heading2": "",
"Heading3": "",
"Heading4": "",
"Module": standard.standardTitle,
"PageNum": page_num,
"Paragraph": self.custom_json_serialization(section),
"Can": can,
"May": may,
"Shall": shall,
"Should": should,
"Must": must,
}
                        data_per_file.append(section_obj)
                        data.append(section_obj)
else:
# for logging
                        number_of_cleaned_sections += 1
except Exception as e:
print(
f"An error occurred while processing {standard.standardTitle}: {str(e)}"
)
try:
with open(json_path, "w") as json_file:
json.dump(data_per_file, json_file, indent=4)
                logging.info(
                    f"{number_of_sections_after_cleaning_per_file} sections have been saved to: {json_path}"
                )
number_of_successed_files += 1
            except Exception as e:
                logging.error(
                    f"Failed to save {json_path}: {str(e)}"
                )
        logging.info(f"Number of successfully saved files: {number_of_successed_files}")
        logging.info(f"Number of sections: {number_of_sections}")
        logging.info(
            f"Number of sections after cleaning: {number_of_sections_after_cleaning}"
        )
        logging.info(f"Number of sections removed during cleaning: {number_of_cleaned_sections}")
        with open(os.path.join(root_data_json_dir, "data.json"), "w") as json_file:
            # Write the combined data from every standard to a single file
            json.dump(data, json_file, indent=4)
# Close the log file (optional, usually done automatically)
logging.shutdown()
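

# Illustrative usage (a sketch, not part of the original pipeline): inside a Django
# management command, shell, or view where the app registry is already loaded, the
# preprocessing can be triggered as shown below. The import path "data_api" is an
# assumption based on this module's location.
#
#     from data_api.PreprocessFile import PreprocessFile
#
#     preprocessor = PreprocessFile()
#     preprocessor.process_standards()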