IDDRS_API/data_api/PreprocessFile.py

from docx import Document
import os
import fitz
import re
import uuid
import shutil
import json
import logging
from pathlib import Path
import spacy
from spacy.matcher import Matcher
from .models import Levels, StandardsList
from .CreateIndexES import CreateIndexES


class PreprocessFile:
    def __init__(self):
        self.BASE_DIR = Path(__file__).resolve().parent.parent
    def find_summary_page(self, pdf_path, summary_text):
        doc = fitz.open(pdf_path)
        summary_count = 0
        for page_num in range(len(doc)):
            page = doc[page_num]
            page_text = page.get_text("text")
            # Count the occurrences of the summary text on this page
            summary_count += page_text.count(summary_text)
            if summary_count >= 2:
                return page_num
        # The summary text never occurred twice; return None so the caller's
        # `is None` check works as intended
        return None
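
    # A minimal usage sketch (hypothetical path; assumes the standard's PDF
    # lists "Summary" once in the table of contents and once as a section
    # heading, so the running count reaches two on the real summary page):
    #
    #   pre = PreprocessFile()
    #   page = pre.find_summary_page("static/standards/IDDRS-2.10.pdf", "Summary")
    #   if page is not None:
    #       print(f"Summary section starts on 0-based page {page}")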

    def find_text_in_pdf_from_summary(self, pdf_path, search_text, summary_text):
        summary_page = self.find_summary_page(pdf_path, summary_text)
        if summary_page is None:
            # Summary not found in the PDF
            return None
        doc = fitz.open(pdf_path)
        # Start searching on the page after the second "Summary" occurrence
        for page_num in range(summary_page + 1, len(doc)):
            page = doc[page_num]
            page_text = page.get_text("text")
            # Match search_text only when nothing else follows it on the same
            # line (i.e. it stands alone as a heading); re.escape() guards
            # against regex metacharacters in the heading text
            regex_pattern = re.compile(
                rf"\b{re.escape(search_text)}\b(?![^\n]*\S)", re.IGNORECASE
            )
            if regex_pattern.search(page_text):
                # Text found; return the 1-based page number
                return page_num + 1
        # Text not found in the PDF
        return None
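
    # For example, with search_text = "4.2 Future reintegration" the pattern
    # matches the line "4.2 Future reintegration\n" but not
    # "4.2 Future reintegration is discussed below", because the negative
    # lookahead (?![^\n]*\S) rejects any non-whitespace after the match on
    # the same line. (The heading is illustrative, not taken from the data.)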

    # Custom serialization helper
    def custom_json_serialization(self, text):
        # Replace newline characters with spaces
        return text.replace("\n", " ")

    def process_standards(self):
        nlp = spacy.load("en_core_web_sm")
        matcher = Matcher(nlp.vocab)
        # One single-token pattern per modal verb whose presence is recorded
        # for each section
        pattern1 = [{"LOWER": "shall"}]
        pattern2 = [{"LOWER": "should"}]
        pattern3 = [{"LOWER": "may"}]
        pattern4 = [{"LOWER": "must"}]
        pattern5 = [{"LOWER": "can"}]
        matcher.add("Shall", [pattern1])
        matcher.add("Should", [pattern2])
        matcher.add("May", [pattern3])
        matcher.add("Must", [pattern4])
        matcher.add("Can", [pattern5])

        root_json_dir = os.path.join(self.BASE_DIR, "static/data/json/")
        root_data_json_dir = os.path.join(self.BASE_DIR, "static/data/")
        summary_text = "Summary"
        data = []

        # Counters for logging
        number_of_successful_files = 0
        number_of_sections = 0
        number_of_sections_after_cleaning = 0
        number_of_discarded_sections = 0

        # Recreate the json output directory from scratch
        if os.path.exists(root_json_dir):
            # Delete the directory and its contents
            shutil.rmtree(root_json_dir)
        # Create a new directory
        os.makedirs(root_json_dir)

        # Configure logging settings
        log_file = os.path.join(self.BASE_DIR, "static/data/json_log.log")
        logging.basicConfig(
            filename=log_file,  # Set the log file
            level=logging.DEBUG,  # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            format="%(asctime)s - %(levelname)s - %(message)s",  # Define log message format
            datefmt="%Y-%m-%d %H:%M:%S",  # Define date/time format
        )
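
        # With this configuration, a line in json_log.log might look like:
        #   2023-11-20 14:31:13 - INFO - 42 sections have been saved to: <json_path>
        # (illustrative values, not real output)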

        for standard in StandardsList.objects.all():
            standard_file_pdf = standard.standardFilePDF
            standard_file_word = standard.standardFileWord
            json_path = os.path.join(root_json_dir, standard.standardTitle + ".json")
            print(json_path)
            sections = []
            current_section = ""
            data_per_file = []
            number_of_sections_after_cleaning_per_file = 0
            try:
                word_doc = Document(standard_file_word.path)
                for paragraph in word_doc.paragraphs:
                    if (
                        paragraph.style.name.startswith("Heading")
                        or "Section" in paragraph.style.name
                        or "Sub-section" in paragraph.style.name
                    ):
                        # A heading starts a new section, so save off the old one
                        if current_section:
                            sections.append(current_section.strip())
                            current_section = ""
                        current_section += paragraph.text + "\n"
                    else:
                        # Otherwise, append text to the current section
                        current_section += paragraph.text + "\n"
                # Append the last section to the list of sections if it exists
                if current_section.strip():
                    sections.append(current_section.strip())

                for index, section in enumerate(sections):
                    number_of_sections += 1
                    # Keep only sections with more than 25 words
                    if section != "" and len(section.split()) > 25:
                        number_of_sections_after_cleaning += 1
                        number_of_sections_after_cleaning_per_file += 1
                        # The first line of a section is its heading; use it to
                        # locate the section's page in the PDF. Pass the
                        # filesystem path, since fitz.open() expects a
                        # path-like object rather than a Django FieldFile.
                        first_line = section.strip().splitlines()[0]
                        page_num = self.find_text_in_pdf_from_summary(
                            standard_file_pdf.path, first_line, summary_text
                        )
                        doc = nlp(section)
                        found_matches = matcher(doc)
                        # Flag which modal verbs appear in this section
                        shall = should = may = must = can = False
                        for match_id, start, end in found_matches:
                            string_id = nlp.vocab.strings[match_id]
                            if string_id == "Shall":
                                shall = True
                            elif string_id == "Should":
                                should = True
                            elif string_id == "May":
                                may = True
                            elif string_id == "Must":
                                must = True
                            elif string_id == "Can":
                                can = True
                        section_obj = {
                            "ID": str(uuid.uuid4()),
                            "Color": standard.levelID.levelColor,
                            "Level": str(standard.levelNumber),
                            "LevelName": "",
                            "Title": standard.standardTitle,
                            "Heading1": "",
                            "Heading2": "",
                            "Heading3": "",
                            "Heading4": "",
                            "Module": standard.standardTitle,
                            "PageNum": page_num,
                            "Paragraph": self.custom_json_serialization(section),
                            "Can": can,
                            "May": may,
                            "Shall": shall,
                            "Should": should,
                            "Must": must,
                        }
                        data_per_file.append(section_obj)
                        data.append(section_obj)
                    else:
                        # Section discarded by the length filter
                        number_of_discarded_sections += 1
            except Exception as e:
                print(
                    f"An error occurred while processing {standard.standardTitle}: {str(e)}"
                )
            try:
                with open(json_path, "w") as json_file:
                    json.dump(data_per_file, json_file, indent=4)
                logging.info(
                    f"{number_of_sections_after_cleaning_per_file} sections have been saved to: {json_path}"
                )
                number_of_successful_files += 1
            except Exception as e:
                logging.error(f"Failed to save {json_path}: {str(e)}")

        logging.info(f"Number of successfully saved files: {number_of_successful_files}")
        logging.info(f"Number of sections: {number_of_sections}")
        logging.info(
            f"Number of sections after cleaning: {number_of_sections_after_cleaning}"
        )
        logging.info(f"Number of discarded sections: {number_of_discarded_sections}")
        with open(os.path.join(root_data_json_dir, "data.json"), "w") as json_file:
            # Write the combined data for all standards to a single file
            json.dump(data, json_file, indent=4)
        # Flush and close all logging handlers
        logging.shutdown()
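

# A minimal sketch of how this class might be driven outside a Django view
# (assumes a configured Django settings module; "IDDRS_API.settings" is a
# hypothetical name):
#
#   import os
#   import django
#   os.environ.setdefault("DJANGO_SETTINGS_MODULE", "IDDRS_API.settings")
#   django.setup()
#   from data_api.PreprocessFile import PreprocessFile
#   PreprocessFile().process_standards()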