from docx import Document
import os
import fitz
import re
import uuid
import shutil
import json
import logging
from pathlib import Path

import spacy
from spacy.matcher import Matcher

from .models import StandardsList

# from .CreateIndexES import CreateIndexES


class PreprocessFile:
    def __init__(self):
        self.BASE_DIR = Path(__file__).resolve().parent.parent

    def find_summary_page(self, pdf_path, summary_text):
        """Return the index of the page on which summary_text has been seen at
        least twice (cumulatively across pages); fall back to page 0 otherwise."""
        doc = fitz.open(pdf_path)
        summary_count = 0

        for page_num in range(len(doc)):
            page = doc[page_num]
            text_instances = page.get_text("text")

            # Count the occurrences of the summary text on this page
            summary_count += text_instances.count(summary_text)

            if summary_count >= 2:
                return page_num

        # The summary text was never found twice: default to the first page
        return 0

    def find_text_in_pdf_from_summary(self, pdf_path, search_text, summary_text):
        summary_page = self.find_summary_page(pdf_path, summary_text)

        if summary_page is None:
            # print("Summary not found in the PDF.")
            return None

        doc = fitz.open(pdf_path)

        # Match search_text only when nothing else follows it on the same line,
        # i.e. when it stands alone as a heading. Escape it so regex
        # metacharacters in the heading text are matched literally.
        regex_pattern = re.compile(
            rf"\b{re.escape(search_text)}\b(?![^\n]*\S)", re.IGNORECASE
        )

        # Start searching on the pages after the 2nd summary
        for page_num in range(summary_page + 1, len(doc)):
            page = doc[page_num]
            text_instances = page.get_text("text")
            match = regex_pattern.search(text_instances)

            if match:
                # print(f"Text found on page {page_num + 1}, after the 2nd summary.")
                return page_num + 1

        # print("Text not found in the PDF.")
        return None
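
    # Illustrative note (an assumption, not taken from the original code): with
    # search_text = "Scope", the pattern above matches extracted text in which
    # "Scope" is the last non-whitespace token on its line (a bare heading), but
    # not "Scope of this document", because the lookahead rejects any further
    # non-space character on the same line.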

    # Custom serialization function
    def custom_json_serialization(self, text):
        # Replace newline characters with spaces
        return text.replace("\n", " ")

    def process_standards(self):
        # BASE_DIR = Path(__file__).resolve().parent.parent
        nlp = spacy.load("en_core_web_sm")
        matcher = Matcher(nlp.vocab)
        pattern1 = [{"LOWER": "shall"}]
        pattern2 = [{"LOWER": "should"}]
        pattern3 = [{"LOWER": "may"}]
        pattern4 = [{"LOWER": "must"}]
        pattern5 = [{"LOWER": "can"}]

        matcher.add("Shall", [pattern1])
        matcher.add("Should", [pattern2])
        matcher.add("May", [pattern3])
        matcher.add("Must", [pattern4])
        matcher.add("Can", [pattern5])

        root_json_dir = os.path.join(self.BASE_DIR, "media/data/json/")  # "../Standards/json"
        root_data_json_dir = os.path.join(self.BASE_DIR, "media/data/")
        summary_text = "Summary"
        data = []

        # Counters used for logging
        number_of_successful_files = 0
        number_of_sections = 0
        number_of_sections_after_cleaning = 0
        number_of_cleaned_sections = 0

        # Check if the json directory exists
        if os.path.exists(root_json_dir):
            # Delete the directory and its contents
            shutil.rmtree(root_json_dir)

        # Create a new directory
        os.makedirs(root_json_dir)

        # Configure logging settings
        log_file = os.path.join(self.BASE_DIR, "media/data/json_log.log")
        logging.basicConfig(
            filename=log_file,  # Set the log file
            level=logging.DEBUG,  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            format="%(asctime)s - %(levelname)s - %(message)s",  # Log message format
            datefmt="%Y-%m-%d %H:%M:%S",  # Date/time format
        )
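
        # With this configuration a log entry will look roughly like
        # (illustrative example, not an actual log from the project):
        #   2024-01-01 12:00:00 - INFO - 42 sections have been saved to: .../json/Example.json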

        for standard in StandardsList.objects.all():
            standard_file_pdf = standard.standardFilePDF
            standard_file_word = standard.standardFileWord
            json_path = os.path.join(root_json_dir, standard.standardTitle + ".json")
            print(json_path)
            sections = []
            current_section = ""
            data_per_file = []
            # per-file counter for logging
            number_of_sections_after_cleaning_per_file = 0
            try:
                print(standard_file_pdf.path)
                print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
                word_doc = Document(standard_file_word.path)
                for paragraph in word_doc.paragraphs:
                    if (
                        paragraph.style.name.startswith("Heading")
                        or "Section" in paragraph.style.name
                        or "Sub-section" in paragraph.style.name
                    ):
                        # If we're starting a new section, save off the old one
                        if current_section:
                            sections.append(current_section.strip())
                            current_section = ""
                        current_section += paragraph.text + "\n"
                    else:
                        # Otherwise, append text to the current section
                        current_section += paragraph.text + "\n"

                # Append the last section to the list of sections if it exists
                if current_section.strip():
                    sections.append(current_section.strip())

                # Process all sections
                for index, section in enumerate(sections):
                    # for logging
                    number_of_sections += 1
                    if section != "" and len(section.split()) > 25:
                        # for logging
                        number_of_sections_after_cleaning += 1
                        number_of_sections_after_cleaning_per_file += 1

                        first_line = section.strip().splitlines()[0]
                        text_to_search = first_line
                        page_num = self.find_text_in_pdf_from_summary(
                            standard_file_pdf.path, text_to_search, summary_text
                        )

                        doc = nlp(section)
                        found_matches = matcher(doc)
                        shall = should = may = must = can = False
                        if found_matches:
                            for match_id, start, end in found_matches:
                                string_id = nlp.vocab.strings[match_id]
                                # span = doc[start:end]
                                if string_id == "Shall":
                                    shall = True
                                if string_id == "Should":
                                    should = True
                                if string_id == "May":
                                    may = True
                                if string_id == "Must":
                                    must = True
                                if string_id == "Can":
                                    can = True

                        section_obj = {
                            "ID": str(uuid.uuid4()),
                            "Color": standard.levelID.levelColor,
                            "Level": str(standard.levelNumber),
                            "LevelName": "",
                            "Title": standard.standardTitle,
                            "Heading1": "",
                            "Heading2": "",
                            "Heading3": "",
                            "Heading4": "",
                            "Module": standard.standardTitle,
                            "PageNum": page_num,
                            "Paragraph": self.custom_json_serialization(section),
                            "Can": can,
                            "May": may,
                            "Shall": shall,
                            "Should": should,
                            "Must": must,
                        }
                        print(section_obj)
                        data_per_file.append(section_obj)
                        data.append(section_obj)
                    else:
                        # for logging: section discarded as empty or too short
                        number_of_cleaned_sections += 1
            except Exception as e:
                print(
                    f"An error occurred while processing {standard.standardTitle}: {str(e)}"
                )

            try:
                with open(json_path, "w") as json_file:
                    json.dump(data_per_file, json_file, indent=4)
                logging.info(
                    f"{number_of_sections_after_cleaning_per_file} sections have been saved to: {json_path}"
                )
                number_of_successful_files += 1
            except Exception as e:
                logging.error(f"Failed to save {json_path}: {str(e)}")

        logging.info(f"Number of successfully saved files: {number_of_successful_files}")
        logging.info(f"Number of sections: {number_of_sections}")
        logging.info(
            f"Number of sections after cleaning: {number_of_sections_after_cleaning}"
        )
        logging.info(f"Number of cleaned sections: {number_of_cleaned_sections}")

        with open(os.path.join(root_data_json_dir, "data.json"), "w") as json_file:
            # Use json.dump to write the combined data to the file
            json.dump(data, json_file, indent=4)  # indent for pretty formatting (optional)

        # Close the log file (optional, usually done automatically)
        logging.shutdown()
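
# Usage sketch (an assumption, not part of the original module): in a Django
# project where this file lives next to models.py, the preprocessing step could
# be triggered from a shell or a management command roughly like this:
#
#     from myapp.preprocess import PreprocessFile  # hypothetical module path
#
#     PreprocessFile().process_standards()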