from docx import Document
import os
import fitz
import re
import uuid
import shutil
import json
import logging
from pathlib import Path

import spacy
from spacy.matcher import Matcher

from .models import StandardsList

# from .CreateIndexES import CreateIndexES


class PreprocessFile:
    def __init__(self):
        self.BASE_DIR = Path(__file__).resolve().parent.parent

    def find_summary_page(self, pdf_path, summary_text):
        """Return the index of the page on which summary_text has been seen at
        least twice (cumulatively across pages); fall back to page 0 otherwise."""
        doc = fitz.open(pdf_path)
        summary_count = 0

        for page_num in range(len(doc)):
            page = doc[page_num]
            text_instances = page.get_text("text")

            # Count the occurrences of the summary text on this page
            summary_count += text_instances.count(summary_text)

            if summary_count >= 2:
                return page_num

        # The summary text was never found twice: default to the first page
        return 0

    def find_text_in_pdf_from_summary(self, pdf_path, search_text, summary_text):
        summary_page = self.find_summary_page(pdf_path, summary_text)

        if summary_page is None:
            # print("Summary not found in the PDF.")
            return None

        doc = fitz.open(pdf_path)

        # Match search_text only when nothing else follows it on the same line,
        # i.e. when it stands alone as a heading. Escape it so regex
        # metacharacters in the heading text are matched literally.
        regex_pattern = re.compile(
            rf"\b{re.escape(search_text)}\b(?![^\n]*\S)", re.IGNORECASE
        )

        # Start searching on the pages after the 2nd summary
        for page_num in range(summary_page + 1, len(doc)):
            page = doc[page_num]
            text_instances = page.get_text("text")
            match = regex_pattern.search(text_instances)

            if match:
                # print(f"Text found on page {page_num + 1}, after the 2nd summary.")
                return page_num + 1

        # print("Text not found in the PDF.")
        return None
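
    # Illustrative note (an assumption, not taken from the original code): with
    # search_text = "Scope", the pattern above matches extracted text in which
    # "Scope" is the last non-whitespace token on its line (a bare heading), but
    # not "Scope of this document", because the lookahead rejects any further
    # non-space character on the same line.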

    # Custom serialization function
    def custom_json_serialization(self, text):
        # Replace newline characters with spaces
        return text.replace("\n", " ")

    def process_standards(self):
        # BASE_DIR = Path(__file__).resolve().parent.parent
        nlp = spacy.load("en_core_web_sm")
        matcher = Matcher(nlp.vocab)
        pattern1 = [{"LOWER": "shall"}]
        pattern2 = [{"LOWER": "should"}]
        pattern3 = [{"LOWER": "may"}]
        pattern4 = [{"LOWER": "must"}]
        pattern5 = [{"LOWER": "can"}]

        matcher.add("Shall", [pattern1])
        matcher.add("Should", [pattern2])
        matcher.add("May", [pattern3])
        matcher.add("Must", [pattern4])
        matcher.add("Can", [pattern5])

        root_json_dir = os.path.join(self.BASE_DIR, "media/data/json/")  # "../Standards/json"
        root_data_json_dir = os.path.join(self.BASE_DIR, "media/data/")
        summary_text = "Summary"
        data = []

        # Counters used for logging
        number_of_successful_files = 0
        number_of_sections = 0
        number_of_sections_after_cleaning = 0
        number_of_cleaned_sections = 0

        # Check if the json directory exists
        if os.path.exists(root_json_dir):
            # Delete the directory and its contents
            shutil.rmtree(root_json_dir)

        # Create a new directory
        os.makedirs(root_json_dir)

        # Configure logging settings
        log_file = os.path.join(self.BASE_DIR, "media/data/json_log.log")
        logging.basicConfig(
            filename=log_file,  # Set the log file
            level=logging.DEBUG,  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            format="%(asctime)s - %(levelname)s - %(message)s",  # Log message format
            datefmt="%Y-%m-%d %H:%M:%S",  # Date/time format
        )
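
        # With this configuration a log entry will look roughly like
        # (illustrative example, not an actual log from the project):
        #   2024-01-01 12:00:00 - INFO - 42 sections have been saved to: .../json/Example.json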

        for standard in StandardsList.objects.all():
            standard_file_pdf = standard.standardFilePDF
            standard_file_word = standard.standardFileWord
            json_path = os.path.join(root_json_dir, standard.standardTitle + ".json")
            print(json_path)
            sections = []
            current_section = ""
            data_per_file = []
            # per-file counter for logging
            number_of_sections_after_cleaning_per_file = 0
            try:
                print(standard_file_pdf.path)
                print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
                word_doc = Document(standard_file_word.path)
                for paragraph in word_doc.paragraphs:
                    if (
                        paragraph.style.name.startswith("Heading")
                        or "Section" in paragraph.style.name
                        or "Sub-section" in paragraph.style.name
                    ):
                        # If we're starting a new section, save off the old one
                        if current_section:
                            sections.append(current_section.strip())
                            current_section = ""
                        current_section += paragraph.text + "\n"
                    else:
                        # Otherwise, append text to the current section
                        current_section += paragraph.text + "\n"

                # Append the last section to the list of sections if it exists
                if current_section.strip():
                    sections.append(current_section.strip())

                # Process all sections
                for index, section in enumerate(sections):
                    # for logging
                    number_of_sections += 1
                    if section != "" and len(section.split()) > 25:
                        # for logging
                        number_of_sections_after_cleaning += 1
                        number_of_sections_after_cleaning_per_file += 1

                        first_line = section.strip().splitlines()[0]
                        text_to_search = first_line
                        page_num = self.find_text_in_pdf_from_summary(
                            standard_file_pdf.path, text_to_search, summary_text
                        )

                        doc = nlp(section)
                        found_matches = matcher(doc)
                        shall = should = may = must = can = False
                        if found_matches:
                            for match_id, start, end in found_matches:
                                string_id = nlp.vocab.strings[match_id]
                                # span = doc[start:end]
                                if string_id == "Shall":
                                    shall = True
                                if string_id == "Should":
                                    should = True
                                if string_id == "May":
                                    may = True
                                if string_id == "Must":
                                    must = True
                                if string_id == "Can":
                                    can = True

                        section_obj = {
                            "ID": str(uuid.uuid4()),
                            "Color": standard.levelID.levelColor,
                            "Level": str(standard.levelNumber),
                            "LevelName": "",
                            "Title": standard.standardTitle,
                            "Heading1": "",
                            "Heading2": "",
                            "Heading3": "",
                            "Heading4": "",
                            "Module": standard.standardTitle,
                            "PageNum": page_num,
                            "Paragraph": self.custom_json_serialization(section),
                            "Can": can,
                            "May": may,
                            "Shall": shall,
                            "Should": should,
                            "Must": must,
                        }
                        print(section_obj)
                        data_per_file.append(section_obj)
                        data.append(section_obj)
                    else:
                        # for logging: section discarded as empty or too short
                        number_of_cleaned_sections += 1
            except Exception as e:
                print(
                    f"An error occurred while processing {standard.standardTitle}: {str(e)}"
                )

            try:
                with open(json_path, "w") as json_file:
                    json.dump(data_per_file, json_file, indent=4)
                logging.info(
                    f"{number_of_sections_after_cleaning_per_file} sections have been saved to: {json_path}"
                )
                number_of_successful_files += 1
            except Exception as e:
                logging.error(f"Failed to save {json_path}: {str(e)}")

        logging.info(f"Number of successfully saved files: {number_of_successful_files}")
        logging.info(f"Number of sections: {number_of_sections}")
        logging.info(
            f"Number of sections after cleaning: {number_of_sections_after_cleaning}"
        )
        logging.info(f"Number of cleaned sections: {number_of_cleaned_sections}")

        with open(os.path.join(root_data_json_dir, "data.json"), "w") as json_file:
            # Use json.dump to write the combined data to the file
            json.dump(data, json_file, indent=4)  # indent for pretty formatting (optional)

        # Close the log file (optional, usually done automatically)
        logging.shutdown()
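
# Usage sketch (an assumption, not part of the original module): in a Django
# project where this file lives next to models.py, the preprocessing step could
# be triggered from a shell or a management command roughly like this:
#
#     from myapp.preprocess import PreprocessFile  # hypothetical module path
#
#     PreprocessFile().process_standards()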