mh_parser/xml_parse.py

import string

import pythonbible as bible
from pathlib import Path
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import re

base = Path('./Matthew Henry Commentary/xlm').absolute()

class MyHTMLParser(HTMLParser):
    file = None
    tag = None
    tag_type = None
    attrs = dict()
    patterns = [
        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~1'),
        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~2'),
        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~3'),
        (r"(\S\s)(\d+\.)(\s\b|\W)", lambda x: f'{x.group(1)}\n{x.group(2)}~3{x.group(3)}'),  # THis may need to be added to the cleanup script
        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.", lambda x: f'\n{parser.roman_to_int(x.group())}.~5'),
    ]

    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0

        number = number.strip('.')

        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
            num = roman[number[i]]
            if 3 * num < total:
                total -= num
            else:
                total += num

        return total

    def create_md_file(self, base_dir: Path, folder: str, file: str):
        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
        md_file.parent.mkdir(parents=True, exist_ok=True)
        md_file.unlink(missing_ok=True)
        self.file = md_file

    def write_to_file(self, data):
        # if self.writeable is False:
        #     return

        if self.file is None:
            raise ValueError('No File specified')

        for pattern, sub in self.patterns:
            print(f'Running Pattern: {pattern}')
            data = re.sub(pattern, sub, data)

        print(f'writing: {data!r}')

        with open(self.file, 'a+') as file:
            file.write(data)

    def clean_file(self):
        print(' CLEANING FILE')

        if self.file is None:
            raise ValueError('No File specified')

        with open(self.file, 'r') as file:
            lines = file.readlines()

        with open(self.file, 'w') as file:
            indent = 0
            sublist = dict()
            for line in lines:
                if line.strip():
                    if line.startswith('#'):
                        line = f'\n{line}'

                    if line.strip()[0] not in string.digits:
                        line = f'{line}\n'
                        # print('resetting indent')
                        indent = 0
                        sublist.clear()
                    else:
                        start = line.find('~')
                        end = start + 2
                        list_id = line[start:end]
                        if list_id not in sublist:
                            sublist[list_id] = indent
                            indent += 1
                            # print(f'{indent=}')
                            # print(f'{sublist=}')

                        line = f'{"\t" * sublist[list_id]}{line.replace(list_id, "")}'
                        # line = f'{"\t" * sublist[list_id]}{line}'

                        # print(f'{sublist[list_id]=}, {list_id=}')

                    file.write(line)

    @staticmethod
    def normalize_verses(verses):
        """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
        and turns it into somthing more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = list()
            for ref in chunks.split('-'):
                verse = ref.replace('.', ' ', 1).replace('.', ':')
                spanned_verses.append(verse)

            normalized = bible.format_scripture_references(
                bible.get_references(str(spanned_verses))
            )
            normalized_refs.append(normalized)

        return ';'.join(normalized_refs)

    @staticmethod
    def obsidian_links(verses):
        links = []
        for verse in verses.split(';'):
            links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")

        return '; '.join(links)

    def process_tag(self, test=None):
        match self.tag:
            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
                    self.write_to_file('\n## ')

                if self.tag_type == 'end':
                    self.write_to_file('\n\n')
            case 'b':
                self.write_to_file('**')
            case 'i':
                self.write_to_file('*')
            case 'hr':
                pass
            case 'scripref':  # Scripture ref
                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
                #  bible.format_scripture_references(bible.get_references(attrs['osisref']))
                if self.tag_type == 'start':
                    verse_link = self.obsidian_links(
                        self.normalize_verses(self.attrs['osisref'])
                    )
                    self.write_to_file(verse_link)
            case 'p':
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
                        self.write_to_file('Passage: ')
                if self.tag_type == 'end':
                    self.write_to_file('\n\n')
            # case 'html':
            #     if self.tag_type == 'end':
            #         self.clean_file()
            case _:
                # print(tag.upper())
                pass

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        print(f'Start: {tag}, {attrs}')
        self.tag_type = 'start'
        self.tag = tag
        self.attrs = attrs
        self.process_tag(f'what what {tag}')

    def handle_endtag(self, tag):
        print(f'End: {tag}')
        self.tag = tag
        self.tag_type = 'end'
        self.process_tag()

        self.tag = None
        self.tag_type = None
        self.attrs.clear()

    def handle_data(self, data):
        print(f'Data: {data!r}')

        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
            return

        if data == '\n':
            return

        data = data.replace('—', '-- ')
        data = data.replace('GOD', 'God')

        self.write_to_file(data.replace('\n', ' '))


parser = MyHTMLParser()
soup = BeautifulSoup(Path('./vol_1.xml').read_text(), 'xml')

# Clearing all scripRef of internal text. That way if it
# had other tags in it would not be processed
for i in soup.find_all('scripRef'):
    i.clear()

# May need to add a loop to look in all <p class='passage> and
# unrap any scripref that is in them. the place Dan is getting confused
# with the book Daniel.
# ??????????
# for passage in soup.find_all('p', class='passage'):
#     for ref in passafe.find_all('scripRef'):
#         ref.unwrap()


for book in soup.find_all('div1')[2:]:
    # this is the folder for the books
    print(book.attrs)

    # These are the chapters/files for each folder
    # for chapter in book.find_all('div2')[:2]:
    for chapter in book.find_all('div2'):
        print('\t', chapter.attrs)
        parser.create_md_file(base, book['title'], f'{chapter['title']}')

        # this is the text that needs parsing
        for doc in chapter.children:
        # for doc in chapter.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        # for doc in chapter.find_all('scripRef'):
        # for doc in chapter.find_all('span'):
            # sr = str(doc['osisRef'])
            # passage = parser.normalize_verses(sr)
            # print('\t\tPassage: ', passage)
            # print('\t\tLink: ', parser.obsidian_links(passage))

            # print('\t\tDATA: ', str(doc).strip())
            parser.feed(str(doc))
            # pass

        parser.clean_file()