renamed and updated

2023-12-01 21:47:45 -05:00 · 2023-12-01 21:47:45 -05:00 · a1e69b1368
commit a1e69b1368
parent 8c767b5235
2 changed files with 469 additions and 100 deletions
--- a/html_parse.py
+++ b/html_parse.py
@ -0,0 +1,126 @@
+import time
+from pathlib import Path
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+from loguru import logger
+import string
+import shutil
+from html.parser import HTMLParser
+
+
+def roman_to_int(number: str) -> int:
+    if number is None:
+        return 0
+
+    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+    total = 0
+    for i in range(len(number)-1,-1,-1):
+        num = roman[number[i]]
+        if 3*num < total:
+            total -= num
+        else:
+            total += num
+
+    return total
+
+
+# File rename...
+# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
+# The text within the '[]' should be the name of the file.
+def get_filename(soup: BeautifulSoup) -> tuple:
+    title = soup.title.string
+    title = (title
+             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
+             .replace("].", ''))
+
+    *book, _ = title.replace('Chapter', '').split()
+    book = ' '.join(book)
+    book = book.strip(string.punctuation).title()
+    book = (
+        book.replace('First', '1').
+        replace('Second', '2').
+        replace('Third', '3')
+    )
+
+    if 'Introduction' in title:
+        roman = ''
+    elif 'CHAPTER' in title.upper():
+        *_, roman = title.upper().split('CHAPTER')
+    else:
+        *_, roman = title.split()
+
+    num = roman_to_int(roman.strip())
+    file = f'{book} {num}' if num else book
+
+    return book.strip(), file.strip()
+
+
+def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
+    md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
+    # print(md_file)
+    md_file.parent.mkdir(parents=True, exist_ok=True)
+    md_file.touch(exist_ok=True)
+    return md_file
+
+
+def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
+    # def get_string(block):
+    #     if list(block.decendents) > 1:
+    #         for i in block.decendents:
+
+
+
+    for block in soup.find_all('p'):
+        for text in block.stripped_strings:
+            print(repr(text))
+    #     for tag in block:
+    #         if tag is None:
+    #             continue
+    #
+    #         if not list(tag.children):
+    #             print(tag)
+    #             continue
+    #
+    #         for s in tag.children:
+    #             print(s)
+    #
+    # for block in soup.find_all('p'):
+    #     print(list(block.childGenerator()))
+    #         # match tag.name:
+    #             case None:
+    #                 print(tag)
+    #
+    #             case 'b':
+    #                 print('\t', tag.name)
+    #                 print('\t\t', tag)
+    #                 # file.write(f"**{tag.string}**")
+    #
+    #             case 'i':
+    #                 print('\t', tag.name)
+    #                 print('\t\t', tag)
+    #                 # file.write(f"*{tag.string}*")
+    #
+    #             case _:
+    #                 print('\t', tag.name)
+    #                 print('\t\t', tag)
+    #
+    #
+    #             file.write('\n')
+    #
+    #         file.write('\n')
+
+
+
+if __name__ == '__main__':
+    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
+
+    htm_files = list(folder.glob('*.HTM'))
+    output_folder = Path('./Matthew Henry Commentary').absolute()
+    # for file in tqdm(htm_files[7:8]):
+    for file in htm_files[7:8]:
+        soup = BeautifulSoup(file.read_text(), 'html.parser')
+
+        book, chapter = get_filename(soup)
+        md_file = create_md_file(output_folder, book, chapter)
+
+        write_blocks(soup, md_file)
--- a/main.py
+++ b/main.py
@ -1,126 +1,369 @@
-import time
-from pathlib import Path
-from bs4 import BeautifulSoup
-from tqdm import tqdm
-from loguru import logger
+import re
 import string
-import shutil
+import sys
 from html.parser import HTMLParser
+from pathlib import Path
+
+import pythonbible as bible
+from bs4 import BeautifulSoup
+from loguru import logger
+from tqdm import tqdm
+
+base = Path('./Matthew Henry Commentary/xml').absolute()
+
+logger.remove(0)
+logger.add('log.txt')
+
+class MyHTMLParser(HTMLParser):
+    file = None
+    passage_verse = None
+    tag = None
+    tag_type = None  # 'start' or 'end'
+    attrs = dict()
+
+    # All the patterns are for re.sub(). This is specifically for getting
+    # list/sublist into the correct position. The first item is for the pattern
+    # to find and the second is the replacement pattern. The replacement pattern
+    # also ids the list so that self.clean_file() can later properly add
+    # the correct amount of `\t` to the line.
+    patterns = [
+        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
+        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
+
+        # These share the same id. There are times where the two are mixed
+        # within the parent lists.
+        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
+        (r"(\S\s)(\d+\.)(\s\b|\W)",
+         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),


-def roman_to_int(number: str) -> int:
-    if number is None:
-        return 0
+        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
+         lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
+    ]

-    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
-    total = 0
-    for i in range(len(number)-1,-1,-1):
-        num = roman[number[i]]
-        if 3*num < total:
-            total -= num
-        else:
-            total += num
+    @staticmethod
+    def roman_to_int(number: str) -> int:
+        if number is None:
+            return 0

-    return total
+        # Sometimes the roman numeral comes with a '.'. Stripping to not cause
+        # issues
+        number = number.strip('.')

+        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+        total = 0
+        for i in range(len(number) - 1, -1, -1):
+            num = roman[number[i]]
+            if 3 * num < total:
+                total -= num
+            else:
+                total += num

-# File rename...
-# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
-# The text within the '[]' should be the name of the file.
-def get_filename(soup: BeautifulSoup) -> tuple:
-    title = soup.title.string
-    title = (title
-             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
-             .replace("].", ''))
+        return total

-    *book, _ = title.replace('Chapter', '').split()
-    book = ' '.join(book)
-    book = book.strip(string.punctuation).title()
-    book = (
-        book.replace('First', '1').
-        replace('Second', '2').
-        replace('Third', '3')
-    )
+    def create_md_file(self, base_dir: Path, folder: str, file: str):
+        """ Create the path of the md file that will be written to as the
+        program processes the xml file.
+        """

-    if 'Introduction' in title:
-        roman = ''
-    elif 'CHAPTER' in title.upper():
-        *_, roman = title.upper().split('CHAPTER')
-    else:
-        *_, roman = title.split()
+        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
+        md_file.parent.mkdir(parents=True, exist_ok=True)
+        md_file.unlink(missing_ok=True)
+        # print(md_file.absolute())
+        self.file = md_file
+        return md_file

-    num = roman_to_int(roman.strip())
-    file = f'{book} {num}' if num else book
+    def write_to_file(self, data):

-    return book.strip(), file.strip()
+        if self.file is None:
+            raise ValueError('No File specified')

+        # Here the list/sublist are searched for and altered for later
+        # processing in self.clean_file()
+        for pattern, sub in self.patterns:
+            # print(f'Running Pattern: {pattern}')
+            data = re.sub(pattern, sub, data)

-def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
-    md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
-    # print(md_file)
-    md_file.parent.mkdir(parents=True, exist_ok=True)
-    md_file.touch(exist_ok=True)
-    return md_file
+        # print(f'writing: {data!r}')

+        with open(self.file, 'a+') as file:
+            file.write(data)

-def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
-    # def get_string(block):
-    #     if list(block.decendents) > 1:
-    #         for i in block.decendents:
+    def clean_file(self):
+        logger.info(' CLEANING FILE')

+        if self.file is None:
+            raise ValueError('No File specified')

+        # Read all the lines
+        with open(self.file, 'r') as file:
+            lines = file.readlines()

-    for block in soup.find_all('p'):
-        for text in block.stripped_strings:
-            print(repr(text))
-    #     for tag in block:
-    #         if tag is None:
-    #             continue
-    #
-    #         if not list(tag.children):
-    #             print(tag)
-    #             continue
-    #
-    #         for s in tag.children:
-    #             print(s)
-    #
-    # for block in soup.find_all('p'):
-    #     print(list(block.childGenerator()))
-    #         # match tag.name:
-    #             case None:
-    #                 print(tag)
-    #
-    #             case 'b':
-    #                 print('\t', tag.name)
-    #                 print('\t\t', tag)
-    #                 # file.write(f"**{tag.string}**")
-    #
-    #             case 'i':
-    #                 print('\t', tag.name)
-    #                 print('\t\t', tag)
-    #                 # file.write(f"*{tag.string}*")
-    #
-    #             case _:
-    #                 print('\t', tag.name)
-    #                 print('\t\t', tag)
-    #
-    #
-    #             file.write('\n')
-    #
-    #         file.write('\n')
+        indent = 0  # Current number of '\t' to add in front of the list number
+        sublist = dict()  # Track the indent number based on the id.
+        with open(self.file, 'w') as file:
+            file.write('# Chapter Introduction\n\n')

+            header = None
+            for line in lines:
+                # Do not write blank lines
+                if not line.strip():
+                    continue
+
+                # Main header, if a header has already been written skip it
+                if line.strip().startswith('# '):
+                    if header == line.strip():
+                        continue
+                    header = line.strip()
+
+                # Add a `\n` if line is a heading.
+                if line.startswith('#'):
+                    line = f'\n{line.strip()}'
+
+                # If line is not a list then add a `\n` after the paragraph,
+                # and reset the indent tracker
+                if line.strip()[0] not in string.digits:
+                    line = f'{line}\n'
+                    # print('resetting indent')
+                    indent = 0
+                    sublist.clear()
+                else:
+                    # Get the list id
+                    start = line.find('~')
+                    end = start + 2
+                    list_id = line[start:end]
+
+                    # Get the number of indents based on the list id, if it does
+                    # not exist then set the value to the max value +1. If this
+                    # is the first it will be set to 0 (-1 +1 = 0)
+                    indents = "\t" * sublist.setdefault(
+                        list_id, max(sublist.values(), default=-1)+1
+                    )
+                    line = f'{indents}{line.replace(list_id, "")}'
+
+                if line.startswith('Passage: '):
+                    line = line.removeprefix('Passage: ')
+
+                file.write(line)
+
+    @staticmethod
+    def normalize_osis_verses(verses):
+        """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
+        and turns it into something more readable"""
+        normalized_refs = list()
+        for chunks in verses.removeprefix("Bible:").split('Bible:'):
+            spanned_verses = list()
+            for ref in chunks.split('-'):
+                verse = ref.replace('.', ' ', 1).replace('.', ':')
+                spanned_verses.append(
+                    bible.format_scripture_references(
+                        bible.get_references(verse.strip())
+                    )
+                )
+
+            try:
+                normalized = bible.format_scripture_references(
+                    bible.get_references('-'.join(spanned_verses))
+                )
+            except:
+                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
+                raise
+            normalized_refs.append(normalized)
+
+        return ';'.join(normalized_refs)
+
+    @staticmethod
+    def obsidian_links(verses):
+        """Take `Book Chapter:Ver-Ver` and turn it into an obsidian bible link"""
+        links = []
+        for verse in verses.split(';'):
+            links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
+
+        return '; '.join(links)
+
+    def convert_passage(self, verses):
+        book = self.file.parts[-2]
+        chapter = None
+        ref = []
+        for ver in verses.split(','):
+            try:
+                chapter, v = ver.split(':')
+            except:
+                v = ver
+
+            r = []
+            for i in v.split('-'):
+                r.append(f'{book}.{chapter}.{i}')
+
+            ref.append(f'Bible:{"-".join(r)}')
+        x = ' '.join(ref)
+        return f'osisRef="{x}"'
+
+    def process_tag(self, test=None):
+        match self.tag:
+            case 'b':  # Bold
+                self.write_to_file('**')
+
+            case 'i':  # Italics
+                self.write_to_file('*')
+
+            case 'scripcom':
+                # This saves the verse for the section.
+                if self.tag_type == 'start':
+                    if self.attrs['type'] == 'Commentary':
+                        self.passage_verse = self.attrs['osisref']
+
+            case 'h4':  # this is the section header. Need to keep track of it per file.
+                if self.tag_type == 'start':
+                    self.write_to_file('\n# ')
+
+                if self.tag_type == 'end':
+                    self.write_to_file('\n')
+
+                    #
+                    if self.passage_verse:
+                        try:
+                            verse = bible.get_references(
+                                self.normalize_osis_verses(
+                                    self.passage_verse)
+                            )[0]
+                        except:
+                            logger.debug(self.passage_verse)
+                            raise
+
+                        self.write_to_file('## Verses: ')
+                        self.write_to_file(
+                            f'{verse.start_verse} - {verse.end_verse}'
+                        )
+                        self.write_to_file('\n\n')
+
+            case 'scripref':  # Scripture ref
+                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
+                #  bible.format_scripture_references(bible.get_references(attrs['osisref']))
+                if self.tag_type == 'start':
+                    if 'osisref' in self.attrs:
+                        verses = self.normalize_osis_verses(
+                            self.attrs['osisref']
+                        )
+                    else:
+                        verses = self.normalize_osis_verses(
+                            self.convert_passage(self.attrs['passage'])
+                        )
+
+                    self.write_to_file(self.obsidian_links(verses))
+
+            case 'p':  # Paragraph
+                if self.tag_type == 'start':
+                    if self.attrs.get('class', False) == 'passage':
+                        # Need this so the regex in self.clean does not pick
+                        # this up. Will also be cleaned in the same function.
+                        self.write_to_file('Passage: ')
+                if self.tag_type == 'end':
+                    self.write_to_file('\n\n')
+
+            case 'li':
+                if self.tag_type == 'start':
+                    self.write_to_file('- ')
+                else:
+                    self.write_to_file('\n')
+
+            # case _:
+            #     # print(tag.upper())
+            #     pass
+
+    def handle_starttag(self, tag, attrs):
+        # Set tag trackers
+        self.tag_type = 'start'
+        self.tag = tag
+        self.attrs = dict(attrs)
+
+        self.process_tag()
+
+    def handle_endtag(self, tag):
+        # print(f'End: {tag}')
+        self.tag = tag
+        self.tag_type = 'end'
+        self.process_tag()
+
+        # Clear tag tracker
+        self.tag = None
+        self.tag_type = None
+        self.attrs.clear()
+
+    def handle_data(self, data):
+        # print(f'Data: {data!r}')
+
+        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
+            return
+
+        if data == '\n':
+            return
+
+        data = data.replace('—', '-- ')
+        data = data.replace('GOD', 'God')
+
+        self.write_to_file(data.replace('\n', ' '))


 if __name__ == '__main__':
-    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
+    parser = MyHTMLParser()

-    htm_files = list(folder.glob('*.HTM'))
-    output_folder = Path('./Matthew Henry Commentary').absolute()
-    # for file in tqdm(htm_files[7:8]):
-    for file in htm_files[7:8]:
-        soup = BeautifulSoup(file.read_text(), 'html.parser')
+    # Process each volume file.
+    pbar = tqdm(total=1273)
+    for vol in sorted(Path('./').glob('*.xml')):
+        logger.info(vol)
+        soup = BeautifulSoup(vol.read_text(), 'xml')

-        book, chapter = get_filename(soup)
-        md_file = create_md_file(output_folder, book, chapter)
+        # Get each book in the volume
+        # This will be the main folder for all the book's chapters
+        for book in soup.find_all('div1'):
+            logger.info(book['title'])
+            book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')

-        write_blocks(soup, md_file)
+            # These are the chapters/files for each book folder
+            for chapter in book.find_all('div2'):
+                logger.info(chapter['title'])
+
+                filename = chapter['title']
+
+                # Remove Roman Numerals from file name
+                if 'CHAPTER' in filename.upper():
+                    _, roman_num = chapter['title'].split(' ')
+                    filename = f'Chapter {parser.roman_to_int(roman_num)}'
+
+                parser.create_md_file(base, book_name, filename)
+
+                pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
+
+                # Parse the text of the chapter
+                # May need to add a loop to look in all <p class='passage'> and
+                # unwrap any scripref that is in them. The place Dan is getting confused
+                # with the book Daniel.
+                for passage in chapter.find_all('p', {'class': 'passage'}):
+                    # print(passage.text)
+                    new_t = soup.new_tag('p', attrs=passage.attrs)
+                    new_t.string = passage.text
+                    passage.replace_with(new_t)
+
+                # Clearing all scripRef of internal text. That way if it
+                # had other tags in it they would not be processed
+                for scripture in chapter.find_all('scripRef'):
+                    scripture.clear()
+
+                for doc in chapter.children:
+
+                    try:
+                        parser.feed(str(doc))
+                    except Exception as e:
+                        logger.exception(e)
+                        logger.debug(chapter.attrs)
+                        raise
+
+                parser.clean_file()
+                pbar.update(1)
+
+                # if pbar.last_print_n > 2:
+                #     sys.exit()
+
+    pbar.close()
+    parser.close()