renamed and updated

2023-12-01 21:47:45 -05:00 · 2023-12-01 21:47:45 -05:00 · a1e69b1368
commit a1e69b1368
parent 8c767b5235
2 changed files with 469 additions and 100 deletions
--- a/html_parse.py
+++ b/html_parse.py
@ -0,0 +1,126 @@
 import time
 from pathlib import Path
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 from loguru import logger
 import string
 import shutil
 from html.parser import HTMLParser
 def roman_to_int(number: str) -> int:
    """Convert a Roman numeral string (upper-case symbols) to an integer.

    ``None`` and the empty string both yield 0. Any character that is not
    one of ``IVXLCDM`` raises ``KeyError``.
    """
    if number is None:
        return 0
    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    # Walk the numeral right-to-left. A symbol is treated as subtractive
    # (the 'I' in 'IV') when three times its value is still below the
    # running total; otherwise it is added.
    for symbol in reversed(number):
        value = values[symbol]
        total += -value if 3 * value < total else value
    return total
 # File rename...
 # <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
 # The text within the '[]' should be the name of the file.
 def get_filename(soup: BeautifulSoup) -> tuple:
    """Derive the output folder and file name from the page's <TITLE>.

    The title is expected to look like
    "Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II]."
    and only the text inside the brackets is used.

    Returns a ``(book, file)`` tuple: ``book`` is the title-cased book name
    with 'First'/'Second'/'Third' replaced by 1/2/3, and ``file`` is the
    book name followed by the chapter number (the bare book name when the
    chapter number is 0, e.g. for an introduction).
    """
    inner = (
        soup.title.string
        .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
        .replace("].", '')
    )

    # Everything but the last word (the chapter numeral, or the word
    # 'Introduction') makes up the book name.
    *name_words, _ = inner.replace('Chapter', '').split()
    book = ' '.join(name_words).strip(string.punctuation).title()
    for ordinal, digit in (('First', '1'), ('Second', '2'), ('Third', '3')):
        book = book.replace(ordinal, digit)

    # Locate the Roman numeral of the chapter, if the title has one.
    if 'Introduction' in inner:
        roman = ''
    elif 'CHAPTER' in inner.upper():
        roman = inner.upper().split('CHAPTER')[-1]
    else:
        roman = inner.split()[-1]

    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book
    return book.strip(), file.strip()
 def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Make sure ``<base_dir>/<folder>/<file>.md`` exists and return it.

    Missing parent directories are created. An existing file is left in
    place (its content is not truncated). The returned path is absolute.
    """
    target = Path(f'{base_dir}/{folder}/{file}.md').absolute()
    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    target.touch(exist_ok=True)
    return target
 def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the repr of every stripped text fragment inside each <p> tag.

    This is still a debugging aid: ``md_file`` is accepted but nothing is
    written to it yet.
    """
    fragments = (
        text
        for block in soup.find_all('p')
        for text in block.stripped_strings
    )
    for fragment in fragments:
        print(repr(fragment))
 if __name__ == '__main__':
    # Script entry point: parse the source .HTM pages and create one markdown
    # file per page under ./Matthew Henry Commentary/<book>/.
    # NOTE(review): the input folder is a hard-coded, machine-specific path.
    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    htm_files = list(folder.glob('*.HTM'))
    output_folder = Path('./Matthew Henry Commentary').absolute()
    # for file in tqdm(htm_files[7:8]):
    # Only the single file at index 7 is processed (slice 7:8). The glob
    # result is not sorted here, so which file that is may vary.
    for file in htm_files[7:8]:
        soup = BeautifulSoup(file.read_text(), 'html.parser')
        # Folder name (book) and file name (book + chapter number) from <TITLE>.
        book, chapter = get_filename(soup)
        md_file = create_md_file(output_folder, book, chapter)
        write_blocks(soup, md_file)
--- a/main.py
+++ b/main.py
@ -1,17 +1,55 @@
-import time
+import re
 from pathlib import Path
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 from loguru import logger
 import string
-import shutil
+import sys
 from html.parser import HTMLParser
 from pathlib import Path
 import pythonbible as bible
 from bs4 import BeautifulSoup
 from loguru import logger
 from tqdm import tqdm
 # Root output directory; passed to MyHTMLParser.create_md_file() for every
 # chapter processed in the __main__ block.
 base = Path('./Matthew Henry Commentary/xml').absolute()
 # Drop the logger handler with id 0 (presumably loguru's default stderr
 # sink -- confirm) and send all log output to log.txt instead.
 logger.remove(0)
 logger.add('log.txt')
 class MyHTMLParser(HTMLParser):
    # Parser state shared between the handler callbacks.
    file = None  # Path of the markdown file currently being written
    passage_verse = None  # 'osisref' of the latest Commentary <scripcom> tag
    tag = None  # Name of the tag currently being handled
    tag_type = None  # 'start' or 'end'
    # Class-level default; handle_starttag() rebinds a fresh dict per tag.
    attrs = dict()

    # All the patterns are for re.sub(). They move list/sublist numbers onto
    # their own line. The first item of each pair is the pattern to find and
    # the second builds the replacement. The replacement also tags the list
    # with an id (`~1` .. `~4`) so that self.clean_file() can add the correct
    # amount of `\t` in front of the line.
    patterns = [
        # "[1.]" -> id ~4
        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
        # "(1.)" -> id ~3
        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),

        # The next two share the same id (~2). There are times where the two
        # forms are mixed within the parent lists.
        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
        (r"(\S\s)(\d+\.)(\s\b|\W)",
         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),

        # Roman numeral followed by '.', rewritten as an arabic number with
        # id ~1. The converter is looked up on the class rather than on a
        # module-level `parser` instance, so the pattern also works when no
        # such global exists (e.g. when this module is imported).
        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
         lambda x: f'\n{MyHTMLParser.roman_to_int(x.group())}.~1'),
    ]
    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0
        # Sometimes the roman numeral comes with a '.'. Striping to not cause
        # issues
        number = number.strip('.')
        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
@ -23,104 +61,309 @@ def roman_to_int(number: str) -> int:
        return total
    def create_md_file(self, base_dir: Path, folder: str, file: str):
        """ Create the path of the md file that will be written to as the
        program process the xml file.
        """
 # File rename...
 # <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
 # The text within the '[]' should be the name of the file.
 def get_filename(soup: BeautifulSoup) -> tuple:
    title = soup.title.string
    title = (title
             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
             .replace("].", ''))
    *book, _ = title.replace('Chapter', '').split()
    book = ' '.join(book)
    book = book.strip(string.punctuation).title()
    book = (
        book.replace('First', '1').
        replace('Second', '2').
        replace('Third', '3')
    )
    if 'Introduction' in title:
        roman = ''
    elif 'CHAPTER' in title.upper():
        *_, roman = title.upper().split('CHAPTER')
    else:
        *_, roman = title.split()
    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book
    return book.strip(), file.strip()
 def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
    # print(md_file)
        md_file.parent.mkdir(parents=True, exist_ok=True)
-    md_file.touch(exist_ok=True)
+        md_file.unlink(missing_ok=True)
        # print(md_file.absolute())
        self.file = md_file
        return md_file
    def write_to_file(self, data):
-def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
+        if self.file is None:
-    # def get_string(block):
+            raise ValueError('No File specified')
    #     if list(block.decendents) > 1:
    #         for i in block.decendents:
        # Here the list/sublist are searched for and altered for later
        # processing in self.clean_file()
        for pattern, sub in self.patterns:
            # print(f'Running Pattern: {pattern}')
            data = re.sub(pattern, sub, data)
        # print(f'writing: {data!r}')
        with open(self.file, 'a+') as file:
            file.write(data)
    def clean_file(self):
        """Rewrite ``self.file`` in place into its final markdown layout.

        The file is read completely, then written back with:

        * a leading ``# Chapter Introduction`` heading,
        * blank lines dropped,
        * a ``# `` heading skipped when it repeats the previous one,
        * a blank line before every heading and after every paragraph,
        * numbered list lines (a leading digit plus a ``~N`` id marker
          added by ``write_to_file``) indented with tabs by nesting level
          and stripped of the marker,
        * the ``Passage: `` prefix removed.

        Raises:
            ValueError: if no file has been set on the parser.
        """
        logger.info(' CLEANING FILE')
        if self.file is None:
            raise ValueError('No File specified')

        # Read all the lines
        with open(self.file, 'r') as file:
            lines = file.readlines()

        sublist = dict()  # Track the indent number based on the list id.
        with open(self.file, 'w') as file:
            file.write('# Chapter Introduction\n\n')
            header = None
            for line in lines:
                # Do not write blank lines
                if not line.strip():
                    continue

                # Main header, if the same header was just written skip it
                if line.strip().startswith('# '):
                    if header == line.strip():
                        continue
                    header = line.strip()

                # Add a `\n` in front of the line if it is a heading.
                if line.startswith('#'):
                    line = f'\n{line.strip()}'

                # A list line starts with a digit AND carries a `~N` marker.
                # A line that merely starts with a digit (no marker) is an
                # ordinary paragraph; without this check str.find() returns
                # -1 and an empty list id would be registered.
                start = line.find('~')
                is_list_item = line.strip()[0] in string.digits and start != -1

                if not is_list_item:
                    # Paragraph/heading: add a `\n` after it and reset the
                    # indent tracking.
                    line = f'{line}\n'
                    sublist.clear()
                else:
                    # The id is the '~' plus the character after it.
                    list_id = line[start:start + 2]

                    # Get the number of indents based on the list id. If it
                    # does not exist yet, set it to the max value + 1. The
                    # first id seen gets 0 (-1 + 1 = 0).
                    indents = "\t" * sublist.setdefault(
                        list_id, max(sublist.values(), default=-1) + 1
                    )
                    line = f'{indents}{line.replace(list_id, "")}'

                if line.startswith('Passage: '):
                    line = line.removeprefix('Passage: ')

                file.write(line)
    @staticmethod
    def normalize_osis_verses(verses):
        """Turn a string of OSIS references into something more readable.

        Example input:
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25'

        Each 'Bible:' chunk is split on '-' into the ends of a range, each
        end is rewritten from 'Book.C.V' to 'Book C:V' and formatted by
        pythonbible, and the ends are then re-joined with '-' and formatted
        once more as a single reference. The per-chunk results are returned
        joined with ';'.

        Any error raised while formatting the joined range is logged with
        the offending input and re-raised.
        """
        normalized_refs = []
        for chunk in verses.removeprefix("Bible:").split('Bible:'):
            # 'John.1.3' -> 'John 1:3' (first '.' to a space, the rest to ':')
            spanned_verses = [
                bible.format_scripture_references(
                    bible.get_references(
                        ref.replace('.', ' ', 1).replace('.', ':').strip()
                    )
                )
                for ref in chunk.split('-')
            ]

            try:
                normalized = bible.format_scripture_references(
                    bible.get_references('-'.join(spanned_verses))
                )
            except Exception:
                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                raise

            normalized_refs.append(normalized)

        return ';'.join(normalized_refs)
    @staticmethod
    def obsidian_links(verses):
        """Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
        links = []
        for verse in verses.split(';'):
            links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
        return '; '.join(links)
    def convert_passage(self, verses):
        """Build an ``osisRef="..."`` string from a passage such as ``1:3-5,7``.

        The book name is taken from the parent folder of ``self.file``.
        ``verses`` is split on ','; each item is either ``chapter:verses``
        or just ``verses``, in which case the chapter of the previous item
        is reused (``None`` if there was none). A ``a-b`` verse range
        becomes ``Book.C.a-Book.C.b``. Each item is prefixed with 'Bible:'
        and the items are joined with spaces.
        """
        book = self.file.parts[-2]
        chapter = None
        refs = []
        for ver in verses.split(','):
            try:
                chapter, v = ver.split(':')
            except ValueError:
                # Not exactly one ':' in the item: treat the whole item as
                # the verse part and keep the current chapter.
                v = ver

            span = '-'.join(f'{book}.{chapter}.{i}' for i in v.split('-'))
            refs.append(f'Bible:{span}')

        x = ' '.join(refs)
        return f'osisRef="{x}"'
    def process_tag(self, test=None):
        match self.tag:
            case 'b':  # Bold
                self.write_to_file('**')
            case 'i':  # Italics
                self.write_to_file('*')
            case 'scripcom':
                # This saves the verse for the section.
                if self.tag_type == 'start':
                    if self.attrs['type'] == 'Commentary':
                        self.passage_verse = self.attrs['osisref']
            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
                    self.write_to_file('\n# ')
                if self.tag_type == 'end':
                    self.write_to_file('\n')
    for block in soup.find_all('p'):
        for text in block.stripped_strings:
            print(repr(text))
    #     for tag in block:
    #         if tag is None:
    #             continue
    #
    #         if not list(tag.children):
    #             print(tag)
    #             continue
    #
    #         for s in tag.children:
    #             print(s)
    #
    # for block in soup.find_all('p'):
    #     print(list(block.childGenerator()))
    #         # match tag.name:
    #             case None:
    #                 print(tag)
    #
    #             case 'b':
    #                 print('\t', tag.name)
    #                 print('\t\t', tag)
    #                 # file.write(f"**{tag.string}**")
    #
    #             case 'i':
    #                 print('\t', tag.name)
    #                 print('\t\t', tag)
    #                 # file.write(f"*{tag.string}*")
                    #
                    if self.passage_verse:
                        try:
                            verse = bible.get_references(
                                self.normalize_osis_verses(
                                    self.passage_verse)
                            )[0]
                        except:
                            logger.debug(self.passage_verse)
                            raise
                        self.write_to_file('## Verses: ')
                        self.write_to_file(
                            f'{verse.start_verse} - {verse.end_verse}'
                        )
                        self.write_to_file('\n\n')
            case 'scripref':  # Scripture ref
                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
                #  bible.format_scripture_references(bible.get_references(attrs['osisref']))
                if self.tag_type == 'start':
                    if 'osisref' in self.attrs:
                        verses = self.normalize_osis_verses(
                            self.attrs['osisref']
                        )
                    else:
                        verses = self.normalize_osis_verses(
                            self.convert_passage(self.attrs['passage'])
                        )
                    self.write_to_file(self.obsidian_links(verses))
            case 'p':  # Paragraph
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
                        # Needed so the regex does not pick this line up. The
                        # prefix is removed again in self.clean_file().
                        self.write_to_file('Passage: ')
                if self.tag_type == 'end':
                    self.write_to_file('\n\n')
            case 'li':
                if self.tag_type == 'start':
                    self.write_to_file('- ')
                else:
                    self.write_to_file('\n')
            # case _:
-    #                 print('\t', tag.name)
+            #     # print(tag.upper())
-    #                 print('\t\t', tag)
+            #     pass
    #
    #
    #             file.write('\n')
    #
    #         file.write('\n')
    def handle_starttag(self, tag, attrs):
        """Record the opening tag as the current tag and dispatch it.

        ``attrs`` is converted to a dict and stored on the parser before
        ``process_tag()`` runs.
        """
        # Set tag trackers
        self.tag = tag
        self.tag_type = 'start'
        self.attrs = dict(attrs)

        self.process_tag()
    def handle_endtag(self, tag):
        """Dispatch the closing tag, then reset the tag trackers.

        While ``process_tag()`` runs, ``self.attrs`` still holds whatever
        was stored last; it is emptied afterwards.
        """
        self.tag, self.tag_type = tag, 'end'
        self.process_tag()

        # Clear tag tracker
        self.tag = self.tag_type = None
        self.attrs.clear()
    def handle_data(self, data):
        """Write the text content of the current tag to the markdown file.

        Text inside 'scripref', 'h2', 'h3' and 'h5' tags is skipped, as is
        data that is exactly a single newline. The remaining text has its
        dashes rewritten as '-- ', 'GOD' rewritten as 'God', and newlines
        replaced by spaces before being written.
        """
        skipped_tags = ('scripref', 'h2', 'h3', 'h5')
        if self.tag in skipped_tags or data == '\n':
            return

        cleaned = (
            data
            .replace('—', '-- ')
            .replace('GOD', 'God')
            .replace('\n', ' ')
        )
        self.write_to_file(cleaned)
 if __name__ == '__main__':
-    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
+    parser = MyHTMLParser()
-    htm_files = list(folder.glob('*.HTM'))
+    # Process each volume file.
-    output_folder = Path('./Matthew Henry Commentary').absolute()
+    pbar = tqdm(total=1273)
-    # for file in tqdm(htm_files[7:8]):
+    for vol in sorted(Path('./').glob('*.xml')):
-    for file in htm_files[7:8]:
+        logger.info(vol)
-        soup = BeautifulSoup(file.read_text(), 'html.parser')
+        soup = BeautifulSoup(vol.read_text(), 'xml')
-        book, chapter = get_filename(soup)
+        # Get each book in the volume
-        md_file = create_md_file(output_folder, book, chapter)
+        # This will be the main folder for all the book's chapters
        for book in soup.find_all('div1'):
            logger.info(book['title'])
            book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
-        write_blocks(soup, md_file)
+            # These are the chapters/files for each book folder
            for chapter in book.find_all('div2'):
                logger.info(chapter['title'])
                filename = chapter['title']
                # Remove Roman Numerals from file name
                if 'CHAPTER' in filename.upper():
                    _, roman_num = chapter['title'].split(' ')
                    filename = f'Chapter {parser.roman_to_int(roman_num)}'
                parser.create_md_file(base, book_name, filename)
                pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
                # Parse the text of the chapter
                # May need to add a loop to look in all <p class='passage'> and
                # unwrap any scripRef that is in them. The place Dan is getting
                # confused with the book Daniel.
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    # print(passage.text)
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)
                # Clearing all scripRef of internal text. That way if it
                # had other tags in it would not be processed
                for scripture in chapter.find_all('scripRef'):
                    scripture.clear()
                for doc in chapter.children:
                    try:
                        parser.feed(str(doc))
                    except Exception as e:
                        logger.exception(e)
                        logger.debug(chapter.attrs)
                        raise
                parser.clean_file()
                pbar.update(1)
                # if pbar.last_print_n > 2:
                #     sys.exit()
    pbar.close()
    parser.close()