mh_parser/main.py

import itertools
import re
import string
import sys
from html.parser import HTMLParser
from pathlib import Path

import pythonbible as bible
from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm

base = Path('./xml/Matthew Henry Commentary 2').absolute()

logger.remove(0)

from functools import partialmethod

logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
logger.add('log.txt', level='WARNING')

convert = dict([
("Eccl", "Ecclesiastes"),
("Rom", "Romans"),
("2Tim", "2 Timothy"),
("2Pet", "2 Peter"),
("Gen", "Genesis"),
("John", "John"),
("1Cor", "1 Corinthians"),
("Ezek", "Ezekiel"),
("Heb", "Hebrews"),
("Rev", "Revelation"),
("Eph", "Ephesians"),
("Isa", "Isaiah"),
("Ps", "Psalms"),
("Acts", "Acts"),
("Deut", "Deuteronomy"),
("Neh", "Nehemiah"),
("1Chr", "1 Chronicles"),
("2Kgs", "2 Kings"),
("2Cor", "2 Corinthians"),
("Hos", "Hosea"),
("Exod", "Exodus"),
("Matt", "Matthew"),
("Job", "Job"),
("Prov", "Proverbs"),
("Col", "Colossians"),
("Jer", "Jeremiah"),
("1John", "1 John"),
("Jas", "James"),
("1Tim", "1 Timothy"),
("Amos", "Amos"),
("Mic", "Micah"),
("Dan", "Daniel"),
("Jonah", "Jonah"),
("Luke", "Luke"),
("Mal", "Malachi"),
("Zech", "Zechariah"),
("Phil", "Philippians"),
("Mark", "Mark"),
("2Thess", "2 Thessalonians"),
("Jude", "Jude"),
("1Sam", "1 Samuel"),
("Num", "Numbers"),
("Gal", "Galatians"),
("1Pet", "1 Peter"),
("Lev", "Leviticus"),
("2Chr", "2 Chronicles"),
("Lam", "Lamentations"),
("1Kgs", "1 Kings"),
("Ezra", "Ezra"),
("Obad", "Obadiah"),
("Hab", "Habakkuk"),
("2Sam", "2 Samuel"),
("Josh", "Joshua"),
("Judg", "Judges"),
("Ruth", "Ruth"),
("Esth", "Esther"),
("Phlm", "Philemon"),
("1Thess", "1 Thessalonians"),
("Joel", "Joel"),
("Titus", "Titus"),
("3John", "3 John"),
("Zeph", "Zephaniah"),
("Song", "Song of Songs"),
("Hag", "Haggai"),
("Wis", "Wisdom of Solomon"),
("2John", "2 John"),
("2Macc", "2 Maccabees"),
("Nah", "Nahum"),
("Sir", "Ecclesiasticus"),
("Psa", "Psalms"),
("1Macc", "1 Maccabees"),
("2Peter", "2 Peter"),
]
)

class MyHTMLParser(HTMLParser):
    file = None
    passage_verse = None
    tag = None
    tag_type = None  # 'start' or 'end'
    skip_re = False
    attrs = dict()

    # All the patterns are for re.sub(). This is specifically for getting
    # list/sublist into the correct position. The first item is for the pattern
    # to find and the second is the replacement pattern. The replacement pattern
    # also ids the list so that when the self.clean_file() we can properly add
    # the correct amount of `\t` to the line.
    patterns = [
        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),

        # These share the same id. There are times were the two are a mixed
        # within the parent lists.
        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
        (r"(\S\s)(\d+\.)(\s\b|\W)",
         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),


        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
         lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
    ]

    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0

        # Sometimes the roman numeral comes with a '.'. Striping to not cause
        # issues
        number = number.strip('.')

        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
            num = roman[number[i]]
            if 3 * num < total:
                total -= num
            else:
                total += num

        return total

    def create_md_file(self, base_dir: Path, folder: str, file: str):
        """ Create the path of the md file that will be written to as the
        program process the xml file.
        """

        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
        md_file.parent.mkdir(parents=True, exist_ok=True)
        md_file.unlink(missing_ok=True)
        # print(md_file.absolute())
        self.file = md_file
        return md_file

    def write_to_file(self, data, skip_re=False):

        # if self.file is None:
        #     raise ValueError('No File specified')

        if not skip_re:
            # Here the list/sublist are searched for and altered for later
            # processing in self.clean_file()
            for pattern, sub in self.patterns:
                # logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
                data = re.sub(pattern, sub, data)

        # print(f'writing: {data!r}')

        if self.file:
            # raise ValueError('No File specified')
            with open(self.file, 'a+') as file:
                file.write(data)
        else:
            print(data)

    def clean_file(self):
        logger.info(' CLEANING FILE')

        if self.file is None:
            raise ValueError('No File specified')

        # Read all the lines
        with open(self.file, 'r') as file:
            lines = file.readlines()

        indent = 0  # Current number of '\t' to add in front of the list number
        sublist = dict()  # Track the indent number based on the id.
        with open(self.file, 'w') as file:
            file.write('# Chapter Introduction\n\n')

            header = None
            for line in lines:
                # Do not write blank lines
                if not line.strip():
                    continue

                # Main header, if a header has already been written skip it
                if line.strip().startswith('# '):
                    if header == line.strip():
                        continue
                    header = line.strip()

                # Add a `\n` if line is a heading.
                if line.startswith('#'):
                    line = f'\n{line.strip()}'

                # If line is not a list then add a `\n` after the paragraph,
                # and reset the indent tracker
                if line.strip()[0] not in string.digits:
                    line = f'{line}\n'
                    # print('resetting indent')
                    indent = 0
                    sublist.clear()
                else:
                    # Get the list id
                    start = line.find('~')
                    end = start + 2
                    list_id = line[start:end]

                    # Get the number of indents based on the list id, if it does
                    # not exist they set the value to the max value +1. If this
                    # is the first it will be set to 0 (-1 +1 = 0)
                    indents = "\t" * sublist.setdefault(
                        list_id, max(sublist.values(), default=-1)+1
                    )
                    line = f'{indents}{line.replace(list_id, "")}'

                if line.startswith('Passage: '):
                    line = line.removeprefix('Passage: ')

                file.write(line)

    def normalize_osis_verses(self, verses):
        """Takes this
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
        and turns it into somthing more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = list()
            for ref in chunks.split('-'):
                verse = ref.replace('.', ' ', 1).replace('.', ':')
                book = ref.split('.')[0]
                verse.replace(book, convert[book])
                spanned_verses.append(
                    bible.format_scripture_references(
                        bible.get_references(verse.strip())
                    )
                )

            try:
                normalized = bible.format_scripture_references(
                    bible.get_references('-'.join(spanned_verses))
                )
            except:
                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                raise

            if normalized is False:
                print(f"Error with: {verses=}: {spanned_verses=}")
                sys.exit()

            normalized_refs.append(normalized)

        return ';'.join(normalized_refs)

    # @staticmethod
    def old_normalize_osis_verses(self, verses):
        """Takes this
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
        and turns it into somthing more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = chunks.split('-')
            if len(spanned_verses) < 2:
                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))

            book, start_chapter, start_verse = spanned_verses[0].split('.')
            _, end_chapter, end_verse = spanned_verses[1].split('.')
            book = bible.Book[convert[book].upper()]

            normalized = bible.NormalizedReference(
                book=book,
                start_chapter=start_chapter,
                start_verse=start_verse,
                end_chapter=end_chapter,
                end_verse=end_verse,
            )

            if normalized is False:
                logger.foobar(f'{self.args=}')

            normalized_refs.append(
                bible.format_scripture_references([normalized]).strip()
            )

        return ';'.join(normalized_refs)

    @staticmethod
    @logger.catch(reraise=False)
    def obsidian_links(verses):
        """Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
        links = []
        for verse in verses.split(';'):
            verse = verse.replace(':', '#').replace('-', '..')
            if verse[0] in string.digits:
                verse.replace(' ', '', 1)
            links.append(f"[[{verse}]]")

        return '; '.join(links)

    def convert_passage(self, verses):
        book = self.file.parts[-2]
        chapter = None
        ref = []
        for ver in verses.split(','):
            try:
                chapter, v = ver.split(':')
            except:
                v = ver

            r = []
            for i in v.split('-'):
                r.append(f'{book}.{chapter}.{i}')

            ref.append(f'Bible:{"-".join(r)}')
        x = ' '.join(ref)
        return f'osisRef="{x}"'

    def process_tag(self, test=None):
        match self.tag:
            case 'b':  # Bold
                self.write_to_file('**')

            case 'i':  # Italics
                self.write_to_file('*')

            case 'scripcom':
                # This saves the verse for the section.
                if self.tag_type == 'start':
                    if self.attrs['type'] == 'Commentary':
                        self.passage_verse = self.attrs['osisref']

            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
                    self.write_to_file('\n# ',)
                    self.skip_re = True

                if self.tag_type == 'end':
                    self.write_to_file('\n')
                    self.skip_re = False

            case 'scripref':  # Scripture ref
                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
                #  bible.format_scripture_references(bible.get_references(attrs['osisref']))
                if self.tag_type == 'start':
                    if 'osisref' in self.attrs:
                        verses = self.normalize_osis_verses(
                            self.attrs['osisref']
                        )
                    else:
                        # verses = self.normalize_osis_verses(
                        #     self.convert_passage(self.attrs['passage'])
                        # )
                        raise ValueError('No `osisref` in tag attrs')

                    self.write_to_file(self.obsidian_links(verses))

            case 'p':  # Paragraph
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
                        if self.passage_verse:
                            try:
                                verse = bible.get_references(
                                    self.normalize_osis_verses(
                                        self.passage_verse)
                                )[0]
                            except:
                                logger.debug(self.passage_verse)
                                raise

                            self.write_to_file('## Verses: ')
                            self.write_to_file(
                                f'{verse.start_verse} - {verse.end_verse}'
                            )
                            self.write_to_file('\n\n')

                        # Need this do the regex in self.clean does not pick
                        # this up. Will be also cleaned in the same function.
                        self.write_to_file('Passage: ')
                if self.tag_type == 'end':
                    self.write_to_file('\n\n')

            case 'li':
                if self.tag_type == 'start':
                    self.write_to_file('- ')
                else:
                    self.write_to_file('\n')

            # case _:
            #     # print(tag.upper())
            #     pass

    def handle_starttag(self, tag, attrs):
        # Set taf trackers
        self.tag_type = 'start'
        self.tag = tag
        self.attrs = dict(attrs)

        self.process_tag()

    def handle_endtag(self, tag):
        # print(f'End: {tag}')
        self.tag = tag
        self.tag_type = 'end'
        self.process_tag()

        # Clear tag tracker
        self.tag = None
        self.tag_type = None
        self.attrs.clear()
        self.skip_re = False

    def handle_data(self, data):
        # print(f'Data: {data!r}')

        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
            return

        if data == '\n':
            return

        data = data.replace('—', '-- ')
        data = data.replace('GOD', 'God')
        data = data.replace('\n', ' ')

        self.write_to_file(data, self.skip_re)


if __name__ == '__main__':
    parser = MyHTMLParser()

    # Process each volume file.
    pbar = tqdm(total=1273)
    for vol in sorted(Path('./').glob('*.xml')):
        logger.info(vol)
        soup = BeautifulSoup(vol.read_text(), 'xml')

        # Get each book in the volume
        # This will be the main folder for all the book's chapters
        for book in soup.find_all('div1'):
            book_name = (book['title'].
                         replace('First ', '1').
                         replace('Second ', '2').
                         replace('Third ', '3')
                         )
            logger.info(book_name)

            # These are the chapters/files for each book folder
            for chapter in book.find_all('div2'):

                # For introduction files
                filename = f'MH {book_name} {chapter['title']}'

                # Remove Roman Numerals from file name
                if 'CHAPTER' in filename.upper():
                    *_, roman_num = chapter['title'].split(' ')
                    filename = f'MH {book_name} {parser.roman_to_int(roman_num)}'

                logger.info(filename)
                parser.create_md_file(base, book_name, filename)

                pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")

                # Parse the text of the chapter
                # May need to add a loop to look in all <p class='passage> and
                # unrap any scripref that is in them. the place Dan is getting confused
                # with the book Daniel.
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    # print(passage.text)
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)

                # Clearing all scripRef of internal text. That way if it
                # had other tags in it would not be processed
                for scripture in chapter.find_all('scripRef'):
                    scripture.clear()

                for doc in chapter.children:

                    try:
                        logger.debug(f"{book_name}, {filename}")
                        parser.feed(str(doc))
                    except Exception as e:
                        logger.exception(e)
                        logger.debug(chapter.attrs)
                        raise

                parser.clean_file()
                pbar.update(1)

                # if pbar.last_print_n > 2:
                #     sys.exit()

    pbar.close()
    parser.close()
current 2023-12-07 11:56:01 +00:00			`import itertools`
renamed and updated 2023-12-02 02:47:45 +00:00			`import re`
			`import string`
			`import sys`
			`from html.parser import HTMLParser`
init 2023-11-30 02:23:35 +00:00			`from pathlib import Path`
renamed and updated 2023-12-02 02:47:45 +00:00
			`import pythonbible as bible`
init 2023-11-30 02:23:35 +00:00			`from bs4 import BeautifulSoup`
			`from loguru import logger`
renamed and updated 2023-12-02 02:47:45 +00:00			`from tqdm import tqdm`

current 2023-12-07 11:56:01 +00:00			`base = Path('./xml/Matthew Henry Commentary 2').absolute()`
renamed and updated 2023-12-02 02:47:45 +00:00
			`logger.remove(0)`
current 2023-12-07 11:56:01 +00:00
			`from functools import partialmethod`

			`logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')`
			`logger.add('log.txt', level='WARNING')`

			`convert = dict([`
			`("Eccl", "Ecclesiastes"),`
			`("Rom", "Romans"),`
			`("2Tim", "2 Timothy"),`
			`("2Pet", "2 Peter"),`
			`("Gen", "Genesis"),`
			`("John", "John"),`
			`("1Cor", "1 Corinthians"),`
			`("Ezek", "Ezekiel"),`
			`("Heb", "Hebrews"),`
			`("Rev", "Revelation"),`
			`("Eph", "Ephesians"),`
			`("Isa", "Isaiah"),`
			`("Ps", "Psalms"),`
			`("Acts", "Acts"),`
			`("Deut", "Deuteronomy"),`
			`("Neh", "Nehemiah"),`
			`("1Chr", "1 Chronicles"),`
			`("2Kgs", "2 Kings"),`
			`("2Cor", "2 Corinthians"),`
			`("Hos", "Hosea"),`
			`("Exod", "Exodus"),`
			`("Matt", "Matthew"),`
			`("Job", "Job"),`
			`("Prov", "Proverbs"),`
			`("Col", "Colossians"),`
			`("Jer", "Jeremiah"),`
			`("1John", "1 John"),`
			`("Jas", "James"),`
			`("1Tim", "1 Timothy"),`
			`("Amos", "Amos"),`
			`("Mic", "Micah"),`
			`("Dan", "Daniel"),`
			`("Jonah", "Jonah"),`
			`("Luke", "Luke"),`
			`("Mal", "Malachi"),`
			`("Zech", "Zechariah"),`
			`("Phil", "Philippians"),`
			`("Mark", "Mark"),`
			`("2Thess", "2 Thessalonians"),`
			`("Jude", "Jude"),`
			`("1Sam", "1 Samuel"),`
			`("Num", "Numbers"),`
			`("Gal", "Galatians"),`
			`("1Pet", "1 Peter"),`
			`("Lev", "Leviticus"),`
			`("2Chr", "2 Chronicles"),`
			`("Lam", "Lamentations"),`
			`("1Kgs", "1 Kings"),`
			`("Ezra", "Ezra"),`
			`("Obad", "Obadiah"),`
			`("Hab", "Habakkuk"),`
			`("2Sam", "2 Samuel"),`
			`("Josh", "Joshua"),`
			`("Judg", "Judges"),`
			`("Ruth", "Ruth"),`
			`("Esth", "Esther"),`
			`("Phlm", "Philemon"),`
			`("1Thess", "1 Thessalonians"),`
			`("Joel", "Joel"),`
			`("Titus", "Titus"),`
			`("3John", "3 John"),`
			`("Zeph", "Zephaniah"),`
			`("Song", "Song of Songs"),`
			`("Hag", "Haggai"),`
			`("Wis", "Wisdom of Solomon"),`
			`("2John", "2 John"),`
			`("2Macc", "2 Maccabees"),`
			`("Nah", "Nahum"),`
			`("Sir", "Ecclesiasticus"),`
			`("Psa", "Psalms"),`
			`("1Macc", "1 Maccabees"),`
			`("2Peter", "2 Peter"),`
			`]`
			`)`
renamed and updated 2023-12-02 02:47:45 +00:00
			`class MyHTMLParser(HTMLParser):`
			`file = None`
			`passage_verse = None`
			`tag = None`
			`tag_type = None # 'start' or 'end'`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`skip_re = False`
renamed and updated 2023-12-02 02:47:45 +00:00			`attrs = dict()`

			`# All the patterns are for re.sub(). This is specifically for getting`
			`# list/sublist into the correct position. The first item is for the pattern`
			`# to find and the second is the replacement pattern. The replacement pattern`
			`# also ids the list so that when the self.clean_file() we can properly add`
			# the correct amount of `\t` to the line.
			`patterns = [`
			`(r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),`
			`(r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),`

			`# These share the same id. There are times were the two are a mixed`
			`# within the parent lists.`
			`(r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),`
			`(r"(\S\s)(\d+\.)(\s\b\|\W)",`
			`lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),`


			`(r"(?=[MDCLXVI])M*(C[MD]\|D?C{0,3})(X[CL]\|L?X{0,3})(I[XV]\|V?I{0,3})\.",`
			`lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),`
			`]`

			`@staticmethod`
			`def roman_to_int(number: str) -> int:`
			`if number is None:`
			`return 0`

			`# Sometimes the roman numeral comes with a '.'. Striping to not cause`
			`# issues`
			`number = number.strip('.')`

			`roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}`
			`total = 0`
			`for i in range(len(number) - 1, -1, -1):`
			`num = roman[number[i]]`
			`if 3 * num < total:`
			`total -= num`
			`else:`
			`total += num`

			`return total`

			`def create_md_file(self, base_dir: Path, folder: str, file: str):`
			`""" Create the path of the md file that will be written to as the`
			`program process the xml file.`
			`"""`

			`md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()`
			`md_file.parent.mkdir(parents=True, exist_ok=True)`
			`md_file.unlink(missing_ok=True)`
			`# print(md_file.absolute())`
			`self.file = md_file`
			`return md_file`

Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`def write_to_file(self, data, skip_re=False):`
renamed and updated 2023-12-02 02:47:45 +00:00
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`# if self.file is None:`
			`# raise ValueError('No File specified')`
renamed and updated 2023-12-02 02:47:45 +00:00
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`if not skip_re:`
			`# Here the list/sublist are searched for and altered for later`
			`# processing in self.clean_file()`
			`for pattern, sub in self.patterns:`
			`# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')`
			`data = re.sub(pattern, sub, data)`
renamed and updated 2023-12-02 02:47:45 +00:00
			`# print(f'writing: {data!r}')`

Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`if self.file:`
			`# raise ValueError('No File specified')`
			`with open(self.file, 'a+') as file:`
			`file.write(data)`
			`else:`
			`print(data)`
renamed and updated 2023-12-02 02:47:45 +00:00
			`def clean_file(self):`
			`logger.info(' CLEANING FILE')`

			`if self.file is None:`
			`raise ValueError('No File specified')`

			`# Read all the lines`
			`with open(self.file, 'r') as file:`
			`lines = file.readlines()`

			`indent = 0 # Current number of '\t' to add in front of the list number`
			`sublist = dict() # Track the indent number based on the id.`
			`with open(self.file, 'w') as file:`
			`file.write('# Chapter Introduction\n\n')`

			`header = None`
			`for line in lines:`
			`# Do not write blank lines`
			`if not line.strip():`
			`continue`

			`# Main header, if a header has already been written skip it`
			`if line.strip().startswith('# '):`
			`if header == line.strip():`
			`continue`
Headers were being repeated.... FIXED 2023-12-04 01:23:25 +00:00			`header = line.strip()`
renamed and updated 2023-12-02 02:47:45 +00:00
			# Add a `\n` if line is a heading.
			`if line.startswith('#'):`
			`line = f'\n{line.strip()}'`

			# If line is not a list then add a `\n` after the paragraph,
			`# and reset the indent tracker`
			`if line.strip()[0] not in string.digits:`
			`line = f'{line}\n'`
			`# print('resetting indent')`
			`indent = 0`
			`sublist.clear()`
			`else:`
			`# Get the list id`
			`start = line.find('~')`
			`end = start + 2`
			`list_id = line[start:end]`

			`# Get the number of indents based on the list id, if it does`
			`# not exist they set the value to the max value +1. If this`
			`# is the first it will be set to 0 (-1 +1 = 0)`
			`indents = "\t" * sublist.setdefault(`
			`list_id, max(sublist.values(), default=-1)+1`
			`)`
			`line = f'{indents}{line.replace(list_id, "")}'`

			`if line.startswith('Passage: '):`
			`line = line.removeprefix('Passage: ')`

			`file.write(line)`

current 2023-12-07 11:56:01 +00:00			`def normalize_osis_verses(self, verses):`
			`"""Takes this`
			`'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'`
renamed and updated 2023-12-02 02:47:45 +00:00			`and turns it into somthing more readable"""`
			`normalized_refs = list()`
			`for chunks in verses.removeprefix("Bible:").split('Bible:'):`
			`spanned_verses = list()`
			`for ref in chunks.split('-'):`
			`verse = ref.replace('.', ' ', 1).replace('.', ':')`
current 2023-12-07 11:56:01 +00:00			`book = ref.split('.')[0]`
			`verse.replace(book, convert[book])`
renamed and updated 2023-12-02 02:47:45 +00:00			`spanned_verses.append(`
			`bible.format_scripture_references(`
			`bible.get_references(verse.strip())`
			`)`
			`)`
init 2023-11-30 02:23:35 +00:00
renamed and updated 2023-12-02 02:47:45 +00:00			`try:`
			`normalized = bible.format_scripture_references(`
			`bible.get_references('-'.join(spanned_verses))`
			`)`
			`except:`
			`logger.warning(f"Error with: {verses=}: {spanned_verses=}")`
			`raise`
current 2023-12-07 11:56:01 +00:00
			`if normalized is False:`
			`print(f"Error with: {verses=}: {spanned_verses=}")`
			`sys.exit()`

renamed and updated 2023-12-02 02:47:45 +00:00			`normalized_refs.append(normalized)`
init 2023-11-30 02:23:35 +00:00
renamed and updated 2023-12-02 02:47:45 +00:00			`return ';'.join(normalized_refs)`
init 2023-11-30 02:23:35 +00:00
current 2023-12-07 11:56:01 +00:00			`# @staticmethod`
			`def old_normalize_osis_verses(self, verses):`
			`"""Takes this`
			`'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'`
			`and turns it into somthing more readable"""`
			`normalized_refs = list()`
			`for chunks in verses.removeprefix("Bible:").split('Bible:'):`
			`spanned_verses = chunks.split('-')`
			`if len(spanned_verses) < 2:`
			`spanned_verses = list(itertools.repeat(spanned_verses[0], 2))`

			`book, start_chapter, start_verse = spanned_verses[0].split('.')`
			`_, end_chapter, end_verse = spanned_verses[1].split('.')`
			`book = bible.Book[convert[book].upper()]`

			`normalized = bible.NormalizedReference(`
			`book=book,`
			`start_chapter=start_chapter,`
			`start_verse=start_verse,`
			`end_chapter=end_chapter,`
			`end_verse=end_verse,`
			`)`

			`if normalized is False:`
			`logger.foobar(f'{self.args=}')`

			`normalized_refs.append(`
			`bible.format_scripture_references([normalized]).strip()`
			`)`

			`return ';'.join(normalized_refs)`

renamed and updated 2023-12-02 02:47:45 +00:00			`@staticmethod`
current 2023-12-07 11:56:01 +00:00			`@logger.catch(reraise=False)`
renamed and updated 2023-12-02 02:47:45 +00:00			`def obsidian_links(verses):`
			"""Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
			`links = []`
			`for verse in verses.split(';'):`
current 2023-12-07 11:56:01 +00:00			`verse = verse.replace(':', '#').replace('-', '..')`
			`if verse[0] in string.digits:`
			`verse.replace(' ', '', 1)`
			`links.append(f"[[{verse}]]")`
renamed and updated 2023-12-02 02:47:45 +00:00
			`return '; '.join(links)`

			`def convert_passage(self, verses):`
			`book = self.file.parts[-2]`
			`chapter = None`
			`ref = []`
			`for ver in verses.split(','):`
			`try:`
			`chapter, v = ver.split(':')`
			`except:`
			`v = ver`

			`r = []`
			`for i in v.split('-'):`
			`r.append(f'{book}.{chapter}.{i}')`

			`ref.append(f'Bible:{"-".join(r)}')`
			`x = ' '.join(ref)`
			`return f'osisRef="{x}"'`

			`def process_tag(self, test=None):`
			`match self.tag:`
			`case 'b': # Bold`
			`self.write_to_file('**')`

			`case 'i': # Italics`
			`self.write_to_file('*')`

			`case 'scripcom':`
			`# This saves the verse for the section.`
			`if self.tag_type == 'start':`
			`if self.attrs['type'] == 'Commentary':`
			`self.passage_verse = self.attrs['osisref']`

			`case 'h4': # this is the section header. Need to keep track of it per file.`
			`if self.tag_type == 'start':`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`self.write_to_file('\n# ',)`
			`self.skip_re = True`
renamed and updated 2023-12-02 02:47:45 +00:00
			`if self.tag_type == 'end':`
			`self.write_to_file('\n')`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`self.skip_re = False`
renamed and updated 2023-12-02 02:47:45 +00:00
			`case 'scripref': # Scripture ref`
			`# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]`
			`# bible.format_scripture_references(bible.get_references(attrs['osisref']))`
			`if self.tag_type == 'start':`
			`if 'osisref' in self.attrs:`
			`verses = self.normalize_osis_verses(`
			`self.attrs['osisref']`
			`)`
			`else:`
current 2023-12-07 11:56:01 +00:00			`# verses = self.normalize_osis_verses(`
			`# self.convert_passage(self.attrs['passage'])`
			`# )`
			raise ValueError('No `osisref` in tag attrs')
renamed and updated 2023-12-02 02:47:45 +00:00
			`self.write_to_file(self.obsidian_links(verses))`

			`case 'p': # Paragraph`
			`if self.tag_type == 'start':`
			`if self.attrs.get('class', False) == 'passage':`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`if self.passage_verse:`
			`try:`
			`verse = bible.get_references(`
			`self.normalize_osis_verses(`
			`self.passage_verse)`
			`)[0]`
			`except:`
			`logger.debug(self.passage_verse)`
			`raise`

			`self.write_to_file('## Verses: ')`
			`self.write_to_file(`
			`f'{verse.start_verse} - {verse.end_verse}'`
			`)`
			`self.write_to_file('\n\n')`

renamed and updated 2023-12-02 02:47:45 +00:00			`# Need this do the regex in self.clean does not pick`
			`# this up. Will be also cleaned in the same function.`
			`self.write_to_file('Passage: ')`
			`if self.tag_type == 'end':`
			`self.write_to_file('\n\n')`

			`case 'li':`
			`if self.tag_type == 'start':`
			`self.write_to_file('- ')`
			`else:`
			`self.write_to_file('\n')`

			`# case _:`
			`# # print(tag.upper())`
			`# pass`

			`def handle_starttag(self, tag, attrs):`
			`# Set taf trackers`
			`self.tag_type = 'start'`
			`self.tag = tag`
			`self.attrs = dict(attrs)`

			`self.process_tag()`

			`def handle_endtag(self, tag):`
			`# print(f'End: {tag}')`
			`self.tag = tag`
			`self.tag_type = 'end'`
			`self.process_tag()`

			`# Clear tag tracker`
			`self.tag = None`
			`self.tag_type = None`
			`self.attrs.clear()`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`self.skip_re = False`
renamed and updated 2023-12-02 02:47:45 +00:00
			`def handle_data(self, data):`
			`# print(f'Data: {data!r}')`

			`if self.tag in ['scripref', 'h2', 'h3', 'h5']:`
			`return`

			`if data == '\n':`
			`return`

			`data = data.replace('—', '-- ')`
			`data = data.replace('GOD', 'God')`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`data = data.replace('\n', ' ')`
renamed and updated 2023-12-02 02:47:45 +00:00
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`self.write_to_file(data, self.skip_re)`
init 2023-11-30 02:23:35 +00:00

			`if __name__ == '__main__':`
renamed and updated 2023-12-02 02:47:45 +00:00			`parser = MyHTMLParser()`

			`# Process each volume file.`
			`pbar = tqdm(total=1273)`
			`for vol in sorted(Path('./').glob('*.xml')):`
			`logger.info(vol)`
			`soup = BeautifulSoup(vol.read_text(), 'xml')`

			`# Get each book in the volume`
			`# This will be the main folder for all the book's chapters`
			`for book in soup.find_all('div1'):`
current 2023-12-07 11:56:01 +00:00			`book_name = (book['title'].`
			`replace('First ', '1').`
			`replace('Second ', '2').`
			`replace('Third ', '3')`
			`)`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`logger.info(book_name)`
renamed and updated 2023-12-02 02:47:45 +00:00
			`# These are the chapters/files for each book folder`
			`for chapter in book.find_all('div2'):`

current 2023-12-07 11:56:01 +00:00			`# For introduction files`
			`filename = f'MH {book_name} {chapter['title']}'`
renamed and updated 2023-12-02 02:47:45 +00:00
			`# Remove Roman Numerals from file name`
			`if 'CHAPTER' in filename.upper():`
current 2023-12-07 11:56:01 +00:00			`*_, roman_num = chapter['title'].split(' ')`
			`filename = f'MH {book_name} {parser.roman_to_int(roman_num)}'`
renamed and updated 2023-12-02 02:47:45 +00:00
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`logger.info(filename)`
renamed and updated 2023-12-02 02:47:45 +00:00			`parser.create_md_file(base, book_name, filename)`

			`pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")`

			`# Parse the text of the chapter`
			`# May need to add a loop to look in all <p class='passage> and`
			`# unrap any scripref that is in them. the place Dan is getting confused`
			`# with the book Daniel.`
			`for passage in chapter.find_all('p', {'class': 'passage'}):`
			`# print(passage.text)`
			`new_t = soup.new_tag('p', attrs=passage.attrs)`
			`new_t.string = passage.text`
			`passage.replace_with(new_t)`

			`# Clearing all scripRef of internal text. That way if it`
			`# had other tags in it would not be processed`
			`for scripture in chapter.find_all('scripRef'):`
			`scripture.clear()`

			`for doc in chapter.children:`

			`try:`
Moved the `## Verses: ` heading to the paragraph section. This is more reliable across the project. Also Added the ability to skip re.sub() for heading sections. Both Of these issues were caught with Psalms 119 2023-12-02 14:29:00 +00:00			`logger.debug(f"{book_name}, {filename}")`
renamed and updated 2023-12-02 02:47:45 +00:00			`parser.feed(str(doc))`
			`except Exception as e:`
			`logger.exception(e)`
			`logger.debug(chapter.attrs)`
			`raise`
init 2023-11-30 02:23:35 +00:00
renamed and updated 2023-12-02 02:47:45 +00:00			`parser.clean_file()`
			`pbar.update(1)`
init 2023-11-30 02:23:35 +00:00
renamed and updated 2023-12-02 02:47:45 +00:00			`# if pbar.last_print_n > 2:`
			`# sys.exit()`
init 2023-11-30 02:23:35 +00:00
renamed and updated 2023-12-02 02:47:45 +00:00			`pbar.close()`
			`parser.close()`