This commit is contained in:
parent 22e3584494
commit 8f46274857

main_1.py (new file, 572 lines)
@@ -0,0 +1,572 @@
import collections
import itertools
import re
import string
import sys
from html.parser import HTMLParser
from pathlib import Path

import pythonbible as bible
from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm

base = Path('./scraps_consolidated').absolute()

# logger.remove(0)

from functools import partialmethod

logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
logger.add('log.txt', level='WARNING')
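# Note (added): the `logger.foobar` helper above appears to assume the custom
# level has been registered with loguru first (e.g. `logger.level("foobar", no=30)`);
# without that registration, calling `logger.foobar(...)` raises a ValueError
# for an unknown level.
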
shorts = dict(
    [
        ("Nahum", "Nah"),
        ("Colossians", "Col"),
        ("Esther", "Esth"),
        ("James", "Jas"),
        ("John", "John"),
        ("3John", "3John"),
        ("Revelation", "Rev"),
        ("Titus", "Titus"),
        ("Zechariah", "Zech"),
        ("Malachi", "Mal"),
        ("Lamentations", "Lam"),
        ("Matthew", "Matt"),
        ("Jonah", "Jonah"),
        ("Hosea", "Hos"),
        ("Joel", "Joel"),
        ("Genesis", "Gen"),
        ("Proverbs", "Prov"),
        ("Romans", "Rom"),
        ("Micah", "Mic"),
        ("Jeremiah", "Jer"),
        ("Ecclesiastes", "Eccl"),
        ("Philippians", "Phil"),
        ("Galatians", "Gal"),
        ("Acts", "Acts"),
        ("Psalms", "Ps"),
        ("Isaiah", "Isa"),
        ("2Chronicles", "2Chr"),
        ("Job", "Job"),
        ("Deuteronomy", "Deut"),
        ("Luke", "Luke"),
        ("Numbers", "Num"),
        ("Daniel", "Dan"),
        ("Hebrews", "Heb"),
        ("Nehemiah", "Neh"),
        ("1John", "1John"),
        ("Haggai", "Hag"),
        ("1Kings", "1Kgs"),
        ("Joshua", "Josh"),
        ("1Corinthians", "1Cor"),
        ("Mark", "Mark"),
        ("2Kings", "2Kgs"),
        ("Obadiah", "Obad"),
        ("Ephesians", "Eph"),
        ("1Timothy", "1Tim"),
        ("Philemon", "Phlm"),
        ("1Chronicles", "1Chr"),
        ("2Samuel", "2Sam"),
        ("1Samuel", "1Sam"),
        ("2Peter", "2Pet"),
        ("2John", "2John"),
        ("Song of Solomon", "Song"),
        ("Jude", "Jude"),
        ("Amos", "Amos"),
        ("1Peter", "1Pet"),
        ("Leviticus", "Lev"),
        ("Habakkuk", "Hab"),
        ("Ruth", "Ruth"),
        ("1Thessalonians", "1Thess"),
        ("Ezra", "Ezra"),
        ("Ezekiel", "Ezek"),
        ("2Thessalonians", "2Thess"),
        ("2Timothy", "2Tim"),
        ("Exodus", "Exod"),
        ("2Corinthians", "2Cor"),
        ("Judges", "Judg"),
        ("Zephaniah", "Zeph"),
    ]
)

end_verses = dict([
    ('Obadiah', 21),
    ('Philemon', 25),
    ('2John', 13),
    ('3John', 15),
    ('Jude', 25),
])

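# Note (added, assumption about intent): end_verses appears to hold the last
# verse number of the single-chapter books; the __main__ block below uses it to
# widen the chapter reference it builds, roughly:
#
#     ci = bible.get_references("Jude 1")[0]
#     ci.end_verse = end_verses["Jude"]   # -> 25
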
class MyHTMLParser(HTMLParser):
    file = None
    passage_verse = None
    tag = None
    tag_type = None  # 'start' or 'end'
    skip_re = False
    attrs = dict()

    # All the patterns are for re.sub(). This is specifically for getting
    # list/sublist items into the correct position. The first item is the
    # pattern to find and the second is the replacement. The replacement also
    # tags each list with an id so that self.clean_file() can add the correct
    # number of `\t` characters to the line.
    patterns = [
        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),

        # These share the same id. There are times when the two are mixed
        # within the parent lists.
        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
        (r"(\S\s)(\d+\.)(\s\b|\W)",
         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),

        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
         lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
    ]

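    # Rough illustration (added, not from the original): feeding the text
    # "1. First point 2. Second point" through these substitutions yields
    # "\n1.~2 First point \n2.~2 Second point" -- each list item is pushed onto
    # its own line and suffixed with the "~2" id that clean_file() later turns
    # into the right amount of indentation.
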
    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0

        # Sometimes the roman numeral comes with a trailing '.'. Stripping it
        # so it does not cause issues.
        number = number.strip('.')

        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
            num = roman[number[i]]
            if 3 * num < total:
                total -= num
            else:
                total += num

        return total

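    # Example (added for illustration): roman_to_int("XIV.") walks the numeral
    # right to left -- V adds 5, I is subtracted because 3 * 1 is less than the
    # running total, X adds 10 -- and returns 14, matching how the chapter
    # headings are numbered.
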
    def create_md_file(self, base_dir: Path, folder: str, file: str):
        """ Create the path of the md file that will be written to as the
        program processes the xml file.
        """

        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
        md_file.parent.mkdir(parents=True, exist_ok=True)
        md_file.unlink(missing_ok=True)
        # print(md_file.absolute())
        self.file = md_file
        return md_file

    def write_to_file(self, data, skip_re=False):

        # if self.file is None:
        #     raise ValueError('No File specified')

        if not skip_re:
            # Here the list/sublist are searched for and altered for later
            # processing in self.clean_file()
            for pattern, sub in self.patterns:
                # logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
                data = re.sub(pattern, sub, data)

        # print(f'writing: {data!r}')

        if self.file:
            # raise ValueError('No File specified')
            with open(self.file, 'a+') as file:
                file.write(data)
        else:
            print(data)

    def clean_file(self):
        logger.info(' CLEANING FILE')

        if self.file is None:
            raise ValueError('No File specified')

        # Read all the lines
        with open(self.file, 'r') as file:
            lines = file.readlines()

        indent = 0  # Current number of '\t' to add in front of the list number
        sublist = dict()  # Track the indent number based on the id.
        with open(self.file, 'w') as file:
            file.write('# Chapter Introduction\n\n')

            header = None
            for line in lines:
                # Do not write blank lines
                if not line.strip():
                    continue

                # Main header; if this header has already been written, skip it
                if line.strip().startswith('# '):
                    if header == line.strip():
                        continue
                    header = line.strip()

                # Add a `\n` if line is a heading.
                if line.startswith('#'):
                    line = f'\n{line.strip()}'

                # If line is not a list then add a `\n` after the paragraph,
                # and reset the indent tracker
                if line.strip()[0] not in string.digits:
                    line = f'{line}\n'
                    # print('resetting indent')
                    indent = 0
                    sublist.clear()
                else:
                    # Get the list id
                    start = line.find('~')
                    end = start + 2
                    list_id = line[start:end]

                    # Get the number of indents based on the list id; if it does
                    # not exist, set the value to the current max value + 1. If
                    # this is the first id it will be set to 0 (-1 + 1 = 0).
                    indents = "\t" * sublist.setdefault(
                        list_id, max(sublist.values(), default=-1) + 1
                    )
                    line = f'{indents}{line.replace(list_id, "")}'

                if line.startswith('Passage: '):
                    line = line.removeprefix('Passage: ')

                file.write(line)

    def normalize_osis_verses(self, verses):
        """Takes this
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
        and turns it into something more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = list()
            for ref in chunks.split('-'):
                verse = ref.replace('.', ' ', 1).replace('.', ':')
                book = ref.split('.')[0]
                # NOTE: the result of this replace is not assigned, and `convert`
                # is not defined in this file (main_2.py drops this line).
                verse.replace(book, convert[book])
                spanned_verses.append(
                    bible.format_scripture_references(
                        bible.get_references(verse.strip())
                    )
                )

            try:
                normalized = bible.format_scripture_references(
                    bible.get_references('-'.join(spanned_verses))
                )
            except:
                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                raise

            if normalized is False:
                print(f"Error with: {verses=}: {spanned_verses=}")
                sys.exit()

            normalized_refs.append(normalized)

        return ';'.join(normalized_refs)

    # @staticmethod
    def old_normalize_osis_verses(self, verses):
        """Takes this
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
        and turns it into something more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = chunks.split('-')
            if len(spanned_verses) < 2:
                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))

            book, start_chapter, start_verse = spanned_verses[0].split('.')
            _, end_chapter, end_verse = spanned_verses[1].split('.')
            book = bible.Book[convert[book].upper()]

            normalized = bible.NormalizedReference(
                book=book,
                start_chapter=start_chapter,
                start_verse=start_verse,
                end_chapter=end_chapter,
                end_verse=end_verse,
            )

            if normalized is False:
                logger.foobar(f'{self.args=}')

            normalized_refs.append(
                bible.format_scripture_references([normalized]).strip()
            )

        return ';'.join(normalized_refs)

    @staticmethod
    @logger.catch(reraise=False)
    def obsidian_links(verses):
        """Take `Book Chapter:Ver-Ver` and turn it into an Obsidian bible link"""
        links = []
        for verse in verses.split(';'):
            verse = verse.replace(':', '#').replace('-', '..')
            if verse[0] in string.digits:
                # Drop the space in numbered books so "1 John" becomes "1John"
                verse = verse.replace(' ', '', 1)
            links.append(f"[[{verse}]]")

        return '; '.join(links)

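    # Example (added for illustration, assuming the link style used elsewhere in
    # this script): obsidian_links('John 1:3-5;1 John 3:16') would produce
    # '[[John 1#3..5]]; [[1John 3#16]]'.
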
    def convert_passage(self, verses):
        book = self.file.parts[-2]
        chapter = None
        ref = []
        for ver in verses.split(','):
            try:
                chapter, v = ver.split(':')
            except:
                v = ver

            r = []
            for i in v.split('-'):
                r.append(f'{book}.{chapter}.{i}')

            ref.append(f'Bible:{"-".join(r)}')
        x = ' '.join(ref)
        return f'osisRef="{x}"'

    def process_tag(self, test=None):
        match self.tag:
            case 'b':  # Bold
                self.write_to_file('**')

            case 'i':  # Italics
                self.write_to_file('*')

            case 'scripcom':
                # This saves the verse for the section.
                if self.tag_type == 'start':
                    if self.attrs['type'] == 'Commentary':
                        self.passage_verse = self.attrs['osisref']

            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
                    self.write_to_file('\n# ')
                    self.skip_re = True

                if self.tag_type == 'end':
                    self.write_to_file('\n')
                    self.skip_re = False

            case 'scripref':  # Scripture ref
                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
                # bible.format_scripture_references(bible.get_references(attrs['osisref']))
                if self.tag_type == 'start':
                    if 'osisref' in self.attrs:
                        verses = self.normalize_osis_verses(
                            self.attrs['osisref']
                        )
                    else:
                        # verses = self.normalize_osis_verses(
                        #     self.convert_passage(self.attrs['passage'])
                        # )
                        raise ValueError('No `osisref` in tag attrs')

                    self.write_to_file(self.obsidian_links(verses))

            case 'p':  # Paragraph
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
                        if self.passage_verse:
                            try:
                                verse = bible.get_references(
                                    self.normalize_osis_verses(
                                        self.passage_verse)
                                )[0]
                            except:
                                logger.debug(self.passage_verse)
                                raise

                            if verse.start_verse == verse.end_verse:
                                self.write_to_file('## Verse: ')
                                self.write_to_file(
                                    f'{verse.start_verse}'
                                )
                            else:
                                self.write_to_file('## Verses: ')
                                self.write_to_file(
                                    f'{verse.start_verse} - {verse.end_verse}'
                                )
                            self.write_to_file('\n\n')

                        # Need this so the regex in self.clean_file() does not
                        # pick this up. It will also be cleaned in the same function.
                        self.write_to_file('Passage: ')

                if self.tag_type == 'end':
                    self.write_to_file('\n\n')

            case 'li':
                if self.tag_type == 'start':
                    self.write_to_file('- ')
                else:
                    self.write_to_file('\n')

            # case _:
            #     # print(tag.upper())
            #     pass

    def handle_starttag(self, tag, attrs):
        # Set tag trackers
        self.tag_type = 'start'
        self.tag = tag
        self.attrs = dict(attrs)

        self.process_tag()

    def handle_endtag(self, tag):
        # print(f'End: {tag}')
        self.tag = tag
        self.tag_type = 'end'
        self.process_tag()

        # Clear tag tracker
        self.tag = None
        self.tag_type = None
        self.attrs.clear()
        self.skip_re = False

    def handle_data(self, data):
        # print(f'Data: {data!r}')

        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
            return

        if data == '\n':
            return

        data = data.replace('—', '-- ')
        data = data.replace('GOD', 'God')
        data = data.replace('\n', ' ')

        self.write_to_file(data, self.skip_re)


def create_file(base_dir: Path, folder: str, file: str, file_type: str) -> Path:
    """ Create the path of the file that will be written to as the
    program processes the xml file.
    """

    new_file = Path(f'{base_dir}/{folder}/{file}.{file_type}').absolute()
    new_file.parent.mkdir(parents=True, exist_ok=True)
    new_file.unlink(missing_ok=True)
    # print(new_file.absolute())
    return new_file


def write_file(file, data):
    with open(file, 'a+') as f:
        for d in data:
            f.write(d)
        f.write('\n')

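# Usage note (added): write_file() iterates over `data`, so it accepts either a
# single string (written character by character) or an iterable of strings, and
# it appends one trailing newline after the loop.
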
if __name__ == '__main__':
    parser = MyHTMLParser()

    # Process each volume file.
    scraps = Path('./scraps')
    files_used = collections.Counter()
    for vol in sorted(Path('./').glob('*.xml')):
        print(vol)
        soup = BeautifulSoup(vol.read_text(), 'xml')
        for book in soup.find_all('div1'):
            book_name = book['title'].replace('First ', '1').replace('Second ', '2').replace('Third ', '3')

            if shorts.get(book_name, False) is False:
                print(f'skipping {book_name}')
                continue

            print('\t', book_name, book['id'])
            for chapter in book.find_all('div2'):
                filename = chapter['title']

                if 'Index' in filename:
                    print('skipping', filename)
                    continue

                # Remove Roman Numerals from file name
                if 'CHAPTER' in filename.upper():
                    _, roman_num = chapter['title'].split(' ')
                    chapter_num = parser.roman_to_int(roman_num)
                    filename = f'Chapter {chapter_num}'

                print('\t\t', filename, chapter['id'])

                file = create_file(base, book_name, filename, 'html')
                file.touch()

                # book intro file
                if 'INTRODUCTION' in file.name.upper():
                    book_intro = scraps / f"{book_name}.html"
                    if book_intro.exists() is False:
                        print(book_intro.name, book_intro.exists())
                    files_used.update([book_intro.name])
                    file.write_bytes(book_intro.read_bytes())
                    continue

                # Chapter intro
                book_short = shorts[book_name]

                ci = bible.get_references(f"{book_name} {chapter_num}")[0]
                if ev := end_verses.get(book_name, False):
                    ci.end_verse = ev

                chapter_intro = scraps / f"{book_short}_{chapter_num}_{ci.start_verse}-{book_short}_{chapter_num}_{ci.end_verse}.html"

                if chapter_intro.exists() is False:
                    print(chapter_intro.name, chapter_intro.exists())
                    raise ValueError
                files_used.update([chapter_intro.name])

                # write(file, '<h1>Chapter Introduction</h1>')
                write_file(file, '<!-- Chapter Intro -->')
                write_file(file, chapter_intro.read_text())

                # Chapter sections
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    # print(passage.text)
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)

                write_file(file, '<!-- Chapter Sections -->')

                for section in chapter.find_all('div', {'class': 'Commentary'}):
                    if header := section.find('h4'):
                        write_file(file, str(header))

                    # verses_id = str(section['id'])
                    # verses = verses_id.replace('Bible:', '').replace('.', '_')
                    #
                    # verse_section = scraps / f"{verses}.html"
                    # if verse_section.exists() is False:

                    for sc in section.previous_elements:
                        if str(sc).strip():
                            break
                    # print(sc['osisRef'])

                    verses_id = str(sc['osisRef'])
                    verses = verses_id.replace('Bible:', '').replace('.', '_')

                    verse_section = scraps / f"{verses}.html"
                    print('\t\t', verse_section.name, verse_section.exists())
                    if verse_section.exists() is False:
                        raise ValueError(f'osisRef="{verses_id}"')

                    files_used.update([verse_section.name])

                    # print(verses)
                    # passage = section.find('p', {'class': 'passage'})

main_2.py (new file, 579 lines)
@@ -0,0 +1,579 @@
import collections
import itertools
import re
import string
import sys
from html.parser import HTMLParser
from pathlib import Path

import pythonbible as bible
from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm
import difflib

base = Path('./scraps_consolidated').absolute()

logger.remove(0)
logger.add('mixing.log')
from functools import partialmethod

logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
logger.add('log.txt', level='WARNING')

shorts = dict(
    [
        ("Nahum", "Nah"),
        ("Colossians", "Col"),
        ("Esther", "Esth"),
        ("James", "Jas"),
        ("John", "John"),
        ("3John", "3John"),
        ("Revelation", "Rev"),
        ("Titus", "Titus"),
        ("Zechariah", "Zech"),
        ("Malachi", "Mal"),
        ("Lamentations", "Lam"),
        ("Matthew", "Matt"),
        ("Jonah", "Jonah"),
        ("Hosea", "Hos"),
        ("Joel", "Joel"),
        ("Genesis", "Gen"),
        ("Proverbs", "Prov"),
        ("Romans", "Rom"),
        ("Micah", "Mic"),
        ("Jeremiah", "Jer"),
        ("Ecclesiastes", "Eccl"),
        ("Philippians", "Phil"),
        ("Galatians", "Gal"),
        ("Acts", "Acts"),
        ("Psalms", "Ps"),
        ("Isaiah", "Isa"),
        ("2Chronicles", "2Chr"),
        ("Job", "Job"),
        ("Deuteronomy", "Deut"),
        ("Luke", "Luke"),
        ("Numbers", "Num"),
        ("Daniel", "Dan"),
        ("Hebrews", "Heb"),
        ("Nehemiah", "Neh"),
        ("1John", "1John"),
        ("Haggai", "Hag"),
        ("1Kings", "1Kgs"),
        ("Joshua", "Josh"),
        ("1Corinthians", "1Cor"),
        ("Mark", "Mark"),
        ("2Kings", "2Kgs"),
        ("Obadiah", "Obad"),
        ("Ephesians", "Eph"),
        ("1Timothy", "1Tim"),
        ("Philemon", "Phlm"),
        ("1Chronicles", "1Chr"),
        ("2Samuel", "2Sam"),
        ("1Samuel", "1Sam"),
        ("2Peter", "2Pet"),
        ("2John", "2John"),
        ("Song of Solomon", "Song"),
        ("Jude", "Jude"),
        ("Amos", "Amos"),
        ("1Peter", "1Pet"),
        ("Leviticus", "Lev"),
        ("Habakkuk", "Hab"),
        ("Ruth", "Ruth"),
        ("1Thessalonians", "1Thess"),
        ("Ezra", "Ezra"),
        ("Ezekiel", "Ezek"),
        ("2Thessalonians", "2Thess"),
        ("2Timothy", "2Tim"),
        ("Exodus", "Exod"),
        ("2Corinthians", "2Cor"),
        ("Judges", "Judg"),
        ("Zephaniah", "Zeph"),
    ]
)

end_verses = dict([
    ('Obadiah', 21),
    ('Philemon', 25),
    ('2John', 13),
    ('3John', 15),
    ('Jude', 25),
])

class MyHTMLParser(HTMLParser):
    file = None
    passage_verse = None
    tag = None
    tag_type = None  # 'start' or 'end'
    skip_re = False
    attrs = dict()

    # All the patterns are for re.sub(). This is specifically for getting
    # list/sublist items into the correct position. The first item is the
    # pattern to find and the second is the replacement. The replacement also
    # tags each list with an id so that self.clean_file() can add the correct
    # number of `\t` characters to the line.
    patterns = [
        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),

        # These share the same id. There are times when the two are mixed
        # within the parent lists.
        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
        (r"(\S\s)(\d+\.)(\s\b|\W)",
         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),

        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
         lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
    ]

    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0

        # Sometimes the roman numeral comes with a trailing '.'. Stripping it
        # so it does not cause issues.
        number = number.strip('.')

        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
            num = roman[number[i]]
            if 3 * num < total:
                total -= num
            else:
                total += num

        return total

    def create_md_file(self, base_dir: Path, folder: str, file: str):
        """ Create the path of the md file that will be written to as the
        program processes the xml file.
        """

        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
        md_file.parent.mkdir(parents=True, exist_ok=True)
        md_file.unlink(missing_ok=True)
        # logger.info(md_file.absolute())
        self.file = md_file
        return md_file

    def write_to_file(self, data, skip_re=False):

        # if self.file is None:
        #     raise ValueError('No File specified')

        if not skip_re:
            # Here the list/sublist are searched for and altered for later
            # processing in self.clean_file()
            for pattern, sub in self.patterns:
                # logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
                data = re.sub(pattern, sub, data)

        # logger.info(f'writing: {data!r}')

        if self.file:
            # raise ValueError('No File specified')
            with open(self.file, 'a+') as file:
                file.write(data)
        else:
            logger.info(data)

    def clean_file(self):
        logger.info(' CLEANING FILE')

        if self.file is None:
            raise ValueError('No File specified')

        # Read all the lines
        with open(self.file, 'r') as file:
            lines = file.readlines()

        indent = 0  # Current number of '\t' to add in front of the list number
        sublist = dict()  # Track the indent number based on the id.
        with open(self.file, 'w') as file:
            file.write('# Chapter Introduction\n\n')

            header = None
            for line in lines:
                # Do not write blank lines
                if not line.strip():
                    continue

                # Main header; if this header has already been written, skip it
                if line.strip().startswith('# '):
                    if header == line.strip():
                        continue
                    header = line.strip()

                # Add a `\n` if line is a heading.
                if line.startswith('#'):
                    line = f'\n{line.strip()}'

                # If line is not a list then add a `\n` after the paragraph,
                # and reset the indent tracker
                if line.strip()[0] not in string.digits:
                    line = f'{line}\n'
                    # logger.info('resetting indent')
                    indent = 0
                    sublist.clear()
                else:
                    # Get the list id
                    start = line.find('~')
                    end = start + 2
                    list_id = line[start:end]

                    # Get the number of indents based on the list id; if it does
                    # not exist, set the value to the current max value + 1. If
                    # this is the first id it will be set to 0 (-1 + 1 = 0).
                    indents = "\t" * sublist.setdefault(
                        list_id, max(sublist.values(), default=-1) + 1
                    )
                    line = f'{indents}{line.replace(list_id, "")}'

                if line.startswith('Passage: '):
                    line = line.removeprefix('Passage: ')

                file.write(line)

    def normalize_osis_verses(self, verses):
        """Takes this
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
        and turns it into something more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = list()
            for ref in chunks.split('-'):
                verse = ref.replace('.', ' ', 1).replace('.', ':')
                book = ref.split('.')[0]
                spanned_verses.append(
                    bible.format_scripture_references(
                        bible.get_references(verse.strip())
                    )
                )

            try:
                normalized = bible.format_scripture_references(
                    bible.get_references('-'.join(spanned_verses))
                )
            except:
                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                raise

            if normalized is False:
                logger.info(f"Error with: {verses=}: {spanned_verses=}")
                sys.exit()

            normalized_refs.append(normalized)

        return ';'.join(normalized_refs)

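    # Sketch of the intended round trip (added; the output format is assumed
    # from pythonbible's formatter, not taken from the original):
    #
    #     parser.normalize_osis_verses('Bible:John.1.3 Bible:Acts.17.24-Acts.17.25')
    #     # -> something like 'John 1:3;Acts 17:24-25'
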
    # @staticmethod
    def old_normalize_osis_verses(self, verses):
        """Takes this
        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
        and turns it into something more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = chunks.split('-')
            if len(spanned_verses) < 2:
                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))

            book, start_chapter, start_verse = spanned_verses[0].split('.')
            _, end_chapter, end_verse = spanned_verses[1].split('.')
            book = bible.Book[convert[book].upper()]

            normalized = bible.NormalizedReference(
                book=book,
                start_chapter=start_chapter,
                start_verse=start_verse,
                end_chapter=end_chapter,
                end_verse=end_verse,
            )

            if normalized is False:
                logger.foobar(f'{self.args=}')

            normalized_refs.append(
                bible.format_scripture_references([normalized]).strip()
            )

        return ';'.join(normalized_refs)

    @staticmethod
    @logger.catch(reraise=False)
    def obsidian_links(verses):
        """Take `Book Chapter:Ver-Ver` and turn it into an Obsidian bible link"""
        links = []
        for verse in verses.split(';'):
            verse = verse.replace(':', '#').replace('-', '..')
            if verse[0] in string.digits:
                # Drop the space in numbered books so "1 John" becomes "1John"
                verse = verse.replace(' ', '', 1)
            links.append(f"[[{verse}]]")

        return '; '.join(links)

    def convert_passage(self, verses):
        book = self.file.parts[-2]
        chapter = None
        ref = []
        for ver in verses.split(','):
            try:
                chapter, v = ver.split(':')
            except:
                v = ver

            r = []
            for i in v.split('-'):
                r.append(f'{book}.{chapter}.{i}')

            ref.append(f'Bible:{"-".join(r)}')
        x = ' '.join(ref)
        return f'osisRef="{x}"'

    def process_tag(self, test=None):
        match self.tag:
            case 'b':  # Bold
                self.write_to_file('**')

            case 'i':  # Italics
                self.write_to_file('*')

            case 'scripcom':
                # This saves the verse for the section.
                if self.tag_type == 'start':
                    try:
                        verse = bible.get_references(
                            self.normalize_osis_verses(
                                self.attrs['osisref'])
                        )[0]
                    except:
                        logger.debug(self.passage_verse)
                        raise

                    if verse.start_verse == verse.end_verse:
                        self.write_to_file('## Verse: ')
                        self.write_to_file(
                            f'{verse.start_verse}'
                        )
                    else:
                        self.write_to_file('## Verses: ')
                        self.write_to_file(
                            f'{verse.start_verse} - {verse.end_verse}'
                        )
                    self.write_to_file('\n\n')

            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
                    self.write_to_file('\n# ')
                    self.skip_re = True

                if self.tag_type == 'end':
                    self.write_to_file('\n')
                    self.skip_re = False

            # case 'scripref':  # Scripture ref
            #     # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
            #     # bible.format_scripture_references(bible.get_references(attrs['osisref']))
            #     if self.tag_type == 'start':
            #         if 'osisref' in self.attrs:
            #             verses = self.normalize_osis_verses(
            #                 self.attrs['osisref']
            #             )
            #         else:
            #             # verses = self.normalize_osis_verses(
            #             #     self.convert_passage(self.attrs['passage'])
            #             # )
            #             raise ValueError('No `osisref` in tag attrs')
            #
            #         self.write_to_file(self.obsidian_links(verses))

            case 'p':  # Paragraph
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
                        # Need this so the regex in self.clean_file() does not
                        # pick this up. It will also be cleaned in the same function.
                        self.write_to_file('Passage: ')

                if self.tag_type == 'end':
                    self.write_to_file('\n\n')

            case 'li':
                if self.tag_type == 'start':
                    self.write_to_file('- ')
                else:
                    self.write_to_file('\n')

            # case _:
            #     # logger.info(tag.upper())
            #     pass

    def handle_starttag(self, tag, attrs):
        # Set tag trackers
        self.tag_type = 'start'
        self.tag = tag
        self.attrs = dict(attrs)

        self.process_tag()

    def handle_endtag(self, tag):
        # logger.info(f'End: {tag}')
        self.tag = tag
        self.tag_type = 'end'
        self.process_tag()

        # Clear tag tracker
        self.tag = None
        self.tag_type = None
        self.attrs.clear()
        self.skip_re = False

    def handle_data(self, data):
        # logger.info(f'Data: {data!r}')

        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
            return

        if data == '\n':
            return

        data = data.replace('—', '-- ')
        data = data.replace('GOD', 'God')
        data = data.replace('\n', ' ')

        self.write_to_file(data, self.skip_re)


def create_file(base_dir: Path, folder: str, file: str, file_type: str) -> Path:
    """ Create the path of the file that will be written to as the
    program processes the xml file.
    """

    new_file = Path(f'{base_dir}/{folder}/{file}.{file_type}').absolute()
    new_file.parent.mkdir(parents=True, exist_ok=True)
    new_file.unlink(missing_ok=True)
    # logger.info(new_file.absolute())
    return new_file


def write_file(file, data):
    with open(file, 'a+') as f:
        for d in data:
            f.write(d)
        f.write('\n')


if __name__ == '__main__':
    parser = MyHTMLParser()

    # Process each volume file.
    scraps = Path('./scraps')
    files_used = list()
    missing = list()
    for book in Path('./vol_split').glob('*'):
        book_name = book.name.split(' - ')[-1]
        logger.info(book_name)
        print(book_name)
        for chapter in book.glob('*.xml'):
            chapter_name = chapter.stem
            chapter_num = chapter_name.split(' ')[-1]
            logger.info(f"{chapter_name}")

            file = create_file(base, book_name, chapter_name, 'html')

            # book intro file
            if 'INTRODUCTION' in chapter_name.upper():
                book_intro = scraps / f"{book_name}.html"
                if book_intro.exists() is False:
                    logger.info(book_intro.name, book_intro.exists())
                files_used.append(book_intro.name)
                write_file(file, f'<!-- Gotten from {book_intro}-->')
                write_file(file, book_intro.read_text())
                parser.create_md_file('./mh_scraps/', book_name, file.stem)
                parser.feed(file.read_text())
                parser.clean_file()
                continue

            # Chapter intro
            book_short = shorts[book_name]

            ci = bible.get_references(f"{book_name} {chapter_num}")[0]
            if ev := end_verses.get(book_name, False):
                ci.end_verse = ev

            start = f'{book_short}_{chapter_num}_{ci.start_verse}'
            end = f'{book_short}_{chapter_num}_{ci.end_verse}'
            chapter_intro = scraps / f"chapter_{start}-{end}.html"

            if chapter_intro.exists() is False:
                logger.info(chapter_intro.name, chapter_intro.exists())
                raise ValueError
            files_used.append(chapter_intro.name)

            # write(file, '<h1>Chapter Introduction</h1>')
            write_file(file, f'<!-- Chapter Intro: {chapter_intro} -->')
            write_file(file, chapter_intro.read_text())

            sections = BeautifulSoup(chapter.read_text(), 'xml')

            # Parse the text of the chapter.
            # May need to add a loop to look in all <p class='passage'> and
            # unwrap any scripRef inside them; the place name Dan is getting
            # confused with the book Daniel.
            for passage in sections.find_all('p', {'class': 'passage'}):
                # print(passage.text)
                new_t = sections.new_tag('p', attrs=passage.attrs)
                new_t.string = passage.text
                passage.replace_with(new_t)

            # Clearing all scripRef tags of internal text. That way, if one
            # had other tags inside it, they would not be processed.
            for scripture in sections.find_all('scripRef'):
                scripture.clear()

            write_file(file, '<!-- Chapter Sections -->')
            for section in sections.find_all('div', {'class': 'Commentary'}):
                if header := section.find('h4'):
                    write_file(file, str(header))

                # Scripture ref, should be the section!
                scripture_ref = section.previous_sibling
                # write_file(file, '\n')
                write_file(file, str(scripture_ref))

                # Passage text
                if passage := section.find('p', {'class': 'passage'}):
                    write_file(file, str(passage))

                logger.info(scripture_ref)
                verses_id = str(scripture_ref['osisRef'])
                verses = verses_id.replace('Bible:', '').replace('.', '_')

                # Sanity check that the verses are legit
                for v in verses.split('-'):
                    v = v.replace('_', ' ', 1).replace('_', ':')
                    b_ref = bible.get_references(v)
                    if not b_ref:
                        raise ValueError(f"{v} is invalid")
                    if int(b_ref[0].start_chapter) != int(chapter_num):
                        raise ValueError('Wrong Chapter')

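                # Worked example (added for illustration): an osisRef of
                # 'Bible:Dan.1.1-Dan.1.21' yields verses 'Dan_1_1-Dan_1_21';
                # each half becomes 'Dan 1:1' / 'Dan 1:21', which pythonbible
                # resolves to chapter 1 of Daniel, so the check above confirms
                # the section belongs to this chapter before its scrap is loaded.
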
verse_section = scraps / f"{verses}.html"
|
||||||
|
logger.info('\t\t',verse_section.name, verse_section.exists())
|
||||||
|
if verse_section.exists() is False:
|
||||||
|
missing.append((book_name, chapter_num, verse_section.name))
|
||||||
|
|
||||||
|
if verse_section.name in files_used:
|
||||||
|
raise FileExistsError(f'{verse_section} has been already used.')
|
||||||
|
|
||||||
|
files_used.append(verse_section.name)
|
||||||
|
|
||||||
|
write_file(file, f'<!-- {verse_section} -->')
|
||||||
|
write_file(file, verse_section.read_text())
|
||||||
|
|
||||||
|
|
||||||
|
parser.create_md_file('./mh_scraps/', book_name, file.stem)
|
||||||
|
parser.feed(file.read_text())
|
||||||
|
parser.clean_file()
|