current

2023-12-19 06:01:43 -05:00 · 2023-12-19 06:01:43 -05:00 · 8f46274857
commit 8f46274857
parent 22e3584494
2 changed files with 1151 additions and 0 deletions
--- a/main_1.py
+++ b/main_1.py
@ -0,0 +1,572 @@
+import collections
+import itertools
+import re
+import string
+import sys
+from html.parser import HTMLParser
+from pathlib import Path
+
+import pythonbible as bible
+from bs4 import BeautifulSoup
+from loguru import logger
+from tqdm import tqdm
+
+# Output root for the consolidated per-book/per-chapter files, made absolute
+# relative to the current working directory.
+base = Path('./scraps_consolidated').absolute()
+
+# logger.remove(0)
+
+from functools import partialmethod
+
+# Attach a `foobar` convenience method to the logger class; it forwards to
+# `log` with "foobar" pre-bound as the first positional argument.
+# NOTE(review): no `logger.level("foobar", ...)` registration is visible in
+# this file -- confirm the custom level exists before `logger.foobar` is used.
+logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
+# Add log.txt as a sink, configured with level 'WARNING'.
+logger.add('log.txt', level='WARNING')
+
+# Book name (as used for the output folders) -> short book code used when
+# building the scraps file names.
+shorts = {
+    "Nahum": "Nah",
+    "Colossians": "Col",
+    "Esther": "Esth",
+    "James": "Jas",
+    "John": "John",
+    "3John": "3John",
+    "Revelation": "Rev",
+    "Titus": "Titus",
+    "Zechariah": "Zech",
+    "Malachi": "Mal",
+    "Lamentations": "Lam",
+    "Matthew": "Matt",
+    "Jonah": "Jonah",
+    "Hosea": "Hos",
+    "Joel": "Joel",
+    "Genesis": "Gen",
+    "Proverbs": "Prov",
+    "Romans": "Rom",
+    "Micah": "Mic",
+    "Jeremiah": "Jer",
+    "Ecclesiastes": "Eccl",
+    "Philippians": "Phil",
+    "Galatians": "Gal",
+    "Acts": "Acts",
+    "Psalms": "Ps",
+    "Isaiah": "Isa",
+    "2Chronicles": "2Chr",
+    "Job": "Job",
+    "Deuteronomy": "Deut",
+    "Luke": "Luke",
+    "Numbers": "Num",
+    "Daniel": "Dan",
+    "Hebrews": "Heb",
+    "Nehemiah": "Neh",
+    "1John": "1John",
+    "Haggai": "Hag",
+    "1Kings": "1Kgs",
+    "Joshua": "Josh",
+    "1Corinthians": "1Cor",
+    "Mark": "Mark",
+    "2Kings": "2Kgs",
+    "Obadiah": "Obad",
+    "Ephesians": "Eph",
+    "1Timothy": "1Tim",
+    "Philemon": "Phlm",
+    "1Chronicles": "1Chr",
+    "2Samuel": "2Sam",
+    "1Samuel": "1Sam",
+    "2Peter": "2Pet",
+    "2John": "2John",
+    "Song of Solomon": "Song",
+    "Jude": "Jude",
+    "Amos": "Amos",
+    "1Peter": "1Pet",
+    "Leviticus": "Lev",
+    "Habakkuk": "Hab",
+    "Ruth": "Ruth",
+    "1Thessalonians": "1Thess",
+    "Ezra": "Ezra",
+    "Ezekiel": "Ezek",
+    "2Thessalonians": "2Thess",
+    "2Timothy": "2Tim",
+    "Exodus": "Exod",
+    "2Corinthians": "2Cor",
+    "Judges": "Judg",
+    "Zephaniah": "Zeph",
+}
+
+# Book name -> end verse number; the main script uses these values to
+# override the end verse of the chapter reference for the listed books.
+end_verses = {
+    'Obadiah': 21,
+    'Philemon': 25,
+    '2John': 13,
+    '3John': 15,
+    'Jude': 25,
+}
+
+class MyHTMLParser(HTMLParser):
+    """HTML parser that emits Markdown for the tags it encounters.
+
+    The handler methods record the tag currently being processed and write
+    Markdown through ``write_to_file``; ``clean_file`` later post-processes
+    the list markers that ``patterns`` inserts.
+    """
+
+    file = None  # Path of the Markdown file being written; None -> no file
+    passage_verse = None  # osisref saved from the last Commentary <scripcom>
+    tag = None  # Name of the tag currently being processed
+    tag_type = None  # 'start' or 'end'
+    skip_re = False  # Passed to write_to_file to bypass `patterns`
+    attrs = {}  # Attributes of the current start tag (set in handle_starttag)
+
+    # All the patterns are for re.sub(). They move list/sublist numbers onto
+    # their own line. The first item is the pattern to find and the second is
+    # the replacement callable. The replacement also tags the list with an id
+    # ('~N') so that self.clean_file() can add the correct amount of `\t` to
+    # the line.
+    patterns = [
+        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
+        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
+
+        # These two share the same id. There are times when the two forms are
+        # mixed within the parent lists.
+        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
+        (r"(\S\s)(\d+\.)(\s\b|\W)",
+         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),
+
+        # Roman-numeral list markers such as 'IV.'. The leading \b keeps the
+        # pattern from firing on the tail of an ordinary word (e.g. the 'D.'
+        # in 'LORD.'). The conversion goes through the class rather than a
+        # module-level `parser` instance, which only exists when the file is
+        # run as a script.
+        (r"\b(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
+         lambda x: f'\n{MyHTMLParser.roman_to_int(x.group())}.~1'),
+    ]
+
+    @staticmethod
+    def roman_to_int(number: str) -> int:
+        """Convert a Roman numeral string to its integer value.
+
+        ``None`` yields 0. Leading/trailing '.' characters are stripped
+        first, because the numeral sometimes arrives as e.g. ``'IV.'``.
+        """
+        if number is None:
+            return 0
+
+        values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+        result = 0
+        # Walk right to left: a symbol whose triple is still below the running
+        # total is a subtractive prefix (the I in IV), otherwise it is added.
+        for symbol in reversed(number.strip('.')):
+            value = values[symbol]
+            result += -value if 3 * value < result else value
+
+        return result
+
+    def create_md_file(self, base_dir: Path, folder: str, file: str):
+        """Prepare a fresh ``.md`` output path and make it the active file.
+
+        Builds ``<base_dir>/<folder>/<file>.md``, creates the parent folders
+        when missing, deletes any existing file at that path, stores the path
+        on ``self.file`` and returns it.
+        """
+        target = Path(f'{base_dir}/{folder}/{file}.md').absolute()
+
+        # Start from a clean slate: folder present, old output removed.
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.unlink(missing_ok=True)
+
+        self.file = target
+        return target
+
+    def write_to_file(self, data, skip_re=False):
+        """Append *data* to the active file, or print it when none is set.
+
+        Unless *skip_re* is truthy, every (pattern, replacement) pair in
+        ``self.patterns`` is applied to *data* first; this is where the
+        list/sublist markers are tagged for later processing in
+        ``self.clean_file()``.
+        """
+        text = data
+        if not skip_re:
+            for regex, replacement in self.patterns:
+                text = re.sub(regex, replacement, text)
+
+        if not self.file:
+            print(text)
+            return
+
+        with open(self.file, 'a+') as handle:
+            handle.write(text)
+
+    def clean_file(self):
+        """Rewrite the active file in place as tidied Markdown.
+
+        Blank lines are dropped, a '# ' header identical to the previously
+        seen one is skipped, headings and paragraphs get spacing, and list
+        lines carrying a '~N' marker are indented with tabs according to the
+        order in which each marker first appeared since the last non-list
+        line.
+
+        Raises:
+            ValueError: if no file has been set.
+        """
+        logger.info(' CLEANING FILE')
+
+        if self.file is None:
+            raise ValueError('No File specified')
+
+        with open(self.file, 'r') as source:
+            raw_lines = source.readlines()
+
+        depth_by_marker = {}  # list marker ('~N') -> number of tabs
+        last_header = None
+        with open(self.file, 'w') as out:
+            out.write('# Chapter Introduction\n\n')
+
+            for raw in raw_lines:
+                text = raw.strip()
+
+                # Do not write blank lines
+                if not text:
+                    continue
+
+                # Main header: skip it when it repeats the one already seen
+                if text.startswith('# '):
+                    if text == last_header:
+                        continue
+                    last_header = text
+
+                # Headings get a `\n` in front of them
+                current = f'\n{text}' if raw.startswith('#') else raw
+
+                if text[0] in string.digits:
+                    # List line: pull out its two-character marker
+                    marker_at = current.find('~')
+                    marker = current[marker_at:marker_at + 2]
+
+                    # A marker seen for the first time gets the next deeper
+                    # level (max so far + 1); the very first one gets 0.
+                    depth = depth_by_marker.setdefault(
+                        marker, max(depth_by_marker.values(), default=-1) + 1
+                    )
+                    tabs = "\t" * depth
+                    current = f'{tabs}{current.replace(marker, "")}'
+                else:
+                    # Not a list: end the paragraph with an extra `\n` and
+                    # forget the indent levels collected so far
+                    current = f'{current}\n'
+                    depth_by_marker.clear()
+
+                out.write(current.removeprefix('Passage: '))
+
+    def normalize_osis_verses(self, verses):
+        """Convert an OSIS reference string into readable references.
+
+        Takes this
+        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
+        and turns it into something more readable: each 'Bible:' chunk is
+        formatted through pythonbible and the results are joined with ';'.
+
+        Any exception raised while formatting a chunk is logged as a warning
+        and re-raised.
+        """
+        normalized_refs = []
+        for chunk in verses.removeprefix("Bible:").split('Bible:'):
+            # Each side of a 'start-end' span is formatted on its own first.
+            spanned_verses = []
+            for ref in chunk.split('-'):
+                # 'Acts.17.24' -> 'Acts 17:24'
+                verse = ref.replace('.', ' ', 1).replace('.', ':')
+                # The previous `verse.replace(book, convert[book])` line was
+                # dropped: `convert` is not defined anywhere (NameError) and
+                # the result of the replace was discarded anyway.
+                spanned_verses.append(
+                    bible.format_scripture_references(
+                        bible.get_references(verse.strip())
+                    )
+                )
+
+            try:
+                normalized = bible.format_scripture_references(
+                    bible.get_references('-'.join(spanned_verses))
+                )
+            except Exception:
+                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
+                raise
+
+            if normalized is False:
+                print(f"Error with: {verses=}: {spanned_verses=}")
+                sys.exit()
+
+            normalized_refs.append(normalized)
+
+        return ';'.join(normalized_refs)
+
+    # @staticmethod
+    def old_normalize_osis_verses(self, verses):
+        """Older variant of the OSIS normalizer, built on NormalizedReference.
+
+        Takes this
+        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
+        and turns it into something more readable, returning the formatted
+        references joined with ';'.
+
+        NOTE(review): each reference is unpacked into exactly three
+        '.'-separated parts, so a two-part reference such as 'Gen.5' from the
+        example above fails to unpack.
+        """
+        normalized_refs = list()
+        for chunks in verses.removeprefix("Bible:").split('Bible:'):
+            spanned_verses = chunks.split('-')
+            # A single reference is treated as a span from itself to itself.
+            if len(spanned_verses) < 2:
+                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
+
+            book, start_chapter, start_verse = spanned_verses[0].split('.')
+            _, end_chapter, end_verse = spanned_verses[1].split('.')
+            # NOTE(review): `convert` is not defined anywhere in the visible
+            # file -- confirm where this mapping is supposed to come from.
+            book = bible.Book[convert[book].upper()]
+
+            # NOTE(review): chapter/verse values are passed as the strings
+            # produced by split() -- confirm NormalizedReference accepts them.
+            normalized = bible.NormalizedReference(
+                book=book,
+                start_chapter=start_chapter,
+                start_verse=start_verse,
+                end_chapter=end_chapter,
+                end_verse=end_verse,
+            )
+
+            # NOTE(review): `self.args` is not assigned by any visible code.
+            if normalized is False:
+                logger.foobar(f'{self.args=}')
+
+            normalized_refs.append(
+                bible.format_scripture_references([normalized]).strip()
+            )
+
+        return ';'.join(normalized_refs)
+
+    @staticmethod
+    @logger.catch(reraise=False)
+    def obsidian_links(verses):
+        """Turn 'Book Chapter:Ver-Ver' references into Obsidian bible links.
+
+        *verses* is a ';'-separated string of references. In each one ':'
+        becomes '#' and '-' becomes '..'; a reference that starts with a digit
+        has its first space removed ('1 John 1#3' -> '1John 1#3'). The
+        resulting '[[...]]' links are returned joined with '; '. Empty
+        segments are skipped.
+        """
+        links = []
+        for verse in verses.split(';'):
+            if not verse:
+                # Nothing to link (e.g. empty input or a trailing ';').
+                continue
+
+            verse = verse.replace(':', '#').replace('-', '..')
+            if verse[0] in string.digits:
+                # str.replace returns a new string; the result has to be kept
+                # for the space after the book number to actually go away.
+                verse = verse.replace(' ', '', 1)
+            links.append(f"[[{verse}]]")
+
+        return '; '.join(links)
+
+    def convert_passage(self, verses):
+        """Build an ``osisRef="..."`` attribute string from a passage list.
+
+        *verses* is a ','-separated list such as '1:1-3,5'. An entry of the
+        form 'chapter:verse' sets the chapter; an entry without a single ':'
+        is taken as a verse (or verse range) in the most recently seen
+        chapter. The book name is the folder name of ``self.file``.
+        """
+        book = self.file.parts[-2]
+        chapter = None
+        refs = []
+        for ver in verses.split(','):
+            try:
+                chapter, v = ver.split(':')
+            except ValueError:
+                # No (or more than one) ':' -- keep the previous chapter and
+                # treat the whole entry as the verse part.
+                v = ver
+
+            span = [f'{book}.{chapter}.{i}' for i in v.split('-')]
+            refs.append(f'Bible:{"-".join(span)}')
+
+        joined = ' '.join(refs)
+        return f'osisRef="{joined}"'
+
+    def process_tag(self, test=None):
+        """Emit the Markdown that belongs to the current tag event.
+
+        Reads ``self.tag``, ``self.tag_type`` and ``self.attrs`` (set by the
+        start/end tag handlers). Tags without a branch below are ignored.
+        *test* is accepted but not used.
+
+        Raises:
+            ValueError: on a ``scripref`` start tag without an ``osisref``.
+        """
+        tag = self.tag
+        starting = self.tag_type == 'start'
+        ending = self.tag_type == 'end'
+
+        if tag == 'b':  # Bold
+            self.write_to_file('**')
+
+        elif tag == 'i':  # Italics
+            self.write_to_file('*')
+
+        elif tag == 'scripcom':
+            # Remember the verse reference for the section.
+            if starting and self.attrs['type'] == 'Commentary':
+                self.passage_verse = self.attrs['osisref']
+
+        elif tag == 'h4':
+            # Section header; the regex patterns are switched off while the
+            # header text is being written.
+            if starting:
+                self.write_to_file('\n# ',)
+                self.skip_re = True
+
+            if ending:
+                self.write_to_file('\n')
+                self.skip_re = False
+
+        elif tag == 'scripref':  # Scripture ref
+            # The 'osisref' attr looks like 'Bible:Rev.14.6-Rev.14.7'.
+            if starting:
+                if 'osisref' not in self.attrs:
+                    raise ValueError('No `osisref` in tag attrs')
+
+                verses = self.normalize_osis_verses(self.attrs['osisref'])
+                self.write_to_file(self.obsidian_links(verses))
+
+        elif tag == 'p':  # Paragraph
+            if starting and self.attrs.get('class', False) == 'passage':
+                if self.passage_verse:
+                    try:
+                        verse = bible.get_references(
+                            self.normalize_osis_verses(self.passage_verse)
+                        )[0]
+                    except:
+                        logger.debug(self.passage_verse)
+                        raise
+
+                    if verse.start_verse == verse.end_verse:
+                        self.write_to_file('## Verse: ')
+                        self.write_to_file(f'{verse.start_verse}')
+                    else:
+                        self.write_to_file('## Verses: ')
+                        self.write_to_file(
+                            f'{verse.start_verse} - {verse.end_verse}'
+                        )
+                    self.write_to_file('\n\n')
+
+                # Prefix so the list regexes in self.clean_file do not pick
+                # the passage up; the prefix is removed again there.
+                self.write_to_file('Passage: ')
+
+            if ending:
+                self.write_to_file('\n\n')
+
+        elif tag == 'li':
+            self.write_to_file('- ' if starting else '\n')
+
+    def handle_starttag(self, tag, attrs):
+        """Record the opening tag and its attributes, then process it."""
+        # Set the tag trackers read by process_tag()
+        self.tag, self.tag_type = tag, 'start'
+        self.attrs = dict(attrs)
+
+        self.process_tag()
+
+    def handle_endtag(self, tag):
+        """Process the closing tag, then reset all tag-tracking state."""
+        self.tag, self.tag_type = tag, 'end'
+        self.process_tag()
+
+        # Clear the tag trackers so later text is not tied to this tag
+        self.tag = self.tag_type = None
+        self.attrs.clear()
+        self.skip_re = False
+
+    def handle_data(self, data):
+        """Write a text node, after light normalisation, to the output.
+
+        Text inside 'scripref', 'h2', 'h3' and 'h5' tags, and text that is
+        just a newline, is dropped. Otherwise em dashes become '-- ', 'GOD'
+        becomes 'God' and newlines become spaces before the text is written
+        with the current ``skip_re`` setting.
+        """
+        if self.tag in ['scripref', 'h2', 'h3', 'h5'] or data == '\n':
+            return
+
+        # Applied in this order, one after another.
+        for old, new in (('—', '-- '), ('GOD', 'God'), ('\n', ' ')):
+            data = data.replace(old, new)
+
+        self.write_to_file(data, self.skip_re)
+
+
+def create_file(base_dir: Path, folder: str, file: str, file_type: str) -> Path:
+    """Return a fresh output path ``<base_dir>/<folder>/<file>.<file_type>``.
+
+    The parent folders are created when missing and any file already at that
+    path is deleted; the file itself is not created here.
+    """
+    target = Path(f'{base_dir}/{folder}/{file}.{file_type}').absolute()
+
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.unlink(missing_ok=True)
+
+    return target
+
+
+def write_file(file, data):
+    """Append every item of *data* to *file*, followed by one newline.
+
+    *data* is iterated, so a plain string is written character by character
+    and a list of strings is written item by item.
+    """
+    with open(file, 'a+') as out:
+        out.writelines(data)
+        out.write('\n')
+
+if __name__ == '__main__':
+    # Script entry point: for every volume XML in the working directory,
+    # create one HTML file per chapter under `base`, seed it with the
+    # matching pre-scraped files from ./scraps, and check that the scraps
+    # files each commentary section refers to exist.
+    parser = MyHTMLParser()
+
+    # Process each volume file.
+    scraps = Path('./scraps')
+    # Counts how many times each scraps file name was referenced.
+    files_used = collections.Counter()
+    for vol in sorted(Path('./').glob('*.xml')):
+        print(vol)
+        soup = BeautifulSoup(vol.read_text(), 'xml')
+        for book in soup.find_all('div1'):
+            # 'First John' -> '1John', 'Second Kings' -> '2Kings', ...
+            book_name = book['title'].replace('First ', '1').replace('Second ', '2').replace('Third ', '3')
+
+            # Only books listed in `shorts` are processed.
+            if shorts.get(book_name, False) is False:
+                print(f'skipping {book_name}')
+                continue
+
+            print('\t', book_name, book['id'])
+            for chapter in book.find_all('div2'):
+                filename = chapter['title']
+
+                if 'Index' in filename:
+                    print('skipping', filename)
+                    continue
+
+                # Remove Roman Numerals from file name
+                # NOTE(review): the unpacking below requires the title to be
+                # exactly two space-separated words -- confirm.
+                if 'CHAPTER' in filename.upper():
+                    _, roman_num = chapter['title'].split(' ')
+                    chapter_num = parser.roman_to_int(roman_num)
+                    filename = f'Chapter {chapter_num}'
+
+                print('\t\t', filename, chapter['id'])
+
+                # Fresh, empty output file for this chapter.
+                file = create_file(base, book_name, filename, 'html')
+                file.touch()
+
+                # book intro file
+                # The book introduction is copied verbatim from
+                # scraps/<book_name>.html; nothing else is done for it.
+                if 'INTRODUCTION' in file.name.upper():
+                    book_intro = scraps / f"{book_name}.html"
+                    if book_intro.exists() is False:
+                        print(book_intro.name, book_intro.exists())
+                    files_used.update([book_intro.name])
+                    file.write_bytes(book_intro.read_bytes())
+                    continue
+
+                # Chapter intro
+                book_short = shorts[book_name]
+
+                # NOTE(review): `chapter_num` is only assigned when the title
+                # contains 'CHAPTER'; for any other title reaching this point
+                # it is either unbound or left over from the previous chapter.
+                ci = bible.get_references(f"{book_name} {chapter_num}")[0]
+                # Books listed in `end_verses` get their end verse overridden.
+                if ev:=end_verses.get(book_name, False):
+                    ci.end_verse = ev
+
+                # Scraps file covering the whole chapter, named
+                # <short>_<chapter>_<start>-<short>_<chapter>_<end>.html
+                chapter_into = scraps / f"{book_short}_{chapter_num}_{ci.start_verse}-{book_short}_{chapter_num}_{ci.end_verse}.html"
+
+                if chapter_into.exists() is False:
+                    print(chapter_into.name, chapter_into.exists())
+                    raise ValueError
+                files_used.update([chapter_into.name])
+
+                # write(file, '<h1>Chapter Introduction</h1>')
+                write_file(file, '<!-- Chapter Intro -->')
+                write_file(file, chapter_into.read_text())
+
+                # Chapter sections
+                # Replace every passage paragraph with a new <p> that keeps
+                # the attributes but holds only the paragraph's text.
+                for passage in chapter.find_all('p', {'class': 'passage'}):
+                    # print(passage.text)
+                    new_t = soup.new_tag('p', attrs=passage.attrs)
+                    new_t.string = passage.text
+                    passage.replace_with(new_t)
+
+                write_file(file, '<!-- Chapter Sections -->')
+
+                for section in chapter.find_all('div', {'class': 'Commentary'}):
+                    if header := section.find('h4'):
+                        write_file(file, str(header))
+
+                    # verses_id = str(section['id'])
+                    # verses = verses_id.replace('Bible:', '').replace('.', '_')
+                    #
+                    # verse_section = scraps / f"{verses}.html"
+                    # if verse_section.exists() is False:
+
+                    # Walk backwards from the section to the nearest element
+                    # whose string form is not blank; `sc` keeps that element
+                    # after the loop.
+                    # NOTE(review): presumably this is the preceding scripCom
+                    # tag carrying `osisRef` -- confirm against the XML.
+                    for sc in section.previous_elements:
+                        if str(sc).strip():
+                            break
+                    # print(sc['osisRef'])
+
+                    # 'Bible:Gen.1.1-Gen.1.2' -> 'Gen_1_1-Gen_1_2'
+                    verses_id = str(sc['osisRef'])
+                    verses = verses_id.replace('Bible:', '').replace('.', '_')
+
+                    # The matching scraps file must exist; in the visible code
+                    # it is only checked and counted, not written out.
+                    verse_section = scraps / f"{verses}.html"
+                    print('\t\t',verse_section.name, verse_section.exists())
+                    if verse_section.exists() is False:
+                        raise ValueError(f'osisRef="{verses_id}"')
+
+                    files_used.update([verse_section.name])
+
+                    # print(verses)
+                    # passage = section.find('p', {'class': 'passage'})
+
+
+
+
--- a/main_2.py
+++ b/main_2.py
@ -0,0 +1,579 @@
+import collections
+import itertools
+import re
+import string
+import sys
+from html.parser import HTMLParser
+from pathlib import Path
+
+import pythonbible as bible
+from bs4 import BeautifulSoup
+from loguru import logger
+from tqdm import tqdm
+import difflib
+
+# Output root for the consolidated per-book/per-chapter files, made absolute
+# relative to the current working directory.
+base = Path('./scraps_consolidated').absolute()
+
+# Remove the handler with id 0 and add mixing.log as a sink.
+# NOTE(review): presumably id 0 is loguru's pre-configured stderr handler --
+# confirm.
+logger.remove(0)
+logger.add('mixing.log')
+from functools import partialmethod
+
+# Attach a `foobar` convenience method to the logger class; it forwards to
+# `log` with "foobar" pre-bound as the first positional argument.
+# NOTE(review): no `logger.level("foobar", ...)` registration is visible in
+# this file -- confirm the custom level exists before `logger.foobar` is used.
+logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
+# Add log.txt as a second sink, configured with level 'WARNING'.
+logger.add('log.txt', level='WARNING')
+
+# Book name -> short book code.
+# NOTE(review): presumably used to build the scraps file names -- the caller
+# is outside the visible part of this file.
+shorts = {
+    "Nahum": "Nah",
+    "Colossians": "Col",
+    "Esther": "Esth",
+    "James": "Jas",
+    "John": "John",
+    "3John": "3John",
+    "Revelation": "Rev",
+    "Titus": "Titus",
+    "Zechariah": "Zech",
+    "Malachi": "Mal",
+    "Lamentations": "Lam",
+    "Matthew": "Matt",
+    "Jonah": "Jonah",
+    "Hosea": "Hos",
+    "Joel": "Joel",
+    "Genesis": "Gen",
+    "Proverbs": "Prov",
+    "Romans": "Rom",
+    "Micah": "Mic",
+    "Jeremiah": "Jer",
+    "Ecclesiastes": "Eccl",
+    "Philippians": "Phil",
+    "Galatians": "Gal",
+    "Acts": "Acts",
+    "Psalms": "Ps",
+    "Isaiah": "Isa",
+    "2Chronicles": "2Chr",
+    "Job": "Job",
+    "Deuteronomy": "Deut",
+    "Luke": "Luke",
+    "Numbers": "Num",
+    "Daniel": "Dan",
+    "Hebrews": "Heb",
+    "Nehemiah": "Neh",
+    "1John": "1John",
+    "Haggai": "Hag",
+    "1Kings": "1Kgs",
+    "Joshua": "Josh",
+    "1Corinthians": "1Cor",
+    "Mark": "Mark",
+    "2Kings": "2Kgs",
+    "Obadiah": "Obad",
+    "Ephesians": "Eph",
+    "1Timothy": "1Tim",
+    "Philemon": "Phlm",
+    "1Chronicles": "1Chr",
+    "2Samuel": "2Sam",
+    "1Samuel": "1Sam",
+    "2Peter": "2Pet",
+    "2John": "2John",
+    "Song of Solomon": "Song",
+    "Jude": "Jude",
+    "Amos": "Amos",
+    "1Peter": "1Pet",
+    "Leviticus": "Lev",
+    "Habakkuk": "Hab",
+    "Ruth": "Ruth",
+    "1Thessalonians": "1Thess",
+    "Ezra": "Ezra",
+    "Ezekiel": "Ezek",
+    "2Thessalonians": "2Thess",
+    "2Timothy": "2Tim",
+    "Exodus": "Exod",
+    "2Corinthians": "2Cor",
+    "Judges": "Judg",
+    "Zephaniah": "Zeph",
+}
+
+# Book name -> end verse number.
+# NOTE(review): presumably the last verse of each listed book -- confirm
+# against the caller, which is outside the visible part of this file.
+end_verses = {
+    'Obadiah': 21,
+    'Philemon': 25,
+    '2John': 13,
+    '3John': 15,
+    'Jude': 25,
+}
+
+class MyHTMLParser(HTMLParser):
+    """HTML parser that emits Markdown for the tags it encounters.
+
+    The handler methods record the tag currently being processed and write
+    Markdown through ``write_to_file``; ``clean_file`` later post-processes
+    the list markers that ``patterns`` inserts.
+    """
+
+    file = None  # Path of the Markdown file being written; None -> no file
+    passage_verse = None  # Saved passage reference; None until assigned
+    tag = None  # Name of the tag currently being processed
+    tag_type = None  # 'start' or 'end'
+    skip_re = False  # Passed to write_to_file to bypass `patterns`
+    attrs = {}  # Attributes of the current start tag
+
+    # All the patterns are for re.sub(). They move list/sublist numbers onto
+    # their own line. The first item is the pattern to find and the second is
+    # the replacement callable. The replacement also tags the list with an id
+    # ('~N') so that self.clean_file() can add the correct amount of `\t` to
+    # the line.
+    patterns = [
+        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
+        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
+
+        # These two share the same id. There are times when the two forms are
+        # mixed within the parent lists.
+        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
+        (r"(\S\s)(\d+\.)(\s\b|\W)",
+         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),
+
+        # Roman-numeral list markers such as 'IV.'. The leading \b keeps the
+        # pattern from firing on the tail of an ordinary word (e.g. the 'D.'
+        # in 'LORD.'). The conversion goes through the class rather than a
+        # module-level `parser` instance, which is not defined in the visible
+        # part of this file.
+        (r"\b(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
+         lambda x: f'\n{MyHTMLParser.roman_to_int(x.group())}.~1'),
+    ]
+
+    @staticmethod
+    def roman_to_int(number: str) -> int:
+        """Convert a Roman numeral string to its integer value.
+
+        ``None`` yields 0. Leading/trailing '.' characters are stripped
+        first, because the numeral sometimes arrives as e.g. ``'IV.'``.
+        """
+        if number is None:
+            return 0
+
+        values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+        result = 0
+        # Walk right to left: a symbol whose triple is still below the running
+        # total is a subtractive prefix (the I in IV), otherwise it is added.
+        for symbol in reversed(number.strip('.')):
+            value = values[symbol]
+            result += -value if 3 * value < result else value
+
+        return result
+
+    def create_md_file(self, base_dir: Path, folder: str, file: str):
+        """Prepare a fresh ``.md`` output path and make it the active file.
+
+        Builds ``<base_dir>/<folder>/<file>.md``, creates the parent folders
+        when missing, deletes any existing file at that path, stores the path
+        on ``self.file`` and returns it.
+        """
+        target = Path(f'{base_dir}/{folder}/{file}.md').absolute()
+
+        # Start from a clean slate: folder present, old output removed.
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.unlink(missing_ok=True)
+
+        self.file = target
+        return target
+
+    def write_to_file(self, data, skip_re=False):
+        """Append *data* to the active file, or log it when none is set.
+
+        Unless *skip_re* is truthy, every (pattern, replacement) pair in
+        ``self.patterns`` is applied to *data* first; this is where the
+        list/sublist markers are tagged for later processing in
+        ``self.clean_file()``.
+        """
+        text = data
+        if not skip_re:
+            for regex, replacement in self.patterns:
+                text = re.sub(regex, replacement, text)
+
+        if not self.file:
+            logger.info(text)
+            return
+
+        with open(self.file, 'a+') as handle:
+            handle.write(text)
+
+    def clean_file(self):
+        """Rewrite the active file in place as tidied Markdown.
+
+        Blank lines are dropped, a '# ' header identical to the previously
+        seen one is skipped, headings and paragraphs get spacing, and list
+        lines carrying a '~N' marker are indented with tabs according to the
+        order in which each marker first appeared since the last non-list
+        line.
+
+        Raises:
+            ValueError: if no file has been set.
+        """
+        logger.info(' CLEANING FILE')
+
+        if self.file is None:
+            raise ValueError('No File specified')
+
+        with open(self.file, 'r') as source:
+            raw_lines = source.readlines()
+
+        depth_by_marker = {}  # list marker ('~N') -> number of tabs
+        last_header = None
+        with open(self.file, 'w') as out:
+            out.write('# Chapter Introduction\n\n')
+
+            for raw in raw_lines:
+                text = raw.strip()
+
+                # Do not write blank lines
+                if not text:
+                    continue
+
+                # Main header: skip it when it repeats the one already seen
+                if text.startswith('# '):
+                    if text == last_header:
+                        continue
+                    last_header = text
+
+                # Headings get a `\n` in front of them
+                current = f'\n{text}' if raw.startswith('#') else raw
+
+                if text[0] in string.digits:
+                    # List line: pull out its two-character marker
+                    marker_at = current.find('~')
+                    marker = current[marker_at:marker_at + 2]
+
+                    # A marker seen for the first time gets the next deeper
+                    # level (max so far + 1); the very first one gets 0.
+                    depth = depth_by_marker.setdefault(
+                        marker, max(depth_by_marker.values(), default=-1) + 1
+                    )
+                    tabs = "\t" * depth
+                    current = f'{tabs}{current.replace(marker, "")}'
+                else:
+                    # Not a list: end the paragraph with an extra `\n` and
+                    # forget the indent levels collected so far
+                    current = f'{current}\n'
+                    depth_by_marker.clear()
+
+                out.write(current.removeprefix('Passage: '))
+
+    def normalize_osis_verses(self, verses):
+        """Convert an OSIS reference string into readable references.
+
+        Takes this
+        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
+        and turns it into something more readable: each 'Bible:' chunk is
+        formatted through pythonbible and the results are joined with ';'.
+
+        Any exception raised while formatting a chunk is logged as a warning
+        and re-raised.
+        """
+        normalized_refs = []
+        for chunk in verses.removeprefix("Bible:").split('Bible:'):
+            # Each side of a 'start-end' span is formatted on its own first.
+            # (The unused local `book` that used to be computed here is gone.)
+            spanned_verses = []
+            for ref in chunk.split('-'):
+                # 'Acts.17.24' -> 'Acts 17:24'
+                verse = ref.replace('.', ' ', 1).replace('.', ':')
+                spanned_verses.append(
+                    bible.format_scripture_references(
+                        bible.get_references(verse.strip())
+                    )
+                )
+
+            try:
+                normalized = bible.format_scripture_references(
+                    bible.get_references('-'.join(spanned_verses))
+                )
+            except Exception:
+                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
+                raise
+
+            if normalized is False:
+                logger.info(f"Error with: {verses=}: {spanned_verses=}")
+                sys.exit()
+
+            normalized_refs.append(normalized)
+
+        return ';'.join(normalized_refs)
+
+    # @staticmethod
+    def old_normalize_osis_verses(self, verses):
+        """Older variant of the OSIS normalizer, built on NormalizedReference.
+
+        Takes this
+        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
+        and turns it into something more readable, returning the formatted
+        references joined with ';'.
+
+        NOTE(review): each reference is unpacked into exactly three
+        '.'-separated parts, so a two-part reference such as 'Gen.5' from the
+        example above fails to unpack.
+        """
+        normalized_refs = list()
+        for chunks in verses.removeprefix("Bible:").split('Bible:'):
+            spanned_verses = chunks.split('-')
+            # A single reference is treated as a span from itself to itself.
+            if len(spanned_verses) < 2:
+                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
+
+            book, start_chapter, start_verse = spanned_verses[0].split('.')
+            _, end_chapter, end_verse = spanned_verses[1].split('.')
+            # NOTE(review): `convert` is not defined anywhere in the visible
+            # file -- confirm where this mapping is supposed to come from.
+            book = bible.Book[convert[book].upper()]
+
+            # NOTE(review): chapter/verse values are passed as the strings
+            # produced by split() -- confirm NormalizedReference accepts them.
+            normalized = bible.NormalizedReference(
+                book=book,
+                start_chapter=start_chapter,
+                start_verse=start_verse,
+                end_chapter=end_chapter,
+                end_verse=end_verse,
+            )
+
+            # NOTE(review): `self.args` is not assigned by any visible code.
+            if normalized is False:
+                logger.foobar(f'{self.args=}')
+
+            normalized_refs.append(
+                bible.format_scripture_references([normalized]).strip()
+            )
+
+        return ';'.join(normalized_refs)
+
+    @staticmethod
+    @logger.catch(reraise=False)
+    def obsidian_links(verses):
+        """Turn 'Book Chapter:Ver-Ver' references into Obsidian bible links.
+
+        *verses* is a ';'-separated string of references. In each one ':'
+        becomes '#' and '-' becomes '..'; a reference that starts with a digit
+        has its first space removed ('1 John 1#3' -> '1John 1#3'). The
+        resulting '[[...]]' links are returned joined with '; '. Empty
+        segments are skipped.
+        """
+        links = []
+        for verse in verses.split(';'):
+            if not verse:
+                # Nothing to link (e.g. empty input or a trailing ';').
+                continue
+
+            verse = verse.replace(':', '#').replace('-', '..')
+            if verse[0] in string.digits:
+                # str.replace returns a new string; the result has to be kept
+                # for the space after the book number to actually go away.
+                verse = verse.replace(' ', '', 1)
+            links.append(f"[[{verse}]]")
+
+        return '; '.join(links)
+
+    def convert_passage(self, verses):
+        """Build an ``osisRef="..."`` attribute string from a passage list.
+
+        *verses* is a ','-separated list such as '1:1-3,5'. An entry of the
+        form 'chapter:verse' sets the chapter; an entry without a single ':'
+        is taken as a verse (or verse range) in the most recently seen
+        chapter. The book name is the folder name of ``self.file``.
+        """
+        book = self.file.parts[-2]
+        chapter = None
+        refs = []
+        for ver in verses.split(','):
+            try:
+                chapter, v = ver.split(':')
+            except ValueError:
+                # No (or more than one) ':' -- keep the previous chapter and
+                # treat the whole entry as the verse part.
+                v = ver
+
+            span = [f'{book}.{chapter}.{i}' for i in v.split('-')]
+            refs.append(f'Bible:{"-".join(span)}')
+
+        joined = ' '.join(refs)
+        return f'osisRef="{joined}"'
+
+    def process_tag(self, test=None):
+        match self.tag:
+            case 'b':  # Bold
+                self.write_to_file('**')
+
+            case 'i':  # Italics
+                self.write_to_file('*')
+
+            case 'scripcom':
+                # This saves the verse for the section.
+                if self.tag_type == 'start':
+                    try:
+                        verse = bible.get_references(
+                            self.normalize_osis_verses(
+                                self.attrs['osisref'])
+                        )[0]
+                    except:
+                        logger.debug(self.passage_verse)
+                        raise
+
+                    if verse.start_verse == verse.end_verse:
+                        self.write_to_file('## Verse: ')
+                        self.write_to_file(
+                            f'{verse.start_verse}'
+                        )
+                    else:
+                        self.write_to_file('## Verses: ')
+                        self.write_to_file(
+                            f'{verse.start_verse} - {verse.end_verse}'
+                        )
+                    self.write_to_file('\n\n')
+
+            case 'h4':  # this is the section header. Need to keep track of it per file.
+                if self.tag_type == 'start':
+                    self.write_to_file('\n# ',)
+                    self.skip_re = True
+
+                if self.tag_type == 'end':
+                    self.write_to_file('\n')
+                    self.skip_re = False
+
+            # case 'scripref':  # Scripture ref
+            #     # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
+            #     #  bible.format_scripture_references(bible.get_references(attrs['osisref']))
+            #     if self.tag_type == 'start':
+            #         if 'osisref' in self.attrs:
+            #             verses = self.normalize_osis_verses(
+            #                 self.attrs['osisref']
+            #             )
+            #         else:
+            #             # verses = self.normalize_osis_verses(
+            #             #     self.convert_passage(self.attrs['passage'])
+            #             # )
+            #             raise ValueError('No `osisref` in tag attrs')
+            #
+            #         self.write_to_file(self.obsidian_links(verses))
+
+            case 'p':  # Paragraph
+                if self.tag_type == 'start':
+                    if self.attrs.get('class', False) == 'passage':
+                        # Need this so the regex in self.clean does not pick
+                        # this up. Will be also cleaned in the same function.
+                        self.write_to_file('Passage: ')
+
+                if self.tag_type == 'end':
+                    self.write_to_file('\n\n')
+
+            case 'li':
+                if self.tag_type == 'start':
+                    self.write_to_file('- ')
+                else:
+                    self.write_to_file('\n')
+
+            # case _:
+            #     # logger.info(tag.upper())
+            #     pass
+
+    def handle_starttag(self, tag, attrs):
+        # Set taf trackers
+        self.tag_type = 'start'
+        self.tag = tag
+        self.attrs = dict(attrs)
+
+        self.process_tag()
+
+    def handle_endtag(self, tag):
+        # logger.info(f'End: {tag}')
+        self.tag = tag
+        self.tag_type = 'end'
+        self.process_tag()
+
+        # Clear tag tracker
+        self.tag = None
+        self.tag_type = None
+        self.attrs.clear()
+        self.skip_re = False
+
+    def handle_data(self, data):
+        # logger.info(f'Data: {data!r}')
+
+        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
+            return
+
+        if data == '\n':
+            return
+
+        data = data.replace('—', '-- ')
+        data = data.replace('GOD', 'God')
+        data = data.replace('\n', ' ')
+
+        self.write_to_file(data, self.skip_re)
+
+
+def create_file(base_dir: Path, folder: str, file: str, file_type: str) -> Path:
+    """ Create the path of the md file that will be written to as the
+    program process the xml file.
+    """
+
+    new_file = Path(f'{base_dir}/{folder}/{file}.{file_type}').absolute()
+    new_file.parent.mkdir(parents=True, exist_ok=True)
+    new_file.unlink(missing_ok=True)
+    # logger.info(md_file.absolute())
+    return new_file
+
+
+def write_file(file, data):
+    with open(file, 'a+') as f:
+        for d in data:
+            f.write(d)
+        f.write('\n')
+
+if __name__ == '__main__':
+    parser = MyHTMLParser()
+
+    # Process each volume file.
+    scraps = Path('./scraps')
+    files_used = list()
+    missing = list()
+    for book in Path('./vol_split').glob('*'):
+        book_name = book.name.split(' - ')[-1]
+        logger.info(book_name)
+        print(book_name)
+        for chapter in book.glob('*.xml'):
+            chapter_name = chapter.stem
+            chapter_num = chapter_name.split(' ')[-1]
+            logger.info(f"{chapter_name}")
+
+            file = create_file(base, book_name, chapter_name, 'html')
+
+            # book intro file
+            if 'INTRODUCTION' in chapter_name.upper():
+                book_intro = scraps / f"{book_name}.html"
+                if book_intro.exists() is False:
+                    logger.info(book_intro.name, book_intro.exists())
+                files_used.append(book_intro.name)
+                write_file(file, f'<!-- Gotten from {book_intro}-->')
+                write_file(file, book_intro.read_text())
+                parser.create_md_file('./mh_scraps/', book_name, file.stem)
+                parser.feed(file.read_text())
+                parser.clean_file()
+                continue
+
+            # Chapter intro
+            book_short = shorts[book_name]
+
+            ci = bible.get_references(f"{book_name} {chapter_num}")[0]
+            if ev := end_verses.get(book_name, False):
+                ci.end_verse = ev
+
+            start = f'{book_short}_{chapter_num}_{ci.start_verse}'
+            end = f'{book_short}_{chapter_num}_{ci.end_verse}'
+            chapter_into = scraps / f"chapter_{start}-{end}.html"
+
+            if chapter_into.exists() is False:
+                logger.info(chapter_into.name, chapter_into.exists())
+                raise ValueError
+            files_used.append(chapter_into.name)
+
+            # write(file, '<h1>Chapter Introduction</h1>')
+            write_file(file, f'<!-- Chapter Intro: {chapter_into} -->')
+            write_file(file, chapter_into.read_text())
+
+            sections = BeautifulSoup(chapter.read_text(), 'xml')
+
+            # Parse the text of the chapter
+            # May need to add a loop to look in all <p class='passage> and
+            # unrap any scripref that is in them. the place Dan is getting confused
+            # with the book Daniel.
+            for passage in sections.find_all('p', {'class': 'passage'}):
+                # print(passage.text)
+                new_t = sections.new_tag('p', attrs=passage.attrs)
+                new_t.string = passage.text
+                passage.replace_with(new_t)
+
+            # Clearing all scripRef of internal text. That way if it
+            # had other tags in it would not be processed
+            for scripture in sections.find_all('scripRef'):
+                scripture.clear()
+
+            write_file(file, '<!-- Chapter Sections -->')
+            for section in sections.find_all('div', {'class': 'Commentary'}):
+                if header := section.find('h4'):
+                    write_file(file, str(header))
+
+                # Scripture ref, should be the section!
+                scripture_ref = section.previous_sibling
+                # write_file(file, '\n')
+                write_file(file, str(scripture_ref))
+
+                # Passage text
+                if passage := section.find('p', {'class': 'passage'}):
+                    write_file(file, str(passage))
+
+                logger.info(scripture_ref)
+                verses_id = str(scripture_ref['osisRef'])
+                verses = verses_id.replace('Bible:', '').replace('.', '_')
+
+                # Sanity Check verses are legit
+                for v in verses.split('-'):
+                    v = v.replace('_', ' ', 1).replace('_', ':')
+                    b_ref = bible.get_references(v)
+                    if not b_ref:
+                        raise ValueError(f"{v} is in valid")
+                    if int(b_ref[0].start_chapter) != int(chapter_num):
+                        raise ValueError('Wrong Chapter')
+
+
+                verse_section = scraps / f"{verses}.html"
+                logger.info('\t\t',verse_section.name, verse_section.exists())
+                if verse_section.exists() is False:
+                    missing.append((book_name, chapter_num, verse_section.name))
+
+                if verse_section.name in files_used:
+                    raise FileExistsError(f'{verse_section} has been already used.')
+
+                files_used.append(verse_section.name)
+
+                write_file(file, f'<!-- {verse_section} -->')
+                write_file(file, verse_section.read_text())
+
+
+            parser.create_md_file('./mh_scraps/', book_name, file.stem)
+            parser.feed(file.read_text())
+            parser.clean_file()