diff --git a/main_1.py b/main_1.py new file mode 100644 index 0000000..f9a4c82 --- /dev/null +++ b/main_1.py @@ -0,0 +1,572 @@ +import collections +import itertools +import re +import string +import sys +from html.parser import HTMLParser +from pathlib import Path + +import pythonbible as bible +from bs4 import BeautifulSoup +from loguru import logger +from tqdm import tqdm + +base = Path('./scraps_consolidated').absolute() + +# logger.remove(0) + +from functools import partialmethod + +logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING') +logger.add('log.txt', level='WARNING') + +shorts = dict( + [ + ("Nahum", "Nah"), + ("Colossians", "Col"), + ("Esther", "Esth"), + ("James", "Jas"), + ("John", "John"), + ("3John", "3John"), + ("Revelation", "Rev"), + ("Titus", "Titus"), + ("Zechariah", "Zech"), + ("Malachi", "Mal"), + ("Lamentations", "Lam"), + ("Matthew", "Matt"), + ("Jonah", "Jonah"), + ("Hosea", "Hos"), + ("Joel", "Joel"), + ("Genesis", "Gen"), + ("Proverbs", "Prov"), + ("Romans", "Rom"), + ("Micah", "Mic"), + ("Jeremiah", "Jer"), + ("Ecclesiastes", "Eccl"), + ("Philippians", "Phil"), + ("Galatians", "Gal"), + ("Acts", "Acts"), + ("Psalms", "Ps"), + ("Isaiah", "Isa"), + ("2Chronicles", "2Chr"), + ("Job", "Job"), + ("Deuteronomy", "Deut"), + ("Luke", "Luke"), + ("Numbers", "Num"), + ("Daniel", "Dan"), + ("Hebrews", "Heb"), + ("Nehemiah", "Neh"), + ("1John", "1John"), + ("Haggai", "Hag"), + ("1Kings", "1Kgs"), + ("Joshua", "Josh"), + ("1Corinthians", "1Cor"), + ("Mark", "Mark"), + ("2Kings", "2Kgs"), + ("Obadiah", "Obad"), + ("Ephesians", "Eph"), + ("1Timothy", "1Tim"), + ("Philemon", "Phlm"), + ("1Chronicles", "1Chr"), + ("2Samuel", "2Sam"), + ("1Samuel", "1Sam"), + ("2Peter", "2Pet"), + ("2John", "2John"), + ("Song of Solomon", "Song"), + ("Jude", "Jude"), + ("Amos", "Amos"), + ("1Peter", "1Pet"), + ("Leviticus", "Lev"), + ("Habakkuk", "Hab"), + ("Ruth", "Ruth"), + ("1Thessalonians", "1Thess"), + ("Ezra", "Ezra"), + 
("Ezekiel", "Ezek"), + ("2Thessalonians", "2Thess"), + ("2Timothy", "2Tim"), + ("Exodus", "Exod"), + ("2Corinthians", "2Cor"), + ("Judges", "Judg"), + ("Zephaniah", "Zeph"), + ] +) + +end_verses = dict([ + ('Obadiah', 21), + ('Philemon', 25), + ('2John', 13), + ('3John', 15), + ('Jude', 25), +]) + +class MyHTMLParser(HTMLParser): + file = None + passage_verse = None + tag = None + tag_type = None # 'start' or 'end' + skip_re = False + attrs = dict() + + # All the patterns are for re.sub(). This is specifically for getting + # list/sublist into the correct position. The first item is for the pattern + # to find and the second is the replacement pattern. The replacement pattern + # also ids the list so that when the self.clean_file() we can properly add + # the correct amount of `\t` to the line. + patterns = [ + (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'), + (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'), + + # These share the same id. There are times were the two are a mixed + # within the parent lists. + (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'), + (r"(\S\s)(\d+\.)(\s\b|\W)", + lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'), + + + (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.", + lambda x: f'\n{parser.roman_to_int(x.group())}.~1'), + ] + + @staticmethod + def roman_to_int(number: str) -> int: + if number is None: + return 0 + + # Sometimes the roman numeral comes with a '.'. Striping to not cause + # issues + number = number.strip('.') + + roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} + total = 0 + for i in range(len(number) - 1, -1, -1): + num = roman[number[i]] + if 3 * num < total: + total -= num + else: + total += num + + return total + + def create_md_file(self, base_dir: Path, folder: str, file: str): + """ Create the path of the md file that will be written to as the + program process the xml file. 
def write_to_file(self, data, skip_re=False):
    """Append ``data`` to the current output file, tagging list markers first.

    Unless ``skip_re`` is true, every ``(pattern, replacement)`` pair in
    ``self.patterns`` is applied to ``data`` with ``re.sub`` so that list and
    sub-list markers carry the ids later consumed by ``self.clean_file()``.

    Args:
        data: Text to write.
        skip_re: When true, write ``data`` untouched.

    When ``self.file`` is not set the text is printed instead of written.
    """
    text = data
    # An empty rule set means "leave the text exactly as given".
    rules = () if skip_re else self.patterns
    for regex, replacement in rules:
        text = re.sub(regex, replacement, text)

    if not self.file:
        print(text)
        return

    with open(self.file, 'a+') as handle:
        handle.write(text)
def normalize_osis_verses(self, verses):
    """Turn an OSIS reference string into readable scripture references.

    Takes something like
    'Bible:John.1.3 Bible:John.1.10 Bible:Acts.17.24-Acts.17.25 Bible:Gen.5'
    and returns the references as formatted by ``pythonbible``.

    Args:
        verses: One or more ``Bible:``-prefixed OSIS references; a range is
            written as two references joined by ``-``.

    Returns:
        The formatted references joined with ``';'``.

    Raises:
        Exception: Whatever ``pythonbible`` raises while parsing a chunk is
            logged as a warning and re-raised.
    """
    normalized_refs = []
    for chunk in verses.removeprefix("Bible:").split('Bible:'):
        spanned_verses = []
        for ref in chunk.split('-'):
            # "John.1.3" -> "John 1:3": the first '.' separates book and
            # chapter, any later one separates chapter and verse.
            verse = ref.replace('.', ' ', 1).replace('.', ':')
            # NOTE: an earlier revision called
            # `verse.replace(book, convert[book])` here. `convert` is not
            # defined anywhere in this module, so the call raised NameError,
            # and its result was discarded anyway; it has been removed.
            spanned_verses.append(
                bible.format_scripture_references(
                    bible.get_references(verse.strip())
                )
            )

        try:
            normalized = bible.format_scripture_references(
                bible.get_references('-'.join(spanned_verses))
            )
        except Exception:
            logger.warning(f"Error with: {verses=}: {spanned_verses=}")
            raise

        if normalized is False:
            print(f"Error with: {verses=}: {spanned_verses=}")
            sys.exit()

        normalized_refs.append(normalized)

    return ';'.join(normalized_refs)
@staticmethod
@logger.catch(reraise=False)
def obsidian_links(verses):
    """Turn ``Book Chapter:Ver-Ver`` references into Obsidian bible links.

    Args:
        verses: References separated by ``';'``.

    Returns:
        The links, each wrapped in ``[[...]]`` and joined with ``'; '``.
        In every reference ``':'`` becomes ``'#'`` and ``'-'`` becomes
        ``'..'``; a reference that starts with a digit also loses its first
        space. Empty entries are skipped.
    """
    links = []
    for verse in verses.split(';'):
        if not verse:
            # Nothing to link; indexing verse[0] below would raise.
            continue

        verse = verse.replace(':', '#').replace('-', '..')
        if verse[0] in string.digits:
            # str.replace returns a new string, so the result has to be
            # re-bound; the original call discarded it and the space stayed.
            verse = verse.replace(' ', '', 1)
        links.append(f"[[{verse}]]")

    return '; '.join(links)
+ if self.tag_type == 'start': + self.write_to_file('\n# ',) + self.skip_re = True + + if self.tag_type == 'end': + self.write_to_file('\n') + self.skip_re = False + + case 'scripref': # Scripture ref + # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')] + # bible.format_scripture_references(bible.get_references(attrs['osisref'])) + if self.tag_type == 'start': + if 'osisref' in self.attrs: + verses = self.normalize_osis_verses( + self.attrs['osisref'] + ) + else: + # verses = self.normalize_osis_verses( + # self.convert_passage(self.attrs['passage']) + # ) + raise ValueError('No `osisref` in tag attrs') + + self.write_to_file(self.obsidian_links(verses)) + + case 'p': # Paragraph + if self.tag_type == 'start': + if self.attrs.get('class', False) == 'passage': + if self.passage_verse: + try: + verse = bible.get_references( + self.normalize_osis_verses( + self.passage_verse) + )[0] + except: + logger.debug(self.passage_verse) + raise + + if verse.start_verse == verse.end_verse: + self.write_to_file('## Verse: ') + self.write_to_file( + f'{verse.start_verse}' + ) + else: + self.write_to_file('## Verses: ') + self.write_to_file( + f'{verse.start_verse} - {verse.end_verse}' + ) + self.write_to_file('\n\n') + + # Need this do the regex in self.clean does not pick + # this up. Will be also cleaned in the same function. 
def write_file(file, data):
    """Append every item of ``data`` to ``file``, then a single newline.

    Args:
        file: Path of the file to append to; it is created when missing.
        data: An iterable of strings. A plain string works as well, since it
            is consumed character by character.
    """
    with open(file, 'a+') as handle:
        handle.writelines(data)
        handle.write('\n')
+ scraps = Path('./scraps') + files_used = collections.Counter() + for vol in sorted(Path('./').glob('*.xml')): + print(vol) + soup = BeautifulSoup(vol.read_text(), 'xml') + for book in soup.find_all('div1'): + book_name = book['title'].replace('First ', '1').replace('Second ', '2').replace('Third ', '3') + + if shorts.get(book_name, False) is False: + print(f'skipping {book_name}') + continue + + print('\t', book_name, book['id']) + for chapter in book.find_all('div2'): + filename = chapter['title'] + + if 'Index' in filename: + print('skipping', filename) + continue + + # Remove Roman Numerals from file name + if 'CHAPTER' in filename.upper(): + _, roman_num = chapter['title'].split(' ') + chapter_num = parser.roman_to_int(roman_num) + filename = f'Chapter {chapter_num}' + + print('\t\t', filename, chapter['id']) + + file = create_file(base, book_name, filename, 'html') + file.touch() + + # book intro file + if 'INTRODUCTION' in file.name.upper(): + book_intro = scraps / f"{book_name}.html" + if book_intro.exists() is False: + print(book_intro.name, book_intro.exists()) + files_used.update([book_intro.name]) + file.write_bytes(book_intro.read_bytes()) + continue + + # Chapter intro + book_short = shorts[book_name] + + ci = bible.get_references(f"{book_name} {chapter_num}")[0] + if ev:=end_verses.get(book_name, False): + ci.end_verse = ev + + chapter_into = scraps / f"{book_short}_{chapter_num}_{ci.start_verse}-{book_short}_{chapter_num}_{ci.end_verse}.html" + + if chapter_into.exists() is False: + print(chapter_into.name, chapter_into.exists()) + raise ValueError + files_used.update([chapter_into.name]) + + # write(file, '

Chapter Introduction

') + write_file(file, '') + write_file(file, chapter_into.read_text()) + + # Chapter sections + for passage in chapter.find_all('p', {'class': 'passage'}): + # print(passage.text) + new_t = soup.new_tag('p', attrs=passage.attrs) + new_t.string = passage.text + passage.replace_with(new_t) + + write_file(file, '') + + for section in chapter.find_all('div', {'class': 'Commentary'}): + if header := section.find('h4'): + write_file(file, str(header)) + + # verses_id = str(section['id']) + # verses = verses_id.replace('Bible:', '').replace('.', '_') + # + # verse_section = scraps / f"{verses}.html" + # if verse_section.exists() is False: + + for sc in section.previous_elements: + if str(sc).strip(): + break + # print(sc['osisRef']) + + verses_id = str(sc['osisRef']) + verses = verses_id.replace('Bible:', '').replace('.', '_') + + verse_section = scraps / f"{verses}.html" + print('\t\t',verse_section.name, verse_section.exists()) + if verse_section.exists() is False: + raise ValueError(f'osisRef="{verses_id}"') + + files_used.update([verse_section.name]) + + # print(verses) + # passage = section.find('p', {'class': 'passage'}) + + + + diff --git a/main_2.py b/main_2.py new file mode 100644 index 0000000..02e8563 --- /dev/null +++ b/main_2.py @@ -0,0 +1,579 @@ +import collections +import itertools +import re +import string +import sys +from html.parser import HTMLParser +from pathlib import Path + +import pythonbible as bible +from bs4 import BeautifulSoup +from loguru import logger +from tqdm import tqdm +import difflib + +base = Path('./scraps_consolidated').absolute() + +logger.remove(0) +logger.add('mixing.log') +from functools import partialmethod + +logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING') +logger.add('log.txt', level='WARNING') + +shorts = dict( + [ + ("Nahum", "Nah"), + ("Colossians", "Col"), + ("Esther", "Esth"), + ("James", "Jas"), + ("John", "John"), + ("3John", "3John"), + ("Revelation", "Rev"), + ("Titus", 
"Titus"), + ("Zechariah", "Zech"), + ("Malachi", "Mal"), + ("Lamentations", "Lam"), + ("Matthew", "Matt"), + ("Jonah", "Jonah"), + ("Hosea", "Hos"), + ("Joel", "Joel"), + ("Genesis", "Gen"), + ("Proverbs", "Prov"), + ("Romans", "Rom"), + ("Micah", "Mic"), + ("Jeremiah", "Jer"), + ("Ecclesiastes", "Eccl"), + ("Philippians", "Phil"), + ("Galatians", "Gal"), + ("Acts", "Acts"), + ("Psalms", "Ps"), + ("Isaiah", "Isa"), + ("2Chronicles", "2Chr"), + ("Job", "Job"), + ("Deuteronomy", "Deut"), + ("Luke", "Luke"), + ("Numbers", "Num"), + ("Daniel", "Dan"), + ("Hebrews", "Heb"), + ("Nehemiah", "Neh"), + ("1John", "1John"), + ("Haggai", "Hag"), + ("1Kings", "1Kgs"), + ("Joshua", "Josh"), + ("1Corinthians", "1Cor"), + ("Mark", "Mark"), + ("2Kings", "2Kgs"), + ("Obadiah", "Obad"), + ("Ephesians", "Eph"), + ("1Timothy", "1Tim"), + ("Philemon", "Phlm"), + ("1Chronicles", "1Chr"), + ("2Samuel", "2Sam"), + ("1Samuel", "1Sam"), + ("2Peter", "2Pet"), + ("2John", "2John"), + ("Song of Solomon", "Song"), + ("Jude", "Jude"), + ("Amos", "Amos"), + ("1Peter", "1Pet"), + ("Leviticus", "Lev"), + ("Habakkuk", "Hab"), + ("Ruth", "Ruth"), + ("1Thessalonians", "1Thess"), + ("Ezra", "Ezra"), + ("Ezekiel", "Ezek"), + ("2Thessalonians", "2Thess"), + ("2Timothy", "2Tim"), + ("Exodus", "Exod"), + ("2Corinthians", "2Cor"), + ("Judges", "Judg"), + ("Zephaniah", "Zeph"), + ] +) + +end_verses = dict([ + ('Obadiah', 21), + ('Philemon', 25), + ('2John', 13), + ('3John', 15), + ('Jude', 25), +]) + +class MyHTMLParser(HTMLParser): + file = None + passage_verse = None + tag = None + tag_type = None # 'start' or 'end' + skip_re = False + attrs = dict() + + # All the patterns are for re.sub(). This is specifically for getting + # list/sublist into the correct position. The first item is for the pattern + # to find and the second is the replacement pattern. The replacement pattern + # also ids the list so that when the self.clean_file() we can properly add + # the correct amount of `\t` to the line. 
+ patterns = [ + (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'), + (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'), + + # These share the same id. There are times were the two are a mixed + # within the parent lists. + (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'), + (r"(\S\s)(\d+\.)(\s\b|\W)", + lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'), + + + (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.", + lambda x: f'\n{parser.roman_to_int(x.group())}.~1'), + ] + + @staticmethod + def roman_to_int(number: str) -> int: + if number is None: + return 0 + + # Sometimes the roman numeral comes with a '.'. Striping to not cause + # issues + number = number.strip('.') + + roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} + total = 0 + for i in range(len(number) - 1, -1, -1): + num = roman[number[i]] + if 3 * num < total: + total -= num + else: + total += num + + return total + + def create_md_file(self, base_dir: Path, folder: str, file: str): + """ Create the path of the md file that will be written to as the + program process the xml file. 
def write_to_file(self, data, skip_re=False):
    """Append ``data`` to the current output file, tagging list markers first.

    Unless ``skip_re`` is true, every ``(pattern, replacement)`` pair in
    ``self.patterns`` is applied to ``data`` with ``re.sub`` so that list and
    sub-list markers carry the ids later consumed by ``self.clean_file()``.

    Args:
        data: Text to write.
        skip_re: When true, write ``data`` untouched.

    When ``self.file`` is not set the text is logged at INFO level instead
    of being written.
    """
    text = data
    if not skip_re:
        for regex, replacement in self.patterns:
            text = re.sub(regex, replacement, text)

    destination = self.file
    if not destination:
        logger.info(text)
        return

    with open(destination, 'a+') as handle:
        handle.write(text)
def normalize_osis_verses(self, verses):
    """Turn an OSIS reference string into readable scripture references.

    Takes something like
    'Bible:John.1.3 Bible:John.1.10 Bible:Acts.17.24-Acts.17.25 Bible:Gen.5'
    and returns the references as formatted by ``pythonbible``.

    Args:
        verses: One or more ``Bible:``-prefixed OSIS references; a range is
            written as two references joined by ``-``.

    Returns:
        The formatted references joined with ``';'``.

    Raises:
        Exception: Whatever ``pythonbible`` raises while parsing a chunk is
            logged as a warning and re-raised.
    """
    normalized_refs = []
    for chunk in verses.removeprefix("Bible:").split('Bible:'):
        # "John.1.3" -> "John 1:3": the first '.' separates book and
        # chapter, any later one separates chapter and verse.
        spanned_verses = [
            bible.format_scripture_references(
                bible.get_references(
                    ref.replace('.', ' ', 1).replace('.', ':').strip()
                )
            )
            for ref in chunk.split('-')
        ]

        try:
            normalized = bible.format_scripture_references(
                bible.get_references('-'.join(spanned_verses))
            )
        except Exception:
            # Record which input failed, then let the error propagate.
            logger.warning(f"Error with: {verses=}: {spanned_verses=}")
            raise

        if normalized is False:
            logger.info(f"Error with: {verses=}: {spanned_verses=}")
            sys.exit()

        normalized_refs.append(normalized)

    return ';'.join(normalized_refs)
def convert_passage(self, verses):
    """Build an ``osisRef="..."`` attribute string from a passage list.

    The book name is taken from the parent directory of ``self.file``.

    Args:
        verses: Comma-separated parts, each ``chapter:verse``,
            ``chapter:start-end`` or just ``verse`` / ``start-end``. A part
            without a chapter reuses the chapter of the previous part.

    Returns:
        A string of the form ``osisRef="Bible:Book.C.V Bible:Book.C.V-..."``.
    """
    book = self.file.parts[-2]
    chapter = None
    refs = []
    for part in verses.split(','):
        try:
            chapter, verse_range = part.split(':')
        except ValueError:
            # The part did not split into exactly "chapter:verses"; treat
            # the whole part as verses and keep the chapter seen so far.
            verse_range = part

        span = '-'.join(
            f'{book}.{chapter}.{verse}' for verse in verse_range.split('-')
        )
        refs.append(f'Bible:{span}')

    joined = ' '.join(refs)
    return f'osisRef="{joined}"'
+ if self.tag_type == 'start': + try: + verse = bible.get_references( + self.normalize_osis_verses( + self.attrs['osisref']) + )[0] + except: + logger.debug(self.passage_verse) + raise + + if verse.start_verse == verse.end_verse: + self.write_to_file('## Verse: ') + self.write_to_file( + f'{verse.start_verse}' + ) + else: + self.write_to_file('## Verses: ') + self.write_to_file( + f'{verse.start_verse} - {verse.end_verse}' + ) + self.write_to_file('\n\n') + + case 'h4': # this is the section header. Need to keep track of it per file. + if self.tag_type == 'start': + self.write_to_file('\n# ',) + self.skip_re = True + + if self.tag_type == 'end': + self.write_to_file('\n') + self.skip_re = False + + # case 'scripref': # Scripture ref + # # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')] + # # bible.format_scripture_references(bible.get_references(attrs['osisref'])) + # if self.tag_type == 'start': + # if 'osisref' in self.attrs: + # verses = self.normalize_osis_verses( + # self.attrs['osisref'] + # ) + # else: + # # verses = self.normalize_osis_verses( + # # self.convert_passage(self.attrs['passage']) + # # ) + # raise ValueError('No `osisref` in tag attrs') + # + # self.write_to_file(self.obsidian_links(verses)) + + case 'p': # Paragraph + if self.tag_type == 'start': + if self.attrs.get('class', False) == 'passage': + # Need this so the regex in self.clean does not pick + # this up. Will be also cleaned in the same function. 
def handle_data(self, data):
    """Write the text found between tags to the output, lightly cleaned up.

    Text is dropped when the current tag is one of ``scripref``, ``h2``,
    ``h3`` or ``h5``, or when the text is a lone newline. Otherwise em
    dashes become ``'-- '``, ``'GOD'`` becomes ``'God'``, newlines become
    spaces, and the result goes to ``self.write_to_file`` together with the
    current ``self.skip_re`` flag.
    """
    ignored_tags = ('scripref', 'h2', 'h3', 'h5')
    if self.tag in ignored_tags or data == '\n':
        return

    cleaned = data
    # Applied in this order; each pair is (text to find, replacement).
    for old, new in (('—', '-- '), ('GOD', 'God'), ('\n', ' ')):
        cleaned = cleaned.replace(old, new)

    self.write_to_file(cleaned, self.skip_re)
+ scraps = Path('./scraps') + files_used = list() + missing = list() + for book in Path('./vol_split').glob('*'): + book_name = book.name.split(' - ')[-1] + logger.info(book_name) + print(book_name) + for chapter in book.glob('*.xml'): + chapter_name = chapter.stem + chapter_num = chapter_name.split(' ')[-1] + logger.info(f"{chapter_name}") + + file = create_file(base, book_name, chapter_name, 'html') + + # book intro file + if 'INTRODUCTION' in chapter_name.upper(): + book_intro = scraps / f"{book_name}.html" + if book_intro.exists() is False: + logger.info(book_intro.name, book_intro.exists()) + files_used.append(book_intro.name) + write_file(file, f'') + write_file(file, book_intro.read_text()) + parser.create_md_file('./mh_scraps/', book_name, file.stem) + parser.feed(file.read_text()) + parser.clean_file() + continue + + # Chapter intro + book_short = shorts[book_name] + + ci = bible.get_references(f"{book_name} {chapter_num}")[0] + if ev := end_verses.get(book_name, False): + ci.end_verse = ev + + start = f'{book_short}_{chapter_num}_{ci.start_verse}' + end = f'{book_short}_{chapter_num}_{ci.end_verse}' + chapter_into = scraps / f"chapter_{start}-{end}.html" + + if chapter_into.exists() is False: + logger.info(chapter_into.name, chapter_into.exists()) + raise ValueError + files_used.append(chapter_into.name) + + # write(file, '

Chapter Introduction

') + write_file(file, f'') + write_file(file, chapter_into.read_text()) + + sections = BeautifulSoup(chapter.read_text(), 'xml') + + # Parse the text of the chapter + # May need to add a loop to look in all

') + for section in sections.find_all('div', {'class': 'Commentary'}): + if header := section.find('h4'): + write_file(file, str(header)) + + # Scripture ref, should be the section! + scripture_ref = section.previous_sibling + # write_file(file, '\n') + write_file(file, str(scripture_ref)) + + # Passage text + if passage := section.find('p', {'class': 'passage'}): + write_file(file, str(passage)) + + logger.info(scripture_ref) + verses_id = str(scripture_ref['osisRef']) + verses = verses_id.replace('Bible:', '').replace('.', '_') + + # Sanity Check verses are legit + for v in verses.split('-'): + v = v.replace('_', ' ', 1).replace('_', ':') + b_ref = bible.get_references(v) + if not b_ref: + raise ValueError(f"{v} is in valid") + if int(b_ref[0].start_chapter) != int(chapter_num): + raise ValueError('Wrong Chapter') + + + verse_section = scraps / f"{verses}.html" + logger.info('\t\t',verse_section.name, verse_section.exists()) + if verse_section.exists() is False: + missing.append((book_name, chapter_num, verse_section.name)) + + if verse_section.name in files_used: + raise FileExistsError(f'{verse_section} has been already used.') + + files_used.append(verse_section.name) + + write_file(file, f'') + write_file(file, verse_section.read_text()) + + + parser.create_md_file('./mh_scraps/', book_name, file.stem) + parser.feed(file.read_text()) + parser.clean_file()