import re import string import sys from html.parser import HTMLParser from pathlib import Path import pythonbible as bible from bs4 import BeautifulSoup from loguru import logger from tqdm import tqdm base = Path('./xml/Matthew Henry Commentary').absolute() logger.remove(0) logger.add('log.txt') class MyHTMLParser(HTMLParser): file = None passage_verse = None tag = None tag_type = None # 'start' or 'end' skip_re = False attrs = dict() # All the patterns are for re.sub(). This is specifically for getting # list/sublist into the correct position. The first item is for the pattern # to find and the second is the replacement pattern. The replacement pattern # also ids the list so that when the self.clean_file() we can properly add # the correct amount of `\t` to the line. patterns = [ (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'), (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'), # These share the same id. There are times were the two are a mixed # within the parent lists. (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'), (r"(\S\s)(\d+\.)(\s\b|\W)", lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'), (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.", lambda x: f'\n{parser.roman_to_int(x.group())}.~1'), ] @staticmethod def roman_to_int(number: str) -> int: if number is None: return 0 # Sometimes the roman numeral comes with a '.'. Striping to not cause # issues number = number.strip('.') roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} total = 0 for i in range(len(number) - 1, -1, -1): num = roman[number[i]] if 3 * num < total: total -= num else: total += num return total def create_md_file(self, base_dir: Path, folder: str, file: str): """ Create the path of the md file that will be written to as the program process the xml file. """ md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute() md_file.parent.mkdir(parents=True, exist_ok=True) md_file.unlink(missing_ok=True) # print(md_file.absolute()) self.file = md_file return md_file def write_to_file(self, data, skip_re=False): # if self.file is None: # raise ValueError('No File specified') if not skip_re: # Here the list/sublist are searched for and altered for later # processing in self.clean_file() for pattern, sub in self.patterns: # logger.debug(f'Running Pattern: {pattern} on {data[:50]}') data = re.sub(pattern, sub, data) # print(f'writing: {data!r}') if self.file: # raise ValueError('No File specified') with open(self.file, 'a+') as file: file.write(data) else: print(data) def clean_file(self): logger.info(' CLEANING FILE') if self.file is None: raise ValueError('No File specified') # Read all the lines with open(self.file, 'r') as file: lines = file.readlines() indent = 0 # Current number of '\t' to add in front of the list number sublist = dict() # Track the indent number based on the id. with open(self.file, 'w') as file: file.write('# Chapter Introduction\n\n') header = None for line in lines: # Do not write blank lines if not line.strip(): continue # Main header, if a header has already been written skip it if line.strip().startswith('# '): if header == line.strip(): continue header = line.strip() # Add a `\n` if line is a heading. if line.startswith('#'): line = f'\n{line.strip()}' # If line is not a list then add a `\n` after the paragraph, # and reset the indent tracker if line.strip()[0] not in string.digits: line = f'{line}\n' # print('resetting indent') indent = 0 sublist.clear() else: # Get the list id start = line.find('~') end = start + 2 list_id = line[start:end] # Get the number of indents based on the list id, if it does # not exist they set the value to the max value +1. If this # is the first it will be set to 0 (-1 +1 = 0) indents = "\t" * sublist.setdefault( list_id, max(sublist.values(), default=-1)+1 ) line = f'{indents}{line.replace(list_id, "")}' if line.startswith('Passage: '): line = line.removeprefix('Passage: ') file.write(line) @staticmethod def normalize_osis_verses(verses): """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25' and turns it into somthing more readable""" normalized_refs = list() for chunks in verses.removeprefix("Bible:").split('Bible:'): spanned_verses = list() for ref in chunks.split('-'): verse = ref.replace('.', ' ', 1).replace('.', ':') spanned_verses.append( bible.format_scripture_references( bible.get_references(verse.strip()) ) ) try: normalized = bible.format_scripture_references( bible.get_references('-'.join(spanned_verses)) ) except: logger.warning(f"Error with: {verses=}: {spanned_verses=}") raise normalized_refs.append(normalized) return ';'.join(normalized_refs) @staticmethod def obsidian_links(verses): """Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link""" links = [] for verse in verses.split(';'): links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]") return '; '.join(links) def convert_passage(self, verses): book = self.file.parts[-2] chapter = None ref = [] for ver in verses.split(','): try: chapter, v = ver.split(':') except: v = ver r = [] for i in v.split('-'): r.append(f'{book}.{chapter}.{i}') ref.append(f'Bible:{"-".join(r)}') x = ' '.join(ref) return f'osisRef="{x}"' def process_tag(self, test=None): match self.tag: case 'b': # Bold self.write_to_file('**') case 'i': # Italics self.write_to_file('*') case 'scripcom': # This saves the verse for the section. if self.tag_type == 'start': if self.attrs['type'] == 'Commentary': self.passage_verse = self.attrs['osisref'] case 'h4': # this is the section header. Need to keep track of it per file. if self.tag_type == 'start': self.write_to_file('\n# ',) self.skip_re = True if self.tag_type == 'end': self.write_to_file('\n') self.skip_re = False case 'scripref': # Scripture ref # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')] # bible.format_scripture_references(bible.get_references(attrs['osisref'])) if self.tag_type == 'start': if 'osisref' in self.attrs: verses = self.normalize_osis_verses( self.attrs['osisref'] ) else: verses = self.normalize_osis_verses( self.convert_passage(self.attrs['passage']) ) self.write_to_file(self.obsidian_links(verses)) case 'p': # Paragraph if self.tag_type == 'start': if self.attrs.get('class', False) == 'passage': if self.passage_verse: try: verse = bible.get_references( self.normalize_osis_verses( self.passage_verse) )[0] except: logger.debug(self.passage_verse) raise self.write_to_file('## Verses: ') self.write_to_file( f'{verse.start_verse} - {verse.end_verse}' ) self.write_to_file('\n\n') # Need this do the regex in self.clean does not pick # this up. Will be also cleaned in the same function. self.write_to_file('Passage: ') if self.tag_type == 'end': self.write_to_file('\n\n') case 'li': if self.tag_type == 'start': self.write_to_file('- ') else: self.write_to_file('\n') # case _: # # print(tag.upper()) # pass def handle_starttag(self, tag, attrs): # Set taf trackers self.tag_type = 'start' self.tag = tag self.attrs = dict(attrs) self.process_tag() def handle_endtag(self, tag): # print(f'End: {tag}') self.tag = tag self.tag_type = 'end' self.process_tag() # Clear tag tracker self.tag = None self.tag_type = None self.attrs.clear() self.skip_re = False def handle_data(self, data): # print(f'Data: {data!r}') if self.tag in ['scripref', 'h2', 'h3', 'h5']: return if data == '\n': return data = data.replace('—', '-- ') data = data.replace('GOD', 'God') data = data.replace('\n', ' ') self.write_to_file(data, self.skip_re) if __name__ == '__main__': parser = MyHTMLParser() # Process each volume file. pbar = tqdm(total=1273) for vol in sorted(Path('./').glob('*.xml')): logger.info(vol) soup = BeautifulSoup(vol.read_text(), 'xml') # Get each book in the volume # This will be the main folder for all the book's chapters for book in soup.find_all('div1'): book_name = (book['title']. replace('First ', '1'). replace('Second ', '2'). replace('Third ', '3') ) logger.info(book_name) # These are the chapters/files for each book folder for chapter in book.find_all('div2'): # For introduction files filename = f'MH {book_name} {chapter['title']}' # Remove Roman Numerals from file name if 'CHAPTER' in filename.upper(): *_, roman_num = chapter['title'].split(' ') filename = f'MH {book_name} {parser.roman_to_int(roman_num)}' logger.info(filename) parser.create_md_file(base, book_name, filename) pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}") # Parse the text of the chapter # May need to add a loop to look in all

2: # sys.exit() pbar.close() parser.close()