import re import string from html.parser import HTMLParser from pathlib import Path import pythonbible as bible from bs4 import BeautifulSoup base = Path('./Matthew Henry Commentary/xlm').absolute() class MyHTMLParser(HTMLParser): file = None tag = None tag_type = None attrs = dict() # All the patterns are for re.sub(). This is specifically for getting # list/sublist into the correct position. The first item is for the pattern # to find and the second is the replacement pattern. The replacement pattern # also ids the list so that when the self.clean_file() we can properly add # the correct amount of `\t` to the line. patterns = [ (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'), (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'), # These share the same id. The reason (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'), (r"(\S\s)(\d+\.)(\s\b|\W)", lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'), (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.", lambda x: f'\n{parser.roman_to_int(x.group())}.~1'), ] @staticmethod def roman_to_int(number: str) -> int: if number is None: return 0 number = number.strip('.') roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} total = 0 for i in range(len(number) - 1, -1, -1): num = roman[number[i]] if 3 * num < total: total -= num else: total += num return total def create_md_file(self, base_dir: Path, folder: str, file: str): md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute() md_file.parent.mkdir(parents=True, exist_ok=True) md_file.unlink(missing_ok=True) print(md_file.absolute()) self.file = md_file def write_to_file(self, data): # if self.writeable is False: # return if self.file is None: raise ValueError('No File specified') for pattern, sub in self.patterns: # print(f'Running Pattern: {pattern}') data = re.sub(pattern, sub, data) # print(f'writing: {data!r}') with open(self.file, 'a+') as file: file.write(data) def clean_file(self): print(' CLEANING FILE') if self.file is None: raise ValueError('No File specified') with open(self.file, 'r') as file: lines = file.readlines() with open(self.file, 'w') as file: indent = 0 sublist = dict() for line in lines: if line.strip(): if line.startswith('#'): line = f'\n{line}' if line.strip()[0] not in string.digits: line = f'{line}\n' # print('resetting indent') indent = 0 sublist.clear() else: start = line.find('~') end = start + 2 list_id = line[start:end] if list_id not in sublist: sublist[list_id] = indent indent += 1 # print(f'{indent=}') # print(f'{sublist=}') indents = "\t" * sublist[list_id] line = f'{indents}{line.replace(list_id, "")}' # line = f'{"\t" * sublist[list_id]}{line}' # print(f'{sublist[list_id]=}, {list_id=}') file.write(line) @staticmethod def normalize_verses(verses): """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25' and turns it into somthing more readable""" normalized_refs = list() for chunks in verses.removeprefix("Bible:").split('Bible:'): spanned_verses = list() for ref in chunks.split('-'): verse = ref.replace('.', ' ', 1).replace('.', ':') spanned_verses.append(verse) normalized = bible.format_scripture_references( bible.get_references(str(spanned_verses)) ) normalized_refs.append(normalized) return ';'.join(normalized_refs) @staticmethod def obsidian_links(verses): links = [] for verse in verses.split(';'): links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]") return '; '.join(links) def process_tag(self, test=None): match self.tag: case 'h4': # this is the section header. Need to keep track of it per file. if self.tag_type == 'start': self.write_to_file('\n## ') if self.tag_type == 'end': self.write_to_file('\n\n') case 'b': self.write_to_file('**') case 'i': self.write_to_file('*') case 'hr': pass case 'scripref': # Scripture ref # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')] # bible.format_scripture_references(bible.get_references(attrs['osisref'])) if self.tag_type == 'start': verse_link = self.obsidian_links( self.normalize_verses(self.attrs['osisref']) ) self.write_to_file(verse_link) case 'p': if self.tag_type == 'start': if self.attrs.get('class', False) == 'passage': self.write_to_file('Passage: ') if self.tag_type == 'end': self.write_to_file('\n\n') # case 'html': # if self.tag_type == 'end': # self.clean_file() case _: # print(tag.upper()) pass def handle_starttag(self, tag, attrs): attrs = dict(attrs) # print(f'Start: {tag}, {attrs}') self.tag_type = 'start' self.tag = tag self.attrs = attrs self.process_tag(f'what what {tag}') def handle_endtag(self, tag): # print(f'End: {tag}') self.tag = tag self.tag_type = 'end' self.process_tag() self.tag = None self.tag_type = None self.attrs.clear() def handle_data(self, data): # print(f'Data: {data!r}') if self.tag in ['scripref', 'h2', 'h3', 'h5']: return if data == '\n': return data = data.replace('—', '-- ') data = data.replace('GOD', 'God') self.write_to_file(data.replace('\n', ' ')) parser = MyHTMLParser() soup = BeautifulSoup(Path('./vol_1.xml').read_text(), 'xml') # Clearing all scripRef of internal text. That way if it # had other tags in it would not be processed for i in soup.find_all('scripRef'): i.clear() # May need to add a loop to look in all