from pathlib import Path import string from html.parser import HTMLParser class MyHTMLParser(HTMLParser): file = None current_tag = None writeable = False output_folder = Path('Matthew Henry Commentary OLD') section = [] @staticmethod def roman_to_int(number: str) -> int: if number is None: return 0 roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} total = 0 for i in range(len(number) - 1, -1, -1): num = roman[number[i]] if 3 * num < total: total -= num else: total += num return total def get_filename(self, title): title = (title .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '') .replace("].", '')) *book, _ = title.replace('Chapter', '').split() book = ' '.join(book) book = book.strip(string.punctuation).title() book = ( book.replace('First', '1'). replace('Second', '2'). replace('Third', '3') ) if 'Introduction' in title: roman = '' elif 'CHAPTER' in title.upper(): *_, roman = title.upper().split('CHAPTER') else: *_, roman = title.split() num = self.roman_to_int(roman.strip()) file = f'{book} {num}' if num else book return book.strip(), file.strip() def create_md_file(self, base_dir: Path, folder: str, file: str) -> Path: md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute() # print(md_file) md_file.parent.mkdir(parents=True, exist_ok=True) # md_file.touch(exist_ok=True) return md_file def write_to_file(self, data): if self.writeable is False: return if self.file is None: raise ValueError('No File specified') print(f'writing: {data!r}') with open(self.file, 'a+') as file: file.write(data) def clean_file(self): print(' CLEANING FILE') if self.file is None: raise ValueError('No File specified') with open(self.file, 'r') as file: lines = file.readlines() with open(self.file, 'w') as file: for line in lines: line = line.lstrip(' ').strip() if line: file.write(line) file.write('\n\n') # self.write_to_file('\n\n') def process_tag(self, tag, attrs=None): match tag: case 'b': self.write_to_file('**') case 'i': self.write_to_file('*') case 'hr': # self.write_to_file('\n---\n') if attrs: print('START WRITING') self.writeable = True case _: # print(tag.upper()) pass def handle_starttag(self, tag, attrs): self.current_tag = tag self.process_tag(tag, attrs) def handle_endtag(self, tag): if tag in ['b', 'i', 'hr']: self.process_tag(tag) if tag == 'p': self.write_to_file('\n\n') if tag == 'html': self.clean_file() self.current_tag = None def handle_data(self, data): # print("Encountered some data :", data) # print(self.get_starttag_text()) if self.current_tag == 'title': book, chapter = self.get_filename(data) self.file = self.create_md_file(self.output_folder.absolute(), book, chapter) if self.writeable: data = data.replace("\xa0", "") data = data.replace('\n', '') # data = data if data[0] != ' ' else data[1:] if data: self.write_to_file(data) def handle_comment(self, data): print(data) if 'End'.upper() in data.upper(): self.writeable = False parser = MyHTMLParser() parser.feed(Path('./matthew_henry/MHC01000.HTM').read_text())