mh_parser/console.py
2023-11-29 21:23:35 -05:00

143 lines
4.0 KiB
Python

from pathlib import Path
import string
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
file = None
current_tag = None
writeable = False
output_folder = Path('./Matthew Henry Commentary')
section = []
@staticmethod
def roman_to_int(number: str) -> int:
if number is None:
return 0
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
total = 0
for i in range(len(number) - 1, -1, -1):
num = roman[number[i]]
if 3 * num < total:
total -= num
else:
total += num
return total
def get_filename(self, title):
title = (title
.replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
.replace("].", ''))
*book, _ = title.replace('Chapter', '').split()
book = ' '.join(book)
book = book.strip(string.punctuation).title()
book = (
book.replace('First', '1').
replace('Second', '2').
replace('Third', '3')
)
if 'Introduction' in title:
roman = ''
elif 'CHAPTER' in title.upper():
*_, roman = title.upper().split('CHAPTER')
else:
*_, roman = title.split()
num = self.roman_to_int(roman.strip())
file = f'{book} {num}' if num else book
return book.strip(), file.strip()
def create_md_file(self, base_dir: Path, folder: str, file: str) -> Path:
md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
# print(md_file)
md_file.parent.mkdir(parents=True, exist_ok=True)
# md_file.touch(exist_ok=True)
return md_file
def write_to_file(self, data):
if self.writeable is False:
return
if self.file is None:
raise ValueError('No File specified')
print(f'writing: {data!r}')
with open(self.file, 'a+') as file:
file.write(data)
def clean_file(self):
print(' CLEANING FILE')
if self.file is None:
raise ValueError('No File specified')
with open(self.file, 'r') as file:
lines = file.readlines()
with open(self.file, 'w') as file:
for line in lines:
line = line.lstrip(' ').strip()
if line:
file.write(line)
file.write('\n\n')
# self.write_to_file('\n\n')
def process_tag(self, tag, attrs=None):
match tag:
case 'b':
self.write_to_file('**')
case 'i':
self.write_to_file('*')
case 'hr':
# self.write_to_file('\n---\n')
if attrs:
print('START WRITING')
self.writeable = True
case _:
# print(tag.upper())
pass
def handle_starttag(self, tag, attrs):
self.current_tag = tag
self.process_tag(tag, attrs)
def handle_endtag(self, tag):
if tag in ['b', 'i', 'hr']:
self.process_tag(tag)
if tag == 'p':
self.write_to_file('\n\n')
if tag == 'html':
self.clean_file()
self.current_tag = None
def handle_data(self, data):
# print("Encountered some data :", data)
# print(self.get_starttag_text())
if self.current_tag == 'title':
book, chapter = self.get_filename(data)
self.file = self.create_md_file(self.output_folder.absolute(), book, chapter)
if self.writeable:
data = data.replace("\xa0", "")
data = data.replace('\n', '')
# data = data if data[0] != ' ' else data[1:]
if data:
self.write_to_file(data)
def handle_comment(self, data):
print(data)
if 'End'.upper() in data.upper():
self.writeable = False
# Convert a single commentary page: feeding the HTML drives the parser's
# handlers, which write the markdown output as they go.
parser = MyHTMLParser()
source_path = Path('./matthew_henry/MHC01000.HTM')
parser.feed(source_path.read_text())