143 lines
4.0 KiB
Python
143 lines
4.0 KiB
Python
from pathlib import Path
|
|
import string
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
class MyHTMLParser(HTMLParser):
|
|
file = None
|
|
current_tag = None
|
|
writeable = False
|
|
output_folder = Path('Matthew Henry Commentary OLD')
|
|
section = []
|
|
|
|
@staticmethod
|
|
def roman_to_int(number: str) -> int:
|
|
if number is None:
|
|
return 0
|
|
|
|
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
|
total = 0
|
|
for i in range(len(number) - 1, -1, -1):
|
|
num = roman[number[i]]
|
|
if 3 * num < total:
|
|
total -= num
|
|
else:
|
|
total += num
|
|
|
|
return total
|
|
|
|
def get_filename(self, title):
|
|
title = (title
|
|
.replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
|
|
.replace("].", ''))
|
|
|
|
*book, _ = title.replace('Chapter', '').split()
|
|
book = ' '.join(book)
|
|
book = book.strip(string.punctuation).title()
|
|
book = (
|
|
book.replace('First', '1').
|
|
replace('Second', '2').
|
|
replace('Third', '3')
|
|
)
|
|
|
|
if 'Introduction' in title:
|
|
roman = ''
|
|
elif 'CHAPTER' in title.upper():
|
|
*_, roman = title.upper().split('CHAPTER')
|
|
else:
|
|
*_, roman = title.split()
|
|
|
|
num = self.roman_to_int(roman.strip())
|
|
file = f'{book} {num}' if num else book
|
|
|
|
return book.strip(), file.strip()
|
|
|
|
def create_md_file(self, base_dir: Path, folder: str, file: str) -> Path:
|
|
md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
|
|
# print(md_file)
|
|
md_file.parent.mkdir(parents=True, exist_ok=True)
|
|
# md_file.touch(exist_ok=True)
|
|
return md_file
|
|
|
|
def write_to_file(self, data):
|
|
if self.writeable is False:
|
|
return
|
|
|
|
if self.file is None:
|
|
raise ValueError('No File specified')
|
|
|
|
print(f'writing: {data!r}')
|
|
|
|
with open(self.file, 'a+') as file:
|
|
file.write(data)
|
|
|
|
def clean_file(self):
|
|
print(' CLEANING FILE')
|
|
|
|
if self.file is None:
|
|
raise ValueError('No File specified')
|
|
|
|
with open(self.file, 'r') as file:
|
|
lines = file.readlines()
|
|
|
|
with open(self.file, 'w') as file:
|
|
for line in lines:
|
|
line = line.lstrip(' ').strip()
|
|
if line:
|
|
file.write(line)
|
|
file.write('\n\n')
|
|
# self.write_to_file('\n\n')
|
|
|
|
def process_tag(self, tag, attrs=None):
|
|
match tag:
|
|
case 'b':
|
|
self.write_to_file('**')
|
|
case 'i':
|
|
self.write_to_file('*')
|
|
case 'hr':
|
|
# self.write_to_file('\n---\n')
|
|
if attrs:
|
|
print('START WRITING')
|
|
self.writeable = True
|
|
case _:
|
|
# print(tag.upper())
|
|
pass
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
self.current_tag = tag
|
|
self.process_tag(tag, attrs)
|
|
|
|
def handle_endtag(self, tag):
|
|
if tag in ['b', 'i', 'hr']:
|
|
self.process_tag(tag)
|
|
|
|
if tag == 'p':
|
|
self.write_to_file('\n\n')
|
|
|
|
if tag == 'html':
|
|
self.clean_file()
|
|
|
|
self.current_tag = None
|
|
|
|
def handle_data(self, data):
|
|
# print("Encountered some data :", data)
|
|
# print(self.get_starttag_text())
|
|
if self.current_tag == 'title':
|
|
book, chapter = self.get_filename(data)
|
|
self.file = self.create_md_file(self.output_folder.absolute(), book, chapter)
|
|
|
|
if self.writeable:
|
|
data = data.replace("\xa0", "")
|
|
data = data.replace('\n', '')
|
|
# data = data if data[0] != ' ' else data[1:]
|
|
if data:
|
|
self.write_to_file(data)
|
|
|
|
def handle_comment(self, data):
|
|
print(data)
|
|
if 'End'.upper() in data.upper():
|
|
self.writeable = False
|
|
|
|
|
|
# Convert a single commentary page to Markdown.
parser = MyHTMLParser()
# NOTE(review): read_text() decodes with the platform default encoding -
# confirm the encoding of the .HTM files and pass it explicitly.
parser.feed(Path('./matthew_henry/MHC01000.HTM').read_text())
# Force processing of any text HTMLParser still has buffered after the last tag.
parser.close()