# mh_parser/console.py

import re
import string
from html.parser import HTMLParser
from pathlib import Path


class MyHTMLParser(HTMLParser):
    file = None
    current_tag = None
    writeable = False
    output_folder = Path('Matthew Henry Commentary OLD')
    section = []

    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0

        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
            num = roman[number[i]]
            if 3 * num < total:
                total -= num
            else:
                total += num

        return total

    def get_filename(self, title):
        title = (title
                 .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
                 .replace("].", ''))

        *book, _ = title.replace('Chapter', '').split()
        book = ' '.join(book)
        book = book.strip(string.punctuation).title()
        book = (
            book.replace('First', '1').
            replace('Second', '2').
            replace('Third', '3')
        )

        if 'Introduction' in title:
            roman = ''
        elif 'CHAPTER' in title.upper():
            *_, roman = title.upper().split('CHAPTER')
        else:
            *_, roman = title.split()

        num = self.roman_to_int(roman.strip())
        file = f'{book} {num}' if num else book

        return book.strip(), file.strip()

    def create_md_file(self, base_dir: Path, folder: str, file: str) -> Path:
        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
        # print(md_file)
        md_file.parent.mkdir(parents=True, exist_ok=True)
        # md_file.touch(exist_ok=True)
        return md_file

    def write_to_file(self, data):
        if self.writeable is False:
            return

        if self.file is None:
            raise ValueError('No File specified')

        print(f'writing: {data!r}')

        with open(self.file, 'a+') as file:
            file.write(data)

    def clean_file(self):
        print(' CLEANING FILE')

        if self.file is None:
            raise ValueError('No File specified')

        with open(self.file, 'r') as file:
            lines = file.readlines()

        with open(self.file, 'w') as file:
            for line in lines:
                line = line.lstrip(' ').strip()
                if line:
                    file.write(line)
                    file.write('\n\n')
            # self.write_to_file('\n\n')

    def process_tag(self, tag, attrs=None):
        match tag:
            case 'b':
                self.write_to_file('**')
            case 'i':
                self.write_to_file('*')
            case 'hr':
                # self.write_to_file('\n---\n')
                if attrs:
                    print('START WRITING')
                    self.writeable = True
            case _:
                # print(tag.upper())
                pass

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        self.process_tag(tag, attrs)

    def handle_endtag(self, tag):
        if tag in ['b', 'i', 'hr']:
            self.process_tag(tag)

        if tag == 'p':
            self.write_to_file('\n\n')

        if tag == 'html':
            self.clean_file()

        self.current_tag = None

    def handle_data(self, data):
        # print("Encountered some data  :", data)
        # print(self.get_starttag_text())
        if self.current_tag == 'title':
            book, chapter = self.get_filename(data)
            self.file = self.create_md_file(self.output_folder.absolute(), book, chapter)

        if self.writeable:
            data = data.replace("\xa0", "")
            data = data.replace('\n', '')
            # data = data if data[0] != ' ' else data[1:]
            if data:
                self.write_to_file(data)

    def handle_comment(self, data):
        print(data)
        if 'End'.upper() in data.upper():
            self.writeable = False


# Script body: runs whenever this module is executed or imported.  It parses
# a single commentary page; the path is relative to the current working
# directory.
parser = MyHTMLParser()
source_page = Path('./matthew_henry/MHC01000.HTM')
parser.feed(source_page.read_text())