# mh_parser/html_parse.py

import re
import shutil
import string
import time
from html.parser import HTMLParser
from pathlib import Path

from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm


def roman_to_int(number: str) -> int:
    """Convert a Roman numeral to its integer value.

    Args:
        number: Roman numeral such as ``'XIV'``. Letter case is ignored.
            ``None`` or an empty string is treated as "no numeral".

    Returns:
        The integer value of the numeral, or ``0`` for ``None``/empty input.

    Raises:
        KeyError: If ``number`` contains a character that is not one of
            the Roman digits ``I V X L C D M``.
    """
    if not number:
        return 0

    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    # Scan right to left. A digit that is small compared with what has
    # already been accumulated is a subtractive prefix (the I in IV, the X
    # in XC); any other digit is simply added.
    for symbol in reversed(number.upper()):
        value = values[symbol]
        if 3 * value < total:
            total -= value
        else:
            total += value

    return total


# File rename...
# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
# The text within the '[]' should be the name of the file.
def get_filename(soup: BeautifulSoup) -> tuple:
    """Derive the book name and output file stem from the page ``<title>``.

    The title is expected to look like
    ``Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].``
    and only the text inside the brackets is used.

    Args:
        soup: Parsed document; only ``soup.title.string`` is read.

    Returns:
        A ``(book, file)`` pair of strings. ``book`` is the title-cased book
        name with ``First``/``Second``/``Third`` replaced by ``1``/``2``/``3``.
        ``file`` is ``"<book> <chapter number>"``, or just ``book`` when the
        page has no chapter number (e.g. an introduction).

    Raises:
        ValueError: If the bracketed text holds no words besides "Chapter".
        KeyError: If the chapter token contains a character that is not a
            Roman digit (raised by ``roman_to_int``).
    """
    title = (soup.title.string
             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
             .replace("].", ''))
    upper_title = title.upper()

    # Remove the word "Chapter" in any letter case so the book name agrees
    # with the case-insensitive chapter detection below, then drop the final
    # token (the chapter numeral or the word "Introduction").
    *book_words, _ = re.sub('chapter', '', title, flags=re.IGNORECASE).split()
    book = ' '.join(book_words).strip(string.punctuation).title()
    book = (
        book.replace('First', '1')
        .replace('Second', '2')
        .replace('Third', '3')
    )

    if 'INTRODUCTION' in upper_title:
        # Introductions carry no chapter number.
        roman = ''
    elif 'CHAPTER' in upper_title:
        # Everything after the last "CHAPTER" is the numeral.
        roman = upper_title.rpartition('CHAPTER')[2]
    else:
        # No "Chapter" keyword: treat the last word as the numeral.
        *_, roman = title.split()

    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book

    return book.strip(), file.strip()


def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Ensure ``<base_dir>/<folder>/<file>.md`` exists and return its path.

    Missing parent directories are created. An already existing file is
    left in place (``touch`` with ``exist_ok=True``).

    Args:
        base_dir: Root output directory.
        folder: Sub-directory name placed under ``base_dir``.
        file: File name without the ``.md`` extension.

    Returns:
        The absolute path of the Markdown file.
    """
    target = Path(f'{base_dir}/{folder}/{file}.md').absolute()

    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    target.touch(exist_ok=True)
    return target


def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the ``repr`` of every text fragment inside the ``<p>`` tags.

    Args:
        soup: Parsed document whose ``<p>`` elements are scanned.
        md_file: Intended Markdown output path. It is currently unused;
            nothing is written to it yet.

    NOTE(review): work in progress. An earlier commented-out draft walked
    each paragraph's child tags and sketched writing ``<b>`` content as
    ``**text**`` and ``<i>`` content as ``*text*`` to the output file.
    Presumably that is the eventual goal; confirm before building on it.
    """
    fragments = (
        text
        for paragraph in soup.find_all('p')
        for text in paragraph.stripped_strings
    )
    for fragment in fragments:
        print(repr(fragment))


if __name__ == '__main__':
    # Script entry point: parse the source HTML and create the matching
    # Markdown file for each selected page.
    output_folder = Path('Matthew Henry Commentary OLD').absolute()
    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    htm_files = list(folder.glob('*.HTM'))

    # NOTE: only the single file at index 7 is processed (debug slice). The
    # glob result is not sorted, so which file that is depends on the order
    # glob yields.
    for htm_path in htm_files[7:8]:
        markup = htm_path.read_text()
        soup = BeautifulSoup(markup, 'html.parser')

        book, chapter = get_filename(soup)
        write_blocks(soup, create_md_file(output_folder, book, chapter))