# mh_parser/html_parse.py

import time
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import tqdm
from loguru import logger
import string
import shutil
from html.parser import HTMLParser


def roman_to_int(number: str) -> int:
    """Convert a Roman numeral string to its integer value.

    Args:
        number: Roman numeral such as ``'XIV'``; letters may be upper or
            lower case. ``None`` and the empty string are accepted.

    Returns:
        The integer value, or 0 when ``number`` is ``None`` or empty.

    Raises:
        KeyError: If ``number`` contains a character that is not one of
            ``I V X L C D M`` (in either case).
    """
    if number is None:
        return 0

    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    # Scan right to left: a symbol is subtracted (the I in IV, the X in XL)
    # when the running total to its right is more than three times its own
    # value; otherwise it is added.
    for symbol in reversed(number.upper()):
        num = roman[symbol]
        if 3 * num < total:
            total -= num
        else:
            total += num

    return total


# File rename...
# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
# The text within the '[]' should be the name of the file.
def get_filename(soup: BeautifulSoup) -> tuple:
    """Derive the book name and the output file name from the page title.

    Example: the title
    ``Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].``
    gives ``('Genesis', 'Genesis 2')``.

    Args:
        soup: Parsed HTML document; only ``soup.title.string`` is read.

    Returns:
        ``(book, file)``. ``book`` is the title-cased book name with
        ``First``/``Second``/``Third`` replaced by ``1``/``2``/``3``.
        ``file`` is ``'<book> <chapter number>'``, or just ``book`` when the
        chapter number is 0 (e.g. titles containing ``Introduction``).
    """
    import re  # local import: only this function needs it

    title = soup.title.string
    title = (title
             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
             .replace("].", ''))

    # Remove the word "Chapter" in any casing so the book name stays clean for
    # titles such as "Genesis, CHAPTER II"; the numeral lookup below already
    # matches the word case-insensitively. The last remaining word (the
    # numeral, or "Introduction") is dropped from the book name.
    *book_words, _ = re.sub('chapter', '', title, flags=re.IGNORECASE).split()
    book = ' '.join(book_words)
    book = book.strip(string.punctuation).title()
    book = (
        book.replace('First', '1').
        replace('Second', '2').
        replace('Third', '3')
    )

    if 'Introduction' in title:
        # Introduction pages carry no chapter numeral.
        roman = ''
    elif 'CHAPTER' in title.upper():
        # Everything after the last "CHAPTER" is the numeral.
        *_, roman = title.upper().split('CHAPTER')
    else:
        # No "Chapter" keyword: treat the last word as the numeral.
        *_, roman = title.split()

    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book

    return book.strip(), file.strip()


def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Ensure ``<base_dir>/<folder>/<file>.md`` exists and return its path.

    Args:
        base_dir: Directory under which ``folder`` is placed.
        folder: Sub-directory name (the script passes the book name).
        file: File name without the ``.md`` suffix.

    Returns:
        Path of the markdown file, made absolute with ``Path.absolute()``.
    """
    target = Path(f'{base_dir}/{folder}/{file}.md').absolute()
    # Create any missing parent directories, then the file itself; an
    # already existing file is not an error.
    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    target.touch(exist_ok=True)
    return target


def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the ``repr`` of every text fragment found inside ``<p>`` tags.

    Work in progress: nothing is written to ``md_file`` yet.

    Args:
        soup: Parsed HTML document whose ``<p>`` elements are read.
        md_file: Markdown file for this page; currently unused.
    """
    # TODO: write to md_file instead of printing. The earlier draft walked the
    # child tags of each paragraph and planned to emit <b> as **text** and
    # <i> as *text*, with a newline after each tag and another after each
    # paragraph.
    paragraphs = soup.find_all('p')
    fragments = (
        fragment
        for paragraph in paragraphs
        for fragment in paragraph.stripped_strings
    )
    for fragment in fragments:
        print(repr(fragment))


if __name__ == '__main__':
    # Script entry point: parse downloaded .HTM pages and create the matching
    # markdown files under the output folder.
    source_dir = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    output_folder = Path('Matthew Henry Commentary OLD').absolute()

    # Only the slice [7:8] (at most one file) is processed -- presumably a
    # limit used while developing. The loop iterable can be wrapped in
    # tqdm(...) to get a progress bar.
    htm_files = list(source_dir.glob('*.HTM'))
    for htm_path in htm_files[7:8]:
        html_text = htm_path.read_text()
        soup = BeautifulSoup(html_text, 'html.parser')

        book, chapter = get_filename(soup)
        md_file = create_md_file(output_folder, book, chapter)

        write_blocks(soup, md_file)
renamed and updated 2023-12-02 02:47:45 +00:00			`import time`
			`from pathlib import Path`
			`from bs4 import BeautifulSoup`
			`from tqdm import tqdm`
			`from loguru import logger`
			`import string`
			`import shutil`
			`from html.parser import HTMLParser`


			`def roman_to_int(number: str) -> int:`
			`if number is None:`
			`return 0`

			`roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}`
			`total = 0`
			`for i in range(len(number)-1,-1,-1):`
			`num = roman[number[i]]`
			`if 3*num < total:`
			`total -= num`
			`else:`
			`total += num`

			`return total`


			`# File rename...`
			`# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>`
			`# The text within the '[]' should be the name of the file.`
			`def get_filename(soup: BeautifulSoup) -> tuple:`
			`title = soup.title.string`
			`title = (title`
			`.replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')`
			`.replace("].", ''))`

			`*book, _ = title.replace('Chapter', '').split()`
			`book = ' '.join(book)`
			`book = book.strip(string.punctuation).title()`
			`book = (`
			`book.replace('First', '1').`
			`replace('Second', '2').`
			`replace('Third', '3')`
			`)`

			`if 'Introduction' in title:`
			`roman = ''`
			`elif 'CHAPTER' in title.upper():`
			`*_, roman = title.upper().split('CHAPTER')`
			`else:`
			`*_, roman = title.split()`

			`num = roman_to_int(roman.strip())`
			`file = f'{book} {num}' if num else book`

			`return book.strip(), file.strip()`


			`def create_md_file(base_dir: Path, folder: str, file: str) -> Path:`
			`md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()`
			`# print(md_file)`
			`md_file.parent.mkdir(parents=True, exist_ok=True)`
			`md_file.touch(exist_ok=True)`
			`return md_file`


			`def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:`
			`# def get_string(block):`
			`# if list(block.decendents) > 1:`
			`# for i in block.decendents:`



			`for block in soup.find_all('p'):`
			`for text in block.stripped_strings:`
			`print(repr(text))`
			`# for tag in block:`
			`# if tag is None:`
			`# continue`
			`#`
			`# if not list(tag.children):`
			`# print(tag)`
			`# continue`
			`#`
			`# for s in tag.children:`
			`# print(s)`
			`#`
			`# for block in soup.find_all('p'):`
			`# print(list(block.childGenerator()))`
			`# # match tag.name:`
			`# case None:`
			`# print(tag)`
			`#`
			`# case 'b':`
			`# print('\t', tag.name)`
			`# print('\t\t', tag)`
			`# # file.write(f"**{tag.string}**")`
			`#`
			`# case 'i':`
			`# print('\t', tag.name)`
			`# print('\t\t', tag)`
			`# # file.write(f"*{tag.string}*")`
			`#`
			`# case _:`
			`# print('\t', tag.name)`
			`# print('\t\t', tag)`
			`#`
			`#`
			`# file.write('\n')`
			`#`
			`# file.write('\n')`



			`if __name__ == '__main__':`
			`folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')`

			`htm_files = list(folder.glob('*.HTM'))`
current 2023-12-07 11:56:01 +00:00			`output_folder = Path('Matthew Henry Commentary OLD').absolute()`
renamed and updated 2023-12-02 02:47:45 +00:00			`# for file in tqdm(htm_files[7:8]):`
			`for file in htm_files[7:8]:`
			`soup = BeautifulSoup(file.read_text(), 'html.parser')`

			`book, chapter = get_filename(soup)`
			`md_file = create_md_file(output_folder, book, chapter)`

			`write_blocks(soup, md_file)`