mh_parser/html_parse.py
2023-12-07 06:56:01 -05:00

126 lines
3.4 KiB
Python

import time
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import tqdm
from loguru import logger
import string
import shutil
from html.parser import HTMLParser
def roman_to_int(number: str) -> int:
    """Convert a Roman numeral to its integer value.

    The numeral is matched case-insensitively and surrounding whitespace is
    ignored, so ``'xiv'`` and ``' XIV '`` both yield ``14``.

    Args:
        number: The Roman numeral text. ``None`` and the empty string are
            both accepted and treated as "no number".

    Returns:
        The integer value, or ``0`` when ``number`` is ``None`` or empty.

    Raises:
        KeyError: If ``number`` contains a character that is not one of the
            Roman symbols ``I V X L C D M``.
    """
    if number is None:
        return 0

    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    # Walk the numeral right to left. A symbol is subtracted when the total
    # accumulated to its right is more than three times its own value; for
    # well-formed numerals that only happens when it precedes a larger symbol
    # (the I in IV or IX), while runs such as III or XXX stay under the limit.
    for symbol in reversed(number.strip().upper()):
        value = values[symbol]
        if 3 * value < total:
            total -= value
        else:
            total += value
    return total
# Output file naming.
# Each page carries a title of the form:
# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
# The text inside the '[]' is used to build the name of the output file.
def get_filename(soup: BeautifulSoup) -> tuple:
    """Derive the book name and the output file name from a page's title.

    The fixed "Matthew Henry's Complete Commentary on the Whole Bible ["
    prefix and the trailing "]." are removed from the title text. What remains
    (for example ``Genesis, Chapter II``) is split into a book name and a
    Roman chapter numeral. The words "Chapter" and "Introduction" are
    recognised in any letter case.

    Args:
        soup: Parsed document; ``soup.title.string`` is read as the title.

    Returns:
        A ``(book, file)`` tuple of strings. ``book`` is the title-cased book
        name with a leading ``First``/``Second``/``Third`` turned into
        ``1``/``2``/``3``. ``file`` is ``'<book> <chapter number>'``, or just
        the book name when there is no chapter number (introductions).
        Example: ``Genesis, Chapter II`` gives ``('Genesis', 'Genesis 2')``.

    Raises:
        ValueError: If nothing is left of the title once the boilerplate and
            the word "Chapter" are removed.
        KeyError: Propagated from ``roman_to_int`` when the trailing token is
            not a Roman numeral.
    """
    import re  # local import: only this function needs it

    title = soup.title.string
    title = (
        title
        .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
        .replace("].", '')
    )

    # Remove "Chapter" regardless of case so that it can never end up as part
    # of the book name; the last remaining token is the numeral (or the word
    # "Introduction") and is not part of the book name either.
    *book_words, _ = re.sub('chapter', '', title, flags=re.IGNORECASE).split()
    book = ' '.join(book_words).strip(string.punctuation).title()
    for ordinal, digit in (('First', '1'), ('Second', '2'), ('Third', '3')):
        book = book.replace(ordinal, digit)

    upper_title = title.upper()
    if 'INTRODUCTION' in upper_title:
        # Introductions have no chapter number.
        roman = ''
    elif 'CHAPTER' in upper_title:
        # Everything after the last "CHAPTER" is the numeral.
        roman = upper_title.rpartition('CHAPTER')[2]
    else:
        # No "Chapter" keyword: take the last word as the numeral.
        roman = upper_title.split()[-1]

    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book
    return book.strip(), file.strip()
def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Ensure ``<base_dir>/<folder>/<file>.md`` exists and return its path.

    Missing parent directories are created. An already existing file is left
    in place (``touch`` with ``exist_ok=True``).

    Args:
        base_dir: Root output directory.
        folder: Sub-directory name placed under ``base_dir``.
        file: File name without the ``.md`` extension.

    Returns:
        The absolute path of the Markdown file.
    """
    location = f'{base_dir}/{folder}/{file}.md'
    md_file = Path(location).absolute()

    parent_dir = md_file.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    md_file.touch(exist_ok=True)
    return md_file
def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the ``repr`` of every text fragment found in ``<p>`` elements.

    Each ``<p>`` element of ``soup`` is visited in document order and each of
    its whitespace-stripped strings is printed on its own line.

    Args:
        soup: Parsed document to scan.
        md_file: Target Markdown file. It is not read or written here yet.

    NOTE(review): nothing is written to ``md_file`` at present. An earlier
    commented-out draft sketched emitting ``<b>`` as ``**bold**`` and ``<i>``
    as ``*italic*``; that conversion is still to be implemented.
    """
    fragments = (
        fragment
        for paragraph in soup.find_all('p')
        for fragment in paragraph.stripped_strings
    )
    for fragment in fragments:
        print(repr(fragment))
if __name__ == '__main__':
    # Script entry point: parse the source .HTM pages and create one Markdown
    # file per page, grouped into one folder per book.
    source_dir = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    output_folder = Path('Matthew Henry Commentary OLD').absolute()

    htm_files = list(source_dir.glob('*.HTM'))

    # NOTE(review): only the single file at index 7 is processed, which looks
    # like a debugging restriction; the tqdm progress wrapper is not used.
    for htm_path in htm_files[7:8]:
        page = BeautifulSoup(htm_path.read_text(), 'html.parser')
        book_name, chapter_name = get_filename(page)
        target = create_md_file(output_folder, book_name, chapter_name)
        write_blocks(page, target)