126 lines
3.4 KiB
Python
126 lines
3.4 KiB
Python
import time
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
from loguru import logger
|
|
import string
|
|
import shutil
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
def roman_to_int(number: str) -> int:
    """Convert a Roman numeral to its integer value.

    The numeral is scanned from right to left while a running total is kept.
    A symbol whose tripled value is still below the running total is
    subtracted (e.g. the ``I`` in ``IV``); every other symbol is added.

    Args:
        number: Roman numeral in upper or lower case. ``None`` is accepted
            and treated as "no numeral".

    Returns:
        The integer value; ``0`` for ``None`` or an empty string.

    Raises:
        KeyError: If ``number`` contains a character other than
            I, V, X, L, C, D or M (in either case).
    """
    if number is None:
        return 0

    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    # Normalise case once so that "ii" and "II" are treated alike.
    for symbol in reversed(number.upper()):
        value = values[symbol]
        if 3 * value < total:
            total -= value
        else:
            total += value

    return total
|
|
|
|
|
|
# File rename...
|
|
# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
|
|
# The text within the '[]' determines the file name (here: "Genesis 2").
|
|
def get_filename(soup: BeautifulSoup) -> tuple:
    """Derive the book name and the output file name from the page title.

    The title is expected to look like
    ``Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].``
    The fixed prefix and the closing ``].`` are removed, and the remaining
    text is split into a book name and an optional Roman chapter numeral.
    The words "Chapter" and "Introduction" are matched case-insensitively.

    Args:
        soup: Parsed HTML document; only ``soup.title.string`` is read.

    Returns:
        A ``(book, file)`` tuple of strings. ``book`` is the title-cased
        book name with a leading "First"/"Second"/"Third" replaced by
        ``1``/``2``/``3``. ``file`` is ``"<book> <chapter number>"``, or just
        the book name when there is no chapter number (e.g. an introduction).

    Raises:
        AttributeError: If the document has no usable ``<title>`` text.
        ValueError: If nothing is left of the title after the fixed prefix
            and suffix are removed.
        Exception: Whatever ``roman_to_int`` raises when the trailing text
            is not a Roman numeral.
    """
    import re  # local import: only needed for the case-insensitive removal

    title = (
        soup.title.string
        .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
        .replace("].", '')
    )
    upper_title = title.upper()

    # Book name: drop the word "chapter" in any case, then discard the last
    # token (the numeral or the word "Introduction").
    *book_words, _ = re.sub('chapter', '', title, flags=re.IGNORECASE).split()
    book = ' '.join(book_words).strip(string.punctuation).title()
    for ordinal, digit in (('First', '1'), ('Second', '2'), ('Third', '3')):
        book = book.replace(ordinal, digit)

    # Chapter numeral: always taken from the upper-cased title so the
    # numeral lookup never sees lower-case symbols.
    if 'INTRODUCTION' in upper_title:
        roman = ''
    elif 'CHAPTER' in upper_title:
        *_, roman = upper_title.split('CHAPTER')
    else:
        *_, roman = upper_title.split()

    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book

    return book.strip(), file.strip()
|
|
|
|
|
|
def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Ensure ``<base_dir>/<folder>/<file>.md`` exists and return its path.

    Missing parent directories are created. An existing file is left in
    place; its content is not modified.

    Args:
        base_dir: Root directory of the generated Markdown tree.
        folder: Sub-directory name placed under ``base_dir``.
        file: File name without the ``.md`` extension.

    Returns:
        The absolute path of the Markdown file.
    """
    target = Path('{}/{}/{}.md'.format(base_dir, folder, file)).absolute()

    directory = target.parent
    directory.mkdir(parents=True, exist_ok=True)

    target.touch(exist_ok=True)
    return target
|
|
|
|
|
|
def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the ``repr`` of every text fragment inside the ``<p>`` elements.

    Each ``<p>`` element of ``soup`` is visited in document order and each of
    its whitespace-stripped strings is printed on its own line.

    Args:
        soup: Parsed HTML document to read paragraphs from.
        md_file: Destination Markdown file. Currently unused: nothing is
            written to it yet.
    """
    # TODO: write Markdown to ``md_file`` instead of printing. An earlier
    # draft walked each paragraph's child tags and planned to emit
    # ``**text**`` for <b>, ``*text*`` for <i>, and a newline after each
    # paragraph.
    fragments = (
        fragment
        for paragraph in soup.find_all('p')
        for fragment in paragraph.stripped_strings
    )
    for fragment in fragments:
        print(repr(fragment))
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: parse the source .HTM pages and process each one.
    # The input directory is hard-coded; the output directory is resolved
    # relative to the current working directory.
    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    output_folder = Path('Matthew Henry Commentary OLD').absolute()

    htm_files = list(folder.glob('*.HTM'))

    # NOTE: the [7:8] slice restricts the run to at most one file
    # (presumably a debugging aid). Wrap the iterable in tqdm(...) for a
    # progress bar when processing more files.
    for file in htm_files[7:8]:
        soup = BeautifulSoup(file.read_text(), 'html.parser')

        # get_filename returns (book, file name); the book doubles as the
        # output sub-directory.
        book, chapter = get_filename(soup)
        md_file = create_md_file(output_folder, book, chapter)

        write_blocks(soup, md_file)