"""Work-in-progress converter from Matthew Henry commentary ``.HTM`` pages to Markdown.

Each page's ``<title>`` is parsed into a book name and chapter number, an
empty ``<book>/<book> <chapter>.md`` file is created under the output folder,
and the text of every ``<p>`` element is dumped to stdout for inspection.
"""

# NOTE: `time`, `shutil`, `HTMLParser`, `logger` and `tqdm` are not used by the
# code below; they are kept so nothing that relies on them being imported breaks.
import re
import shutil
import string
import time
from html.parser import HTMLParser
from pathlib import Path

from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm

_ROMAN_VALUES = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

# Fixed text wrapped around the interesting part of every page title, e.g.
# "Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II]."
_TITLE_PREFIX = "Matthew Henry's Complete Commentary on the Whole Bible ["
_TITLE_SUFFIX = '].'

_CHAPTER_WORD = re.compile('chapter', re.IGNORECASE)
_ORDINAL_DIGITS = (('First', '1'), ('Second', '2'), ('Third', '3'))

# Defaults used when the module is run as a script.
_SOURCE_DIR = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
_OUTPUT_DIR = Path('Matthew Henry Commentary OLD')


def roman_to_int(number: str | None) -> int:
    """Convert a Roman numeral to an integer.

    Args:
        number: The numeral, in any letter case, optionally surrounded by
            whitespace. ``None`` and empty/blank strings are treated as "no
            number".

    Returns:
        The integer value, or 0 when there is no numeral.

    Raises:
        KeyError: If the text contains a character that is not a Roman digit.
    """
    if number is None:
        return 0

    total = 0
    # Scan right to left: a digit much smaller than what has been accumulated
    # so far is a subtractive prefix (the I in IV, the X in XC).
    for symbol in reversed(number.strip().upper()):
        value = _ROMAN_VALUES[symbol]
        if 3 * value < total:
            total -= value
        else:
            total += value
    return total


def get_filename(soup: BeautifulSoup) -> tuple[str, str]:
    """Derive the book folder and chapter file name from a page's ``<title>``.

    The title looks like
    ``Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].``;
    the text inside the brackets names the file, so that example yields
    ``('Genesis', 'Genesis 2')``.

    Args:
        soup: Parsed page; only ``soup.title.string`` is read.

    Returns:
        ``(book, file)`` where ``file`` is ``"<book> <chapter number>"``, or just
        the book name when no chapter number is found (e.g. introductions).
    """
    title = soup.title.string
    title = title.replace(_TITLE_PREFIX, '').replace(_TITLE_SUFFIX, '')

    # Book name: every word except the last one, without the word "Chapter".
    # The removal is case-insensitive so it agrees with the case-insensitive
    # chapter detection below.
    *book_words, _ = _CHAPTER_WORD.sub('', title).split()
    book = ' '.join(book_words).strip(string.punctuation).title()
    for word, digit in _ORDINAL_DIGITS:
        book = book.replace(word, digit)

    # Chapter numeral: nothing for introductions, otherwise whatever follows
    # the last "Chapter", otherwise the last word of the title.
    if 'Introduction' in title:
        roman = ''
    elif 'CHAPTER' in title.upper():
        roman = title.upper().rpartition('CHAPTER')[2]
    else:
        *_, roman = title.split()

    # Drop whitespace and any stray punctuation left around the numeral.
    num = roman_to_int(roman.strip(string.whitespace + string.punctuation))

    file = f'{book} {num}' if num else book
    return book.strip(), file.strip()


def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Create ``<base_dir>/<folder>/<file>.md`` if needed and return its absolute path.

    Missing parent directories are created; an existing file is left untouched.
    """
    md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
    md_file.parent.mkdir(parents=True, exist_ok=True)
    md_file.touch(exist_ok=True)
    return md_file


def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the ``repr`` of every stripped string inside each ``<p>`` of *soup*.

    This is still a debugging stub: ``md_file`` is accepted but nothing is
    written to it yet.
    """
    # TODO: walk each paragraph's child tags and write Markdown to md_file;
    # the earlier sketch rendered <b> as **text**, <i> as *text*, and put a
    # blank line between paragraphs.
    for block in soup.find_all('p'):
        for text in block.stripped_strings:
            print(repr(text))


def main(
    source_dir: Path = _SOURCE_DIR,
    output_dir: Path = _OUTPUT_DIR,
    selection: slice = slice(7, 8),
) -> None:
    """Process the selected ``*.HTM`` files found directly in *source_dir*.

    Args:
        source_dir: Folder holding the commentary ``.HTM`` pages.
        output_dir: Folder that receives the per-book Markdown folders.
        selection: Slice applied to the sorted file list; the default keeps the
            single-file debugging subset.
    """
    # Sorted so the slice picks the same file(s) on every run; glob order is
    # otherwise unspecified.
    htm_files = sorted(Path(source_dir).glob('*.HTM'))
    output_folder = Path(output_dir).absolute()

    for file in htm_files[selection]:
        # Hand BeautifulSoup the raw bytes so it detects the page encoding
        # itself instead of decoding with the locale's default codec.
        soup = BeautifulSoup(file.read_bytes(), 'html.parser')

        book, chapter = get_filename(soup)
        md_file = create_md_file(output_folder, book, chapter)
        write_blocks(soup, md_file)


if __name__ == '__main__':
    main()