renamed and updated

boose_magoose 2023-12-01 21:47:45 -05:00
parent 8c767b5235
commit a1e69b1368
2 changed files with 469 additions and 100 deletions

html_parse.py (new file, 126 lines)

@@ -0,0 +1,126 @@
import time
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import tqdm
from loguru import logger
import string
import shutil
from html.parser import HTMLParser


def roman_to_int(number: str) -> int:
    if number is None:
        return 0
    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    for i in range(len(number)-1,-1,-1):
        num = roman[number[i]]
        if 3*num < total:
            total -= num
        else:
            total += num
    return total


# File rename...
# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
# The text within the '[]' should be the name of the file.
def get_filename(soup: BeautifulSoup) -> tuple:
    title = soup.title.string
    title = (title
             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
             .replace("].", ''))
    *book, _ = title.replace('Chapter', '').split()
    book = ' '.join(book)
    book = book.strip(string.punctuation).title()
    book = (
        book.replace('First', '1').
        replace('Second', '2').
        replace('Third', '3')
    )
    if 'Introduction' in title:
        roman = ''
    elif 'CHAPTER' in title.upper():
        *_, roman = title.upper().split('CHAPTER')
    else:
        *_, roman = title.split()
    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book
    return book.strip(), file.strip()


def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
    # print(md_file)
    md_file.parent.mkdir(parents=True, exist_ok=True)
    md_file.touch(exist_ok=True)
    return md_file


def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    # def get_string(block):
    #     if list(block.decendents) > 1:
    #         for i in block.decendents:
    for block in soup.find_all('p'):
        for text in block.stripped_strings:
            print(repr(text))
        # for tag in block:
        #     if tag is None:
        #         continue
        #
        #     if not list(tag.children):
        #         print(tag)
        #         continue
        #
        #     for s in tag.children:
        #         print(s)
        #
    # for block in soup.find_all('p'):
    #     print(list(block.childGenerator()))
    #     # match tag.name:
    #     case None:
    #         print(tag)
    #
    #     case 'b':
    #         print('\t', tag.name)
    #         print('\t\t', tag)
    #         # file.write(f"**{tag.string}**")
    #
    #     case 'i':
    #         print('\t', tag.name)
    #         print('\t\t', tag)
    #         # file.write(f"*{tag.string}*")
    #
    #     case _:
    #         print('\t', tag.name)
    #         print('\t\t', tag)
    #
    #
    #     file.write('\n')
    #
    # file.write('\n')


if __name__ == '__main__':
    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    htm_files = list(folder.glob('*.HTM'))
    output_folder = Path('./Matthew Henry Commentary').absolute()
    # for file in tqdm(htm_files[7:8]):
    for file in htm_files[7:8]:
        soup = BeautifulSoup(file.read_text(), 'html.parser')
        book, chapter = get_filename(soup)
        md_file = create_md_file(output_folder, book, chapter)
        write_blocks(soup, md_file)
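
For reference, a minimal sketch (not part of the commit) of what get_filename() and roman_to_int() return for the sample <TITLE> quoted in the comment above. It assumes html_parse.py is importable from the working directory and that its imports (bs4, tqdm, loguru) are installed; the HTML snippet is made up for the check.

# Hypothetical quick check, not part of the commit.
from bs4 import BeautifulSoup

from html_parse import get_filename, roman_to_int

html = ("<html><head><title>Matthew Henry's Complete Commentary on the Whole "
        "Bible [Genesis, Chapter II].</title></head><body></body></html>")
soup = BeautifulSoup(html, 'html.parser')

print(roman_to_int('II'))   # 2
print(get_filename(soup))   # ('Genesis', 'Genesis 2')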

main.py (443 changed lines)

@@ -1,126 +1,369 @@
import re
import string
import sys
from html.parser import HTMLParser
from pathlib import Path

import pythonbible as bible
from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm

base = Path('./Matthew Henry Commentary/xml').absolute()

logger.remove(0)
logger.add('log.txt')


class MyHTMLParser(HTMLParser):
    file = None
    passage_verse = None
    tag = None
    tag_type = None  # 'start' or 'end'
    attrs = dict()

    # All the patterns are for re.sub(). This is specifically for getting
    # list/sublist into the correct position. The first item is for the pattern
    # to find and the second is the replacement pattern. The replacement pattern
    # also ids the list so that when the self.clean_file() we can properly add
    # the correct amount of `\t` to the line.
    patterns = [
        (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
        (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
        # These share the same id. There are times were the two are a mixed
        # within the parent lists.
        (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
        (r"(\S\s)(\d+\.)(\s\b|\W)",
         lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),
        (r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
         lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
    ]

    @staticmethod
    def roman_to_int(number: str) -> int:
        if number is None:
            return 0

        # Sometimes the roman numeral comes with a '.'. Striping to not cause
        # issues
        number = number.strip('.')
        roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        total = 0
        for i in range(len(number) - 1, -1, -1):
            num = roman[number[i]]
            if 3 * num < total:
                total -= num
            else:
                total += num
        return total

    def create_md_file(self, base_dir: Path, folder: str, file: str):
        """ Create the path of the md file that will be written to as the
        program process the xml file.
        """
        md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
        md_file.parent.mkdir(parents=True, exist_ok=True)
        md_file.unlink(missing_ok=True)
        # print(md_file.absolute())
        self.file = md_file
        return md_file

    def write_to_file(self, data):
        if self.file is None:
            raise ValueError('No File specified')

        # Here the list/sublist are searched for and altered for later
        # processing in self.clean_file()
        for pattern, sub in self.patterns:
            # print(f'Running Pattern: {pattern}')
            data = re.sub(pattern, sub, data)

        # print(f'writing: {data!r}')
        with open(self.file, 'a+') as file:
            file.write(data)

    def clean_file(self):
        logger.info(' CLEANING FILE')

        if self.file is None:
            raise ValueError('No File specified')

        # Read all the lines
        with open(self.file, 'r') as file:
            lines = file.readlines()

        indent = 0  # Current number of '\t' to add in front of the list number
        sublist = dict()  # Track the indent number based on the id.
        with open(self.file, 'w') as file:
            file.write('# Chapter Introduction\n\n')

            header = None
            for line in lines:
                # Do not write blank lines
                if not line.strip():
                    continue

                # Main header, if a header has already been written skip it
                if line.strip().startswith('# '):
                    if header == line.strip():
                        continue
                    header = line.strip()

                # Add a `\n` if line is a heading.
                if line.startswith('#'):
                    line = f'\n{line.strip()}'

                # If line is not a list then add a `\n` after the paragraph,
                # and reset the indent tracker
                if line.strip()[0] not in string.digits:
                    line = f'{line}\n'
                    # print('resetting indent')
                    indent = 0
                    sublist.clear()
                else:
                    # Get the list id
                    start = line.find('~')
                    end = start + 2
                    list_id = line[start:end]

                    # Get the number of indents based on the list id, if it does
                    # not exist they set the value to the max value +1. If this
                    # is the first it will be set to 0 (-1 +1 = 0)
                    indents = "\t" * sublist.setdefault(
                        list_id, max(sublist.values(), default=-1)+1
                    )
                    line = f'{indents}{line.replace(list_id, "")}'

                if line.startswith('Passage: '):
                    line = line.removeprefix('Passage: ')

                file.write(line)

    @staticmethod
    def normalize_osis_verses(verses):
        """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
        and turns it into somthing more readable"""
        normalized_refs = list()
        for chunks in verses.removeprefix("Bible:").split('Bible:'):
            spanned_verses = list()
            for ref in chunks.split('-'):
                verse = ref.replace('.', ' ', 1).replace('.', ':')
                spanned_verses.append(
                    bible.format_scripture_references(
                        bible.get_references(verse.strip())
                    )
                )
            try:
                normalized = bible.format_scripture_references(
                    bible.get_references('-'.join(spanned_verses))
                )
            except:
                logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                raise
            normalized_refs.append(normalized)
        return ';'.join(normalized_refs)

    @staticmethod
    def obsidian_links(verses):
        """Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
        links = []
        for verse in verses.split(';'):
            links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
        return '; '.join(links)

    def convert_passage(self, verses):
        book = self.file.parts[-2]
        chapter = None
        ref = []
        for ver in verses.split(','):
            try:
                chapter, v = ver.split(':')
            except:
                v = ver
            r = []
            for i in v.split('-'):
                r.append(f'{book}.{chapter}.{i}')
            ref.append(f'Bible:{"-".join(r)}')
        x = ' '.join(ref)
        return f'osisRef="{x}"'

    def process_tag(self, test=None):
        match self.tag:
            case 'b':  # Bold
                self.write_to_file('**')
            case 'i':  # Italics
                self.write_to_file('*')
            case 'scripcom':
                # This saves the verse for the section.
                if self.tag_type == 'start':
                    if self.attrs['type'] == 'Commentary':
                        self.passage_verse = self.attrs['osisref']
            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
                    self.write_to_file('\n# ')
                if self.tag_type == 'end':
                    self.write_to_file('\n')
                    #
                    if self.passage_verse:
                        try:
                            verse = bible.get_references(
                                self.normalize_osis_verses(
                                    self.passage_verse)
                            )[0]
                        except:
                            logger.debug(self.passage_verse)
                            raise
                        self.write_to_file('## Verses: ')
                        self.write_to_file(
                            f'{verse.start_verse} - {verse.end_verse}'
                        )
                        self.write_to_file('\n\n')
            case 'scripref':  # Scripture ref
                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
                # bible.format_scripture_references(bible.get_references(attrs['osisref']))
                if self.tag_type == 'start':
                    if 'osisref' in self.attrs:
                        verses = self.normalize_osis_verses(
                            self.attrs['osisref']
                        )
                    else:
                        verses = self.normalize_osis_verses(
                            self.convert_passage(self.attrs['passage'])
                        )
                    self.write_to_file(self.obsidian_links(verses))
            case 'p':  # Paragraph
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
                        # Need this do the regex in self.clean does not pick
                        # this up. Will be also cleaned in the same function.
                        self.write_to_file('Passage: ')
                if self.tag_type == 'end':
                    self.write_to_file('\n\n')
            case 'li':
                if self.tag_type == 'start':
                    self.write_to_file('- ')
                else:
                    self.write_to_file('\n')
            # case _:
            #     # print(tag.upper())
            #     pass

    def handle_starttag(self, tag, attrs):
        # Set taf trackers
        self.tag_type = 'start'
        self.tag = tag
        self.attrs = dict(attrs)
        self.process_tag()

    def handle_endtag(self, tag):
        # print(f'End: {tag}')
        self.tag = tag
        self.tag_type = 'end'
        self.process_tag()
        # Clear tag tracker
        self.tag = None
        self.tag_type = None
        self.attrs.clear()

    def handle_data(self, data):
        # print(f'Data: {data!r}')
        if self.tag in ['scripref', 'h2', 'h3', 'h5']:
            return
        if data == '\n':
            return
        data = data.replace('', '-- ')
        data = data.replace('GOD', 'God')
        self.write_to_file(data.replace('\n', ' '))


if __name__ == '__main__':
    parser = MyHTMLParser()
    # Process each volume file.
    pbar = tqdm(total=1273)
    for vol in sorted(Path('./').glob('*.xml')):
        logger.info(vol)
        soup = BeautifulSoup(vol.read_text(), 'xml')
        # Get each book in the volume
        # This will be the main folder for all the book's chapters
        for book in soup.find_all('div1'):
            logger.info(book['title'])
            book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
            # These are the chapters/files for each book folder
            for chapter in book.find_all('div2'):
                logger.info(chapter['title'])
                filename = chapter['title']
                # Remove Roman Numerals from file name
                if 'CHAPTER' in filename.upper():
                    _, roman_num = chapter['title'].split(' ')
                    filename = f'Chapter {parser.roman_to_int(roman_num)}'
                parser.create_md_file(base, book_name, filename)
                pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")

                # Parse the text of the chapter
                # May need to add a loop to look in all <p class='passage> and
                # unrap any scripref that is in them. the place Dan is getting confused
                # with the book Daniel.
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    # print(passage.text)
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)

                # Clearing all scripRef of internal text. That way if it
                # had other tags in it would not be processed
                for scripture in chapter.find_all('scripRef'):
                    scripture.clear()

                for doc in chapter.children:
                    try:
                        parser.feed(str(doc))
                    except Exception as e:
                        logger.exception(e)
                        logger.debug(chapter.attrs)
                        raise
                parser.clean_file()
                pbar.update(1)
                # if pbar.last_print_n > 2:
                #     sys.exit()
    pbar.close()
    parser.close()
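
For reference, a minimal, self-contained sketch (not part of the commit) of the list-marker scheme that MyHTMLParser.patterns and clean_file() implement: each regex pushes a list number onto its own line and tags it with an id (~1 outermost through ~4 innermost), and clean_file() then maps each id to a tab depth in the order it first appears. The sample text and the trimmed pattern list below are illustrative only.

# Hypothetical demo of the ~id markers and the setdefault() depth assignment.
import re


def roman_to_int(numeral: str) -> int:
    # Same conversion as the commit's roman_to_int, trimmed to what the demo needs.
    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    for ch in reversed(numeral.strip('.')):
        num = values[ch]
        if 3 * num < total:
            total -= num
        else:
            total += num
    return total


# Simplified stand-ins for MyHTMLParser.patterns: push each marker onto its own
# line and tag it with an id.
patterns = [
    (r"\[(\d+\.)\]", lambda m: f'\n{m.group(1)}~4'),
    (r"\((\d+\.)\)", lambda m: f'\n{m.group(1)}~3'),
    (r"(\S\s)(\d+\.)(\s)", lambda m: f'{m.group(1)}\n{m.group(2)}~2{m.group(3)}'),
    (r"\b([IVX]+)\.", lambda m: f'\n{roman_to_int(m.group(1))}.~1'),
]

text = "II. Main point. 1. A sub point. (1.) Deeper still. [1.] Deepest."
for pattern, sub in patterns:
    text = re.sub(pattern, sub, text)

# As in clean_file(): each id gets a tab depth in order of first appearance.
sublist = {}
for line in filter(None, (part.strip() for part in text.splitlines())):
    start = line.find('~')
    list_id = line[start:start + 2]
    depth = sublist.setdefault(list_id, max(sublist.values(), default=-1) + 1)
    print('\t' * depth + line.replace(list_id, ''))

Run as a script, this prints the roman-numeral item at depth 0 and each nested arabic item one tab deeper, which is the nested-list shape clean_file() writes into the Markdown files.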