mh_parser/main.py

510 lines
16 KiB
Python
Raw Normal View History

2023-12-07 11:56:01 +00:00
import itertools
2023-12-02 02:47:45 +00:00
import re
import string
import sys
from html.parser import HTMLParser
2023-11-30 02:23:35 +00:00
from pathlib import Path
2023-12-02 02:47:45 +00:00
import pythonbible as bible
2023-11-30 02:23:35 +00:00
from bs4 import BeautifulSoup
from loguru import logger
2023-12-02 02:47:45 +00:00
from tqdm import tqdm
2023-12-07 11:56:01 +00:00
# Root of the output tree for the generated markdown files.
base = Path('./xml/Matthew Henry Commentary 2').absolute()

# Drop loguru's default stderr sink (id 0) so only the file sink below emits.
logger.remove(0)
from functools import partialmethod
# Attach a `logger.foobar(...)` convenience method to the Logger class.
# NOTE(review): loguru's Logger.log() expects a level name that has been
# registered via logger.level(); "foobar" is never registered here, and the
# `level='WARNING'` kwarg is forwarded into log() — confirm calls to
# logger.foobar() actually succeed before relying on this.
logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
logger.add('log.txt', level='WARNING')
# OSIS book abbreviation -> full book name, as understood by pythonbible.
# A dict literal replaces the original dict([tuple, ...]) construction
# (same contents, idiomatic form — ruff C406).
convert = {
    "Eccl": "Ecclesiastes",
    "Rom": "Romans",
    "2Tim": "2 Timothy",
    "2Pet": "2 Peter",
    "Gen": "Genesis",
    "John": "John",
    "1Cor": "1 Corinthians",
    "Ezek": "Ezekiel",
    "Heb": "Hebrews",
    "Rev": "Revelation",
    "Eph": "Ephesians",
    "Isa": "Isaiah",
    "Ps": "Psalms",
    "Acts": "Acts",
    "Deut": "Deuteronomy",
    "Neh": "Nehemiah",
    "1Chr": "1 Chronicles",
    "2Kgs": "2 Kings",
    "2Cor": "2 Corinthians",
    "Hos": "Hosea",
    "Exod": "Exodus",
    "Matt": "Matthew",
    "Job": "Job",
    "Prov": "Proverbs",
    "Col": "Colossians",
    "Jer": "Jeremiah",
    "1John": "1 John",
    "Jas": "James",
    "1Tim": "1 Timothy",
    "Amos": "Amos",
    "Mic": "Micah",
    "Dan": "Daniel",
    "Jonah": "Jonah",
    "Luke": "Luke",
    "Mal": "Malachi",
    "Zech": "Zechariah",
    "Phil": "Philippians",
    "Mark": "Mark",
    "2Thess": "2 Thessalonians",
    "Jude": "Jude",
    "1Sam": "1 Samuel",
    "Num": "Numbers",
    "Gal": "Galatians",
    "1Pet": "1 Peter",
    "Lev": "Leviticus",
    "2Chr": "2 Chronicles",
    "Lam": "Lamentations",
    "1Kgs": "1 Kings",
    "Ezra": "Ezra",
    "Obad": "Obadiah",
    "Hab": "Habakkuk",
    "2Sam": "2 Samuel",
    "Josh": "Joshua",
    "Judg": "Judges",
    "Ruth": "Ruth",
    "Esth": "Esther",
    "Phlm": "Philemon",
    "1Thess": "1 Thessalonians",
    "Joel": "Joel",
    "Titus": "Titus",
    "3John": "3 John",
    "Zeph": "Zephaniah",
    "Song": "Song of Songs",
    "Hag": "Haggai",
    "Wis": "Wisdom of Solomon",
    "2John": "2 John",
    "2Macc": "2 Maccabees",
    "Nah": "Nahum",
    "Sir": "Ecclesiasticus",
    "Psa": "Psalms",          # alternate abbreviation for Psalms
    "1Macc": "1 Maccabees",
    "2Peter": "2 Peter",      # alternate abbreviation for 2 Peter
}
2023-12-02 02:47:45 +00:00
class MyHTMLParser(HTMLParser):
file = None
passage_verse = None
tag = None
tag_type = None # 'start' or 'end'
skip_re = False
2023-12-02 02:47:45 +00:00
attrs = dict()
# All the patterns are for re.sub(). This is specifically for getting
# list/sublist into the correct position. The first item is for the pattern
# to find and the second is the replacement pattern. The replacement pattern
# also ids the list so that when the self.clean_file() we can properly add
# the correct amount of `\t` to the line.
patterns = [
(r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
(r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
# These share the same id. There are times were the two are a mixed
# within the parent lists.
(r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
(r"(\S\s)(\d+\.)(\s\b|\W)",
lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),
(r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
]
@staticmethod
def roman_to_int(number: str) -> int:
if number is None:
return 0
# Sometimes the roman numeral comes with a '.'. Striping to not cause
# issues
number = number.strip('.')
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
total = 0
for i in range(len(number) - 1, -1, -1):
num = roman[number[i]]
if 3 * num < total:
total -= num
else:
total += num
return total
def create_md_file(self, base_dir: Path, folder: str, file: str):
""" Create the path of the md file that will be written to as the
program process the xml file.
"""
md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
md_file.parent.mkdir(parents=True, exist_ok=True)
md_file.unlink(missing_ok=True)
# print(md_file.absolute())
self.file = md_file
return md_file
def write_to_file(self, data, skip_re=False):
2023-12-02 02:47:45 +00:00
# if self.file is None:
# raise ValueError('No File specified')
2023-12-02 02:47:45 +00:00
if not skip_re:
# Here the list/sublist are searched for and altered for later
# processing in self.clean_file()
for pattern, sub in self.patterns:
# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
data = re.sub(pattern, sub, data)
2023-12-02 02:47:45 +00:00
# print(f'writing: {data!r}')
if self.file:
# raise ValueError('No File specified')
with open(self.file, 'a+') as file:
file.write(data)
else:
print(data)
2023-12-02 02:47:45 +00:00
def clean_file(self):
logger.info(' CLEANING FILE')
if self.file is None:
raise ValueError('No File specified')
# Read all the lines
with open(self.file, 'r') as file:
lines = file.readlines()
indent = 0 # Current number of '\t' to add in front of the list number
sublist = dict() # Track the indent number based on the id.
with open(self.file, 'w') as file:
file.write('# Chapter Introduction\n\n')
header = None
for line in lines:
# Do not write blank lines
if not line.strip():
continue
# Main header, if a header has already been written skip it
if line.strip().startswith('# '):
if header == line.strip():
continue
2023-12-04 01:23:25 +00:00
header = line.strip()
2023-12-02 02:47:45 +00:00
# Add a `\n` if line is a heading.
if line.startswith('#'):
line = f'\n{line.strip()}'
# If line is not a list then add a `\n` after the paragraph,
# and reset the indent tracker
if line.strip()[0] not in string.digits:
line = f'{line}\n'
# print('resetting indent')
indent = 0
sublist.clear()
else:
# Get the list id
start = line.find('~')
end = start + 2
list_id = line[start:end]
# Get the number of indents based on the list id, if it does
# not exist they set the value to the max value +1. If this
# is the first it will be set to 0 (-1 +1 = 0)
indents = "\t" * sublist.setdefault(
list_id, max(sublist.values(), default=-1)+1
)
line = f'{indents}{line.replace(list_id, "")}'
if line.startswith('Passage: '):
line = line.removeprefix('Passage: ')
file.write(line)
2023-12-07 11:56:01 +00:00
def normalize_osis_verses(self, verses):
"""Takes this
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
2023-12-02 02:47:45 +00:00
and turns it into somthing more readable"""
normalized_refs = list()
for chunks in verses.removeprefix("Bible:").split('Bible:'):
spanned_verses = list()
for ref in chunks.split('-'):
verse = ref.replace('.', ' ', 1).replace('.', ':')
2023-12-07 11:56:01 +00:00
book = ref.split('.')[0]
verse.replace(book, convert[book])
2023-12-02 02:47:45 +00:00
spanned_verses.append(
bible.format_scripture_references(
bible.get_references(verse.strip())
)
)
2023-11-30 02:23:35 +00:00
2023-12-02 02:47:45 +00:00
try:
normalized = bible.format_scripture_references(
bible.get_references('-'.join(spanned_verses))
)
except:
logger.warning(f"Error with: {verses=}: {spanned_verses=}")
raise
2023-12-07 11:56:01 +00:00
if normalized is False:
print(f"Error with: {verses=}: {spanned_verses=}")
sys.exit()
2023-12-02 02:47:45 +00:00
normalized_refs.append(normalized)
2023-11-30 02:23:35 +00:00
2023-12-02 02:47:45 +00:00
return ';'.join(normalized_refs)
2023-11-30 02:23:35 +00:00
2023-12-07 11:56:01 +00:00
# @staticmethod
def old_normalize_osis_verses(self, verses):
"""Takes this
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
and turns it into somthing more readable"""
normalized_refs = list()
for chunks in verses.removeprefix("Bible:").split('Bible:'):
spanned_verses = chunks.split('-')
if len(spanned_verses) < 2:
spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
book, start_chapter, start_verse = spanned_verses[0].split('.')
_, end_chapter, end_verse = spanned_verses[1].split('.')
book = bible.Book[convert[book].upper()]
normalized = bible.NormalizedReference(
book=book,
start_chapter=start_chapter,
start_verse=start_verse,
end_chapter=end_chapter,
end_verse=end_verse,
)
if normalized is False:
logger.foobar(f'{self.args=}')
normalized_refs.append(
bible.format_scripture_references([normalized]).strip()
)
return ';'.join(normalized_refs)
2023-12-02 02:47:45 +00:00
@staticmethod
2023-12-07 11:56:01 +00:00
@logger.catch(reraise=False)
2023-12-02 02:47:45 +00:00
def obsidian_links(verses):
"""Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
links = []
for verse in verses.split(';'):
2023-12-07 11:56:01 +00:00
verse = verse.replace(':', '#').replace('-', '..')
if verse[0] in string.digits:
verse.replace(' ', '', 1)
links.append(f"[[{verse}]]")
2023-12-02 02:47:45 +00:00
return '; '.join(links)
def convert_passage(self, verses):
book = self.file.parts[-2]
chapter = None
ref = []
for ver in verses.split(','):
try:
chapter, v = ver.split(':')
except:
v = ver
r = []
for i in v.split('-'):
r.append(f'{book}.{chapter}.{i}')
ref.append(f'Bible:{"-".join(r)}')
x = ' '.join(ref)
return f'osisRef="{x}"'
def process_tag(self, test=None):
match self.tag:
case 'b': # Bold
self.write_to_file('**')
case 'i': # Italics
self.write_to_file('*')
case 'scripcom':
# This saves the verse for the section.
if self.tag_type == 'start':
if self.attrs['type'] == 'Commentary':
self.passage_verse = self.attrs['osisref']
case 'h4': # this is the section header. Need to keep track of it per file.
if self.tag_type == 'start':
self.write_to_file('\n# ',)
self.skip_re = True
2023-12-02 02:47:45 +00:00
if self.tag_type == 'end':
self.write_to_file('\n')
self.skip_re = False
2023-12-02 02:47:45 +00:00
case 'scripref': # Scripture ref
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
# bible.format_scripture_references(bible.get_references(attrs['osisref']))
if self.tag_type == 'start':
if 'osisref' in self.attrs:
verses = self.normalize_osis_verses(
self.attrs['osisref']
)
else:
2023-12-07 11:56:01 +00:00
# verses = self.normalize_osis_verses(
# self.convert_passage(self.attrs['passage'])
# )
raise ValueError('No `osisref` in tag attrs')
2023-12-02 02:47:45 +00:00
self.write_to_file(self.obsidian_links(verses))
case 'p': # Paragraph
if self.tag_type == 'start':
if self.attrs.get('class', False) == 'passage':
if self.passage_verse:
try:
verse = bible.get_references(
self.normalize_osis_verses(
self.passage_verse)
)[0]
except:
logger.debug(self.passage_verse)
raise
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
self.write_to_file('\n\n')
2023-12-02 02:47:45 +00:00
# Need this do the regex in self.clean does not pick
# this up. Will be also cleaned in the same function.
self.write_to_file('Passage: ')
if self.tag_type == 'end':
self.write_to_file('\n\n')
case 'li':
if self.tag_type == 'start':
self.write_to_file('- ')
else:
self.write_to_file('\n')
# case _:
# # print(tag.upper())
# pass
def handle_starttag(self, tag, attrs):
# Set taf trackers
self.tag_type = 'start'
self.tag = tag
self.attrs = dict(attrs)
self.process_tag()
def handle_endtag(self, tag):
# print(f'End: {tag}')
self.tag = tag
self.tag_type = 'end'
self.process_tag()
# Clear tag tracker
self.tag = None
self.tag_type = None
self.attrs.clear()
self.skip_re = False
2023-12-02 02:47:45 +00:00
def handle_data(self, data):
# print(f'Data: {data!r}')
if self.tag in ['scripref', 'h2', 'h3', 'h5']:
return
if data == '\n':
return
data = data.replace('', '-- ')
data = data.replace('GOD', 'God')
data = data.replace('\n', ' ')
2023-12-02 02:47:45 +00:00
self.write_to_file(data, self.skip_re)
2023-11-30 02:23:35 +00:00
if __name__ == '__main__':
    parser = MyHTMLParser()
    # Process each volume file.
    pbar = tqdm(total=1273)
    for vol in sorted(Path('./').glob('*.xml')):
        logger.info(vol)
        soup = BeautifulSoup(vol.read_text(), 'xml')
        # Get each book in the volume
        # This will be the main folder for all the book's chapters
        for book in soup.find_all('div1'):
            book_name = (book['title'].
                         replace('First ', '1').
                         replace('Second ', '2').
                         replace('Third ', '3')
                         )
            logger.info(book_name)
            # These are the chapters/files for each book folder
            for chapter in book.find_all('div2'):
                # For introduction files.
                # FIX: double-quoted subscript — nesting the same quote
                # inside an f-string is only valid on Python 3.12+ (PEP 701),
                # while the rest of the file targets 3.10 (match statements).
                filename = f'MH {book_name} {chapter["title"]}'
                # Remove Roman Numerals from file name
                if 'CHAPTER' in filename.upper():
                    *_, roman_num = chapter['title'].split(' ')
                    filename = f'MH {book_name} {parser.roman_to_int(roman_num)}'
                logger.info(filename)
                parser.create_md_file(base, book_name, filename)
                pbar.set_description(f"Processing: {vol.name}: {book_name}: (unknown)")
                # Parse the text of the chapter
                # May need to add a loop to look in all <p class='passage> and
                # unrap any scripref that is in them. the place Dan is getting
                # confused with the book Daniel.
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    # Flatten the passage paragraph to plain text so nested
                    # tags inside it are not processed.
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)
                # Clearing all scripRef of internal text. That way if it
                # had other tags in it would not be processed
                for scripture in chapter.find_all('scripRef'):
                    scripture.clear()
                for doc in chapter.children:
                    try:
                        logger.debug(f"{book_name}, (unknown)")
                        parser.feed(str(doc))
                    except Exception as e:
                        logger.exception(e)
                        logger.debug(chapter.attrs)
                        raise

                parser.clean_file()
                pbar.update(1)

    pbar.close()
    parser.close()