238 lines
7.8 KiB
Python
238 lines
7.8 KiB
Python
import string
|
|
|
|
import pythonbible as bible
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup
|
|
from html.parser import HTMLParser
|
|
import re
|
|
|
|
# Root output directory, resolved to an absolute path. It is passed as
# ``base_dir`` to ``create_md_file`` below, which writes one Markdown file per
# chapter to ``<base>/<book title>/<chapter title>.md``.
# NOTE(review): the last path component is spelled 'xlm' -- possibly a typo for
# 'xml'; confirm before renaming, since existing output may live there.
base = Path('./Matthew Henry Commentary/xlm').absolute()
|
|
|
|
class MyHTMLParser(HTMLParser):
|
|
file = None
|
|
tag = None
|
|
tag_type = None
|
|
attrs = dict()
|
|
patterns = [
|
|
(r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~1'),
|
|
(r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~2'),
|
|
(r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~3'),
|
|
(r"(\S\s)(\d+\.)(\s\b|\W)", lambda x: f'{x.group(1)}\n{x.group(2)}~3{x.group(3)}'), # THis may need to be added to the cleanup script
|
|
(r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.", lambda x: f'\n{parser.roman_to_int(x.group())}.~5'),
|
|
]
|
|
|
|
@staticmethod
|
|
def roman_to_int(number: str) -> int:
|
|
if number is None:
|
|
return 0
|
|
|
|
number = number.strip('.')
|
|
|
|
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
|
total = 0
|
|
for i in range(len(number) - 1, -1, -1):
|
|
num = roman[number[i]]
|
|
if 3 * num < total:
|
|
total -= num
|
|
else:
|
|
total += num
|
|
|
|
return total
|
|
|
|
def create_md_file(self, base_dir: Path, folder: str, file: str):
|
|
md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
|
|
md_file.parent.mkdir(parents=True, exist_ok=True)
|
|
md_file.unlink(missing_ok=True)
|
|
self.file = md_file
|
|
|
|
def write_to_file(self, data):
|
|
# if self.writeable is False:
|
|
# return
|
|
|
|
if self.file is None:
|
|
raise ValueError('No File specified')
|
|
|
|
for pattern, sub in self.patterns:
|
|
print(f'Running Pattern: {pattern}')
|
|
data = re.sub(pattern, sub, data)
|
|
|
|
print(f'writing: {data!r}')
|
|
|
|
with open(self.file, 'a+') as file:
|
|
file.write(data)
|
|
|
|
def clean_file(self):
|
|
print(' CLEANING FILE')
|
|
|
|
if self.file is None:
|
|
raise ValueError('No File specified')
|
|
|
|
with open(self.file, 'r') as file:
|
|
lines = file.readlines()
|
|
|
|
with open(self.file, 'w') as file:
|
|
indent = 0
|
|
sublist = dict()
|
|
for line in lines:
|
|
if line.strip():
|
|
if line.startswith('#'):
|
|
line = f'\n{line}'
|
|
|
|
if line.strip()[0] not in string.digits:
|
|
line = f'{line}\n'
|
|
# print('resetting indent')
|
|
indent = 0
|
|
sublist.clear()
|
|
else:
|
|
start = line.find('~')
|
|
end = start + 2
|
|
list_id = line[start:end]
|
|
if list_id not in sublist:
|
|
sublist[list_id] = indent
|
|
indent += 1
|
|
# print(f'{indent=}')
|
|
# print(f'{sublist=}')
|
|
|
|
line = f'{"\t" * sublist[list_id]}{line.replace(list_id, "")}'
|
|
# line = f'{"\t" * sublist[list_id]}{line}'
|
|
|
|
# print(f'{sublist[list_id]=}, {list_id=}')
|
|
|
|
file.write(line)
|
|
|
|
@staticmethod
|
|
def normalize_verses(verses):
|
|
"""Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
|
|
and turns it into somthing more readable"""
|
|
normalized_refs = list()
|
|
for chunks in verses.removeprefix("Bible:").split('Bible:'):
|
|
spanned_verses = list()
|
|
for ref in chunks.split('-'):
|
|
verse = ref.replace('.', ' ', 1).replace('.', ':')
|
|
spanned_verses.append(verse)
|
|
|
|
normalized = bible.format_scripture_references(
|
|
bible.get_references(str(spanned_verses))
|
|
)
|
|
normalized_refs.append(normalized)
|
|
|
|
return ';'.join(normalized_refs)
|
|
|
|
@staticmethod
|
|
def obsidian_links(verses):
|
|
links = []
|
|
for verse in verses.split(';'):
|
|
links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
|
|
|
|
return '; '.join(links)
|
|
|
|
def process_tag(self, test=None):
|
|
match self.tag:
|
|
case 'h4': # this is the section header. Need to keep track of it per file.
|
|
if self.tag_type == 'start':
|
|
self.write_to_file('\n## ')
|
|
|
|
if self.tag_type == 'end':
|
|
self.write_to_file('\n\n')
|
|
case 'b':
|
|
self.write_to_file('**')
|
|
case 'i':
|
|
self.write_to_file('*')
|
|
case 'hr':
|
|
pass
|
|
case 'scripref': # Scripture ref
|
|
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
|
|
# bible.format_scripture_references(bible.get_references(attrs['osisref']))
|
|
if self.tag_type == 'start':
|
|
verse_link = self.obsidian_links(
|
|
self.normalize_verses(self.attrs['osisref'])
|
|
)
|
|
self.write_to_file(verse_link)
|
|
case 'p':
|
|
if self.tag_type == 'start':
|
|
if self.attrs.get('class', False) == 'passage':
|
|
self.write_to_file('Passage: ')
|
|
if self.tag_type == 'end':
|
|
self.write_to_file('\n\n')
|
|
# case 'html':
|
|
# if self.tag_type == 'end':
|
|
# self.clean_file()
|
|
case _:
|
|
# print(tag.upper())
|
|
pass
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
attrs = dict(attrs)
|
|
print(f'Start: {tag}, {attrs}')
|
|
self.tag_type = 'start'
|
|
self.tag = tag
|
|
self.attrs = attrs
|
|
self.process_tag(f'what what {tag}')
|
|
|
|
def handle_endtag(self, tag):
|
|
print(f'End: {tag}')
|
|
self.tag = tag
|
|
self.tag_type = 'end'
|
|
self.process_tag()
|
|
|
|
self.tag = None
|
|
self.tag_type = None
|
|
self.attrs.clear()
|
|
|
|
def handle_data(self, data):
|
|
print(f'Data: {data!r}')
|
|
|
|
if self.tag in ['scripref', 'h2', 'h3', 'h5']:
|
|
return
|
|
|
|
if data == '\n':
|
|
return
|
|
|
|
data = data.replace('—', '-- ')
|
|
data = data.replace('GOD', 'God')
|
|
|
|
self.write_to_file(data.replace('\n', ' '))
|
|
|
|
|
|
# Driver script: split the commentary volume into one Markdown file per
# chapter, grouped into one folder per book.

# Kept as a module-level name so the shared parser instance stays visible to
# the rest of this file.
parser = MyHTMLParser()

# Read raw bytes and let the XML parser choose the encoding, instead of
# decoding with whatever the platform locale happens to be.
soup = BeautifulSoup(Path('./vol_1.xml').read_bytes(), 'xml')

# Clear the inner text of every scripRef so that any tags nested inside a
# reference are not processed; only the element and its attributes remain.
for ref in soup.find_all('scripRef'):
    ref.clear()

# TODO: may need a loop over every <p class="passage"> that unwraps the
# scripRef elements inside it -- this is where "Dan" is getting confused with
# the book of Daniel.  Untested sketch:
#     for passage in soup.find_all('p', class_='passage'):
#         for ref in passage.find_all('scripRef'):
#             ref.unwrap()

# Each div1 is a book (one output folder).  The first two div1 elements are
# skipped -- presumably front matter; verify against the source volume.
for book in soup.find_all('div1')[2:]:
    print(book.attrs)

    # Each div2 is a chapter (one output file inside the book's folder).
    for chapter in book.find_all('div2'):
        print('\t', chapter.attrs)
        parser.create_md_file(base, book['title'], chapter['title'])

        # Feed the chapter's markup to the parser one child node at a time.
        for doc in chapter.children:
            parser.feed(str(doc))

        # Post-process the chapter file once all of its content is written.
        parser.clean_file()