Moved the ## Verses: heading to the paragraph section. This is more reliable across the project. Also added the ability to skip re.sub() for heading sections.

Both of these issues were caught with Psalms 119.
This commit is contained in:
boose_magoose 2023-12-02 09:29:00 -05:00
parent a1e69b1368
commit f73b11d153

76
main.py
View File

@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm
base = Path('./Matthew Henry Commentary/xml').absolute()
base = Path('./xml/Matthew Henry Commentary').absolute()
logger.remove(0)
logger.add('log.txt')
@ -19,6 +19,7 @@ class MyHTMLParser(HTMLParser):
passage_verse = None
tag = None
tag_type = None # 'start' or 'end'
skip_re = False
attrs = dict()
# All the patterns are for re.sub(). This is specifically for getting
@ -73,21 +74,26 @@ class MyHTMLParser(HTMLParser):
self.file = md_file
return md_file
def write_to_file(self, data):
def write_to_file(self, data, skip_re=False):
if self.file is None:
raise ValueError('No File specified')
# if self.file is None:
# raise ValueError('No File specified')
# Here the list/sublist are searched for and altered for later
# processing in self.clean_file()
for pattern, sub in self.patterns:
# print(f'Running Pattern: {pattern}')
data = re.sub(pattern, sub, data)
if not skip_re:
# Here the list/sublist are searched for and altered for later
# processing in self.clean_file()
for pattern, sub in self.patterns:
# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
data = re.sub(pattern, sub, data)
# print(f'writing: {data!r}')
with open(self.file, 'a+') as file:
file.write(data)
if self.file:
# raise ValueError('No File specified')
with open(self.file, 'a+') as file:
file.write(data)
else:
print(data)
def clean_file(self):
logger.info(' CLEANING FILE')
@ -114,7 +120,7 @@ class MyHTMLParser(HTMLParser):
if line.strip().startswith('# '):
if header == line.strip():
continue
header = line.strip()
header = line
# Add a `\n` if line is a heading.
if line.startswith('#'):
@ -215,27 +221,12 @@ class MyHTMLParser(HTMLParser):
case 'h4': # this is the section header. Need to keep track of it per file.
if self.tag_type == 'start':
self.write_to_file('\n# ')
self.write_to_file('\n# ',)
self.skip_re = True
if self.tag_type == 'end':
self.write_to_file('\n')
#
if self.passage_verse:
try:
verse = bible.get_references(
self.normalize_osis_verses(
self.passage_verse)
)[0]
except:
logger.debug(self.passage_verse)
raise
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
self.write_to_file('\n\n')
self.skip_re = False
case 'scripref': # Scripture ref
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
@ -255,6 +246,22 @@ class MyHTMLParser(HTMLParser):
case 'p': # Paragraph
if self.tag_type == 'start':
if self.attrs.get('class', False) == 'passage':
if self.passage_verse:
try:
verse = bible.get_references(
self.normalize_osis_verses(
self.passage_verse)
)[0]
except:
logger.debug(self.passage_verse)
raise
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
self.write_to_file('\n\n')
# Need this do the regex in self.clean does not pick
# this up. Will be also cleaned in the same function.
self.write_to_file('Passage: ')
@ -289,6 +296,7 @@ class MyHTMLParser(HTMLParser):
self.tag = None
self.tag_type = None
self.attrs.clear()
self.skip_re = False
def handle_data(self, data):
# print(f'Data: {data!r}')
@ -301,8 +309,9 @@ class MyHTMLParser(HTMLParser):
data = data.replace('', '-- ')
data = data.replace('GOD', 'God')
data = data.replace('\n', ' ')
self.write_to_file(data.replace('\n', ' '))
self.write_to_file(data, self.skip_re)
if __name__ == '__main__':
@ -317,12 +326,11 @@ if __name__ == '__main__':
# Get each book in the volume
# This will be the main folder for all the book's chapters
for book in soup.find_all('div1'):
logger.info(book['title'])
book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
logger.info(book_name)
# These are the chapters/files for each book folder
for chapter in book.find_all('div2'):
logger.info(chapter['title'])
filename = chapter['title']
@ -331,6 +339,7 @@ if __name__ == '__main__':
_, roman_num = chapter['title'].split(' ')
filename = f'Chapter {parser.roman_to_int(roman_num)}'
logger.info(filename)
parser.create_md_file(base, book_name, filename)
pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
@ -353,6 +362,7 @@ if __name__ == '__main__':
for doc in chapter.children:
try:
logger.debug(f"{book_name}, {filename}")
parser.feed(str(doc))
except Exception as e:
logger.exception(e)