Moved the `## Verses:` heading to the paragraph section. This is more reliable across the project. Also added the ability to skip `re.sub()` for heading sections.

Both of these issues were caught with Psalms 119.
This commit is contained in:
boose_magoose 2023-12-02 09:29:00 -05:00
parent a1e69b1368
commit f73b11d153

64
main.py
View File

@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
from loguru import logger from loguru import logger
from tqdm import tqdm from tqdm import tqdm
base = Path('./Matthew Henry Commentary/xml').absolute() base = Path('./xml/Matthew Henry Commentary').absolute()
logger.remove(0) logger.remove(0)
logger.add('log.txt') logger.add('log.txt')
@ -19,6 +19,7 @@ class MyHTMLParser(HTMLParser):
passage_verse = None passage_verse = None
tag = None tag = None
tag_type = None # 'start' or 'end' tag_type = None # 'start' or 'end'
skip_re = False
attrs = dict() attrs = dict()
# All the patterns are for re.sub(). This is specifically for getting # All the patterns are for re.sub(). This is specifically for getting
@ -73,21 +74,26 @@ class MyHTMLParser(HTMLParser):
self.file = md_file self.file = md_file
return md_file return md_file
def write_to_file(self, data): def write_to_file(self, data, skip_re=False):
if self.file is None: # if self.file is None:
raise ValueError('No File specified') # raise ValueError('No File specified')
if not skip_re:
# Here the list/sublist are searched for and altered for later # Here the list/sublist are searched for and altered for later
# processing in self.clean_file() # processing in self.clean_file()
for pattern, sub in self.patterns: for pattern, sub in self.patterns:
# print(f'Running Pattern: {pattern}') # logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
data = re.sub(pattern, sub, data) data = re.sub(pattern, sub, data)
# print(f'writing: {data!r}') # print(f'writing: {data!r}')
if self.file:
# raise ValueError('No File specified')
with open(self.file, 'a+') as file: with open(self.file, 'a+') as file:
file.write(data) file.write(data)
else:
print(data)
def clean_file(self): def clean_file(self):
logger.info(' CLEANING FILE') logger.info(' CLEANING FILE')
@ -114,7 +120,7 @@ class MyHTMLParser(HTMLParser):
if line.strip().startswith('# '): if line.strip().startswith('# '):
if header == line.strip(): if header == line.strip():
continue continue
header = line.strip() header = line
# Add a `\n` if line is a heading. # Add a `\n` if line is a heading.
if line.startswith('#'): if line.startswith('#'):
@ -215,27 +221,12 @@ class MyHTMLParser(HTMLParser):
case 'h4': # this is the section header. Need to keep track of it per file. case 'h4': # this is the section header. Need to keep track of it per file.
if self.tag_type == 'start': if self.tag_type == 'start':
self.write_to_file('\n# ') self.write_to_file('\n# ',)
self.skip_re = True
if self.tag_type == 'end': if self.tag_type == 'end':
self.write_to_file('\n') self.write_to_file('\n')
self.skip_re = False
#
if self.passage_verse:
try:
verse = bible.get_references(
self.normalize_osis_verses(
self.passage_verse)
)[0]
except:
logger.debug(self.passage_verse)
raise
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
self.write_to_file('\n\n')
case 'scripref': # Scripture ref case 'scripref': # Scripture ref
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')] # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
@ -255,6 +246,22 @@ class MyHTMLParser(HTMLParser):
case 'p': # Paragraph case 'p': # Paragraph
if self.tag_type == 'start': if self.tag_type == 'start':
if self.attrs.get('class', False) == 'passage': if self.attrs.get('class', False) == 'passage':
if self.passage_verse:
try:
verse = bible.get_references(
self.normalize_osis_verses(
self.passage_verse)
)[0]
except:
logger.debug(self.passage_verse)
raise
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
self.write_to_file('\n\n')
# Need this do the regex in self.clean does not pick # Need this do the regex in self.clean does not pick
# this up. Will be also cleaned in the same function. # this up. Will be also cleaned in the same function.
self.write_to_file('Passage: ') self.write_to_file('Passage: ')
@ -289,6 +296,7 @@ class MyHTMLParser(HTMLParser):
self.tag = None self.tag = None
self.tag_type = None self.tag_type = None
self.attrs.clear() self.attrs.clear()
self.skip_re = False
def handle_data(self, data): def handle_data(self, data):
# print(f'Data: {data!r}') # print(f'Data: {data!r}')
@ -301,8 +309,9 @@ class MyHTMLParser(HTMLParser):
data = data.replace('', '-- ') data = data.replace('', '-- ')
data = data.replace('GOD', 'God') data = data.replace('GOD', 'God')
data = data.replace('\n', ' ')
self.write_to_file(data.replace('\n', ' ')) self.write_to_file(data, self.skip_re)
if __name__ == '__main__': if __name__ == '__main__':
@ -317,12 +326,11 @@ if __name__ == '__main__':
# Get each book in the volume # Get each book in the volume
# This will be the main folder for all the book's chapters # This will be the main folder for all the book's chapters
for book in soup.find_all('div1'): for book in soup.find_all('div1'):
logger.info(book['title'])
book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3') book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
logger.info(book_name)
# These are the chapters/files for each book folder # These are the chapters/files for each book folder
for chapter in book.find_all('div2'): for chapter in book.find_all('div2'):
logger.info(chapter['title'])
filename = chapter['title'] filename = chapter['title']
@ -331,6 +339,7 @@ if __name__ == '__main__':
_, roman_num = chapter['title'].split(' ') _, roman_num = chapter['title'].split(' ')
filename = f'Chapter {parser.roman_to_int(roman_num)}' filename = f'Chapter {parser.roman_to_int(roman_num)}'
logger.info(filename)
parser.create_md_file(base, book_name, filename) parser.create_md_file(base, book_name, filename)
pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}") pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
@ -353,6 +362,7 @@ if __name__ == '__main__':
for doc in chapter.children: for doc in chapter.children:
try: try:
logger.debug(f"{book_name}, {filename}")
parser.feed(str(doc)) parser.feed(str(doc))
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)