Moved the `## Verses:` heading to the paragraph section. This is more reliable across the project.
Also added the ability to skip re.sub() for heading sections.
Both of these issues were caught with Psalms 119.
This commit is contained in:
parent
a1e69b1368
commit
f73b11d153
64
main.py
64
main.py
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
|
||||
base = Path('./Matthew Henry Commentary/xml').absolute()
|
||||
base = Path('./xml/Matthew Henry Commentary').absolute()
|
||||
|
||||
logger.remove(0)
|
||||
logger.add('log.txt')
|
||||
@ -19,6 +19,7 @@ class MyHTMLParser(HTMLParser):
|
||||
passage_verse = None
|
||||
tag = None
|
||||
tag_type = None # 'start' or 'end'
|
||||
skip_re = False
|
||||
attrs = dict()
|
||||
|
||||
# All the patterns are for re.sub(). This is specifically for getting
|
||||
@ -73,21 +74,26 @@ class MyHTMLParser(HTMLParser):
|
||||
self.file = md_file
|
||||
return md_file
|
||||
|
||||
def write_to_file(self, data):
|
||||
def write_to_file(self, data, skip_re=False):
|
||||
|
||||
if self.file is None:
|
||||
raise ValueError('No File specified')
|
||||
# if self.file is None:
|
||||
# raise ValueError('No File specified')
|
||||
|
||||
if not skip_re:
|
||||
# Here the list/sublist are searched for and altered for later
|
||||
# processing in self.clean_file()
|
||||
for pattern, sub in self.patterns:
|
||||
# print(f'Running Pattern: {pattern}')
|
||||
# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
|
||||
data = re.sub(pattern, sub, data)
|
||||
|
||||
# print(f'writing: {data!r}')
|
||||
|
||||
if self.file:
|
||||
# raise ValueError('No File specified')
|
||||
with open(self.file, 'a+') as file:
|
||||
file.write(data)
|
||||
else:
|
||||
print(data)
|
||||
|
||||
def clean_file(self):
|
||||
logger.info(' CLEANING FILE')
|
||||
@ -114,7 +120,7 @@ class MyHTMLParser(HTMLParser):
|
||||
if line.strip().startswith('# '):
|
||||
if header == line.strip():
|
||||
continue
|
||||
header = line.strip()
|
||||
header = line
|
||||
|
||||
# Add a `\n` if line is a heading.
|
||||
if line.startswith('#'):
|
||||
@ -215,27 +221,12 @@ class MyHTMLParser(HTMLParser):
|
||||
|
||||
case 'h4': # this is the section header. Need to keep track of it per file.
|
||||
if self.tag_type == 'start':
|
||||
self.write_to_file('\n# ')
|
||||
self.write_to_file('\n# ',)
|
||||
self.skip_re = True
|
||||
|
||||
if self.tag_type == 'end':
|
||||
self.write_to_file('\n')
|
||||
|
||||
#
|
||||
if self.passage_verse:
|
||||
try:
|
||||
verse = bible.get_references(
|
||||
self.normalize_osis_verses(
|
||||
self.passage_verse)
|
||||
)[0]
|
||||
except:
|
||||
logger.debug(self.passage_verse)
|
||||
raise
|
||||
|
||||
self.write_to_file('## Verses: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse} - {verse.end_verse}'
|
||||
)
|
||||
self.write_to_file('\n\n')
|
||||
self.skip_re = False
|
||||
|
||||
case 'scripref': # Scripture ref
|
||||
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
|
||||
@ -255,6 +246,22 @@ class MyHTMLParser(HTMLParser):
|
||||
case 'p': # Paragraph
|
||||
if self.tag_type == 'start':
|
||||
if self.attrs.get('class', False) == 'passage':
|
||||
if self.passage_verse:
|
||||
try:
|
||||
verse = bible.get_references(
|
||||
self.normalize_osis_verses(
|
||||
self.passage_verse)
|
||||
)[0]
|
||||
except:
|
||||
logger.debug(self.passage_verse)
|
||||
raise
|
||||
|
||||
self.write_to_file('## Verses: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse} - {verse.end_verse}'
|
||||
)
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
# Need this so the regex in self.clean does not pick
|
||||
# this up. Will be also cleaned in the same function.
|
||||
self.write_to_file('Passage: ')
|
||||
@ -289,6 +296,7 @@ class MyHTMLParser(HTMLParser):
|
||||
self.tag = None
|
||||
self.tag_type = None
|
||||
self.attrs.clear()
|
||||
self.skip_re = False
|
||||
|
||||
def handle_data(self, data):
|
||||
# print(f'Data: {data!r}')
|
||||
@ -301,8 +309,9 @@ class MyHTMLParser(HTMLParser):
|
||||
|
||||
data = data.replace('—', '-- ')
|
||||
data = data.replace('GOD', 'God')
|
||||
data = data.replace('\n', ' ')
|
||||
|
||||
self.write_to_file(data.replace('\n', ' '))
|
||||
self.write_to_file(data, self.skip_re)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@ -317,12 +326,11 @@ if __name__ == '__main__':
|
||||
# Get each book in the volume
|
||||
# This will be the main folder for all the book's chapters
|
||||
for book in soup.find_all('div1'):
|
||||
logger.info(book['title'])
|
||||
book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
|
||||
logger.info(book_name)
|
||||
|
||||
# These are the chapters/files for each book folder
|
||||
for chapter in book.find_all('div2'):
|
||||
logger.info(chapter['title'])
|
||||
|
||||
filename = chapter['title']
|
||||
|
||||
@ -331,6 +339,7 @@ if __name__ == '__main__':
|
||||
_, roman_num = chapter['title'].split(' ')
|
||||
filename = f'Chapter {parser.roman_to_int(roman_num)}'
|
||||
|
||||
logger.info(filename)
|
||||
parser.create_md_file(base, book_name, filename)
|
||||
|
||||
pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
|
||||
@ -353,6 +362,7 @@ if __name__ == '__main__':
|
||||
for doc in chapter.children:
|
||||
|
||||
try:
|
||||
logger.debug(f"{book_name}, {filename}")
|
||||
parser.feed(str(doc))
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
|
Loading…
Reference in New Issue
Block a user