Moved the ## Verses:
heading to the paragraph section. This is more reliable across the project. Also added the ability to skip re.sub() for heading sections.
Both of these issues were caught with Psalms 119.
This commit is contained in:
parent
a1e69b1368
commit
f73b11d153
76
main.py
76
main.py
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
base = Path('./Matthew Henry Commentary/xml').absolute()
|
base = Path('./xml/Matthew Henry Commentary').absolute()
|
||||||
|
|
||||||
logger.remove(0)
|
logger.remove(0)
|
||||||
logger.add('log.txt')
|
logger.add('log.txt')
|
||||||
@ -19,6 +19,7 @@ class MyHTMLParser(HTMLParser):
|
|||||||
passage_verse = None
|
passage_verse = None
|
||||||
tag = None
|
tag = None
|
||||||
tag_type = None # 'start' or 'end'
|
tag_type = None # 'start' or 'end'
|
||||||
|
skip_re = False
|
||||||
attrs = dict()
|
attrs = dict()
|
||||||
|
|
||||||
# All the patterns are for re.sub(). This is specifically for getting
|
# All the patterns are for re.sub(). This is specifically for getting
|
||||||
@ -73,21 +74,26 @@ class MyHTMLParser(HTMLParser):
|
|||||||
self.file = md_file
|
self.file = md_file
|
||||||
return md_file
|
return md_file
|
||||||
|
|
||||||
def write_to_file(self, data):
|
def write_to_file(self, data, skip_re=False):
|
||||||
|
|
||||||
if self.file is None:
|
# if self.file is None:
|
||||||
raise ValueError('No File specified')
|
# raise ValueError('No File specified')
|
||||||
|
|
||||||
# Here the list/sublist are searched for and altered for later
|
if not skip_re:
|
||||||
# processing in self.clean_file()
|
# Here the list/sublist are searched for and altered for later
|
||||||
for pattern, sub in self.patterns:
|
# processing in self.clean_file()
|
||||||
# print(f'Running Pattern: {pattern}')
|
for pattern, sub in self.patterns:
|
||||||
data = re.sub(pattern, sub, data)
|
# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
|
||||||
|
data = re.sub(pattern, sub, data)
|
||||||
|
|
||||||
# print(f'writing: {data!r}')
|
# print(f'writing: {data!r}')
|
||||||
|
|
||||||
with open(self.file, 'a+') as file:
|
if self.file:
|
||||||
file.write(data)
|
# raise ValueError('No File specified')
|
||||||
|
with open(self.file, 'a+') as file:
|
||||||
|
file.write(data)
|
||||||
|
else:
|
||||||
|
print(data)
|
||||||
|
|
||||||
def clean_file(self):
|
def clean_file(self):
|
||||||
logger.info(' CLEANING FILE')
|
logger.info(' CLEANING FILE')
|
||||||
@ -114,7 +120,7 @@ class MyHTMLParser(HTMLParser):
|
|||||||
if line.strip().startswith('# '):
|
if line.strip().startswith('# '):
|
||||||
if header == line.strip():
|
if header == line.strip():
|
||||||
continue
|
continue
|
||||||
header = line.strip()
|
header = line
|
||||||
|
|
||||||
# Add a `\n` if line is a heading.
|
# Add a `\n` if line is a heading.
|
||||||
if line.startswith('#'):
|
if line.startswith('#'):
|
||||||
@ -215,27 +221,12 @@ class MyHTMLParser(HTMLParser):
|
|||||||
|
|
||||||
case 'h4': # this is the section header. Need to keep track of it per file.
|
case 'h4': # this is the section header. Need to keep track of it per file.
|
||||||
if self.tag_type == 'start':
|
if self.tag_type == 'start':
|
||||||
self.write_to_file('\n# ')
|
self.write_to_file('\n# ',)
|
||||||
|
self.skip_re = True
|
||||||
|
|
||||||
if self.tag_type == 'end':
|
if self.tag_type == 'end':
|
||||||
self.write_to_file('\n')
|
self.write_to_file('\n')
|
||||||
|
self.skip_re = False
|
||||||
#
|
|
||||||
if self.passage_verse:
|
|
||||||
try:
|
|
||||||
verse = bible.get_references(
|
|
||||||
self.normalize_osis_verses(
|
|
||||||
self.passage_verse)
|
|
||||||
)[0]
|
|
||||||
except:
|
|
||||||
logger.debug(self.passage_verse)
|
|
||||||
raise
|
|
||||||
|
|
||||||
self.write_to_file('## Verses: ')
|
|
||||||
self.write_to_file(
|
|
||||||
f'{verse.start_verse} - {verse.end_verse}'
|
|
||||||
)
|
|
||||||
self.write_to_file('\n\n')
|
|
||||||
|
|
||||||
case 'scripref': # Scripture ref
|
case 'scripref': # Scripture ref
|
||||||
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
|
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
|
||||||
@ -255,6 +246,22 @@ class MyHTMLParser(HTMLParser):
|
|||||||
case 'p': # Paragraph
|
case 'p': # Paragraph
|
||||||
if self.tag_type == 'start':
|
if self.tag_type == 'start':
|
||||||
if self.attrs.get('class', False) == 'passage':
|
if self.attrs.get('class', False) == 'passage':
|
||||||
|
if self.passage_verse:
|
||||||
|
try:
|
||||||
|
verse = bible.get_references(
|
||||||
|
self.normalize_osis_verses(
|
||||||
|
self.passage_verse)
|
||||||
|
)[0]
|
||||||
|
except:
|
||||||
|
logger.debug(self.passage_verse)
|
||||||
|
raise
|
||||||
|
|
||||||
|
self.write_to_file('## Verses: ')
|
||||||
|
self.write_to_file(
|
||||||
|
f'{verse.start_verse} - {verse.end_verse}'
|
||||||
|
)
|
||||||
|
self.write_to_file('\n\n')
|
||||||
|
|
||||||
# Need this do the regex in self.clean does not pick
|
# Need this do the regex in self.clean does not pick
|
||||||
# this up. Will be also cleaned in the same function.
|
# this up. Will be also cleaned in the same function.
|
||||||
self.write_to_file('Passage: ')
|
self.write_to_file('Passage: ')
|
||||||
@ -289,6 +296,7 @@ class MyHTMLParser(HTMLParser):
|
|||||||
self.tag = None
|
self.tag = None
|
||||||
self.tag_type = None
|
self.tag_type = None
|
||||||
self.attrs.clear()
|
self.attrs.clear()
|
||||||
|
self.skip_re = False
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
# print(f'Data: {data!r}')
|
# print(f'Data: {data!r}')
|
||||||
@ -301,8 +309,9 @@ class MyHTMLParser(HTMLParser):
|
|||||||
|
|
||||||
data = data.replace('—', '-- ')
|
data = data.replace('—', '-- ')
|
||||||
data = data.replace('GOD', 'God')
|
data = data.replace('GOD', 'God')
|
||||||
|
data = data.replace('\n', ' ')
|
||||||
|
|
||||||
self.write_to_file(data.replace('\n', ' '))
|
self.write_to_file(data, self.skip_re)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
@ -317,12 +326,11 @@ if __name__ == '__main__':
|
|||||||
# Get each book in the volume
|
# Get each book in the volume
|
||||||
# This will be the main folder for all the book's chapters
|
# This will be the main folder for all the book's chapters
|
||||||
for book in soup.find_all('div1'):
|
for book in soup.find_all('div1'):
|
||||||
logger.info(book['title'])
|
|
||||||
book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
|
book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
|
||||||
|
logger.info(book_name)
|
||||||
|
|
||||||
# These are the chapters/files for each book folder
|
# These are the chapters/files for each book folder
|
||||||
for chapter in book.find_all('div2'):
|
for chapter in book.find_all('div2'):
|
||||||
logger.info(chapter['title'])
|
|
||||||
|
|
||||||
filename = chapter['title']
|
filename = chapter['title']
|
||||||
|
|
||||||
@ -331,6 +339,7 @@ if __name__ == '__main__':
|
|||||||
_, roman_num = chapter['title'].split(' ')
|
_, roman_num = chapter['title'].split(' ')
|
||||||
filename = f'Chapter {parser.roman_to_int(roman_num)}'
|
filename = f'Chapter {parser.roman_to_int(roman_num)}'
|
||||||
|
|
||||||
|
logger.info(filename)
|
||||||
parser.create_md_file(base, book_name, filename)
|
parser.create_md_file(base, book_name, filename)
|
||||||
|
|
||||||
pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
|
pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
|
||||||
@ -353,6 +362,7 @@ if __name__ == '__main__':
|
|||||||
for doc in chapter.children:
|
for doc in chapter.children:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logger.debug(f"{book_name}, {filename}")
|
||||||
parser.feed(str(doc))
|
parser.feed(str(doc))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(e)
|
logger.exception(e)
|
||||||
|
Loading…
Reference in New Issue
Block a user